mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned.  The exact requirements vary by OS and file system (typically
sectors and/or memory pages).  The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sectors and common memory page size.  There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.
Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.
Three classes of I/O buffers for regular data pages are adjusted:
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler.  (3) The buffer
pool is also aligned in shared memory.
WAL buffers were already aligned on XLOG_BLCKSZ.  It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.
BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).
If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0.  This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead.  That's an existing and tested class of system.
Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.
Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
		
	
		
			
				
	
	
		
			205 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			205 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*-------------------------------------------------------------------------
 | |
|  *
 | |
|  * pg_prewarm.c
 | |
|  *		  prewarming utilities
 | |
|  *
 | |
|  * Copyright (c) 2010-2023, PostgreSQL Global Development Group
 | |
|  *
 | |
|  * IDENTIFICATION
 | |
|  *		  contrib/pg_prewarm/pg_prewarm.c
 | |
|  *
 | |
|  *-------------------------------------------------------------------------
 | |
|  */
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include <sys/stat.h>
 | |
| #include <unistd.h>
 | |
| 
 | |
| #include "access/relation.h"
 | |
| #include "fmgr.h"
 | |
| #include "miscadmin.h"
 | |
| #include "storage/bufmgr.h"
 | |
| #include "storage/smgr.h"
 | |
| #include "utils/acl.h"
 | |
| #include "utils/builtins.h"
 | |
| #include "utils/lsyscache.h"
 | |
| #include "utils/rel.h"
 | |
| 
 | |
| PG_MODULE_MAGIC;
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(pg_prewarm);
 | |
| 
 | |
| typedef enum
 | |
| {
 | |
| 	PREWARM_PREFETCH,
 | |
| 	PREWARM_READ,
 | |
| 	PREWARM_BUFFER
 | |
| } PrewarmType;
 | |
| 
 | |
| static PGIOAlignedBlock blockbuffer;
 | |
| 
 | |
| /*
 | |
|  * pg_prewarm(regclass, mode text, fork text,
 | |
|  *			  first_block int8, last_block int8)
 | |
|  *
 | |
|  * The first argument is the relation to be prewarmed; the second controls
 | |
|  * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'.
 | |
|  * The third is the name of the relation fork to be prewarmed.  The fourth
 | |
|  * and fifth arguments specify the first and last block to be prewarmed.
 | |
|  * If the fourth argument is NULL, it will be taken as 0; if the fifth argument
 | |
|  * is NULL, it will be taken as the number of blocks in the relation.  The
 | |
|  * return value is the number of blocks successfully prewarmed.
 | |
|  */
 | |
| Datum
 | |
| pg_prewarm(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	Oid			relOid;
 | |
| 	text	   *forkName;
 | |
| 	text	   *type;
 | |
| 	int64		first_block;
 | |
| 	int64		last_block;
 | |
| 	int64		nblocks;
 | |
| 	int64		blocks_done = 0;
 | |
| 	int64		block;
 | |
| 	Relation	rel;
 | |
| 	ForkNumber	forkNumber;
 | |
| 	char	   *forkString;
 | |
| 	char	   *ttype;
 | |
| 	PrewarmType ptype;
 | |
| 	AclResult	aclresult;
 | |
| 
 | |
| 	/* Basic sanity checking. */
 | |
| 	if (PG_ARGISNULL(0))
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("relation cannot be null")));
 | |
| 	relOid = PG_GETARG_OID(0);
 | |
| 	if (PG_ARGISNULL(1))
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("prewarm type cannot be null")));
 | |
| 	type = PG_GETARG_TEXT_PP(1);
 | |
| 	ttype = text_to_cstring(type);
 | |
| 	if (strcmp(ttype, "prefetch") == 0)
 | |
| 		ptype = PREWARM_PREFETCH;
 | |
| 	else if (strcmp(ttype, "read") == 0)
 | |
| 		ptype = PREWARM_READ;
 | |
| 	else if (strcmp(ttype, "buffer") == 0)
 | |
| 		ptype = PREWARM_BUFFER;
 | |
| 	else
 | |
| 	{
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("invalid prewarm type"),
 | |
| 				 errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\".")));
 | |
| 		PG_RETURN_INT64(0);		/* Placate compiler. */
 | |
| 	}
 | |
| 	if (PG_ARGISNULL(2))
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("relation fork cannot be null")));
 | |
| 	forkName = PG_GETARG_TEXT_PP(2);
 | |
| 	forkString = text_to_cstring(forkName);
 | |
| 	forkNumber = forkname_to_number(forkString);
 | |
| 
 | |
| 	/* Open relation and check privileges. */
 | |
| 	rel = relation_open(relOid, AccessShareLock);
 | |
| 	aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT);
 | |
| 	if (aclresult != ACLCHECK_OK)
 | |
| 		aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind), get_rel_name(relOid));
 | |
| 
 | |
| 	/* Check that the fork exists. */
 | |
| 	if (!smgrexists(RelationGetSmgr(rel), forkNumber))
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("fork \"%s\" does not exist for this relation",
 | |
| 						forkString)));
 | |
| 
 | |
| 	/* Validate block numbers, or handle nulls. */
 | |
| 	nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber);
 | |
| 	if (PG_ARGISNULL(3))
 | |
| 		first_block = 0;
 | |
| 	else
 | |
| 	{
 | |
| 		first_block = PG_GETARG_INT64(3);
 | |
| 		if (first_block < 0 || first_block >= nblocks)
 | |
| 			ereport(ERROR,
 | |
| 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 					 errmsg("starting block number must be between 0 and %lld",
 | |
| 							(long long) (nblocks - 1))));
 | |
| 	}
 | |
| 	if (PG_ARGISNULL(4))
 | |
| 		last_block = nblocks - 1;
 | |
| 	else
 | |
| 	{
 | |
| 		last_block = PG_GETARG_INT64(4);
 | |
| 		if (last_block < 0 || last_block >= nblocks)
 | |
| 			ereport(ERROR,
 | |
| 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 					 errmsg("ending block number must be between 0 and %lld",
 | |
| 							(long long) (nblocks - 1))));
 | |
| 	}
 | |
| 
 | |
| 	/* Now we're ready to do the real work. */
 | |
| 	if (ptype == PREWARM_PREFETCH)
 | |
| 	{
 | |
| #ifdef USE_PREFETCH
 | |
| 
 | |
| 		/*
 | |
| 		 * In prefetch mode, we just hint the OS to read the blocks, but we
 | |
| 		 * don't know whether it really does it, and we don't wait for it to
 | |
| 		 * finish.
 | |
| 		 *
 | |
| 		 * It would probably be better to pass our prefetch requests in chunks
 | |
| 		 * of a megabyte or maybe even a whole segment at a time, but there's
 | |
| 		 * no practical way to do that at present without a gross modularity
 | |
| 		 * violation, so we just do this.
 | |
| 		 */
 | |
| 		for (block = first_block; block <= last_block; ++block)
 | |
| 		{
 | |
| 			CHECK_FOR_INTERRUPTS();
 | |
| 			PrefetchBuffer(rel, forkNumber, block);
 | |
| 			++blocks_done;
 | |
| 		}
 | |
| #else
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 | |
| 				 errmsg("prefetch is not supported by this build")));
 | |
| #endif
 | |
| 	}
 | |
| 	else if (ptype == PREWARM_READ)
 | |
| 	{
 | |
| 		/*
 | |
| 		 * In read mode, we actually read the blocks, but not into shared
 | |
| 		 * buffers.  This is more portable than prefetch mode (it works
 | |
| 		 * everywhere) and is synchronous.
 | |
| 		 */
 | |
| 		for (block = first_block; block <= last_block; ++block)
 | |
| 		{
 | |
| 			CHECK_FOR_INTERRUPTS();
 | |
| 			smgrread(RelationGetSmgr(rel), forkNumber, block, blockbuffer.data);
 | |
| 			++blocks_done;
 | |
| 		}
 | |
| 	}
 | |
| 	else if (ptype == PREWARM_BUFFER)
 | |
| 	{
 | |
| 		/*
 | |
| 		 * In buffer mode, we actually pull the data into shared_buffers.
 | |
| 		 */
 | |
| 		for (block = first_block; block <= last_block; ++block)
 | |
| 		{
 | |
| 			Buffer		buf;
 | |
| 
 | |
| 			CHECK_FOR_INTERRUPTS();
 | |
| 			buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
 | |
| 			ReleaseBuffer(buf);
 | |
| 			++blocks_done;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	/* Close relation, release lock. */
 | |
| 	relation_close(rel, AccessShareLock);
 | |
| 
 | |
| 	PG_RETURN_INT64(blocks_done);
 | |
| }
 |