mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned.  The exact requirements vary by OS and file system (typically
sectors and/or memory pages).  The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sectors and common memory page size.  There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.
Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.
Three classes of I/O buffers for regular data pages are adjusted:
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler.  (3) The buffer
pool is also aligned in shared memory.
WAL buffers were already aligned on XLOG_BLCKSZ.  It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.
BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).
If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0.  This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead.  That's an existing and tested class of system.
Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.
Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
		
	
		
			
				
	
	
		
			205 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			205 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*-------------------------------------------------------------------------
 *
 * pg_prewarm.c
 *		  prewarming utilities
 *
 * Copyright (c) 2010-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *		  contrib/pg_prewarm/pg_prewarm.c
 *
 *-------------------------------------------------------------------------
 */
 | 
						|
#include "postgres.h"
 | 
						|
 | 
						|
#include <sys/stat.h>
 | 
						|
#include <unistd.h>
 | 
						|
 | 
						|
#include "access/relation.h"
 | 
						|
#include "fmgr.h"
 | 
						|
#include "miscadmin.h"
 | 
						|
#include "storage/bufmgr.h"
 | 
						|
#include "storage/smgr.h"
 | 
						|
#include "utils/acl.h"
 | 
						|
#include "utils/builtins.h"
 | 
						|
#include "utils/lsyscache.h"
 | 
						|
#include "utils/rel.h"
 | 
						|
 | 
						|
PG_MODULE_MAGIC;
 | 
						|
 | 
						|
PG_FUNCTION_INFO_V1(pg_prewarm);
 | 
						|
 | 
						|
/*
 * Prewarming strategy, selected by the mode text argument of pg_prewarm().
 */
typedef enum
{
	PREWARM_PREFETCH,			/* mode "prefetch": PrefetchBuffer() per block */
	PREWARM_READ,				/* mode "read": smgrread() per block into a
								 * private buffer */
	PREWARM_BUFFER				/* mode "buffer": ReadBufferExtended() per
								 * block, then release the buffer */
} PrewarmType;
 | 
						|
 | 
						|
/*
 * File-scope buffer that "read" mode passes to smgrread() as the
 * destination for each block; pg_prewarm() never inspects its contents.
 * Its type, PGIOAlignedBlock, is meant to give the buffer
 * PG_IO_ALIGN_SIZE alignment where the compiler supports it.
 */
static PGIOAlignedBlock blockbuffer;
 | 
						|
 | 
						|
/*
 | 
						|
 * pg_prewarm(regclass, mode text, fork text,
 | 
						|
 *			  first_block int8, last_block int8)
 | 
						|
 *
 | 
						|
 * The first argument is the relation to be prewarmed; the second controls
 | 
						|
 * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'.
 | 
						|
 * The third is the name of the relation fork to be prewarmed.  The fourth
 | 
						|
 * and fifth arguments specify the first and last block to be prewarmed.
 | 
						|
 * If the fourth argument is NULL, it will be taken as 0; if the fifth argument
 | 
						|
 * is NULL, it will be taken as the number of blocks in the relation.  The
 | 
						|
 * return value is the number of blocks successfully prewarmed.
 | 
						|
 */
 | 
						|
Datum
 | 
						|
pg_prewarm(PG_FUNCTION_ARGS)
 | 
						|
{
 | 
						|
	Oid			relOid;
 | 
						|
	text	   *forkName;
 | 
						|
	text	   *type;
 | 
						|
	int64		first_block;
 | 
						|
	int64		last_block;
 | 
						|
	int64		nblocks;
 | 
						|
	int64		blocks_done = 0;
 | 
						|
	int64		block;
 | 
						|
	Relation	rel;
 | 
						|
	ForkNumber	forkNumber;
 | 
						|
	char	   *forkString;
 | 
						|
	char	   *ttype;
 | 
						|
	PrewarmType ptype;
 | 
						|
	AclResult	aclresult;
 | 
						|
 | 
						|
	/* Basic sanity checking. */
 | 
						|
	if (PG_ARGISNULL(0))
 | 
						|
		ereport(ERROR,
 | 
						|
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | 
						|
				 errmsg("relation cannot be null")));
 | 
						|
	relOid = PG_GETARG_OID(0);
 | 
						|
	if (PG_ARGISNULL(1))
 | 
						|
		ereport(ERROR,
 | 
						|
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | 
						|
				 errmsg("prewarm type cannot be null")));
 | 
						|
	type = PG_GETARG_TEXT_PP(1);
 | 
						|
	ttype = text_to_cstring(type);
 | 
						|
	if (strcmp(ttype, "prefetch") == 0)
 | 
						|
		ptype = PREWARM_PREFETCH;
 | 
						|
	else if (strcmp(ttype, "read") == 0)
 | 
						|
		ptype = PREWARM_READ;
 | 
						|
	else if (strcmp(ttype, "buffer") == 0)
 | 
						|
		ptype = PREWARM_BUFFER;
 | 
						|
	else
 | 
						|
	{
 | 
						|
		ereport(ERROR,
 | 
						|
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | 
						|
				 errmsg("invalid prewarm type"),
 | 
						|
				 errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\".")));
 | 
						|
		PG_RETURN_INT64(0);		/* Placate compiler. */
 | 
						|
	}
 | 
						|
	if (PG_ARGISNULL(2))
 | 
						|
		ereport(ERROR,
 | 
						|
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | 
						|
				 errmsg("relation fork cannot be null")));
 | 
						|
	forkName = PG_GETARG_TEXT_PP(2);
 | 
						|
	forkString = text_to_cstring(forkName);
 | 
						|
	forkNumber = forkname_to_number(forkString);
 | 
						|
 | 
						|
	/* Open relation and check privileges. */
 | 
						|
	rel = relation_open(relOid, AccessShareLock);
 | 
						|
	aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT);
 | 
						|
	if (aclresult != ACLCHECK_OK)
 | 
						|
		aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind), get_rel_name(relOid));
 | 
						|
 | 
						|
	/* Check that the fork exists. */
 | 
						|
	if (!smgrexists(RelationGetSmgr(rel), forkNumber))
 | 
						|
		ereport(ERROR,
 | 
						|
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | 
						|
				 errmsg("fork \"%s\" does not exist for this relation",
 | 
						|
						forkString)));
 | 
						|
 | 
						|
	/* Validate block numbers, or handle nulls. */
 | 
						|
	nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber);
 | 
						|
	if (PG_ARGISNULL(3))
 | 
						|
		first_block = 0;
 | 
						|
	else
 | 
						|
	{
 | 
						|
		first_block = PG_GETARG_INT64(3);
 | 
						|
		if (first_block < 0 || first_block >= nblocks)
 | 
						|
			ereport(ERROR,
 | 
						|
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | 
						|
					 errmsg("starting block number must be between 0 and %lld",
 | 
						|
							(long long) (nblocks - 1))));
 | 
						|
	}
 | 
						|
	if (PG_ARGISNULL(4))
 | 
						|
		last_block = nblocks - 1;
 | 
						|
	else
 | 
						|
	{
 | 
						|
		last_block = PG_GETARG_INT64(4);
 | 
						|
		if (last_block < 0 || last_block >= nblocks)
 | 
						|
			ereport(ERROR,
 | 
						|
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | 
						|
					 errmsg("ending block number must be between 0 and %lld",
 | 
						|
							(long long) (nblocks - 1))));
 | 
						|
	}
 | 
						|
 | 
						|
	/* Now we're ready to do the real work. */
 | 
						|
	if (ptype == PREWARM_PREFETCH)
 | 
						|
	{
 | 
						|
#ifdef USE_PREFETCH
 | 
						|
 | 
						|
		/*
 | 
						|
		 * In prefetch mode, we just hint the OS to read the blocks, but we
 | 
						|
		 * don't know whether it really does it, and we don't wait for it to
 | 
						|
		 * finish.
 | 
						|
		 *
 | 
						|
		 * It would probably be better to pass our prefetch requests in chunks
 | 
						|
		 * of a megabyte or maybe even a whole segment at a time, but there's
 | 
						|
		 * no practical way to do that at present without a gross modularity
 | 
						|
		 * violation, so we just do this.
 | 
						|
		 */
 | 
						|
		for (block = first_block; block <= last_block; ++block)
 | 
						|
		{
 | 
						|
			CHECK_FOR_INTERRUPTS();
 | 
						|
			PrefetchBuffer(rel, forkNumber, block);
 | 
						|
			++blocks_done;
 | 
						|
		}
 | 
						|
#else
 | 
						|
		ereport(ERROR,
 | 
						|
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 | 
						|
				 errmsg("prefetch is not supported by this build")));
 | 
						|
#endif
 | 
						|
	}
 | 
						|
	else if (ptype == PREWARM_READ)
 | 
						|
	{
 | 
						|
		/*
 | 
						|
		 * In read mode, we actually read the blocks, but not into shared
 | 
						|
		 * buffers.  This is more portable than prefetch mode (it works
 | 
						|
		 * everywhere) and is synchronous.
 | 
						|
		 */
 | 
						|
		for (block = first_block; block <= last_block; ++block)
 | 
						|
		{
 | 
						|
			CHECK_FOR_INTERRUPTS();
 | 
						|
			smgrread(RelationGetSmgr(rel), forkNumber, block, blockbuffer.data);
 | 
						|
			++blocks_done;
 | 
						|
		}
 | 
						|
	}
 | 
						|
	else if (ptype == PREWARM_BUFFER)
 | 
						|
	{
 | 
						|
		/*
 | 
						|
		 * In buffer mode, we actually pull the data into shared_buffers.
 | 
						|
		 */
 | 
						|
		for (block = first_block; block <= last_block; ++block)
 | 
						|
		{
 | 
						|
			Buffer		buf;
 | 
						|
 | 
						|
			CHECK_FOR_INTERRUPTS();
 | 
						|
			buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
 | 
						|
			ReleaseBuffer(buf);
 | 
						|
			++blocks_done;
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	/* Close relation, release lock. */
 | 
						|
	relation_close(rel, AccessShareLock);
 | 
						|
 | 
						|
	PG_RETURN_INT64(blocks_done);
 | 
						|
}
 |