mirror of
https://github.com/postgres/postgres.git
synced 2025-10-24 01:29:19 +03:00
In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned. The exact requirements vary by OS and file system (typically
sectors and/or memory pages). The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sectors and common memory page size. There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.
Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.
Three classes of I/O buffers for regular data pages are adjusted:
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler.
(3) The buffer pool is also aligned in shared memory.
WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.
BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).
If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0. This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead. That's an existing and tested class of system.
Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.
Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
205 lines
5.6 KiB
C
205 lines
5.6 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* pg_prewarm.c
|
|
* prewarming utilities
|
|
*
|
|
* Copyright (c) 2010-2023, PostgreSQL Global Development Group
|
|
*
|
|
* IDENTIFICATION
|
|
* contrib/pg_prewarm/pg_prewarm.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include <sys/stat.h>
|
|
#include <unistd.h>
|
|
|
|
#include "access/relation.h"
|
|
#include "fmgr.h"
|
|
#include "miscadmin.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/smgr.h"
|
|
#include "utils/acl.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/rel.h"
|
|
|
|
PG_MODULE_MAGIC;
|
|
|
|
PG_FUNCTION_INFO_V1(pg_prewarm);
|
|
|
|
/*
 * Strategy used by pg_prewarm() to warm a relation; selected by the
 * function's "mode" text argument.
 */
typedef enum
{
	PREWARM_PREFETCH,			/* 'prefetch': hint the OS to read the
								 * blocks, without waiting for it */
	PREWARM_READ,				/* 'read': read the blocks, but not into
								 * shared buffers */
	PREWARM_BUFFER				/* 'buffer': pull the blocks into
								 * shared_buffers */
} PrewarmType;
|
|
|
|
/*
 * Scratch block used as the destination of smgrread() in PREWARM_READ mode.
 * Nothing in this file ever inspects the data read into it.
 */
static PGIOAlignedBlock blockbuffer;
|
|
|
|
/*
|
|
* pg_prewarm(regclass, mode text, fork text,
|
|
* first_block int8, last_block int8)
|
|
*
|
|
* The first argument is the relation to be prewarmed; the second controls
|
|
* how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'.
|
|
* The third is the name of the relation fork to be prewarmed. The fourth
|
|
* and fifth arguments specify the first and last block to be prewarmed.
|
|
* If the fourth argument is NULL, it will be taken as 0; if the fifth argument
|
|
* is NULL, it will be taken as the number of blocks in the relation. The
|
|
* return value is the number of blocks successfully prewarmed.
|
|
*/
|
|
Datum
|
|
pg_prewarm(PG_FUNCTION_ARGS)
|
|
{
|
|
Oid relOid;
|
|
text *forkName;
|
|
text *type;
|
|
int64 first_block;
|
|
int64 last_block;
|
|
int64 nblocks;
|
|
int64 blocks_done = 0;
|
|
int64 block;
|
|
Relation rel;
|
|
ForkNumber forkNumber;
|
|
char *forkString;
|
|
char *ttype;
|
|
PrewarmType ptype;
|
|
AclResult aclresult;
|
|
|
|
/* Basic sanity checking. */
|
|
if (PG_ARGISNULL(0))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("relation cannot be null")));
|
|
relOid = PG_GETARG_OID(0);
|
|
if (PG_ARGISNULL(1))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("prewarm type cannot be null")));
|
|
type = PG_GETARG_TEXT_PP(1);
|
|
ttype = text_to_cstring(type);
|
|
if (strcmp(ttype, "prefetch") == 0)
|
|
ptype = PREWARM_PREFETCH;
|
|
else if (strcmp(ttype, "read") == 0)
|
|
ptype = PREWARM_READ;
|
|
else if (strcmp(ttype, "buffer") == 0)
|
|
ptype = PREWARM_BUFFER;
|
|
else
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("invalid prewarm type"),
|
|
errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\".")));
|
|
PG_RETURN_INT64(0); /* Placate compiler. */
|
|
}
|
|
if (PG_ARGISNULL(2))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("relation fork cannot be null")));
|
|
forkName = PG_GETARG_TEXT_PP(2);
|
|
forkString = text_to_cstring(forkName);
|
|
forkNumber = forkname_to_number(forkString);
|
|
|
|
/* Open relation and check privileges. */
|
|
rel = relation_open(relOid, AccessShareLock);
|
|
aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT);
|
|
if (aclresult != ACLCHECK_OK)
|
|
aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind), get_rel_name(relOid));
|
|
|
|
/* Check that the fork exists. */
|
|
if (!smgrexists(RelationGetSmgr(rel), forkNumber))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("fork \"%s\" does not exist for this relation",
|
|
forkString)));
|
|
|
|
/* Validate block numbers, or handle nulls. */
|
|
nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber);
|
|
if (PG_ARGISNULL(3))
|
|
first_block = 0;
|
|
else
|
|
{
|
|
first_block = PG_GETARG_INT64(3);
|
|
if (first_block < 0 || first_block >= nblocks)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("starting block number must be between 0 and %lld",
|
|
(long long) (nblocks - 1))));
|
|
}
|
|
if (PG_ARGISNULL(4))
|
|
last_block = nblocks - 1;
|
|
else
|
|
{
|
|
last_block = PG_GETARG_INT64(4);
|
|
if (last_block < 0 || last_block >= nblocks)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("ending block number must be between 0 and %lld",
|
|
(long long) (nblocks - 1))));
|
|
}
|
|
|
|
/* Now we're ready to do the real work. */
|
|
if (ptype == PREWARM_PREFETCH)
|
|
{
|
|
#ifdef USE_PREFETCH
|
|
|
|
/*
|
|
* In prefetch mode, we just hint the OS to read the blocks, but we
|
|
* don't know whether it really does it, and we don't wait for it to
|
|
* finish.
|
|
*
|
|
* It would probably be better to pass our prefetch requests in chunks
|
|
* of a megabyte or maybe even a whole segment at a time, but there's
|
|
* no practical way to do that at present without a gross modularity
|
|
* violation, so we just do this.
|
|
*/
|
|
for (block = first_block; block <= last_block; ++block)
|
|
{
|
|
CHECK_FOR_INTERRUPTS();
|
|
PrefetchBuffer(rel, forkNumber, block);
|
|
++blocks_done;
|
|
}
|
|
#else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("prefetch is not supported by this build")));
|
|
#endif
|
|
}
|
|
else if (ptype == PREWARM_READ)
|
|
{
|
|
/*
|
|
* In read mode, we actually read the blocks, but not into shared
|
|
* buffers. This is more portable than prefetch mode (it works
|
|
* everywhere) and is synchronous.
|
|
*/
|
|
for (block = first_block; block <= last_block; ++block)
|
|
{
|
|
CHECK_FOR_INTERRUPTS();
|
|
smgrread(RelationGetSmgr(rel), forkNumber, block, blockbuffer.data);
|
|
++blocks_done;
|
|
}
|
|
}
|
|
else if (ptype == PREWARM_BUFFER)
|
|
{
|
|
/*
|
|
* In buffer mode, we actually pull the data into shared_buffers.
|
|
*/
|
|
for (block = first_block; block <= last_block; ++block)
|
|
{
|
|
Buffer buf;
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
|
|
ReleaseBuffer(buf);
|
|
++blocks_done;
|
|
}
|
|
}
|
|
|
|
/* Close relation, release lock. */
|
|
relation_close(rel, AccessShareLock);
|
|
|
|
PG_RETURN_INT64(blocks_done);
|
|
}
|