1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-02 04:21:28 +03:00

Introduce PG_IO_ALIGN_SIZE and align all I/O buffers.

In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned.  The exact requirements vary by OS and file system (typically
sectors and/or memory pages).  The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sectors and common memory page size.  There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.

Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.

Three classes of I/O buffers for regular data pages are adjusted:
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler.  (3) The buffer
pool is also aligned in shared memory.

WAL buffers were already aligned on XLOG_BLCKSZ.  It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGNED_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.

BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).

If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0.  This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead.  That's an existing and tested class of system.

Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.

Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
This commit is contained in:
Thomas Munro
2023-04-08 10:38:09 +12:00
parent d73c285af5
commit faeedbcefd
26 changed files with 108 additions and 45 deletions

View File

@@ -78,9 +78,12 @@ InitBufferPool(void)
NBuffers * sizeof(BufferDescPadded),
&foundDescs);
/* Align buffer pool on IO page size boundary. */
BufferBlocks = (char *)
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ, &foundBufs);
TYPEALIGN(PG_IO_ALIGN_SIZE,
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
&foundBufs));
/* Align condition variables to cacheline boundary. */
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
@@ -163,7 +166,8 @@ BufferShmemSize(void)
/* to allow aligning buffer descriptors */
size = add_size(size, PG_CACHE_LINE_SIZE);
/* size of data pages */
/* size of data pages, plus alignment padding */
size = add_size(size, PG_IO_ALIGN_SIZE);
size = add_size(size, mul_size(NBuffers, BLCKSZ));
/* size of stuff controlled by freelist.c */

View File

@@ -4250,7 +4250,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
bool use_wal;
BlockNumber nblocks;
BlockNumber blkno;
PGAlignedBlock buf;
PGIOAlignedBlock buf;
BufferAccessStrategy bstrategy_src;
BufferAccessStrategy bstrategy_dst;

View File

@@ -744,8 +744,11 @@ GetLocalBufferStorage(void)
/* And don't overflow MaxAllocSize, either */
num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ);
cur_block = (char *) MemoryContextAlloc(LocalBufferContext,
num_bufs * BLCKSZ);
/* Buffers should be I/O aligned. */
cur_block = (char *)
TYPEALIGN(PG_IO_ALIGN_SIZE,
MemoryContextAlloc(LocalBufferContext,
num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE));
next_buf_in_block = 0;
num_bufs_in_block = num_bufs;
}

View File

@@ -95,6 +95,12 @@ struct BufFile
off_t curOffset; /* offset part of current pos */
int pos; /* next read/write position in buffer */
int nbytes; /* total # of valid bytes in buffer */
/*
* XXX Should ideally us PGIOAlignedBlock, but might need a way to avoid
* wasting per-file alignment padding when some users create many
* files.
*/
PGAlignedBlock buffer;
};

View File

@@ -1522,7 +1522,10 @@ PageSetChecksumCopy(Page page, BlockNumber blkno)
* and second to avoid wasting space in processes that never call this.
*/
if (pageCopy == NULL)
pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
pageCopy = MemoryContextAllocAligned(TopMemoryContext,
BLCKSZ,
PG_IO_ALIGN_SIZE,
0);
memcpy(pageCopy, (char *) page, BLCKSZ);
((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);

View File

@@ -453,6 +453,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
int nbytes;
MdfdVec *v;
/* If this build supports direct I/O, the buffer must be I/O aligned. */
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
Assert(blocknum >= mdnblocks(reln, forknum));
@@ -783,6 +787,10 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
int nbytes;
MdfdVec *v;
/* If this build supports direct I/O, the buffer must be I/O aligned. */
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
@@ -848,6 +856,10 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
int nbytes;
MdfdVec *v;
/* If this build supports direct I/O, the buffer must be I/O aligned. */
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
Assert(blocknum < mdnblocks(reln, forknum));
@@ -1429,7 +1441,8 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
*/
if (nblocks < ((BlockNumber) RELSEG_SIZE))
{
char *zerobuf = palloc0(BLCKSZ);
char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
MCXT_ALLOC_ZERO);
mdextend(reln, forknum,
nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,