mirror of
https://github.com/postgres/postgres.git
synced 2025-09-02 04:21:28 +03:00
Introduce PG_IO_ALIGN_SIZE and align all I/O buffers.
In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned. The exact requirements vary by OS and file system (typically
sectors and/or memory pages). The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sectors and common memory page size. There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.
Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.
Three classes of I/O buffers for regular data pages are adjusted:
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if the compiler supports the required alignment.
(3) The buffer pool is also aligned in shared memory.
WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.
BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).
If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0. This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead. That's an existing and tested class of system.
Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.
Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
This commit is contained in:
@@ -78,9 +78,12 @@ InitBufferPool(void)
|
||||
NBuffers * sizeof(BufferDescPadded),
|
||||
&foundDescs);
|
||||
|
||||
/* Align buffer pool on IO page size boundary. */
|
||||
BufferBlocks = (char *)
|
||||
ShmemInitStruct("Buffer Blocks",
|
||||
NBuffers * (Size) BLCKSZ, &foundBufs);
|
||||
TYPEALIGN(PG_IO_ALIGN_SIZE,
|
||||
ShmemInitStruct("Buffer Blocks",
|
||||
NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
|
||||
&foundBufs));
|
||||
|
||||
/* Align condition variables to cacheline boundary. */
|
||||
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
|
||||
@@ -163,7 +166,8 @@ BufferShmemSize(void)
|
||||
/* to allow aligning buffer descriptors */
|
||||
size = add_size(size, PG_CACHE_LINE_SIZE);
|
||||
|
||||
/* size of data pages */
|
||||
/* size of data pages, plus alignment padding */
|
||||
size = add_size(size, PG_IO_ALIGN_SIZE);
|
||||
size = add_size(size, mul_size(NBuffers, BLCKSZ));
|
||||
|
||||
/* size of stuff controlled by freelist.c */
|
||||
|
@@ -4250,7 +4250,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
|
||||
bool use_wal;
|
||||
BlockNumber nblocks;
|
||||
BlockNumber blkno;
|
||||
PGAlignedBlock buf;
|
||||
PGIOAlignedBlock buf;
|
||||
BufferAccessStrategy bstrategy_src;
|
||||
BufferAccessStrategy bstrategy_dst;
|
||||
|
||||
|
@@ -744,8 +744,11 @@ GetLocalBufferStorage(void)
|
||||
/* And don't overflow MaxAllocSize, either */
|
||||
num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ);
|
||||
|
||||
cur_block = (char *) MemoryContextAlloc(LocalBufferContext,
|
||||
num_bufs * BLCKSZ);
|
||||
/* Buffers should be I/O aligned. */
|
||||
cur_block = (char *)
|
||||
TYPEALIGN(PG_IO_ALIGN_SIZE,
|
||||
MemoryContextAlloc(LocalBufferContext,
|
||||
num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE));
|
||||
next_buf_in_block = 0;
|
||||
num_bufs_in_block = num_bufs;
|
||||
}
|
||||
|
@@ -95,6 +95,12 @@ struct BufFile
|
||||
off_t curOffset; /* offset part of current pos */
|
||||
int pos; /* next read/write position in buffer */
|
||||
int nbytes; /* total # of valid bytes in buffer */
|
||||
|
||||
/*
|
||||
* XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid
|
||||
* wasting per-file alignment padding when some users create many
|
||||
* files.
|
||||
*/
|
||||
PGAlignedBlock buffer;
|
||||
};
|
||||
|
||||
|
@@ -1522,7 +1522,10 @@ PageSetChecksumCopy(Page page, BlockNumber blkno)
|
||||
* and second to avoid wasting space in processes that never call this.
|
||||
*/
|
||||
if (pageCopy == NULL)
|
||||
pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
|
||||
pageCopy = MemoryContextAllocAligned(TopMemoryContext,
|
||||
BLCKSZ,
|
||||
PG_IO_ALIGN_SIZE,
|
||||
0);
|
||||
|
||||
memcpy(pageCopy, (char *) page, BLCKSZ);
|
||||
((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
|
||||
|
@@ -453,6 +453,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
int nbytes;
|
||||
MdfdVec *v;
|
||||
|
||||
/* If this build supports direct I/O, the buffer must be I/O aligned. */
|
||||
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
|
||||
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
|
||||
|
||||
/* This assert is too expensive to have on normally ... */
|
||||
#ifdef CHECK_WRITE_VS_EXTEND
|
||||
Assert(blocknum >= mdnblocks(reln, forknum));
|
||||
@@ -783,6 +787,10 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
int nbytes;
|
||||
MdfdVec *v;
|
||||
|
||||
/* If this build supports direct I/O, the buffer must be I/O aligned. */
|
||||
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
|
||||
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
|
||||
|
||||
TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
|
||||
reln->smgr_rlocator.locator.spcOid,
|
||||
reln->smgr_rlocator.locator.dbOid,
|
||||
@@ -848,6 +856,10 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
int nbytes;
|
||||
MdfdVec *v;
|
||||
|
||||
/* If this build supports direct I/O, the buffer must be I/O aligned. */
|
||||
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
|
||||
Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
|
||||
|
||||
/* This assert is too expensive to have on normally ... */
|
||||
#ifdef CHECK_WRITE_VS_EXTEND
|
||||
Assert(blocknum < mdnblocks(reln, forknum));
|
||||
@@ -1429,7 +1441,8 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
*/
|
||||
if (nblocks < ((BlockNumber) RELSEG_SIZE))
|
||||
{
|
||||
char *zerobuf = palloc0(BLCKSZ);
|
||||
char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
|
||||
MCXT_ALLOC_ZERO);
|
||||
|
||||
mdextend(reln, forknum,
|
||||
nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
|
||||
|
Reference in New Issue
Block a user