mirror of
https://github.com/postgres/postgres.git
synced 2025-11-28 11:44:57 +03:00
Allow to trigger kernel writeback after a configurable number of writes.
Currently, writes to the main data files of PostgreSQL all go through the OS page cache. This means that some operating systems can end up collecting a large number of dirty buffers in their respective page caches. When these dirty buffers are flushed to storage rapidly, be it because of fsync(), timeouts, or dirty ratios, latency for other reads and writes can increase massively. This is the primary reason for regular massive stalls observed in real-world scenarios and artificial benchmarks; on rotating disks, stalls on the order of hundreds of seconds have been observed. On Linux it is possible to control this by reducing the global dirty limits significantly, reducing the above problem. But global configuration is rather problematic because it'll affect other applications; also, PostgreSQL itself doesn't always want this behavior, e.g. for temporary files it's undesirable. Several operating systems allow some control over the kernel page cache. Linux has sync_file_range(2); several POSIX systems have msync(2) and posix_fadvise(2). sync_file_range(2) is preferable because it requires no special setup, whereas msync() requires the to-be-flushed range to be mmap'ed. For the purpose of flushing dirty data, posix_fadvise(2) is the worst alternative, as flushing dirty data is just a side effect of POSIX_FADV_DONTNEED, which also removes the pages from the page cache. Thus the feature is enabled by default only on Linux, but can be enabled on all systems that have any of the above APIs. While desirable and likely possible, this patch does not contain an implementation for Windows. With the infrastructure added, writes made via checkpointer, bgwriter and normal user backends can be flushed after a configurable number of writes. 
Each of these sources of writes is controlled by a separate GUC: checkpointer_flush_after, bgwriter_flush_after and backend_flush_after respectively. They're separate because the number of writes after which flushing is beneficial differs between them, and because the performance considerations of controlled flushing for each of these are different. A later patch will add checkpoint sorting - after that, flushes from the checkpoint will almost always be desirable. Bgwriter flushes are most of the time going to be random, which are slow on lots of storage hardware. Flushing in backends works well if the storage and bgwriter can keep up, but if not it can have negative consequences. This patch is likely to have negative performance consequences without checkpoint sorting, but unfortunately so has sorting without flush control. Discussion: alpine.DEB.2.10.1506011320000.28433@sto Author: Fabien Coelho and Andres Freund
This commit is contained in:
@@ -662,6 +662,56 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
#endif /* USE_PREFETCH */
|
||||
}
|
||||
|
||||
/*
|
||||
* mdwriteback() -- Tell the kernel to write pages back to storage.
|
||||
*
|
||||
* This accepts a range of blocks because flushing several pages at once is
|
||||
* considerably more efficient than doing so individually.
|
||||
*/
|
||||
void
|
||||
mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
|
||||
{
|
||||
/*
|
||||
* Issue flush requests in as few requests as possible; have to split at
|
||||
* segment boundaries though, since those are actually separate files.
|
||||
*/
|
||||
while (nblocks != 0)
|
||||
{
|
||||
int nflush = nblocks;
|
||||
off_t seekpos;
|
||||
MdfdVec *v;
|
||||
int segnum_start,
|
||||
segnum_end;
|
||||
|
||||
v = _mdfd_getseg(reln, forknum, blocknum, false,
|
||||
EXTENSION_RETURN_NULL);
|
||||
|
||||
/*
|
||||
* We might be flushing buffers of already removed relations, that's
|
||||
* ok, just ignore that case.
|
||||
*/
|
||||
if (!v)
|
||||
return;
|
||||
|
||||
/* compute offset inside the current segment */
|
||||
segnum_start = blocknum / RELSEG_SIZE;
|
||||
|
||||
/* compute number of desired writes within the current segment */
|
||||
segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
|
||||
if (segnum_start != segnum_end)
|
||||
nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
|
||||
|
||||
Assert(nflush >= 1);
|
||||
Assert(nflush <= nblocks);
|
||||
|
||||
seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
|
||||
|
||||
FileWriteback(v->mdfd_vfd, seekpos, BLCKSZ * nflush);
|
||||
|
||||
nblocks -= nflush;
|
||||
blocknum += nflush;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* mdread() -- Read the specified block from a relation.
|
||||
|
||||
@@ -53,6 +53,8 @@ typedef struct f_smgr
|
||||
BlockNumber blocknum, char *buffer);
|
||||
void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, int nblocks);
|
||||
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
|
||||
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
@@ -66,8 +68,8 @@ typedef struct f_smgr
|
||||
static const f_smgr smgrsw[] = {
|
||||
/* magnetic disk */
|
||||
{mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
|
||||
mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
|
||||
mdpreckpt, mdsync, mdpostckpt
|
||||
mdprefetch, mdread, mdwrite, mdwriteback, mdnblocks, mdtruncate,
|
||||
mdimmedsync, mdpreckpt, mdsync, mdpostckpt
|
||||
}
|
||||
};
|
||||
|
||||
@@ -649,6 +651,19 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
buffer, skipFsync);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* smgrwriteback() -- Trigger kernel writeback for the supplied range of
|
||||
* blocks.
|
||||
*/
|
||||
void
|
||||
smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
int nblocks)
|
||||
{
|
||||
(*(smgrsw[reln->smgr_which].smgr_writeback)) (reln, forknum, blocknum,
|
||||
nblocks);
|
||||
}
|
||||
|
||||
/*
|
||||
* smgrnblocks() -- Calculate the number of blocks in the
|
||||
* supplied relation.
|
||||
|
||||
Reference in New Issue
Block a user