
Make large sequential scans and VACUUMs work in a limited-size "ring" of
buffers, rather than blowing out the whole shared-buffer arena.  Aside from
avoiding cache spoliation, this fixes the problem that VACUUM formerly tended
to cause a WAL flush for every page it modified, because we had it hacked to
use only a single buffer.  Those flushes will now occur only once per
ring-ful.  The exact ring size, and the threshold for seqscans to switch into
the ring usage pattern, remain under debate; but the infrastructure seems
done.  The key bit of infrastructure is a new optional BufferAccessStrategy
object that can be passed to ReadBuffer operations; this replaces the former
StrategyHintVacuum API.
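
(Illustrative only, not part of the patch: a minimal sketch of how a
bulk-reading caller fits the new pieces together.  GetAccessStrategy,
ReadBufferWithStrategy, FreeAccessStrategy, ReleaseBuffer and
RelationGetNumberOfBlocks are real bufmgr entry points introduced or used
by this patch; the wrapper function itself is hypothetical.)

	#include "postgres.h"
	#include "storage/bufmgr.h"
	#include "utils/rel.h"

	static void
	scan_with_ring(Relation rel)
	{
		/* ask for the small ring used by large sequential scans */
		BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
		BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
		BlockNumber blkno;

		for (blkno = 0; blkno < nblocks; blkno++)
		{
			/* pages cycle through the ring instead of flooding shared buffers */
			Buffer		buf = ReadBufferWithStrategy(rel, blkno, strategy);

			/* ... examine BufferGetPage(buf) here ... */

			ReleaseBuffer(buf);
		}

		FreeAccessStrategy(strategy);
	}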

This patch also changes the buffer usage-count methodology a bit: we now
advance usage_count when first pinning a buffer, rather than when last
unpinning it.  To preserve the behavior that a buffer's lifetime starts to
decrease when it's released, the clock sweep code is modified to not decrement
usage_count of pinned buffers.
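
(Again illustrative only: a simplified sketch of the resulting clock-sweep
rule.  The real logic lives in StrategyGetBuffer and PinBuffer; the struct
and field names below are stand-ins, not the actual buffer descriptor.)

	#include <stdbool.h>

	/* simplified stand-in for a shared buffer header */
	typedef struct SketchBuffer
	{
		int		refcount;		/* active pins */
		int		usage_count;	/* now advanced when the buffer is first pinned */
	} SketchBuffer;

	/* Returns true if the sweep may reclaim this buffer on the current pass. */
	static bool
	sweep_can_reclaim(SketchBuffer *buf)
	{
		if (buf->refcount > 0)
			return false;		/* pinned: skip it, leave usage_count alone */

		if (buf->usage_count > 0)
		{
			buf->usage_count--;	/* unpinned: start aging it now that it's released */
			return false;
		}

		return true;			/* unpinned and fully aged: reclaim */
	}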

Work not done in this commit: teach GiST and GIN indexes to use the vacuum
BufferAccessStrategy for vacuum-driven fetches.

Original patch by Simon, reworked by Heikki and again by Tom.
Commit d526575f89 (parent 0a6f2ee84d)
Author: Tom Lane
Date: 2007-05-30 20:12:03 +00:00
24 changed files with 722 additions and 262 deletions

src/backend/access/hash/hash.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.94 2007/05/03 16:45:58 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.95 2007/05/30 20:11:51 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
@@ -547,8 +547,9 @@ loop_top:
vacuum_delay_point();
buf = _hash_getbuf(rel, blkno, HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
info->strategy);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == cur_bucket);
@@ -596,7 +597,8 @@ loop_top:
/* If we deleted anything, try to compact free space */
if (bucket_dirty)
_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
_hash_squeezebucket(rel, cur_bucket, bucket_blkno,
info->strategy);
/* Release bucket lock */
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

src/backend/access/hash/hashovfl.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.57 2007/05/03 16:45:58 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.58 2007/05/30 20:11:51 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
@@ -362,6 +362,9 @@ _hash_firstfreebit(uint32 map)
* Remove this overflow page from its bucket's chain, and mark the page as
* free. On entry, ovflbuf is write-locked; it is released before exiting.
*
* Since this function is invoked in VACUUM, we provide an access strategy
* parameter that controls fetches of the bucket pages.
*
* Returns the block number of the page that followed the given page
* in the bucket, or InvalidBlockNumber if no following page.
*
@@ -370,7 +373,8 @@ _hash_firstfreebit(uint32 map)
* on the bucket, too.
*/
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf)
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
BufferAccessStrategy bstrategy)
{
HashMetaPage metap;
Buffer metabuf;
@@ -413,8 +417,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
*/
if (BlockNumberIsValid(prevblkno))
{
Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
Buffer prevbuf = _hash_getbuf_with_strategy(rel,
prevblkno,
HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
bstrategy);
Page prevpage = BufferGetPage(prevbuf);
HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
@@ -424,8 +431,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
}
if (BlockNumberIsValid(nextblkno))
{
Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE,
LH_OVERFLOW_PAGE);
Buffer nextbuf = _hash_getbuf_with_strategy(rel,
nextblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
Page nextpage = BufferGetPage(nextbuf);
HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);
@@ -434,6 +444,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
_hash_wrtbuf(rel, nextbuf);
}
/* Note: bstrategy is intentionally not used for metapage and bitmap */
/* Read the metapage so we can determine which bitmap page to use */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
@@ -558,11 +570,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
*
* Caller must hold exclusive lock on the target bucket. This allows
* us to safely lock multiple pages in the bucket.
*
* Since this function is invoked in VACUUM, we provide an access strategy
* parameter that controls fetches of the bucket pages.
*/
void
_hash_squeezebucket(Relation rel,
Bucket bucket,
BlockNumber bucket_blkno)
BlockNumber bucket_blkno,
BufferAccessStrategy bstrategy)
{
Buffer wbuf;
Buffer rbuf = 0;
@@ -581,7 +597,11 @@ _hash_squeezebucket(Relation rel,
* start squeezing into the base bucket page.
*/
wblkno = bucket_blkno;
wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE);
wbuf = _hash_getbuf_with_strategy(rel,
wblkno,
HASH_WRITE,
LH_BUCKET_PAGE,
bstrategy);
wpage = BufferGetPage(wbuf);
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
@@ -595,8 +615,10 @@ _hash_squeezebucket(Relation rel,
}
/*
* find the last page in the bucket chain by starting at the base bucket
* page and working forward.
* Find the last page in the bucket chain by starting at the base bucket
* page and working forward. Note: we assume that a hash bucket chain is
* usually smaller than the buffer ring being used by VACUUM, else using
* the access strategy here would be counterproductive.
*/
ropaque = wopaque;
do
@@ -604,7 +626,11 @@ _hash_squeezebucket(Relation rel,
rblkno = ropaque->hasho_nextblkno;
if (ropaque != wopaque)
_hash_relbuf(rel, rbuf);
rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
rbuf = _hash_getbuf_with_strategy(rel,
rblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
rpage = BufferGetPage(rbuf);
ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
Assert(ropaque->hasho_bucket == bucket);
@@ -644,7 +670,11 @@ _hash_squeezebucket(Relation rel,
return;
}
wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
wbuf = _hash_getbuf_with_strategy(rel,
wblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
wpage = BufferGetPage(wbuf);
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
Assert(wopaque->hasho_bucket == bucket);
@@ -688,15 +718,19 @@ _hash_squeezebucket(Relation rel,
/* yes, so release wbuf lock first */
_hash_wrtbuf(rel, wbuf);
/* free this overflow page (releases rbuf) */
_hash_freeovflpage(rel, rbuf);
_hash_freeovflpage(rel, rbuf, bstrategy);
/* done */
return;
}
/* free this overflow page, then get the previous one */
_hash_freeovflpage(rel, rbuf);
_hash_freeovflpage(rel, rbuf, bstrategy);
rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
rbuf = _hash_getbuf_with_strategy(rel,
rblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
rpage = BufferGetPage(rbuf);
ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
Assert(ropaque->hasho_bucket == bucket);

src/backend/access/hash/hashpage.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.67 2007/05/03 16:45:58 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.68 2007/05/30 20:11:51 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@@ -214,6 +214,34 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno)
return buf;
}
/*
* _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
*
* This is identical to _hash_getbuf() but also allows a buffer access
* strategy to be specified. We use this for VACUUM operations.
*/
Buffer
_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
int access, int flags,
BufferAccessStrategy bstrategy)
{
Buffer buf;
if (blkno == P_NEW)
elog(ERROR, "hash AM does not use P_NEW");
buf = ReadBufferWithStrategy(rel, blkno, bstrategy);
if (access != HASH_NOLOCK)
LockBuffer(buf, access);
/* ref count and lock type are correct */
_hash_checkpage(rel, buf, flags);
return buf;
}
/*
* _hash_relbuf() -- release a locked buffer.
*
@@ -840,5 +868,5 @@ _hash_splitbucket(Relation rel,
_hash_wrtbuf(rel, obuf);
_hash_wrtbuf(rel, nbuf);
_hash_squeezebucket(rel, obucket, start_oblkno);
_hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}

src/backend/access/heap/heapam.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.233 2007/05/27 03:50:38 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.234 2007/05/30 20:11:53 tgl Exp $
*
*
* INTERFACE ROUTINES
@@ -83,6 +83,24 @@ initscan(HeapScanDesc scan, ScanKey key)
*/
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
/*
* If the table is large relative to NBuffers, use a bulk-read access
* strategy, else use the default random-access strategy. During a
* rescan, don't make a new strategy object if we don't have to.
*/
if (scan->rs_nblocks > NBuffers / 4 &&
!scan->rs_rd->rd_istemp)
{
if (scan->rs_strategy == NULL)
scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
}
else
{
if (scan->rs_strategy != NULL)
FreeAccessStrategy(scan->rs_strategy);
scan->rs_strategy = NULL;
}
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
@@ -123,9 +141,17 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
Assert(page < scan->rs_nblocks);
scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
scan->rs_rd,
page);
/* release previous scan buffer, if any */
if (BufferIsValid(scan->rs_cbuf))
{
ReleaseBuffer(scan->rs_cbuf);
scan->rs_cbuf = InvalidBuffer;
}
/* read page using selected strategy */
scan->rs_cbuf = ReadBufferWithStrategy(scan->rs_rd,
page,
scan->rs_strategy);
scan->rs_cblock = page;
if (!scan->rs_pageatatime)
@@ -938,6 +964,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_rd = relation;
scan->rs_snapshot = snapshot;
scan->rs_nkeys = nkeys;
scan->rs_strategy = NULL; /* set in initscan */
/*
* we can use page-at-a-time mode if it's an MVCC-safe snapshot
@@ -1007,6 +1034,9 @@ heap_endscan(HeapScanDesc scan)
if (scan->rs_key)
pfree(scan->rs_key);
if (scan->rs_strategy != NULL)
FreeAccessStrategy(scan->rs_strategy);
pfree(scan);
}

src/backend/access/nbtree/nbtree.c

@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.154 2007/01/05 22:19:23 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -786,9 +786,10 @@ restart:
/*
* We can't use _bt_getbuf() here because it always applies
* _bt_checkpage(), which will barf on an all-zero page. We want to
* recycle all-zero pages, not fail.
* recycle all-zero pages, not fail. Also, we want to use a nondefault
* buffer access strategy.
*/
buf = ReadBuffer(rel, blkno);
buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
LockBuffer(buf, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

src/backend/access/transam/xlog.c

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.269 2007/05/20 21:08:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.270 2007/05/30 20:11:55 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1799,6 +1799,36 @@ XLogFlush(XLogRecPtr record)
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}
/*
* Test whether XLOG data has been flushed up to (at least) the given position.
*
* Returns true if a flush is still needed. (It may be that someone else
* is already in process of flushing that far, however.)
*/
bool
XLogNeedsFlush(XLogRecPtr record)
{
/* Quick exit if already known flushed */
if (XLByteLE(record, LogwrtResult.Flush))
return false;
/* read LogwrtResult and update local state */
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
LogwrtResult = xlogctl->LogwrtResult;
SpinLockRelease(&xlogctl->info_lck);
}
/* check again */
if (XLByteLE(record, LogwrtResult.Flush))
return false;
return true;
}
/*
* Create a new XLOG file segment, or open a pre-existing one.
*