Mirror of https://github.com/postgres/postgres.git (synced 2025-07-21 16:02:15 +03:00)
Make large sequential scans and VACUUMs work in a limited-size "ring" of
buffers, rather than blowing out the whole shared-buffer arena.  Aside from
avoiding cache spoliation, this fixes the problem that VACUUM formerly tended
to cause a WAL flush for every page it modified, because we had it hacked to
use only a single buffer.  Those flushes will now occur only once per ring-ful.
The exact ring size, and the threshold for seqscans to switch into the ring
usage pattern, remain under debate; but the infrastructure seems done.  The
key bit of infrastructure is a new optional BufferAccessStrategy object that
can be passed to ReadBuffer operations; this replaces the former
StrategyHintVacuum API.

This patch also changes the buffer usage-count methodology a bit: we now
advance usage_count when first pinning a buffer, rather than when last
unpinning it.  To preserve the behavior that a buffer's lifetime starts to
decrease when it's released, the clock sweep code is modified to not decrement
usage_count of pinned buffers.

Work not done in this commit: teach GiST and GIN indexes to use the vacuum
BufferAccessStrategy for vacuum-driven fetches.

Original patch by Simon, reworked by Heikki and again by Tom.
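
For orientation, here is a rough sketch (not part of the commit itself) of how a
bulk-access code path uses the new API.  The relation "rel" and the block count
"nblocks" are hypothetical stand-ins; GetAccessStrategy, ReadBufferWithStrategy,
FreeAccessStrategy and the BAS_* strategy types are the pieces this patch
introduces or uses:

    /* Illustrative sketch only; assumes "rel" is an open Relation and
     * "nblocks" is its length in blocks. */
    BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_VACUUM);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        /* reads are confined to the small ring instead of the whole arena */
        Buffer      buf = ReadBufferWithStrategy(rel, blkno, bstrategy);

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        /* ... examine or clean the page ... */
        UnlockReleaseBuffer(buf);
    }

    FreeAccessStrategy(bstrategy);

Sequential scans obtain a BAS_BULKREAD strategy the same way (see the initscan()
change below); passing NULL instead of a strategy object gives the old default
behavior.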
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.94 2007/05/03 16:45:58 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.95 2007/05/30 20:11:51 tgl Exp $
  *
  * NOTES
  *    This file contains only the public interface routines.
@@ -547,8 +547,9 @@ loop_top:

        vacuum_delay_point();

-        buf = _hash_getbuf(rel, blkno, HASH_WRITE,
-                           LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+        buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+                                         LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
+                                         info->strategy);
         page = BufferGetPage(buf);
         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
         Assert(opaque->hasho_bucket == cur_bucket);
@@ -596,7 +597,8 @@ loop_top:

         /* If we deleted anything, try to compact free space */
         if (bucket_dirty)
-            _hash_squeezebucket(rel, cur_bucket, bucket_blkno);
+            _hash_squeezebucket(rel, cur_bucket, bucket_blkno,
+                                info->strategy);

         /* Release bucket lock */
         _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.57 2007/05/03 16:45:58 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.58 2007/05/30 20:11:51 tgl Exp $
  *
  * NOTES
  *    Overflow pages look like ordinary relation pages.
@@ -362,6 +362,9 @@ _hash_firstfreebit(uint32 map)
  * Remove this overflow page from its bucket's chain, and mark the page as
  * free.  On entry, ovflbuf is write-locked; it is released before exiting.
  *
+ * Since this function is invoked in VACUUM, we provide an access strategy
+ * parameter that controls fetches of the bucket pages.
+ *
  * Returns the block number of the page that followed the given page
  * in the bucket, or InvalidBlockNumber if no following page.
  *
@@ -370,7 +373,8 @@ _hash_firstfreebit(uint32 map)
  * on the bucket, too.
  */
 BlockNumber
-_hash_freeovflpage(Relation rel, Buffer ovflbuf)
+_hash_freeovflpage(Relation rel, Buffer ovflbuf,
+                   BufferAccessStrategy bstrategy)
 {
     HashMetaPage metap;
     Buffer      metabuf;
@@ -413,8 +417,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
      */
     if (BlockNumberIsValid(prevblkno))
     {
-        Buffer      prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE,
-                                           LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+        Buffer      prevbuf = _hash_getbuf_with_strategy(rel,
+                                                         prevblkno,
+                                                         HASH_WRITE,
+                                                         LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
+                                                         bstrategy);
         Page        prevpage = BufferGetPage(prevbuf);
         HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

@@ -424,8 +431,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
     }
     if (BlockNumberIsValid(nextblkno))
     {
-        Buffer      nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE,
-                                           LH_OVERFLOW_PAGE);
+        Buffer      nextbuf = _hash_getbuf_with_strategy(rel,
+                                                         nextblkno,
+                                                         HASH_WRITE,
+                                                         LH_OVERFLOW_PAGE,
+                                                         bstrategy);
         Page        nextpage = BufferGetPage(nextbuf);
         HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

@@ -434,6 +444,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
         _hash_wrtbuf(rel, nextbuf);
     }

+    /* Note: bstrategy is intentionally not used for metapage and bitmap */
+
     /* Read the metapage so we can determine which bitmap page to use */
     metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
     metap = (HashMetaPage) BufferGetPage(metabuf);
@@ -558,11 +570,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
  *
  * Caller must hold exclusive lock on the target bucket.  This allows
  * us to safely lock multiple pages in the bucket.
+ *
+ * Since this function is invoked in VACUUM, we provide an access strategy
+ * parameter that controls fetches of the bucket pages.
  */
 void
 _hash_squeezebucket(Relation rel,
                     Bucket bucket,
-                    BlockNumber bucket_blkno)
+                    BlockNumber bucket_blkno,
+                    BufferAccessStrategy bstrategy)
 {
     Buffer      wbuf;
     Buffer      rbuf = 0;
@@ -581,7 +597,11 @@ _hash_squeezebucket(Relation rel,
      * start squeezing into the base bucket page.
      */
     wblkno = bucket_blkno;
-    wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE);
+    wbuf = _hash_getbuf_with_strategy(rel,
+                                      wblkno,
+                                      HASH_WRITE,
+                                      LH_BUCKET_PAGE,
+                                      bstrategy);
     wpage = BufferGetPage(wbuf);
     wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

@@ -595,8 +615,10 @@ _hash_squeezebucket(Relation rel,
     }

     /*
-     * find the last page in the bucket chain by starting at the base bucket
-     * page and working forward.
+     * Find the last page in the bucket chain by starting at the base bucket
+     * page and working forward.  Note: we assume that a hash bucket chain is
+     * usually smaller than the buffer ring being used by VACUUM, else using
+     * the access strategy here would be counterproductive.
      */
     ropaque = wopaque;
     do
@@ -604,7 +626,11 @@ _hash_squeezebucket(Relation rel,
         rblkno = ropaque->hasho_nextblkno;
         if (ropaque != wopaque)
             _hash_relbuf(rel, rbuf);
-        rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+        rbuf = _hash_getbuf_with_strategy(rel,
+                                          rblkno,
+                                          HASH_WRITE,
+                                          LH_OVERFLOW_PAGE,
+                                          bstrategy);
         rpage = BufferGetPage(rbuf);
         ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
         Assert(ropaque->hasho_bucket == bucket);
@@ -644,7 +670,11 @@ _hash_squeezebucket(Relation rel,
                 return;
             }

-            wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+            wbuf = _hash_getbuf_with_strategy(rel,
+                                              wblkno,
+                                              HASH_WRITE,
+                                              LH_OVERFLOW_PAGE,
+                                              bstrategy);
             wpage = BufferGetPage(wbuf);
             wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
             Assert(wopaque->hasho_bucket == bucket);
@@ -688,15 +718,19 @@ _hash_squeezebucket(Relation rel,
             /* yes, so release wbuf lock first */
             _hash_wrtbuf(rel, wbuf);
             /* free this overflow page (releases rbuf) */
-            _hash_freeovflpage(rel, rbuf);
+            _hash_freeovflpage(rel, rbuf, bstrategy);
             /* done */
             return;
         }

         /* free this overflow page, then get the previous one */
-        _hash_freeovflpage(rel, rbuf);
+        _hash_freeovflpage(rel, rbuf, bstrategy);

-        rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+        rbuf = _hash_getbuf_with_strategy(rel,
+                                          rblkno,
+                                          HASH_WRITE,
+                                          LH_OVERFLOW_PAGE,
+                                          bstrategy);
         rpage = BufferGetPage(rbuf);
         ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
         Assert(ropaque->hasho_bucket == bucket);
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.67 2007/05/03 16:45:58 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.68 2007/05/30 20:11:51 tgl Exp $
  *
  * NOTES
  *    Postgres hash pages look like ordinary relation pages.  The opaque
@@ -214,6 +214,34 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno)
     return buf;
 }

+/*
+ *  _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
+ *
+ *      This is identical to _hash_getbuf() but also allows a buffer access
+ *      strategy to be specified.  We use this for VACUUM operations.
+ */
+Buffer
+_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
+                           int access, int flags,
+                           BufferAccessStrategy bstrategy)
+{
+    Buffer      buf;
+
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+
+    buf = ReadBufferWithStrategy(rel, blkno, bstrategy);
+
+    if (access != HASH_NOLOCK)
+        LockBuffer(buf, access);
+
+    /* ref count and lock type are correct */
+
+    _hash_checkpage(rel, buf, flags);
+
+    return buf;
+}
+
 /*
  *  _hash_relbuf() -- release a locked buffer.
  *
@@ -840,5 +868,5 @@ _hash_splitbucket(Relation rel,
     _hash_wrtbuf(rel, obuf);
     _hash_wrtbuf(rel, nbuf);

-    _hash_squeezebucket(rel, obucket, start_oblkno);
+    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
 }
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.233 2007/05/27 03:50:38 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.234 2007/05/30 20:11:53 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -83,6 +83,24 @@ initscan(HeapScanDesc scan, ScanKey key)
      */
     scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);

+    /*
+     * If the table is large relative to NBuffers, use a bulk-read access
+     * strategy, else use the default random-access strategy.  During a
+     * rescan, don't make a new strategy object if we don't have to.
+     */
+    if (scan->rs_nblocks > NBuffers / 4 &&
+        !scan->rs_rd->rd_istemp)
+    {
+        if (scan->rs_strategy == NULL)
+            scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
+    }
+    else
+    {
+        if (scan->rs_strategy != NULL)
+            FreeAccessStrategy(scan->rs_strategy);
+        scan->rs_strategy = NULL;
+    }
+
     scan->rs_inited = false;
     scan->rs_ctup.t_data = NULL;
     ItemPointerSetInvalid(&scan->rs_ctup.t_self);
@@ -123,9 +141,17 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)

     Assert(page < scan->rs_nblocks);

-    scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
-                                         scan->rs_rd,
-                                         page);
+    /* release previous scan buffer, if any */
+    if (BufferIsValid(scan->rs_cbuf))
+    {
+        ReleaseBuffer(scan->rs_cbuf);
+        scan->rs_cbuf = InvalidBuffer;
+    }
+
+    /* read page using selected strategy */
+    scan->rs_cbuf = ReadBufferWithStrategy(scan->rs_rd,
+                                           page,
+                                           scan->rs_strategy);
     scan->rs_cblock = page;

     if (!scan->rs_pageatatime)
@@ -938,6 +964,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
     scan->rs_rd = relation;
     scan->rs_snapshot = snapshot;
     scan->rs_nkeys = nkeys;
+    scan->rs_strategy = NULL;   /* set in initscan */

     /*
      * we can use page-at-a-time mode if it's an MVCC-safe snapshot
@@ -1007,6 +1034,9 @@ heap_endscan(HeapScanDesc scan)
     if (scan->rs_key)
         pfree(scan->rs_key);

+    if (scan->rs_strategy != NULL)
+        FreeAccessStrategy(scan->rs_strategy);
+
     pfree(scan);
 }

--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.154 2007/01/05 22:19:23 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -786,9 +786,10 @@ restart:
         /*
          * We can't use _bt_getbuf() here because it always applies
          * _bt_checkpage(), which will barf on an all-zero page. We want to
-         * recycle all-zero pages, not fail.
+         * recycle all-zero pages, not fail.  Also, we want to use a nondefault
+         * buffer access strategy.
          */
-        buf = ReadBuffer(rel, blkno);
+        buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
         LockBuffer(buf, BT_READ);
         page = BufferGetPage(buf);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.269 2007/05/20 21:08:19 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.270 2007/05/30 20:11:55 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -1799,6 +1799,36 @@ XLogFlush(XLogRecPtr record)
              LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
 }

+/*
+ * Test whether XLOG data has been flushed up to (at least) the given position.
+ *
+ * Returns true if a flush is still needed.  (It may be that someone else
+ * is already in process of flushing that far, however.)
+ */
+bool
+XLogNeedsFlush(XLogRecPtr record)
+{
+    /* Quick exit if already known flushed */
+    if (XLByteLE(record, LogwrtResult.Flush))
+        return false;
+
+    /* read LogwrtResult and update local state */
+    {
+        /* use volatile pointer to prevent code rearrangement */
+        volatile XLogCtlData *xlogctl = XLogCtl;
+
+        SpinLockAcquire(&xlogctl->info_lck);
+        LogwrtResult = xlogctl->LogwrtResult;
+        SpinLockRelease(&xlogctl->info_lck);
+    }
+
+    /* check again */
+    if (XLByteLE(record, LogwrtResult.Flush))
+        return false;
+
+    return true;
+}
+
 /*
  * Create a new XLOG file segment, or open a pre-existing one.
  *
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.283 2007/05/16 17:28:20 alvherre Exp $
+ *    $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 2007/05/30 20:11:55 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1658,6 +1658,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
     ivinfo.vacuum_full = false;
     ivinfo.message_level = DEBUG2;
     ivinfo.num_heap_tuples = -1;
+    ivinfo.strategy = NULL;

     state.tuplesort = tuplesort_begin_datum(TIDOID,
                                             TIDLessOperator, false,
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.107 2007/04/30 03:23:48 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.108 2007/05/30 20:11:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -63,10 +63,13 @@ typedef struct AnlIndexData
 /* Default statistics target (GUC parameter) */
 int         default_statistics_target = 10;

+/* A few variables that don't seem worth passing around as parameters */
 static int  elevel = -1;

 static MemoryContext anl_context = NULL;

+static BufferAccessStrategy vac_strategy;
+

 static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
                   int samplesize);
@@ -94,7 +97,8 @@ static bool std_typanalyze(VacAttrStats *stats);
  *  analyze_rel() -- analyze one relation
  */
 void
-analyze_rel(Oid relid, VacuumStmt *vacstmt)
+analyze_rel(Oid relid, VacuumStmt *vacstmt,
+            BufferAccessStrategy bstrategy)
 {
     Relation    onerel;
     int         attr_cnt,
@@ -120,6 +124,8 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt)
     else
         elevel = DEBUG2;

+    vac_strategy = bstrategy;
+
     /*
      * Use the current context for storing analysis info.  vacuum.c ensures
      * that this context will be cleared when I return, thus releasing the
@@ -845,7 +851,7 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
          * looking at it.  We don't maintain a lock on the page, so tuples
          * could get added to it, but we ignore such tuples.
          */
-        targbuffer = ReadBuffer(onerel, targblock);
+        targbuffer = ReadBufferWithStrategy(onerel, targblock, vac_strategy);
         LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
         targpage = BufferGetPage(targbuffer);
         maxoffset = PageGetMaxOffsetNumber(targpage);
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.351 2007/05/17 15:28:29 alvherre Exp $
+ *    $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.352 2007/05/30 20:11:57 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -191,6 +191,7 @@ ExecContext_Finish(ExecContext ec)
 *----------------------------------------------------------------------
 */

+/* A few variables that don't seem worth passing around as parameters */
 static MemoryContext vac_context = NULL;

 static int  elevel = -1;
@@ -198,6 +199,8 @@ static int  elevel = -1;
 static TransactionId OldestXmin;
 static TransactionId FreezeLimit;

+static BufferAccessStrategy vac_strategy;
+

 /* non-export function prototypes */
 static List *get_rel_oids(List *relids, const RangeVar *vacrel,
@@ -257,14 +260,18 @@ static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page);
  * relation OIDs to be processed, and vacstmt->relation is ignored.
  * (The non-NIL case is currently only used by autovacuum.)
  *
+ * bstrategy is normally given as NULL, but in autovacuum it can be passed
+ * in to use the same buffer strategy object across multiple vacuum() calls.
+ *
  * isTopLevel should be passed down from ProcessUtility.
  *
- * It is the caller's responsibility that both vacstmt and relids
+ * It is the caller's responsibility that vacstmt, relids, and bstrategy
  * (if given) be allocated in a memory context that won't disappear
  * at transaction commit.
  */
 void
-vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
+vacuum(VacuumStmt *vacstmt, List *relids,
+       BufferAccessStrategy bstrategy, bool isTopLevel)
 {
     const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
     volatile MemoryContext anl_context = NULL;
@@ -319,6 +326,19 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
                                           ALLOCSET_DEFAULT_INITSIZE,
                                           ALLOCSET_DEFAULT_MAXSIZE);

+    /*
+     * If caller didn't give us a buffer strategy object, make one in the
+     * cross-transaction memory context.
+     */
+    if (bstrategy == NULL)
+    {
+        MemoryContext old_context = MemoryContextSwitchTo(vac_context);
+
+        bstrategy = GetAccessStrategy(BAS_VACUUM);
+        MemoryContextSwitchTo(old_context);
+    }
+    vac_strategy = bstrategy;
+
     /* Remember whether we are processing everything in the DB */
     all_rels = (relids == NIL && vacstmt->relation == NULL);

@@ -417,15 +437,7 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
                 else
                     old_context = MemoryContextSwitchTo(anl_context);

-                /*
-                 * Tell the buffer replacement strategy that vacuum is causing
-                 * the IO
-                 */
-                StrategyHintVacuum(true);
-
-                analyze_rel(relid, vacstmt);
-
-                StrategyHintVacuum(false);
+                analyze_rel(relid, vacstmt, vac_strategy);

                 if (use_own_xacts)
                     CommitTransactionCommand();
@@ -441,8 +453,6 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
     {
         /* Make sure cost accounting is turned off after error */
         VacuumCostActive = false;
-        /* And reset buffer replacement strategy, too */
-        StrategyHintVacuum(false);
         PG_RE_THROW();
     }
     PG_END_TRY();
@@ -1084,21 +1094,13 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
      */
     toast_relid = onerel->rd_rel->reltoastrelid;

-    /*
-     * Tell the cache replacement strategy that vacuum is causing all
-     * following IO
-     */
-    StrategyHintVacuum(true);
-
     /*
      * Do the actual work --- either FULL or "lazy" vacuum
      */
     if (vacstmt->full)
         full_vacuum_rel(onerel, vacstmt);
     else
-        lazy_vacuum_rel(onerel, vacstmt);
+        lazy_vacuum_rel(onerel, vacstmt, vac_strategy);

-    StrategyHintVacuum(false);
-
     /* all done with this class, but hold lock until commit */
     relation_close(onerel, NoLock);
@@ -1290,7 +1292,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,

         vacuum_delay_point();

-        buf = ReadBuffer(onerel, blkno);
+        buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
         page = BufferGetPage(buf);

         /*
@@ -1730,7 +1732,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
         /*
          * Process this page of relation.
          */
-        buf = ReadBuffer(onerel, blkno);
+        buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
         page = BufferGetPage(buf);

         vacpage->offsets_free = 0;
@@ -1954,8 +1956,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                     nextTid = tp.t_data->t_ctid;
                     priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
                     /* assume block# is OK (see heap_fetch comments) */
-                    nextBuf = ReadBuffer(onerel,
-                                 ItemPointerGetBlockNumber(&nextTid));
+                    nextBuf = ReadBufferWithStrategy(onerel,
+                                 ItemPointerGetBlockNumber(&nextTid),
+                                                     vac_strategy);
                     nextPage = BufferGetPage(nextBuf);
                     /* If bogus or unused slot, assume tp is end of chain */
                     nextOffnum = ItemPointerGetOffsetNumber(&nextTid);
@@ -2091,8 +2094,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                         break;  /* out of check-all-items loop */
                     }
                     tp.t_self = vtlp->this_tid;
-                    Pbuf = ReadBuffer(onerel,
-                         ItemPointerGetBlockNumber(&(tp.t_self)));
+                    Pbuf = ReadBufferWithStrategy(onerel,
+                         ItemPointerGetBlockNumber(&(tp.t_self)),
+                                                  vac_strategy);
                     Ppage = BufferGetPage(Pbuf);
                     Pitemid = PageGetItemId(Ppage,
                                ItemPointerGetOffsetNumber(&(tp.t_self)));
@@ -2174,11 +2178,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,

             /* Get page to move from */
             tuple.t_self = vtmove[ti].tid;
-            Cbuf = ReadBuffer(onerel,
-                              ItemPointerGetBlockNumber(&(tuple.t_self)));
+            Cbuf = ReadBufferWithStrategy(onerel,
+                              ItemPointerGetBlockNumber(&(tuple.t_self)),
+                                          vac_strategy);

             /* Get page to move to */
-            dst_buffer = ReadBuffer(onerel, destvacpage->blkno);
+            dst_buffer = ReadBufferWithStrategy(onerel,
+                                                destvacpage->blkno,
+                                                vac_strategy);

             LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
             if (dst_buffer != Cbuf)
@@ -2239,7 +2246,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                 if (i == num_fraged_pages)
                     break;      /* can't move item anywhere */
                 dst_vacpage = fraged_pages->pagedesc[i];
-                dst_buffer = ReadBuffer(onerel, dst_vacpage->blkno);
+                dst_buffer = ReadBufferWithStrategy(onerel,
+                                                    dst_vacpage->blkno,
+                                                    vac_strategy);
                 LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
                 dst_page = BufferGetPage(dst_buffer);
                 /* if this page was not used before - clean it */
@@ -2386,7 +2395,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
             Page        page;

             /* this page was not used as a move target, so must clean it */
-            buf = ReadBuffer(onerel, (*curpage)->blkno);
+            buf = ReadBufferWithStrategy(onerel,
+                                         (*curpage)->blkno,
+                                         vac_strategy);
             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
             page = BufferGetPage(buf);
             if (!PageIsEmpty(page))
@@ -2470,7 +2481,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
             int         uncnt;
             int         num_tuples = 0;

-            buf = ReadBuffer(onerel, vacpage->blkno);
+            buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy);
             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
             page = BufferGetPage(buf);
             maxoff = PageGetMaxOffsetNumber(page);
@@ -2859,7 +2870,7 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
             break;              /* no need to scan any further */
         if ((*curpage)->offsets_used == 0)
             continue;           /* this page was never used as a move dest */
-        buf = ReadBuffer(rel, (*curpage)->blkno);
+        buf = ReadBufferWithStrategy(rel, (*curpage)->blkno, vac_strategy);
         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
         page = BufferGetPage(buf);
         max_offset = PageGetMaxOffsetNumber(page);
@@ -2925,7 +2936,9 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)

         if ((*vacpage)->offsets_free > 0)
         {
-            buf = ReadBuffer(onerel, (*vacpage)->blkno);
+            buf = ReadBufferWithStrategy(onerel,
+                                         (*vacpage)->blkno,
+                                         vac_strategy);
             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
             vacuum_page(onerel, buf, *vacpage);
             UnlockReleaseBuffer(buf);
@@ -3012,6 +3025,7 @@ scan_index(Relation indrel, double num_tuples)
     ivinfo.vacuum_full = true;
     ivinfo.message_level = elevel;
     ivinfo.num_heap_tuples = num_tuples;
+    ivinfo.strategy = vac_strategy;

     stats = index_vacuum_cleanup(&ivinfo, NULL);

@@ -3077,6 +3091,7 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
     ivinfo.vacuum_full = true;
     ivinfo.message_level = elevel;
     ivinfo.num_heap_tuples = num_tuples + keep_tuples;
+    ivinfo.strategy = vac_strategy;

     /* Do bulk deletion */
     stats = index_bulk_delete(&ivinfo, NULL, tid_reaped, (void *) vacpagelist);
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -36,7 +36,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.89 2007/05/17 15:28:29 alvherre Exp $
+ *    $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.90 2007/05/30 20:11:57 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -96,11 +96,14 @@ typedef struct LVRelStats
 } LVRelStats;


+/* A few variables that don't seem worth passing around as parameters */
 static int  elevel = -1;

 static TransactionId OldestXmin;
 static TransactionId FreezeLimit;

+static BufferAccessStrategy vac_strategy;
+

 /* non-export function prototypes */
 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
@@ -138,7 +141,8 @@ static int  vac_cmp_page_spaces(const void *left, const void *right);
  *      and locked the relation.
  */
 void
-lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
+lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
+                BufferAccessStrategy bstrategy)
 {
     LVRelStats *vacrelstats;
     Relation   *Irel;
@@ -158,6 +162,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
     else
         elevel = DEBUG2;

+    vac_strategy = bstrategy;
+
     vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
                           &OldestXmin, &FreezeLimit);

@@ -318,7 +324,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
             vacrelstats->num_index_scans++;
         }

-        buf = ReadBuffer(onerel, blkno);
+        buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

         /* Initially, we only need shared access to the buffer */
         LockBuffer(buf, BUFFER_LOCK_SHARE);
@@ -586,7 +592,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
         vacuum_delay_point();

         tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
-        buf = ReadBuffer(onerel, tblk);
+        buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy);
         LockBufferForCleanup(buf);
         tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
         /* Now that we've compacted the page, record its available space */
@@ -684,6 +690,7 @@ lazy_vacuum_index(Relation indrel,
     ivinfo.message_level = elevel;
     /* We don't yet know rel_tuples, so pass -1 */
     ivinfo.num_heap_tuples = -1;
+    ivinfo.strategy = vac_strategy;

     /* Do bulk deletion */
     *stats = index_bulk_delete(&ivinfo, *stats,
@@ -713,6 +720,7 @@ lazy_cleanup_index(Relation indrel,
     ivinfo.vacuum_full = false;
     ivinfo.message_level = elevel;
     ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
+    ivinfo.strategy = vac_strategy;

     stats = index_vacuum_cleanup(&ivinfo, stats);

@@ -869,7 +877,7 @@ count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)

         blkno--;

-        buf = ReadBuffer(onerel, blkno);
+        buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

         /* In this phase we only need shared access to the buffer */
         LockBuffer(buf, BUFFER_LOCK_SHARE);
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/postmaster/autovacuum.c,v 1.46 2007/05/07 20:41:24 alvherre Exp $
+ *    $PostgreSQL: pgsql/src/backend/postmaster/autovacuum.c,v 1.47 2007/05/30 20:11:57 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -218,7 +218,8 @@ static void relation_needs_vacanalyze(Oid relid, Form_pg_autovacuum avForm,
                       bool *doanalyze);

 static void autovacuum_do_vac_analyze(Oid relid, bool dovacuum,
-                      bool doanalyze, int freeze_min_age);
+                          bool doanalyze, int freeze_min_age,
+                          BufferAccessStrategy bstrategy);
 static HeapTuple get_pg_autovacuum_tuple_relid(Relation avRel, Oid relid);
 static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
                       PgStat_StatDBEntry *shared,
@@ -1673,6 +1674,7 @@ do_autovacuum(void)
     ListCell   *cell;
     PgStat_StatDBEntry *shared;
     PgStat_StatDBEntry *dbentry;
+    BufferAccessStrategy bstrategy;

     /*
      * may be NULL if we couldn't find an entry (only happens if we
@@ -1812,6 +1814,13 @@ do_autovacuum(void)
     list_free(toast_oids);
     toast_oids = NIL;

+    /*
+     * Create a buffer access strategy object for VACUUM to use.  We want
+     * to use the same one across all the vacuum operations we perform,
+     * since the point is for VACUUM not to blow out the shared cache.
+     */
+    bstrategy = GetAccessStrategy(BAS_VACUUM);
+
     /*
      * Perform operations on collected tables.
      */
@@ -1910,7 +1919,8 @@ next_worker:
         autovacuum_do_vac_analyze(tab->at_relid,
                                   tab->at_dovacuum,
                                   tab->at_doanalyze,
-                                  tab->at_freeze_min_age);
+                                  tab->at_freeze_min_age,
+                                  bstrategy);
         /* be tidy */
         pfree(tab);
     }
@@ -2328,7 +2338,8 @@ relation_needs_vacanalyze(Oid relid,
  */
 static void
 autovacuum_do_vac_analyze(Oid relid, bool dovacuum, bool doanalyze,
-                          int freeze_min_age)
+                          int freeze_min_age,
+                          BufferAccessStrategy bstrategy)
 {
     VacuumStmt  vacstmt;
     MemoryContext old_cxt;
@@ -2354,7 +2365,7 @@ autovacuum_do_vac_analyze(Oid relid, bool dovacuum, bool doanalyze,
     /* Let pgstat know what we're doing */
     autovac_report_activity(&vacstmt, relid);

-    vacuum(&vacstmt, list_make1_oid(relid), true);
+    vacuum(&vacstmt, list_make1_oid(relid), bstrategy, true);
     MemoryContextSwitchTo(old_cxt);
 }

--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.11 2006/07/23 03:07:58 tgl Exp $
+$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.12 2007/05/30 20:11:58 tgl Exp $

 Notes about shared buffer access rules
 --------------------------------------
@@ -152,20 +152,21 @@ we could use per-backend LWLocks instead (a buffer header would then contain
 a field to show which backend is doing its I/O).


-Buffer replacement strategy
----------------------------
+Normal buffer replacement strategy
+----------------------------------

 There is a "free list" of buffers that are prime candidates for replacement.
 In particular, buffers that are completely free (contain no valid page) are
-always in this list.  We may also throw buffers into this list if we
-consider their pages unlikely to be needed soon.  The list is singly-linked
-using fields in the buffer headers; we maintain head and tail pointers in
-global variables.  (Note: although the list links are in the buffer headers,
-they are considered to be protected by the BufFreelistLock, not the
-buffer-header spinlocks.)  To choose a victim buffer to recycle when there
-are no free buffers available, we use a simple clock-sweep algorithm, which
-avoids the need to take system-wide locks during common operations.  It
-works like this:
+always in this list.  We could also throw buffers into this list if we
+consider their pages unlikely to be needed soon; however, the current
+algorithm never does that.  The list is singly-linked using fields in the
+buffer headers; we maintain head and tail pointers in global variables.
+(Note: although the list links are in the buffer headers, they are
+considered to be protected by the BufFreelistLock, not the buffer-header
+spinlocks.)  To choose a victim buffer to recycle when there are no free
+buffers available, we use a simple clock-sweep algorithm, which avoids the
+need to take system-wide locks during common operations.  It works like
+this:

 Each buffer header contains a usage counter, which is incremented (up to a
 small limit value) whenever the buffer is unpinned.  (This requires only the
@@ -199,22 +200,40 @@ before we can recycle it; if someone else pins the buffer meanwhile we will
 have to give up and try another buffer.  This however is not a concern
 of the basic select-a-victim-buffer algorithm.)

-A special provision is that while running VACUUM, a backend does not
-increment the usage count on buffers it accesses.  In fact, if ReleaseBuffer
-sees that it is dropping the pin count to zero and the usage count is zero,
-then it appends the buffer to the tail of the free list.  (This implies that
-VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
-this shouldn't create much of a contention problem.)  This provision
-encourages VACUUM to work in a relatively small number of buffers rather
-than blowing out the entire buffer cache.  It is reasonable since a page
-that has been touched only by VACUUM is unlikely to be needed again soon.
-
-Since VACUUM usually requests many pages very fast, the effect of this is that
-it will get back the very buffers it filled and possibly modified on the next
-call and will therefore do its work in a few shared memory buffers, while
-being able to use whatever it finds in the cache already.  This also implies
-that most of the write traffic caused by a VACUUM will be done by the VACUUM
-itself and not pushed off onto other processes.
+Buffer ring replacement strategy
+---------------------------------
+
+When running a query that needs to access a large number of pages just once,
+such as VACUUM or a large sequential scan, a different strategy is used.
+A page that has been touched only by such a scan is unlikely to be needed
+again soon, so instead of running the normal clock sweep algorithm and
+blowing out the entire buffer cache, a small ring of buffers is allocated
+using the normal clock sweep algorithm and those buffers are reused for the
+whole scan.  This also implies that much of the write traffic caused by such
+a statement will be done by the backend itself and not pushed off onto other
+processes.
+
+For sequential scans, a 256KB ring is used. That's small enough to fit in L2
+cache, which makes transferring pages from OS cache to shared buffer cache
+efficient.  Even less would often be enough, but the ring must be big enough
+to accommodate all pages in the scan that are pinned concurrently.  256KB
+should also be enough to leave a small cache trail for other backends to
+join in a synchronized seq scan.  If a ring buffer is dirtied and its LSN
+updated, we would normally have to write and flush WAL before we could
+re-use the buffer; in this case we instead discard the buffer from the ring
+and (later) choose a replacement using the normal clock-sweep algorithm.
+Hence this strategy works best for scans that are read-only (or at worst
+update hint bits).  In a scan that modifies every page in the scan, like a
+bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and
+the ring strategy effectively degrades to the normal strategy.
+
+VACUUM uses a 256KB ring like sequential scans, but dirty pages are not
+removed from the ring.  Instead, WAL is flushed if needed to allow reuse of
+the buffers.  Before introducing the buffer ring strategy in 8.3, VACUUM's
+buffers were sent to the freelist, which was effectively a buffer ring of 1
+buffer, resulting in excessive WAL flushing.  Allowing VACUUM to update
+256KB between WAL flushes should be more efficient.


 Background writer's processing
@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.219 2007/05/27 03:50:39 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.220 2007/05/30 20:11:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -90,11 +90,11 @@ static volatile BufferDesc *PinCountWaitBuf = NULL;
 
 
 static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum,
-                  bool zeroPage);
-static bool PinBuffer(volatile BufferDesc *buf);
+                  bool zeroPage,
+                  BufferAccessStrategy strategy);
+static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(volatile BufferDesc *buf);
-static void UnpinBuffer(volatile BufferDesc *buf,
-            bool fixOwner, bool normalAccess);
+static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
 static bool SyncOneBuffer(int buf_id, bool skip_pinned);
 static void WaitIO(volatile BufferDesc *buf);
 static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
@ -102,7 +102,8 @@ static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
                   int set_flag_bits);
 static void buffer_write_error_callback(void *arg);
 static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
-            bool *foundPtr);
+            BufferAccessStrategy strategy,
+            bool *foundPtr);
 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
 
@ -125,7 +126,18 @@ static void AtProcExit_Buffers(int code, Datum arg);
 Buffer
 ReadBuffer(Relation reln, BlockNumber blockNum)
 {
-    return ReadBuffer_common(reln, blockNum, false);
+    return ReadBuffer_common(reln, blockNum, false, NULL);
+}
+
+/*
+ * ReadBufferWithStrategy -- same as ReadBuffer, except caller can specify
+ *        a nondefault buffer access strategy.  See buffer/README for details.
+ */
+Buffer
+ReadBufferWithStrategy(Relation reln, BlockNumber blockNum,
+                       BufferAccessStrategy strategy)
+{
+    return ReadBuffer_common(reln, blockNum, false, strategy);
 }
 
 /*
@ -140,14 +152,15 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
 Buffer
 ReadOrZeroBuffer(Relation reln, BlockNumber blockNum)
 {
-    return ReadBuffer_common(reln, blockNum, true);
+    return ReadBuffer_common(reln, blockNum, true, NULL);
 }
 
 /*
- * ReadBuffer_common -- common logic for ReadBuffer and ReadOrZeroBuffer
+ * ReadBuffer_common -- common logic for ReadBuffer variants
  */
 static Buffer
-ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage)
+ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage,
+                  BufferAccessStrategy strategy)
 {
     volatile BufferDesc *bufHdr;
     Block        bufBlock;
@ -185,7 +198,7 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage)
      * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
      * not currently in memory.
      */
-    bufHdr = BufferAlloc(reln, blockNum, &found);
+    bufHdr = BufferAlloc(reln, blockNum, strategy, &found);
     if (found)
         BufferHitCount++;
 }
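A hedged usage sketch, not part of the patch, of the new entry point: how a bulk sequential reader might drive ReadBufferWithStrategy with a ring strategy. Backend-style code; the function name and block-count argument are placeholders.

#include "postgres.h"
#include "storage/bufmgr.h"

/* Sketch: scan every block of "rel" through a small reusable buffer ring. */
static void
scan_relation_with_ring(Relation rel, BlockNumber nblocks)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf = ReadBufferWithStrategy(rel, blkno, strategy);

        /* ... inspect the page here ... */
        ReleaseBuffer(buf);
    }
    FreeAccessStrategy(strategy);
}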
@ -330,6 +343,10 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage)
  * buffer.  If no buffer exists already, selects a replacement
  * victim and evicts the old page, but does NOT read in new page.
  *
+ * "strategy" can be a buffer replacement strategy object, or NULL for
+ * the default strategy.  The selected buffer's usage_count is advanced when
+ * using the default strategy, but otherwise possibly not (see PinBuffer).
+ *
  * The returned buffer is pinned and is already marked as holding the
  * desired page.  If it already did have the desired page, *foundPtr is
  * set TRUE.  Otherwise, *foundPtr is set FALSE and the buffer is marked
@ -343,6 +360,7 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage)
 static volatile BufferDesc *
 BufferAlloc(Relation reln,
             BlockNumber blockNum,
+            BufferAccessStrategy strategy,
             bool *foundPtr)
 {
     BufferTag    newTag;        /* identity of requested block */
@ -375,7 +393,7 @@ BufferAlloc(Relation reln,
          */
         buf = &BufferDescriptors[buf_id];
 
-        valid = PinBuffer(buf);
+        valid = PinBuffer(buf, strategy);
 
         /* Can release the mapping lock as soon as we've pinned it */
         LWLockRelease(newPartitionLock);
@ -413,13 +431,15 @@ BufferAlloc(Relation reln,
     /* Loop here in case we have to try another victim buffer */
     for (;;)
     {
+        bool        lock_held;
+
         /*
          * Select a victim buffer.  The buffer is returned with its header
-         * spinlock still held!  Also the BufFreelistLock is still held, since
-         * it would be bad to hold the spinlock while possibly waking up other
-         * processes.
+         * spinlock still held!  Also (in most cases) the BufFreelistLock is
+         * still held, since it would be bad to hold the spinlock while
+         * possibly waking up other processes.
          */
-        buf = StrategyGetBuffer();
+        buf = StrategyGetBuffer(strategy, &lock_held);
 
         Assert(buf->refcount == 0);
 
@ -430,7 +450,8 @@ BufferAlloc(Relation reln,
         PinBuffer_Locked(buf);
 
         /* Now it's safe to release the freelist lock */
-        LWLockRelease(BufFreelistLock);
+        if (lock_held)
+            LWLockRelease(BufFreelistLock);
 
         /*
          * If the buffer was dirty, try to write it out.  There is a race
@ -458,16 +479,34 @@ BufferAlloc(Relation reln,
              */
             if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
             {
+                /*
+                 * If using a nondefault strategy, and writing the buffer
+                 * would require a WAL flush, let the strategy decide whether
+                 * to go ahead and write/reuse the buffer or to choose another
+                 * victim.  We need lock to inspect the page LSN, so this
+                 * can't be done inside StrategyGetBuffer.
+                 */
+                if (strategy != NULL &&
+                    XLogNeedsFlush(BufferGetLSN(buf)) &&
+                    StrategyRejectBuffer(strategy, buf))
+                {
+                    /* Drop lock/pin and loop around for another buffer */
+                    LWLockRelease(buf->content_lock);
+                    UnpinBuffer(buf, true);
+                    continue;
+                }
+
+                /* OK, do the I/O */
                 FlushBuffer(buf, NULL);
                 LWLockRelease(buf->content_lock);
             }
             else
             {
                 /*
-                 * Someone else has pinned the buffer, so give it up and loop
+                 * Someone else has locked the buffer, so give it up and loop
                  * back to get another one.
                  */
-                UnpinBuffer(buf, true, false /* evidently recently used */ );
+                UnpinBuffer(buf, true);
                 continue;
             }
         }
     }
@ -531,10 +570,9 @@ BufferAlloc(Relation reln,
              * Got a collision. Someone has already done what we were about to
              * do. We'll just handle this as if it were found in the buffer
              * pool in the first place.  First, give up the buffer we were
-             * planning to use.  Don't allow it to be thrown in the free list
-             * (we don't want to hold freelist and mapping locks at once).
+             * planning to use.
              */
-            UnpinBuffer(buf, true, false);
+            UnpinBuffer(buf, true);
 
             /* Can give up that buffer's mapping partition lock now */
             if ((oldFlags & BM_TAG_VALID) &&
@ -545,7 +583,7 @@ BufferAlloc(Relation reln,
 
             buf = &BufferDescriptors[buf_id];
 
-            valid = PinBuffer(buf);
+            valid = PinBuffer(buf, strategy);
 
             /* Can release the mapping lock as soon as we've pinned it */
             LWLockRelease(newPartitionLock);
@ -595,20 +633,21 @@ BufferAlloc(Relation reln,
             oldPartitionLock != newPartitionLock)
             LWLockRelease(oldPartitionLock);
         LWLockRelease(newPartitionLock);
-        UnpinBuffer(buf, true, false /* evidently recently used */ );
+        UnpinBuffer(buf, true);
     }
 
     /*
      * Okay, it's finally safe to rename the buffer.
      *
      * Clearing BM_VALID here is necessary, clearing the dirtybits is just
-     * paranoia.  We also clear the usage_count since any recency of use of
-     * the old content is no longer relevant.
+     * paranoia.  We also reset the usage_count since any recency of use of
+     * the old content is no longer relevant.  (The usage_count starts out
+     * at 1 so that the buffer can survive one clock-sweep pass.)
      */
     buf->tag = newTag;
     buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
     buf->flags |= BM_TAG_VALID;
-    buf->usage_count = 0;
+    buf->usage_count = 1;
 
     UnlockBufHdr(buf);
 
@ -736,7 +775,7 @@ retry:
     /*
     * Insert the buffer at the head of the list of free buffers.
     */
-    StrategyFreeBuffer(buf, true);
+    StrategyFreeBuffer(buf);
 }
 
 /*
@ -814,9 +853,6 @@ ReleaseAndReadBuffer(Buffer buffer,
                 return buffer;
             ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
             LocalRefCount[-buffer - 1]--;
-            if (LocalRefCount[-buffer - 1] == 0 &&
-                bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-                bufHdr->usage_count++;
         }
         else
         {
@ -826,7 +862,7 @@ ReleaseAndReadBuffer(Buffer buffer,
             if (bufHdr->tag.blockNum == blockNum &&
                 RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
                 return buffer;
-            UnpinBuffer(bufHdr, true, true);
+            UnpinBuffer(bufHdr, true);
         }
     }
 
@ -836,6 +872,14 @@ ReleaseAndReadBuffer(Buffer buffer,
 /*
  * PinBuffer -- make buffer unavailable for replacement.
  *
+ * For the default access strategy, the buffer's usage_count is incremented
+ * when we first pin it; for other strategies we just make sure the usage_count
+ * isn't zero.  (The idea of the latter is that we don't want synchronized
+ * heap scans to inflate the count, but we need it to not be zero to discourage
+ * other backends from stealing buffers from our ring.  As long as we cycle
+ * through the ring faster than the global clock-sweep cycles, buffers in
+ * our ring won't be chosen as victims for replacement by other backends.)
+ *
  * This should be applied only to shared buffers, never local ones.
  *
  * Note that ResourceOwnerEnlargeBuffers must have been done already.
@ -844,7 +888,7 @@ ReleaseAndReadBuffer(Buffer buffer,
  * some callers to avoid an extra spinlock cycle.
  */
 static bool
-PinBuffer(volatile BufferDesc *buf)
+PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 {
     int            b = buf->buf_id;
     bool        result;
@ -853,6 +897,16 @@ PinBuffer(volatile BufferDesc *buf)
     {
         LockBufHdr(buf);
         buf->refcount++;
+        if (strategy == NULL)
+        {
+            if (buf->usage_count < BM_MAX_USAGE_COUNT)
+                buf->usage_count++;
+        }
+        else
+        {
+            if (buf->usage_count == 0)
+                buf->usage_count = 1;
+        }
         result = (buf->flags & BM_VALID) != 0;
         UnlockBufHdr(buf);
     }
@ -872,6 +926,11 @@ PinBuffer(volatile BufferDesc *buf)
  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
  * The spinlock is released before return.
  *
+ * Currently, no callers of this function want to modify the buffer's
+ * usage_count at all, so there's no need for a strategy parameter.
+ * Also we don't bother with a BM_VALID test (the caller could check that for
+ * itself).
+ *
  * Note: use of this routine is frequently mandatory, not just an optimization
  * to save a spin lock/unlock cycle, because we need to pin a buffer before
  * its state can change under us.
@ -897,17 +956,9 @@ PinBuffer_Locked(volatile BufferDesc *buf)
  *
  * Most but not all callers want CurrentResourceOwner to be adjusted.
  * Those that don't should pass fixOwner = FALSE.
- *
- * normalAccess indicates that we are finishing a "normal" page access,
- * that is, one requested by something outside the buffer subsystem.
- * Passing FALSE means it's an internal access that should not update the
- * buffer's usage count nor cause a change in the freelist.
- *
- * If we are releasing a buffer during VACUUM, and it's not been otherwise
- * used recently, and normalAccess is true, we send the buffer to the freelist.
  */
 static void
-UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess)
+UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 {
     int            b = buf->buf_id;
 
@ -919,8 +970,6 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess)
     PrivateRefCount[b]--;
     if (PrivateRefCount[b] == 0)
     {
-        bool        immed_free_buffer = false;
-
         /* I'd better not still hold any locks on the buffer */
         Assert(!LWLockHeldByMe(buf->content_lock));
         Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
@ -931,22 +980,7 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess)
         Assert(buf->refcount > 0);
         buf->refcount--;
 
-        /* Update buffer usage info, unless this is an internal access */
-        if (normalAccess)
-        {
-            if (!strategy_hint_vacuum)
-            {
-                if (buf->usage_count < BM_MAX_USAGE_COUNT)
-                    buf->usage_count++;
-            }
-            else
-            {
-                /* VACUUM accesses don't bump usage count, instead... */
-                if (buf->refcount == 0 && buf->usage_count == 0)
-                    immed_free_buffer = true;
-            }
-        }
-
+        /* Support LockBufferForCleanup() */
         if ((buf->flags & BM_PIN_COUNT_WAITER) &&
             buf->refcount == 1)
         {
@ -959,14 +993,6 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess)
         }
         else
             UnlockBufHdr(buf);
-
-        /*
-         * If VACUUM is releasing an otherwise-unused buffer, send it to the
-         * freelist for near-term reuse.  We put it at the tail so that it
-         * won't be used before any invalid buffers that may exist.
-         */
-        if (immed_free_buffer)
-            StrategyFreeBuffer(buf, false);
     }
 }
 
@ -1150,7 +1176,7 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
     FlushBuffer(bufHdr, NULL);
 
     LWLockRelease(bufHdr->content_lock);
-    UnpinBuffer(bufHdr, true, false /* don't change freelist */ );
+    UnpinBuffer(bufHdr, true);
 
     return true;
 }
@ -1266,7 +1292,7 @@ AtProcExit_Buffers(int code, Datum arg)
              * here, it suggests that ResourceOwners are messed up.
              */
            PrivateRefCount[i] = 1;        /* make sure we release shared pin */
-            UnpinBuffer(buf, false, false /* don't change freelist */ );
+            UnpinBuffer(buf, false);
             Assert(PrivateRefCount[i] == 0);
         }
     }
@ -1700,7 +1726,7 @@ FlushRelationBuffers(Relation rel)
             LWLockAcquire(bufHdr->content_lock, LW_SHARED);
             FlushBuffer(bufHdr, rel->rd_smgr);
             LWLockRelease(bufHdr->content_lock);
-            UnpinBuffer(bufHdr, true, false /* no freelist change */ );
+            UnpinBuffer(bufHdr, true);
         }
         else
             UnlockBufHdr(bufHdr);
@ -1723,11 +1749,7 @@ ReleaseBuffer(Buffer buffer)
     if (BufferIsLocal(buffer))
     {
         Assert(LocalRefCount[-buffer - 1] > 0);
-        bufHdr = &LocalBufferDescriptors[-buffer - 1];
         LocalRefCount[-buffer - 1]--;
-        if (LocalRefCount[-buffer - 1] == 0 &&
-            bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-            bufHdr->usage_count++;
         return;
     }
 
@ -1738,7 +1760,7 @@ ReleaseBuffer(Buffer buffer)
     if (PrivateRefCount[buffer - 1] > 1)
         PrivateRefCount[buffer - 1]--;
     else
-        UnpinBuffer(bufHdr, false, true);
+        UnpinBuffer(bufHdr, false);
 }
 
 /*
@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.58 2007/01/05 22:19:37 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.59 2007/05/30 20:11:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -39,8 +39,42 @@ typedef struct
 /* Pointers to shared state */
 static BufferStrategyControl *StrategyControl = NULL;
 
-/* Backend-local state about whether currently vacuuming */
-bool        strategy_hint_vacuum = false;
+/*
+ * Private (non-shared) state for managing a ring of shared buffers to re-use.
+ * This is currently the only kind of BufferAccessStrategy object, but someday
+ * we might have more kinds.
+ */
+typedef struct BufferAccessStrategyData
+{
+    /* Overall strategy type */
+    BufferAccessStrategyType btype;
+    /* Number of elements in buffers[] array */
+    int            ring_size;
+
+    /*
+     * Index of the "current" slot in the ring, ie, the one most recently
+     * returned by GetBufferFromRing.
+     */
+    int            current;
+
+    /*
+     * True if the buffer just returned by StrategyGetBuffer had been in
+     * the ring already.
+     */
+    bool        current_was_in_ring;
+
+    /*
+     * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates
+     * we have not yet selected a buffer for this ring slot.  For allocation
+     * simplicity this is palloc'd together with the fixed fields of the
+     * struct.
+     */
+    Buffer        buffers[1];        /* VARIABLE SIZE ARRAY */
+} BufferAccessStrategyData;
+
+
+/* Prototypes for internal functions */
+static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
+static void AddBufferToRing(BufferAccessStrategy strategy,
+                volatile BufferDesc *buf);
 
 
 /*
@ -50,17 +84,38 @@ bool        strategy_hint_vacuum = false;
  * BufferAlloc().  The only hard requirement BufferAlloc() has is that
  * the selected buffer must not currently be pinned by anyone.
  *
+ * strategy is a BufferAccessStrategy object, or NULL for default strategy.
+ *
  * To ensure that no one else can pin the buffer before we do, we must
- * return the buffer with the buffer header spinlock still held.  That
- * means that we return with the BufFreelistLock still held, as well;
- * the caller must release that lock once the spinlock is dropped.
+ * return the buffer with the buffer header spinlock still held.  If
+ * *lock_held is set on exit, we have returned with the BufFreelistLock
+ * still held, as well; the caller must release that lock once the spinlock
+ * is dropped.  We do it that way because releasing the BufFreelistLock
+ * might awaken other processes, and it would be bad to do the associated
+ * kernel calls while holding the buffer header spinlock.
  */
 volatile BufferDesc *
-StrategyGetBuffer(void)
+StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
 {
     volatile BufferDesc *buf;
     int            trycounter;
 
+    /*
+     * If given a strategy object, see whether it can select a buffer.
+     * We assume strategy objects don't need the BufFreelistLock.
+     */
+    if (strategy != NULL)
+    {
+        buf = GetBufferFromRing(strategy);
+        if (buf != NULL)
+        {
+            *lock_held = false;
+            return buf;
+        }
+    }
+
+    /* Nope, so lock the freelist */
+    *lock_held = true;
     LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
 
     /*
@ -82,11 +137,16 @@ StrategyGetBuffer(void)
          * If the buffer is pinned or has a nonzero usage_count, we cannot use
          * it; discard it and retry.  (This can only happen if VACUUM put a
          * valid buffer in the freelist and then someone else used it before
-         * we got to it.)
+         * we got to it.  It's probably impossible altogether as of 8.3,
+         * but we'd better check anyway.)
          */
         LockBufHdr(buf);
         if (buf->refcount == 0 && buf->usage_count == 0)
+        {
+            if (strategy != NULL)
+                AddBufferToRing(strategy, buf);
             return buf;
+        }
         UnlockBufHdr(buf);
     }
 
@ -101,15 +161,23 @@ StrategyGetBuffer(void)
 
         /*
          * If the buffer is pinned or has a nonzero usage_count, we cannot use
-         * it; decrement the usage_count and keep scanning.
+         * it; decrement the usage_count (unless pinned) and keep scanning.
          */
         LockBufHdr(buf);
-        if (buf->refcount == 0 && buf->usage_count == 0)
-            return buf;
-        if (buf->usage_count > 0)
+        if (buf->refcount == 0)
         {
-            buf->usage_count--;
-            trycounter = NBuffers;
+            if (buf->usage_count > 0)
+            {
+                buf->usage_count--;
+                trycounter = NBuffers;
+            }
+            else
+            {
+                /* Found a usable buffer */
+                if (strategy != NULL)
+                    AddBufferToRing(strategy, buf);
+                return buf;
+            }
         }
         else if (--trycounter == 0)
         {
@ -132,13 +200,9 @@ StrategyGetBuffer(void)
 
 /*
  * StrategyFreeBuffer: put a buffer on the freelist
- *
- * The buffer is added either at the head or the tail, according to the
- * at_head parameter.  This allows a small amount of control over how
- * quickly the buffer is reused.
  */
 void
-StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head)
+StrategyFreeBuffer(volatile BufferDesc *buf)
 {
     LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
 
@ -148,22 +212,10 @@ StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head)
      */
     if (buf->freeNext == FREENEXT_NOT_IN_LIST)
     {
-        if (at_head)
-        {
-            buf->freeNext = StrategyControl->firstFreeBuffer;
-            if (buf->freeNext < 0)
-                StrategyControl->lastFreeBuffer = buf->buf_id;
-            StrategyControl->firstFreeBuffer = buf->buf_id;
-        }
-        else
-        {
-            buf->freeNext = FREENEXT_END_OF_LIST;
-            if (StrategyControl->firstFreeBuffer < 0)
-                StrategyControl->firstFreeBuffer = buf->buf_id;
-            else
-                BufferDescriptors[StrategyControl->lastFreeBuffer].freeNext = buf->buf_id;
-            StrategyControl->lastFreeBuffer = buf->buf_id;
-        }
+        buf->freeNext = StrategyControl->firstFreeBuffer;
+        if (buf->freeNext < 0)
+            StrategyControl->lastFreeBuffer = buf->buf_id;
+        StrategyControl->firstFreeBuffer = buf->buf_id;
     }
 
     LWLockRelease(BufFreelistLock);
@ -190,15 +242,6 @@ StrategySyncStart(void)
     return result;
 }
 
-/*
- * StrategyHintVacuum -- tell us whether VACUUM is active
- */
-void
-StrategyHintVacuum(bool vacuum_active)
-{
-    strategy_hint_vacuum = vacuum_active;
-}
-
 
 /*
  * StrategyShmemSize
@ -274,3 +317,172 @@ StrategyInitialize(bool init)
     else
         Assert(!init);
 }
+
+
+/* ----------------------------------------------------------------
+ *                Backend-private buffer ring management
+ * ----------------------------------------------------------------
+ */
+
+
+/*
+ * GetAccessStrategy -- create a BufferAccessStrategy object
+ *
+ * The object is allocated in the current memory context.
+ */
+BufferAccessStrategy
+GetAccessStrategy(BufferAccessStrategyType btype)
+{
+    BufferAccessStrategy strategy;
+    int            ring_size;
+
+    /*
+     * Select ring size to use.  See buffer/README for rationales.
+     * (Currently all cases are the same size, but keep this code
+     * structure for flexibility.)
+     */
+    switch (btype)
+    {
+        case BAS_NORMAL:
+            /* if someone asks for NORMAL, just give 'em a "default" object */
+            return NULL;
+
+        case BAS_BULKREAD:
+            ring_size = 256 * 1024 / BLCKSZ;
+            break;
+        case BAS_VACUUM:
+            ring_size = 256 * 1024 / BLCKSZ;
+            break;
+
+        default:
+            elog(ERROR, "unrecognized buffer access strategy: %d",
+                 (int) btype);
+            return NULL;        /* keep compiler quiet */
+    }
+
+    /* Make sure ring isn't an undue fraction of shared buffers */
+    ring_size = Min(NBuffers / 8, ring_size);
+
+    /* Allocate the object and initialize all elements to zeroes */
+    strategy = (BufferAccessStrategy)
+        palloc0(offsetof(BufferAccessStrategyData, buffers) +
+                ring_size * sizeof(Buffer));
+
+    /* Set fields that don't start out zero */
+    strategy->btype = btype;
+    strategy->ring_size = ring_size;
+
+    return strategy;
+}
+
+/*
+ * FreeAccessStrategy -- release a BufferAccessStrategy object
+ *
+ * A simple pfree would do at the moment, but we would prefer that callers
+ * don't assume that much about the representation of BufferAccessStrategy.
+ */
+void
+FreeAccessStrategy(BufferAccessStrategy strategy)
+{
+    /* don't crash if called on a "default" strategy */
+    if (strategy != NULL)
+        pfree(strategy);
+}
+
+/*
+ * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
+ *        ring is empty.
+ *
+ * The bufhdr spin lock is held on the returned buffer.
+ */
+static volatile BufferDesc *
+GetBufferFromRing(BufferAccessStrategy strategy)
+{
+    volatile BufferDesc *buf;
+    Buffer        bufnum;
+
+    /* Advance to next ring slot */
+    if (++strategy->current >= strategy->ring_size)
+        strategy->current = 0;
+
+    /*
+     * If the slot hasn't been filled yet, tell the caller to allocate
+     * a new buffer with the normal allocation strategy.  He will then
+     * fill this slot by calling AddBufferToRing with the new buffer.
+     */
+    bufnum = strategy->buffers[strategy->current];
+    if (bufnum == InvalidBuffer)
+    {
+        strategy->current_was_in_ring = false;
+        return NULL;
+    }
+
+    /*
+     * If the buffer is pinned we cannot use it under any circumstances.
+     *
+     * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
+     * since our own previous usage of the ring element would have left it
+     * there, but it might've been decremented by clock sweep since then).
+     * A higher usage_count indicates someone else has touched the buffer,
+     * so we shouldn't re-use it.
+     */
+    buf = &BufferDescriptors[bufnum - 1];
+    LockBufHdr(buf);
+    if (buf->refcount == 0 && buf->usage_count <= 1)
+    {
+        strategy->current_was_in_ring = true;
+        return buf;
+    }
+    UnlockBufHdr(buf);
+
+    /*
+     * Tell caller to allocate a new buffer with the normal allocation
+     * strategy.  He'll then replace this ring element via AddBufferToRing.
+     */
+    strategy->current_was_in_ring = false;
+    return NULL;
+}
+
+/*
+ * AddBufferToRing -- add a buffer to the buffer ring
+ *
+ * Caller must hold the buffer header spinlock on the buffer.  Since this
+ * is called with the spinlock held, it had better be quite cheap.
+ */
+static void
+AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf)
+{
+    strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
+}
+
+/*
+ * StrategyRejectBuffer -- consider rejecting a dirty buffer
+ *
+ * When a nondefault strategy is used, the buffer manager calls this function
+ * when it turns out that the buffer selected by StrategyGetBuffer needs to
+ * be written out and doing so would require flushing WAL too.  This gives us
+ * a chance to choose a different victim.
+ *
+ * Returns true if buffer manager should ask for a new victim, and false
+ * if this buffer should be written and re-used.
+ */
+bool
+StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf)
+{
+    /* We only do this in bulkread mode */
+    if (strategy->btype != BAS_BULKREAD)
+        return false;
+
+    /* Don't muck with behavior of normal buffer-replacement strategy */
+    if (!strategy->current_was_in_ring ||
+        strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
+        return false;
+
+    /*
+     * Remove the dirty buffer from the ring; necessary to prevent infinite
+     * loop if all ring members are dirty.
+     */
+    strategy->buffers[strategy->current] = InvalidBuffer;
+
+    return true;
+}
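To make the ring mechanics above easier to follow, here is a small standalone simulation, not PostgreSQL code; it assumes a toy 4-slot ring and 16-buffer pool and mimics the GetBufferFromRing/AddBufferToRing decisions. It shows that a scan touching many pages keeps recycling the same few buffers.

#include <stdio.h>

#define NBUFFERS   16           /* toy buffer pool */
#define RING_SIZE  4            /* toy ring, cf. 256KB/BLCKSZ in the patch */

static int usage_count[NBUFFERS];   /* toy clock-sweep counters */
static int ring[RING_SIZE];         /* -1 means "slot not filled yet" */
static int current = -1;
static int next_victim = 0;         /* toy clock hand */

/* Mimics GetBufferFromRing: reuse the next ring slot if it is still "cold". */
static int
get_buffer_from_ring(void)
{
    int b;

    current = (current + 1) % RING_SIZE;
    b = ring[current];
    if (b >= 0 && usage_count[b] <= 1)
        return b;
    return -1;                  /* caller must run the normal clock sweep */
}

/* Mimics the clock sweep plus AddBufferToRing when the ring cannot supply one. */
static int
get_buffer_from_pool(void)
{
    for (;;)
    {
        int b = next_victim;

        next_victim = (next_victim + 1) % NBUFFERS;
        if (usage_count[b] == 0)
        {
            ring[current] = b;  /* remember it in the current ring slot */
            return b;
        }
        usage_count[b]--;       /* age it, as the modified sweep does */
    }
}

int
main(void)
{
    int i, page;

    for (i = 0; i < RING_SIZE; i++)
        ring[i] = -1;

    /* "Scan" 12 pages; with the ring we only ever touch RING_SIZE buffers. */
    for (page = 0; page < 12; page++)
    {
        int b = get_buffer_from_ring();

        if (b < 0)
            b = get_buffer_from_pool();
        usage_count[b] = 1;     /* PinBuffer with a strategy: just make it nonzero */
        printf("page %2d -> buffer %d\n", page, b);
    }
    return 0;
}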
@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.76 2007/01/05 22:19:37 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.77 2007/05/30 20:11:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -57,7 +57,8 @@ static Block GetLocalBufferStorage(void);
  *
  * API is similar to bufmgr.c's BufferAlloc, except that we do not need
  * to do any locking since this is all local.  Also, IO_IN_PROGRESS
- * does not get set.
+ * does not get set.  Lastly, we support only default access strategy
+ * (hence, usage_count is always advanced).
  */
 BufferDesc *
 LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
@ -88,7 +89,12 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
         fprintf(stderr, "LB ALLOC (%u,%d) %d\n",
                 RelationGetRelid(reln), blockNum, -b - 1);
 #endif
+        /* this part is equivalent to PinBuffer for a shared buffer */
+        if (LocalRefCount[b] == 0)
+        {
+            if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
+                bufHdr->usage_count++;
+        }
         LocalRefCount[b]++;
         ResourceOwnerRememberBuffer(CurrentResourceOwner,
                                     BufferDescriptorGetBuffer(bufHdr));
@ -121,18 +127,21 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
 
         bufHdr = &LocalBufferDescriptors[b];
 
-        if (LocalRefCount[b] == 0 && bufHdr->usage_count == 0)
+        if (LocalRefCount[b] == 0)
         {
-            LocalRefCount[b]++;
-            ResourceOwnerRememberBuffer(CurrentResourceOwner,
-                                        BufferDescriptorGetBuffer(bufHdr));
-            break;
-        }
-
-        if (bufHdr->usage_count > 0)
-        {
-            bufHdr->usage_count--;
-            trycounter = NLocBuffer;
+            if (bufHdr->usage_count > 0)
+            {
+                bufHdr->usage_count--;
+                trycounter = NLocBuffer;
+            }
+            else
+            {
+                /* Found a usable buffer */
+                LocalRefCount[b]++;
+                ResourceOwnerRememberBuffer(CurrentResourceOwner,
+                                            BufferDescriptorGetBuffer(bufHdr));
+                break;
+            }
         }
         else if (--trycounter == 0)
             ereport(ERROR,
@ -199,7 +208,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
     bufHdr->tag = newTag;
     bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
     bufHdr->flags |= BM_TAG_VALID;
-    bufHdr->usage_count = 0;
+    bufHdr->usage_count = 1;
 
     *foundPtr = FALSE;
     return bufHdr;
@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *      $PostgreSQL: pgsql/src/backend/tcop/utility.c,v 1.279 2007/04/27 22:05:49 tgl Exp $
+ *      $PostgreSQL: pgsql/src/backend/tcop/utility.c,v 1.280 2007/05/30 20:12:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -931,7 +931,7 @@ ProcessUtility(Node *parsetree,
             break;
 
         case T_VacuumStmt:
-            vacuum((VacuumStmt *) parsetree, NIL, isTopLevel);
+            vacuum((VacuumStmt *) parsetree, NIL, NULL, isTopLevel);
             break;
 
         case T_ExplainStmt:
@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/genam.h,v 1.66 2007/01/05 22:19:50 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/genam.h,v 1.67 2007/05/30 20:12:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -40,6 +40,7 @@ typedef struct IndexVacuumInfo
     bool        vacuum_full;        /* VACUUM FULL (we have exclusive lock) */
     int            message_level;        /* ereport level for progress messages */
     double        num_heap_tuples;    /* tuples remaining in heap */
+    BufferAccessStrategy strategy;        /* access strategy for reads */
 } IndexVacuumInfo;
 
 /*
@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.80 2007/05/03 16:45:58 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.81 2007/05/30 20:12:02 tgl Exp $
  *
  * NOTES
  *        modeled after Margo Seltzer's hash implementation for unix.
@ -273,11 +273,13 @@ extern void _hash_doinsert(Relation rel, IndexTuple itup);
 
 /* hashovfl.c */
 extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
-extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
+extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf,
+                    BufferAccessStrategy bstrategy);
 extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
                  BlockNumber blkno);
 extern void _hash_squeezebucket(Relation rel,
-                    Bucket bucket, BlockNumber bucket_blkno);
+                    Bucket bucket, BlockNumber bucket_blkno,
+                    BufferAccessStrategy bstrategy);
 
 /* hashpage.c */
 extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
@ -287,6 +289,9 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
              int access, int flags);
 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno);
+extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
+                          int access, int flags,
+                          BufferAccessStrategy bstrategy);
 extern void _hash_relbuf(Relation rel, Buffer buf);
 extern void _hash_dropbuf(Relation rel, Buffer buf);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.53 2007/05/27 03:50:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.54 2007/05/30 20:12:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -27,6 +27,7 @@ typedef struct HeapScanDescData
     int            rs_nkeys;        /* number of scan keys */
     ScanKey        rs_key;            /* array of scan key descriptors */
     BlockNumber rs_nblocks;        /* number of blocks to scan */
+    BufferAccessStrategy rs_strategy;    /* access strategy for reads */
     bool        rs_pageatatime; /* verify visibility page-at-a-time? */
 
     /* scan current state */
@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.77 2007/05/20 21:08:19 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.78 2007/05/30 20:12:02 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@ -159,6 +159,7 @@ extern bool XLOG_DEBUG;
 
 extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
 extern void XLogFlush(XLogRecPtr RecPtr);
+extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 
 extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.71 2007/05/17 15:28:29 alvherre Exp $
+ * $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.72 2007/05/30 20:12:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -18,9 +18,11 @@
 #include "catalog/pg_statistic.h"
 #include "catalog/pg_type.h"
 #include "nodes/parsenodes.h"
+#include "storage/buf.h"
 #include "storage/lock.h"
 #include "utils/rel.h"
 
 
 /*----------
  * ANALYZE builds one of these structs for each attribute (column) that is
  * to be analyzed.  The struct and subsidiary data are in anl_context,
@ -110,7 +112,8 @@ extern int vacuum_freeze_min_age;
 
 
 /* in commands/vacuum.c */
-extern void vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel);
+extern void vacuum(VacuumStmt *vacstmt, List *relids,
+       BufferAccessStrategy bstrategy, bool isTopLevel);
 extern void vac_open_indexes(Relation relation, LOCKMODE lockmode,
                  int *nindexes, Relation **Irel);
 extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode);
@ -127,9 +130,11 @@ extern bool vac_is_partial_index(Relation indrel);
 extern void vacuum_delay_point(void);
 
 /* in commands/vacuumlazy.c */
-extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
+extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
+               BufferAccessStrategy bstrategy);
 
 /* in commands/analyze.c */
-extern void analyze_rel(Oid relid, VacuumStmt *vacstmt);
+extern void analyze_rel(Oid relid, VacuumStmt *vacstmt,
+            BufferAccessStrategy bstrategy);
 
 #endif   /* VACUUM_H */
@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/buf.h,v 1.21 2007/01/05 22:19:57 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/buf.h,v 1.22 2007/05/30 20:12:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -36,4 +36,11 @@ typedef int Buffer;
  */
 #define BufferIsLocal(buffer)    ((buffer) < 0)
 
+/*
+ * Buffer access strategy objects.
+ *
+ * BufferAccessStrategyData is private to freelist.c
+ */
+typedef struct BufferAccessStrategyData *BufferAccessStrategy;
+
 #endif   /* BUF_H */
@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.89 2007/01/05 22:19:57 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.90 2007/05/30 20:12:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -167,9 +167,6 @@ extern DLLIMPORT BufferDesc *BufferDescriptors;
 /* in localbuf.c */
 extern BufferDesc *LocalBufferDescriptors;
 
-/* in freelist.c */
-extern bool strategy_hint_vacuum;
-
 /* event counters in buf_init.c */
 extern long int ReadBufferCount;
 extern long int ReadLocalBufferCount;
@ -184,8 +181,12 @@ extern long int LocalBufferFlushCount;
  */
 
 /* freelist.c */
-extern volatile BufferDesc *StrategyGetBuffer(void);
-extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
+extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+                  bool *lock_held);
+extern void StrategyFreeBuffer(volatile BufferDesc *buf);
+extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
+                     volatile BufferDesc *buf);
+
 extern int    StrategySyncStart(void);
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.103 2007/05/02 23:18:03 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.104 2007/05/30 20:12:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@ -19,6 +19,14 @@
 
 typedef void *Block;
 
+/* Possible arguments for GetAccessStrategy() */
+typedef enum BufferAccessStrategyType
+{
+    BAS_NORMAL,                    /* Normal random access */
+    BAS_BULKREAD,                /* Large read-only scan (hint bit updates are ok) */
+    BAS_VACUUM                    /* VACUUM */
+} BufferAccessStrategyType;
+
 /* in globals.c ... this duplicates miscadmin.h */
 extern DLLIMPORT int NBuffers;
 
@ -111,6 +119,8 @@ extern DLLIMPORT int32 *LocalRefCount;
  * prototypes for functions in bufmgr.c
  */
 extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+extern Buffer ReadBufferWithStrategy(Relation reln, BlockNumber blockNum,
+                       BufferAccessStrategy strategy);
 extern Buffer ReadOrZeroBuffer(Relation reln, BlockNumber blockNum);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
@ -157,6 +167,7 @@ extern void BgBufferSync(void);
 extern void AtProcExit_LocalBuffers(void);
 
 /* in freelist.c */
-extern void StrategyHintVacuum(bool vacuum_active);
+extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
+extern void FreeAccessStrategy(BufferAccessStrategy strategy);
 
 #endif
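A hedged end-to-end sketch, not part of the patch, showing how the public pieces declared above fit together for a VACUUM-style caller: the strategy object is created once, threaded through the page accesses, and freed when the pass is done. Function and variable names here are placeholders.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/*
 * Sketch: process one block of "onerel" the way a vacuum-driven reader
 * would, reusing a private ring instead of flooding shared buffers.
 */
static void
vacuum_one_block(Relation onerel, BlockNumber blkno,
                 BufferAccessStrategy vac_strategy)
{
    Buffer      buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

    /* ... prune/defragment the page here ... */
    ReleaseBuffer(buf);
}

/* Caller creates the strategy once and frees it after the whole pass. */
static void
vacuum_pass(Relation onerel, BlockNumber nblocks)
{
    BufferAccessStrategy vac_strategy = GetAccessStrategy(BAS_VACUUM);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
        vacuum_one_block(onerel, blkno, vac_strategy);
    FreeAccessStrategy(vac_strategy);
}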