Mirror of https://github.com/postgres/postgres.git (synced 2025-06-17 17:02:08 +03:00)
Just-in-time background writing strategy.

This code avoids re-scanning buffers that cannot possibly need to be cleaned, and estimates how many buffers it should try to clean based on moving averages of recent allocation requests and density of reusable buffers. The patch also adds a couple more columns to pg_stat_bgwriter to help measure the effectiveness of the bgwriter.

Greg Smith, building on his own work and ideas from several other people; in particular a much older patch from Itagaki Takahiro.
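For illustration, here is a minimal, self-contained sketch of the estimation idea described above: keep a fast-attack, slow-decline moving average of recent allocations, a moving average of how many buffers must be scanned per reusable buffer found, and from those derive how many buffers to try to clean each round. It loosely mirrors the smoothed_alloc / smoothed_density variables in the diff below, but the function, its parameters, and the demo values are invented for illustration and are not PostgreSQL code.

/*
 * Hypothetical standalone sketch of the just-in-time cleaning estimate;
 * names and constants are simplified stand-ins, not bufmgr.c code.
 */
#include <stdio.h>

#define SMOOTHING_SAMPLES 16.0f

static float smoothed_alloc = 0.0f;    /* moving average of allocations per round */
static float smoothed_density = 10.0f; /* buffers scanned per allocation satisfied */

static int
buffers_to_clean(int recent_alloc, long strategy_delta,
                 int bufs_ahead, double lru_multiplier)
{
    /* fast attack, slow decline: jump up immediately, decay gradually */
    if ((float) recent_alloc > smoothed_alloc)
        smoothed_alloc = (float) recent_alloc;
    else
        smoothed_alloc += ((float) recent_alloc - smoothed_alloc) / SMOOTHING_SAMPLES;

    /* update the density estimate only if the clock sweep actually moved */
    if (strategy_delta > 0 && recent_alloc > 0)
    {
        float scans_per_alloc = (float) strategy_delta / (float) recent_alloc;

        smoothed_density += (scans_per_alloc - smoothed_density) / SMOOTHING_SAMPLES;
    }

    /* reusable buffers we think are already clean ahead of the sweep point */
    int reusable_est = (int) ((float) bufs_ahead / smoothed_density);
    /* allocations we expect before the next round, scaled for headroom */
    int upcoming_est = (int) (smoothed_alloc * lru_multiplier);
    int need = upcoming_est - reusable_est;

    return need > 0 ? need : 0;
}

int
main(void)
{
    /* toy trace: demand ramps up, then the system goes idle */
    int demand[] = {10, 50, 200, 200, 0, 0};

    for (int i = 0; i < 6; i++)
        printf("round %d: try to clean ~%d buffers\n",
               i, buffers_to_clean(demand[i], 400, 300, 2.0));
    return 0;
}

The real code additionally enforces a minimum scan rate (min_scan_buffers) so the buffer pool still gets covered during idle periods, as the comments in the diff below explain.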
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.224 2007/09/20 17:56:31 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.225 2007/09/25 20:03:37 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -52,11 +52,15 @@
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

/* Bits in SyncOneBuffer's return value */
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02

/* GUC variables */
bool zero_damaged_pages = false;
double bgwriter_lru_percent = 1.0;
int bgwriter_lru_maxpages = 5;
int bgwriter_lru_maxpages = 100;
double bgwriter_lru_multiplier = 2.0;

long NDirectFileRead; /* some I/O's are direct file access. bypass
@@ -79,7 +83,7 @@ static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
static void PinBuffer_Locked(volatile BufferDesc *buf);
static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
static void BufferSync(int flags);
static bool SyncOneBuffer(int buf_id, bool skip_pinned);
static int SyncOneBuffer(int buf_id, bool skip_recently_used);
static void WaitIO(volatile BufferDesc *buf);
static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
@@ -1043,8 +1047,11 @@ BufferSync(int flags)
* Loop over all buffers again, and write the ones (still) marked with
* BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep
* point since we might as well dump soon-to-be-recycled buffers first.
*
* Note that we don't read the buffer alloc count here --- that should
* be left untouched till the next BgBufferSync() call.
*/
buf_id = StrategySyncStart();
buf_id = StrategySyncStart(NULL, NULL);
num_to_scan = NBuffers;
num_written = 0;
while (num_to_scan-- > 0)
@@ -1065,7 +1072,7 @@ BufferSync(int flags)
*/
if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
{
if (SyncOneBuffer(buf_id, false))
if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
{
BgWriterStats.m_buf_written_checkpoints++;
num_written++;
@@ -1112,61 +1119,289 @@ BufferSync(int flags)
void
BgBufferSync(void)
{
int buf_id;
/* info obtained from freelist.c */
int strategy_buf_id;
uint32 strategy_passes;
uint32 recent_alloc;

/*
* Information saved between calls so we can determine the strategy
* point's advance rate and avoid scanning already-cleaned buffers.
*/
static bool saved_info_valid = false;
static int prev_strategy_buf_id;
static uint32 prev_strategy_passes;
static int next_to_clean;
static uint32 next_passes;

/* Moving averages of allocation rate and clean-buffer density */
static float smoothed_alloc = 0;
static float smoothed_density = 10.0;

/* Potentially these could be tunables, but for now, not */
float smoothing_samples = 16;
float scan_whole_pool_milliseconds = 120000.0;

/* Used to compute how far we scan ahead */
long strategy_delta;
int bufs_to_lap;
int bufs_ahead;
float scans_per_alloc;
int reusable_buffers_est;
int upcoming_alloc_est;
int min_scan_buffers;

/* Variables for the scanning loop proper */
int num_to_scan;
int num_written;
int reusable_buffers;

/*
* Find out where the freelist clock sweep currently is, and how
* many buffer allocations have happened since our last call.
*/
strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);

/* Report buffer alloc counts to pgstat */
BgWriterStats.m_buf_alloc += recent_alloc;

/*
* If we're not running the LRU scan, just stop after doing the
* stats stuff. We mark the saved state invalid so that we can recover
* sanely if LRU scan is turned back on later.
*/
if (bgwriter_lru_maxpages <= 0)
{
saved_info_valid = false;
return;
}

/*
* Compute strategy_delta = how many buffers have been scanned by the
* clock sweep since last time. If first time through, assume none.
* Then see if we are still ahead of the clock sweep, and if so, how many
* buffers we could scan before we'd catch up with it and "lap" it.
* Note: weird-looking coding of xxx_passes comparisons are to avoid
* bogus behavior when the passes counts wrap around.
*/
if (saved_info_valid)
{
int32 passes_delta = strategy_passes - prev_strategy_passes;

strategy_delta = strategy_buf_id - prev_strategy_buf_id;
strategy_delta += (long) passes_delta * NBuffers;
Assert(strategy_delta >= 0);

if ((int32) (next_passes - strategy_passes) > 0)
{
/* we're one pass ahead of the strategy point */
bufs_to_lap = strategy_buf_id - next_to_clean;
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
next_passes, next_to_clean,
strategy_passes, strategy_buf_id,
strategy_delta, bufs_to_lap);
#endif
}
else if (next_passes == strategy_passes &&
next_to_clean >= strategy_buf_id)
{
/* on same pass, but ahead or at least not behind */
bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
next_passes, next_to_clean,
strategy_passes, strategy_buf_id,
strategy_delta, bufs_to_lap);
#endif
}
else
{
/*
* We're behind, so skip forward to the strategy point
* and start cleaning from there.
*/
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
next_passes, next_to_clean,
strategy_passes, strategy_buf_id,
strategy_delta);
#endif
next_to_clean = strategy_buf_id;
next_passes = strategy_passes;
bufs_to_lap = NBuffers;
}
}
else
{
/*
* Initializing at startup or after LRU scanning had been off.
* Always start at the strategy point.
*/
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
strategy_passes, strategy_buf_id);
#endif
strategy_delta = 0;
next_to_clean = strategy_buf_id;
next_passes = strategy_passes;
bufs_to_lap = NBuffers;
}

/* Update saved info for next time */
prev_strategy_buf_id = strategy_buf_id;
prev_strategy_passes = strategy_passes;
saved_info_valid = true;

/*
* Compute how many buffers had to be scanned for each new allocation,
* ie, 1/density of reusable buffers, and track a moving average of that.
*
* If the strategy point didn't move, we don't update the density estimate
*/
if (strategy_delta > 0 && recent_alloc > 0)
{
scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
smoothed_density += (scans_per_alloc - smoothed_density) /
smoothing_samples;
}

/*
* Estimate how many reusable buffers there are between the current
* strategy point and where we've scanned ahead to, based on the
* smoothed density estimate.
*/
bufs_ahead = NBuffers - bufs_to_lap;
reusable_buffers_est = (float) bufs_ahead / smoothed_density;

/*
* Track a moving average of recent buffer allocations. Here, rather
* than a true average we want a fast-attack, slow-decline behavior:
* we immediately follow any increase.
*/
if (smoothed_alloc <= (float) recent_alloc)
smoothed_alloc = recent_alloc;
else
smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
smoothing_samples;

/* Scale the estimate by a GUC to allow more aggressive tuning. */
upcoming_alloc_est = smoothed_alloc * bgwriter_lru_multiplier;

/*
* Even in cases where there's been little or no buffer allocation
* activity, we want to make a small amount of progress through the buffer
* cache so that as many reusable buffers as possible are clean
* after an idle period.
*
* (scan_whole_pool_milliseconds / BgWriterDelay) computes how many
* times the BGW will be called during the scan_whole_pool time;
* slice the buffer pool into that many sections.
*/
min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));

if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
{
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
#endif
upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
}

/*
* Now write out dirty reusable buffers, working forward from the
* next_to_clean point, until we have lapped the strategy scan, or
* cleaned enough buffers to match our estimate of the next cycle's
* allocation requirements, or hit the bgwriter_lru_maxpages limit.
*/

/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);

/*
* The purpose of this sweep is to ensure that buffers that
* will be recycled soon are clean when needed; these buffers are the ones
* just ahead of the StrategySyncStart point.
*
* This loop considers only unpinned buffers close to the clock sweep
* point.
*/
if (bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0)
num_to_scan = bufs_to_lap;
num_written = 0;
reusable_buffers = reusable_buffers_est;

/* Execute the LRU scan */
while (num_to_scan-- > 0 && reusable_buffers < upcoming_alloc_est)
{
num_to_scan = (int) ((NBuffers * bgwriter_lru_percent + 99) / 100);
num_written = 0;
int buffer_state = SyncOneBuffer(next_to_clean, true);

buf_id = StrategySyncStart();

while (num_to_scan-- > 0)
if (buffer_state & BUF_WRITTEN)
{
if (SyncOneBuffer(buf_id, true))
reusable_buffers++;
if (++num_written >= bgwriter_lru_maxpages)
{
if (++num_written >= bgwriter_lru_maxpages)
{
BgWriterStats.m_maxwritten_clean++;
break;
}
BgWriterStats.m_maxwritten_clean++;
break;
}
if (++buf_id >= NBuffers)
buf_id = 0;
}
BgWriterStats.m_buf_written_clean += num_written;
else if (buffer_state & BUF_REUSABLE)
reusable_buffers++;

if (++next_to_clean >= NBuffers)
{
next_to_clean = 0;
next_passes++;
}
}

BgWriterStats.m_buf_written_clean += num_written;

#ifdef BGW_DEBUG
elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
smoothed_density, reusable_buffers_est, upcoming_alloc_est,
bufs_to_lap - num_to_scan - 1,
num_written,
reusable_buffers - reusable_buffers_est);
#endif

/*
* Consider the above scan as being like a new allocation scan.
* Characterize its density and update the smoothed one based on it.
* This effectively halves the moving average period in cases where
* both the strategy and the background writer are doing some useful
* scanning, which is helpful because a long memory isn't as desirable
* on the density estimates.
*/
strategy_delta = bufs_to_lap - num_to_scan - 1;
recent_alloc = reusable_buffers - reusable_buffers_est;
if (strategy_delta > 0 && recent_alloc > 0)
{
scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
smoothed_density += (scans_per_alloc - smoothed_density) /
smoothing_samples;

#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
recent_alloc, strategy_delta, scans_per_alloc, smoothed_density);
#endif
}
}

/*
* SyncOneBuffer -- process a single buffer during syncing.
*
* If skip_pinned is true, we don't write currently-pinned buffers, nor
* If skip_recently_used is true, we don't write currently-pinned buffers, nor
* buffers marked recently used, as these are not replacement candidates.
*
* Returns true if buffer was written, else false. (This could be in error
* if FlushBuffers finds the buffer clean after locking it, but we don't
* care all that much.)
* Returns a bitmask containing the following flag bits:
* BUF_WRITTEN: we wrote the buffer.
* BUF_REUSABLE: buffer is available for replacement, ie, it has
* pin count 0 and usage count 0.
*
* (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean
* after locking it, but we don't care all that much.)
*
* Note: caller must have done ResourceOwnerEnlargeBuffers.
*/
static bool
SyncOneBuffer(int buf_id, bool skip_pinned)
static int
SyncOneBuffer(int buf_id, bool skip_recently_used)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
int result = 0;

/*
* Check whether buffer needs writing.
@@ -1178,16 +1413,21 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
* upcoming changes and so we are not required to write such dirty buffer.
*/
LockBufHdr(bufHdr);

if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
result |= BUF_REUSABLE;
else if (skip_recently_used)
{
/* Caller told us not to write recently-used buffers */
UnlockBufHdr(bufHdr);
return result;
}

if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
{
/* It's clean, so nothing to do */
UnlockBufHdr(bufHdr);
return false;
}
if (skip_pinned &&
(bufHdr->refcount != 0 || bufHdr->usage_count != 0))
{
UnlockBufHdr(bufHdr);
return false;
return result;
}

/*
@@ -1202,7 +1442,7 @@ SyncOneBuffer(int buf_id, bool skip_pinned)
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true);

return true;
return result | BUF_WRITTEN;
}
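As an aside on the wraparound note in the BgBufferSync() comments above ("weird-looking coding of xxx_passes comparisons"), the following small, assumed standalone demo (not part of the patch) shows why subtracting two uint32 pass counters and casting the result to a signed 32-bit value keeps the comparison correct even after a counter wraps, provided the true distance fits in 31 bits.

/* Assumed standalone demo of the wraparound-safe counter comparison idiom. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* strategy_passes has wrapped past UINT32_MAX; next_passes has not yet */
    uint32_t next_passes = UINT32_MAX - 1;
    uint32_t strategy_passes = 5;

    /* a naive comparison concludes next_passes is ahead, which is wrong */
    printf("naive: next_passes > strategy_passes ? %d\n",
           next_passes > strategy_passes);

    /*
     * Modular subtraction followed by a signed cast (relying on the usual
     * two's-complement conversion, as the code above does) recovers the
     * true signed distance: -7 here, i.e. next_passes is 7 passes behind.
     */
    int32_t delta = (int32_t) (next_passes - strategy_passes);

    printf("signed delta = %d\n", delta);
    return 0;
}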