
Introduce a new smgr bulk loading facility.

The new facility makes it easier to optimize bulk loading, as the
logic for buffering, WAL-logging, and syncing the relation only needs
to be implemented once. It's also less error-prone: We have had a
number of bugs in how a relation is fsync'd - or not - at the end of a
bulk loading operation. By centralizing that logic to one place, we
only need to write it correctly once.
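
A minimal sketch of the caller pattern, based only on the calls visible in
the diff below (smgr_bulk_start_rel, smgr_bulk_get_buf, smgr_bulk_write,
smgr_bulk_finish); the helper write_two_pages() and its use of PageInit()
are illustrative, not part of the patch:

#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

static void
write_two_pages(Relation rel)
{
    BulkWriteState *bulkstate;

    bulkstate = smgr_bulk_start_rel(rel, MAIN_FORKNUM);

    for (BlockNumber blkno = 0; blkno < 2; blkno++)
    {
        /* get a page-sized buffer owned by the bulk-write state */
        BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

        PageInit((Page) buf, BLCKSZ, 0);

        /* hand the page back; the facility buffers, WAL-logs and writes it */
        smgr_bulk_write(bulkstate, blkno, buf, true);
    }

    /* flush anything still buffered and arrange for the fsync */
    smgr_bulk_finish(bulkstate);
}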

The new facility is faster for small relations: Instead of calling
smgrimmedsync(), we register the fsync to happen at next checkpoint,
which avoids the fsync latency. That can make a big difference if you
are e.g. restoring a schema-only dump with lots of relations.
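
For illustration, here is the end-of-build step before and after this patch,
under the assumption that smgr_bulk_finish() registers the pending sync
request rather than calling smgrimmedsync() itself (the facility's internals
are not shown in this excerpt); both wrapper functions are hypothetical:

#include "postgres.h"

#include "storage/bulk_write.h"
#include "storage/smgr.h"
#include "utils/rel.h"

/* Before: force the data to disk right away, paying the fsync latency. */
static void
finish_build_before(Relation index)
{
    smgrimmedsync(RelationGetSmgr(index), MAIN_FORKNUM);
}

/*
 * After: one call flushes any buffered pages; the fsync itself is deferred
 * to the next checkpoint by the facility rather than performed here.
 */
static void
finish_build_after(BulkWriteState *bulkstate)
{
    smgr_bulk_finish(bulkstate);
}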

It is also slightly more efficient with large relations, as the WAL
logging is performed multiple pages at a time. That avoids some WAL
header overhead. The sorted GiST index build already did that; this
patch moves that buffering into the new facility.
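
A rough sketch of that kind of batching, using the existing log_newpages()
call (one WAL record covering several full-page images); the batching policy
shown here is an assumption, not taken from this patch:

#include "postgres.h"

#include "access/xloginsert.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static void
wal_log_page_run(Relation rel, BlockNumber first_blkno, Page *pages, int npages)
{
    BlockNumber *blknos = palloc(npages * sizeof(BlockNumber));

    for (int i = 0; i < npages; i++)
        blknos[i] = first_blkno + i;

    /* one WAL record for all 'npages' pages, amortizing the header overhead */
    log_newpages(&rel->rd_locator, MAIN_FORKNUM, npages, blknos, pages, true);

    pfree(blknos);
}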

The changes to the pageinspect GiST test need an explanation: Before this
patch, the sorted GiST index build set the LSN on every page to the
special GistBuildLSN value, not the LSN of the WAL record, even though
they were WAL-logged. There was no particular need for that; it just
happened naturally because we wrote out the pages before WAL-logging
them. Now we WAL-log the pages first, as in a B-tree build, so the
pages are stamped with the record's real LSN. When the build is not
WAL-logged, we still use GistBuildLSN. To make the test output
predictable, use an unlogged index.

Reviewed-by: Andres Freund
Discussion: https://www.postgresql.org/message-id/30e8f366-58b3-b239-c521-422122dd5150%40iki.fi
Author: Heikki Linnakangas
Date:   2024-02-23 16:10:51 +02:00
Parent: e612384fc7
Commit: 8af2565248

17 changed files with 551 additions and 354 deletions

src/backend/access/nbtree/nbtree.c

@@ -29,11 +29,11 @@
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
@@ -154,32 +154,17 @@ void
btbuildempty(Relation index)
{
bool allequalimage = _bt_allequalimage(index, false);
Buffer metabuf;
Page metapage;
BulkWriteState *bulkstate;
BulkWriteBuffer metabuf;
/*
* Initialize the metapage.
*
* Regular index build bypasses the buffer manager and uses smgr functions
* directly, with an smgrimmedsync() call at the end. That makes sense
* when the index is large, but for an empty index, it's better to use the
* buffer cache to avoid the smgrimmedsync().
*/
metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
_bt_lockbuf(index, metabuf, BT_WRITE);
bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
START_CRIT_SECTION();
/* Construct metapage. */
metabuf = smgr_bulk_get_buf(bulkstate);
_bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
metapage = BufferGetPage(metabuf);
_bt_initmetapage(metapage, P_NONE, 0, allequalimage);
MarkBufferDirty(metabuf);
log_newpage_buffer(metabuf, true);
END_CRIT_SECTION();
_bt_unlockbuf(index, metabuf);
ReleaseBuffer(metabuf);
smgr_bulk_finish(bulkstate);
}
/*

src/backend/access/nbtree/nbtsort.c

@@ -23,13 +23,8 @@
* many upper pages if the keys are reasonable-size) without risking a lot of
* cascading splits during early insertions.
*
* Formerly the index pages being built were kept in shared buffers, but
* that is of no value (since other backends have no interest in them yet)
* and it created locking problems for CHECKPOINT, because the upper-level
* pages were held exclusive-locked for long periods. Now we just build
* the pages in local memory and smgrwrite or smgrextend them as we finish
* them. They will need to be re-read into shared buffers on first use after
* the build finishes.
* We use the bulk smgr loading facility to bypass the buffer cache and
* WAL-log the pages efficiently.
*
* This code isn't concerned about the FSM at all. The caller is responsible
* for initializing that.
@@ -57,7 +52,7 @@
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/smgr.h"
#include "storage/bulk_write.h"
#include "tcop/tcopprot.h" /* pgrminclude ignore */
#include "utils/rel.h"
#include "utils/sortsupport.h"
@@ -234,7 +229,7 @@ typedef struct BTBuildState
*/
typedef struct BTPageState
{
Page btps_page; /* workspace for page building */
BulkWriteBuffer btps_buf; /* workspace for page building */
BlockNumber btps_blkno; /* block # to write this page at */
IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
OffsetNumber btps_lastoff; /* last item offset loaded */
@@ -251,11 +246,9 @@ typedef struct BTWriteState
{
Relation heap;
Relation index;
BulkWriteState *bulkstate;
BTScanInsert inskey; /* generic insertion scankey */
bool btws_use_wal; /* dump pages to WAL? */
BlockNumber btws_pages_alloced; /* # pages allocated */
BlockNumber btws_pages_written; /* # pages written out */
Page btws_zeropage; /* workspace for filling zeroes */
} BTWriteState;
@@ -267,7 +260,7 @@ static void _bt_spool(BTSpool *btspool, ItemPointer self,
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
bool *isnull, bool tupleIsAlive, void *state);
static Page _bt_blnewpage(uint32 level);
static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page rightmostpage);
static void _bt_sortaddtup(Page page, Size itemsize,
@@ -569,12 +562,9 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
/* _bt_mkscankey() won't set allequalimage without metapage */
wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
/* reserve the metapage */
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
wstate.btws_pages_written = 0;
wstate.btws_zeropage = NULL; /* until needed */
pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
PROGRESS_BTREE_PHASE_LEAF_LOAD);
@@ -613,13 +603,15 @@ _bt_build_callback(Relation index,
/*
* allocate workspace for a new, clean btree page, not linked to any siblings.
*/
static Page
_bt_blnewpage(uint32 level)
static BulkWriteBuffer
_bt_blnewpage(BTWriteState *wstate, uint32 level)
{
BulkWriteBuffer buf;
Page page;
BTPageOpaque opaque;
page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
buf = smgr_bulk_get_buf(wstate->bulkstate);
page = (Page) buf;
/* Zero the page and set up standard page header info */
_bt_pageinit(page, BLCKSZ);
@@ -634,63 +626,17 @@ _bt_blnewpage(uint32 level)
/* Make the P_HIKEY line pointer appear allocated */
((PageHeader) page)->pd_lower += sizeof(ItemIdData);
return page;
return buf;
}
/*
* emit a completed btree page, and release the working storage.
*/
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
_bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
{
/* XLOG stuff */
if (wstate->btws_use_wal)
{
/* We use the XLOG_FPI record type for this */
log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true);
}
/*
* If we have to write pages nonsequentially, fill in the space with
* zeroes until we come back and overwrite. This is not logically
* necessary on standard Unix filesystems (unwritten space will read as
* zeroes anyway), but it should help to avoid fragmentation. The dummy
* pages aren't WAL-logged though.
*/
while (blkno > wstate->btws_pages_written)
{
if (!wstate->btws_zeropage)
wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
PG_IO_ALIGN_SIZE,
MCXT_ALLOC_ZERO);
/* don't set checksum for all-zero page */
smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
wstate->btws_pages_written++,
wstate->btws_zeropage,
true);
}
PageSetChecksumInplace(page, blkno);
/*
* Now write the page. There's no need for smgr to schedule an fsync for
* this write; we'll do it ourselves before ending the build.
*/
if (blkno == wstate->btws_pages_written)
{
/* extending the file... */
smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
page, true);
wstate->btws_pages_written++;
}
else
{
/* overwriting a block we zero-filled before */
smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
page, true);
}
pfree(page);
smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
/* smgr_bulk_write took ownership of 'buf' */
}
/*
@@ -703,7 +649,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level)
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
/* create initial page for level */
state->btps_page = _bt_blnewpage(level);
state->btps_buf = _bt_blnewpage(wstate, level);
/* and assign it a page position */
state->btps_blkno = wstate->btws_pages_alloced++;
@@ -839,6 +785,7 @@ static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
Size truncextra)
{
BulkWriteBuffer nbuf;
Page npage;
BlockNumber nblkno;
OffsetNumber last_off;
@@ -853,7 +800,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
*/
CHECK_FOR_INTERRUPTS();
npage = state->btps_page;
nbuf = state->btps_buf;
npage = (Page) nbuf;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
last_truncextra = state->btps_lastextra;
@@ -909,6 +857,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
/*
* Finish off the page and write it out.
*/
BulkWriteBuffer obuf = nbuf;
Page opage = npage;
BlockNumber oblkno = nblkno;
ItemId ii;
@@ -916,7 +865,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
IndexTuple oitup;
/* Create new page of same level */
npage = _bt_blnewpage(state->btps_level);
nbuf = _bt_blnewpage(wstate, state->btps_level);
npage = (Page) nbuf;
/* and assign it a page position */
nblkno = wstate->btws_pages_alloced++;
@@ -1028,10 +978,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
}
/*
* Write out the old page. We never need to touch it again, so we can
* free the opage workspace too.
* Write out the old page. _bt_blwritepage takes ownership of the
* 'opage' buffer.
*/
_bt_blwritepage(wstate, opage, oblkno);
_bt_blwritepage(wstate, obuf, oblkno);
/*
* Reset last_off to point to new page
@@ -1064,7 +1014,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
_bt_sortaddtup(npage, itupsz, itup, last_off,
!isleaf && last_off == P_FIRSTKEY);
state->btps_page = npage;
state->btps_buf = nbuf;
state->btps_blkno = nblkno;
state->btps_lastoff = last_off;
}
@@ -1116,7 +1066,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
BTPageState *s;
BlockNumber rootblkno = P_NONE;
uint32 rootlevel = 0;
Page metapage;
BulkWriteBuffer metabuf;
/*
* Each iteration of this loop completes one more level of the tree.
@@ -1127,7 +1077,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
BTPageOpaque opaque;
blkno = s->btps_blkno;
opaque = BTPageGetOpaque(s->btps_page);
opaque = BTPageGetOpaque((Page) s->btps_buf);
/*
* We have to link the last page on this level to somewhere.
@@ -1161,9 +1111,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* This is the rightmost page, so the ItemId array needs to be slid
* back one slot. Then we can dump out the page.
*/
_bt_slideleft(s->btps_page);
_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
s->btps_page = NULL; /* writepage freed the workspace */
_bt_slideleft((Page) s->btps_buf);
_bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
s->btps_buf = NULL; /* writepage took ownership of the buffer */
}
/*
@@ -1172,10 +1122,10 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* set to point to "P_NONE"). This changes the index to the "valid" state
* by filling in a valid magic number in the metapage.
*/
metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
_bt_initmetapage(metapage, rootblkno, rootlevel,
metabuf = smgr_bulk_get_buf(wstate->bulkstate);
_bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
wstate->inskey->allequalimage);
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
_bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
}
/*
@@ -1197,6 +1147,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
int64 tuples_done = 0;
bool deduplicate;
wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
BTGetDeduplicateItems(wstate->index);
@@ -1352,7 +1304,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
*/
dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
sizeof(ItemIdData);
Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
dstate->maxpostingsize <= INDEX_SIZE_MASK);
dstate->htids = palloc(dstate->maxpostingsize);
@@ -1422,18 +1374,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
/* Close down final pages and write the metapage */
_bt_uppershutdown(wstate, state);
/*
* When we WAL-logged index pages, we must nonetheless fsync index files.
* Since we're building outside shared buffers, a CHECKPOINT occurring
* during the build has no way to flush the previously written data to
* disk (indeed it won't know the index even exists). A crash later on
* would replay WAL from the checkpoint, therefore it wouldn't replay our
* earlier WAL entries. If we do not fsync those pages here, they might
* still not be on disk when the crash occurs.
*/
if (wstate->btws_use_wal)
smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM);
smgr_bulk_finish(wstate->bulkstate);
}
/*