Revert "Skip WAL for new relfilenodes, under wal_level=minimal."

This reverts commit cb2fd7eac2. Per numerous buildfarm members, it was incompatible with parallel query, and a test case assumed LP64. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
2025-11-24 00:23:06 +03:00 · 2020-03-22 09:24:09 -07:00
parent d0587f52b3
commit de9396326e
51 changed files with 362 additions and 1438 deletions
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -29,13 +29,9 @@
 #include "miscadmin.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
-#include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"

-/* GUC variables */
-int			wal_skip_threshold = 2048;	/* in kilobytes */
-
 /*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
@@ -65,14 +61,7 @@ typedef struct PendingRelDelete
 	struct PendingRelDelete *next;	/* linked-list link */
 } PendingRelDelete;

-typedef struct pendingSync
-{
-	RelFileNode rnode;
-	bool		is_truncated;	/* Has the file experienced truncation? */
-} pendingSync;
-
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
-HTAB	   *pendingSyncHash = NULL;

 /*
 * RelationCreateStorage
@@ -128,32 +117,6 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;

-	/* Queue an at-commit sync. */
-	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
-	{
-		pendingSync *pending;
-		bool		found;
-
-		/* we sync only permanent relations */
-		Assert(backend == InvalidBackendId);
-
-		if (!pendingSyncHash)
-		{
-			HASHCTL		ctl;
-
-			ctl.keysize = sizeof(RelFileNode);
-			ctl.entrysize = sizeof(pendingSync);
-			ctl.hcxt = TopTransactionContext;
-			pendingSyncHash =
-				hash_create("pending sync hash",
-							16, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-		}
-
-		pending = hash_search(pendingSyncHash, &rnode, HASH_ENTER, &found);
-		Assert(!found);
-		pending->is_truncated = false;
-	}
-
 	return srel;
 }

@@ -312,8 +275,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 		}
 	}

-	RelationPreTruncate(rel);
-
 	/*
 	 * We WAL-log the truncation before actually truncating, which means
 	 * trouble if the truncation fails. If we then crash, the WAL replay
@@ -364,28 +325,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 		FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
 }

-/*
- * RelationPreTruncate
- *		Perform AM-independent work before a physical truncation.
- *
- * If an access method's relation_nontransactional_truncate does not call
- * RelationTruncate(), it must call this before decreasing the table size.
- */
-void
-RelationPreTruncate(Relation rel)
-{
-	pendingSync *pending;
-
-	if (!pendingSyncHash)
-		return;
-	RelationOpenSmgr(rel);
-
-	pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
-						  HASH_FIND, NULL);
-	if (pending)
-		pending->is_truncated = true;
-}
-
 /*
 * Copy a fork's data, block by block.
 *
@@ -416,9 +355,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,

 	/*
 	 * We need to log the copied data in WAL iff WAL archiving/streaming is
-	 * enabled AND it's a permanent relation.  This gives the same answer as
-	 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
-	 * current operation created a new relfilenode.
+	 * enabled AND it's a permanent relation.
 	 */
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -460,39 +397,24 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	}

 	/*
-	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
-	 * reason is that since we're copying outside shared buffers, a CHECKPOINT
-	 * occurring during the copy has no way to flush the previously written
-	 * data to disk (indeed it won't know the new rel even exists).  A crash
-	 * later on would replay WAL from the checkpoint, therefore it wouldn't
-	 * replay our earlier WAL entries. If we do not fsync those pages here,
-	 * they might still not be on disk when the crash occurs.
+	 * If the rel is WAL-logged, must fsync before commit.  We call
+	 * smgrimmedsync on the destination fork to force the copied pages to
+	 * disk.  (For a temp or unlogged rel we don't care since the data will
+	 * be gone after a crash anyway.)
+	 *
+	 * It's obvious that we must do this when not WAL-logging the copy. It's
+	 * less obvious that we have to do it even if we did WAL-log the copied
+	 * pages. The reason is that since we're copying outside shared buffers, a
+	 * CHECKPOINT occurring during the copy has no way to flush the previously
+	 * written data to disk (indeed it won't know the new rel even exists).  A
+	 * crash later on would replay WAL from the checkpoint, therefore it
+	 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
+	 * here, they might still not be on disk when the crash occurs.
 	 */
-	if (use_wal || copying_initfork)
+	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
 		smgrimmedsync(dst, forkNum);
 }

-/*
- * RelFileNodeSkippingWAL - check if a BM_PERMANENT relfilenode is using WAL
- *
- *   Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
- *   New RelFileNode" in src/backend/access/transam/README.  Though it is
- *   known from Relation efficiently, this function is intended for the code
- *   paths not having access to Relation.
- */
-bool
-RelFileNodeSkippingWAL(RelFileNode rnode)
-{
-	if (XLogIsNeeded())
-		return false;			/* no permanent relfilenode skips WAL */
-
-	if (!pendingSyncHash ||
-		hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
-		return false;
-
-	return true;
-}
-
 /*
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
 *
@@ -570,144 +492,6 @@ smgrDoPendingDeletes(bool isCommit)
 	}
 }

-/*
- *	smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
- */
-void
-smgrDoPendingSyncs(bool isCommit)
-{
-	PendingRelDelete *pending;
-	int			nrels = 0,
-				maxrels = 0;
-	SMgrRelation *srels = NULL;
-	HASH_SEQ_STATUS scan;
-	pendingSync *pendingsync;
-
-	if (XLogIsNeeded())
-		return;					/* no relation can use this */
-
-	Assert(GetCurrentTransactionNestLevel() == 1);
-
-	if (!pendingSyncHash)
-		return;					/* no relation needs sync */
-
-	/* Just throw away all pending syncs if any at rollback */
-	if (!isCommit)
-	{
-		pendingSyncHash = NULL;
-		return;
-	}
-
-	AssertPendingSyncs_RelationCache();
-
-	/* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
-	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
-	{
-		if (!pending->atCommit)
-			continue;
-
-		(void) hash_search(pendingSyncHash, (void *) &pending->relnode,
-						   HASH_REMOVE, NULL);
-	}
-
-	hash_seq_init(&scan, pendingSyncHash);
-	while ((pendingsync = (pendingSync *) hash_seq_search(&scan)))
-	{
-		ForkNumber	fork;
-		BlockNumber nblocks[MAX_FORKNUM + 1];
-		BlockNumber total_blocks = 0;
-		SMgrRelation srel;
-
-		srel = smgropen(pendingsync->rnode, InvalidBackendId);
-
-		/*
-		 * We emit newpage WAL records for smaller relations.
-		 *
-		 * Small WAL records have a chance to be emitted along with other
-		 * backends' WAL records.  We emit WAL records instead of syncing for
-		 * files that are smaller than a certain threshold, expecting faster
-		 * commit.  The threshold is defined by the GUC wal_skip_threshold.
-		 */
-		if (!pendingsync->is_truncated)
-		{
-			for (fork = 0; fork <= MAX_FORKNUM; fork++)
-			{
-				if (smgrexists(srel, fork))
-				{
-					BlockNumber n = smgrnblocks(srel, fork);
-
-					/* we shouldn't come here for unlogged relations */
-					Assert(fork != INIT_FORKNUM);
-					nblocks[fork] = n;
-					total_blocks += n;
-				}
-				else
-					nblocks[fork] = InvalidBlockNumber;
-			}
-		}
-
-		/*
-		 * Sync file or emit WAL records for its contents.
-		 *
-		 * Although we emit WAL record if the file is small enough, do file
-		 * sync regardless of the size if the file has experienced a
-		 * truncation. It is because the file would be followed by trailing
-		 * garbage blocks after a crash recovery if, while a past longer file
-		 * had been flushed out, we omitted syncing-out of the file and
-		 * emitted WAL instead.  You might think that we could choose WAL if
-		 * the current main fork is longer than ever, but there's a case where
-		 * main fork is longer than ever but FSM fork gets shorter.
-		 */
-		if (pendingsync->is_truncated ||
-			total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
-		{
-			/* allocate the initial array, or extend it, if needed */
-			if (maxrels == 0)
-			{
-				maxrels = 8;
-				srels = palloc(sizeof(SMgrRelation) * maxrels);
-			}
-			else if (maxrels <= nrels)
-			{
-				maxrels *= 2;
-				srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
-			}
-
-			srels[nrels++] = srel;
-		}
-		else
-		{
-			/* Emit WAL records for all blocks.  The file is small enough. */
-			for (fork = 0; fork <= MAX_FORKNUM; fork++)
-			{
-				int			n = nblocks[fork];
-				Relation	rel;
-
-				if (!BlockNumberIsValid(n))
-					continue;
-
-				/*
-				 * Emit WAL for the whole file.  Unfortunately we don't know
-				 * what kind of a page this is, so we have to log the full
-				 * page including any unused space.  ReadBufferExtended()
-				 * counts some pgstat events; unfortunately, we discard them.
-				 */
-				rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
-				log_newpage_range(rel, fork, 0, n, false);
-				FreeFakeRelcacheEntry(rel);
-			}
-		}
-	}
-
-	pendingSyncHash = NULL;
-
-	if (nrels > 0)
-	{
-		smgrdosyncall(srels, nrels);
-		pfree(srels);
-	}
-}
-
 /*
 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
 *