Revert "Skip WAL for new relfilenodes, under wal_level=minimal."

This reverts commit cb2fd7eac2. Per numerous buildfarm members, it was incompatible with parallel query, and a test case assumed LP64. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
2025-09-03 15:22:11 +03:00 · 2020-03-22 09:24:09 -07:00
parent 03b89f1949
commit 2fbdebc248
48 changed files with 334 additions and 1401 deletions
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -763,6 +763,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 	bool	   *isnull;
 	IndexScanDesc indexScan;
 	HeapScanDesc heapScan;
+	bool		use_wal;
 	bool		is_system_catalog;
 	TransactionId OldestXmin;
 	TransactionId FreezeXid;
@@ -819,9 +820,12 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 		LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);

 	/*
-	 * Valid smgr_targblock implies something already wrote to the relation.
-	 * This may be harmless, but this function hasn't planned for it.
+	 * We need to log the copied data in WAL iff WAL archiving/streaming is
+	 * enabled AND it's a WAL-logged rel.
 	 */
+	use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
+
+	/* use_wal off requires smgr_targblock be initially invalid */
 	Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);

 	/*
@@ -889,7 +893,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,

 	/* Initialize the rewrite operation */
 	rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
-								 MultiXactCutoff);
+								 MultiXactCutoff, use_wal);

 	/*
 	 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
@@ -1281,25 +1285,6 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
 		*mapped_tables++ = r2;
 	}

-	/*
-	 * Recognize that rel1's relfilenode (swapped from rel2) is new in this
-	 * subtransaction. The rel2 storage (swapped from rel1) may or may not be
-	 * new.
-	 */
-	{
-		Relation	rel1,
-					rel2;
-
-		rel1 = relation_open(r1, NoLock);
-		rel2 = relation_open(r2, NoLock);
-		rel2->rd_createSubid = rel1->rd_createSubid;
-		rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
-		rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
-		RelationAssumeNewRelfilenode(rel1);
-		relation_close(rel1, NoLock);
-		relation_close(rel2, NoLock);
-	}
-
 	/*
 	 * In the case of a shared catalog, these next few steps will only affect
 	 * our own database's pg_class row; but that's okay, because they are all
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2382,17 +2382,65 @@ CopyFrom(CopyState cstate)

 	tupDesc = RelationGetDescr(cstate->rel);

-	/*
-	 * If the target file is new-in-transaction, we assume that checking FSM
-	 * for free space is a waste of time.  This could possibly be wrong, but
-	 * it's unlikely.
+	/*----------
+	 * Check to see if we can avoid writing WAL
+	 *
+	 * If archive logging/streaming is not enabled *and* either
+	 *	- table was created in same transaction as this COPY
+	 *	- data is being written to relfilenode created in this transaction
+	 * then we can skip writing WAL.  It's safe because if the transaction
+	 * doesn't commit, we'll discard the table (or the new relfilenode file).
+	 * If it does commit, we'll have done the heap_sync at the bottom of this
+	 * routine first.
+	 *
+	 * As mentioned in comments in utils/rel.h, the in-same-transaction test
+	 * is not always set correctly, since in rare cases rd_newRelfilenodeSubid
+	 * can be cleared before the end of the transaction. The exact case is
+	 * when a relation sets a new relfilenode twice in same transaction, yet
+	 * the second one fails in an aborted subtransaction, e.g.
+	 *
+	 * BEGIN;
+	 * TRUNCATE t;
+	 * SAVEPOINT save;
+	 * TRUNCATE t;
+	 * ROLLBACK TO save;
+	 * COPY ...
+	 *
+	 * Also, if the target file is new-in-transaction, we assume that checking
+	 * FSM for free space is a waste of time, even if we must use WAL because
+	 * of archiving.  This could possibly be wrong, but it's unlikely.
+	 *
+	 * The comments for heap_insert and RelationGetBufferForTuple specify that
+	 * skipping WAL logging is only safe if we ensure that our tuples do not
+	 * go into pages containing tuples from any other transactions --- but this
+	 * must be the case if we have a new table or new relfilenode, so we need
+	 * no additional work to enforce that.
+	 *
+	 * We currently don't support this optimization if the COPY target is a
+	 * partitioned table as we currently only lazily initialize partition
+	 * information when routing the first tuple to the partition.  We cannot
+	 * know at this stage if we can perform this optimization.  It should be
+	 * possible to improve on this, but it does mean maintaining heap insert
+	 * option flags per partition and setting them when we first open the
+	 * partition.
+	 *
+	 * This optimization is not supported for relation types which do not
+	 * have any physical storage, with foreign tables and views using
+	 * INSTEAD OF triggers entering in this category.  Partitioned tables
+	 * are not supported as per the description above.
+	 *----------
 	 */
+	/* createSubid is creation check, newRelfilenodeSubid is truncation check */
 	if (cstate->rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE &&
 		cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE &&
 		cstate->rel->rd_rel->relkind != RELKIND_VIEW &&
 		(cstate->rel->rd_createSubid != InvalidSubTransactionId ||
-		 cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId))
+		 cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId))
+	{
 		hi_options |= HEAP_INSERT_SKIP_FSM;
+		if (!XLogIsNeeded())
+			hi_options |= HEAP_INSERT_SKIP_WAL;
+	}

 	/*
 	 * Optimize if new relfilenode was created in this subxact or one of its
@@ -2884,6 +2932,13 @@ next_tuple:

 	FreeExecutorState(estate);

+	/*
+	 * If we skipped writing WAL, then we need to sync the heap (but not
+	 * indexes since those use WAL anyway)
+	 */
+	if (hi_options & HEAP_INSERT_SKIP_WAL)
+		heap_sync(cstate->rel);
+
 	return processed;
 }

--- a/src/backend/commands/createas.c
+++ b/src/backend/commands/createas.c
@@ -562,13 +562,16 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
 	myState->rel = intoRelationDesc;
 	myState->reladdr = intoRelationAddr;
 	myState->output_cid = GetCurrentCommandId(true);
-	myState->hi_options = HEAP_INSERT_SKIP_FSM;
-	myState->bistate = GetBulkInsertState();

 	/*
-	 * Valid smgr_targblock implies something already wrote to the relation.
-	 * This may be harmless, but this function hasn't planned for it.
+	 * We can skip WAL-logging the insertions, unless PITR or streaming
+	 * replication is in use. We can skip the FSM in any case.
 	 */
+	myState->hi_options = HEAP_INSERT_SKIP_FSM |
+		(XLogIsNeeded() ? 0 : HEAP_INSERT_SKIP_WAL);
+	myState->bistate = GetBulkInsertState();
+
+	/* Not using WAL requires smgr_targblock be initially invalid */
 	Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber);
 }

@@ -614,6 +617,10 @@ intorel_shutdown(DestReceiver *self)

 	FreeBulkInsertState(myState->bistate);

+	/* If we skipped using WAL, must heap_sync before commit */
+	if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
+		heap_sync(myState->rel);
+
 	/* close rel, but keep lock until commit */
 	heap_close(myState->rel, NoLock);
 	myState->rel = NULL;
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -1050,8 +1050,6 @@ DefineIndex(Oid relationId,
 					childStmt->relationId = childRelid;
 					childStmt->indexOid = InvalidOid;
 					childStmt->oldNode = InvalidOid;
-					childStmt->oldCreateSubid = InvalidSubTransactionId;
-					childStmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;

 					/*
 					 * Adjust any Vars (both in expressions and in the index's
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -457,13 +457,17 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
 	 */
 	myState->transientrel = transientrel;
 	myState->output_cid = GetCurrentCommandId(true);
-	myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
-	myState->bistate = GetBulkInsertState();

 	/*
-	 * Valid smgr_targblock implies something already wrote to the relation.
-	 * This may be harmless, but this function hasn't planned for it.
+	 * We can skip WAL-logging the insertions, unless PITR or streaming
+	 * replication is in use. We can skip the FSM in any case.
 	 */
+	myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
+	if (!XLogIsNeeded())
+		myState->hi_options |= HEAP_INSERT_SKIP_WAL;
+	myState->bistate = GetBulkInsertState();
+
+	/* Not using WAL requires smgr_targblock be initially invalid */
 	Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber);
 }

@@ -503,6 +507,10 @@ transientrel_shutdown(DestReceiver *self)

 	FreeBulkInsertState(myState->bistate);

+	/* If we skipped using WAL, must heap_sync before commit */
+	if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
+		heap_sync(myState->transientrel);
+
 	/* close transientrel, but keep lock until commit */
 	heap_close(myState->transientrel, NoLock);
 	myState->transientrel = NULL;
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -4650,14 +4650,19 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 		newrel = NULL;

 	/*
-	 * Prepare a BulkInsertState and options for heap_insert.  The FSM is
-	 * empty, so don't bother using it.
+	 * Prepare a BulkInsertState and options for heap_insert. Because we're
+	 * building a new heap, we can skip WAL-logging and fsync it to disk at
+	 * the end instead (unless WAL-logging is required for archiving or
+	 * streaming replication). The FSM is empty too, so don't bother using it.
 	 */
 	if (newrel)
 	{
 		mycid = GetCurrentCommandId(true);
 		bistate = GetBulkInsertState();
+
 		hi_options = HEAP_INSERT_SKIP_FSM;
+		if (!XLogIsNeeded())
+			hi_options |= HEAP_INSERT_SKIP_WAL;
 	}
 	else
 	{
@@ -4929,6 +4934,10 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 	{
 		FreeBulkInsertState(bistate);

+		/* If we skipped writing WAL, then we need to sync the heap. */
+		if (hi_options & HEAP_INSERT_SKIP_WAL)
+			heap_sync(newrel);
+
 		heap_close(newrel, NoLock);
 	}
 }
@@ -7072,19 +7081,14 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,

 	/*
 	 * If TryReuseIndex() stashed a relfilenode for us, we used it for the new
-	 * index instead of building from scratch.  Restore associated fields.
-	 * This may store InvalidSubTransactionId in both fields, in which case
-	 * relcache.c will assume it can rebuild the relcache entry.  Hence, do
-	 * this after the CCI that made catalog rows visible to any rebuild.  The
-	 * DROP of the old edition of this index will have scheduled the storage
-	 * for deletion at commit, so cancel that pending deletion.
+	 * index instead of building from scratch.  The DROP of the old edition of
+	 * this index will have scheduled the storage for deletion at commit, so
+	 * cancel that pending deletion.
 	 */
 	if (OidIsValid(stmt->oldNode))
 	{
 		Relation	irel = index_open(address.objectId, NoLock);

-		irel->rd_createSubid = stmt->oldCreateSubid;
-		irel->rd_firstRelfilenodeSubid = stmt->oldFirstRelfilenodeSubid;
 		RelationPreserveStorage(irel->rd_node, true);
 		index_close(irel, NoLock);
 	}
@@ -10876,11 +10880,7 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt)

 		/* If it's a partitioned index, there is no storage to share. */
 		if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
-		{
 			stmt->oldNode = irel->rd_node.relNode;
-			stmt->oldCreateSubid = irel->rd_createSubid;
-			stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid;
-		}
 		index_close(irel, NoLock);
 	}
 }
@@ -11735,8 +11735,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)

 	heap_close(pg_class, RowExclusiveLock);

-	RelationAssumeNewRelfilenode(rel);
-
 	relation_close(rel, NoLock);

 	/* Make sure the reltablespace change is visible */
@@ -12008,9 +12006,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,

 	/*
 	 * We need to log the copied data in WAL iff WAL archiving/streaming is
-	 * enabled AND it's a permanent relation.  This gives the same answer as
-	 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
-	 * current operation created a new relfilenode.
+	 * enabled AND it's a permanent relation.
 	 */
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -12052,15 +12048,21 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	}

 	/*
-	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
-	 * reason is that since we're copying outside shared buffers, a CHECKPOINT
-	 * occurring during the copy has no way to flush the previously written
-	 * data to disk (indeed it won't know the new rel even exists).  A crash
-	 * later on would replay WAL from the checkpoint, therefore it wouldn't
-	 * replay our earlier WAL entries. If we do not fsync those pages here,
-	 * they might still not be on disk when the crash occurs.
+	 * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
+	 * to ensure that the toast table gets fsync'd too.  (For a temp or
+	 * unlogged rel we don't care since the data will be gone after a crash
+	 * anyway.)
+	 *
+	 * It's obvious that we must do this when not WAL-logging the copy. It's
+	 * less obvious that we have to do it even if we did WAL-log the copied
+	 * pages. The reason is that since we're copying outside shared buffers, a
+	 * CHECKPOINT occurring during the copy has no way to flush the previously
+	 * written data to disk (indeed it won't know the new rel even exists).  A
+	 * crash later on would replay WAL from the checkpoint, therefore it
+	 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
+	 * here, they might still not be on disk when the crash occurs.
 	 */
-	if (use_wal || copying_initfork)
+	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
 		smgrimmedsync(dst, forkNum);
 }