Mirror of https://github.com/postgres/postgres.git
Revert "Skip WAL for new relfilenodes, under wal_level=minimal."
This reverts commit cb2fd7eac2. Per numerous buildfarm members, it was
incompatible with parallel query, and a test case assumed LP64. Back-patch
to 9.5 (all supported versions).
Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
src/backend/access/gist/gistutil.c

@@ -1019,44 +1019,23 @@ gistproperty(Oid index_oid, int attno,
 }
 
 /*
- * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page
- * splits anyway. This function provides a fake sequence of LSNs for that
- * purpose.
+ * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs
+ * to detect concurrent page splits anyway. This function provides a fake
+ * sequence of LSNs for that purpose.
  */
 XLogRecPtr
 gistGetFakeLSN(Relation rel)
 {
+    static XLogRecPtr counter = FirstNormalUnloggedLSN;
+
     if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
     {
         /*
          * Temporary relations are only accessible in our session, so a simple
          * backend-local counter will do.
          */
-        static XLogRecPtr counter = FirstNormalUnloggedLSN;
-
         return counter++;
     }
-    else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
-    {
-        /*
-         * WAL-logging on this relation will start after commit, so its LSNs
-         * must be distinct numbers smaller than the LSN at the next commit.
-         * Emit a dummy WAL record if insert-LSN hasn't advanced after the
-         * last call.
-         */
-        static XLogRecPtr lastlsn = InvalidXLogRecPtr;
-        XLogRecPtr  currlsn = GetXLogInsertRecPtr();
-
-        /* Shouldn't be called for WAL-logging relations */
-        Assert(!RelationNeedsWAL(rel));
-
-        /* No need for an actual record if we already have a distinct LSN */
-        if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn)
-            currlsn = gistXLogAssignLSN();
-
-        lastlsn = currlsn;
-        return currlsn;
-    }
     else
     {
         /*
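A note on how these fake LSNs are consumed: GiST stamps each page's LSN and
copies it into a child's NSN at split time, so a reader can detect a
concurrent split by comparing the two. A minimal illustrative check, assuming
only a stand-in for the XLogRecPtr typedef from access/xlogdefs.h (the helper
name is ours, not PostgreSQL's):

#include <stdint.h>

typedef uint64_t XLogRecPtr;    /* stand-in for access/xlogdefs.h */

/*
 * Illustrative only: a reader remembers the parent page's LSN, follows a
 * downlink, and treats a child whose NSN is newer as concurrently split.
 */
static int
child_split_since(XLogRecPtr parent_lsn_when_read, XLogRecPtr child_nsn)
{
    return child_nsn > parent_lsn_when_read;
}

This is why even never-WAL-logged indexes need a monotonically advancing LSN
source: without one, the NSN comparison above could not order events.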
src/backend/access/gist/gistxlog.c

@@ -449,9 +449,6 @@ gist_redo(XLogReaderState *record)
         case XLOG_GIST_PAGE_DELETE:
             gistRedoPageDelete(record);
             break;
-        case XLOG_GIST_ASSIGN_LSN:
-            /* nop. See gistGetFakeLSN(). */
-            break;
         default:
             elog(PANIC, "gist_redo: unknown op code %u", info);
     }

@@ -595,24 +592,6 @@ gistXLogPageDelete(Buffer buffer, FullTransactionId xid,
     return recptr;
 }
 
-/*
- * Write an empty XLOG record to assign a distinct LSN.
- */
-XLogRecPtr
-gistXLogAssignLSN(void)
-{
-    int         dummy = 0;
-
-    /*
-     * Records other than SWITCH_WAL must have content. We use an integer 0 to
-     * follow the restriction.
-     */
-    XLogBeginInsert();
-    XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
-    XLogRegisterData((char *) &dummy, sizeof(dummy));
-    return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN);
-}
-
 /*
  * Write XLOG record about reuse of a deleted page.
  */
src/backend/access/heap/heapam.c

@@ -21,6 +21,7 @@
  *      heap_multi_insert - insert multiple tuples into a relation
  *      heap_delete - delete a tuple from a relation
  *      heap_update - replace a tuple in a relation with another tuple
+ *      heap_sync - sync heap, for when no WAL has been written
  *
  * NOTES
  *    This file contains the heap_ routines which implement
@@ -1935,7 +1936,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(relation))
+    if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
     {
         xl_heap_insert xlrec;
         xl_heap_header xlhdr;
@@ -2118,7 +2119,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
     /* currently not needed (thus unsupported) for heap_multi_insert() */
     AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
 
-    needwal = RelationNeedsWAL(relation);
+    needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
     saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                    HEAP_DEFAULT_FILLFACTOR);
@@ -8920,13 +8921,18 @@ heap2_redo(XLogReaderState *record)
 }
 
 /*
- *  heap_sync - for binary compatibility
+ *  heap_sync - sync a heap, for use when no WAL has been written
  *
- * A newer PostgreSQL version removes this function. It exists here just in
- * case an extension calls it. See "Skipping WAL for New RelFileNode" in
- * src/backend/access/transam/README for the system that superseded it,
- * allowing removal of most calls. Cases like RelationCopyStorage() should
- * call smgrimmedsync() directly.
+ * This forces the heap contents (including TOAST heap if any) down to disk.
+ * If we skipped using WAL, and WAL is otherwise needed, we must force the
+ * relation down to disk before it's safe to commit the transaction. This
+ * requires writing out any dirty buffers and then doing a forced fsync.
+ *
+ * Indexes are not touched. (Currently, index operations associated with
+ * the commands that use this are WAL-logged and so do not need fsync.
+ * That behavior might change someday, but in any case it's likely that
+ * any fsync decisions required would be per-index and hence not appropriate
+ * to be done here.)
  */
 void
 heap_sync(Relation rel)
src/backend/access/heap/heapam_handler.c

@@ -559,6 +559,17 @@ tuple_lock_retry:
     return result;
 }
 
+static void
+heapam_finish_bulk_insert(Relation relation, int options)
+{
+    /*
+     * If we skipped writing WAL, then we need to sync the heap (but not
+     * indexes since those use WAL anyway / don't go through tableam)
+     */
+    if (options & HEAP_INSERT_SKIP_WAL)
+        heap_sync(relation);
+}
+
 
 /* ------------------------------------------------------------------------
  * DDL related callbacks for heap AM.
@@ -691,6 +702,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
     IndexScanDesc indexScan;
     TableScanDesc tableScan;
     HeapScanDesc heapScan;
+    bool        use_wal;
     bool        is_system_catalog;
     Tuplesortstate *tuplesort;
     TupleDesc   oldTupDesc = RelationGetDescr(OldHeap);
@@ -705,9 +717,12 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
     is_system_catalog = IsSystemRelation(OldHeap);
 
     /*
-     * Valid smgr_targblock implies something already wrote to the relation.
-     * This may be harmless, but this function hasn't planned for it.
+     * We need to log the copied data in WAL iff WAL archiving/streaming is
+     * enabled AND it's a WAL-logged rel.
      */
+    use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
+
+    /* use_wal off requires smgr_targblock be initially invalid */
     Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
 
     /* Preallocate values/isnull arrays */
@@ -717,7 +732,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 
     /* Initialize the rewrite operation */
     rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
-                                 *multi_cutoff);
+                                 *multi_cutoff, use_wal);
 
 
     /* Set up sorting if wanted */
@@ -2611,6 +2626,7 @@ static const TableAmRoutine heapam_methods = {
     .tuple_delete = heapam_tuple_delete,
     .tuple_update = heapam_tuple_update,
     .tuple_lock = heapam_tuple_lock,
+    .finish_bulk_insert = heapam_finish_bulk_insert,
 
     .tuple_fetch_row_version = heapam_fetch_row_version,
     .tuple_get_latest_tid = heap_get_latest_tid,
src/backend/access/heap/rewriteheap.c

@@ -144,6 +144,7 @@ typedef struct RewriteStateData
     Page        rs_buffer;      /* page currently being built */
     BlockNumber rs_blockno;     /* block where page will go */
     bool        rs_buffer_valid;    /* T if any tuples in buffer */
+    bool        rs_use_wal;     /* must we WAL-log inserts? */
     bool        rs_logical_rewrite; /* do we need to do logical rewriting */
     TransactionId rs_oldest_xmin;   /* oldest xmin used by caller to determine
                                      * tuple visibility */
@@ -237,13 +238,15 @@ static void logical_end_heap_rewrite(RewriteState state);
  * oldest_xmin  xid used by the caller to determine which tuples are dead
  * freeze_xid   xid before which tuples will be frozen
  * min_multi    multixact before which multis will be removed
+ * use_wal      should the inserts to the new heap be WAL-logged?
  *
  * Returns an opaque RewriteState, allocated in current memory context,
  * to be used in subsequent calls to the other functions.
  */
 RewriteState
 begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
-                   TransactionId freeze_xid, MultiXactId cutoff_multi)
+                   TransactionId freeze_xid, MultiXactId cutoff_multi,
+                   bool use_wal)
 {
     RewriteState state;
     MemoryContext rw_cxt;
@@ -268,6 +271,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
     /* new_heap needn't be empty, just locked */
     state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
     state->rs_buffer_valid = false;
+    state->rs_use_wal = use_wal;
     state->rs_oldest_xmin = oldest_xmin;
     state->rs_freeze_xid = freeze_xid;
     state->rs_cutoff_multi = cutoff_multi;
@@ -326,7 +330,7 @@ end_heap_rewrite(RewriteState state)
     /* Write the last page, if any */
     if (state->rs_buffer_valid)
     {
-        if (RelationNeedsWAL(state->rs_new_rel))
+        if (state->rs_use_wal)
             log_newpage(&state->rs_new_rel->rd_node,
                         MAIN_FORKNUM,
                         state->rs_blockno,
@@ -341,14 +345,18 @@ end_heap_rewrite(RewriteState state)
     }
 
     /*
-     * When we WAL-logged rel pages, we must nonetheless fsync them. The
+     * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+     * to ensure that the toast table gets fsync'd too.
+     *
+     * It's obvious that we must do this when not WAL-logging. It's less
+     * obvious that we have to do it even if we did WAL-log the pages. The
      * reason is the same as in storage.c's RelationCopyStorage(): we're
     * writing data that's not in shared buffers, and so a CHECKPOINT
     * occurring during the rewriteheap operation won't have fsync'd data we
     * wrote before the checkpoint.
     */
    if (RelationNeedsWAL(state->rs_new_rel))
-        smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM);
+        heap_sync(state->rs_new_rel);
 
     logical_end_heap_rewrite(state);
 
@@ -646,6 +654,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
     {
         int         options = HEAP_INSERT_SKIP_FSM;
 
+        if (!state->rs_use_wal)
+            options |= HEAP_INSERT_SKIP_WAL;
+
         /*
          * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
          * for the TOAST table are not logically decoded. The main heap is
@@ -684,7 +695,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
         /* Doesn't fit, so write out the existing page */
 
         /* XLOG stuff */
-        if (RelationNeedsWAL(state->rs_new_rel))
+        if (state->rs_use_wal)
             log_newpage(&state->rs_new_rel->rd_node,
                         MAIN_FORKNUM,
                         state->rs_blockno,
src/backend/access/nbtree/nbtsort.c

@@ -31,6 +31,18 @@
  * them. They will need to be re-read into shared buffers on first use after
  * the build finishes.
  *
+ * Since the index will never be used unless it is completely built,
+ * from a crash-recovery point of view there is no need to WAL-log the
+ * steps of the build. After completing the index build, we can just sync
+ * the whole file to disk using smgrimmedsync() before exiting this module.
+ * This can be seen to be sufficient for crash recovery by considering that
+ * it's effectively equivalent to what would happen if a CHECKPOINT occurred
+ * just after the index build. However, it is clearly not sufficient if the
+ * DBA is using the WAL log for PITR or replication purposes, since another
+ * machine would not be able to reconstruct the index from WAL. Therefore,
+ * we log the completed index pages to WAL if and only if WAL archiving is
+ * active.
+ *
  * This code isn't concerned about the FSM at all. The caller is responsible
  * for initializing that.
 *
@@ -560,7 +572,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     wstate.heap = btspool->heap;
     wstate.index = btspool->index;
     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
-    wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
+
+    /*
+     * We need to log index creation in WAL iff WAL archiving/streaming is
+     * enabled UNLESS the index isn't WAL-logged anyway.
+     */
+    wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
 
     /* reserve the metapage */
     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
@@ -1269,15 +1286,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
     _bt_uppershutdown(wstate, state);
 
     /*
-     * When we WAL-logged index pages, we must nonetheless fsync index files.
-     * Since we're building outside shared buffers, a CHECKPOINT occurring
-     * during the build has no way to flush the previously written data to
-     * disk (indeed it won't know the index even exists). A crash later on
-     * would replay WAL from the checkpoint, therefore it wouldn't replay our
-     * earlier WAL entries. If we do not fsync those pages here, they might
-     * still not be on disk when the crash occurs.
+     * If the index is WAL-logged, we must fsync it down to disk before it's
+     * safe to commit the transaction. (For a non-WAL-logged index we don't
+     * care since the index will be uninteresting after a crash anyway.)
+     *
+     * It's obvious that we must do this when not WAL-logging the build. It's
+     * less obvious that we have to do it even if we did WAL-log the index
+     * pages. The reason is that since we're building outside shared buffers,
+     * a CHECKPOINT occurring during the build has no way to flush the
+     * previously written data to disk (indeed it won't know the index even
+     * exists). A crash later on would replay WAL from the checkpoint,
+     * therefore it wouldn't replay our earlier WAL entries. If we do not
+     * fsync those pages here, they might still not be on disk when the crash
+     * occurs.
     */
-    if (wstate->btws_use_wal)
+    if (RelationNeedsWAL(wstate->index))
     {
         RelationOpenSmgr(wstate->index);
         smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
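The comment block restored above encodes a compact protocol: write index
pages directly to the file, WAL-log them only when archiving or streaming
needs them, and always fsync before commit, because a CHECKPOINT that runs
mid-build never saw writes made outside shared buffers. A hedged sketch of
that shape, with stub helpers standing in for the real smgrwrite/log_newpage/
smgrimmedsync calls (all names here are ours):

#include <stdbool.h>

static void write_page_directly(int blkno) { (void) blkno; }  /* stub */
static void wal_log_page(int blkno) { (void) blkno; }         /* stub */
static void fsync_whole_file(void) {}                         /* stub */

/* Sketch of the nbtsort-style build-outside-shared-buffers protocol. */
static void
build_outside_shared_buffers(int npages, bool wal_needed)
{
    for (int blkno = 0; blkno < npages; blkno++)
    {
        write_page_directly(blkno);
        if (wal_needed)
            wal_log_page(blkno);    /* for PITR/replication only */
    }

    /*
     * Unconditional fsync: a checkpoint during the build never saw these
     * writes, so WAL replay from that checkpoint cannot recreate them.
     */
    fsync_whole_file();
}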
src/backend/access/rmgrdesc/gistdesc.c

@@ -80,9 +80,6 @@ gist_desc(StringInfo buf, XLogReaderState *record)
         case XLOG_GIST_PAGE_DELETE:
             out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec);
             break;
-        case XLOG_GIST_ASSIGN_LSN:
-            /* No details to write out */
-            break;
     }
 }

@@ -108,9 +105,6 @@ gist_identify(uint8 info)
         case XLOG_GIST_PAGE_DELETE:
             id = "PAGE_DELETE";
             break;
-        case XLOG_GIST_ASSIGN_LSN:
-            id = "ASSIGN_LSN";
-            break;
     }
 
     return id;
src/backend/access/transam/README

@@ -717,38 +717,6 @@ then restart recovery. This is part of the reason for not writing a WAL
 entry until we've successfully done the original action.
 
 
-Skipping WAL for New RelFileNode
---------------------------------
-
-Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK
-would unlink, in-tree access methods write no WAL for that change. Code that
-writes WAL without calling RelationNeedsWAL() must check for this case. This
-skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change
-for the same block, REDO could overwrite the WAL-skipping change. If a
-WAL-writing change followed a WAL-skipping change for the same block, a
-related problem would arise. When a WAL record contains no full-page image,
-REDO expects the page to match its contents from just before record insertion.
-A WAL-skipping change may not reach disk at all, violating REDO's expectation
-under full_page_writes=off. For any access method, CommitTransaction() writes
-and fsyncs affected blocks before recording the commit.
-
-Prefer to do the same in future access methods. However, two other approaches
-can work. First, an access method can irreversibly transition a given fork
-from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and
-smgrimmedsync(). Second, an access method can opt to write WAL
-unconditionally for permanent relations. Under these approaches, the access
-method callbacks must not call functions that react to RelationNeedsWAL().
-
-This applies only to WAL records whose replay would modify bytes stored in the
-new relfilenode. It does not apply to other records about the relfilenode,
-such as XLOG_SMGR_CREATE. Because it operates at the level of individual
-relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations.
-Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which
-ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while
-the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table
-to skip WAL, but that won't affect its indexes.
-
-
 Asynchronous Commit
 -------------------
 
@@ -852,12 +820,13 @@ Changes to a temp table are not WAL-logged, hence could reach disk in
 advance of T1's commit, but we don't care since temp table contents don't
 survive crashes anyway.
 
-Database writes that skip WAL for new relfilenodes are also safe. In these
-cases it's entirely possible for the data to reach disk before T1's commit,
-because T1 will fsync it down to disk without any sort of interlock. However,
-all these paths are designed to write data that no other transaction can see
-until after T1 commits. The situation is thus not different from ordinary
-WAL-logged updates.
+Database writes made via any of the paths we have introduced to avoid WAL
+overhead for bulk updates are also safe. In these cases it's entirely
+possible for the data to reach disk before T1's commit, because T1 will
+fsync it down to disk without any sort of interlock, as soon as it finishes
+the bulk update. However, all these paths are designed to write data that
+no other transaction can see until after T1 commits. The situation is thus
+not different from ordinary WAL-logged updates.
 
 Transaction Emulation during Recovery
 -------------------------------------
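The README section deleted above was the specification of the reverted
feature, and the contrast with the restored behavior fits in a few lines. A
simplified sketch of the two decision rules; the boolean parameters are
reductions of what RelationNeedsWAL() and XLogIsNeeded() really test, not the
actual implementations:

#include <stdbool.h>

/* Reverted scheme: under wal_level=minimal, every change to a relfilenode
 * that ROLLBACK would unlink skips WAL; commit fsyncs or WAL-logs it. */
static bool
writes_wal_skipping_scheme(bool permanent, bool minimal_wal,
                           bool new_relfilenode_in_xact)
{
    return permanent && !(minimal_wal && new_relfilenode_in_xact);
}

/* Restored scheme: individual bulk paths (COPY, CLUSTER, CREATE INDEX, ...)
 * opt out explicitly via *_SKIP_WAL options and fsync at the end. */
static bool
writes_wal_optin_scheme(bool permanent, bool caller_chose_skip_wal)
{
    return permanent && !caller_chose_skip_wal;
}

The reverted scheme made skipping mandatory and centralized the commit-time
sync; the restored scheme leaves the choice, and the fsync obligation, with
each bulk-insert caller.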
src/backend/access/transam/xact.c

@@ -2107,13 +2107,6 @@ CommitTransaction(void)
      */
     PreCommit_on_commit_actions();
 
-    /*
-     * Synchronize files that are created and not WAL-logged during this
-     * transaction. This must happen before AtEOXact_RelationMap(), so that we
-     * don't see committed-but-broken files after a crash.
-     */
-    smgrDoPendingSyncs(true);
-
     /* close large objects before lower-level cleanup */
     AtEOXact_LargeObject(true);

@@ -2347,13 +2340,6 @@ PrepareTransaction(void)
      */
     PreCommit_on_commit_actions();
 
-    /*
-     * Synchronize files that are created and not WAL-logged during this
-     * transaction. This must happen before EndPrepare(), so that we don't see
-     * committed-but-broken files after a crash and COMMIT PREPARED.
-     */
-    smgrDoPendingSyncs(true);
-
     /* close large objects before lower-level cleanup */
     AtEOXact_LargeObject(true);

@@ -2672,7 +2658,6 @@ AbortTransaction(void)
      */
     AfterTriggerEndXact(false); /* 'false' means it's abort */
     AtAbort_Portals();
-    smgrDoPendingSyncs(false);
     AtEOXact_LargeObject(false);
     AtAbort_Notify();
     AtEOXact_RelationMap(false, is_parallel_worker);
src/backend/access/transam/xlogutils.c

@@ -544,8 +544,6 @@ typedef FakeRelCacheEntryData *FakeRelCacheEntry;
 * fields related to physical storage, like rd_rel, are initialized, so the
 * fake entry is only usable in low-level operations like ReadBuffer().
 *
- * This is also used for syncing WAL-skipped files.
- *
 * Caller must free the returned entry with FreeFakeRelcacheEntry().
 */
 Relation
@@ -554,20 +552,18 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
     FakeRelCacheEntry fakeentry;
     Relation    rel;
 
+    Assert(InRecovery);
+
     /* Allocate the Relation struct and all related space in one block. */
     fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
     rel = (Relation) fakeentry;
 
     rel->rd_rel = &fakeentry->pgc;
     rel->rd_node = rnode;
-
-    /*
-     * We will never be working with temp rels during recovery or while
-     * syncing WAL-skipped files.
-     */
+    /* We will never be working with temp rels during recovery */
     rel->rd_backend = InvalidBackendId;
 
-    /* It must be a permanent table here */
+    /* It must be a permanent table if we're in recovery. */
     rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
 
     /* We don't know the name of the relation; use relfilenode instead */
@@ -576,9 +572,9 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
     /*
      * We set up the lockRelId in case anything tries to lock the dummy
      * relation. Note that this is fairly bogus since relNode may be
-     * different from the relation's OID. It shouldn't really matter though.
-     * In recovery, we are running by ourselves and can't have any lock
-     * conflicts. While syncing, we already hold AccessExclusiveLock.
+     * different from the relation's OID. It shouldn't really matter though,
+     * since we are presumably running by ourselves and can't have any lock
+     * conflicts ...
      */
     rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
     rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
src/backend/bootstrap/bootparse.y

@@ -306,8 +306,6 @@ Boot_DeclareIndexStmt:
                     stmt->idxcomment = NULL;
                     stmt->indexOid = InvalidOid;
                     stmt->oldNode = InvalidOid;
-                    stmt->oldCreateSubid = InvalidSubTransactionId;
-                    stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
                     stmt->unique = false;
                     stmt->primary = false;
                     stmt->isconstraint = false;

@@ -358,8 +356,6 @@ Boot_DeclareUniqueIndexStmt:
                     stmt->idxcomment = NULL;
                     stmt->indexOid = InvalidOid;
                     stmt->oldNode = InvalidOid;
-                    stmt->oldCreateSubid = InvalidSubTransactionId;
-                    stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
                     stmt->unique = true;
                     stmt->primary = false;
                     stmt->isconstraint = false;
src/backend/catalog/storage.c

@@ -30,13 +30,9 @@
 #include "catalog/storage_xlog.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
-#include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
-/* GUC variables */
-int         wal_skip_threshold = 2048; /* in kilobytes */
-
 /*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction. When
@@ -66,14 +62,7 @@ typedef struct PendingRelDelete
     struct PendingRelDelete *next; /* linked-list link */
 } PendingRelDelete;
 
-typedef struct pendingSync
-{
-    RelFileNode rnode;
-    bool        is_truncated;   /* Has the file experienced truncation? */
-} pendingSync;
-
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
-HTAB       *pendingSyncHash = NULL;
 
 /*
 * RelationCreateStorage
@@ -129,32 +118,6 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
     pending->next = pendingDeletes;
     pendingDeletes = pending;
 
-    /* Queue an at-commit sync. */
-    if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
-    {
-        pendingSync *pending;
-        bool        found;
-
-        /* we sync only permanent relations */
-        Assert(backend == InvalidBackendId);
-
-        if (!pendingSyncHash)
-        {
-            HASHCTL     ctl;
-
-            ctl.keysize = sizeof(RelFileNode);
-            ctl.entrysize = sizeof(pendingSync);
-            ctl.hcxt = TopTransactionContext;
-            pendingSyncHash =
-                hash_create("pending sync hash",
-                            16, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-        }
-
-        pending = hash_search(pendingSyncHash, &rnode, HASH_ENTER, &found);
-        Assert(!found);
-        pending->is_truncated = false;
-    }
-
     return srel;
 }

@@ -289,8 +252,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
     if (vm)
         visibilitymap_truncate(rel, nblocks);
 
-    RelationPreTruncate(rel);
-
     /*
      * We WAL-log the truncation before actually truncating, which means
      * trouble if the truncation fails. If we then crash, the WAL replay
@@ -333,28 +294,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
     smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
 }
 
-/*
- * RelationPreTruncate
- *      Perform AM-independent work before a physical truncation.
- *
- * If an access method's relation_nontransactional_truncate does not call
- * RelationTruncate(), it must call this before decreasing the table size.
- */
-void
-RelationPreTruncate(Relation rel)
-{
-    pendingSync *pending;
-
-    if (!pendingSyncHash)
-        return;
-    RelationOpenSmgr(rel);
-
-    pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
-                          HASH_FIND, NULL);
-    if (pending)
-        pending->is_truncated = true;
-}
-
 /*
 * Copy a fork's data, block by block.
 *
@@ -385,9 +324,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 
     /*
      * We need to log the copied data in WAL iff WAL archiving/streaming is
-     * enabled AND it's a permanent relation. This gives the same answer as
-     * "RelationNeedsWAL(rel) || copying_initfork", because we know the
-     * current operation created a new relfilenode.
+     * enabled AND it's a permanent relation.
      */
     use_wal = XLogIsNeeded() &&
         (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -429,39 +366,24 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
     }
 
     /*
-     * When we WAL-logged rel pages, we must nonetheless fsync them. The
-     * reason is that since we're copying outside shared buffers, a CHECKPOINT
-     * occurring during the copy has no way to flush the previously written
-     * data to disk (indeed it won't know the new rel even exists). A crash
-     * later on would replay WAL from the checkpoint, therefore it wouldn't
-     * replay our earlier WAL entries. If we do not fsync those pages here,
-     * they might still not be on disk when the crash occurs.
+     * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+     * to ensure that the toast table gets fsync'd too. (For a temp or
+     * unlogged rel we don't care since the data will be gone after a crash
+     * anyway.)
+     *
+     * It's obvious that we must do this when not WAL-logging the copy. It's
+     * less obvious that we have to do it even if we did WAL-log the copied
+     * pages. The reason is that since we're copying outside shared buffers, a
+     * CHECKPOINT occurring during the copy has no way to flush the previously
+     * written data to disk (indeed it won't know the new rel even exists). A
+     * crash later on would replay WAL from the checkpoint, therefore it
+     * wouldn't replay our earlier WAL entries. If we do not fsync those pages
+     * here, they might still not be on disk when the crash occurs.
     */
-    if (use_wal || copying_initfork)
+    if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
         smgrimmedsync(dst, forkNum);
 }
 
-/*
- * RelFileNodeSkippingWAL - check if a BM_PERMANENT relfilenode is using WAL
- *
- * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
- * New RelFileNode" in src/backend/access/transam/README. Though it is
- * known from Relation efficiently, this function is intended for the code
- * paths not having access to Relation.
- */
-bool
-RelFileNodeSkippingWAL(RelFileNode rnode)
-{
-    if (XLogIsNeeded())
-        return false;           /* no permanent relfilenode skips WAL */
-
-    if (!pendingSyncHash ||
-        hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
-        return false;
-
-    return true;
-}
-
 /*
 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
 *
@@ -539,144 +461,6 @@ smgrDoPendingDeletes(bool isCommit)
     }
 }
 
-/*
- * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
- */
-void
-smgrDoPendingSyncs(bool isCommit)
-{
-    PendingRelDelete *pending;
-    int         nrels = 0,
-                maxrels = 0;
-    SMgrRelation *srels = NULL;
-    HASH_SEQ_STATUS scan;
-    pendingSync *pendingsync;
-
-    if (XLogIsNeeded())
-        return;                 /* no relation can use this */
-
-    Assert(GetCurrentTransactionNestLevel() == 1);
-
-    if (!pendingSyncHash)
-        return;                 /* no relation needs sync */
-
-    /* Just throw away all pending syncs if any at rollback */
-    if (!isCommit)
-    {
-        pendingSyncHash = NULL;
-        return;
-    }
-
-    AssertPendingSyncs_RelationCache();
-
-    /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
-    for (pending = pendingDeletes; pending != NULL; pending = pending->next)
-    {
-        if (!pending->atCommit)
-            continue;
-
-        (void) hash_search(pendingSyncHash, (void *) &pending->relnode,
-                           HASH_REMOVE, NULL);
-    }
-
-    hash_seq_init(&scan, pendingSyncHash);
-    while ((pendingsync = (pendingSync *) hash_seq_search(&scan)))
-    {
-        ForkNumber  fork;
-        BlockNumber nblocks[MAX_FORKNUM + 1];
-        BlockNumber total_blocks = 0;
-        SMgrRelation srel;
-
-        srel = smgropen(pendingsync->rnode, InvalidBackendId);
-
-        /*
-         * We emit newpage WAL records for smaller relations.
-         *
-         * Small WAL records have a chance to be emitted along with other
-         * backends' WAL records. We emit WAL records instead of syncing for
-         * files that are smaller than a certain threshold, expecting faster
-         * commit. The threshold is defined by the GUC wal_skip_threshold.
-         */
-        if (!pendingsync->is_truncated)
-        {
-            for (fork = 0; fork <= MAX_FORKNUM; fork++)
-            {
-                if (smgrexists(srel, fork))
-                {
-                    BlockNumber n = smgrnblocks(srel, fork);
-
-                    /* we shouldn't come here for unlogged relations */
-                    Assert(fork != INIT_FORKNUM);
-                    nblocks[fork] = n;
-                    total_blocks += n;
-                }
-                else
-                    nblocks[fork] = InvalidBlockNumber;
-            }
-        }
-
-        /*
-         * Sync file or emit WAL records for its contents.
-         *
-         * Although we emit WAL record if the file is small enough, do file
-         * sync regardless of the size if the file has experienced a
-         * truncation. It is because the file would be followed by trailing
-         * garbage blocks after a crash recovery if, while a past longer file
-         * had been flushed out, we omitted syncing-out of the file and
-         * emitted WAL instead. You might think that we could choose WAL if
-         * the current main fork is longer than ever, but there's a case where
-         * main fork is longer than ever but FSM fork gets shorter.
-         */
-        if (pendingsync->is_truncated ||
-            total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
-        {
-            /* allocate the initial array, or extend it, if needed */
-            if (maxrels == 0)
-            {
-                maxrels = 8;
-                srels = palloc(sizeof(SMgrRelation) * maxrels);
-            }
-            else if (maxrels <= nrels)
-            {
-                maxrels *= 2;
-                srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
-            }
-
-            srels[nrels++] = srel;
-        }
-        else
-        {
-            /* Emit WAL records for all blocks. The file is small enough. */
-            for (fork = 0; fork <= MAX_FORKNUM; fork++)
-            {
-                int         n = nblocks[fork];
-                Relation    rel;
-
-                if (!BlockNumberIsValid(n))
-                    continue;
-
-                /*
-                 * Emit WAL for the whole file. Unfortunately we don't know
-                 * what kind of a page this is, so we have to log the full
-                 * page including any unused space. ReadBufferExtended()
-                 * counts some pgstat events; unfortunately, we discard them.
-                 */
-                rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
-                log_newpage_range(rel, fork, 0, n, false);
-                FreeFakeRelcacheEntry(rel);
-            }
-        }
-    }
-
-    pendingSyncHash = NULL;
-
-    if (nrels > 0)
-    {
-        smgrdosyncall(srels, nrels);
-        pfree(srels);
-    }
-}
-
 /*
 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
 *
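The sync-or-WAL choice in the removed smgrDoPendingSyncs() hinges on simple
size arithmetic. A standalone sketch of just that test; BLCKSZ 8192 and the
2048kB threshold are the stock defaults, while the function name is ours:

#include <stdbool.h>
#include <stdint.h>

#define BLCKSZ 8192             /* default PostgreSQL page size */

/* Mirrors the removed test: WAL-log small files at commit, fsync large or
 * truncated ones. */
static bool
should_fsync_at_commit(uint32_t total_blocks, bool was_truncated,
                       uint64_t wal_skip_threshold_kb)
{
    uint64_t kb = (uint64_t) total_blocks * BLCKSZ / 1024;

    /* e.g. 256 blocks * 8kB = 2048kB, exactly the default threshold */
    return was_truncated || kb >= wal_skip_threshold_kb;
}

Truncation forces the fsync path because an earlier, longer version of the
file may already have been flushed; replaying newpage records alone would
leave stale trailing blocks behind after crash recovery.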
src/backend/commands/cluster.c

@@ -1111,25 +1111,6 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
         *mapped_tables++ = r2;
     }
 
-    /*
-     * Recognize that rel1's relfilenode (swapped from rel2) is new in this
-     * subtransaction. The rel2 storage (swapped from rel1) may or may not be
-     * new.
-     */
-    {
-        Relation    rel1,
-                    rel2;
-
-        rel1 = relation_open(r1, NoLock);
-        rel2 = relation_open(r2, NoLock);
-        rel2->rd_createSubid = rel1->rd_createSubid;
-        rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
-        rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
-        RelationAssumeNewRelfilenode(rel1);
-        relation_close(rel1, NoLock);
-        relation_close(rel2, NoLock);
-    }
-
     /*
      * In the case of a shared catalog, these next few steps will only affect
      * our own database's pg_class row; but that's okay, because they are all
src/backend/commands/copy.c

@@ -2715,15 +2715,63 @@ CopyFrom(CopyState cstate)
                      RelationGetRelationName(cstate->rel))));
     }
 
-    /*
-     * If the target file is new-in-transaction, we assume that checking FSM
-     * for free space is a waste of time. This could possibly be wrong, but
-     * it's unlikely.
+    /*----------
+     * Check to see if we can avoid writing WAL
+     *
+     * If archive logging/streaming is not enabled *and* either
+     *  - table was created in same transaction as this COPY
+     *  - data is being written to relfilenode created in this transaction
+     * then we can skip writing WAL. It's safe because if the transaction
+     * doesn't commit, we'll discard the table (or the new relfilenode file).
+     * If it does commit, we'll have done the table_finish_bulk_insert() at
+     * the bottom of this routine first.
+     *
+     * As mentioned in comments in utils/rel.h, the in-same-transaction test
+     * is not always set correctly, since in rare cases rd_newRelfilenodeSubid
+     * can be cleared before the end of the transaction. The exact case is
+     * when a relation sets a new relfilenode twice in same transaction, yet
+     * the second one fails in an aborted subtransaction, e.g.
+     *
+     * BEGIN;
+     * TRUNCATE t;
+     * SAVEPOINT save;
+     * TRUNCATE t;
+     * ROLLBACK TO save;
+     * COPY ...
+     *
+     * Also, if the target file is new-in-transaction, we assume that checking
+     * FSM for free space is a waste of time, even if we must use WAL because
+     * of archiving. This could possibly be wrong, but it's unlikely.
      *
      * The comments for table_tuple_insert and RelationGetBufferForTuple
      * specify that skipping WAL logging is only safe if we ensure that our
      * tuples do not go into pages containing tuples from any other
      * transactions --- but this must be the case if we have a new table or
      * new relfilenode, so we need no additional work to enforce that.
     *
     * We currently don't support this optimization if the COPY target is a
     * partitioned table as we currently only lazily initialize partition
     * information when routing the first tuple to the partition. We cannot
     * know at this stage if we can perform this optimization. It should be
     * possible to improve on this, but it does mean maintaining heap insert
     * option flags per partition and setting them when we first open the
     * partition.
     *
     * This optimization is not supported for relation types which do not
     * have any physical storage, with foreign tables and views using
     * INSTEAD OF triggers entering in this category. Partitioned tables
     * are not supported as per the description above.
+     *----------
     */
     /* createSubid is creation check, newRelfilenodeSubid is truncation check */
     if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) &&
         (cstate->rel->rd_createSubid != InvalidSubTransactionId ||
-         cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId))
+         cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId))
+    {
         ti_options |= TABLE_INSERT_SKIP_FSM;
+        if (!XLogIsNeeded())
+            ti_options |= TABLE_INSERT_SKIP_WAL;
+    }
 
     /*
      * Optimize if new relfilenode was created in this subxact or one of its
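Restated compactly, the restored CopyFrom() option logic reduces to a small
pure function. The flag bits below are illustrative placeholders; the real
constants live in access/tableam.h:

#include <stdbool.h>

#define SKIP_WAL 0x0001         /* illustrative stand-in for TABLE_INSERT_SKIP_WAL */
#define SKIP_FSM 0x0002         /* illustrative stand-in for TABLE_INSERT_SKIP_FSM */

/* Sketch of the restored CopyFrom() option computation. */
static int
copy_insert_options(bool created_in_xact, bool new_relfilenode_in_xact,
                    bool archiving_or_streaming)
{
    int options = 0;

    if (created_in_xact || new_relfilenode_in_xact)
    {
        options |= SKIP_FSM;    /* fresh file: FSM lookups are wasted work */
        if (!archiving_or_streaming)
            options |= SKIP_WAL;    /* crash safety comes from the
                                     * commit-time heap_sync() instead */
    }
    return options;
}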
src/backend/commands/createas.c

@@ -553,13 +553,16 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
     myState->rel = intoRelationDesc;
     myState->reladdr = intoRelationAddr;
     myState->output_cid = GetCurrentCommandId(true);
-    myState->ti_options = TABLE_INSERT_SKIP_FSM;
-    myState->bistate = GetBulkInsertState();
 
     /*
-     * Valid smgr_targblock implies something already wrote to the relation.
-     * This may be harmless, but this function hasn't planned for it.
+     * We can skip WAL-logging the insertions, unless PITR or streaming
+     * replication is in use. We can skip the FSM in any case.
     */
+    myState->ti_options = TABLE_INSERT_SKIP_FSM |
+        (XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL);
+    myState->bistate = GetBulkInsertState();
+
+    /* Not using WAL requires smgr_targblock be initially invalid */
     Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber);
 }
src/backend/commands/indexcmds.c

@@ -1195,8 +1195,6 @@ DefineIndex(Oid relationId,
             childStmt->relation = NULL;
             childStmt->indexOid = InvalidOid;
             childStmt->oldNode = InvalidOid;
-            childStmt->oldCreateSubid = InvalidSubTransactionId;
-            childStmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
 
             /*
              * Adjust any Vars (both in expressions and in the index's
src/backend/commands/matview.c

@@ -457,13 +457,17 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
      */
     myState->transientrel = transientrel;
     myState->output_cid = GetCurrentCommandId(true);
-    myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN;
-    myState->bistate = GetBulkInsertState();
 
     /*
-     * Valid smgr_targblock implies something already wrote to the relation.
-     * This may be harmless, but this function hasn't planned for it.
+     * We can skip WAL-logging the insertions, unless PITR or streaming
+     * replication is in use. We can skip the FSM in any case.
     */
+    myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN;
+    if (!XLogIsNeeded())
+        myState->ti_options |= TABLE_INSERT_SKIP_WAL;
+    myState->bistate = GetBulkInsertState();
+
+    /* Not using WAL requires smgr_targblock be initially invalid */
     Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber);
 }
src/backend/commands/tablecmds.c

@@ -4785,14 +4785,19 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
         newrel = NULL;
 
     /*
-     * Prepare a BulkInsertState and options for table_tuple_insert. The FSM
-     * is empty, so don't bother using it.
+     * Prepare a BulkInsertState and options for table_tuple_insert. Because
+     * we're building a new heap, we can skip WAL-logging and fsync it to disk
+     * at the end instead (unless WAL-logging is required for archiving or
+     * streaming replication). The FSM is empty too, so don't bother using it.
     */
     if (newrel)
     {
         mycid = GetCurrentCommandId(true);
         bistate = GetBulkInsertState();
+
         ti_options = TABLE_INSERT_SKIP_FSM;
+        if (!XLogIsNeeded())
+            ti_options |= TABLE_INSERT_SKIP_WAL;
     }
     else
     {
@@ -7300,19 +7305,14 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
 
     /*
      * If TryReuseIndex() stashed a relfilenode for us, we used it for the new
-     * index instead of building from scratch. Restore associated fields.
-     * This may store InvalidSubTransactionId in both fields, in which case
-     * relcache.c will assume it can rebuild the relcache entry. Hence, do
-     * this after the CCI that made catalog rows visible to any rebuild. The
-     * DROP of the old edition of this index will have scheduled the storage
-     * for deletion at commit, so cancel that pending deletion.
+     * index instead of building from scratch. The DROP of the old edition of
+     * this index will have scheduled the storage for deletion at commit, so
+     * cancel that pending deletion.
     */
     if (OidIsValid(stmt->oldNode))
     {
         Relation    irel = index_open(address.objectId, NoLock);
 
-        irel->rd_createSubid = stmt->oldCreateSubid;
-        irel->rd_firstRelfilenodeSubid = stmt->oldFirstRelfilenodeSubid;
         RelationPreserveStorage(irel->rd_node, true);
         index_close(irel, NoLock);
     }
@@ -11619,11 +11619,7 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt)
 
         /* If it's a partitioned index, there is no storage to share. */
         if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
-        {
             stmt->oldNode = irel->rd_node.relNode;
-            stmt->oldCreateSubid = irel->rd_createSubid;
-            stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid;
-        }
         index_close(irel, NoLock);
     }
 }
@@ -12557,8 +12553,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
 
     table_close(pg_class, RowExclusiveLock);
 
-    RelationAssumeNewRelfilenode(rel);
-
     relation_close(rel, NoLock);
 
     /* Make sure the reltablespace change is visible */
src/backend/nodes/copyfuncs.c

@@ -3469,8 +3469,6 @@ _copyIndexStmt(const IndexStmt *from)
     COPY_STRING_FIELD(idxcomment);
     COPY_SCALAR_FIELD(indexOid);
     COPY_SCALAR_FIELD(oldNode);
-    COPY_SCALAR_FIELD(oldCreateSubid);
-    COPY_SCALAR_FIELD(oldFirstRelfilenodeSubid);
     COPY_SCALAR_FIELD(unique);
     COPY_SCALAR_FIELD(primary);
     COPY_SCALAR_FIELD(isconstraint);

src/backend/nodes/equalfuncs.c

@@ -1339,8 +1339,6 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b)
     COMPARE_STRING_FIELD(idxcomment);
     COMPARE_SCALAR_FIELD(indexOid);
     COMPARE_SCALAR_FIELD(oldNode);
-    COMPARE_SCALAR_FIELD(oldCreateSubid);
-    COMPARE_SCALAR_FIELD(oldFirstRelfilenodeSubid);
     COMPARE_SCALAR_FIELD(unique);
     COMPARE_SCALAR_FIELD(primary);
     COMPARE_SCALAR_FIELD(isconstraint);

src/backend/nodes/outfuncs.c

@@ -2642,8 +2642,6 @@ _outIndexStmt(StringInfo str, const IndexStmt *node)
     WRITE_STRING_FIELD(idxcomment);
     WRITE_OID_FIELD(indexOid);
     WRITE_OID_FIELD(oldNode);
-    WRITE_UINT_FIELD(oldCreateSubid);
-    WRITE_UINT_FIELD(oldFirstRelfilenodeSubid);
     WRITE_BOOL_FIELD(unique);
     WRITE_BOOL_FIELD(primary);
     WRITE_BOOL_FIELD(isconstraint);
src/backend/parser/gram.y

@@ -7357,8 +7357,6 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name
                     n->idxcomment = NULL;
                     n->indexOid = InvalidOid;
                     n->oldNode = InvalidOid;
-                    n->oldCreateSubid = InvalidSubTransactionId;
-                    n->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
                     n->primary = false;
                     n->isconstraint = false;
                     n->deferrable = false;
@@ -7387,8 +7385,6 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name
                     n->idxcomment = NULL;
                     n->indexOid = InvalidOid;
                     n->oldNode = InvalidOid;
-                    n->oldCreateSubid = InvalidSubTransactionId;
-                    n->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
                     n->primary = false;
                     n->isconstraint = false;
                     n->deferrable = false;
src/backend/parser/parse_utilcmd.c

@@ -1401,8 +1401,6 @@ generateClonedIndexStmt(RangeVar *heapRel, Relation source_idx,
     index->idxcomment = NULL;
     index->indexOid = InvalidOid;
     index->oldNode = InvalidOid;
-    index->oldCreateSubid = InvalidSubTransactionId;
-    index->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
     index->unique = idxrec->indisunique;
     index->primary = idxrec->indisprimary;
     index->transformed = true;  /* don't need transformIndexStmt */
@@ -2001,8 +1999,6 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
     index->idxcomment = NULL;
     index->indexOid = InvalidOid;
     index->oldNode = InvalidOid;
-    index->oldCreateSubid = InvalidSubTransactionId;
-    index->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
     index->transformed = false;
     index->concurrent = false;
     index->if_not_exists = false;
src/backend/storage/buffer/bufmgr.c

@@ -66,7 +66,7 @@
 #define BUF_WRITTEN             0x01
 #define BUF_REUSABLE            0x02
 
-#define RELS_BSEARCH_THRESHOLD      20
+#define DROP_RELS_BSEARCH_THRESHOLD     20
 
 typedef struct PrivateRefCountEntry
 {
@@ -105,19 +105,6 @@ typedef struct CkptTsStatus
     int         index;
 } CkptTsStatus;
 
-/*
- * Type for array used to sort SMgrRelations
- *
- * FlushRelationsAllBuffers shares the same comparator function with
- * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
- * compatible.
- */
-typedef struct SMgrSortArray
-{
-    RelFileNode rnode;          /* This must be the first member */
-    SMgrRelation srel;
-} SMgrSortArray;
-
 /* GUC variables */
 bool        zero_damaged_pages = false;
 int         bgwriter_lru_maxpages = 100;
@@ -3033,7 +3020,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
     * an exactly determined value, as it depends on many factors (CPU and RAM
     * speeds, amount of shared buffers etc.).
     */
-    use_bsearch = n > RELS_BSEARCH_THRESHOLD;
+    use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
 
     /* sort the list of rnodes if necessary */
     if (use_bsearch)
@@ -3283,104 +3270,6 @@ FlushRelationBuffers(Relation rel)
     }
 }
 
-/* ---------------------------------------------------------------------
- * FlushRelationsAllBuffers
- *
- *      This function flushes out of the buffer pool all the pages of all
- *      forks of the specified smgr relations. It's equivalent to calling
- *      FlushRelationBuffers once per fork per relation. The relations are
- *      assumed not to use local buffers.
- * --------------------------------------------------------------------
- */
-void
-FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
-{
-    int         i;
-    SMgrSortArray *srels;
-    bool        use_bsearch;
-
-    if (nrels == 0)
-        return;
-
-    /* fill-in array for qsort */
-    srels = palloc(sizeof(SMgrSortArray) * nrels);
-
-    for (i = 0; i < nrels; i++)
-    {
-        Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
-
-        srels[i].rnode = smgrs[i]->smgr_rnode.node;
-        srels[i].srel = smgrs[i];
-    }
-
-    /*
-     * Save the bsearch overhead for low number of relations to sync. See
-     * DropRelFileNodesAllBuffers for details.
-     */
-    use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
-
-    /* sort the list of SMgrRelations if necessary */
-    if (use_bsearch)
-        pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
-
-    /* Make sure we can handle the pin inside the loop */
-    ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
-
-    for (i = 0; i < NBuffers; i++)
-    {
-        SMgrSortArray *srelent = NULL;
-        BufferDesc *bufHdr = GetBufferDescriptor(i);
-        uint32      buf_state;
-
-        /*
-         * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
-         * and saves some cycles.
-         */
-
-        if (!use_bsearch)
-        {
-            int         j;
-
-            for (j = 0; j < nrels; j++)
-            {
-                if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
-                {
-                    srelent = &srels[j];
-                    break;
-                }
-            }
-
-        }
-        else
-        {
-            srelent = bsearch((const void *) &(bufHdr->tag.rnode),
-                              srels, nrels, sizeof(SMgrSortArray),
-                              rnode_comparator);
-        }
-
-        /* buffer doesn't belong to any of the given relfilenodes; skip it */
-        if (srelent == NULL)
-            continue;
-
-        ReservePrivateRefCountEntry();
-
-        buf_state = LockBufHdr(bufHdr);
-        if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
-            (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
-        {
-            PinBuffer_Locked(bufHdr);
-            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-            FlushBuffer(bufHdr, srelent->srel);
-            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
-            UnpinBuffer(bufHdr, true);
-        }
-        else
-            UnlockBufHdr(bufHdr, buf_state);
-    }
-
-    pfree(srels);
-}
-
 /* ---------------------------------------------------------------------
 * FlushDatabaseBuffers
 *
@@ -3582,15 +3471,13 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
         (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
     {
         /*
-         * If we must not write WAL, due to a relfilenode-specific
-         * condition or being in recovery, don't dirty the page. We can
-         * set the hint, just not dirty the page as a result so the hint
-         * is lost when we evict the page or shutdown.
+         * If we're in recovery we cannot dirty a page because of a hint.
+         * We can set the hint, just not dirty the page as a result so the
+         * hint is lost when we evict the page or shutdown.
         *
         * See src/backend/storage/page/README for longer discussion.
         */
-        if (RecoveryInProgress() ||
-            RelFileNodeSkippingWAL(bufHdr->tag.rnode))
+        if (RecoveryInProgress())
             return;
 
         /*
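The deleted FlushRelationsAllBuffers followed the same scan shape as
DropRelFileNodesAllBuffers: one pass over every shared buffer, matching each
buffer to a target relation by linear probe when the target list is short and
by binary search past a threshold. A toy version of that dispatch, with
simplified types and names of our own choosing (targets must be pre-sorted
for the bsearch branch):

#include <stdlib.h>
#include <string.h>

typedef struct { unsigned spc, db, rel; } FileTag;  /* toy RelFileNode */

static int
tag_cmp(const void *a, const void *b)
{
    return memcmp(a, b, sizeof(FileTag));
}

/* Returns the matching target, or NULL. Sorting plus bsearch only pays off
 * once the target list is larger than the threshold (20 in the real code). */
static const FileTag *
match_buffer(const FileTag *buf, const FileTag *targets, int n, int threshold)
{
    if (n > threshold)
        return bsearch(buf, targets, n, sizeof(FileTag), tag_cmp);

    for (int i = 0; i < n; i++)
        if (tag_cmp(buf, &targets[i]) == 0)
            return &targets[i];
    return NULL;
}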
src/backend/storage/lmgr/lock.c

@@ -587,18 +587,6 @@ LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode)
     return (locallock && locallock->nLocks > 0);
 }
 
-#ifdef USE_ASSERT_CHECKING
-/*
- * GetLockMethodLocalHash -- return the hash of local locks, for modules that
- * evaluate assertions based on all locks held.
- */
-HTAB *
-GetLockMethodLocalHash(void)
-{
-    return LockMethodLocalHash;
-}
-#endif
-
 /*
 * LockHasWaiters -- look up 'locktag' and check if releasing this
 * lock would wake up other processes waiting for it.
src/backend/storage/smgr/md.c

@@ -234,10 +234,11 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
-* file after populating it (as we do at wal_level=minimal), the contents of
-* the file would be lost forever. By leaving the empty file until after the
-* next checkpoint, we prevent reassignment of the relfilenode number until
-* it's safe, because relfilenode assignment skips over any existing file.
+* file after populating it (as for instance CLUSTER and CREATE INDEX do),
+* the contents of the file would be lost forever. By leaving the empty file
+* until after the next checkpoint, we prevent reassignment of the relfilenode
+* number until it's safe, because relfilenode assignment skips over any
+* existing file.
 *
 * We do not need to go through this dance for temp relations, though, because
 * we never make WAL entries for temp rels, and so a temp rel poses no threat
@@ -851,18 +852,12 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 * mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 *      Note that only writes already issued are synced; this routine knows
-*      nothing of dirty buffers that may exist inside the buffer manager. We
-*      sync active and inactive segments; smgrDoPendingSyncs() relies on this.
-*      Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
-*      some segment, then mdtruncate() renders that segment inactive. If we
-*      crash before the next checkpoint syncs the newly-inactive segment, that
-*      segment may survive recovery, reintroducing unwanted data into the table.
+*      nothing of dirty buffers that may exist inside the buffer manager.
 */
 void
 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 {
     int         segno;
-    int         min_inactive_seg;
 
     /*
      * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -870,16 +865,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
     */
     mdnblocks(reln, forknum);
 
-    min_inactive_seg = segno = reln->md_num_open_segs[forknum];
-
-    /*
-     * Temporarily open inactive segments, then close them after sync. There
-     * may be some inactive segments left opened after fsync() error, but that
-     * is harmless. We don't bother to clean them up and take a risk of
-     * further trouble. The next mdclose() will soon close them.
-     */
-    while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
-        segno++;
+    segno = reln->md_num_open_segs[forknum];
 
     while (segno > 0)
     {
@@ -890,14 +876,6 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
                     (errcode_for_file_access(),
                      errmsg("could not fsync file \"%s\": %m",
                             FilePathName(v->mdfd_vfd))));
-
-        /* Close inactive segments immediately */
-        if (segno > min_inactive_seg)
-        {
-            FileClose(v->mdfd_vfd);
-            _fdvec_resize(reln, forknum, segno - 1);
-        }
-
         segno--;
     }
 }
@ -416,41 +416,6 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
|
||||
smgrsw[which].smgr_unlink(rnode, InvalidForkNumber, isRedo);
|
||||
}
|
||||
|
||||
/*
|
||||
* smgrdosyncall() -- Immediately sync all forks of all given relations
|
||||
*
|
||||
* All forks of all given relations are synced out to the store.
|
||||
*
|
||||
* This is equivalent to FlushRelationBuffers() for each smgr relation,
|
||||
* then calling smgrimmedsync() for all forks of each relation, but it's
|
||||
* significantly quicker so should be preferred when possible.
|
||||
*/
|
||||
void
|
||||
smgrdosyncall(SMgrRelation *rels, int nrels)
|
||||
{
|
||||
int i = 0;
|
||||
ForkNumber forknum;
|
||||
|
||||
if (nrels == 0)
|
||||
return;
|
||||
|
||||
FlushRelationsAllBuffers(rels, nrels);
|
||||
|
||||
/*
|
||||
* Sync the physical file(s).
|
||||
*/
|
||||
for (i = 0; i < nrels; i++)
|
||||
{
|
||||
int which = rels[i]->smgr_which;
|
||||
|
||||
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||
{
|
||||
if (smgrsw[which].smgr_exists(rels[i], forknum))
|
||||
smgrsw[which].smgr_immedsync(rels[i], forknum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* smgrdounlinkall() -- Immediately unlink all forks of all given relations
|
||||
*
|
||||
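
smgrdosyncall(), removed above, was the batch sync entry point the reverted patch invoked at commit time from smgrDoPendingSyncs() in catalog/storage.c. A hedged sketch of that call pattern, with collect_pending_syncs() as a hypothetical stand-in for walking the pending-sync hash table:

#include "postgres.h"
#include "storage/smgr.h"

/* Hypothetical: fills srels[] from the pending-sync hash, returns count. */
extern int	collect_pending_syncs(SMgrRelation *srels, int maxrels);

/*
 * Sketch only: gather the SMgrRelations whose new relfilenodes chose
 * fsync over WAL-logging, then hand the whole batch to smgrdosyncall()
 * so buffers are flushed and files synced in one pass.
 */
static void
sync_pending_relations(void)
{
	SMgrRelation srels[64];
	int			nrels = collect_pending_syncs(srels, lengthof(srels));

	if (nrels > 0)
		smgrdosyncall(srels, nrels);
}
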
src/backend/utils/cache/relcache.c
@ -263,9 +263,6 @@ static void RelationReloadIndexInfo(Relation relation);
static void RelationReloadNailed(Relation relation);
static void RelationFlushRelation(Relation relation);
static void RememberToFreeTupleDescAtEOX(TupleDesc td);
#ifdef USE_ASSERT_CHECKING
static void AssertPendingSyncConsistency(Relation relation);
#endif
static void AtEOXact_cleanup(Relation relation, bool isCommit);
static void AtEOSubXact_cleanup(Relation relation, bool isCommit,
SubTransactionId mySubid, SubTransactionId parentSubid);
@ -1100,8 +1097,6 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
relation->rd_isnailed = false;
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_droppedSubid = InvalidSubTransactionId;
switch (relation->rd_rel->relpersistence)
{
case RELPERSISTENCE_UNLOGGED:
@ -1835,8 +1830,6 @@ formrdesc(const char *relationName, Oid relationReltype,
relation->rd_isnailed = true;
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_droppedSubid = InvalidSubTransactionId;
relation->rd_backend = InvalidBackendId;
relation->rd_islocaltemp = false;

@ -2009,13 +2002,6 @@ RelationIdGetRelation(Oid relationId)

if (RelationIsValid(rd))
{
/* return NULL for dropped relations */
if (rd->rd_droppedSubid != InvalidSubTransactionId)
{
Assert(!rd->rd_isvalid);
return NULL;
}

RelationIncrementReferenceCount(rd);
/* revalidate cache entry if necessary */
if (!rd->rd_isvalid)
@ -2109,7 +2095,7 @@ RelationClose(Relation relation)
#ifdef RELCACHE_FORCE_RELEASE
if (RelationHasReferenceCountZero(relation) &&
relation->rd_createSubid == InvalidSubTransactionId &&
relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
relation->rd_newRelfilenodeSubid == InvalidSubTransactionId)
RelationClearRelation(relation, false);
#endif
}
@ -2148,11 +2134,10 @@ RelationReloadIndexInfo(Relation relation)
HeapTuple pg_class_tuple;
Form_pg_class relp;

/* Should be called only for invalidated, live indexes */
/* Should be called only for invalidated indexes */
Assert((relation->rd_rel->relkind == RELKIND_INDEX ||
relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) &&
!relation->rd_isvalid &&
relation->rd_droppedSubid == InvalidSubTransactionId);
!relation->rd_isvalid);

/* Ensure it's closed at smgr level */
RelationCloseSmgr(relation);
@ -2448,13 +2433,6 @@ RelationClearRelation(Relation relation, bool rebuild)
return;
}

/* Mark it invalid until we've finished rebuild */
relation->rd_isvalid = false;

/* See RelationForgetRelation(). */
if (relation->rd_droppedSubid != InvalidSubTransactionId)
return;

/*
* Even non-system indexes should not be blown away if they are open and
* have valid index support information. This avoids problems with active
@ -2467,11 +2445,15 @@ RelationClearRelation(Relation relation, bool rebuild)
relation->rd_refcnt > 0 &&
relation->rd_indexcxt != NULL)
{
relation->rd_isvalid = false; /* needs to be revalidated */
if (IsTransactionState())
RelationReloadIndexInfo(relation);
return;
}

/* Mark it invalid until we've finished rebuild */
relation->rd_isvalid = false;

/*
* If we're really done with the relcache entry, blow it away. But if
* someone is still using it, reconstruct the whole deal without moving
@ -2529,13 +2511,13 @@ RelationClearRelation(Relation relation, bool rebuild)
* problem.
*
* When rebuilding an open relcache entry, we must preserve ref count,
* rd_*Subid, and rd_toastoid state. Also attempt to preserve the
* pg_class entry (rd_rel), tupledesc, rewrite-rule, partition key,
* and partition descriptor substructures in place, because various
* places assume that these structures won't move while they are
* working with an open relcache entry. (Note: the refcount
* mechanism for tupledescs might someday allow us to remove this hack
* for the tupledesc.)
* rd_createSubid/rd_newRelfilenodeSubid, and rd_toastoid state. Also
* attempt to preserve the pg_class entry (rd_rel), tupledesc,
* rewrite-rule, partition key, and partition descriptor substructures
* in place, because various places assume that these structures won't
* move while they are working with an open relcache entry. (Note:
* the refcount mechanism for tupledescs might someday allow us to
* remove this hack for the tupledesc.)
*
* Note that this process does not touch CurrentResourceOwner; which
* is good because whatever ref counts the entry may have do not
@ -2619,8 +2601,6 @@ RelationClearRelation(Relation relation, bool rebuild)
/* creation sub-XIDs must be preserved */
SWAPFIELD(SubTransactionId, rd_createSubid);
SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid);
SWAPFIELD(SubTransactionId, rd_firstRelfilenodeSubid);
SWAPFIELD(SubTransactionId, rd_droppedSubid);
/* un-swap rd_rel pointers, swap contents instead */
SWAPFIELD(Form_pg_class, rd_rel);
/* ... but actually, we don't have to update newrel->rd_rel */
@ -2688,12 +2668,12 @@ static void
RelationFlushRelation(Relation relation)
{
if (relation->rd_createSubid != InvalidSubTransactionId ||
relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
{
/*
* New relcache entries are always rebuilt, not flushed; else we'd
* forget the "new" status of the relation. Ditto for the
* new-relfilenode status.
* forget the "new" status of the relation, which is a useful
* optimization to have. Ditto for the new-relfilenode status.
*
* The rel could have zero refcnt here, so temporarily increment the
* refcnt to ensure it's safe to rebuild it. We can assume that the
@ -2715,7 +2695,10 @@ RelationFlushRelation(Relation relation)
}

/*
* RelationForgetRelation - caller reports that it dropped the relation
* RelationForgetRelation - unconditionally remove a relcache entry
*
* External interface for destroying a relcache entry when we
* drop the relation.
*/
void
RelationForgetRelation(Oid rid)
@ -2730,19 +2713,7 @@ RelationForgetRelation(Oid rid)
if (!RelationHasReferenceCountZero(relation))
elog(ERROR, "relation %u is still open", rid);

Assert(relation->rd_droppedSubid == InvalidSubTransactionId);
if (relation->rd_createSubid != InvalidSubTransactionId ||
relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
{
/*
* In the event of subtransaction rollback, we must not forget
* rd_*Subid. Mark the entry "dropped" so RelationClearRelation()
* invalidates it in lieu of destroying it. (If we're in a top
* transaction, we could opt to destroy the entry.)
*/
relation->rd_droppedSubid = GetCurrentSubTransactionId();
}

/* Unconditionally destroy the relcache entry */
RelationClearRelation(relation, false);
}

@ -2782,10 +2753,11 @@ RelationCacheInvalidateEntry(Oid relationId)
* relation cache and re-read relation mapping data.
*
* This is currently used only to recover from SI message buffer overflow,
* so we do not touch relations having new-in-transaction relfilenodes; they
* cannot be targets of cross-backend SI updates (and our own updates now go
* through a separate linked list that isn't limited by the SI message
* buffer size).
* so we do not touch new-in-transaction relations; they cannot be targets
* of cross-backend SI updates (and our own updates now go through a
* separate linked list that isn't limited by the SI message buffer size).
* Likewise, we need not discard new-relfilenode-in-transaction hints,
* since any invalidation of those would be a local event.
*
* We do this in two phases: the first pass deletes deletable items, and
* the second one rebuilds the rebuildable items. This is essential for
@ -2836,7 +2808,7 @@ RelationCacheInvalidate(void)
* pending invalidations.
*/
if (relation->rd_createSubid != InvalidSubTransactionId ||
relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
continue;

relcacheInvalsReceived++;
@ -2948,84 +2920,6 @@ RememberToFreeTupleDescAtEOX(TupleDesc td)
EOXactTupleDescArray[NextEOXactTupleDescNum++] = td;
}

#ifdef USE_ASSERT_CHECKING
static void
AssertPendingSyncConsistency(Relation relation)
{
bool relcache_verdict =
relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&
((relation->rd_createSubid != InvalidSubTransactionId &&
RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) ||
relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId);

Assert(relcache_verdict == RelFileNodeSkippingWAL(relation->rd_node));

if (relation->rd_droppedSubid != InvalidSubTransactionId)
Assert(!relation->rd_isvalid &&
(relation->rd_createSubid != InvalidSubTransactionId ||
relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId));
}

/*
* AssertPendingSyncs_RelationCache
*
* Assert that relcache.c and storage.c agree on whether to skip WAL.
*/
void
AssertPendingSyncs_RelationCache(void)
{
HASH_SEQ_STATUS status;
LOCALLOCK *locallock;
Relation *rels;
int maxrels;
int nrels;
RelIdCacheEnt *idhentry;
int i;

/*
* Open every relation that this transaction has locked. If, for some
* relation, storage.c is skipping WAL and relcache.c is not skipping WAL,
* a CommandCounterIncrement() typically yields a local invalidation
* message that destroys the relcache entry. By recreating such entries
* here, we detect the problem.
*/
PushActiveSnapshot(GetTransactionSnapshot());
maxrels = 1;
rels = palloc(maxrels * sizeof(*rels));
nrels = 0;
hash_seq_init(&status, GetLockMethodLocalHash());
while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
{
Oid relid;
Relation r;

if (locallock->nLocks <= 0)
continue;
if ((LockTagType) locallock->tag.lock.locktag_type !=
LOCKTAG_RELATION)
continue;
relid = ObjectIdGetDatum(locallock->tag.lock.locktag_field2);
r = RelationIdGetRelation(relid);
if (!RelationIsValid(r))
continue;
if (nrels >= maxrels)
{
maxrels *= 2;
rels = repalloc(rels, maxrels * sizeof(*rels));
}
rels[nrels++] = r;
}

hash_seq_init(&status, RelationIdCache);
while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
AssertPendingSyncConsistency(idhentry->reldesc);

for (i = 0; i < nrels; i++)
RelationClose(rels[i]);
PopActiveSnapshot();
}
#endif

/*
* AtEOXact_RelationCache
*
@ -3108,8 +3002,6 @@ AtEOXact_RelationCache(bool isCommit)
static void
AtEOXact_cleanup(Relation relation, bool isCommit)
{
bool clear_relcache = false;

/*
* The relcache entry's ref count should be back to its normal
* not-in-a-transaction state: 0 unless it's nailed in cache.
@ -3135,31 +3027,17 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
#endif

/*
* Is the relation live after this transaction ends?
* Is it a relation created in the current transaction?
*
* During commit, clear the relcache entry if it is preserved after
* relation drop, in order not to orphan the entry. During rollback,
* clear the relcache entry if the relation is created in the current
* transaction since it isn't interesting any longer once we are out of
* the transaction.
* During commit, reset the flag to zero, since we are now out of the
* creating transaction. During abort, simply delete the relcache entry
* --- it isn't interesting any longer.
*/
clear_relcache =
(isCommit ?
relation->rd_droppedSubid != InvalidSubTransactionId :
relation->rd_createSubid != InvalidSubTransactionId);

/*
* Since we are now out of the transaction, reset the subids to zero.
* That also lets RelationClearRelation() drop the relcache entry.
*/
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_droppedSubid = InvalidSubTransactionId;

if (clear_relcache)
if (relation->rd_createSubid != InvalidSubTransactionId)
{
if (RelationHasReferenceCountZero(relation))
if (isCommit)
relation->rd_createSubid = InvalidSubTransactionId;
else if (RelationHasReferenceCountZero(relation))
{
RelationClearRelation(relation, false);
return;
@ -3174,10 +3052,16 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
* eventually. This must be just a WARNING to avoid
* error-during-error-recovery loops.
*/
relation->rd_createSubid = InvalidSubTransactionId;
elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
RelationGetRelationName(relation));
}
}

/*
* Likewise, reset the hint about the relfilenode being new.
*/
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
}

/*
@ -3241,28 +3125,15 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
/*
* Is it a relation created in the current subtransaction?
*
* During subcommit, mark it as belonging to the parent, instead, as long
* as it has not been dropped. Otherwise simply delete the relcache entry.
* --- it isn't interesting any longer.
* During subcommit, mark it as belonging to the parent, instead. During
* subabort, simply delete the relcache entry.
*/
if (relation->rd_createSubid == mySubid)
{
/*
* Valid rd_droppedSubid means the corresponding relation is dropped
* but the relcache entry is preserved for at-commit pending sync. We
* need to drop it explicitly here not to make the entry orphan.
*/
Assert(relation->rd_droppedSubid == mySubid ||
relation->rd_droppedSubid == InvalidSubTransactionId);
if (isCommit && relation->rd_droppedSubid == InvalidSubTransactionId)
if (isCommit)
relation->rd_createSubid = parentSubid;
else if (RelationHasReferenceCountZero(relation))
{
/* allow the entry to be removed */
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
relation->rd_droppedSubid = InvalidSubTransactionId;
RelationClearRelation(relation, false);
return;
}
@ -3282,8 +3153,7 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
}

/*
* Likewise, update or drop any new-relfilenode-in-subtransaction record
* or drop record.
* Likewise, update or drop any new-relfilenode-in-subtransaction hint.
*/
if (relation->rd_newRelfilenodeSubid == mySubid)
{
@ -3292,22 +3162,6 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
else
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
}

if (relation->rd_firstRelfilenodeSubid == mySubid)
{
if (isCommit)
relation->rd_firstRelfilenodeSubid = parentSubid;
else
relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
}

if (relation->rd_droppedSubid == mySubid)
{
if (isCommit)
relation->rd_droppedSubid = parentSubid;
else
relation->rd_droppedSubid = InvalidSubTransactionId;
}
}

@ -3397,7 +3251,6 @@ RelationBuildLocalRelation(const char *relname,
/* it's being created in this transaction */
rel->rd_createSubid = GetCurrentSubTransactionId();
rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;

/*
* create a new tuple descriptor from the one passed in. We do this
@ -3695,29 +3548,14 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
*/
CommandCounterIncrement();

RelationAssumeNewRelfilenode(relation);
}

/*
* RelationAssumeNewRelfilenode
*
* Code that modifies pg_class.reltablespace or pg_class.relfilenode must call
* this. The call shall precede any code that might insert WAL records whose
* replay would modify bytes in the new RelFileNode, and the call shall follow
* any WAL modifying bytes in the prior RelFileNode. See struct RelationData.
* Ideally, call this as near as possible to the CommandCounterIncrement()
* that makes the pg_class change visible (before it or after it); that
* minimizes the chance of future development adding a forbidden WAL insertion
* between RelationAssumeNewRelfilenode() and CommandCounterIncrement().
*/
void
RelationAssumeNewRelfilenode(Relation relation)
{
/*
* Mark the rel as having been given a new relfilenode in the current
* (sub) transaction. This is a hint that can be used to optimize later
* operations on the rel in the same transaction.
*/
relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId();
if (relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
relation->rd_firstRelfilenodeSubid = relation->rd_newRelfilenodeSubid;

/* Flag relation as needing eoxact cleanup (to clear these fields) */
/* Flag relation as needing eoxact cleanup (to remove the hint) */
EOXactListAdd(relation);
}

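The ordering contract in the removed RelationAssumeNewRelfilenode() comment is easiest to see from a caller's perspective. A hedged sketch of such a caller (loosely modeled on how ALTER TABLE ... SET TABLESPACE used it; switch_to_new_relfilenode() is hypothetical):

#include "postgres.h"
#include "access/xact.h"
#include "utils/rel.h"

static void
switch_to_new_relfilenode(Relation rel)
{
	/*
	 * ... update pg_class.relfilenode here; any WAL that still targets
	 * the *old* relfilenode must have been emitted before this point ...
	 */

	RelationAssumeNewRelfilenode(rel);	/* no WAL for the new node yet */
	CommandCounterIncrement();			/* makes the pg_class change visible */

	/* ... WAL touching the new relfilenode is safe from here on ... */
}
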
@ -5830,8 +5668,6 @@ load_relcache_init_file(bool shared)
rel->rd_fkeylist = NIL;
rel->rd_createSubid = InvalidSubTransactionId;
rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
rel->rd_droppedSubid = InvalidSubTransactionId;
rel->rd_amcache = NULL;
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));

@ -36,7 +36,6 @@
#include "access/xlog_internal.h"
#include "catalog/namespace.h"
#include "catalog/pg_authid.h"
#include "catalog/storage.h"
#include "commands/async.h"
#include "commands/prepare.h"
#include "commands/user.h"
@ -2633,17 +2632,6 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},

{
{"wal_skip_threshold", PGC_USERSET, WAL_SETTINGS,
gettext_noop("Size of new file to fsync instead of writing WAL."),
NULL,
GUC_UNIT_KB
},
&wal_skip_threshold,
2048, 0, MAX_KILOBYTES,
NULL, NULL, NULL
},

{
{"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING,
gettext_noop("Sets the maximum number of simultaneously running WAL sender processes."),
@ -214,7 +214,6 @@
# (change requires restart)
#wal_writer_delay = 200ms # 1-10000 milliseconds
#wal_writer_flush_after = 1MB # measured in pages, 0 disables
#wal_skip_threshold = 2MB

#commit_delay = 0 # range 0-100000, in microseconds
#commit_siblings = 5 # range 1-1000
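
For context on the setting removed above: wal_skip_threshold selected, at commit, between fsyncing a newly created relfilenode and WAL-logging its pages. A hedged sketch of that decision, simplified from the reverted smgrDoPendingSyncs() (names and types reduced to standard C):

#include <stdbool.h>
#include <stdint.h>

#define BLCKSZ 8192				/* PostgreSQL default block size */

typedef uint32_t BlockNumber;

static int	wal_skip_threshold = 2048;	/* the GUC above; in kilobytes */

/*
 * Sketch only: relations at or above the threshold are cheaper to fsync
 * directly at commit; smaller ones are cheaper to WAL-log page by page.
 */
static bool
prefer_fsync_over_wal(BlockNumber total_blocks)
{
	return (uint64_t) total_blocks * BLCKSZ >=
		(uint64_t) wal_skip_threshold * 1024;
}
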