Revert "Skip WAL for new relfilenodes, under wal_level=minimal."
This reverts commit cb2fd7eac285b1b0a24eeb2b8ed4456b66c5a09f.  Per numerous
buildfarm members, it was incompatible with parallel query, and a test case
assumed LP64.  Back-patch to 9.5 (all supported versions).

Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
This commit is contained in:
parent a653bd8aa7 / commit 348f15e22e
@@ -2171,19 +2171,16 @@ include_dir 'conf.d'
         levels.  This parameter can only be set at server start.
        </para>
        <para>
-        In <literal>minimal</literal> level, no information is logged for
-        permanent relations for the remainder of a transaction that creates or
-        rewrites them.  This can make operations much faster (see
-        <xref linkend="populate-pitr">).  Operations that initiate this
-        optimization include:
+        In <literal>minimal</> level, WAL-logging of some bulk
+        operations can be safely skipped, which can make those
+        operations much faster (see <xref linkend="populate-pitr">).
+        Operations in which this optimization can be applied include:
         <simplelist>
-         <member><command>ALTER ... SET TABLESPACE</command></member>
-         <member><command>CLUSTER</command></member>
-         <member><command>CREATE TABLE</command></member>
-         <member><command>REFRESH MATERIALIZED VIEW</command>
-         (without <option>CONCURRENTLY</option>)</member>
-         <member><command>REINDEX</command></member>
-         <member><command>TRUNCATE</command></member>
+         <member><command>CREATE TABLE AS</></member>
+         <member><command>CREATE INDEX</></member>
+         <member><command>CLUSTER</></member>
+         <member><command>COPY</> into tables that were created or truncated in the same
+         transaction</member>
         </simplelist>
         But minimal WAL does not contain enough information to reconstruct the
         data from a base backup and the WAL logs, so <literal>replica</> or
@@ -2572,26 +2569,6 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>

-     <varlistentry id="guc-wal-skip-threshold" xreflabel="wal_skip_threshold">
-      <term><varname>wal_skip_threshold</varname> (<type>integer</type>)
-      <indexterm>
-       <primary><varname>wal_skip_threshold</varname> configuration parameter</primary>
-      </indexterm>
-      </term>
-      <listitem>
-       <para>
-        When <varname>wal_level</varname> is <literal>minimal</literal> and a
-        transaction commits after creating or rewriting a permanent relation,
-        this setting determines how to persist the new data.  If the data is
-        smaller than this setting, write it to the WAL log; otherwise, use an
-        fsync of affected files.  Depending on the properties of your storage,
-        raising or lowering this value might help if such commits are slowing
-        concurrent transactions.  The default is two megabytes
-        (<literal>2MB</literal>).
-       </para>
-      </listitem>
-     </varlistentry>
-
      <varlistentry id="guc-commit-delay" xreflabel="commit_delay">
       <term><varname>commit_delay</varname> (<type>integer</type>)
       <indexterm>
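The deleted wal_skip_threshold documentation above boils down to a size test at
commit time: if the new relation's data is smaller than the threshold, its pages
are written to WAL; otherwise the files are fsync'd.  A minimal standalone sketch
of that rule follows; the constant, the threshold variable, the helper name, and
the main() harness are illustrative assumptions, not PostgreSQL source.

/*
 * Sketch of the size rule described in the deleted documentation above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BLCKSZ 8192                       /* assume the usual 8kB page size */

static int wal_skip_threshold_kb = 2048;  /* the documented 2MB default */

/* At commit: WAL-log the new relation's pages if it is small, else fsync it. */
static bool
persist_via_wal(uint64_t total_blocks)
{
    uint64_t size_kb = total_blocks * BLCKSZ / 1024;

    return size_kb < (uint64_t) wal_skip_threshold_kb;
}

int
main(void)
{
    printf("100 blocks  -> %s\n", persist_via_wal(100) ? "write WAL" : "fsync files");
    printf("1000 blocks -> %s\n", persist_via_wal(1000) ? "write WAL" : "fsync files");
    return 0;
}

With 8kB pages and the 2MB default, an 800kB relation would be WAL-logged while an
8MB one would be fsync'd, which is exactly the trade-off the deleted paragraph
describes.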
@@ -1394,13 +1394,42 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
    </para>

    <para>
-    Aside from avoiding the time for the archiver or WAL sender to process the
-    WAL data, doing this will actually make certain commands faster, because
-    they do not to write WAL at all if <varname>wal_level</varname>
-    is <literal>minimal</literal> and the current subtransaction (or top-level
-    transaction) created or truncated the table or index they change.  (They
-    can guarantee crash safety more cheaply by doing
-    an <function>fsync</function> at the end than by writing WAL.)
+    Aside from avoiding the time for the archiver or WAL sender to
+    process the WAL data,
+    doing this will actually make certain commands faster, because they
+    are designed not to write WAL at all if <varname>wal_level</varname>
+    is <literal>minimal</>.  (They can guarantee crash safety more cheaply
+    by doing an <function>fsync</> at the end than by writing WAL.)
+    This applies to the following commands:
+    <itemizedlist>
+     <listitem>
+      <para>
+       <command>CREATE TABLE AS SELECT</command>
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>CREATE INDEX</command> (and variants such as
+       <command>ALTER TABLE ADD PRIMARY KEY</command>)
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>ALTER TABLE SET TABLESPACE</command>
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>CLUSTER</command>
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>COPY FROM</command>, when the target table has been
+       created or truncated earlier in the same transaction
+      </para>
+     </listitem>
+    </itemizedlist>
    </para>
   </sect2>

@ -190,7 +190,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
|
|||||||
PageSetLSN(page, recptr);
|
PageSetLSN(page, recptr);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
PageSetLSN(page, gistGetFakeLSN(index));
|
PageSetLSN(page, gistGetFakeLSN(heap));
|
||||||
|
|
||||||
UnlockReleaseBuffer(buffer);
|
UnlockReleaseBuffer(buffer);
|
||||||
|
|
||||||
|
@ -937,44 +937,23 @@ gistproperty(Oid index_oid, int attno,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Some indexes are not WAL-logged, but we need LSNs to detect concurrent page
|
* Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs
|
||||||
* splits anyway. This function provides a fake sequence of LSNs for that
|
* to detect concurrent page splits anyway. This function provides a fake
|
||||||
* purpose.
|
* sequence of LSNs for that purpose.
|
||||||
*/
|
*/
|
||||||
XLogRecPtr
|
XLogRecPtr
|
||||||
gistGetFakeLSN(Relation rel)
|
gistGetFakeLSN(Relation rel)
|
||||||
{
|
{
|
||||||
|
static XLogRecPtr counter = 1;
|
||||||
|
|
||||||
if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
|
if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Temporary relations are only accessible in our session, so a simple
|
* Temporary relations are only accessible in our session, so a simple
|
||||||
* backend-local counter will do.
|
* backend-local counter will do.
|
||||||
*/
|
*/
|
||||||
static XLogRecPtr counter = 1;
|
|
||||||
|
|
||||||
return counter++;
|
return counter++;
|
||||||
}
|
}
|
||||||
else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* WAL-logging on this relation will start after commit, so its LSNs
|
|
||||||
* must be distinct numbers smaller than the LSN at the next commit.
|
|
||||||
* Emit a dummy WAL record if insert-LSN hasn't advanced after the
|
|
||||||
* last call.
|
|
||||||
*/
|
|
||||||
static XLogRecPtr lastlsn = InvalidXLogRecPtr;
|
|
||||||
XLogRecPtr currlsn = GetXLogInsertRecPtr();
|
|
||||||
|
|
||||||
/* Shouldn't be called for WAL-logging relations */
|
|
||||||
Assert(!RelationNeedsWAL(rel));
|
|
||||||
|
|
||||||
/* No need for an actual record if we already have a distinct LSN */
|
|
||||||
if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn)
|
|
||||||
currlsn = gistXLogAssignLSN();
|
|
||||||
|
|
||||||
lastlsn = currlsn;
|
|
||||||
return currlsn;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
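The comment restored above explains why gistGetFakeLSN() exists at all: temporary
and unlogged GiST indexes are never WAL-logged, yet page-split detection still
needs LSNs that move forward, and for a relation visible to only one session a
backend-local counter is enough.  Below is a standalone sketch of that counter
idea; the type and function names are illustrative, not the backend function.

/*
 * Sketch of a backend-local "fake LSN" counter, per the restored comment above.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t FakeLSN;

static FakeLSN
get_fake_lsn(void)
{
    static FakeLSN counter = 1;     /* monotonic within one process */

    return counter++;
}

int
main(void)
{
    /* Each call returns a strictly larger value, which is all that
     * concurrent-split detection needs for a session-private index. */
    printf("%llu %llu %llu\n",
           (unsigned long long) get_fake_lsn(),
           (unsigned long long) get_fake_lsn(),
           (unsigned long long) get_fake_lsn());
    return 0;
}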
@ -480,9 +480,6 @@ gist_redo(XLogReaderState *record)
|
|||||||
case XLOG_GIST_CREATE_INDEX:
|
case XLOG_GIST_CREATE_INDEX:
|
||||||
gistRedoCreateIndex(record);
|
gistRedoCreateIndex(record);
|
||||||
break;
|
break;
|
||||||
case XLOG_GIST_ASSIGN_LSN:
|
|
||||||
/* nop. See gistGetFakeLSN(). */
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
elog(PANIC, "gist_redo: unknown op code %u", info);
|
elog(PANIC, "gist_redo: unknown op code %u", info);
|
||||||
}
|
}
|
||||||
@ -559,23 +556,6 @@ gistXLogSplit(bool page_is_leaf,
|
|||||||
return recptr;
|
return recptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Write an empty XLOG record to assign a distinct LSN.
|
|
||||||
*/
|
|
||||||
XLogRecPtr
|
|
||||||
gistXLogAssignLSN(void)
|
|
||||||
{
|
|
||||||
int dummy = 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Records other than SWITCH_WAL must have content. We use an integer 0 to
|
|
||||||
* follow the restriction.
|
|
||||||
*/
|
|
||||||
XLogBeginInsert();
|
|
||||||
XLogRegisterData((char *) &dummy, sizeof(dummy));
|
|
||||||
return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Write XLOG record describing a page update. The update can include any
|
* Write XLOG record describing a page update. The update can include any
|
||||||
* number of deletions and/or insertions of tuples on a single index page.
|
* number of deletions and/or insertions of tuples on a single index page.
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
* heap_multi_insert - insert multiple tuples into a relation
|
* heap_multi_insert - insert multiple tuples into a relation
|
||||||
* heap_delete - delete a tuple from a relation
|
* heap_delete - delete a tuple from a relation
|
||||||
* heap_update - replace a tuple in a relation with another tuple
|
* heap_update - replace a tuple in a relation with another tuple
|
||||||
|
* heap_sync - sync heap, for when no WAL has been written
|
||||||
*
|
*
|
||||||
* NOTES
|
* NOTES
|
||||||
* This file contains the heap_ routines which implement
|
* This file contains the heap_ routines which implement
|
||||||
@ -2325,6 +2326,12 @@ FreeBulkInsertState(BulkInsertState bistate)
|
|||||||
* The new tuple is stamped with current transaction ID and the specified
|
* The new tuple is stamped with current transaction ID and the specified
|
||||||
* command ID.
|
* command ID.
|
||||||
*
|
*
|
||||||
|
* If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
|
||||||
|
* logged in WAL, even for a non-temp relation. Safe usage of this behavior
|
||||||
|
* requires that we arrange that all new tuples go into new pages not
|
||||||
|
* containing any tuples from other transactions, and that the relation gets
|
||||||
|
* fsync'd before commit. (See also heap_sync() comments)
|
||||||
|
*
|
||||||
* The HEAP_INSERT_SKIP_FSM option is passed directly to
|
* The HEAP_INSERT_SKIP_FSM option is passed directly to
|
||||||
* RelationGetBufferForTuple, which see for more info.
|
* RelationGetBufferForTuple, which see for more info.
|
||||||
*
|
*
|
||||||
@ -2433,7 +2440,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
|
|||||||
MarkBufferDirty(buffer);
|
MarkBufferDirty(buffer);
|
||||||
|
|
||||||
/* XLOG stuff */
|
/* XLOG stuff */
|
||||||
if (RelationNeedsWAL(relation))
|
if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
|
||||||
{
|
{
|
||||||
xl_heap_insert xlrec;
|
xl_heap_insert xlrec;
|
||||||
xl_heap_header xlhdr;
|
xl_heap_header xlhdr;
|
||||||
@ -2641,7 +2648,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
|
|||||||
/* currently not needed (thus unsupported) for heap_multi_insert() */
|
/* currently not needed (thus unsupported) for heap_multi_insert() */
|
||||||
AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
|
AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
|
||||||
|
|
||||||
needwal = RelationNeedsWAL(relation);
|
needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
|
||||||
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
|
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
|
||||||
HEAP_DEFAULT_FILLFACTOR);
|
HEAP_DEFAULT_FILLFACTOR);
|
||||||
|
|
||||||
@ -9279,13 +9286,18 @@ heap2_redo(XLogReaderState *record)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* heap_sync - for binary compatibility
|
* heap_sync - sync a heap, for use when no WAL has been written
|
||||||
*
|
*
|
||||||
* A newer PostgreSQL version removes this function. It exists here just in
|
* This forces the heap contents (including TOAST heap if any) down to disk.
|
||||||
* case an extension calls it. See "Skipping WAL for New RelFileNode" in
|
* If we skipped using WAL, and WAL is otherwise needed, we must force the
|
||||||
* src/backend/access/transam/README for the system that superseded it,
|
* relation down to disk before it's safe to commit the transaction. This
|
||||||
* allowing removal of most calls. Cases like copy_relation_data() should
|
* requires writing out any dirty buffers and then doing a forced fsync.
|
||||||
* call smgrimmedsync() directly.
|
*
|
||||||
|
* Indexes are not touched. (Currently, index operations associated with
|
||||||
|
* the commands that use this are WAL-logged and so do not need fsync.
|
||||||
|
* That behavior might change someday, but in any case it's likely that
|
||||||
|
* any fsync decisions required would be per-index and hence not appropriate
|
||||||
|
* to be done here.)
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
heap_sync(Relation rel)
|
heap_sync(Relation rel)
|
||||||
|
@ -143,6 +143,7 @@ typedef struct RewriteStateData
|
|||||||
Page rs_buffer; /* page currently being built */
|
Page rs_buffer; /* page currently being built */
|
||||||
BlockNumber rs_blockno; /* block where page will go */
|
BlockNumber rs_blockno; /* block where page will go */
|
||||||
bool rs_buffer_valid; /* T if any tuples in buffer */
|
bool rs_buffer_valid; /* T if any tuples in buffer */
|
||||||
|
bool rs_use_wal; /* must we WAL-log inserts? */
|
||||||
bool rs_logical_rewrite; /* do we need to do logical rewriting */
|
bool rs_logical_rewrite; /* do we need to do logical rewriting */
|
||||||
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to
|
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to
|
||||||
* determine tuple visibility */
|
* determine tuple visibility */
|
||||||
@ -236,13 +237,15 @@ static void logical_end_heap_rewrite(RewriteState state);
|
|||||||
* oldest_xmin xid used by the caller to determine which tuples are dead
|
* oldest_xmin xid used by the caller to determine which tuples are dead
|
||||||
* freeze_xid xid before which tuples will be frozen
|
* freeze_xid xid before which tuples will be frozen
|
||||||
* min_multi multixact before which multis will be removed
|
* min_multi multixact before which multis will be removed
|
||||||
|
* use_wal should the inserts to the new heap be WAL-logged?
|
||||||
*
|
*
|
||||||
* Returns an opaque RewriteState, allocated in current memory context,
|
* Returns an opaque RewriteState, allocated in current memory context,
|
||||||
* to be used in subsequent calls to the other functions.
|
* to be used in subsequent calls to the other functions.
|
||||||
*/
|
*/
|
||||||
RewriteState
|
RewriteState
|
||||||
begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
|
begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
|
||||||
TransactionId freeze_xid, MultiXactId cutoff_multi)
|
TransactionId freeze_xid, MultiXactId cutoff_multi,
|
||||||
|
bool use_wal)
|
||||||
{
|
{
|
||||||
RewriteState state;
|
RewriteState state;
|
||||||
MemoryContext rw_cxt;
|
MemoryContext rw_cxt;
|
||||||
@ -267,6 +270,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
|
|||||||
/* new_heap needn't be empty, just locked */
|
/* new_heap needn't be empty, just locked */
|
||||||
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
|
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
|
||||||
state->rs_buffer_valid = false;
|
state->rs_buffer_valid = false;
|
||||||
|
state->rs_use_wal = use_wal;
|
||||||
state->rs_oldest_xmin = oldest_xmin;
|
state->rs_oldest_xmin = oldest_xmin;
|
||||||
state->rs_freeze_xid = freeze_xid;
|
state->rs_freeze_xid = freeze_xid;
|
||||||
state->rs_cutoff_multi = cutoff_multi;
|
state->rs_cutoff_multi = cutoff_multi;
|
||||||
@ -325,7 +329,7 @@ end_heap_rewrite(RewriteState state)
|
|||||||
/* Write the last page, if any */
|
/* Write the last page, if any */
|
||||||
if (state->rs_buffer_valid)
|
if (state->rs_buffer_valid)
|
||||||
{
|
{
|
||||||
if (RelationNeedsWAL(state->rs_new_rel))
|
if (state->rs_use_wal)
|
||||||
log_newpage(&state->rs_new_rel->rd_node,
|
log_newpage(&state->rs_new_rel->rd_node,
|
||||||
MAIN_FORKNUM,
|
MAIN_FORKNUM,
|
||||||
state->rs_blockno,
|
state->rs_blockno,
|
||||||
@ -340,14 +344,18 @@ end_heap_rewrite(RewriteState state)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* When we WAL-logged rel pages, we must nonetheless fsync them. The
|
* If the rel is WAL-logged, must fsync before commit. We use heap_sync
|
||||||
|
* to ensure that the toast table gets fsync'd too.
|
||||||
|
*
|
||||||
|
* It's obvious that we must do this when not WAL-logging. It's less
|
||||||
|
* obvious that we have to do it even if we did WAL-log the pages. The
|
||||||
* reason is the same as in tablecmds.c's copy_relation_data(): we're
|
* reason is the same as in tablecmds.c's copy_relation_data(): we're
|
||||||
* writing data that's not in shared buffers, and so a CHECKPOINT
|
* writing data that's not in shared buffers, and so a CHECKPOINT
|
||||||
* occurring during the rewriteheap operation won't have fsync'd data we
|
* occurring during the rewriteheap operation won't have fsync'd data we
|
||||||
* wrote before the checkpoint.
|
* wrote before the checkpoint.
|
||||||
*/
|
*/
|
||||||
if (RelationNeedsWAL(state->rs_new_rel))
|
if (RelationNeedsWAL(state->rs_new_rel))
|
||||||
smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM);
|
heap_sync(state->rs_new_rel);
|
||||||
|
|
||||||
logical_end_heap_rewrite(state);
|
logical_end_heap_rewrite(state);
|
||||||
|
|
||||||
@ -644,6 +652,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
|
|||||||
{
|
{
|
||||||
int options = HEAP_INSERT_SKIP_FSM;
|
int options = HEAP_INSERT_SKIP_FSM;
|
||||||
|
|
||||||
|
if (!state->rs_use_wal)
|
||||||
|
options |= HEAP_INSERT_SKIP_WAL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
|
* While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
|
||||||
* for the TOAST table are not logically decoded. The main heap is
|
* for the TOAST table are not logically decoded. The main heap is
|
||||||
@ -682,7 +693,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
|
|||||||
/* Doesn't fit, so write out the existing page */
|
/* Doesn't fit, so write out the existing page */
|
||||||
|
|
||||||
/* XLOG stuff */
|
/* XLOG stuff */
|
||||||
if (RelationNeedsWAL(state->rs_new_rel))
|
if (state->rs_use_wal)
|
||||||
log_newpage(&state->rs_new_rel->rd_node,
|
log_newpage(&state->rs_new_rel->rd_node,
|
||||||
MAIN_FORKNUM,
|
MAIN_FORKNUM,
|
||||||
state->rs_blockno,
|
state->rs_blockno,
|
||||||
|
@ -40,6 +40,18 @@
|
|||||||
* them. They will need to be re-read into shared buffers on first use after
|
* them. They will need to be re-read into shared buffers on first use after
|
||||||
* the build finishes.
|
* the build finishes.
|
||||||
*
|
*
|
||||||
|
* Since the index will never be used unless it is completely built,
|
||||||
|
* from a crash-recovery point of view there is no need to WAL-log the
|
||||||
|
* steps of the build. After completing the index build, we can just sync
|
||||||
|
* the whole file to disk using smgrimmedsync() before exiting this module.
|
||||||
|
* This can be seen to be sufficient for crash recovery by considering that
|
||||||
|
* it's effectively equivalent to what would happen if a CHECKPOINT occurred
|
||||||
|
* just after the index build. However, it is clearly not sufficient if the
|
||||||
|
* DBA is using the WAL log for PITR or replication purposes, since another
|
||||||
|
* machine would not be able to reconstruct the index from WAL. Therefore,
|
||||||
|
* we log the completed index pages to WAL if and only if WAL archiving is
|
||||||
|
* active.
|
||||||
|
*
|
||||||
* This code isn't concerned about the FSM at all. The caller is responsible
|
* This code isn't concerned about the FSM at all. The caller is responsible
|
||||||
* for initializing that.
|
* for initializing that.
|
||||||
*
|
*
|
||||||
@ -204,7 +216,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
|
|||||||
|
|
||||||
wstate.heap = btspool->heap;
|
wstate.heap = btspool->heap;
|
||||||
wstate.index = btspool->index;
|
wstate.index = btspool->index;
|
||||||
wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
|
|
||||||
|
/*
|
||||||
|
* We need to log index creation in WAL iff WAL archiving/streaming is
|
||||||
|
* enabled UNLESS the index isn't WAL-logged anyway.
|
||||||
|
*/
|
||||||
|
wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
|
||||||
|
|
||||||
/* reserve the metapage */
|
/* reserve the metapage */
|
||||||
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
|
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
|
||||||
@ -794,15 +811,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
|
|||||||
_bt_uppershutdown(wstate, state);
|
_bt_uppershutdown(wstate, state);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* When we WAL-logged index pages, we must nonetheless fsync index files.
|
* If the index is WAL-logged, we must fsync it down to disk before it's
|
||||||
* Since we're building outside shared buffers, a CHECKPOINT occurring
|
* safe to commit the transaction. (For a non-WAL-logged index we don't
|
||||||
* during the build has no way to flush the previously written data to
|
* care since the index will be uninteresting after a crash anyway.)
|
||||||
* disk (indeed it won't know the index even exists). A crash later on
|
*
|
||||||
* would replay WAL from the checkpoint, therefore it wouldn't replay our
|
* It's obvious that we must do this when not WAL-logging the build. It's
|
||||||
* earlier WAL entries. If we do not fsync those pages here, they might
|
* less obvious that we have to do it even if we did WAL-log the index
|
||||||
* still not be on disk when the crash occurs.
|
* pages. The reason is that since we're building outside shared buffers,
|
||||||
|
* a CHECKPOINT occurring during the build has no way to flush the
|
||||||
|
* previously written data to disk (indeed it won't know the index even
|
||||||
|
* exists). A crash later on would replay WAL from the checkpoint,
|
||||||
|
* therefore it wouldn't replay our earlier WAL entries. If we do not
|
||||||
|
* fsync those pages here, they might still not be on disk when the crash
|
||||||
|
* occurs.
|
||||||
*/
|
*/
|
||||||
if (wstate->btws_use_wal)
|
if (RelationNeedsWAL(wstate->index))
|
||||||
{
|
{
|
||||||
RelationOpenSmgr(wstate->index);
|
RelationOpenSmgr(wstate->index);
|
||||||
smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
|
smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
|
||||||
|
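The B-tree build comment restored above rests on one durability rule: pages
written straight through the storage manager, bypassing shared buffers, are
invisible to any CHECKPOINT that runs during the build, so the file must be
fsync'd before the transaction commits.  A plain POSIX sketch of the same
write-then-fsync discipline follows; the path and the zero-filled payload are
made up for illustration and are not PostgreSQL code.

/*
 * Write a page directly to a file, then fsync before declaring it durable.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    const char *path = "/tmp/bulk_build_demo";
    char        page[8192];
    int         fd;

    memset(page, 0, sizeof(page));

    fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0600);
    if (fd < 0)
    {
        perror("open");
        return EXIT_FAILURE;
    }
    if (write(fd, page, sizeof(page)) != (ssize_t) sizeof(page))
    {
        perror("write");
        return EXIT_FAILURE;
    }

    /* Without this, a crash shortly after "commit" could lose the page:
     * nothing was WAL-logged for it and no checkpoint knows the file exists. */
    if (fsync(fd) != 0)
    {
        perror("fsync");
        return EXIT_FAILURE;
    }
    close(fd);
    puts("page durable before commit");
    return EXIT_SUCCESS;
}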
@ -46,9 +46,6 @@ gist_desc(StringInfo buf, XLogReaderState *record)
|
|||||||
break;
|
break;
|
||||||
case XLOG_GIST_CREATE_INDEX:
|
case XLOG_GIST_CREATE_INDEX:
|
||||||
break;
|
break;
|
||||||
case XLOG_GIST_ASSIGN_LSN:
|
|
||||||
/* No details to write out */
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,9 +65,6 @@ gist_identify(uint8 info)
|
|||||||
case XLOG_GIST_CREATE_INDEX:
|
case XLOG_GIST_CREATE_INDEX:
|
||||||
id = "CREATE_INDEX";
|
id = "CREATE_INDEX";
|
||||||
break;
|
break;
|
||||||
case XLOG_GIST_ASSIGN_LSN:
|
|
||||||
id = "ASSIGN_LSN";
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return id;
|
return id;
|
||||||
|
@@ -717,38 +717,6 @@ then restart recovery.  This is part of the reason for not writing a WAL
 entry until we've successfully done the original action.


-Skipping WAL for New RelFileNode
---------------------------------
-
-Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK
-would unlink, in-tree access methods write no WAL for that change.  Code that
-writes WAL without calling RelationNeedsWAL() must check for this case.  This
-skipping is mandatory.  If a WAL-writing change preceded a WAL-skipping change
-for the same block, REDO could overwrite the WAL-skipping change.  If a
-WAL-writing change followed a WAL-skipping change for the same block, a
-related problem would arise.  When a WAL record contains no full-page image,
-REDO expects the page to match its contents from just before record insertion.
-A WAL-skipping change may not reach disk at all, violating REDO's expectation
-under full_page_writes=off.  For any access method, CommitTransaction() writes
-and fsyncs affected blocks before recording the commit.
-
-Prefer to do the same in future access methods.  However, two other approaches
-can work.  First, an access method can irreversibly transition a given fork
-from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and
-smgrimmedsync().  Second, an access method can opt to write WAL
-unconditionally for permanent relations.  Under these approaches, the access
-method callbacks must not call functions that react to RelationNeedsWAL().
-
-This applies only to WAL records whose replay would modify bytes stored in the
-new relfilenode.  It does not apply to other records about the relfilenode,
-such as XLOG_SMGR_CREATE.  Because it operates at the level of individual
-relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations.
-Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which
-ALTER TABLE adds a TOAST relation.  The TOAST relation will skip WAL, while
-the table owning it will not.  ALTER TABLE SET TABLESPACE will cause a table
-to skip WAL, but that won't affect its indexes.
-
-
 Asynchronous Commit
 -------------------

@@ -852,12 +820,13 @@ Changes to a temp table are not WAL-logged, hence could reach disk in
 advance of T1's commit, but we don't care since temp table contents don't
 survive crashes anyway.

-Database writes that skip WAL for new relfilenodes are also safe.  In these
-cases it's entirely possible for the data to reach disk before T1's commit,
-because T1 will fsync it down to disk without any sort of interlock.  However,
-all these paths are designed to write data that no other transaction can see
-until after T1 commits.  The situation is thus not different from ordinary
-WAL-logged updates.
+Database writes made via any of the paths we have introduced to avoid WAL
+overhead for bulk updates are also safe.  In these cases it's entirely
+possible for the data to reach disk before T1's commit, because T1 will
+fsync it down to disk without any sort of interlock, as soon as it finishes
+the bulk update.  However, all these paths are designed to write data that
+no other transaction can see until after T1 commits.  The situation is thus
+not different from ordinary WAL-logged updates.

 Transaction Emulation during Recovery
 -------------------------------------
|
@ -2032,13 +2032,6 @@ CommitTransaction(void)
|
|||||||
*/
|
*/
|
||||||
PreCommit_on_commit_actions();
|
PreCommit_on_commit_actions();
|
||||||
|
|
||||||
/*
|
|
||||||
* Synchronize files that are created and not WAL-logged during this
|
|
||||||
* transaction. This must happen before AtEOXact_RelationMap(), so that we
|
|
||||||
* don't see committed-but-broken files after a crash.
|
|
||||||
*/
|
|
||||||
smgrDoPendingSyncs(true);
|
|
||||||
|
|
||||||
/* close large objects before lower-level cleanup */
|
/* close large objects before lower-level cleanup */
|
||||||
AtEOXact_LargeObject(true);
|
AtEOXact_LargeObject(true);
|
||||||
|
|
||||||
@ -2267,13 +2260,6 @@ PrepareTransaction(void)
|
|||||||
*/
|
*/
|
||||||
PreCommit_on_commit_actions();
|
PreCommit_on_commit_actions();
|
||||||
|
|
||||||
/*
|
|
||||||
* Synchronize files that are created and not WAL-logged during this
|
|
||||||
* transaction. This must happen before EndPrepare(), so that we don't see
|
|
||||||
* committed-but-broken files after a crash and COMMIT PREPARED.
|
|
||||||
*/
|
|
||||||
smgrDoPendingSyncs(true);
|
|
||||||
|
|
||||||
/* close large objects before lower-level cleanup */
|
/* close large objects before lower-level cleanup */
|
||||||
AtEOXact_LargeObject(true);
|
AtEOXact_LargeObject(true);
|
||||||
|
|
||||||
@ -2574,7 +2560,6 @@ AbortTransaction(void)
|
|||||||
*/
|
*/
|
||||||
AfterTriggerEndXact(false); /* 'false' means it's abort */
|
AfterTriggerEndXact(false); /* 'false' means it's abort */
|
||||||
AtAbort_Portals();
|
AtAbort_Portals();
|
||||||
smgrDoPendingSyncs(false);
|
|
||||||
AtEOXact_LargeObject(false);
|
AtEOXact_LargeObject(false);
|
||||||
AtAbort_Notify();
|
AtAbort_Notify();
|
||||||
AtEOXact_RelationMap(false);
|
AtEOXact_RelationMap(false);
|
||||||
|
@ -542,8 +542,6 @@ typedef FakeRelCacheEntryData *FakeRelCacheEntry;
|
|||||||
* fields related to physical storage, like rd_rel, are initialized, so the
|
* fields related to physical storage, like rd_rel, are initialized, so the
|
||||||
* fake entry is only usable in low-level operations like ReadBuffer().
|
* fake entry is only usable in low-level operations like ReadBuffer().
|
||||||
*
|
*
|
||||||
* This is also used for syncing WAL-skipped files.
|
|
||||||
*
|
|
||||||
* Caller must free the returned entry with FreeFakeRelcacheEntry().
|
* Caller must free the returned entry with FreeFakeRelcacheEntry().
|
||||||
*/
|
*/
|
||||||
Relation
|
Relation
|
||||||
@ -552,20 +550,18 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
|
|||||||
FakeRelCacheEntry fakeentry;
|
FakeRelCacheEntry fakeentry;
|
||||||
Relation rel;
|
Relation rel;
|
||||||
|
|
||||||
|
Assert(InRecovery);
|
||||||
|
|
||||||
/* Allocate the Relation struct and all related space in one block. */
|
/* Allocate the Relation struct and all related space in one block. */
|
||||||
fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
|
fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
|
||||||
rel = (Relation) fakeentry;
|
rel = (Relation) fakeentry;
|
||||||
|
|
||||||
rel->rd_rel = &fakeentry->pgc;
|
rel->rd_rel = &fakeentry->pgc;
|
||||||
rel->rd_node = rnode;
|
rel->rd_node = rnode;
|
||||||
|
/* We will never be working with temp rels during recovery */
|
||||||
/*
|
|
||||||
* We will never be working with temp rels during recovery or while
|
|
||||||
* syncing WAL-skipped files.
|
|
||||||
*/
|
|
||||||
rel->rd_backend = InvalidBackendId;
|
rel->rd_backend = InvalidBackendId;
|
||||||
|
|
||||||
/* It must be a permanent table here */
|
/* It must be a permanent table if we're in recovery. */
|
||||||
rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
|
rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
|
||||||
|
|
||||||
/* We don't know the name of the relation; use relfilenode instead */
|
/* We don't know the name of the relation; use relfilenode instead */
|
||||||
@ -574,9 +570,9 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
|
|||||||
/*
|
/*
|
||||||
* We set up the lockRelId in case anything tries to lock the dummy
|
* We set up the lockRelId in case anything tries to lock the dummy
|
||||||
* relation. Note that this is fairly bogus since relNode may be
|
* relation. Note that this is fairly bogus since relNode may be
|
||||||
* different from the relation's OID. It shouldn't really matter though.
|
* different from the relation's OID. It shouldn't really matter though,
|
||||||
* In recovery, we are running by ourselves and can't have any lock
|
* since we are presumably running by ourselves and can't have any lock
|
||||||
* conflicts. While syncing, we already hold AccessExclusiveLock.
|
* conflicts ...
|
||||||
*/
|
*/
|
||||||
rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
|
rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
|
||||||
rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
|
rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
|
||||||
|
@ -299,8 +299,6 @@ Boot_DeclareIndexStmt:
|
|||||||
stmt->idxcomment = NULL;
|
stmt->idxcomment = NULL;
|
||||||
stmt->indexOid = InvalidOid;
|
stmt->indexOid = InvalidOid;
|
||||||
stmt->oldNode = InvalidOid;
|
stmt->oldNode = InvalidOid;
|
||||||
stmt->oldCreateSubid = InvalidSubTransactionId;
|
|
||||||
stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
|
|
||||||
stmt->unique = false;
|
stmt->unique = false;
|
||||||
stmt->primary = false;
|
stmt->primary = false;
|
||||||
stmt->isconstraint = false;
|
stmt->isconstraint = false;
|
||||||
@ -344,8 +342,6 @@ Boot_DeclareUniqueIndexStmt:
|
|||||||
stmt->idxcomment = NULL;
|
stmt->idxcomment = NULL;
|
||||||
stmt->indexOid = InvalidOid;
|
stmt->indexOid = InvalidOid;
|
||||||
stmt->oldNode = InvalidOid;
|
stmt->oldNode = InvalidOid;
|
||||||
stmt->oldCreateSubid = InvalidSubTransactionId;
|
|
||||||
stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
|
|
||||||
stmt->unique = true;
|
stmt->unique = true;
|
||||||
stmt->primary = false;
|
stmt->primary = false;
|
||||||
stmt->isconstraint = false;
|
stmt->isconstraint = false;
|
||||||
|
@ -27,16 +27,11 @@
|
|||||||
#include "catalog/catalog.h"
|
#include "catalog/catalog.h"
|
||||||
#include "catalog/storage.h"
|
#include "catalog/storage.h"
|
||||||
#include "catalog/storage_xlog.h"
|
#include "catalog/storage_xlog.h"
|
||||||
#include "miscadmin.h"
|
|
||||||
#include "storage/freespace.h"
|
#include "storage/freespace.h"
|
||||||
#include "storage/smgr.h"
|
#include "storage/smgr.h"
|
||||||
#include "utils/hsearch.h"
|
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
#include "utils/rel.h"
|
#include "utils/rel.h"
|
||||||
|
|
||||||
/* GUC variables */
|
|
||||||
int wal_skip_threshold = 2048; /* in kilobytes */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We keep a list of all relations (represented as RelFileNode values)
|
* We keep a list of all relations (represented as RelFileNode values)
|
||||||
* that have been created or deleted in the current transaction. When
|
* that have been created or deleted in the current transaction. When
|
||||||
@ -66,14 +61,7 @@ typedef struct PendingRelDelete
|
|||||||
struct PendingRelDelete *next; /* linked-list link */
|
struct PendingRelDelete *next; /* linked-list link */
|
||||||
} PendingRelDelete;
|
} PendingRelDelete;
|
||||||
|
|
||||||
typedef struct pendingSync
|
|
||||||
{
|
|
||||||
RelFileNode rnode;
|
|
||||||
bool is_truncated; /* Has the file experienced truncation? */
|
|
||||||
} pendingSync;
|
|
||||||
|
|
||||||
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
|
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
|
||||||
HTAB *pendingSyncHash = NULL;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* RelationCreateStorage
|
* RelationCreateStorage
|
||||||
@ -128,37 +116,6 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
|
|||||||
pending->nestLevel = GetCurrentTransactionNestLevel();
|
pending->nestLevel = GetCurrentTransactionNestLevel();
|
||||||
pending->next = pendingDeletes;
|
pending->next = pendingDeletes;
|
||||||
pendingDeletes = pending;
|
pendingDeletes = pending;
|
||||||
|
|
||||||
/*
|
|
||||||
* Queue an at-commit sync. Bootstrap does not need syncs, because initdb
|
|
||||||
* syncs at the end. During bootstrap, mdexists() creates the specified
|
|
||||||
* file; smgrDoPendingSyncs() would not cope with that.
|
|
||||||
*/
|
|
||||||
if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded() &&
|
|
||||||
!IsBootstrapProcessingMode())
|
|
||||||
{
|
|
||||||
pendingSync *pending;
|
|
||||||
bool found;
|
|
||||||
|
|
||||||
/* we sync only permanent relations */
|
|
||||||
Assert(backend == InvalidBackendId);
|
|
||||||
|
|
||||||
if (!pendingSyncHash)
|
|
||||||
{
|
|
||||||
HASHCTL ctl;
|
|
||||||
|
|
||||||
ctl.keysize = sizeof(RelFileNode);
|
|
||||||
ctl.entrysize = sizeof(pendingSync);
|
|
||||||
ctl.hcxt = TopTransactionContext;
|
|
||||||
pendingSyncHash =
|
|
||||||
hash_create("pending sync hash",
|
|
||||||
16, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
|
|
||||||
}
|
|
||||||
|
|
||||||
pending = hash_search(pendingSyncHash, &rnode, HASH_ENTER, &found);
|
|
||||||
Assert(!found);
|
|
||||||
pending->is_truncated = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -292,8 +249,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
|
|||||||
if (vm)
|
if (vm)
|
||||||
visibilitymap_truncate(rel, nblocks);
|
visibilitymap_truncate(rel, nblocks);
|
||||||
|
|
||||||
RelationPreTruncate(rel);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We WAL-log the truncation before actually truncating, which means
|
* We WAL-log the truncation before actually truncating, which means
|
||||||
* trouble if the truncation fails. If we then crash, the WAL replay
|
* trouble if the truncation fails. If we then crash, the WAL replay
|
||||||
@ -336,49 +291,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
|
|||||||
smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
|
smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* RelationPreTruncate
|
|
||||||
* Perform AM-independent work before a physical truncation.
|
|
||||||
*
|
|
||||||
* If an access method's relation_nontransactional_truncate does not call
|
|
||||||
* RelationTruncate(), it must call this before decreasing the table size.
|
|
||||||
*/
|
|
||||||
void
|
|
||||||
RelationPreTruncate(Relation rel)
|
|
||||||
{
|
|
||||||
pendingSync *pending;
|
|
||||||
|
|
||||||
if (!pendingSyncHash)
|
|
||||||
return;
|
|
||||||
RelationOpenSmgr(rel);
|
|
||||||
|
|
||||||
pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
|
|
||||||
HASH_FIND, NULL);
|
|
||||||
if (pending)
|
|
||||||
pending->is_truncated = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* RelFileNodeSkippingWAL - check if a BM_PERMANENT relfilenode is using WAL
|
|
||||||
*
|
|
||||||
* Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
|
|
||||||
* New RelFileNode" in src/backend/access/transam/README. Though it is
|
|
||||||
* known from Relation efficiently, this function is intended for the code
|
|
||||||
* paths not having access to Relation.
|
|
||||||
*/
|
|
||||||
bool
|
|
||||||
RelFileNodeSkippingWAL(RelFileNode rnode)
|
|
||||||
{
|
|
||||||
if (XLogIsNeeded())
|
|
||||||
return false; /* no permanent relfilenode skips WAL */
|
|
||||||
|
|
||||||
if (!pendingSyncHash ||
|
|
||||||
hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
|
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
|
||||||
*
|
*
|
||||||
@ -456,144 +368,6 @@ smgrDoPendingDeletes(bool isCommit)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
|
|
||||||
*/
|
|
||||||
void
|
|
||||||
smgrDoPendingSyncs(bool isCommit)
|
|
||||||
{
|
|
||||||
PendingRelDelete *pending;
|
|
||||||
int nrels = 0,
|
|
||||||
maxrels = 0;
|
|
||||||
SMgrRelation *srels = NULL;
|
|
||||||
HASH_SEQ_STATUS scan;
|
|
||||||
pendingSync *pendingsync;
|
|
||||||
|
|
||||||
if (XLogIsNeeded())
|
|
||||||
return; /* no relation can use this */
|
|
||||||
|
|
||||||
Assert(GetCurrentTransactionNestLevel() == 1);
|
|
||||||
|
|
||||||
if (!pendingSyncHash)
|
|
||||||
return; /* no relation needs sync */
|
|
||||||
|
|
||||||
/* Just throw away all pending syncs if any at rollback */
|
|
||||||
if (!isCommit)
|
|
||||||
{
|
|
||||||
pendingSyncHash = NULL;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
AssertPendingSyncs_RelationCache();
|
|
||||||
|
|
||||||
/* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
|
|
||||||
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
|
|
||||||
{
|
|
||||||
if (!pending->atCommit)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
(void) hash_search(pendingSyncHash, (void *) &pending->relnode,
|
|
||||||
HASH_REMOVE, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
hash_seq_init(&scan, pendingSyncHash);
|
|
||||||
while ((pendingsync = (pendingSync *) hash_seq_search(&scan)))
|
|
||||||
{
|
|
||||||
ForkNumber fork;
|
|
||||||
BlockNumber nblocks[MAX_FORKNUM + 1];
|
|
||||||
BlockNumber total_blocks = 0;
|
|
||||||
SMgrRelation srel;
|
|
||||||
|
|
||||||
srel = smgropen(pendingsync->rnode, InvalidBackendId);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We emit newpage WAL records for smaller relations.
|
|
||||||
*
|
|
||||||
* Small WAL records have a chance to be emitted along with other
|
|
||||||
* backends' WAL records. We emit WAL records instead of syncing for
|
|
||||||
* files that are smaller than a certain threshold, expecting faster
|
|
||||||
* commit. The threshold is defined by the GUC wal_skip_threshold.
|
|
||||||
*/
|
|
||||||
if (!pendingsync->is_truncated)
|
|
||||||
{
|
|
||||||
for (fork = 0; fork <= MAX_FORKNUM; fork++)
|
|
||||||
{
|
|
||||||
if (smgrexists(srel, fork))
|
|
||||||
{
|
|
||||||
BlockNumber n = smgrnblocks(srel, fork);
|
|
||||||
|
|
||||||
/* we shouldn't come here for unlogged relations */
|
|
||||||
Assert(fork != INIT_FORKNUM);
|
|
||||||
nblocks[fork] = n;
|
|
||||||
total_blocks += n;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
nblocks[fork] = InvalidBlockNumber;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Sync file or emit WAL records for its contents.
|
|
||||||
*
|
|
||||||
* Although we emit WAL record if the file is small enough, do file
|
|
||||||
* sync regardless of the size if the file has experienced a
|
|
||||||
* truncation. It is because the file would be followed by trailing
|
|
||||||
* garbage blocks after a crash recovery if, while a past longer file
|
|
||||||
* had been flushed out, we omitted syncing-out of the file and
|
|
||||||
* emitted WAL instead. You might think that we could choose WAL if
|
|
||||||
* the current main fork is longer than ever, but there's a case where
|
|
||||||
* main fork is longer than ever but FSM fork gets shorter.
|
|
||||||
*/
|
|
||||||
if (pendingsync->is_truncated ||
|
|
||||||
total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
|
|
||||||
{
|
|
||||||
/* allocate the initial array, or extend it, if needed */
|
|
||||||
if (maxrels == 0)
|
|
||||||
{
|
|
||||||
maxrels = 8;
|
|
||||||
srels = palloc(sizeof(SMgrRelation) * maxrels);
|
|
||||||
}
|
|
||||||
else if (maxrels <= nrels)
|
|
||||||
{
|
|
||||||
maxrels *= 2;
|
|
||||||
srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
|
|
||||||
}
|
|
||||||
|
|
||||||
srels[nrels++] = srel;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* Emit WAL records for all blocks. The file is small enough. */
|
|
||||||
for (fork = 0; fork <= MAX_FORKNUM; fork++)
|
|
||||||
{
|
|
||||||
int n = nblocks[fork];
|
|
||||||
Relation rel;
|
|
||||||
|
|
||||||
if (!BlockNumberIsValid(n))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Emit WAL for the whole file. Unfortunately we don't know
|
|
||||||
* what kind of a page this is, so we have to log the full
|
|
||||||
* page including any unused space. ReadBufferExtended()
|
|
||||||
* counts some pgstat events; unfortunately, we discard them.
|
|
||||||
*/
|
|
||||||
rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
|
|
||||||
log_newpage_range(rel, fork, 0, n, false);
|
|
||||||
FreeFakeRelcacheEntry(rel);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pendingSyncHash = NULL;
|
|
||||||
|
|
||||||
if (nrels > 0)
|
|
||||||
{
|
|
||||||
smgrdosyncall(srels, nrels);
|
|
||||||
pfree(srels);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
|
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
|
||||||
*
|
*
|
||||||
|
@ -747,6 +747,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
|
|||||||
bool *isnull;
|
bool *isnull;
|
||||||
IndexScanDesc indexScan;
|
IndexScanDesc indexScan;
|
||||||
HeapScanDesc heapScan;
|
HeapScanDesc heapScan;
|
||||||
|
bool use_wal;
|
||||||
bool is_system_catalog;
|
bool is_system_catalog;
|
||||||
TransactionId OldestXmin;
|
TransactionId OldestXmin;
|
||||||
TransactionId FreezeXid;
|
TransactionId FreezeXid;
|
||||||
@ -802,9 +803,12 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
|
|||||||
LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
|
LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Valid smgr_targblock implies something already wrote to the relation.
|
* We need to log the copied data in WAL iff WAL archiving/streaming is
|
||||||
* This may be harmless, but this function hasn't planned for it.
|
* enabled AND it's a WAL-logged rel.
|
||||||
*/
|
*/
|
||||||
|
use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
|
||||||
|
|
||||||
|
/* use_wal off requires smgr_targblock be initially invalid */
|
||||||
Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
|
Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -872,7 +876,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
|
|||||||
|
|
||||||
/* Initialize the rewrite operation */
|
/* Initialize the rewrite operation */
|
||||||
rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
|
rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
|
||||||
MultiXactCutoff);
|
MultiXactCutoff, use_wal);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Decide whether to use an indexscan or seqscan-and-optional-sort to scan
|
* Decide whether to use an indexscan or seqscan-and-optional-sort to scan
|
||||||
@ -1242,25 +1246,6 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
|
|||||||
*mapped_tables++ = r2;
|
*mapped_tables++ = r2;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Recognize that rel1's relfilenode (swapped from rel2) is new in this
|
|
||||||
* subtransaction. The rel2 storage (swapped from rel1) may or may not be
|
|
||||||
* new.
|
|
||||||
*/
|
|
||||||
{
|
|
||||||
Relation rel1,
|
|
||||||
rel2;
|
|
||||||
|
|
||||||
rel1 = relation_open(r1, NoLock);
|
|
||||||
rel2 = relation_open(r2, NoLock);
|
|
||||||
rel2->rd_createSubid = rel1->rd_createSubid;
|
|
||||||
rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
|
|
||||||
rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
|
|
||||||
RelationAssumeNewRelfilenode(rel1);
|
|
||||||
relation_close(rel1, NoLock);
|
|
||||||
relation_close(rel2, NoLock);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In the case of a shared catalog, these next few steps will only affect
|
* In the case of a shared catalog, these next few steps will only affect
|
||||||
* our own database's pg_class row; but that's okay, because they are all
|
* our own database's pg_class row; but that's okay, because they are all
|
||||||
|
@ -2310,14 +2310,49 @@ CopyFrom(CopyState cstate)
|
|||||||
|
|
||||||
tupDesc = RelationGetDescr(cstate->rel);
|
tupDesc = RelationGetDescr(cstate->rel);
|
||||||
|
|
||||||
/*
|
/*----------
|
||||||
* If the target file is new-in-transaction, we assume that checking FSM
|
* Check to see if we can avoid writing WAL
|
||||||
* for free space is a waste of time. This could possibly be wrong, but
|
*
|
||||||
* it's unlikely.
|
* If archive logging/streaming is not enabled *and* either
|
||||||
|
* - table was created in same transaction as this COPY
|
||||||
|
* - data is being written to relfilenode created in this transaction
|
||||||
|
* then we can skip writing WAL. It's safe because if the transaction
|
||||||
|
* doesn't commit, we'll discard the table (or the new relfilenode file).
|
||||||
|
* If it does commit, we'll have done the heap_sync at the bottom of this
|
||||||
|
* routine first.
|
||||||
|
*
|
||||||
|
* As mentioned in comments in utils/rel.h, the in-same-transaction test
|
||||||
|
* is not always set correctly, since in rare cases rd_newRelfilenodeSubid
|
||||||
|
* can be cleared before the end of the transaction. The exact case is
|
||||||
|
* when a relation sets a new relfilenode twice in same transaction, yet
|
||||||
|
* the second one fails in an aborted subtransaction, e.g.
|
||||||
|
*
|
||||||
|
* BEGIN;
|
||||||
|
* TRUNCATE t;
|
||||||
|
* SAVEPOINT save;
|
||||||
|
* TRUNCATE t;
|
||||||
|
* ROLLBACK TO save;
|
||||||
|
* COPY ...
|
||||||
|
*
|
||||||
|
* Also, if the target file is new-in-transaction, we assume that checking
|
||||||
|
* FSM for free space is a waste of time, even if we must use WAL because
|
||||||
|
* of archiving. This could possibly be wrong, but it's unlikely.
|
||||||
|
*
|
||||||
|
* The comments for heap_insert and RelationGetBufferForTuple specify that
|
||||||
|
* skipping WAL logging is only safe if we ensure that our tuples do not
|
||||||
|
* go into pages containing tuples from any other transactions --- but this
|
||||||
|
* must be the case if we have a new table or new relfilenode, so we need
|
||||||
|
* no additional work to enforce that.
|
||||||
|
*----------
|
||||||
*/
|
*/
|
||||||
|
/* createSubid is creation check, newRelfilenodeSubid is truncation check */
|
||||||
if (cstate->rel->rd_createSubid != InvalidSubTransactionId ||
|
if (cstate->rel->rd_createSubid != InvalidSubTransactionId ||
|
||||||
cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
|
cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)
|
||||||
|
{
|
||||||
hi_options |= HEAP_INSERT_SKIP_FSM;
|
hi_options |= HEAP_INSERT_SKIP_FSM;
|
||||||
|
if (!XLogIsNeeded())
|
||||||
|
hi_options |= HEAP_INSERT_SKIP_WAL;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Optimize if new relfilenode was created in this subxact or one of its
|
* Optimize if new relfilenode was created in this subxact or one of its
|
||||||
@ -2576,6 +2611,13 @@ CopyFrom(CopyState cstate)
|
|||||||
|
|
||||||
FreeExecutorState(estate);
|
FreeExecutorState(estate);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we skipped writing WAL, then we need to sync the heap (but not
|
||||||
|
* indexes since those use WAL anyway)
|
||||||
|
*/
|
||||||
|
if (hi_options & HEAP_INSERT_SKIP_WAL)
|
||||||
|
heap_sync(cstate->rel);
|
||||||
|
|
||||||
return processed;
|
return processed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -562,13 +562,16 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
|
|||||||
myState->rel = intoRelationDesc;
|
myState->rel = intoRelationDesc;
|
||||||
myState->reladdr = intoRelationAddr;
|
myState->reladdr = intoRelationAddr;
|
||||||
myState->output_cid = GetCurrentCommandId(true);
|
myState->output_cid = GetCurrentCommandId(true);
|
||||||
myState->hi_options = HEAP_INSERT_SKIP_FSM;
|
|
||||||
myState->bistate = GetBulkInsertState();
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Valid smgr_targblock implies something already wrote to the relation.
|
* We can skip WAL-logging the insertions, unless PITR or streaming
|
||||||
* This may be harmless, but this function hasn't planned for it.
|
* replication is in use. We can skip the FSM in any case.
|
||||||
*/
|
*/
|
||||||
|
myState->hi_options = HEAP_INSERT_SKIP_FSM |
|
||||||
|
(XLogIsNeeded() ? 0 : HEAP_INSERT_SKIP_WAL);
|
||||||
|
myState->bistate = GetBulkInsertState();
|
||||||
|
|
||||||
|
/* Not using WAL requires smgr_targblock be initially invalid */
|
||||||
Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber);
|
Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -614,6 +617,10 @@ intorel_shutdown(DestReceiver *self)
|
|||||||
|
|
||||||
FreeBulkInsertState(myState->bistate);
|
FreeBulkInsertState(myState->bistate);
|
||||||
|
|
||||||
|
/* If we skipped using WAL, must heap_sync before commit */
|
||||||
|
if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
|
||||||
|
heap_sync(myState->rel);
|
||||||
|
|
||||||
/* close rel, but keep lock until commit */
|
/* close rel, but keep lock until commit */
|
||||||
heap_close(myState->rel, NoLock);
|
heap_close(myState->rel, NoLock);
|
||||||
myState->rel = NULL;
|
myState->rel = NULL;
|
||||||
|
@ -436,13 +436,17 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
|
|||||||
*/
|
*/
|
||||||
myState->transientrel = transientrel;
|
myState->transientrel = transientrel;
|
||||||
myState->output_cid = GetCurrentCommandId(true);
|
myState->output_cid = GetCurrentCommandId(true);
|
||||||
myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
|
|
||||||
myState->bistate = GetBulkInsertState();
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Valid smgr_targblock implies something already wrote to the relation.
|
* We can skip WAL-logging the insertions, unless PITR or streaming
|
||||||
* This may be harmless, but this function hasn't planned for it.
|
* replication is in use. We can skip the FSM in any case.
|
||||||
*/
|
*/
|
||||||
|
myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
|
||||||
|
if (!XLogIsNeeded())
|
||||||
|
myState->hi_options |= HEAP_INSERT_SKIP_WAL;
|
||||||
|
myState->bistate = GetBulkInsertState();
|
||||||
|
|
||||||
|
/* Not using WAL requires smgr_targblock be initially invalid */
|
||||||
Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber);
|
Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -482,6 +486,10 @@ transientrel_shutdown(DestReceiver *self)
|
|||||||
|
|
||||||
FreeBulkInsertState(myState->bistate);
|
FreeBulkInsertState(myState->bistate);
|
||||||
|
|
||||||
|
/* If we skipped using WAL, must heap_sync before commit */
|
||||||
|
if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
|
||||||
|
heap_sync(myState->transientrel);
|
||||||
|
|
||||||
/* close transientrel, but keep lock until commit */
|
/* close transientrel, but keep lock until commit */
|
||||||
heap_close(myState->transientrel, NoLock);
|
heap_close(myState->transientrel, NoLock);
|
||||||
myState->transientrel = NULL;
|
myState->transientrel = NULL;
|
||||||
|
@@ -4021,14 +4021,19 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 		newrel = NULL;
 
 	/*
-	 * Prepare a BulkInsertState and options for heap_insert. The FSM is
-	 * empty, so don't bother using it.
+	 * Prepare a BulkInsertState and options for heap_insert. Because we're
+	 * building a new heap, we can skip WAL-logging and fsync it to disk at
+	 * the end instead (unless WAL-logging is required for archiving or
+	 * streaming replication). The FSM is empty too, so don't bother using it.
 	 */
 	if (newrel)
 	{
 		mycid = GetCurrentCommandId(true);
 		bistate = GetBulkInsertState();
 
 		hi_options = HEAP_INSERT_SKIP_FSM;
+		if (!XLogIsNeeded())
+			hi_options |= HEAP_INSERT_SKIP_WAL;
 	}
 	else
 	{
@@ -4278,6 +4283,10 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 	{
 		FreeBulkInsertState(bistate);
 
+		/* If we skipped writing WAL, then we need to sync the heap. */
+		if (hi_options & HEAP_INSERT_SKIP_WAL)
+			heap_sync(newrel);
+
 		heap_close(newrel, NoLock);
 	}
 }
@@ -5979,19 +5988,14 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
 
 	/*
 	 * If TryReuseIndex() stashed a relfilenode for us, we used it for the new
-	 * index instead of building from scratch. Restore associated fields.
-	 * This may store InvalidSubTransactionId in both fields, in which case
-	 * relcache.c will assume it can rebuild the relcache entry. Hence, do
-	 * this after the CCI that made catalog rows visible to any rebuild. The
-	 * DROP of the old edition of this index will have scheduled the storage
-	 * for deletion at commit, so cancel that pending deletion.
+	 * index instead of building from scratch. The DROP of the old edition of
+	 * this index will have scheduled the storage for deletion at commit, so
+	 * cancel that pending deletion.
 	 */
 	if (OidIsValid(stmt->oldNode))
 	{
 		Relation	irel = index_open(address.objectId, NoLock);
 
-		irel->rd_createSubid = stmt->oldCreateSubid;
-		irel->rd_firstRelfilenodeSubid = stmt->oldFirstRelfilenodeSubid;
 		RelationPreserveStorage(irel->rd_node, true);
 		index_close(irel, NoLock);
 	}
@@ -9130,8 +9134,6 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt)
 		Relation	irel = index_open(oldId, NoLock);
 
 		stmt->oldNode = irel->rd_node.relNode;
-		stmt->oldCreateSubid = irel->rd_createSubid;
-		stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid;
 		index_close(irel, NoLock);
 	}
 }
@@ -9977,8 +9979,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
 
 	heap_close(pg_class, RowExclusiveLock);
 
-	RelationAssumeNewRelfilenode(rel);
-
 	relation_close(rel, NoLock);
 
 	/* Make sure the reltablespace change is visible */
@@ -10193,9 +10193,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 
 	/*
 	 * We need to log the copied data in WAL iff WAL archiving/streaming is
-	 * enabled AND it's a permanent relation. This gives the same answer as
-	 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
-	 * current operation created a new relfilenode.
+	 * enabled AND it's a permanent relation.
 	 */
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -10237,15 +10235,21 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	}
 
 	/*
-	 * When we WAL-logged rel pages, we must nonetheless fsync them. The
-	 * reason is that since we're copying outside shared buffers, a CHECKPOINT
-	 * occurring during the copy has no way to flush the previously written
-	 * data to disk (indeed it won't know the new rel even exists). A crash
-	 * later on would replay WAL from the checkpoint, therefore it wouldn't
-	 * replay our earlier WAL entries. If we do not fsync those pages here,
-	 * they might still not be on disk when the crash occurs.
+	 * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+	 * to ensure that the toast table gets fsync'd too. (For a temp or
+	 * unlogged rel we don't care since the data will be gone after a crash
+	 * anyway.)
+	 *
+	 * It's obvious that we must do this when not WAL-logging the copy. It's
+	 * less obvious that we have to do it even if we did WAL-log the copied
+	 * pages. The reason is that since we're copying outside shared buffers, a
+	 * CHECKPOINT occurring during the copy has no way to flush the previously
+	 * written data to disk (indeed it won't know the new rel even exists). A
+	 * crash later on would replay WAL from the checkpoint, therefore it
+	 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
+	 * here, they might still not be on disk when the crash occurs.
 	 */
-	if (use_wal || copying_initfork)
+	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
 		smgrimmedsync(dst, forkNum);
 }
 
@@ -3126,8 +3126,6 @@ _copyIndexStmt(const IndexStmt *from)
 	COPY_STRING_FIELD(idxcomment);
 	COPY_SCALAR_FIELD(indexOid);
 	COPY_SCALAR_FIELD(oldNode);
-	COPY_SCALAR_FIELD(oldCreateSubid);
-	COPY_SCALAR_FIELD(oldFirstRelfilenodeSubid);
 	COPY_SCALAR_FIELD(unique);
 	COPY_SCALAR_FIELD(primary);
 	COPY_SCALAR_FIELD(isconstraint);
@@ -1259,8 +1259,6 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b)
 	COMPARE_STRING_FIELD(idxcomment);
 	COMPARE_SCALAR_FIELD(indexOid);
 	COMPARE_SCALAR_FIELD(oldNode);
-	COMPARE_SCALAR_FIELD(oldCreateSubid);
-	COMPARE_SCALAR_FIELD(oldFirstRelfilenodeSubid);
 	COMPARE_SCALAR_FIELD(unique);
 	COMPARE_SCALAR_FIELD(primary);
 	COMPARE_SCALAR_FIELD(isconstraint);
@@ -2452,8 +2452,6 @@ _outIndexStmt(StringInfo str, const IndexStmt *node)
 	WRITE_STRING_FIELD(idxcomment);
 	WRITE_OID_FIELD(indexOid);
 	WRITE_OID_FIELD(oldNode);
-	WRITE_UINT_FIELD(oldCreateSubid);
-	WRITE_UINT_FIELD(oldFirstRelfilenodeSubid);
 	WRITE_BOOL_FIELD(unique);
 	WRITE_BOOL_FIELD(primary);
 	WRITE_BOOL_FIELD(isconstraint);
@@ -6664,8 +6664,6 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name
 					n->idxcomment = NULL;
 					n->indexOid = InvalidOid;
 					n->oldNode = InvalidOid;
-					n->oldCreateSubid = InvalidSubTransactionId;
-					n->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
 					n->primary = false;
 					n->isconstraint = false;
 					n->deferrable = false;
@@ -6692,8 +6690,6 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name
 					n->idxcomment = NULL;
 					n->indexOid = InvalidOid;
 					n->oldNode = InvalidOid;
-					n->oldCreateSubid = InvalidSubTransactionId;
-					n->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
 					n->primary = false;
 					n->isconstraint = false;
 					n->deferrable = false;
@@ -1121,8 +1121,6 @@ generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx,
 	index->idxcomment = NULL;
 	index->indexOid = InvalidOid;
 	index->oldNode = InvalidOid;
-	index->oldCreateSubid = InvalidSubTransactionId;
-	index->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
 	index->unique = idxrec->indisunique;
 	index->primary = idxrec->indisprimary;
 	index->transformed = true;	/* don't need transformIndexStmt */
@@ -1588,8 +1586,6 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
 	index->idxcomment = NULL;
 	index->indexOid = InvalidOid;
 	index->oldNode = InvalidOid;
-	index->oldCreateSubid = InvalidSubTransactionId;
-	index->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
 	index->transformed = false;
 	index->concurrent = false;
 	index->if_not_exists = false;
@@ -65,7 +65,7 @@
 #define BUF_WRITTEN				0x01
 #define BUF_REUSABLE			0x02
 
-#define RELS_BSEARCH_THRESHOLD		20
+#define DROP_RELS_BSEARCH_THRESHOLD	20
 
 typedef struct PrivateRefCountEntry
 {
@@ -104,19 +104,6 @@ typedef struct CkptTsStatus
 	int			index;
 } CkptTsStatus;
 
-/*
- * Type for array used to sort SMgrRelations
- *
- * FlushRelationsAllBuffers shares the same comparator function with
- * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
- * compatible.
- */
-typedef struct SMgrSortArray
-{
-	RelFileNode rnode;			/* This must be the first member */
-	SMgrRelation srel;
-} SMgrSortArray;
-
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -2990,7 +2977,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 	 * an exactly determined value, as it depends on many factors (CPU and RAM
 	 * speeds, amount of shared buffers etc.).
 	 */
-	use_bsearch = n > RELS_BSEARCH_THRESHOLD;
+	use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
 
 	/* sort the list of rnodes if necessary */
 	if (use_bsearch)
@@ -3240,104 +3227,6 @@ FlushRelationBuffers(Relation rel)
 	}
 }
 
-/* ---------------------------------------------------------------------
- *		FlushRelationsAllBuffers
- *
- *		This function flushes out of the buffer pool all the pages of all
- *		forks of the specified smgr relations. It's equivalent to calling
- *		FlushRelationBuffers once per fork per relation. The relations are
- *		assumed not to use local buffers.
- * --------------------------------------------------------------------
- */
-void
-FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
-{
-	int			i;
-	SMgrSortArray *srels;
-	bool		use_bsearch;
-
-	if (nrels == 0)
-		return;
-
-	/* fill-in array for qsort */
-	srels = palloc(sizeof(SMgrSortArray) * nrels);
-
-	for (i = 0; i < nrels; i++)
-	{
-		Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
-
-		srels[i].rnode = smgrs[i]->smgr_rnode.node;
-		srels[i].srel = smgrs[i];
-	}
-
-	/*
-	 * Save the bsearch overhead for low number of relations to sync. See
-	 * DropRelFileNodesAllBuffers for details.
-	 */
-	use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
-
-	/* sort the list of SMgrRelations if necessary */
-	if (use_bsearch)
-		pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
-
-	/* Make sure we can handle the pin inside the loop */
-	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
-
-	for (i = 0; i < NBuffers; i++)
-	{
-		SMgrSortArray *srelent = NULL;
-		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;
-
-		/*
-		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
-		 * and saves some cycles.
-		 */
-
-		if (!use_bsearch)
-		{
-			int			j;
-
-			for (j = 0; j < nrels; j++)
-			{
-				if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
-				{
-					srelent = &srels[j];
-					break;
-				}
-			}
-
-		}
-		else
-		{
-			srelent = bsearch((const void *) &(bufHdr->tag.rnode),
-							  srels, nrels, sizeof(SMgrSortArray),
-							  rnode_comparator);
-		}
-
-		/* buffer doesn't belong to any of the given relfilenodes; skip it */
-		if (srelent == NULL)
-			continue;
-
-		ReservePrivateRefCountEntry();
-
-		buf_state = LockBufHdr(bufHdr);
-		if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
-			(buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
-		{
-			PinBuffer_Locked(bufHdr);
-			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-			FlushBuffer(bufHdr, srelent->srel);
-			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
-			UnpinBuffer(bufHdr, true);
-		}
-		else
-			UnlockBufHdr(bufHdr, buf_state);
-	}
-
-	pfree(srels);
-}
-
 /* ---------------------------------------------------------------------
  *		FlushDatabaseBuffers
  *
@@ -3539,15 +3428,13 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		(pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
 	{
 		/*
-		 * If we must not write WAL, due to a relfilenode-specific
-		 * condition or being in recovery, don't dirty the page. We can
-		 * set the hint, just not dirty the page as a result so the hint
-		 * is lost when we evict the page or shutdown.
+		 * If we're in recovery we cannot dirty a page because of a hint.
+		 * We can set the hint, just not dirty the page as a result so the
+		 * hint is lost when we evict the page or shutdown.
 		 *
 		 * See src/backend/storage/page/README for longer discussion.
 		 */
-		if (RecoveryInProgress() ||
-			RelFileNodeSkippingWAL(bufHdr->tag.rnode))
+		if (RecoveryInProgress())
 			return;
 
 		/*
@@ -563,18 +563,6 @@ DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
 	return false;
 }
 
-#ifdef USE_ASSERT_CHECKING
-/*
- * GetLockMethodLocalHash -- return the hash of local locks, for modules that
- * evaluate assertions based on all locks held.
- */
-HTAB *
-GetLockMethodLocalHash(void)
-{
-	return LockMethodLocalHash;
-}
-#endif
-
 /*
  * LockHasWaiters -- look up 'locktag' and check if releasing this
  *		lock would wake up other processes waiting for it.
@@ -359,10 +359,11 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
-* file after populating it (as we do at wal_level=minimal), the contents of
-* the file would be lost forever. By leaving the empty file until after the
-* next checkpoint, we prevent reassignment of the relfilenode number until
-* it's safe, because relfilenode assignment skips over any existing file.
+* file after populating it (as for instance CLUSTER and CREATE INDEX do),
+* the contents of the file would be lost forever. By leaving the empty file
+* until after the next checkpoint, we prevent reassignment of the relfilenode
+* number until it's safe, because relfilenode assignment skips over any
+* existing file.
 *
 * We do not need to go through this dance for temp relations, though, because
 * we never make WAL entries for temp rels, and so a temp rel poses no threat
|
|||||||
* mdimmedsync() -- Immediately sync a relation to stable storage.
|
* mdimmedsync() -- Immediately sync a relation to stable storage.
|
||||||
*
|
*
|
||||||
* Note that only writes already issued are synced; this routine knows
|
* Note that only writes already issued are synced; this routine knows
|
||||||
* nothing of dirty buffers that may exist inside the buffer manager. We
|
* nothing of dirty buffers that may exist inside the buffer manager.
|
||||||
* sync active and inactive segments; smgrDoPendingSyncs() relies on this.
|
|
||||||
* Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
|
|
||||||
* some segment, then mdtruncate() renders that segment inactive. If we
|
|
||||||
* crash before the next checkpoint syncs the newly-inactive segment, that
|
|
||||||
* segment may survive recovery, reintroducing unwanted data into the table.
|
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
|
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
|
||||||
{
|
{
|
||||||
MdfdVec *v;
|
MdfdVec *v;
|
||||||
BlockNumber segno = 0;
|
|
||||||
bool active = true;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NOTE: mdnblocks makes sure we have opened all active segments, so that
|
* NOTE: mdnblocks makes sure we have opened all active segments, so that
|
||||||
@@ -1040,42 +1034,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 
 	v = mdopen(reln, forknum, EXTENSION_FAIL);
 
-	/*
-	 * Temporarily open inactive segments, then close them after sync. There
-	 * may be some inactive segments left opened after fsync() error, but that
-	 * is harmless. We don't bother to clean them up and take a risk of
-	 * further trouble. The next mdclose() will soon close them.
-	 */
 	while (v != NULL)
 	{
-		File		vfd = v->mdfd_vfd;
-
-		if (active)
-			v = v->mdfd_chain;
-		else
-		{
-			Assert(v->mdfd_chain == NULL);
-			pfree(v);
-			v = NULL;
-		}
-
-		if (FileSync(vfd) < 0)
+		if (FileSync(v->mdfd_vfd) < 0)
 			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
-							FilePathName(vfd))));
-
-		/* Close inactive segments immediately */
-		if (!active)
-			FileClose(vfd);
-
-		segno++;
-
-		if (v == NULL)
-		{
-			v = _mdfd_openseg(reln, forknum, segno, 0);
-			active = false;
-		}
+							FilePathName(v->mdfd_vfd))));
+		v = v->mdfd_chain;
 	}
 }
 
@@ -408,41 +408,6 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
 	(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
 }
 
-/*
- * smgrdosyncall() -- Immediately sync all forks of all given relations
- *
- *		All forks of all given relations are synced out to the store.
- *
- *		This is equivalent to FlushRelationBuffers() for each smgr relation,
- *		then calling smgrimmedsync() for all forks of each relation, but it's
- *		significantly quicker so should be preferred when possible.
- */
-void
-smgrdosyncall(SMgrRelation *rels, int nrels)
-{
-	int			i = 0;
-	ForkNumber	forknum;
-
-	if (nrels == 0)
-		return;
-
-	FlushRelationsAllBuffers(rels, nrels);
-
-	/*
-	 * Sync the physical file(s).
-	 */
-	for (i = 0; i < nrels; i++)
-	{
-		int			which = rels[i]->smgr_which;
-
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			if (smgrsw[which].smgr_exists(rels[i], forknum))
-				smgrsw[which].smgr_immedsync(rels[i], forknum);
-		}
-	}
-}
-
 /*
  * smgrdounlinkall() -- Immediately unlink all forks of all given relations
  *
src/backend/utils/cache/relcache.c
@@ -244,9 +244,6 @@ static void RelationReloadIndexInfo(Relation relation);
 static void RelationReloadNailed(Relation relation);
 static void RelationFlushRelation(Relation relation);
 static void RememberToFreeTupleDescAtEOX(TupleDesc td);
-#ifdef USE_ASSERT_CHECKING
-static void AssertPendingSyncConsistency(Relation relation);
-#endif
 static void AtEOXact_cleanup(Relation relation, bool isCommit);
 static void AtEOSubXact_cleanup(Relation relation, bool isCommit,
 					SubTransactionId mySubid, SubTransactionId parentSubid);
@@ -984,8 +981,6 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
 	relation->rd_isnailed = false;
 	relation->rd_createSubid = InvalidSubTransactionId;
 	relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-	relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
-	relation->rd_droppedSubid = InvalidSubTransactionId;
 	switch (relation->rd_rel->relpersistence)
 	{
 		case RELPERSISTENCE_UNLOGGED:
@@ -1609,8 +1604,6 @@ formrdesc(const char *relationName, Oid relationReltype,
 	relation->rd_isnailed = true;
 	relation->rd_createSubid = InvalidSubTransactionId;
 	relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-	relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
-	relation->rd_droppedSubid = InvalidSubTransactionId;
 	relation->rd_backend = InvalidBackendId;
 	relation->rd_islocaltemp = false;
 
@@ -1777,13 +1770,6 @@ RelationIdGetRelation(Oid relationId)
 
 	if (RelationIsValid(rd))
 	{
-		/* return NULL for dropped relations */
-		if (rd->rd_droppedSubid != InvalidSubTransactionId)
-		{
-			Assert(!rd->rd_isvalid);
-			return NULL;
-		}
-
 		RelationIncrementReferenceCount(rd);
 		/* revalidate cache entry if necessary */
 		if (!rd->rd_isvalid)
@@ -1876,7 +1862,7 @@ RelationClose(Relation relation)
 #ifdef RELCACHE_FORCE_RELEASE
 	if (RelationHasReferenceCountZero(relation) &&
 		relation->rd_createSubid == InvalidSubTransactionId &&
-		relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
+		relation->rd_newRelfilenodeSubid == InvalidSubTransactionId)
 		RelationClearRelation(relation, false);
 #endif
 }
@@ -1915,10 +1901,9 @@ RelationReloadIndexInfo(Relation relation)
 	HeapTuple	pg_class_tuple;
 	Form_pg_class relp;
 
-	/* Should be called only for invalidated, live indexes */
+	/* Should be called only for invalidated indexes */
 	Assert(relation->rd_rel->relkind == RELKIND_INDEX &&
-		   !relation->rd_isvalid &&
-		   relation->rd_droppedSubid == InvalidSubTransactionId);
+		   !relation->rd_isvalid);
 
 	/* Ensure it's closed at smgr level */
 	RelationCloseSmgr(relation);
@@ -2198,13 +2183,6 @@ RelationClearRelation(Relation relation, bool rebuild)
 		return;
 	}
 
-	/* Mark it invalid until we've finished rebuild */
-	relation->rd_isvalid = false;
-
-	/* See RelationForgetRelation(). */
-	if (relation->rd_droppedSubid != InvalidSubTransactionId)
-		return;
-
 	/*
 	 * Even non-system indexes should not be blown away if they are open and
 	 * have valid index support information. This avoids problems with active
@@ -2216,11 +2194,15 @@ RelationClearRelation(Relation relation, bool rebuild)
 		relation->rd_refcnt > 0 &&
 		relation->rd_indexcxt != NULL)
 	{
+		relation->rd_isvalid = false;	/* needs to be revalidated */
 		if (IsTransactionState())
 			RelationReloadIndexInfo(relation);
 		return;
 	}
 
+	/* Mark it invalid until we've finished rebuild */
+	relation->rd_isvalid = false;
+
 	/*
 	 * If we're really done with the relcache entry, blow it away. But if
 	 * someone is still using it, reconstruct the whole deal without moving
@@ -2278,12 +2260,12 @@ RelationClearRelation(Relation relation, bool rebuild)
 		 * problem.
 		 *
 		 * When rebuilding an open relcache entry, we must preserve ref count,
-		 * rd_*Subid, and rd_toastoid state. Also attempt to preserve the
-		 * pg_class entry (rd_rel), tupledesc, and rewrite-rule substructures
-		 * in place, because various places assume that these structures won't
-		 * move while they are working with an open relcache entry. (Note:
-		 * the refcount mechanism for tupledescs might someday allow us to
-		 * remove this hack for the tupledesc.)
+		 * rd_createSubid/rd_newRelfilenodeSubid, and rd_toastoid state. Also
+		 * attempt to preserve the pg_class entry (rd_rel), tupledesc, and
+		 * rewrite-rule substructures in place, because various places assume
+		 * that these structures won't move while they are working with an
+		 * open relcache entry. (Note: the refcount mechanism for tupledescs
+		 * might someday allow us to remove this hack for the tupledesc.)
 		 *
 		 * Note that this process does not touch CurrentResourceOwner; which
 		 * is good because whatever ref counts the entry may have do not
@@ -2360,8 +2342,6 @@ RelationClearRelation(Relation relation, bool rebuild)
 		/* creation sub-XIDs must be preserved */
 		SWAPFIELD(SubTransactionId, rd_createSubid);
 		SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid);
-		SWAPFIELD(SubTransactionId, rd_firstRelfilenodeSubid);
-		SWAPFIELD(SubTransactionId, rd_droppedSubid);
 		/* un-swap rd_rel pointers, swap contents instead */
 		SWAPFIELD(Form_pg_class, rd_rel);
 		/* ... but actually, we don't have to update newrel->rd_rel */
@@ -2398,12 +2378,12 @@ static void
 RelationFlushRelation(Relation relation)
 {
 	if (relation->rd_createSubid != InvalidSubTransactionId ||
-		relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
+		relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
 	{
 		/*
 		 * New relcache entries are always rebuilt, not flushed; else we'd
-		 * forget the "new" status of the relation. Ditto for the
-		 * new-relfilenode status.
+		 * forget the "new" status of the relation, which is a useful
+		 * optimization to have. Ditto for the new-relfilenode status.
 		 *
 		 * The rel could have zero refcnt here, so temporarily increment the
 		 * refcnt to ensure it's safe to rebuild it. We can assume that the
@@ -2425,7 +2405,10 @@ RelationFlushRelation(Relation relation)
 }
 
 /*
- * RelationForgetRelation - caller reports that it dropped the relation
+ * RelationForgetRelation - unconditionally remove a relcache entry
+ *
+ *		   External interface for destroying a relcache entry when we
+ *		   drop the relation.
 */
 void
 RelationForgetRelation(Oid rid)
@@ -2440,19 +2423,7 @@ RelationForgetRelation(Oid rid)
 	if (!RelationHasReferenceCountZero(relation))
 		elog(ERROR, "relation %u is still open", rid);
 
-	Assert(relation->rd_droppedSubid == InvalidSubTransactionId);
-	if (relation->rd_createSubid != InvalidSubTransactionId ||
-		relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
-	{
-		/*
-		 * In the event of subtransaction rollback, we must not forget
-		 * rd_*Subid. Mark the entry "dropped" so RelationClearRelation()
-		 * invalidates it in lieu of destroying it. (If we're in a top
-		 * transaction, we could opt to destroy the entry.)
-		 */
-		relation->rd_droppedSubid = GetCurrentSubTransactionId();
-	}
-
+	/* Unconditionally destroy the relcache entry */
 	RelationClearRelation(relation, false);
 }
 
@@ -2492,10 +2463,11 @@ RelationCacheInvalidateEntry(Oid relationId)
 *	 relation cache and re-read relation mapping data.
 *
 *	 This is currently used only to recover from SI message buffer overflow,
-*	 so we do not touch relations having new-in-transaction relfilenodes; they
-*	 cannot be targets of cross-backend SI updates (and our own updates now go
-*	 through a separate linked list that isn't limited by the SI message
-*	 buffer size).
+*	 so we do not touch new-in-transaction relations; they cannot be targets
+*	 of cross-backend SI updates (and our own updates now go through a
+*	 separate linked list that isn't limited by the SI message buffer size).
+*	 Likewise, we need not discard new-relfilenode-in-transaction hints,
+*	 since any invalidation of those would be a local event.
 *
 *	 We do this in two phases: the first pass deletes deletable items, and
 *	 the second one rebuilds the rebuildable items. This is essential for
@@ -2546,7 +2518,7 @@ RelationCacheInvalidate(void)
 		 * pending invalidations.
 		 */
 		if (relation->rd_createSubid != InvalidSubTransactionId ||
-			relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
+			relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
 			continue;
 
 		relcacheInvalsReceived++;
@@ -2658,96 +2630,6 @@ RememberToFreeTupleDescAtEOX(TupleDesc td)
 	EOXactTupleDescArray[NextEOXactTupleDescNum++] = td;
 }
 
-#ifdef USE_ASSERT_CHECKING
-/*
- * Relation kinds that have physical storage. These relations normally have
- * relfilenode set to non-zero, but it can also be zero if the relation is
- * mapped.
- */
-#define RELKIND_HAS_STORAGE(relkind) \
-	((relkind) == RELKIND_RELATION || \
-	 (relkind) == RELKIND_INDEX || \
-	 (relkind) == RELKIND_SEQUENCE || \
-	 (relkind) == RELKIND_TOASTVALUE || \
-	 (relkind) == RELKIND_MATVIEW)
-
-static void
-AssertPendingSyncConsistency(Relation relation)
-{
-	bool		relcache_verdict =
-		relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&
-		((relation->rd_createSubid != InvalidSubTransactionId &&
-		  RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) ||
-		 relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId);
-
-	Assert(relcache_verdict == RelFileNodeSkippingWAL(relation->rd_node));
-
-	if (relation->rd_droppedSubid != InvalidSubTransactionId)
-		Assert(!relation->rd_isvalid &&
-			   (relation->rd_createSubid != InvalidSubTransactionId ||
-				relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId));
-}
-
-/*
- * AssertPendingSyncs_RelationCache
- *
- *	Assert that relcache.c and storage.c agree on whether to skip WAL.
- */
-void
-AssertPendingSyncs_RelationCache(void)
-{
-	HASH_SEQ_STATUS status;
-	LOCALLOCK  *locallock;
-	Relation   *rels;
-	int			maxrels;
-	int			nrels;
-	RelIdCacheEnt *idhentry;
-	int			i;
-
-	/*
-	 * Open every relation that this transaction has locked. If, for some
-	 * relation, storage.c is skipping WAL and relcache.c is not skipping WAL,
-	 * a CommandCounterIncrement() typically yields a local invalidation
-	 * message that destroys the relcache entry. By recreating such entries
-	 * here, we detect the problem.
-	 */
-	PushActiveSnapshot(GetTransactionSnapshot());
-	maxrels = 1;
-	rels = palloc(maxrels * sizeof(*rels));
-	nrels = 0;
-	hash_seq_init(&status, GetLockMethodLocalHash());
-	while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
-	{
-		Oid			relid;
-		Relation	r;
-
-		if (locallock->nLocks <= 0)
-			continue;
-		if ((LockTagType) locallock->tag.lock.locktag_type !=
-			LOCKTAG_RELATION)
-			continue;
-		relid = ObjectIdGetDatum(locallock->tag.lock.locktag_field2);
-		r = RelationIdGetRelation(relid);
-		if (!RelationIsValid(r))
-			continue;
-		if (nrels >= maxrels)
-		{
-			maxrels *= 2;
-			rels = repalloc(rels, maxrels * sizeof(*rels));
-		}
-		rels[nrels++] = r;
-	}
-
-	hash_seq_init(&status, RelationIdCache);
-	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
-		AssertPendingSyncConsistency(idhentry->reldesc);
-
-	for (i = 0; i < nrels; i++)
-		RelationClose(rels[i]);
-	PopActiveSnapshot();
-}
-#endif
-
 /*
  * AtEOXact_RelationCache
  *
@@ -2830,8 +2712,6 @@ AtEOXact_RelationCache(bool isCommit)
 static void
 AtEOXact_cleanup(Relation relation, bool isCommit)
 {
-	bool		clear_relcache = false;
-
 	/*
 	 * The relcache entry's ref count should be back to its normal
 	 * not-in-a-transaction state: 0 unless it's nailed in cache.
@@ -2857,31 +2737,17 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
 #endif
 
 	/*
-	 * Is the relation live after this transaction ends?
+	 * Is it a relation created in the current transaction?
 	 *
-	 * During commit, clear the relcache entry if it is preserved after
-	 * relation drop, in order not to orphan the entry. During rollback,
-	 * clear the relcache entry if the relation is created in the current
-	 * transaction since it isn't interesting any longer once we are out of
-	 * the transaction.
+	 * During commit, reset the flag to zero, since we are now out of the
+	 * creating transaction. During abort, simply delete the relcache entry
+	 * --- it isn't interesting any longer.
 	 */
-	clear_relcache =
-		(isCommit ?
-		 relation->rd_droppedSubid != InvalidSubTransactionId :
-		 relation->rd_createSubid != InvalidSubTransactionId);
-
-	/*
-	 * Since we are now out of the transaction, reset the subids to zero.
-	 * That also lets RelationClearRelation() drop the relcache entry.
-	 */
-	relation->rd_createSubid = InvalidSubTransactionId;
-	relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-	relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
-	relation->rd_droppedSubid = InvalidSubTransactionId;
-
-	if (clear_relcache)
+	if (relation->rd_createSubid != InvalidSubTransactionId)
 	{
-		if (RelationHasReferenceCountZero(relation))
+		if (isCommit)
+			relation->rd_createSubid = InvalidSubTransactionId;
+		else if (RelationHasReferenceCountZero(relation))
 		{
 			RelationClearRelation(relation, false);
 			return;
@@ -2896,11 +2762,17 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
 			 * eventually. This must be just a WARNING to avoid
 			 * error-during-error-recovery loops.
 			 */
+			relation->rd_createSubid = InvalidSubTransactionId;
 			elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
 				 RelationGetRelationName(relation));
 		}
 	}
 
+	/*
+	 * Likewise, reset the hint about the relfilenode being new.
+	 */
+	relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+
 	/*
 	 * Flush any temporary index list.
 	 */
@@ -2975,28 +2847,15 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 	/*
 	 * Is it a relation created in the current subtransaction?
 	 *
-	 * During subcommit, mark it as belonging to the parent, instead, as long
-	 * as it has not been dropped. Otherwise simply delete the relcache entry.
-	 * --- it isn't interesting any longer.
+	 * During subcommit, mark it as belonging to the parent, instead. During
+	 * subabort, simply delete the relcache entry.
 	 */
 	if (relation->rd_createSubid == mySubid)
 	{
-		/*
-		 * Valid rd_droppedSubid means the corresponding relation is dropped
-		 * but the relcache entry is preserved for at-commit pending sync. We
-		 * need to drop it explicitly here not to make the entry orphan.
-		 */
-		Assert(relation->rd_droppedSubid == mySubid ||
-			   relation->rd_droppedSubid == InvalidSubTransactionId);
-		if (isCommit && relation->rd_droppedSubid == InvalidSubTransactionId)
+		if (isCommit)
 			relation->rd_createSubid = parentSubid;
 		else if (RelationHasReferenceCountZero(relation))
 		{
-			/* allow the entry to be removed */
-			relation->rd_createSubid = InvalidSubTransactionId;
-			relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-			relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
-			relation->rd_droppedSubid = InvalidSubTransactionId;
 			RelationClearRelation(relation, false);
 			return;
 		}
@@ -3016,8 +2875,7 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 	}
 
 	/*
-	 * Likewise, update or drop any new-relfilenode-in-subtransaction record
-	 * or drop record.
+	 * Likewise, update or drop any new-relfilenode-in-subtransaction hint.
 	 */
 	if (relation->rd_newRelfilenodeSubid == mySubid)
 	{
@@ -3027,22 +2885,6 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 			relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
 	}
 
-	if (relation->rd_firstRelfilenodeSubid == mySubid)
-	{
-		if (isCommit)
-			relation->rd_firstRelfilenodeSubid = parentSubid;
-		else
-			relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
-	}
-
-	if (relation->rd_droppedSubid == mySubid)
-	{
-		if (isCommit)
-			relation->rd_droppedSubid = parentSubid;
-		else
-			relation->rd_droppedSubid = InvalidSubTransactionId;
-	}
-
 	/*
 	 * Flush any temporary index list.
 	 */
@@ -3142,7 +2984,6 @@ RelationBuildLocalRelation(const char *relname,
 	/* it's being created in this transaction */
 	rel->rd_createSubid = GetCurrentSubTransactionId();
 	rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-	rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
 
 	/*
 	 * create a new tuple descriptor from the one passed in. We do this
@@ -3413,29 +3254,14 @@ RelationSetNewRelfilenode(Relation relation, char persistence,
 	 */
 	CommandCounterIncrement();
 
-	RelationAssumeNewRelfilenode(relation);
-}
-
 	/*
-	 * RelationAssumeNewRelfilenode
-	 *
-	 * Code that modifies pg_class.reltablespace or pg_class.relfilenode must call
-	 * this. The call shall precede any code that might insert WAL records whose
-	 * replay would modify bytes in the new RelFileNode, and the call shall follow
-	 * any WAL modifying bytes in the prior RelFileNode. See struct RelationData.
-	 * Ideally, call this as near as possible to the CommandCounterIncrement()
-	 * that makes the pg_class change visible (before it or after it); that
-	 * minimizes the chance of future development adding a forbidden WAL insertion
-	 * between RelationAssumeNewRelfilenode() and CommandCounterIncrement().
+	 * Mark the rel as having been given a new relfilenode in the current
+	 * (sub) transaction. This is a hint that can be used to optimize later
+	 * operations on the rel in the same transaction.
 	 */
-void
-RelationAssumeNewRelfilenode(Relation relation)
-{
 	relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId();
-	if (relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
-		relation->rd_firstRelfilenodeSubid = relation->rd_newRelfilenodeSubid;
 
-	/* Flag relation as needing eoxact cleanup (to clear these fields) */
+	/* Flag relation as needing eoxact cleanup (to remove the hint) */
 	EOXactListAdd(relation);
 }
 
@@ -5434,8 +5260,6 @@ load_relcache_init_file(bool shared)
 		rel->rd_idattr = NULL;
 		rel->rd_createSubid = InvalidSubTransactionId;
 		rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-		rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
-		rel->rd_droppedSubid = InvalidSubTransactionId;
 		rel->rd_amcache = NULL;
 		MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
 
@@ -32,7 +32,6 @@
 #include "access/twophase.h"
 #include "access/xact.h"
 #include "catalog/namespace.h"
-#include "catalog/storage.h"
 #include "commands/async.h"
 #include "commands/prepare.h"
 #include "commands/vacuum.h"
@@ -2323,17 +2322,6 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
-	{
-		{"wal_skip_threshold", PGC_USERSET, WAL_SETTINGS,
-			gettext_noop("Size of new file to fsync instead of writing WAL."),
-			NULL,
-			GUC_UNIT_KB
-		},
-		&wal_skip_threshold,
-		2048, 0, MAX_KILOBYTES,
-		NULL, NULL, NULL
-	},
-
 	{
 		/* see max_connections */
 		{"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING,
@@ -197,7 +197,6 @@
 					# (change requires restart)
 #wal_writer_delay = 200ms		# 1-10000 milliseconds
 #wal_writer_flush_after = 1MB		# measured in pages, 0 disables
-#wal_skip_threshold = 2MB
 
 #commit_delay = 0			# range 0-100000, in microseconds
 #commit_siblings = 5			# range 1-1000
@@ -189,7 +189,6 @@ typedef GISTScanOpaqueData *GISTScanOpaque;
 /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
 /* #define XLOG_GIST_PAGE_DELETE 0x60 */	/* not used anymore */
-#define XLOG_GIST_ASSIGN_LSN		0x70	/* nop, assign new LSN */
 
 /*
  * Backup Blk 0: updated page.
@@ -478,8 +477,6 @@ extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
 			  BlockNumber origrlink, GistNSN oldnsn,
 			  Buffer leftchild, bool markfollowright);
 
-extern XLogRecPtr gistXLogAssignLSN(void);
-
 /* gistget.c */
 extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir);
 extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
@@ -25,6 +25,7 @@
 
 
 /* "options" flag bits for heap_insert */
+#define HEAP_INSERT_SKIP_WAL	0x0001
 #define HEAP_INSERT_SKIP_FSM	0x0002
 #define HEAP_INSERT_FROZEN		0x0004
 #define HEAP_INSERT_SPECULATIVE 0x0008
@@ -23,7 +23,7 @@ typedef struct RewriteStateData *RewriteState;
 
 extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
 				   TransactionId OldestXmin, TransactionId FreezeXid,
-				   MultiXactId MultiXactCutoff);
+				   MultiXactId MultiXactCutoff, bool use_wal);
 extern void end_heap_rewrite(RewriteState state);
 extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
 				   HeapTuple newTuple);
@@ -18,22 +18,16 @@
 #include "storage/relfilenode.h"
 #include "utils/relcache.h"
 
-/* GUC variables */
-extern int	wal_skip_threshold;
-
 extern void RelationCreateStorage(RelFileNode rnode, char relpersistence);
 extern void RelationDropStorage(Relation rel);
 extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
-extern void RelationPreTruncate(Relation rel);
 extern void RelationTruncate(Relation rel, BlockNumber nblocks);
-extern bool RelFileNodeSkippingWAL(RelFileNode rnode);
 
 /*
  * These functions used to be in storage/smgr/smgr.c, which explains the
  * naming
 */
 extern void smgrDoPendingDeletes(bool isCommit);
-extern void smgrDoPendingSyncs(bool isCommit);
 extern int	smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr);
 extern void AtSubCommit_smgr(void);
 extern void AtSubAbort_smgr(void);
@@ -2454,9 +2454,6 @@ typedef struct IndexStmt
 	bool		transformed;	/* true when transformIndexStmt is finished */
 	bool		concurrent;		/* should this be a concurrent index build? */
 	bool		if_not_exists;	/* just do nothing if index already exists? */
-	SubTransactionId oldCreateSubid;	/* rd_createSubid of oldNode */
-	SubTransactionId oldFirstRelfilenodeSubid;	/* rd_firstRelfilenodeSubid of
-												 * oldNode */
 } IndexStmt;
 
 /* ----------------------
@@ -50,9 +50,6 @@ typedef enum
 /* forward declared, to avoid having to expose buf_internals.h here */
 struct WritebackContext;
 
-/* forward declared, to avoid including smgr.h here */
-struct SMgrRelationData;
-
 /* in globals.c ... this duplicates miscadmin.h */
 extern PGDLLIMPORT int NBuffers;
 
@@ -193,7 +190,6 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
 				   ForkNumber forkNum);
 extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
-extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
 				   ForkNumber forkNum, BlockNumber firstDelBlock);
@@ -541,9 +541,6 @@ extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks);
 extern void LockReleaseSession(LOCKMETHODID lockmethodid);
 extern void LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks);
 extern void LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks);
-#ifdef USE_ASSERT_CHECKING
-extern HTAB *GetLockMethodLocalHash(void);
-#endif
 extern bool LockHasWaiters(const LOCKTAG *locktag,
 				LOCKMODE lockmode, bool sessionLock);
 extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag,
@@ -87,7 +87,6 @@ extern void smgrcloseall(void);
 extern void smgrclosenode(RelFileNodeBackend rnode);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdounlink(SMgrRelation reln, bool isRedo);
-extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
@@ -65,43 +65,25 @@ typedef struct RelationData
 
 	/*----------
 	 * rd_createSubid is the ID of the highest subtransaction the rel has
-	 * survived into or zero if the rel or its rd_node was created before the
-	 * current top transaction. (IndexStmt.oldNode leads to the case of a new
-	 * rel with an old rd_node.) rd_firstRelfilenodeSubid is the ID of the
-	 * highest subtransaction an rd_node change has survived into or zero if
-	 * rd_node matches the value it had at the start of the current top
-	 * transaction. (Rolling back the subtransaction that
-	 * rd_firstRelfilenodeSubid denotes would restore rd_node to the value it
-	 * had at the start of the current top transaction. Rolling back any
-	 * lower subtransaction would not.) Their accuracy is critical to
-	 * RelationNeedsWAL().
-	 *
-	 * rd_newRelfilenodeSubid is the ID of the highest subtransaction the
-	 * most-recent relfilenode change has survived into or zero if not changed
-	 * in the current transaction (or we have forgotten changing it). This
-	 * field is accurate when non-zero, but it can be zero when a relation has
-	 * multiple new relfilenodes within a single transaction, with one of them
-	 * occurring in a subsequently aborted subtransaction, e.g.
+	 * survived into; or zero if the rel was not created in the current top
+	 * transaction. This can be now be relied on, whereas previously it could
+	 * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is
+	 * the ID of the highest subtransaction the relfilenode change has
+	 * survived into, or zero if not changed in the current transaction (or we
+	 * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten
+	 * when a relation has multiple new relfilenodes within a single
+	 * transaction, with one of them occurring in a subsequently aborted
+	 * subtransaction, e.g.
 	 *		BEGIN;
 	 *		TRUNCATE t;
 	 *		SAVEPOINT save;
 	 *		TRUNCATE t;
 	 *		ROLLBACK TO save;
 	 *		-- rd_newRelfilenodeSubid is now forgotten
-	 *
-	 * If every rd_*Subid field is zero, they are read-only outside
-	 * relcache.c. Files that trigger rd_node changes by updating
-	 * pg_class.reltablespace and/or pg_class.relfilenode call
-	 * RelationAssumeNewRelfilenode() to update rd_*Subid.
-	 *
-	 * rd_droppedSubid is the ID of the highest subtransaction that a drop of
-	 * the rel has survived into. In entries visible outside relcache.c, this
-	 * is always zero.
 	 */
 	SubTransactionId rd_createSubid;	/* rel was created in current xact */
-	SubTransactionId rd_newRelfilenodeSubid;	/* highest subxact changing
-												 * rd_node to current value */
-	/* see end for rd_firstRelfilenodeSubid and rd_droppedSubid */
+	SubTransactionId rd_newRelfilenodeSubid;	/* new relfilenode assigned in
+												 * current xact */
 
 	Form_pg_class rd_rel;		/* RELATION tuple */
 	TupleDesc	rd_att;			/* tuple descriptor */
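(Editorial aside, not part of the diff: the removed half of the comment above states that rolling back the subtransaction recorded in rd_firstRelfilenodeSubid restores rd_node to its value from the start of the top transaction. A minimal SQL sketch of that situation, assuming a pre-existing table "t" and tablespace "other", neither of which comes from this commit:)

-- Illustrative only; table "t" and tablespace "other" are assumed to exist.
BEGIN;
SAVEPOINT s;
ALTER TABLE t SET TABLESPACE other;  -- relfilenode changes under subxact s
ROLLBACK TO s;                       -- rd_node reverts to its top-level value
COMMIT;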
@@ -195,10 +177,6 @@ typedef struct RelationData
 
 	/* use "struct" here to avoid needing to include pgstat.h: */
 	struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
-
-	SubTransactionId rd_firstRelfilenodeSubid;	/* highest subxact changing
-												 * rd_node to any value */
-	SubTransactionId rd_droppedSubid;	/* dropped with another Subid set */
 } RelationData;
 
 
@@ -485,16 +463,9 @@ typedef struct ViewOptions
 /*
  * RelationNeedsWAL
  *		True if relation needs WAL.
- *
- * Returns false if wal_level = minimal and this relation is created or
- * truncated in the current transaction. See "Skipping WAL for New
- * RelFileNode" in src/backend/access/transam/README.
  */
 #define RelationNeedsWAL(relation) \
-	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \
-	 (XLogIsNeeded() || \
-	  (relation->rd_createSubid == InvalidSubTransactionId && \
-	   relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))
+	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
 
 /*
  * RelationUsesLocalBuffers
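(Editorial aside, not part of the diff: the branch deleted from RelationNeedsWAL() above is what let wal_level = minimal skip WAL for a permanent relation whose relfilenode was created or changed in the current transaction; after the revert the macro depends on relpersistence alone. A hedged SQL sketch of the kind of transaction that branch targeted, assuming wal_level = minimal and a CSV file at the hypothetical path shown:)

-- Illustrative only; assumes wal_level = minimal and that /tmp/bulk.csv exists.
BEGIN;
CREATE TABLE bulk_load (id int, payload text);
COPY bulk_load FROM '/tmp/bulk.csv' WITH (FORMAT csv);  -- eligible to skip WAL before this revert
COMMIT;  -- the new data is made durable at commit instead of record by record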
@@ -95,11 +95,10 @@ extern Relation RelationBuildLocalRelation(const char *relname,
 				   char relkind);
 
 /*
- * Routines to manage assignment of new relfilenode to a relation
+ * Routine to manage assignment of new relfilenode to a relation
  */
 extern void RelationSetNewRelfilenode(Relation relation, char persistence,
 				   TransactionId freezeXid, MultiXactId minmulti);
-extern void RelationAssumeNewRelfilenode(Relation relation);
 
 /*
  * Routines for flushing/rebuilding relcache entries in various scenarios
@@ -112,11 +111,6 @@ extern void RelationCacheInvalidate(void);
 
 extern void RelationCloseSmgrByOid(Oid relationId);
 
-#ifdef USE_ASSERT_CHECKING
-extern void AssertPendingSyncs_RelationCache(void);
-#else
-#define AssertPendingSyncs_RelationCache() do {} while (0)
-#endif
 extern void AtEOXact_RelationCache(bool isCommit);
 extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
 				   SubTransactionId parentSubid);
@@ -1,372 +0,0 @@
-# Test WAL replay when some operation has skipped WAL.
-#
-# These tests exercise code that once violated the mandate described in
-# src/backend/access/transam/README section "Skipping WAL for New
-# RelFileNode". The tests work by committing some transactions, initiating an
-# immediate shutdown, and confirming that the expected data survives recovery.
-# For many years, individual commands made the decision to skip WAL, hence the
-# frequent appearance of COPY in these tests.
-use strict;
-use warnings;
-
-use PostgresNode;
-use TestLib;
-use Test::More tests => 34;
-
-sub check_orphan_relfilenodes
-{
-    my ($node, $test_name) = @_;
-
-    my $db_oid = $node->safe_psql('postgres',
-        "SELECT oid FROM pg_database WHERE datname = 'postgres'");
-    my $prefix = "base/$db_oid/";
-    my $filepaths_referenced = $node->safe_psql(
-        'postgres', "
-        SELECT pg_relation_filepath(oid) FROM pg_class
-        WHERE reltablespace = 0 AND relpersistence <> 't' AND
-        pg_relation_filepath(oid) IS NOT NULL;");
-    is_deeply(
-        [
-            sort(map { "$prefix$_" }
-                grep(/^[0-9]+$/, slurp_dir($node->data_dir . "/$prefix")))
-        ],
-        [ sort split /\n/, $filepaths_referenced ],
-        $test_name);
-    return;
-}
-
-# We run this same test suite for both wal_level=minimal and replica.
-sub run_wal_optimize
-{
-    my $wal_level = shift;
-
-    my $node = get_new_node("node_$wal_level");
-    $node->init;
-    $node->append_conf(
-        'postgresql.conf', qq(
-wal_level = $wal_level
-max_prepared_transactions = 1
-wal_log_hints = on
-wal_skip_threshold = 0
-#wal_debug = on
-));
-    $node->start;
-
-    # Setup
-    my $tablespace_dir = $node->basedir . '/tablespace_other';
-    mkdir($tablespace_dir);
-    $tablespace_dir = TestLib::perl2host($tablespace_dir);
-    $node->safe_psql('postgres',
-        "CREATE TABLESPACE other LOCATION '$tablespace_dir';");
-
-    # Test direct truncation optimization. No tuples.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE trunc (id serial PRIMARY KEY);
-        TRUNCATE trunc;
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    my $result = $node->safe_psql('postgres', "SELECT count(*) FROM trunc;");
-    is($result, qq(0), "wal_level = $wal_level, TRUNCATE with empty table");
-
-    # Test truncation with inserted tuples within the same transaction.
-    # Tuples inserted after the truncation should be seen.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE trunc_ins (id serial PRIMARY KEY);
-        INSERT INTO trunc_ins VALUES (DEFAULT);
-        TRUNCATE trunc_ins;
-        INSERT INTO trunc_ins VALUES (DEFAULT);
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres',
-        "SELECT count(*), min(id) FROM trunc_ins;");
-    is($result, qq(1|2), "wal_level = $wal_level, TRUNCATE INSERT");
-
-    # Same for prepared transaction.
-    # Tuples inserted after the truncation should be seen.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE twophase (id serial PRIMARY KEY);
-        INSERT INTO twophase VALUES (DEFAULT);
-        TRUNCATE twophase;
-        INSERT INTO twophase VALUES (DEFAULT);
-        PREPARE TRANSACTION 't';
-        COMMIT PREPARED 't';");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres',
-        "SELECT count(*), min(id) FROM trunc_ins;");
-    is($result, qq(1|2), "wal_level = $wal_level, TRUNCATE INSERT PREPARE");
-
-    # Writing WAL at end of xact, instead of syncing.
-    $node->safe_psql(
-        'postgres', "
-        SET wal_skip_threshold = '1TB';
-        BEGIN;
-        CREATE TABLE noskip (id serial PRIMARY KEY);
-        INSERT INTO noskip (SELECT FROM generate_series(1, 20000) a) ;
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM noskip;");
-    is($result, qq(20000), "wal_level = $wal_level, end-of-xact WAL");
-
-    # Data file for COPY query in subsequent tests
-    my $basedir = $node->basedir;
-    my $copy_file = "$basedir/copy_data.txt";
-    TestLib::append_to_file(
-        $copy_file, qq(20000,30000
-20001,30001
-20002,30002));
-
-    # Test truncation with inserted tuples using both INSERT and COPY. Tuples
-    # inserted after the truncation should be seen.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE ins_trunc (id serial PRIMARY KEY, id2 int);
-        INSERT INTO ins_trunc VALUES (DEFAULT, generate_series(1,10000));
-        TRUNCATE ins_trunc;
-        INSERT INTO ins_trunc (id, id2) VALUES (DEFAULT, 10000);
-        COPY ins_trunc FROM '$copy_file' DELIMITER ',';
-        INSERT INTO ins_trunc (id, id2) VALUES (DEFAULT, 10000);
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_trunc;");
-    is($result, qq(5), "wal_level = $wal_level, TRUNCATE COPY INSERT");
-
-    # Test truncation with inserted tuples using COPY. Tuples copied after
-    # the truncation should be seen.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE trunc_copy (id serial PRIMARY KEY, id2 int);
-        INSERT INTO trunc_copy VALUES (DEFAULT, generate_series(1,3000));
-        TRUNCATE trunc_copy;
-        COPY trunc_copy FROM '$copy_file' DELIMITER ',';
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result =
-      $node->safe_psql('postgres', "SELECT count(*) FROM trunc_copy;");
-    is($result, qq(3), "wal_level = $wal_level, TRUNCATE COPY");
-
-    # Like previous test, but rollback SET TABLESPACE in a subtransaction.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE spc_abort (id serial PRIMARY KEY, id2 int);
-        INSERT INTO spc_abort VALUES (DEFAULT, generate_series(1,3000));
-        TRUNCATE spc_abort;
-        SAVEPOINT s;
-        ALTER TABLE spc_abort SET TABLESPACE other; ROLLBACK TO s;
-        COPY spc_abort FROM '$copy_file' DELIMITER ',';
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_abort;");
-    is($result, qq(3),
-        "wal_level = $wal_level, SET TABLESPACE abort subtransaction");
-
-    # in different subtransaction patterns
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE spc_commit (id serial PRIMARY KEY, id2 int);
-        INSERT INTO spc_commit VALUES (DEFAULT, generate_series(1,3000));
-        TRUNCATE spc_commit;
-        SAVEPOINT s; ALTER TABLE spc_commit SET TABLESPACE other; RELEASE s;
-        COPY spc_commit FROM '$copy_file' DELIMITER ',';
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result =
-      $node->safe_psql('postgres', "SELECT count(*) FROM spc_commit;");
-    is($result, qq(3),
-        "wal_level = $wal_level, SET TABLESPACE commit subtransaction");
-
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE spc_nest (id serial PRIMARY KEY, id2 int);
-        INSERT INTO spc_nest VALUES (DEFAULT, generate_series(1,3000));
-        TRUNCATE spc_nest;
-        SAVEPOINT s;
-        ALTER TABLE spc_nest SET TABLESPACE other;
-        SAVEPOINT s2;
-        ALTER TABLE spc_nest SET TABLESPACE pg_default;
-        ROLLBACK TO s2;
-        SAVEPOINT s2;
-        ALTER TABLE spc_nest SET TABLESPACE pg_default;
-        RELEASE s2;
-        ROLLBACK TO s;
-        COPY spc_nest FROM '$copy_file' DELIMITER ',';
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_nest;");
-    is($result, qq(3),
-        "wal_level = $wal_level, SET TABLESPACE nested subtransaction");
-
-    $node->safe_psql(
-        'postgres', "
-        CREATE TABLE spc_hint (id int);
-        INSERT INTO spc_hint VALUES (1);
-        BEGIN;
-        ALTER TABLE spc_hint SET TABLESPACE other;
-        CHECKPOINT;
-        SELECT * FROM spc_hint; -- set hint bit
-        INSERT INTO spc_hint VALUES (2);
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_hint;");
-    is($result, qq(2), "wal_level = $wal_level, SET TABLESPACE, hint bit");
-
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE idx_hint (c int PRIMARY KEY);
-        SAVEPOINT q; INSERT INTO idx_hint VALUES (1); ROLLBACK TO q;
-        CHECKPOINT;
-        INSERT INTO idx_hint VALUES (1); -- set index hint bit
-        INSERT INTO idx_hint VALUES (2);
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->psql('postgres',);
-    my ($ret, $stdout, $stderr) =
-      $node->psql('postgres', "INSERT INTO idx_hint VALUES (2);");
-    is($ret, qq(3), "wal_level = $wal_level, unique index LP_DEAD");
-    like(
-        $stderr,
-        qr/violates unique/,
-        "wal_level = $wal_level, unique index LP_DEAD message");
-
-    # UPDATE touches two buffers for one row.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE upd (id serial PRIMARY KEY, id2 int);
-        INSERT INTO upd (id, id2) VALUES (DEFAULT, generate_series(1,10000));
-        COPY upd FROM '$copy_file' DELIMITER ',';
-        UPDATE upd SET id2 = id2 + 1;
-        DELETE FROM upd;
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM upd;");
-    is($result, qq(0),
-        "wal_level = $wal_level, UPDATE touches two buffers for one row");
-
-    # Test consistency of COPY with INSERT for table created in the same
-    # transaction.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE ins_copy (id serial PRIMARY KEY, id2 int);
-        INSERT INTO ins_copy VALUES (DEFAULT, 1);
-        COPY ins_copy FROM '$copy_file' DELIMITER ',';
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_copy;");
-    is($result, qq(4), "wal_level = $wal_level, INSERT COPY");
-
-    # Test consistency of COPY that inserts more to the same table using
-    # triggers. If the INSERTS from the trigger go to the same block data
-    # is copied to, and the INSERTs are WAL-logged, WAL replay will fail when
-    # it tries to replay the WAL record but the "before" image doesn't match,
-    # because not all changes were WAL-logged.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE ins_trig (id serial PRIMARY KEY, id2 text);
-        CREATE FUNCTION ins_trig_before_row_trig() RETURNS trigger
-          LANGUAGE plpgsql as \$\$
-          BEGIN
-            IF new.id2 NOT LIKE 'triggered%' THEN
-              INSERT INTO ins_trig
-                VALUES (DEFAULT, 'triggered row before' || NEW.id2);
-            END IF;
-            RETURN NEW;
-          END; \$\$;
-        CREATE FUNCTION ins_trig_after_row_trig() RETURNS trigger
-          LANGUAGE plpgsql as \$\$
-          BEGIN
-            IF new.id2 NOT LIKE 'triggered%' THEN
-              INSERT INTO ins_trig
-                VALUES (DEFAULT, 'triggered row after' || NEW.id2);
-            END IF;
-            RETURN NEW;
-          END; \$\$;
-        CREATE TRIGGER ins_trig_before_row_insert
-          BEFORE INSERT ON ins_trig
-          FOR EACH ROW EXECUTE PROCEDURE ins_trig_before_row_trig();
-        CREATE TRIGGER ins_trig_after_row_insert
-          AFTER INSERT ON ins_trig
-          FOR EACH ROW EXECUTE PROCEDURE ins_trig_after_row_trig();
-        COPY ins_trig FROM '$copy_file' DELIMITER ',';
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_trig;");
-    is($result, qq(9), "wal_level = $wal_level, COPY with INSERT triggers");
-
-    # Test consistency of INSERT, COPY and TRUNCATE in same transaction block
-    # with TRUNCATE triggers.
-    $node->safe_psql(
-        'postgres', "
-        BEGIN;
-        CREATE TABLE trunc_trig (id serial PRIMARY KEY, id2 text);
-        CREATE FUNCTION trunc_trig_before_stat_trig() RETURNS trigger
-          LANGUAGE plpgsql as \$\$
-          BEGIN
-            INSERT INTO trunc_trig VALUES (DEFAULT, 'triggered stat before');
-            RETURN NULL;
-          END; \$\$;
-        CREATE FUNCTION trunc_trig_after_stat_trig() RETURNS trigger
-          LANGUAGE plpgsql as \$\$
-          BEGIN
-            INSERT INTO trunc_trig VALUES (DEFAULT, 'triggered stat before');
-            RETURN NULL;
-          END; \$\$;
-        CREATE TRIGGER trunc_trig_before_stat_truncate
-          BEFORE TRUNCATE ON trunc_trig
-          FOR EACH STATEMENT EXECUTE PROCEDURE trunc_trig_before_stat_trig();
-        CREATE TRIGGER trunc_trig_after_stat_truncate
-          AFTER TRUNCATE ON trunc_trig
-          FOR EACH STATEMENT EXECUTE PROCEDURE trunc_trig_after_stat_trig();
-        INSERT INTO trunc_trig VALUES (DEFAULT, 1);
-        TRUNCATE trunc_trig;
-        COPY trunc_trig FROM '$copy_file' DELIMITER ',';
-        COMMIT;");
-    $node->stop('immediate');
-    $node->start;
-    $result =
-      $node->safe_psql('postgres', "SELECT count(*) FROM trunc_trig;");
-    is($result, qq(4),
-        "wal_level = $wal_level, TRUNCATE COPY with TRUNCATE triggers");
-
-    # Test redo of temp table creation.
-    $node->safe_psql(
-        'postgres', "
-        CREATE TEMP TABLE temp (id serial PRIMARY KEY, id2 text);");
-    $node->stop('immediate');
-    $node->start;
-    check_orphan_relfilenodes($node,
-        "wal_level = $wal_level, no orphan relfilenode remains");
-
-    return;
-}
-
-# Run same test suite for multiple wal_level values.
-run_wal_optimize("minimal");
-run_wal_optimize("replica");
@@ -2016,12 +2016,6 @@ select * from another;
 (3 rows)
 
 drop table another;
--- Create an index that skips WAL, then perform a SET DATA TYPE that skips
--- rewriting the index.
-begin;
-create table skip_wal_skip_rewrite_index (c varchar(10) primary key);
-alter table skip_wal_skip_rewrite_index alter c type varchar(20);
-commit;
 -- table's row type
 create table tab1 (a int, b text);
 create table tab2 (x int, y tab1);
@@ -267,16 +267,3 @@ DEALLOCATE select1;
 -- check that the oid column is added before the primary key is checked
 CREATE TABLE oid_pk (f1 INT, PRIMARY KEY(oid)) WITH OIDS;
 DROP TABLE oid_pk;
--- Verify that subtransaction rollback restores rd_createSubid.
-BEGIN;
-CREATE TABLE remember_create_subid (c int);
-SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_create_subid;
--- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid.
-CREATE TABLE remember_node_subid (c int);
-BEGIN;
-ALTER TABLE remember_node_subid ALTER c TYPE bigint;
-SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_node_subid;
@@ -1348,13 +1348,6 @@ select * from another;
 
 drop table another;
 
--- Create an index that skips WAL, then perform a SET DATA TYPE that skips
--- rewriting the index.
-begin;
-create table skip_wal_skip_rewrite_index (c varchar(10) primary key);
-alter table skip_wal_skip_rewrite_index alter c type varchar(20);
-commit;
-
 -- table's row type
 create table tab1 (a int, b text);
 create table tab2 (x int, y tab1);
@@ -277,18 +277,3 @@ DEALLOCATE select1;
 -- check that the oid column is added before the primary key is checked
 CREATE TABLE oid_pk (f1 INT, PRIMARY KEY(oid)) WITH OIDS;
 DROP TABLE oid_pk;
-
--- Verify that subtransaction rollback restores rd_createSubid.
-BEGIN;
-CREATE TABLE remember_create_subid (c int);
-SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_create_subid;
-
--- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid.
-CREATE TABLE remember_node_subid (c int);
-BEGIN;
-ALTER TABLE remember_node_subid ALTER c TYPE bigint;
-SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q;
-COMMIT;
-DROP TABLE remember_node_subid;