From 077ad4bd76b12cd4144e40ef5fba3f49528fa5e4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 16 Aug 2024 14:45:37 +0300
Subject: [PATCH] Relax fsyncing at end of a bulk load that was not WAL-logged

And improve the comments.

Backpatch to v17 where this was introduced.

Reviewed-by: Noah Misch
Discussion: https://www.postgresql.org/message-id/cac7d1b6-8358-40be-af0b-21bc9b27d34c@iki.fi
---
 src/backend/storage/smgr/bulk_write.c | 72 +++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 11 deletions(-)

diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
index 4a10ece4c39..1a5f3ce96e1 100644
--- a/src/backend/storage/smgr/bulk_write.c
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -132,19 +132,69 @@ smgr_bulk_finish(BulkWriteState *bulkstate)
 	smgr_bulk_flush(bulkstate);
 
 	/*
-	 * When we wrote out the pages, we passed skipFsync=true to avoid the
-	 * overhead of registering all the writes with the checkpointer.  Register
-	 * the whole relation now.
-	 *
-	 * There is one hole in that idea: If a checkpoint occurred while we were
-	 * writing the pages, it already missed fsyncing the pages we had written
-	 * before the checkpoint started.  A crash later on would replay the WAL
-	 * starting from the checkpoint, therefore it wouldn't replay our earlier
-	 * WAL records.  So if a checkpoint started after the bulk write, fsync
-	 * the files now.
+	 * Fsync the relation, or register it for the next checkpoint, if
+	 * necessary.
 	 */
-	if (!SmgrIsTemp(bulkstate->smgr))
+	if (SmgrIsTemp(bulkstate->smgr))
 	{
+		/* Temporary relations don't need to be fsync'd, ever */
+	}
+	else if (!bulkstate->use_wal)
+	{
+		/*----------
+		 * This is either an unlogged relation, or a permanent relation but we
+		 * skipped WAL-logging because wal_level=minimal:
+		 *
+		 * A) Unlogged relation
+		 *
+		 *    Unlogged relations will go away on crash, but they need to be
+		 *    fsync'd on a clean shutdown. It's sufficient to call
+		 *    smgrregistersync(), that ensures that the checkpointer will
+		 *    flush it at the shutdown checkpoint. (It will flush it on the
+		 *    next online checkpoint too, which is not strictly necessary.)
+		 *
+		 *    Note that the init-fork of an unlogged relation is not
+		 *    considered unlogged for our purposes. It's treated like a
+		 *    regular permanent relation. The callers will pass use_wal=true
+		 *    for the init fork.
+		 *
+		 * B) Permanent relation, WAL-logging skipped because wal_level=minimal
+		 *
+		 *    This is a new relation, and we didn't WAL-log the pages as we
+		 *    wrote, but they need to be fsync'd before commit.
+		 *
+		 *    We don't need to do that here, however. The fsync() is done at
+		 *    commit, by smgrDoPendingSyncs() (*).
+		 *
+		 *    (*) smgrDoPendingSyncs() might decide to WAL-log the whole
+		 *    relation at commit instead of fsyncing it, if the relation was
+		 *    very small, but it's smgrDoPendingSyncs() responsibility in any
+		 *    case.
+		 *
+		 * We cannot distinguish the two here, so conservatively assume it's
+		 * an unlogged relation. A permanent relation with wal_level=minimal
+		 * would require no actions, see above.
+		 */
+		smgrregistersync(bulkstate->smgr, bulkstate->forknum);
+	}
+	else
+	{
+		/*
+		 * Permanent relation, WAL-logged normally.
+		 *
+		 * We already WAL-logged all the pages, so they will be replayed from
+		 * WAL on crash. However, when we wrote out the pages, we passed
+		 * skipFsync=true to avoid the overhead of registering all the writes
+		 * with the checkpointer.  Register the whole relation now.
+		 *
+		 * There is one hole in that idea: If a checkpoint occurred while we
+		 * were writing the pages, it already missed fsyncing the pages we had
+		 * written before the checkpoint started.  A crash later on would
+		 * replay the WAL starting from the checkpoint, therefore it wouldn't
+		 * replay our earlier WAL records.  So if a checkpoint started after
+		 * the bulk write, fsync the files now.
+		 */
+
 		/*
 		 * Prevent a checkpoint from starting between the GetRedoRecPtr() and
 		 * smgrregistersync() calls.