From 9695835538c2c8e9cd0048028b8c85e1bbf5c79c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 22 Nov 2024 16:28:24 +0200 Subject: [PATCH] Fix data loss when restarting the bulk_write facility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a user started a bulk write operation on a fork with existing data to append data in bulk, the bulk_write machinery would zero out all previously written pages up to the last page written by the new bulk_write operation. This is not an issue for PostgreSQL itself, because we never use the bulk_write facility on a non-empty fork. But there are use cases where it makes sense. TimescaleDB extension is known to do that to merge partitions, for example. Backpatch to v17, where the bulk_write machinery was introduced. Author: Matthias van de Meent Reported-By: Erik Nordström Reviewed-by: Erik Nordström Discussion: https://www.postgresql.org/message-id/CACAa4VJ%2BQY4pY7M0ECq29uGkrOygikYtao1UG9yCDFosxaps9g@mail.gmail.com --- src/backend/storage/smgr/bulk_write.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c index 1a5f3ce96e1..ce7a62c580d 100644 --- a/src/backend/storage/smgr/bulk_write.c +++ b/src/backend/storage/smgr/bulk_write.c @@ -4,8 +4,10 @@ * Efficiently and reliably populate a new relation * * The assumption is that no other backends access the relation while we are - * loading it, so we can take some shortcuts. Do not mix operations through - * the regular buffer manager and the bulk loading interface! + * loading it, so we can take some shortcuts. Pages already present in the + * indicated fork when the bulk write operation is started are not modified + * unless explicitly written to. Do not mix operations through the regular + * buffer manager and the bulk loading interface! * * We bypass the buffer manager to avoid the locking overhead, and call * smgrextend() directly. A downside is that the pages will need to be @@ -69,7 +71,7 @@ struct BulkWriteState PendingWrite pending_writes[MAX_PENDING_WRITES]; /* Current size of the relation */ - BlockNumber pages_written; + BlockNumber relsize; /* The RedoRecPtr at the time that the bulk operation started */ XLogRecPtr start_RedoRecPtr; @@ -106,7 +108,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal) state->use_wal = use_wal; state->npending = 0; - state->pages_written = 0; + state->relsize = smgrnblocks(smgr, forknum); state->start_RedoRecPtr = GetRedoRecPtr(); @@ -280,7 +282,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) PageSetChecksumInplace(page, blkno); - if (blkno >= bulkstate->pages_written) + if (blkno >= bulkstate->relsize) { /* * If we have to write pages nonsequentially, fill in the space @@ -289,17 +291,18 @@ smgr_bulk_flush(BulkWriteState *bulkstate) * space will read as zeroes anyway), but it should help to avoid * fragmentation. The dummy pages aren't WAL-logged though. */ - while (blkno > bulkstate->pages_written) + while (blkno > bulkstate->relsize) { /* don't set checksum for all-zero page */ smgrextend(bulkstate->smgr, bulkstate->forknum, - bulkstate->pages_written++, + bulkstate->relsize, &zero_buffer, true); + bulkstate->relsize++; } smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true); - bulkstate->pages_written = pending_writes[i].blkno + 1; + bulkstate->relsize++; } else smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);