1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-28 05:21:27 +03:00

Fix data loss when restarting the bulk_write facility

If a user started a bulk write operation on a fork with existing data
to append data in bulk, the bulk_write machinery would zero out all
previously written pages up to the last page written by the new
bulk_write operation.

This is not an issue for PostgreSQL itself, because we never use the
bulk_write facility on a non-empty fork. But there are use cases where
it makes sense. TimescaleDB extension is known to do that to merge
partitions, for example.

Backpatch to v17, where the bulk_write machinery was introduced.

Author: Matthias van de Meent <boekewurm+postgres@gmail.com>
Reported-By: Erik Nordström <erik@timescale.com>
Reviewed-by: Erik Nordström <erik@timescale.com>
Discussion: https://www.postgresql.org/message-id/CACAa4VJ%2BQY4pY7M0ECq29uGkrOygikYtao1UG9yCDFosxaps9g@mail.gmail.com
This commit is contained in:
Heikki Linnakangas 2024-11-22 16:28:24 +02:00
parent e6d6f2e46b
commit 9695835538

View File

@ -4,8 +4,10 @@
* Efficiently and reliably populate a new relation * Efficiently and reliably populate a new relation
* *
* The assumption is that no other backends access the relation while we are * The assumption is that no other backends access the relation while we are
* loading it, so we can take some shortcuts. Do not mix operations through * loading it, so we can take some shortcuts. Pages already present in the
* the regular buffer manager and the bulk loading interface! * indicated fork when the bulk write operation is started are not modified
* unless explicitly written to. Do not mix operations through the regular
* buffer manager and the bulk loading interface!
* *
* We bypass the buffer manager to avoid the locking overhead, and call * We bypass the buffer manager to avoid the locking overhead, and call
* smgrextend() directly. A downside is that the pages will need to be * smgrextend() directly. A downside is that the pages will need to be
@ -69,7 +71,7 @@ struct BulkWriteState
PendingWrite pending_writes[MAX_PENDING_WRITES]; PendingWrite pending_writes[MAX_PENDING_WRITES];
/* Current size of the relation */ /* Current size of the relation */
BlockNumber pages_written; BlockNumber relsize;
/* The RedoRecPtr at the time that the bulk operation started */ /* The RedoRecPtr at the time that the bulk operation started */
XLogRecPtr start_RedoRecPtr; XLogRecPtr start_RedoRecPtr;
@ -106,7 +108,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
state->use_wal = use_wal; state->use_wal = use_wal;
state->npending = 0; state->npending = 0;
state->pages_written = 0; state->relsize = smgrnblocks(smgr, forknum);
state->start_RedoRecPtr = GetRedoRecPtr(); state->start_RedoRecPtr = GetRedoRecPtr();
@ -280,7 +282,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
PageSetChecksumInplace(page, blkno); PageSetChecksumInplace(page, blkno);
if (blkno >= bulkstate->pages_written) if (blkno >= bulkstate->relsize)
{ {
/* /*
* If we have to write pages nonsequentially, fill in the space * If we have to write pages nonsequentially, fill in the space
@ -289,17 +291,18 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
* space will read as zeroes anyway), but it should help to avoid * space will read as zeroes anyway), but it should help to avoid
* fragmentation. The dummy pages aren't WAL-logged though. * fragmentation. The dummy pages aren't WAL-logged though.
*/ */
while (blkno > bulkstate->pages_written) while (blkno > bulkstate->relsize)
{ {
/* don't set checksum for all-zero page */ /* don't set checksum for all-zero page */
smgrextend(bulkstate->smgr, bulkstate->forknum, smgrextend(bulkstate->smgr, bulkstate->forknum,
bulkstate->pages_written++, bulkstate->relsize,
&zero_buffer, &zero_buffer,
true); true);
bulkstate->relsize++;
} }
smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true); smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
bulkstate->pages_written = pending_writes[i].blkno + 1; bulkstate->relsize++;
} }
else else
smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true); smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);