Mirror of https://github.com/postgres/postgres.git, synced 2025-11-03 09:13:20 +03:00
Fix data loss when restarting the bulk_write facility

If a user started a bulk write operation on a fork with existing data
to append data in bulk, the bulk_write machinery would zero out all
previously written pages up to the last page written by the new
bulk_write operation.

This is not an issue for PostgreSQL itself, because we never use the
bulk_write facility on a non-empty fork. But there are use cases where
it makes sense: the TimescaleDB extension is known to do that to merge
partitions, for example.

Backpatch to v17, where the bulk_write machinery was introduced.

Author: Matthias van de Meent <boekewurm+postgres@gmail.com>
Reported-By: Erik Nordström <erik@timescale.com>
Reviewed-by: Erik Nordström <erik@timescale.com>
Discussion: https://www.postgresql.org/message-id/CACAa4VJ%2BQY4pY7M0ECq29uGkrOygikYtao1UG9yCDFosxaps9g@mail.gmail.com
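For context, here is a minimal sketch (not part of this commit, and not TimescaleDB's actual code) of the usage pattern that hits the bug: appending pages through the bulk_write API to a fork that already contains data. The function append_pages_in_bulk and its page contents are hypothetical; the calls smgr_bulk_start_rel(), smgr_bulk_get_buf(), smgr_bulk_write() and smgr_bulk_finish() are the public interface of the v17 bulk_write facility (src/backend/storage/smgr/bulk_write.c).

#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

/*
 * Hypothetical example: append 'npages' freshly initialized pages to the
 * main fork of an already-populated relation using the bulk_write API.
 * Before this fix, smgr_bulk_flush() assumed the fork started out empty
 * and zero-filled every block below the highest block written here,
 * clobbering the pre-existing pages.
 */
static void
append_pages_in_bulk(Relation rel, int npages)
{
	BulkWriteState *bulkstate;
	BlockNumber first_new_block;

	/* current size of the fork; new pages are appended after it */
	first_new_block = RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);

	bulkstate = smgr_bulk_start_rel(rel, MAIN_FORKNUM);

	for (int i = 0; i < npages; i++)
	{
		BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

		/* set up a standard page header; real contents are caller-defined */
		PageInit((Page) buf, BLCKSZ, 0);

		smgr_bulk_write(bulkstate, first_new_block + i, buf, true);
	}

	smgr_bulk_finish(bulkstate);
}

The fix below replaces the pages_written counter, which implicitly assumed the fork started empty, with relsize initialized from smgrnblocks(), so flushing only zero-fills the gap between the fork's current end and the blocks actually written, leaving pre-existing pages untouched.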
@@ -4,8 +4,10 @@
  *	  Efficiently and reliably populate a new relation
  *
  * The assumption is that no other backends access the relation while we are
- * loading it, so we can take some shortcuts.  Do not mix operations through
- * the regular buffer manager and the bulk loading interface!
+ * loading it, so we can take some shortcuts.  Pages already present in the
+ * indicated fork when the bulk write operation is started are not modified
+ * unless explicitly written to.  Do not mix operations through the regular
+ * buffer manager and the bulk loading interface!
  *
  * We bypass the buffer manager to avoid the locking overhead, and call
  * smgrextend() directly.  A downside is that the pages will need to be
@@ -69,7 +71,7 @@ struct BulkWriteState
 	PendingWrite pending_writes[MAX_PENDING_WRITES];
 
 	/* Current size of the relation */
-	BlockNumber pages_written;
+	BlockNumber relsize;
 
 	/* The RedoRecPtr at the time that the bulk operation started */
 	XLogRecPtr	start_RedoRecPtr;
@@ -106,7 +108,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
 	state->use_wal = use_wal;
 
 	state->npending = 0;
-	state->pages_written = 0;
+	state->relsize = smgrnblocks(smgr, forknum);
 
 	state->start_RedoRecPtr = GetRedoRecPtr();
 
@@ -280,7 +282,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 
 		PageSetChecksumInplace(page, blkno);
 
-		if (blkno >= bulkstate->pages_written)
+		if (blkno >= bulkstate->relsize)
 		{
 			/*
 			 * If we have to write pages nonsequentially, fill in the space
@@ -289,17 +291,18 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 			 * space will read as zeroes anyway), but it should help to avoid
 			 * fragmentation.  The dummy pages aren't WAL-logged though.
 			 */
-			while (blkno > bulkstate->pages_written)
+			while (blkno > bulkstate->relsize)
 			{
 				/* don't set checksum for all-zero page */
 				smgrextend(bulkstate->smgr, bulkstate->forknum,
-						   bulkstate->pages_written++,
+						   bulkstate->relsize,
 						   &zero_buffer,
 						   true);
+				bulkstate->relsize++;
 			}
 
 			smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
-			bulkstate->pages_written = pending_writes[i].blkno + 1;
+			bulkstate->relsize++;
 		}
 		else
 			smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);