
WAL-log inplace update before revealing it to other sessions.

A buffer lock won't stop a reader having already checked tuple
visibility.  If a vac_update_datfrozenxid() and then a crash happened
during inplace update of a relfrozenxid value, datfrozenxid could
overtake relfrozenxid.  That could lead to "could not access status of
transaction" errors.

Back-patch to v14 - v17.  This is a back-patch of commits:

- 8e7e672cda
  (main change, on master, before v18 branched)
- 8180136652
  (defect fix, on master, before v18 branched)

It reverses commit bc6bad8857, my revert
of the original back-patch.

In v14, this also back-patches the assertion removal from commit
7fcf2faf9c.

Discussion: https://postgr.es/m/20240620012908.92.nmisch@google.com
Backpatch-through: 14-17
Noah Misch, 2025-12-16 16:13:54 -08:00
commit e4b1986b98, parent 8114224719
3 changed files with 54 additions and 18 deletions
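For orientation before the diff: the fix reorders heap_inplace_update_and_unlock() so that the WAL record is built from a stack copy of the post-change page and inserted before the shared buffer is mutated.  A condensed sketch of the old and new orderings, simplified from the hunks below (error handling, invalidations, and buffer locking omitted; not the verbatim code):

    /* Old order: mutate the shared buffer, then WAL-log the change. */
    START_CRIT_SECTION();
    memcpy(dst, src, newlen);       /* other sessions can read the new value now */
    MarkBufferDirty(buffer);
    if (RelationNeedsWAL(relation))
        XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);  /* a crash in the window above loses a value already read */
    END_CRIT_SECTION();

    /* New order: WAL-log a stack copy that already contains the change,
     * then mutate the buffer other sessions see. */
    START_CRIT_SECTION();
    MyProc->delayChkpt = true;      /* v14 spelling of DELAY_CHKPT_START */
    if (RelationNeedsWAL(relation))
        XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);  /* registers copied_buffer */
    memcpy(dst, src, newlen);
    MarkBufferDirty(buffer);
    MyProc->delayChkpt = false;
    END_CRIT_SECTION();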

src/backend/access/heap/README.tuplock

@@ -198,9 +198,7 @@ Inplace updates create an exception to the rule that tuple data won't change
 under a reader holding a pin.  A reader of a heap_fetch() result tuple may
 witness a torn read.  Current inplace-updated fields are aligned and are no
 wider than four bytes, and current readers don't need consistency across
-fields.  Hence, they get by with just fetching each field once.  XXX such a
-caller may also read a value that has not reached WAL; see
-systable_inplace_update_finish().
+fields.  Hence, they get by with just fetching each field once.
 
 During logical decoding, caches reflect an inplace update no later than the
 next XLOG_XACT_INVALIDATIONS.  That record witnesses the end of a command.
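To illustrate the single-fetch rule above, a minimal sketch of a reader of a heap_fetch() result tuple; classTuple, newFrozenXid, and the surrounding vacuum-style logic are hypothetical and are not code from this commit:

    /* Copy the inplace-updated field out of the tuple exactly once. */
    Form_pg_class classForm = (Form_pg_class) GETSTRUCT(&classTuple);
    TransactionId relfrozenxid = classForm->relfrozenxid;   /* one aligned 4-byte fetch */

    /*
     * Work only with the local copy from here on; a second read of
     * classForm->relfrozenxid could observe a concurrent inplace update.
     */
    if (TransactionIdIsNormal(relfrozenxid) &&
        TransactionIdPrecedes(relfrozenxid, newFrozenXid))
        newFrozenXid = relfrozenxid;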

src/backend/access/heap/heapam.c

@@ -6437,6 +6437,8 @@ heap_inplace_update_and_unlock(Relation relation,
     HeapTupleHeader htup = oldtup->t_data;
     uint32      oldlen;
     uint32      newlen;
+    char       *dst;
+    char       *src;
 
     Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
     oldlen = oldtup->t_len - htup->t_hoff;
@@ -6444,6 +6446,9 @@ heap_inplace_update_and_unlock(Relation relation,
     if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
         elog(ERROR, "wrong tuple length");
 
+    dst = (char *) htup + htup->t_hoff;
+    src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
     /*
      * Unlink relcache init files as needed.  If unlinking, acquire
      * RelCacheInitLock until after associated invalidations.  By doing this
@@ -6454,15 +6459,15 @@ heap_inplace_update_and_unlock(Relation relation,
      */
     PreInplace_Inval();
 
-    /* NO EREPORT(ERROR) from here till changes are logged */
-    START_CRIT_SECTION();
-
-    memcpy((char *) htup + htup->t_hoff,
-           (char *) tuple->t_data + tuple->t_data->t_hoff,
-           newlen);
-
     /*----------
-     * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+     * NO EREPORT(ERROR) from here till changes are complete
+     *
+     * Our buffer lock won't stop a reader having already pinned and checked
+     * visibility for this tuple.  Hence, we write WAL first, then mutate the
+     * buffer.  Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+     * checkpoint delay makes that acceptable.  With the usual order of
+     * changes, a crash after memcpy() and before XLogInsert() could allow
+     * datfrozenxid to overtake relfrozenxid:
      *
      * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
      * ["R" is a VACUUM tbl]
@@ -6472,14 +6477,36 @@ heap_inplace_update_and_unlock(Relation relation,
      * D: raise pg_database.datfrozenxid, XLogInsert(), finish
      * [crash]
      * [recovery restores datfrozenxid w/o relfrozenxid]
+     *
+     * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
+     * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
+     * The stack copy facilitates a FPI of the post-mutation block before we
+     * accept other sessions seeing it.  DELAY_CHKPT_START allows us to
+     * XLogInsert() before MarkBufferDirty().  Since XLogSaveBufferForHint()
+     * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
+     * This function, however, likely could avoid it with the following order
+     * of operations: MarkBufferDirty(), XLogInsert(), memcpy().  Opt to use
+     * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
+     * patterns to analyze.  Inplace update isn't so frequent that it should
+     * pursue the small optimization of skipping DELAY_CHKPT_START.
      */
-
-    MarkBufferDirty(buffer);
+    Assert(!MyProc->delayChkpt);
+    START_CRIT_SECTION();
+    MyProc->delayChkpt = true;
 
     /* XLOG stuff */
     if (RelationNeedsWAL(relation))
     {
         xl_heap_inplace xlrec;
+        PGAlignedBlock copied_buffer;
+        char       *origdata = (char *) BufferGetBlock(buffer);
+        Page        page = BufferGetPage(buffer);
+        uint16      lower = ((PageHeader) page)->pd_lower;
+        uint16      upper = ((PageHeader) page)->pd_upper;
+        uintptr_t   dst_offset_in_block;
+        RelFileNode rnode;
+        ForkNumber  forkno;
+        BlockNumber blkno;
         XLogRecPtr  recptr;
 
         xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@@ -6487,16 +6514,28 @@ heap_inplace_update_and_unlock(Relation relation,
         XLogBeginInsert();
         XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
 
-        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-        XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+        /* register block matching what buffer will look like after changes */
+        memcpy(copied_buffer.data, origdata, lower);
+        memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+        dst_offset_in_block = dst - origdata;
+        memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+        BufferGetTag(buffer, &rnode, &forkno, &blkno);
+        Assert(forkno == MAIN_FORKNUM);
+        XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data,
+                          REGBUF_STANDARD);
+        XLogRegisterBufData(0, src, newlen);
 
         /* inplace updates aren't decoded atm, don't log the origin */
 
         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
 
-        PageSetLSN(BufferGetPage(buffer), recptr);
+        PageSetLSN(page, recptr);
     }
 
+    memcpy(dst, src, newlen);
+
+    MarkBufferDirty(buffer);
+
     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
     /*
@@ -6505,6 +6544,7 @@ heap_inplace_update_and_unlock(Relation relation,
      */
     AtInplace_Inval();
+    MyProc->delayChkpt = false;
     END_CRIT_SECTION();
     UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
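The delayChkpt manipulation in the hunks above exists because XLogInsert() now precedes MarkBufferDirty().  A sketch of the interleaving it prevents, in the same timeline style as the comment in the diff ("C" is the checkpointer; this is an illustration, not text from the commit):

    [no checkpoint delay; order is XLogInsert() -> memcpy() -> MarkBufferDirty()]
    R: XLogInsert() the inplace record; buffer not yet dirty
    C: checkpoint sets its redo pointer after R's record
    C: checkpoint completes without writing the page (it is not dirty yet)
    R: memcpy(), MarkBufferDirty()
    [crash before the now-dirty page reaches disk]
    [recovery starts after R's record and never replays it; the update is lost]

Holding MyProc->delayChkpt (the v14 counterpart of DELAY_CHKPT_START) across that window forces the checkpoint to wait, so it cannot complete between XLogInsert() and MarkBufferDirty().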

src/backend/access/transam/xloginsert.c

@@ -275,8 +275,6 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum,
 {
     registered_buffer *regbuf;
 
-    /* This is currently only used to WAL-log a full-page image of a page */
-    Assert(flags & REGBUF_FORCE_IMAGE);
     Assert(begininsert_called);
 
     if (block_id >= max_registered_block_id)