
WAL-log inplace update before revealing it to other sessions.

A buffer lock won't stop a reader having already checked tuple
visibility.  If a vac_update_datfrozenxid() and then a crash happened
during inplace update of a relfrozenxid value, datfrozenxid could
overtake relfrozenxid.  That could lead to "could not access status of
transaction" errors.

Back-patch to v14 - v17.  This is a back-patch of commits:

- 8e7e672cda
  (main change, on master, before v18 branched)
- 8180136652
  (defect fix, on master, before v18 branched)

It reverses commit bc6bad8857, my revert
of the original back-patch.

In v14, this also back-patches the assertion removal from commit
7fcf2faf9c.

Discussion: https://postgr.es/m/20240620012908.92.nmisch@google.com
Backpatch-through: 14-17
This commit is contained in:
Noah Misch
2025-12-16 16:13:54 -08:00
parent 0f69beddea
commit d3e5d89504
3 changed files with 58 additions and 20 deletions
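In outline, the fix swaps the memcpy()/XLogInsert() order in heap_inplace_update_and_unlock(). A condensed sketch, not the committed code (it omits the stack-copy block registration and the invalidation work shown in the hunks below):

	/* Pre-fix ordering: the shared buffer changes before any WAL exists. */
	START_CRIT_SECTION();
	memcpy(dst, src, newlen);	/* concurrent catalog scans can now see the new value */
	MarkBufferDirty(buffer);
	if (RelationNeedsWAL(relation))
		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);	/* crash before this loses the change */
	END_CRIT_SECTION();

	/* Post-fix ordering: WAL first, then mutate, with checkpoints held off. */
	START_CRIT_SECTION();
	MyProc->delayChkptFlags |= DELAY_CHKPT_START;
	if (RelationNeedsWAL(relation))
	{
		/* the record registers a stack copy of the block that already contains the new value */
		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
		PageSetLSN(BufferGetPage(buffer), recptr);
	}
	memcpy(dst, src, newlen);
	MarkBufferDirty(buffer);
	MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
	END_CRIT_SECTION();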

View File

@@ -198,9 +198,7 @@ Inplace updates create an exception to the rule that tuple data won't change
 under a reader holding a pin.  A reader of a heap_fetch() result tuple may
 witness a torn read.  Current inplace-updated fields are aligned and are no
 wider than four bytes, and current readers don't need consistency across
-fields.  Hence, they get by with just fetching each field once.  XXX such a
-caller may also read a value that has not reached WAL; see
-systable_inplace_update_finish().
+fields.  Hence, they get by with just fetching each field once.
 
 During logical decoding, caches reflect an inplace update no later than the
 next XLOG_XACT_INVALIDATIONS.  That record witnesses the end of a command.
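The retained paragraph describes a fetch-once discipline for readers. A hypothetical illustration (the helper below is invented for this note, not part of the tree):

	/* Hypothetical illustration only; not part of this commit. */
	static TransactionId
	fetch_relfrozenxid_once(HeapTuple reltup)
	{
		Form_pg_class classForm = (Form_pg_class) GETSTRUCT(reltup);

		/*
		 * relfrozenxid is aligned and four bytes wide, so this single fetch
		 * cannot be torn; no consistency with relminmxid (another
		 * inplace-updated field) is assumed.
		 */
		return classForm->relfrozenxid;
	}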

View File

@@ -6450,6 +6450,8 @@ heap_inplace_update_and_unlock(Relation relation,
 	HeapTupleHeader htup = oldtup->t_data;
 	uint32		oldlen;
 	uint32		newlen;
+	char	   *dst;
+	char	   *src;
 
 	Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
 	oldlen = oldtup->t_len - htup->t_hoff;
@@ -6457,6 +6459,9 @@ heap_inplace_update_and_unlock(Relation relation,
 	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
 		elog(ERROR, "wrong tuple length");
 
+	dst = (char *) htup + htup->t_hoff;
+	src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
 	/*
 	 * Unlink relcache init files as needed.  If unlinking, acquire
 	 * RelCacheInitLock until after associated invalidations.  By doing this
@@ -6467,15 +6472,15 @@ heap_inplace_update_and_unlock(Relation relation,
 	 */
 	PreInplace_Inval();
 
-	/* NO EREPORT(ERROR) from here till changes are logged */
-	START_CRIT_SECTION();
-
-	memcpy((char *) htup + htup->t_hoff,
-		   (char *) tuple->t_data + tuple->t_data->t_hoff,
-		   newlen);
-
 	/*----------
-	 * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+	 * NO EREPORT(ERROR) from here till changes are complete
+	 *
+	 * Our buffer lock won't stop a reader having already pinned and checked
+	 * visibility for this tuple.  Hence, we write WAL first, then mutate the
+	 * buffer.  Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+	 * checkpoint delay makes that acceptable.  With the usual order of
+	 * changes, a crash after memcpy() and before XLogInsert() could allow
+	 * datfrozenxid to overtake relfrozenxid:
 	 *
 	 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
 	 * ["R" is a VACUUM tbl]
@@ -6485,14 +6490,36 @@ heap_inplace_update_and_unlock(Relation relation,
 	 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
 	 * [crash]
 	 * [recovery restores datfrozenxid w/o relfrozenxid]
+	 *
+	 * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
+	 * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
+	 * The stack copy facilitates a FPI of the post-mutation block before we
+	 * accept other sessions seeing it.  DELAY_CHKPT_START allows us to
+	 * XLogInsert() before MarkBufferDirty().  Since XLogSaveBufferForHint()
+	 * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
+	 * This function, however, likely could avoid it with the following order
+	 * of operations: MarkBufferDirty(), XLogInsert(), memcpy().  Opt to use
+	 * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
+	 * patterns to analyze.  Inplace update isn't so frequent that it should
+	 * pursue the small optimization of skipping DELAY_CHKPT_START.
 	 */
-
-	MarkBufferDirty(buffer);
+	Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+	START_CRIT_SECTION();
+	MyProc->delayChkptFlags |= DELAY_CHKPT_START;
 
 	/* XLOG stuff */
 	if (RelationNeedsWAL(relation))
 	{
 		xl_heap_inplace xlrec;
+		PGAlignedBlock copied_buffer;
+		char	   *origdata = (char *) BufferGetBlock(buffer);
+		Page		page = BufferGetPage(buffer);
+		uint16		lower = ((PageHeader) page)->pd_lower;
+		uint16		upper = ((PageHeader) page)->pd_upper;
+		uintptr_t	dst_offset_in_block;
+		RelFileLocator rlocator;
+		ForkNumber	forkno;
+		BlockNumber blkno;
 		XLogRecPtr	recptr;
 
 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@@ -6500,16 +6527,28 @@ heap_inplace_update_and_unlock(Relation relation,
 		XLogBeginInsert();
 		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
 
-		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-		XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+		/* register block matching what buffer will look like after changes */
+		memcpy(copied_buffer.data, origdata, lower);
+		memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+		dst_offset_in_block = dst - origdata;
+		memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+		BufferGetTag(buffer, &rlocator, &forkno, &blkno);
+		Assert(forkno == MAIN_FORKNUM);
+		XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
+						  REGBUF_STANDARD);
+		XLogRegisterBufData(0, src, newlen);
 
 		/* inplace updates aren't decoded atm, don't log the origin */
 
 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
 
-		PageSetLSN(BufferGetPage(buffer), recptr);
+		PageSetLSN(page, recptr);
 	}
 
+	memcpy(dst, src, newlen);
+
+	MarkBufferDirty(buffer);
+
 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
 	/*
@@ -6518,6 +6557,7 @@ heap_inplace_update_and_unlock(Relation relation,
 	 */
 	AtInplace_Inval();
 
+	MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
 	END_CRIT_SECTION();
 
 	UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
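Taken together, these hunks WAL-log a post-change image built on the stack, so the record can be inserted before the shared buffer is mutated. A condensed sketch of that registration step, reusing the diff's variable names with the surrounding function elided:

	PGAlignedBlock copied_buffer;	/* BLCKSZ-sized stack buffer */
	char	   *origdata = (char *) BufferGetBlock(buffer);
	Page		page = BufferGetPage(buffer);
	uint16		lower = ((PageHeader) page)->pd_lower;
	uint16		upper = ((PageHeader) page)->pd_upper;
	RelFileLocator rlocator;
	ForkNumber	forkno;
	BlockNumber blkno;

	/* copy the live page, skipping the pd_lower..pd_upper hole */
	memcpy(copied_buffer.data, origdata, lower);
	memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
	/* apply the inplace change to the copy only */
	memcpy(copied_buffer.data + (dst - origdata), src, newlen);

	/* register the copy, not the shared buffer, as the block image */
	BufferGetTag(buffer, &rlocator, &forkno, &blkno);
	XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
					  REGBUF_STANDARD);
	XLogRegisterBufData(0, src, newlen);

Because the registered image already contains the new bytes, a full-page image taken from it reflects the update, and when no FPI is needed redo applies the separately registered newlen bytes; either way recovery reproduces the change even though the shared buffer is modified only after XLogInsert().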

View File

@@ -104,10 +104,10 @@ struct XidCache
  * is inserted prior to the new redo point, the corresponding data changes will
  * also be flushed to disk before the checkpoint can complete.  (In the
  * extremely common case where the data being modified is in shared buffers
- * and we acquire an exclusive content lock on the relevant buffers before
- * writing WAL, this mechanism is not needed, because phase 2 will block
- * until we release the content lock and then flush the modified data to
- * disk.)
+ * and we acquire an exclusive content lock and MarkBufferDirty() on the
+ * relevant buffers before writing WAL, this mechanism is not needed, because
+ * phase 2 will block until we release the content lock and then flush the
+ * modified data to disk.  See transam/README and SyncOneBuffer().)
  *
  * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
  * to phase 3.  This is useful if we are performing a WAL-logged operation that
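For reference, the checkpointer side of DELAY_CHKPT_START is a wait loop roughly like the following (a simplified sketch of the logic in CreateCheckPoint(); the real code carries extra wait-event bookkeeping):

	VirtualTransactionId *vxids;
	int			nvxids;

	/*
	 * Wait for in-progress operations that set DELAY_CHKPT_START, so a WAL
	 * record written before the new redo point has its buffer changes
	 * flushed by this checkpoint rather than lost on its far side.
	 */
	vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
	if (nvxids > 0)
	{
		do
		{
			pg_usleep(10000L);	/* 10 ms */
		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, DELAY_CHKPT_START));
	}
	pfree(vxids);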