diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock
index ad835ff4820..16f7d78b7d2 100644
--- a/src/backend/access/heap/README.tuplock
+++ b/src/backend/access/heap/README.tuplock
@@ -198,9 +198,7 @@ Inplace updates create an exception to the rule that tuple data won't change
 under a reader holding a pin.  A reader of a heap_fetch() result tuple may
 witness a torn read.  Current inplace-updated fields are aligned and are no
 wider than four bytes, and current readers don't need consistency across
-fields.  Hence, they get by with just fetching each field once.  XXX such a
-caller may also read a value that has not reached WAL; see
-systable_inplace_update_finish().
+fields.  Hence, they get by with just fetching each field once.
 
 During logical decoding, caches reflect an inplace update no later than the
 next XLOG_XACT_INVALIDATIONS.  That record witnesses the end of a command.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b9326eed3ff..d977df4cec8 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6288,6 +6288,8 @@ heap_inplace_update_and_unlock(Relation relation,
 	HeapTupleHeader htup = oldtup->t_data;
 	uint32		oldlen;
 	uint32		newlen;
+	char	   *dst;
+	char	   *src;
 
 	Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
 	oldlen = oldtup->t_len - htup->t_hoff;
@@ -6295,6 +6297,9 @@ heap_inplace_update_and_unlock(Relation relation,
 	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
 		elog(ERROR, "wrong tuple length");
 
+	dst = (char *) htup + htup->t_hoff;
+	src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
 	/*
 	 * Unlink relcache init files as needed.  If unlinking, acquire
 	 * RelCacheInitLock until after associated invalidations.  By doing this
@@ -6305,15 +6310,15 @@ heap_inplace_update_and_unlock(Relation relation,
 	 */
 	PreInplace_Inval();
 
-	/* NO EREPORT(ERROR) from here till changes are logged */
-	START_CRIT_SECTION();
-
-	memcpy((char *) htup + htup->t_hoff,
-		   (char *) tuple->t_data + tuple->t_data->t_hoff,
-		   newlen);
-
 	/*----------
-	 * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+	 * NO EREPORT(ERROR) from here till changes are complete
+	 *
+	 * Our buffer lock won't stop a reader having already pinned and checked
+	 * visibility for this tuple.  Hence, we write WAL first, then mutate the
+	 * buffer.  Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+	 * checkpoint delay makes that acceptable.  With the usual order of
+	 * changes, a crash after memcpy() and before XLogInsert() could allow
+	 * datfrozenxid to overtake relfrozenxid:
 	 *
 	 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
 	 * ["R" is a VACUUM tbl]
@@ -6323,14 +6328,36 @@ heap_inplace_update_and_unlock(Relation relation,
 	 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
 	 * [crash]
 	 * [recovery restores datfrozenxid w/o relfrozenxid]
+	 *
+	 * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
+	 * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
+	 * The stack copy facilitates an FPI of the post-mutation block before we
+	 * accept other sessions seeing it.  DELAY_CHKPT_START allows us to
+	 * XLogInsert() before MarkBufferDirty().  Since XLogSaveBufferForHint()
+	 * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
+	 * This function, however, likely could avoid it with the following order
+	 * of operations: MarkBufferDirty(), XLogInsert(), memcpy().  Opt to use
+	 * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
+	 * patterns to analyze.  Inplace update isn't so frequent that it should
+	 * pursue the small optimization of skipping DELAY_CHKPT_START.
 	 */
-
-	MarkBufferDirty(buffer);
+	Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+	START_CRIT_SECTION();
+	MyProc->delayChkptFlags |= DELAY_CHKPT_START;
 
 	/* XLOG stuff */
 	if (RelationNeedsWAL(relation))
 	{
 		xl_heap_inplace xlrec;
+		PGAlignedBlock copied_buffer;
+		char	   *origdata = (char *) BufferGetBlock(buffer);
+		Page		page = BufferGetPage(buffer);
+		uint16		lower = ((PageHeader) page)->pd_lower;
+		uint16		upper = ((PageHeader) page)->pd_upper;
+		uintptr_t	dst_offset_in_block;
+		RelFileLocator rlocator;
+		ForkNumber	forkno;
+		BlockNumber blkno;
 		XLogRecPtr	recptr;
 
 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@@ -6338,16 +6365,28 @@ heap_inplace_update_and_unlock(Relation relation,
 
 		XLogBeginInsert();
 		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
-		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-		XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+		/* register block matching what buffer will look like after changes */
+		memcpy(copied_buffer.data, origdata, lower);
+		memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+		dst_offset_in_block = dst - origdata;
+		memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+		BufferGetTag(buffer, &rlocator, &forkno, &blkno);
+		Assert(forkno == MAIN_FORKNUM);
+		XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
+						  REGBUF_STANDARD);
+		XLogRegisterBufData(0, src, newlen);
 
 		/* inplace updates aren't decoded atm, don't log the origin */
 
 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
 
-		PageSetLSN(BufferGetPage(buffer), recptr);
+		PageSetLSN(page, recptr);
 	}
 
+	memcpy(dst, src, newlen);
+
+	MarkBufferDirty(buffer);
+
 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
 	/*
@@ -6356,6 +6395,7 @@ heap_inplace_update_and_unlock(Relation relation,
 	 */
 	AtInplace_Inval();
 
+	MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
 	END_CRIT_SECTION();
 	UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index ea3ca9c48a6..e16ab4a2e58 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -109,10 +109,10 @@ struct XidCache
  * is inserted prior to the new redo point, the corresponding data changes will
  * also be flushed to disk before the checkpoint can complete.  (In the
  * extremely common case where the data being modified is in shared buffers
- * and we acquire an exclusive content lock on the relevant buffers before
- * writing WAL, this mechanism is not needed, because phase 2 will block
- * until we release the content lock and then flush the modified data to
- * disk.)
+ * and we acquire an exclusive content lock and MarkBufferDirty() on the
+ * relevant buffers before writing WAL, this mechanism is not needed, because
+ * phase 2 will block until we release the content lock and then flush the
+ * modified data to disk.  See transam/README and SyncOneBuffer().)
  *
  * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
  * to phase 3.  This is useful if we are performing a WAL-logged operation that
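
For reference, the sketch below condenses the order of operations in
heap_inplace_update_and_unlock() once the heapam.c hunks above are applied.
It is illustrative only, not compilable on its own: declarations, the
BUFFER_LOCK_EXCLUSIVE acquisition, and some assertions are elided, and the
variable names (dst, src, copied_buffer, rlocator, blkno) are the ones the
patch introduces.

	/* Hold off checkpoint start across the WAL-before-buffer window. */
	Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
	START_CRIT_SECTION();
	MyProc->delayChkptFlags |= DELAY_CHKPT_START;

	if (RelationNeedsWAL(relation))
	{
		/* Build a stack copy of the page as it will look after the change. */
		memcpy(copied_buffer.data, origdata, lower);
		memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
		memcpy(copied_buffer.data + (dst - origdata), src, newlen);

		/* WAL first, registering the post-mutation image for any FPI. */
		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
		BufferGetTag(buffer, &rlocator, &forkno, &blkno);
		XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
						  REGBUF_STANDARD);
		XLogRegisterBufData(0, src, newlen);
		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
		PageSetLSN(page, recptr);
	}

	/* Only now mutate the shared buffer, revealing the change to others. */
	memcpy(dst, src, newlen);
	MarkBufferDirty(buffer);
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	AtInplace_Inval();
	MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
	END_CRIT_SECTION();
	UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);

Per the patch's own comments, DELAY_CHKPT_START keeps a checkpoint from
establishing its redo point between XLogInsert() and MarkBufferDirty(), which
is what makes the WAL-then-memcpy() ordering safe here.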