diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock
index 6441e8baf0e..ddb2defd28b 100644
--- a/src/backend/access/heap/README.tuplock
+++ b/src/backend/access/heap/README.tuplock
@@ -153,3 +153,14 @@ The following infomask bits are applicable:
 
 We currently never set the HEAP_XMAX_COMMITTED when the
 HEAP_XMAX_IS_MULTI bit is set.
+
+Reading inplace-updated columns
+-------------------------------
+
+Inplace updates create an exception to the rule that tuple data won't change
+under a reader holding a pin.  A reader of a heap_fetch() result tuple may
+witness a torn read.  Current inplace-updated fields are aligned and are no
+wider than four bytes, and current readers don't need consistency across
+fields.  Hence, they get by with just fetching each field once.  XXX such a
+caller may also read a value that has not reached WAL; see
+systable_inplace_update_finish().
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 30bc88e56f8..f7fc0fcf6a3 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -5991,23 +5991,245 @@ heap_abort_speculative(Relation relation, ItemPointer tid)
 }
 
 /*
- * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
+ * heap_inplace_lock - protect inplace update from concurrent heap_update()
  *
- * Overwriting violates both MVCC and transactional safety, so the uses
- * of this function in Postgres are extremely limited.  Nonetheless we
- * find some places to use it.
+ * Evaluate whether the tuple's state is compatible with a no-key update.
+ * Current transaction rowmarks are fine, as is KEY SHARE from any
+ * transaction.  If compatible, return true with the buffer exclusive-locked,
+ * and the caller must release that by calling
+ * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
+ * an error.  Otherwise, return false after blocking transactions, if any,
+ * have ended.
  *
- * The tuple cannot change size, and therefore it's reasonable to assume
- * that its null bitmap (if any) doesn't change either.  So we just
- * overwrite the data portion of the tuple without touching the null
- * bitmap or any of the header fields.
+ * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
+ * DDL, this doesn't guarantee any particular predicate locking.
  *
- * tuple is an in-memory tuple structure containing the data to be written
- * over the target tuple.  Also, tuple->t_self identifies the target tuple.
+ * One could modify this to return true for tuples with delete in progress;
+ * all inplace updaters take a lock that conflicts with DROP.  If explicit
+ * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an
+ * update.
  *
- * Note that the tuple updated here had better not come directly from the
- * syscache if the relation has a toast relation as this tuple could
- * include toast values that have been expanded, causing a failure here.
+ * Readers of inplace-updated fields expect that changes to those fields are
+ * durable.  For example, vac_truncate_clog() reads datfrozenxid from
+ * pg_database tuples via catalog snapshots.  A future snapshot must not
+ * return a lower datfrozenxid for the same database OID (lower in the
+ * FullTransactionIdPrecedes() sense).  We achieve that since no update of a
+ * tuple can start while we hold a lock on its buffer.  In cases like
+ * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
+ * to this transaction.
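+ * For example (an illustrative sketch; table, column, and role names are
+ * arbitrary):
+ *
+ *	BEGIN;
+ *	GRANT SELECT ON t TO r;		-- heap_update() of t's pg_class row
+ *	CREATE INDEX ON t (c);		-- inplace update of that new row version
+ *	COMMIT;						-- or ROLLBACK
+ *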
+ * ROLLBACK then is one case where it's okay to lose inplace updates.
+ * (Restoring relhasindex=false on ROLLBACK is fine, since any concurrent
+ * CREATE INDEX would have blocked, then inplace-updated the committed
+ * tuple.)
+ *
+ * In principle, we could avoid waiting by overwriting every tuple in the
+ * updated tuple chain.  Reader expectations permit updating a tuple only if
+ * it's aborted, is the tail of the chain, or we already updated the tuple
+ * referenced in its t_ctid.  Hence, we would need to overwrite the tuples in
+ * order from tail to head.  That would imply either (a) mutating all tuples
+ * in one critical section or (b) accepting a chance of partial completion.
+ * Partial completion of a relfrozenxid update would have the weird
+ * consequence that the table's next VACUUM could see the table's relfrozenxid
+ * move forward between vacuum_get_cutoffs() and finishing.
+ */
+bool
+heap_inplace_lock(Relation relation,
+				  HeapTuple oldtup_ptr, Buffer buffer)
+{
+	HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
+	TM_Result	result;
+	bool		ret;
+
+	Assert(BufferIsValid(buffer));
+
+	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+	/*----------
+	 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
+	 *
+	 * - wait unconditionally
+	 * - no tuple locks
+	 * - don't recheck header after wait: simpler to defer to next iteration
+	 * - don't try to continue even if the updater aborts: likewise
+	 * - no crosscheck
+	 */
+	result = HeapTupleSatisfiesUpdate(&oldtup, GetCurrentCommandId(false),
+									  buffer);
+
+	if (result == TM_Invisible)
+	{
+		/* no known way this can happen */
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg_internal("attempted to overwrite invisible tuple")));
+	}
+	else if (result == TM_SelfModified)
+	{
+		/*
+		 * CREATE INDEX might reach this if an expression is silly enough to
+		 * call e.g. SELECT ... FROM pg_class FOR SHARE.  C code of other SQL
+		 * statements might get here after a heap_update() of the same row,
+		 * in the absence of an intervening CommandCounterIncrement().
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
+	}
+	else if (result == TM_BeingModified)
+	{
+		TransactionId xwait;
+		uint16		infomask;
+
+		xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
+		infomask = oldtup.t_data->t_infomask;
+
+		if (infomask & HEAP_XMAX_IS_MULTI)
+		{
+			LockTupleMode lockmode = LockTupleNoKeyExclusive;
+			MultiXactStatus mxact_status = MultiXactStatusNoKeyUpdate;
+			int			remain;
+			bool		current_is_member;
+
+			if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
+										lockmode, &current_is_member))
+			{
+				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+				ret = false;
+				MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
+								relation, &oldtup.t_self, XLTW_Update,
+								&remain);
+			}
+			else
+				ret = true;
+		}
+		else if (TransactionIdIsCurrentTransactionId(xwait))
+			ret = true;
+		else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+			ret = true;
+		else
+		{
+			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+			ret = false;
+			XactLockTableWait(xwait, relation, &oldtup.t_self,
+							  XLTW_Update);
+		}
+	}
+	else
+	{
+		ret = (result == TM_Ok);
+		if (!ret)
+		{
+			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		}
+	}
+
+	/*
+	 * GetCatalogSnapshot() relies on invalidation messages to know when to
+	 * take a new snapshot.  COMMIT of xwait is responsible for sending the
+	 * invalidation.
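+	 * (For example, if xwait is a GRANT that updated our target row, the
+	 * inval sent at its COMMIT is what prompts the next attempt's scan to
+	 * take a snapshot that sees the successor tuple version.)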
+	 * We're not acquiring heavyweight locks sufficient to block if not yet
+	 * sent, so we must take a new snapshot to ensure a later attempt has a
+	 * fair chance.  While we don't need this if xwait aborted, don't bother
+	 * optimizing that.
+	 */
+	if (!ret)
+		InvalidateCatalogSnapshot();
+	return ret;
+}
+
+/*
+ * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
+ *
+ * The tuple cannot change size, and therefore its header fields and null
+ * bitmap (if any) don't change either.
+ */
+void
+heap_inplace_update_and_unlock(Relation relation,
+							   HeapTuple oldtup, HeapTuple tuple,
+							   Buffer buffer)
+{
+	HeapTupleHeader htup = oldtup->t_data;
+	uint32		oldlen;
+	uint32		newlen;
+
+	Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
+	oldlen = oldtup->t_len - htup->t_hoff;
+	newlen = tuple->t_len - tuple->t_data->t_hoff;
+	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
+		elog(ERROR, "wrong tuple length");
+
+	/* NO EREPORT(ERROR) from here till changes are logged */
+	START_CRIT_SECTION();
+
+	memcpy((char *) htup + htup->t_hoff,
+		   (char *) tuple->t_data + tuple->t_data->t_hoff,
+		   newlen);
+
+	/*----------
+	 * XXX A crash here can allow datfrozenxid to get ahead of relfrozenxid:
+	 *
+	 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
+	 * ["R" is a VACUUM tbl]
+	 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
+	 * D: systable_getnext() returns pg_class tuple of tbl
+	 * R: memcpy() into pg_class tuple of tbl
+	 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
+	 * [crash]
+	 * [recovery restores datfrozenxid w/o relfrozenxid]
+	 */
+
+	MarkBufferDirty(buffer);
+
+	/* XLOG stuff */
+	if (RelationNeedsWAL(relation))
+	{
+		xl_heap_inplace xlrec;
+		XLogRecPtr	recptr;
+
+		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
+
+		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+		XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+
+		/* inplace updates aren't decoded atm, don't log the origin */
+
+		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
+
+		PageSetLSN(BufferGetPage(buffer), recptr);
+	}
+
+	END_CRIT_SECTION();
+
+	heap_inplace_unlock(relation, oldtup, buffer);
+
+	/*
+	 * Send out shared cache inval if necessary.  Note that because we only
+	 * pass the new version of the tuple, this mustn't be used for any
+	 * operations that could change catcache lookup keys.  But we aren't
+	 * bothering with index updates either, so that's true a fortiori.
+	 *
+	 * XXX ROLLBACK discards the invalidation.  See test inplace-inval.spec.
+	 */
+	if (!IsBootstrapProcessingMode())
+		CacheInvalidateHeapTuple(relation, tuple, NULL);
+}
+
+/*
+ * heap_inplace_unlock - reverse of heap_inplace_lock
+ */
+void
+heap_inplace_unlock(Relation relation,
+					HeapTuple oldtup, Buffer buffer)
+{
+	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+}
+
+/*
+ * heap_inplace_update - deprecated
+ *
+ * This exists only to keep modules working in back branches.  Affected
+ * modules should migrate to systable_inplace_update_begin().
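+ *
+ * A migration generally follows the standard flow documented at
+ * systable_inplace_update_begin(); as a sketch (scan-key setup and error
+ * details elided):
+ *
+ *	systable_inplace_update_begin([...], &oldtup, &state);
+ *	if (!HeapTupleIsValid(oldtup))
+ *		elog(ERROR, [...]);
+ *	... mutate "oldtup" ...
+ *	systable_inplace_update_finish(state, oldtup);
+ *
+ * or systable_inplace_update_cancel(state) if nothing changed.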
  */
 void
 heap_inplace_update(Relation relation, HeapTuple tuple)
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 95120c5b283..c846bf30d50 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -24,6 +24,7 @@
 #include "access/relscan.h"
 #include "access/tableam.h"
 #include "access/transam.h"
+#include "catalog/catalog.h"
 #include "catalog/index.h"
 #include "lib/stringinfo.h"
 #include "miscadmin.h"
@@ -748,3 +749,139 @@ systable_endscan_ordered(SysScanDesc sysscan)
 		UnregisterSnapshot(sysscan->snapshot);
 	pfree(sysscan);
 }
+
+/*
+ * systable_inplace_update_begin --- update a row "in place" (overwrite it)
+ *
+ * Overwriting violates both MVCC and transactional safety, so the uses of
+ * this function in Postgres are extremely limited.  Nonetheless we find some
+ * places to use it.  Standard flow:
+ *
+ * ... [any slow preparation not requiring oldtup] ...
+ * systable_inplace_update_begin([...], &tup, &inplace_state);
+ * if (!HeapTupleIsValid(tup))
+ *	elog(ERROR, [...]);
+ * ... [buffer is exclusive-locked; mutate "tup"] ...
+ * if (dirty)
+ *	systable_inplace_update_finish(inplace_state, tup);
+ * else
+ *	systable_inplace_update_cancel(inplace_state);
+ *
+ * The first several params duplicate the systable_beginscan() param list.
+ * "oldtupcopy" is an output parameter, assigned NULL if the key ceases to
+ * find a live tuple.  (In PROC_IN_VACUUM, that is a low-probability transient
+ * condition.)  If "oldtupcopy" gets non-NULL, you must pass output parameter
+ * "state" to systable_inplace_update_finish() or
+ * systable_inplace_update_cancel().
+ */
+void
+systable_inplace_update_begin(Relation relation,
+							  Oid indexId,
+							  bool indexOK,
+							  Snapshot snapshot,
+							  int nkeys, const ScanKeyData *key,
+							  HeapTuple *oldtupcopy,
+							  void **state)
+{
+	ScanKey		mutable_key = palloc(sizeof(ScanKeyData) * nkeys);
+	int			retries = 0;
+	SysScanDesc scan;
+	HeapTuple	oldtup;
+
+	/*
+	 * For now, we don't allow parallel updates.  Unlike a regular update,
+	 * this should never create a combo CID, so it might be possible to relax
+	 * this restriction, but not without more thought and testing.  It's not
+	 * clear that it would be useful, anyway.
+	 */
+	if (IsInParallelMode())
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+				 errmsg("cannot update tuples during a parallel operation")));
+
+	/*
+	 * Accept a snapshot argument, for symmetry, but this function advances
+	 * its snapshot as needed to reach the tail of the updated tuple chain.
+	 */
+	Assert(snapshot == NULL);
+
+	Assert(IsInplaceUpdateRelation(relation) || !IsSystemRelation(relation));
+
+	/* Loop for an exclusive-locked buffer of a non-updated tuple. */
+	for (;;)
+	{
+		TupleTableSlot *slot;
+		BufferHeapTupleTableSlot *bslot;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Processes issuing heap_update (e.g. GRANT) at maximum speed could
+		 * drive us to this error.  A hostile table owner has stronger ways
+		 * to damage their own table, so that's minor.
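+		 * (Picture a client looping "GRANT ...; REVOKE ...;" on the same
+		 * table: each committed heap_update() of its pg_class row can force
+		 * another iteration of this loop.)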
+		 */
+		if (retries++ > 10000)
+			elog(ERROR, "giving up after too many tries to overwrite row");
+
+		memcpy(mutable_key, key, sizeof(ScanKeyData) * nkeys);
+		scan = systable_beginscan(relation, indexId, indexOK, snapshot,
+								  nkeys, mutable_key);
+		oldtup = systable_getnext(scan);
+		if (!HeapTupleIsValid(oldtup))
+		{
+			systable_endscan(scan);
+			*oldtupcopy = NULL;
+			return;
+		}
+
+		slot = scan->slot;
+		Assert(TTS_IS_BUFFERTUPLE(slot));
+		bslot = (BufferHeapTupleTableSlot *) slot;
+		if (heap_inplace_lock(scan->heap_rel,
+							  bslot->base.tuple, bslot->buffer))
+			break;
+		systable_endscan(scan);
+	}
+
+	*oldtupcopy = heap_copytuple(oldtup);
+	*state = scan;
+}
+
+/*
+ * systable_inplace_update_finish --- second phase of inplace update
+ *
+ * The tuple cannot change size, and therefore its header fields and null
+ * bitmap (if any) don't change either.
+ */
+void
+systable_inplace_update_finish(void *state, HeapTuple tuple)
+{
+	SysScanDesc scan = (SysScanDesc) state;
+	Relation	relation = scan->heap_rel;
+	TupleTableSlot *slot = scan->slot;
+	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+	HeapTuple	oldtup = bslot->base.tuple;
+	Buffer		buffer = bslot->buffer;
+
+	heap_inplace_update_and_unlock(relation, oldtup, tuple, buffer);
+	systable_endscan(scan);
+}
+
+/*
+ * systable_inplace_update_cancel --- abandon inplace update
+ *
+ * This is an alternative to making a no-op update.
+ */
+void
+systable_inplace_update_cancel(void *state)
+{
+	SysScanDesc scan = (SysScanDesc) state;
+	Relation	relation = scan->heap_rel;
+	TupleTableSlot *slot = scan->slot;
+	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+	HeapTuple	oldtup = bslot->base.tuple;
+	Buffer		buffer = bslot->buffer;
+
+	heap_inplace_unlock(relation, oldtup, buffer);
+	systable_endscan(scan);
+}
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index e5a6d867597..e62eb41fce3 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2803,7 +2803,9 @@ index_update_stats(Relation rel,
 {
 	Oid			relid = RelationGetRelid(rel);
 	Relation	pg_class;
+	ScanKeyData key[1];
 	HeapTuple	tuple;
+	void	   *state;
 	Form_pg_class rd_rel;
 	bool		dirty;
 
@@ -2837,33 +2839,12 @@ index_update_stats(Relation rel,
 
 	pg_class = table_open(RelationRelationId, RowExclusiveLock);
 
-	/*
-	 * Make a copy of the tuple to update.  Normally we use the syscache, but
-	 * we can't rely on that during bootstrap or while reindexing pg_class
-	 * itself.
-	 */
-	if (IsBootstrapProcessingMode() ||
-		ReindexIsProcessingHeap(RelationRelationId))
-	{
-		/* don't assume syscache will work */
-		TableScanDesc pg_class_scan;
-		ScanKeyData key[1];
-
-		ScanKeyInit(&key[0],
-					Anum_pg_class_oid,
-					BTEqualStrategyNumber, F_OIDEQ,
-					ObjectIdGetDatum(relid));
-
-		pg_class_scan = table_beginscan_catalog(pg_class, 1, key);
-		tuple = heap_getnext(pg_class_scan, ForwardScanDirection);
-		tuple = heap_copytuple(tuple);
-		table_endscan(pg_class_scan);
-	}
-	else
-	{
-		/* normal case, use syscache */
-		tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
-	}
+	ScanKeyInit(&key[0],
+				Anum_pg_class_oid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	systable_inplace_update_begin(pg_class, ClassOidIndexId, true, NULL,
+								  1, key, &tuple, &state);
 
 	if (!HeapTupleIsValid(tuple))
 		elog(ERROR, "could not find tuple for relation %u", relid);
@@ -2922,11 +2903,12 @@ index_update_stats(Relation rel,
 	 */
 	if (dirty)
 	{
-		heap_inplace_update(pg_class, tuple);
+		systable_inplace_update_finish(state, tuple);
 		/* the above sends a cache inval message */
 	}
 	else
 	{
+		systable_inplace_update_cancel(state);
 		/* no need to change tuple, but force relcache inval anyway */
 		CacheInvalidateRelcacheByTuple(tuple);
 	}
diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c
index 9bc10729b02..530ef8e767e 100644
--- a/src/backend/catalog/toasting.c
+++ b/src/backend/catalog/toasting.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/genam.h"
 #include "access/heapam.h"
 #include "access/toast_compression.h"
 #include "access/xact.h"
@@ -32,6 +33,7 @@
 #include "nodes/makefuncs.h"
 #include "storage/lock.h"
 #include "utils/builtins.h"
+#include "utils/fmgroids.h"
 #include "utils/rel.h"
 #include "utils/syscache.h"
 
@@ -337,21 +339,36 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid,
 	 */
 	class_rel = table_open(RelationRelationId, RowExclusiveLock);
 
-	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relOid));
-	if (!HeapTupleIsValid(reltup))
-		elog(ERROR, "cache lookup failed for relation %u", relOid);
-
-	((Form_pg_class) GETSTRUCT(reltup))->reltoastrelid = toast_relid;
-
 	if (!IsBootstrapProcessingMode())
 	{
 		/* normal case, use a transactional update */
+		reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relOid));
+		if (!HeapTupleIsValid(reltup))
+			elog(ERROR, "cache lookup failed for relation %u", relOid);
+
+		((Form_pg_class) GETSTRUCT(reltup))->reltoastrelid = toast_relid;
+
 		CatalogTupleUpdate(class_rel, &reltup->t_self, reltup);
 	}
 	else
 	{
 		/* While bootstrapping, we cannot UPDATE, so overwrite in-place */
+
+		ScanKeyData key[1];
+		void	   *state;
+
+		ScanKeyInit(&key[0],
+					Anum_pg_class_oid,
+					BTEqualStrategyNumber, F_OIDEQ,
+					ObjectIdGetDatum(relOid));
+		systable_inplace_update_begin(class_rel, ClassOidIndexId, true,
+									  NULL, 1, key, &reltup, &state);
+		if (!HeapTupleIsValid(reltup))
+			elog(ERROR, "cache lookup failed for relation %u", relOid);
+
+		((Form_pg_class) GETSTRUCT(reltup))->reltoastrelid = toast_relid;
+
+		systable_inplace_update_finish(state, reltup);
 	}
 
 	heap_freetuple(reltup);
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 11e3b3ec171..3e581d62efb 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1524,7 +1524,7 @@ dropdb(const char *dbname, bool missing_ok, bool force)
 	Relation	pgdbrel;
 	HeapTuple	tup;
 	ScanKeyData scankey;
-	SysScanDesc scan;
+	void	   *inplace_state;
 	Form_pg_database datform;
 	int			notherbackends;
 	int			npreparedxacts;
@@ -1662,24 +1662,6 @@ dropdb(const char *dbname, bool missing_ok, bool force)
 	 */
 	pgstat_drop_database(db_id);
 
-	/*
-	 * Get the pg_database tuple to scribble on.  Note that this does not
-	 * directly rely on the syscache to avoid issues with flattened toast
-	 * values for the in-place update.
-	 */
-	ScanKeyInit(&scankey,
-				Anum_pg_database_datname,
-				BTEqualStrategyNumber, F_NAMEEQ,
-				CStringGetDatum(dbname));
-
-	scan = systable_beginscan(pgdbrel, DatabaseNameIndexId, true,
-							  NULL, 1, &scankey);
-
-	tup = systable_getnext(scan);
-	if (!HeapTupleIsValid(tup))
-		elog(ERROR, "cache lookup failed for database %u", db_id);
-	datform = (Form_pg_database) GETSTRUCT(tup);
-
 	/*
 	 * Except for the deletion of the catalog row, subsequent actions are not
 	 * transactional (consider DropDatabaseBuffers() discarding modified
@@ -1691,8 +1673,17 @@ dropdb(const char *dbname, bool missing_ok, bool force)
 	 * modification is durable before performing irreversible filesystem
 	 * operations.
 	 */
+	ScanKeyInit(&scankey,
+				Anum_pg_database_datname,
+				BTEqualStrategyNumber, F_NAMEEQ,
+				CStringGetDatum(dbname));
+	systable_inplace_update_begin(pgdbrel, DatabaseNameIndexId, true,
+								  NULL, 1, &scankey, &tup, &inplace_state);
+	if (!HeapTupleIsValid(tup))
+		elog(ERROR, "cache lookup failed for database %u", db_id);
+	datform = (Form_pg_database) GETSTRUCT(tup);
 	datform->datconnlimit = DATCONNLIMIT_INVALID_DB;
-	heap_inplace_update(pgdbrel, tup);
+	systable_inplace_update_finish(inplace_state, tup);
 	XLogFlush(XactLastRecEnd);
 
 	/*
@@ -1700,8 +1691,7 @@ dropdb(const char *dbname, bool missing_ok, bool force)
 	 * the row will be gone, but if we fail, dropdb() can be invoked again.
 	 */
 	CatalogTupleDelete(pgdbrel, &tup->t_self);
-
-	systable_endscan(scan);
+	heap_freetuple(tup);
 
 	/*
 	 * Drop db-specific replication slots.
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 329c73d2261..916ba841930 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1326,7 +1326,9 @@ vac_update_relstats(Relation relation,
 {
 	Oid			relid = RelationGetRelid(relation);
 	Relation	rd;
+	ScanKeyData key[1];
 	HeapTuple	ctup;
+	void	   *inplace_state;
 	Form_pg_class pgcform;
 	bool		dirty,
 				futurexid,
@@ -1337,7 +1339,12 @@ vac_update_relstats(Relation relation,
 	rd = table_open(RelationRelationId, RowExclusiveLock);
 
 	/* Fetch a copy of the tuple to scribble on */
-	ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+	ScanKeyInit(&key[0],
+				Anum_pg_class_oid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	systable_inplace_update_begin(rd, ClassOidIndexId, true,
+								  NULL, 1, key, &ctup, &inplace_state);
 	if (!HeapTupleIsValid(ctup))
 		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
 			 relid);
@@ -1445,7 +1452,9 @@ vac_update_relstats(Relation relation,
 
 	/* If anything changed, write out the tuple. */
 	if (dirty)
-		heap_inplace_update(rd, ctup);
+		systable_inplace_update_finish(inplace_state, ctup);
+	else
+		systable_inplace_update_cancel(inplace_state);
 
 	table_close(rd, RowExclusiveLock);
 
@@ -1497,6 +1506,7 @@ vac_update_datfrozenxid(void)
 	bool		bogus = false;
 	bool		dirty = false;
 	ScanKeyData key[1];
+	void	   *inplace_state;
 
 	/*
 	 * Restrict this task to one backend per database.  This avoids race
@@ -1620,20 +1630,18 @@ vac_update_datfrozenxid(void)
 	relation = table_open(DatabaseRelationId, RowExclusiveLock);
 
 	/*
-	 * Get the pg_database tuple to scribble on.  Note that this does not
-	 * directly rely on the syscache to avoid issues with flattened toast
-	 * values for the in-place update.
+	 * Fetch a copy of the tuple to scribble on.  We could check the syscache
+	 * tuple first.  If that concluded !dirty, we'd avoid waiting on
+	 * concurrent heap_update() and would avoid exclusive-locking the buffer.
+	 * For now, don't optimize that.
 	 */
 	ScanKeyInit(&key[0],
 				Anum_pg_database_oid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(MyDatabaseId));
 
-	scan = systable_beginscan(relation, DatabaseOidIndexId, true,
-							  NULL, 1, key);
-	tuple = systable_getnext(scan);
-	tuple = heap_copytuple(tuple);
-	systable_endscan(scan);
+	systable_inplace_update_begin(relation, DatabaseOidIndexId, true,
+								  NULL, 1, key, &tuple, &inplace_state);
 
 	if (!HeapTupleIsValid(tuple))
 		elog(ERROR, "could not find tuple for database %u", MyDatabaseId);
@@ -1667,7 +1675,9 @@ vac_update_datfrozenxid(void)
 		newMinMulti = dbform->datminmxid;
 
 	if (dirty)
-		heap_inplace_update(relation, tuple);
+		systable_inplace_update_finish(inplace_state, tuple);
+	else
+		systable_inplace_update_cancel(inplace_state);
 
 	heap_freetuple(tuple);
 	table_close(relation, RowExclusiveLock);
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index f6ad7c3ba55..26436abdf2a 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -228,5 +228,14 @@ extern SysScanDesc systable_beginscan_ordered(Relation heapRelation,
 extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan,
 										  ScanDirection direction);
 extern void systable_endscan_ordered(SysScanDesc sysscan);
+extern void systable_inplace_update_begin(Relation relation,
+										  Oid indexId,
+										  bool indexOK,
+										  Snapshot snapshot,
+										  int nkeys, const ScanKeyData *key,
+										  HeapTuple *oldtupcopy,
+										  void **state);
+extern void systable_inplace_update_finish(void *state, HeapTuple tuple);
+extern void systable_inplace_update_cancel(void *state);
 
 #endif							/* GENAM_H */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 50b9e729e63..266934b71b0 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -163,6 +163,13 @@ extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
 								 bool follow_updates,
 								 Buffer *buffer, struct TM_FailureData *tmfd);
 
+extern bool heap_inplace_lock(Relation relation,
+							  HeapTuple oldtup_ptr, Buffer buffer);
+extern void heap_inplace_update_and_unlock(Relation relation,
+										   HeapTuple oldtup, HeapTuple tuple,
+										   Buffer buffer);
+extern void heap_inplace_unlock(Relation relation,
+								HeapTuple oldtup, Buffer buffer);
 extern void heap_inplace_update(Relation relation, HeapTuple tuple);
 extern bool heap_freeze_tuple(HeapTupleHeader tuple,
 							  TransactionId relfrozenxid, TransactionId relminmxid,
diff --git a/src/test/isolation/expected/intra-grant-inplace-db.out b/src/test/isolation/expected/intra-grant-inplace-db.out
index 432ece56361..a91402ccb8f 100644
--- a/src/test/isolation/expected/intra-grant-inplace-db.out
+++ b/src/test/isolation/expected/intra-grant-inplace-db.out
@@ -9,20 +9,20 @@ step b1: BEGIN;
 step grant1: 
 	GRANT TEMP ON DATABASE isolation_regression TO regress_temp_grantee;
 
-step vac2: VACUUM (FREEZE);
+step vac2: VACUUM (FREEZE); <waiting ...>
 step snap3: 
 	INSERT INTO frozen_witness
 	SELECT datfrozenxid FROM pg_database WHERE datname = current_catalog;
 
 step c1: COMMIT;
+step vac2: <... completed>
 step cmp3: 
 	SELECT 'datfrozenxid retreated'
 	FROM pg_database
 	WHERE datname = current_catalog
 	  AND age(datfrozenxid) > (SELECT min(age(x)) FROM frozen_witness);
 
-?column?
------------------------
-datfrozenxid retreated
-(1 row)
+?column?
+--------
+(0 rows)
diff --git a/src/test/isolation/expected/intra-grant-inplace.out b/src/test/isolation/expected/intra-grant-inplace.out
index cc1e47a302c..fe26984c0e0 100644
--- a/src/test/isolation/expected/intra-grant-inplace.out
+++ b/src/test/isolation/expected/intra-grant-inplace.out
@@ -14,15 +14,16 @@ relhasindex
 f          
 (1 row)
 
-step addk2: ALTER TABLE intra_grant_inplace ADD PRIMARY KEY (c);
+step addk2: ALTER TABLE intra_grant_inplace ADD PRIMARY KEY (c); <waiting ...>
 step c1: COMMIT;
+step addk2: <... completed>
 step read2: 
 	SELECT relhasindex FROM pg_class
 	WHERE oid = 'intra_grant_inplace'::regclass;
 
 relhasindex
 -----------
-f          
+t          
 (1 row)
 
@@ -58,8 +59,9 @@ relhasindex
 f          
 (1 row)
 
-step addk2: ALTER TABLE intra_grant_inplace ADD PRIMARY KEY (c);
+step addk2: ALTER TABLE intra_grant_inplace ADD PRIMARY KEY (c); <waiting ...>
 step r3: ROLLBACK;
+step addk2: <... completed>
 
 starting permutation: b2 sfnku2 addk2 c2
 step b2: BEGIN;
@@ -98,7 +100,7 @@ f
 step addk2: ALTER TABLE intra_grant_inplace ADD PRIMARY KEY (c);
 step c2: COMMIT;
 
-starting permutation: b3 sfu3 b1 grant1 read2 addk2 r3 c1 read2
+starting permutation: b3 sfu3 b1 grant1 read2 as3 addk2 r3 c1 read2
 step b3: BEGIN ISOLATION LEVEL READ COMMITTED;
 step sfu3: 
 	SELECT relhasindex FROM pg_class
 	WHERE oid = 'intra_grant_inplace'::regclass FOR UPDATE;
@@ -122,17 +124,19 @@ relhasindex
 f          
 (1 row)
 
 step b1: BEGIN;
 step grant1: 
 	GRANT SELECT ON intra_grant_inplace TO PUBLIC; <waiting ...>
 step read2: 
 	SELECT relhasindex FROM pg_class
 	WHERE oid = 'intra_grant_inplace'::regclass;
 
 relhasindex
 -----------
 f          
 (1 row)
 
-step addk2: ALTER TABLE intra_grant_inplace ADD PRIMARY KEY (c);
+step as3: LOCK TABLE intra_grant_inplace IN ACCESS SHARE MODE;
+step addk2: ALTER TABLE intra_grant_inplace ADD PRIMARY KEY (c); <waiting ...>
 step r3: ROLLBACK;
 step grant1: <... completed>
 step c1: COMMIT;
+step addk2: <... completed>
 step read2: 
 	SELECT relhasindex FROM pg_class
 	WHERE oid = 'intra_grant_inplace'::regclass;
 
 relhasindex
 -----------
-f          
+t          
 (1 row)
diff --git a/src/test/isolation/specs/intra-grant-inplace-db.spec b/src/test/isolation/specs/intra-grant-inplace-db.spec
index bbecd5ddde5..9de40ec5c94 100644
--- a/src/test/isolation/specs/intra-grant-inplace-db.spec
+++ b/src/test/isolation/specs/intra-grant-inplace-db.spec
@@ -42,5 +42,4 @@ step cmp3 {
 }
 
 
-# XXX extant bug
 permutation snap3 b1 grant1 vac2(c1) snap3 c1 cmp3
diff --git a/src/test/isolation/specs/intra-grant-inplace.spec b/src/test/isolation/specs/intra-grant-inplace.spec
index 3cd696b81f2..d07ed3bb2cc 100644
--- a/src/test/isolation/specs/intra-grant-inplace.spec
+++ b/src/test/isolation/specs/intra-grant-inplace.spec
@@ -48,6 +48,7 @@ step sfu3 {
 	SELECT relhasindex FROM pg_class
 	WHERE oid = 'intra_grant_inplace'::regclass FOR UPDATE;
 }
+step as3	{ LOCK TABLE intra_grant_inplace IN ACCESS SHARE MODE; }
 step r3	{ ROLLBACK; }
 
 # Additional heap_update()
@@ -73,7 +74,7 @@ step keyshr5 {
 teardown	{ ROLLBACK; }
 
 
-# XXX extant bugs: permutation comments refer to planned post-bugfix behavior
+# XXX extant bugs: permutation comments refer to planned future LockTuple()
 
 permutation
 	b1
@@ -117,6 +118,7 @@ permutation
 	b1
 	grant1(r3)	# acquire LockTuple(), await sfu3 xmax
 	read2
+	as3			# XXX temporary until patch adds locking to addk2
 	addk2(c1)	# block in LockTuple() behind grant1
 	r3			# unblock grant1; addk2 now awaits grant1 xmax
 	c1