Fix data loss at inplace update after heap_update().

As previously-added tests demonstrated, heap_inplace_update() could instead update an unrelated tuple of the same catalog. It could lose the update. Losing relhasindex=t was a source of index corruption. Inplace-updating commands like VACUUM will now wait for heap_update() commands like GRANT TABLE and GRANT DATABASE. That isn't ideal, but a long-running GRANT already hurts VACUUM progress more just by keeping an XID running. The VACUUM will behave like a DELETE or UPDATE waiting for the uncommitted change. For implementation details, start at the systable_inplace_update_begin() header comment and README.tuplock. Back-patch to v12 (all supported versions). In back branches, retain a deprecated heap_inplace_update(), for extensions. Reported by Smolkin Grigory. Reviewed by Nitin Motiani, (in earlier versions) Heikki Linnakangas, and (in earlier versions) Alexander Lakhin. Discussion: https://postgr.es/m/CAMp+ueZQz3yDk7qg42hk6-9gxniYbp-=bG2mgqecErqR5gGGOA@mail.gmail.com
2025-08-28 18:48:04 +03:00 · 2024-09-24 15:25:18 -07:00
parent 41e0ba33d5
commit 8590c942c1
13 changed files with 484 additions and 94 deletions
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1326,7 +1326,9 @@ vac_update_relstats(Relation relation,
 {
 	Oid			relid = RelationGetRelid(relation);
 	Relation	rd;
+	ScanKeyData key[1];
 	HeapTuple	ctup;
+	void	   *inplace_state;
 	Form_pg_class pgcform;
 	bool		dirty,
 				futurexid,
@@ -1337,7 +1339,12 @@ vac_update_relstats(Relation relation,
 	rd = table_open(RelationRelationId, RowExclusiveLock);

 	/* Fetch a copy of the tuple to scribble on */
-	ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+	ScanKeyInit(&key[0],
+				Anum_pg_class_oid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	systable_inplace_update_begin(rd, ClassOidIndexId, true,
+								  NULL, 1, key, &ctup, &inplace_state);
 	if (!HeapTupleIsValid(ctup))
 		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
 			 relid);
@@ -1445,7 +1452,9 @@ vac_update_relstats(Relation relation,

 	/* If anything changed, write out the tuple. */
 	if (dirty)
-		heap_inplace_update(rd, ctup);
+		systable_inplace_update_finish(inplace_state, ctup);
+	else
+		systable_inplace_update_cancel(inplace_state);

 	table_close(rd, RowExclusiveLock);

@@ -1497,6 +1506,7 @@ vac_update_datfrozenxid(void)
 	bool		bogus = false;
 	bool		dirty = false;
 	ScanKeyData key[1];
+	void	   *inplace_state;

 	/*
 	 * Restrict this task to one backend per database.  This avoids race
@@ -1620,20 +1630,18 @@ vac_update_datfrozenxid(void)
 	relation = table_open(DatabaseRelationId, RowExclusiveLock);

 	/*
-	 * Get the pg_database tuple to scribble on.  Note that this does not
-	 * directly rely on the syscache to avoid issues with flattened toast
-	 * values for the in-place update.
+	 * Fetch a copy of the tuple to scribble on.  We could check the syscache
+	 * tuple first.  If that concluded !dirty, we'd avoid waiting on
+	 * concurrent heap_update() and would avoid exclusive-locking the buffer.
+	 * For now, don't optimize that.
 	 */
 	ScanKeyInit(&key[0],
 				Anum_pg_database_oid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(MyDatabaseId));

-	scan = systable_beginscan(relation, DatabaseOidIndexId, true,
-							  NULL, 1, key);
-	tuple = systable_getnext(scan);
-	tuple = heap_copytuple(tuple);
-	systable_endscan(scan);
+	systable_inplace_update_begin(relation, DatabaseOidIndexId, true,
+								  NULL, 1, key, &tuple, &inplace_state);

 	if (!HeapTupleIsValid(tuple))
 		elog(ERROR, "could not find tuple for database %u", MyDatabaseId);
@@ -1667,7 +1675,9 @@ vac_update_datfrozenxid(void)
 		newMinMulti = dbform->datminmxid;

 	if (dirty)
-		heap_inplace_update(relation, tuple);
+		systable_inplace_update_finish(inplace_state, tuple);
+	else
+		systable_inplace_update_cancel(inplace_state);

 	heap_freetuple(tuple);
 	table_close(relation, RowExclusiveLock);