The row-version chaining in Serializable Snapshot Isolation was still wrong.

On further analysis, it turns out that it is not needed to duplicate predicate locks to the new row version at update, the lock on the version that the transaction saw as visible is enough. However, there was a different bug in the code that checks for dangerous structures when a new rw-conflict happens. Fix that bug, and remove all the row-version chaining related code. Kevin Grittner & Dan Ports, with some comment editorialization by me.
2025-11-16 15:02:33 +03:00 · 2011-05-30 20:42:16 +03:00
parent 5177dfefc5
commit 3103f9a77d
7 changed files with 161 additions and 161 deletions
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -155,9 +155,6 @@
 *							   BlockNumber newblkno);
 *		PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
 *								 BlockNumber newblkno);
- *		PredicateLockTupleRowVersionLink(const Relation relation,
- *										 const HeapTuple oldTuple,
- *										 const HeapTuple newTuple)
 *		ReleasePredicateLocks(bool isCommit)
 *
 * conflict detection (may also trigger rollback)
@@ -2252,90 +2249,6 @@ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
 	PredicateLockAcquire(&tag);
 }

-/*
- * If the old tuple has any predicate locks, copy them to the new target.
- *
- * This is called at an UPDATE, where any predicate locks held on the old
- * tuple need to be copied to the new tuple, because logically they both
- * represent the same row. A lock taken before the update must conflict
- * with anyone locking the same row after the update.
- */
-void
-PredicateLockTupleRowVersionLink(const Relation relation,
-								 const HeapTuple oldTuple,
-								 const HeapTuple newTuple)
-{
-	PREDICATELOCKTARGETTAG oldtupletag;
-	PREDICATELOCKTARGETTAG oldpagetag;
-	PREDICATELOCKTARGETTAG newtupletag;
-	BlockNumber oldblk,
-				newblk;
-	OffsetNumber oldoff,
-				newoff;
-	TransactionId oldxmin,
-				newxmin;
-
-	/*
-	 * Bail out quickly if there are no serializable transactions
-	 * running.
-	 *
-	 * It's safe to do this check without taking any additional
-	 * locks. Even if a serializable transaction starts concurrently,
-	 * we know it can't take any SIREAD locks on the modified tuple
-	 * because the caller is holding the associated buffer page lock.
-	 * Memory reordering isn't an issue; the memory barrier in the
-	 * LWLock acquisition guarantees that this read occurs while the
-	 * buffer page lock is held.
-	 */
-	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
-		return;
-
-	oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self));
-	oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self));
-	oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data);
-
-	newblk = ItemPointerGetBlockNumber(&(newTuple->t_self));
-	newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self));
-	newxmin = HeapTupleHeaderGetXmin(newTuple->t_data);
-
-	SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag,
-									 relation->rd_node.dbNode,
-									 relation->rd_id,
-									 oldblk,
-									 oldoff,
-									 oldxmin);
-
-	SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag,
-									relation->rd_node.dbNode,
-									relation->rd_id,
-									oldblk);
-
-	SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag,
-									 relation->rd_node.dbNode,
-									 relation->rd_id,
-									 newblk,
-									 newoff,
-									 newxmin);
-
-	/*
-	 * A page-level lock on the page containing the old tuple counts too.
-	 * Anyone holding a lock on the page is logically holding a lock on the
-	 * old tuple, so we need to acquire a lock on his behalf on the new tuple
-	 * too. However, if the new tuple is on the same page as the old one, the
-	 * old page-level lock already covers the new tuple.
-	 *
-	 * A relation-level lock always covers both tuple versions, so we don't
-	 * need to worry about those here.
-	 */
-	LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
-
-	TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false);
-	if (newblk != oldblk)
-		TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false);
-
-	LWLockRelease(SerializablePredicateLockListLock);
-}
-

 /*
 *		DeleteLockTarget
@@ -2650,9 +2563,15 @@ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno,

 	/*
 	 * Bail out quickly if there are no serializable transactions
-	 * running. As with PredicateLockTupleRowVersionLink, it's safe to
-	 * check this without taking locks because the caller is holding
-	 * the buffer page lock.
+	 * running.
+	 *
+	 * It's safe to do this check without taking any additional
+	 * locks. Even if a serializable transaction starts concurrently,
+	 * we know it can't take any SIREAD locks on the page being split
+	 * because the caller is holding the associated buffer page lock.
+	 * Memory reordering isn't an issue; the memory barrier in the
+	 * LWLock acquisition guarantees that this read occurs while the
+	 * buffer page lock is held.
 	 */
 	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
 		return;
@@ -3890,8 +3809,21 @@ FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
 }

 /*
- * Check whether we should roll back one of these transactions
- * instead of flagging a new rw-conflict.
+ * We are about to add a RW-edge to the dependency graph - check that we don't
+ * introduce a dangerous structure by doing so, and abort one of the
+ * transactions if so.
+ *
+ * A serialization failure can only occur if there is a dangerous structure
+ * in the dependency graph:
+ *
+ *		Tin ------> Tpivot ------> Tout
+ *	   		  rw			 rw
+ *
+ * Furthermore, Tout must commit first.
+ *
+ * One more optimization is that if Tin is declared READ ONLY (or commits
+ * without writing), we can only have a problem if Tout committed before Tin
+ * acquired its snapshot.
 */
 static void
 OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
@@ -3905,17 +3837,76 @@ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
 	failure = false;

 	/*
-	 * Check for already-committed writer with rw-conflict out flagged. This
-	 * means that the reader must immediately fail.
+	 * Check for already-committed writer with rw-conflict out flagged
+	 * (conflict-flag on W means that T2 committed before W):
+	 *
+	 *		R ------> W ------> T2
+	 *			rw        rw
+	 *
+	 * That is a dangerous structure, so we must abort. (Since the writer
+	 * has already committed, we must be the reader)
 	 */
 	if (SxactIsCommitted(writer)
 	  && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
 		failure = true;

 	/*
-	 * Check whether the reader has become a pivot with a committed writer. If
-	 * so, we must roll back unless every in-conflict either committed before
-	 * the writer committed or is READ ONLY and overlaps the writer.
+	 * Check whether the writer has become a pivot with an out-conflict
+	 * committed transaction (T2), and T2 committed first:
+	 *
+	 *		R ------> W ------> T2
+	 *			rw        rw
+	 *
+	 * Because T2 must've committed first, there is no anomaly if:
+	 * - the reader committed before T2
+	 * - the writer committed before T2
+	 * - the reader is a READ ONLY transaction and the reader was not
+	 *   concurrent with T2 (= reader acquired its snapshot after T2 committed)
+	 */
+	if (!failure)
+	{
+		if (SxactHasSummaryConflictOut(writer))
+		{
+			failure = true;
+			conflict = NULL;
+		}
+		else
+			conflict = (RWConflict)
+				SHMQueueNext(&writer->outConflicts,
+							 &writer->outConflicts,
+							 offsetof(RWConflictData, outLink));
+		while (conflict)
+		{
+			SERIALIZABLEXACT *t2 = conflict->sxactIn;
+
+			if (SxactIsCommitted(t2)
+				&& (!SxactIsCommitted(reader)
+					|| t2->commitSeqNo <= reader->commitSeqNo)
+				&& (!SxactIsCommitted(writer)
+					|| t2->commitSeqNo <= writer->commitSeqNo)
+				&& (!SxactIsReadOnly(reader)
+					|| t2->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
+			{
+				failure = true;
+				break;
+			}
+			conflict = (RWConflict)
+				SHMQueueNext(&writer->outConflicts,
+							 &conflict->outLink,
+							 offsetof(RWConflictData, outLink));
+		}
+	}
+
+	/*
+	 * Check whether the reader has become a pivot with a committed writer:
+	 *
+	 *		T0 ------> R ------> W
+	 *			 rw        rw
+	 *
+	 * Because W must've committed first for an anomaly to occur, there is no
+	 * anomaly if:
+	 * - T0 committed before the writer
+	 * - T0 is READ ONLY, and overlaps the writer
 	 */
 	if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader))
 	{
@@ -3931,11 +3922,13 @@ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
 							 offsetof(RWConflictData, inLink));
 		while (conflict)
 		{
-			if (!SxactIsRolledBack(conflict->sxactOut)
-				&& (!SxactIsCommitted(conflict->sxactOut)
-					|| conflict->sxactOut->commitSeqNo >= writer->commitSeqNo)
-				&& (!SxactIsReadOnly(conflict->sxactOut)
-					|| conflict->sxactOut->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
+			SERIALIZABLEXACT *t0 = conflict->sxactOut;
+
+			if (!SxactIsRolledBack(t0)
+				&& (!SxactIsCommitted(t0)
+					|| t0->commitSeqNo >= writer->commitSeqNo)
+				&& (!SxactIsReadOnly(t0)
+					|| t0->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
 			{
 				failure = true;
 				break;
@@ -3947,58 +3940,31 @@ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
 		}
 	}

-	/*
-	 * Check whether the writer has become a pivot with an out-conflict
-	 * committed transaction, while neither reader nor writer is committed. If
-	 * the reader is a READ ONLY transaction, there is only a serialization
-	 * failure if an out-conflict transaction causing the pivot committed
-	 * before the reader acquired its snapshot.  (That is, the reader must not
-	 * have been concurrent with the out-conflict transaction.)
-	 */
-	if (!failure && !SxactIsCommitted(writer))
-	{
-		if (SxactHasSummaryConflictOut(reader))
-		{
-			failure = true;
-			conflict = NULL;
-		}
-		else
-			conflict = (RWConflict)
-				SHMQueueNext(&writer->outConflicts,
-							 &writer->outConflicts,
-							 offsetof(RWConflictData, outLink));
-		while (conflict)
-		{
-			if ((reader == conflict->sxactIn && SxactIsCommitted(reader))
-				|| (SxactIsCommitted(conflict->sxactIn)
-					&& !SxactIsCommitted(reader)
-					&& (!SxactIsReadOnly(reader)
-						|| conflict->sxactIn->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot)))
-			{
-				failure = true;
-				break;
-			}
-			conflict = (RWConflict)
-				SHMQueueNext(&writer->outConflicts,
-							 &conflict->outLink,
-							 offsetof(RWConflictData, outLink));
-		}
-	}
-
 	if (failure)
 	{
+		/*
+		 * We have to kill a transaction to avoid a possible anomaly from
+		 * occurring. If the writer is us, we can just ereport() to cause
+		 * a transaction abort. Otherwise we flag the writer for termination,
+		 * causing it to abort when it tries to commit. However, if the writer
+		 * is a prepared transaction, already prepared, we can't abort it
+		 * anymore, so we have to kill the reader instead.
+		 */
 		if (MySerializableXact == writer)
 		{
 			LWLockRelease(SerializableXactHashLock);
 			ereport(ERROR,
 					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 					 errmsg("could not serialize access due to read/write dependencies among transactions"),
-			errdetail("Cancelled on identification as pivot, during write."),
+			errdetail("Cancelled on identification as a pivot, during write."),
 					 errhint("The transaction might succeed if retried.")));
 		}
 		else if (SxactIsPrepared(writer))
 		{
 			LWLockRelease(SerializableXactHashLock);
+
+			/* if we're not the writer, we have to be the reader */
+			Assert(MySerializableXact == reader);
 			ereport(ERROR,
 					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 					 errmsg("could not serialize access due to read/write dependencies among transactions"),