From 8e607a5a4be54f2d2bdfb7a53584d4b7ca1d2be3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 21 Oct 2024 09:49:21 +0300 Subject: [PATCH] Fix race condition in committing a serializable transaction The finished transaction list can contain XIDs that are older than the serializable global xmin. It's a short-lived state; ClearOldPredicateLocks() removes any such transactions from the list, and it's called whenever the global xmin advances. But if another backend calls SummarizeOldestCommittedSxact() in that window, it will call SerialAdd() on an XID that's older than the global xmin, or if there are no more transactions running, when global xmin is invalid. That trips the assertion in SerialAdd(). Fixes bug #18658 reported by Andrew Bille. Thanks to Alexander Lakhin for analysis. Backpatch to all versions. Discussion: https://www.postgresql.org/message-id/18658-7dab125ec688c70b%40postgresql.org --- src/backend/storage/lmgr/predicate.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 8da27cc9acd..375e5b541f0 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -920,12 +920,17 @@ SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); /* - * If no serializable transactions are active, there shouldn't be anything - * to push out to the SLRU. Hitting this assert would mean there's - * something wrong with the earlier cleanup logic. + * If 'xid' is older than the global xmin (== tailXid), there's no need to + * store it, after all. This can happen if the oldest transaction holding + * back the global xmin just finished, making 'xid' uninteresting, but + * ClearOldPredicateLocks() has not yet run. */ tailXid = serialControl->tailXid; - Assert(TransactionIdIsValid(tailXid)); + if (!TransactionIdIsValid(tailXid) || TransactionIdPrecedes(xid, tailXid)) + { + LWLockRelease(SerialSLRULock); + return; + } /* * If the SLRU is currently unused, zero out the whole active region from