mirror of
https://github.com/postgres/postgres.git
synced 2025-04-25 21:42:33 +03:00
Fix race leading to incorrect conflict cause in InvalidatePossiblyObsoleteSlot()
The invalidation of an active slot is done in two steps:
- Termination of the backend holding it, if any.
- Report that the slot is obsolete, with a conflict cause depending on the slot's data.

This can be racy: between these two steps the slot mutex is released while doing system calls, which means that effective_xmin and effective_catalog_xmin could advance during that time, so the conflict cause detected afterwards could differ from the one determined before the process owning the slot was terminated.

Holding the mutex longer is not an option, so this commit changes the code to record the values stored in the slot (restart_lsn, effective_xmin and effective_catalog_xmin) before the process owning the slot is terminated, and to use those recorded values when determining the conflict cause.

Bonus thanks to Alexander Lakhin for the various tests and the analysis.

Author: Bertrand Drouvot
Reviewed-by: Michael Paquier, Bharath Rupireddy
Discussion: https://postgr.es/m/ZaTjW2Xh+TQUCOH0@ip-10-97-1-34.eu-west-3.compute.internal
Backpatch-through: 16
This commit is contained in:
parent
01ec4d89b9
commit
818fefd8fd
@ -1454,6 +1454,11 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
|
||||
{
|
||||
int last_signaled_pid = 0;
|
||||
bool released_lock = false;
|
||||
bool terminated = false;
|
||||
XLogRecPtr initial_effective_xmin = InvalidXLogRecPtr;
|
||||
XLogRecPtr initial_catalog_effective_xmin = InvalidXLogRecPtr;
|
||||
XLogRecPtr initial_restart_lsn = InvalidXLogRecPtr;
|
||||
ReplicationSlotInvalidationCause conflict_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
@ -1488,11 +1493,24 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
|
||||
*/
|
||||
if (s->data.invalidated == RS_INVAL_NONE)
|
||||
{
|
||||
/*
|
||||
* The slot's mutex will be released soon, and it is possible that
|
||||
* those values change once the process holding the slot has been
|
||||
* terminated (if any), so record them here to ensure that we
|
||||
* would report the correct conflict cause.
|
||||
*/
|
||||
if (!terminated)
|
||||
{
|
||||
initial_restart_lsn = s->data.restart_lsn;
|
||||
initial_effective_xmin = s->effective_xmin;
|
||||
initial_catalog_effective_xmin = s->effective_catalog_xmin;
|
||||
}
|
||||
|
||||
switch (cause)
|
||||
{
|
||||
case RS_INVAL_WAL_REMOVED:
|
||||
if (s->data.restart_lsn != InvalidXLogRecPtr &&
|
||||
s->data.restart_lsn < oldestLSN)
|
||||
if (initial_restart_lsn != InvalidXLogRecPtr &&
|
||||
initial_restart_lsn < oldestLSN)
|
||||
conflict = cause;
|
||||
break;
|
||||
case RS_INVAL_HORIZON:
|
||||
@ -1501,12 +1519,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
|
||||
/* invalid DB oid signals a shared relation */
|
||||
if (dboid != InvalidOid && dboid != s->data.database)
|
||||
break;
|
||||
if (TransactionIdIsValid(s->effective_xmin) &&
|
||||
TransactionIdPrecedesOrEquals(s->effective_xmin,
|
||||
if (TransactionIdIsValid(initial_effective_xmin) &&
|
||||
TransactionIdPrecedesOrEquals(initial_effective_xmin,
|
||||
snapshotConflictHorizon))
|
||||
conflict = cause;
|
||||
else if (TransactionIdIsValid(s->effective_catalog_xmin) &&
|
||||
TransactionIdPrecedesOrEquals(s->effective_catalog_xmin,
|
||||
else if (TransactionIdIsValid(initial_catalog_effective_xmin) &&
|
||||
TransactionIdPrecedesOrEquals(initial_catalog_effective_xmin,
|
||||
snapshotConflictHorizon))
|
||||
conflict = cause;
|
||||
break;
|
||||
@ -1519,6 +1537,13 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The conflict cause recorded previously should not change once the
|
||||
* process owning the slot (if any) has been terminated.
|
||||
*/
|
||||
Assert(!(conflict_prev != RS_INVAL_NONE && terminated &&
|
||||
conflict_prev != conflict));
|
||||
|
||||
/* if there's no conflict, we're done */
|
||||
if (conflict == RS_INVAL_NONE)
|
||||
{
|
||||
@ -1601,6 +1626,8 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
|
||||
(void) kill(active_pid, SIGTERM);
|
||||
|
||||
last_signaled_pid = active_pid;
|
||||
terminated = true;
|
||||
conflict_prev = conflict;
|
||||
}
|
||||
|
||||
/* Wait until the slot is released. */
|
||||
|
Loading…
x
Reference in New Issue
Block a user