1
0
mirror of https://github.com/postgres/postgres.git synced 2025-04-29 13:56:47 +03:00

Prevent concurrent SimpleLruTruncate() for any given SLRU.

The SimpleLruTruncate() header comment states the new coding rule.  To
achieve this, add locktype "frozenid" and two LWLocks.  This closes a
rare opportunity for data loss, which manifested as "apparent
wraparound" or "could not access status of transaction" errors.  Data
loss is more likely in pg_multixact, due to released branches' thin
margin between multiStopLimit and multiWrapLimit.  If a user's physical
replication primary logged ":  apparent wraparound" messages, the user
should rebuild standbys of that primary regardless of symptoms.  At less
risk is a cluster having emitted "not accepting commands" errors or
"must be vacuumed" warnings at some point.  One can test a cluster for
this data loss by running VACUUM FREEZE in every database.  Back-patch
to 9.5 (all supported versions).

Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com
This commit is contained in:
Noah Misch 2020-08-15 10:15:53 -07:00
parent dea07098af
commit e525770dd5
12 changed files with 119 additions and 16 deletions

View File

@ -8886,7 +8886,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
and general database objects (identified by class OID and object OID, and general database objects (identified by class OID and object OID,
in the same way as in <structname>pg_description</structname> or in the same way as in <structname>pg_description</structname> or
<structname>pg_depend</structname>). Also, the right to extend a <structname>pg_depend</structname>). Also, the right to extend a
relation is represented as a separate lockable object. relation is represented as a separate lockable object, as is the right to
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>.
Also, <quote>advisory</> locks can be taken on numbers that have Also, <quote>advisory</> locks can be taken on numbers that have
user-defined meanings. user-defined meanings.
</para> </para>
@ -8912,6 +8913,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
Type of the lockable object: Type of the lockable object:
<literal>relation</>, <literal>relation</>,
<literal>extend</>, <literal>extend</>,
<literal>frozenid</literal>,
<literal>page</>, <literal>page</>,
<literal>tuple</>, <literal>tuple</>,
<literal>transactionid</>, <literal>transactionid</>,

View File

@ -845,7 +845,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<tbody> <tbody>
<row> <row>
<entry morerows="62"><literal>LWLock</></entry> <entry morerows="64"><literal>LWLock</></entry>
<entry><literal>ShmemIndexLock</></entry> <entry><literal>ShmemIndexLock</></entry>
<entry>Waiting to find or allocate space in shared memory.</entry> <entry>Waiting to find or allocate space in shared memory.</entry>
</row> </row>
@ -1043,6 +1043,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting to execute <function>txid_status</function> or update <entry>Waiting to execute <function>txid_status</function> or update
the oldest transaction id available to it.</entry> the oldest transaction id available to it.</entry>
</row> </row>
<row>
<entry><literal>WrapLimitsVacuumLock</literal></entry>
<entry>Waiting to update limits on transaction id and multixact
consumption.</entry>
</row>
<row>
<entry><literal>NotifyQueueTailLock</literal></entry>
<entry>Waiting to update limit on notification message
storage.</entry>
</row>
<row> <row>
<entry><literal>clog</></entry> <entry><literal>clog</></entry>
<entry>Waiting for I/O on a clog (transaction status) buffer.</entry> <entry>Waiting for I/O on a clog (transaction status) buffer.</entry>
@ -1118,7 +1128,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting for TBM shared iterator lock.</entry> <entry>Waiting for TBM shared iterator lock.</entry>
</row> </row>
<row> <row>
<entry morerows="9"><literal>Lock</></entry> <entry morerows="10"><literal>Lock</></entry>
<entry><literal>relation</></entry> <entry><literal>relation</></entry>
<entry>Waiting to acquire a lock on a relation.</entry> <entry>Waiting to acquire a lock on a relation.</entry>
</row> </row>
@ -1126,6 +1136,12 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry><literal>extend</></entry> <entry><literal>extend</></entry>
<entry>Waiting to extend a relation.</entry> <entry>Waiting to extend a relation.</entry>
</row> </row>
<row>
<entry><literal>frozenid</literal></entry>
<entry>Waiting to
update <structname>pg_database</structname>.<structfield>datfrozenxid</structfield>
and <structname>pg_database</structname>.<structfield>datminmxid</structfield>.</entry>
</row>
<row> <row>
<entry><literal>page</></entry> <entry><literal>page</></entry>
<entry>Waiting to acquire a lock on page of a relation.</entry> <entry>Waiting to acquire a lock on page of a relation.</entry>

View File

@ -1164,6 +1164,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
/* /*
* Remove all segments before the one holding the passed page number * Remove all segments before the one holding the passed page number
*
* All SLRUs prevent concurrent calls to this function, either with an LWLock
* or by calling it only as part of a checkpoint. Mutual exclusion must begin
* before computing cutoffPage. Mutual exclusion must end after any limit
* update that would permit other backends to write fresh data into the
* segment immediately preceding the one containing cutoffPage. Otherwise,
* when the SLRU is quite full, SimpleLruTruncate() might delete that segment
* after it has accrued freshly-written data.
*/ */
void void
SimpleLruTruncate(SlruCtl ctl, int cutoffPage) SimpleLruTruncate(SlruCtl ctl, int cutoffPage)

View File

@ -347,8 +347,8 @@ ExtendSUBTRANS(TransactionId newestXact)
/* /*
* Remove all SUBTRANS segments before the one holding the passed transaction ID * Remove all SUBTRANS segments before the one holding the passed transaction ID
* *
* This is normally called during checkpoint, with oldestXact being the * oldestXact is the oldest TransactionXmin of any running transaction. This
* oldest TransactionXmin of any running transaction. * is called only during checkpoint.
*/ */
void void
TruncateSUBTRANS(TransactionId oldestXact) TruncateSUBTRANS(TransactionId oldestXact)

View File

@ -224,19 +224,22 @@ typedef struct QueueBackendStatus
/* /*
* Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff) * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
* *
* The AsyncQueueControl structure is protected by the AsyncQueueLock. * The AsyncQueueControl structure is protected by the AsyncQueueLock and
* NotifyQueueTailLock.
* *
* When holding the lock in SHARED mode, backends may only inspect their own * When holding AsyncQueueLock in SHARED mode, backends may only inspect their
* entries as well as the head and tail pointers. Consequently we can allow a * own entries as well as the head and tail pointers. Consequently we can
* backend to update its own record while holding only SHARED lock (since no * allow a backend to update its own record while holding only SHARED lock
* other backend will inspect it). * (since no other backend will inspect it).
* *
* When holding the lock in EXCLUSIVE mode, backends can inspect the entries * When holding AsyncQueueLock in EXCLUSIVE mode, backends can inspect the
* of other backends and also change the head and tail pointers. * entries of other backends and also change the head pointer. When holding
* both AsyncQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends can
* change the tail pointer.
* *
* AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers. * AsyncCtlLock is used as the control lock for the pg_notify SLRU buffers.
* In order to avoid deadlocks, whenever we need both locks, we always first * In order to avoid deadlocks, whenever we need multiple locks, we first get
* get AsyncQueueLock and then AsyncCtlLock. * NotifyQueueTailLock, then AsyncQueueLock, and lastly AsyncCtlLock.
* *
* Each backend uses the backend[] array entry with index equal to its * Each backend uses the backend[] array entry with index equal to its
* BackendId (which can range from 1 to MaxBackends). We rely on this to make * BackendId (which can range from 1 to MaxBackends). We rely on this to make
@ -2013,6 +2016,10 @@ asyncQueueAdvanceTail(void)
int newtailpage; int newtailpage;
int boundary; int boundary;
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
/* Compute the new tail. */
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE); LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
min = QUEUE_HEAD; min = QUEUE_HEAD;
for (i = 1; i <= MaxBackends; i++) for (i = 1; i <= MaxBackends; i++)
@ -2021,7 +2028,6 @@ asyncQueueAdvanceTail(void)
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i)); min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
} }
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL); oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
QUEUE_TAIL = min;
LWLockRelease(AsyncQueueLock); LWLockRelease(AsyncQueueLock);
/* /*
@ -2041,6 +2047,17 @@ asyncQueueAdvanceTail(void)
*/ */
SimpleLruTruncate(AsyncCtl, newtailpage); SimpleLruTruncate(AsyncCtl, newtailpage);
} }
/*
* Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
* the segment immediately prior to the new tail, allowing fresh data into
* that segment.
*/
LWLockAcquire(AsyncQueueLock, LW_EXCLUSIVE);
QUEUE_TAIL = min;
LWLockRelease(AsyncQueueLock);
LWLockRelease(NotifyQueueTailLock);
} }
/* /*

View File

@ -933,6 +933,14 @@ vac_update_datfrozenxid(void)
bool bogus = false; bool bogus = false;
bool dirty = false; bool dirty = false;
/*
* Restrict this task to one backend per database. This avoids race
* conditions that would move datfrozenxid or datminmxid backward. It
* avoids calling vac_truncate_clog() with a datfrozenxid preceding a
* datfrozenxid passed to an earlier vac_truncate_clog() call.
*/
LockDatabaseFrozenIds(ExclusiveLock);
/* /*
* Initialize the "min" calculation with GetOldestXmin, which is a * Initialize the "min" calculation with GetOldestXmin, which is a
* reasonable approximation to the minimum relfrozenxid for not-yet- * reasonable approximation to the minimum relfrozenxid for not-yet-
@ -1097,6 +1105,9 @@ vac_truncate_clog(TransactionId frozenXID,
bool bogus = false; bool bogus = false;
bool frozenAlreadyWrapped = false; bool frozenAlreadyWrapped = false;
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
/* init oldest datoids to sync with my frozenXID/minMulti values */ /* init oldest datoids to sync with my frozenXID/minMulti values */
oldestxid_datoid = MyDatabaseId; oldestxid_datoid = MyDatabaseId;
minmulti_datoid = MyDatabaseId; minmulti_datoid = MyDatabaseId;
@ -1206,6 +1217,8 @@ vac_truncate_clog(TransactionId frozenXID,
*/ */
SetTransactionIdLimit(frozenXID, oldestxid_datoid); SetTransactionIdLimit(frozenXID, oldestxid_datoid);
SetMultiXactIdLimit(minMulti, minmulti_datoid, false); SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
LWLockRelease(WrapLimitsVacuumLock);
} }

View File

@ -412,6 +412,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
LockRelease(&tag, lockmode, false); LockRelease(&tag, lockmode, false);
} }
/*
 *		LockDatabaseFrozenIds
 *
 * Acquire the "frozen IDs" heavyweight lock of the current database
 * (MyDatabaseId) in the requested mode.  vac_update_datfrozenxid() takes
 * this lock so that only one backend per database runs that routine at a
 * time.
 *
 * NOTE(review): no matching unlock routine accompanies this function, so the
 * lock is presumably held until the end of the transaction -- confirm against
 * the lock manager before relying on it.
 */
void
LockDatabaseFrozenIds(LOCKMODE lockmode)
{
	LOCKTAG		frozenIdsTag;

	/* The tag is keyed solely by the current database's OID. */
	SET_LOCKTAG_DATABASE_FROZEN_IDS(frozenIdsTag, MyDatabaseId);

	/* The acquisition result is deliberately discarded. */
	(void) LockAcquire(&frozenIdsTag, lockmode, false, false);
}
/* /*
* LockPage * LockPage
* *
@ -1015,6 +1030,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
tag->locktag_field2, tag->locktag_field2,
tag->locktag_field1); tag->locktag_field1);
break; break;
case LOCKTAG_DATABASE_FROZEN_IDS:
appendStringInfo(buf,
_("pg_database.datfrozenxid of database %u"),
tag->locktag_field1);
break;
case LOCKTAG_PAGE: case LOCKTAG_PAGE:
appendStringInfo(buf, appendStringInfo(buf,
_("page %u of relation %u of database %u"), _("page %u of relation %u of database %u"),

View File

@ -495,7 +495,7 @@ RegisterLWLockTranches(void)
if (LWLockTrancheArray == NULL) if (LWLockTrancheArray == NULL)
{ {
LWLockTranchesAllocated = 64; LWLockTranchesAllocated = 128;
LWLockTrancheArray = (char **) LWLockTrancheArray = (char **)
MemoryContextAllocZero(TopMemoryContext, MemoryContextAllocZero(TopMemoryContext,
LWLockTranchesAllocated * sizeof(char *)); LWLockTranchesAllocated * sizeof(char *));

View File

@ -50,3 +50,5 @@ OldSnapshotTimeMapLock 42
BackendRandomLock 43 BackendRandomLock 43
LogicalRepWorkerLock 44 LogicalRepWorkerLock 44
CLogTruncationLock 45 CLogTruncationLock 45
WrapLimitsVacuumLock 46
NotifyQueueTailLock 47

View File

@ -26,6 +26,7 @@
const char *const LockTagTypeNames[] = { const char *const LockTagTypeNames[] = {
"relation", "relation",
"extend", "extend",
"frozenid",
"page", "page",
"tuple", "tuple",
"transactionid", "transactionid",
@ -245,6 +246,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
nulls[8] = true; nulls[8] = true;
nulls[9] = true; nulls[9] = true;
break; break;
case LOCKTAG_DATABASE_FROZEN_IDS:
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
nulls[2] = true;
nulls[3] = true;
nulls[4] = true;
nulls[5] = true;
nulls[6] = true;
nulls[7] = true;
nulls[8] = true;
nulls[9] = true;
break;
case LOCKTAG_PAGE: case LOCKTAG_PAGE:
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);

View File

@ -57,6 +57,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation,
LOCKMODE lockmode); LOCKMODE lockmode);
extern int RelationExtensionLockWaiterCount(Relation relation); extern int RelationExtensionLockWaiterCount(Relation relation);
/* Lock to recompute pg_database.datfrozenxid in the current database */
extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
/* Lock a page (currently only used within indexes) */ /* Lock a page (currently only used within indexes) */
extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);

View File

@ -141,6 +141,8 @@ typedef enum LockTagType
/* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */ /* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */
LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */
/* same ID info as RELATION */ /* same ID info as RELATION */
LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */
/* ID info for frozen IDs is DB OID */
LOCKTAG_PAGE, /* one page of a relation */ LOCKTAG_PAGE, /* one page of a relation */
/* ID info for a page is RELATION info + BlockNumber */ /* ID info for a page is RELATION info + BlockNumber */
LOCKTAG_TUPLE, /* one physical tuple */ LOCKTAG_TUPLE, /* one physical tuple */
@ -206,6 +208,14 @@ typedef struct LOCKTAG
(locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
/*
 * SET_LOCKTAG_DATABASE_FROZEN_IDS
 *
 * Fill in "locktag" so that it identifies the frozen-IDs lock of the
 * database whose OID is "dboid".  Only locktag_field1 carries information
 * (the database OID); fields 2 through 4 are zeroed.  The tag type is set to
 * LOCKTAG_DATABASE_FROZEN_IDS and the lock method to DEFAULT_LOCKMETHOD.
 *
 * "locktag" is expanded several times, so pass a plain lvalue without side
 * effects.
 */
#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
((locktag).locktag_field1 = (dboid), \
(locktag).locktag_field2 = 0, \
(locktag).locktag_field3 = 0, \
(locktag).locktag_field4 = 0, \
(locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
(locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
((locktag).locktag_field1 = (dboid), \ ((locktag).locktag_field1 = (dboid), \
(locktag).locktag_field2 = (reloid), \ (locktag).locktag_field2 = (reloid), \