Mirror of https://github.com/postgres/postgres.git (synced 2025-07-02 09:02:37 +03:00)
Use an MVCC snapshot, rather than SnapshotNow, for catalog scans.
SnapshotNow scans have the undesirable property that, in the face of concurrent updates, the scan can fail to see either the old or the new version of the row. In many cases, we work around this by requiring DDL operations to hold AccessExclusiveLock on the object being modified; in some cases, the existing locking is inadequate and random failures occur as a result. This commit doesn't change anything related to locking, but will hopefully pave the way to allowing lock strength reductions in the future.

The major issue that has held us back from making this change in the past is that taking an MVCC snapshot is significantly more expensive than using a static special snapshot such as SnapshotNow. However, testing of various worst-case scenarios reveals that this problem is not severe except under fairly extreme workloads. To mitigate that problem, we avoid retaking the MVCC snapshot for each new scan; instead, we take a new snapshot only when invalidation messages have been processed. The catcache machinery already requires that invalidation messages be sent before releasing the related heavyweight lock; else other backends might rely on locally-cached data rather than scanning the catalog at all. Thus, making snapshot reuse dependent on the same guarantees shouldn't break anything that wasn't already subtly broken.

Patch by me. Review by Michael Paquier and Andres Freund.
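For orientation, the snapshot-reuse scheme described above boils down to a few lines: keep one cached MVCC snapshot for catalog scans, clear it whenever invalidation messages are processed, and take a fresh one lazily on the next catalog scan. The following is only an illustrative sketch; the function names mirror routines referenced by this patch (GetCatalogSnapshot, InvalidateCatalogSnapshot), but the bodies and signatures are simplified and do not reproduce the actual snapmgr.c code.

#include "postgres.h"
#include "storage/procarray.h"
#include "utils/snapshot.h"

/* Sketch only: a single cached catalog snapshot; NULL means "stale, retake". */
static SnapshotData CatalogSnapshotData;
static Snapshot CatalogSnapshot = NULL;

/* Called while executing sinval messages (see the inval.c hunks below). */
void
InvalidateCatalogSnapshot(void)
{
	CatalogSnapshot = NULL;
}

/* Used by catalog scans in place of SnapshotNow. */
Snapshot
GetCatalogSnapshot(void)
{
	if (CatalogSnapshot == NULL)
	{
		/* No usable cached snapshot: take a fresh MVCC snapshot. */
		CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData);
	}
	return CatalogSnapshot;
}

Because invalidation messages must be sent before the sender releases its heavyweight lock, a backend that has processed no messages since its last catalog scan can safely keep reusing the cached snapshot.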
src/backend/utils/cache/catcache.c

@@ -1182,7 +1182,7 @@ SearchCatCache(CatCache *cache,
 	scandesc = systable_beginscan(relation,
 								  cache->cc_indexoid,
 								  IndexScanOK(cache, cur_skey),
-								  SnapshotNow,
+								  NULL,
 								  cache->cc_nkeys,
 								  cur_skey);
 
@@ -1461,7 +1461,7 @@ SearchCatCacheList(CatCache *cache,
 	scandesc = systable_beginscan(relation,
 								  cache->cc_indexoid,
 								  IndexScanOK(cache, cur_skey),
-								  SnapshotNow,
+								  NULL,
 								  nkeys,
 								  cur_skey);
 
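In both hunks above the snapshot argument of systable_beginscan changes from SnapshotNow to NULL. Under the new convention, a NULL snapshot asks the scan machinery to supply the cached catalog MVCC snapshot itself, roughly along these lines (a sketch of the idea with a hypothetical helper name; the real handling lives in genam.c and is not part of this excerpt):

/*
 * Hypothetical helper illustrating how a NULL snapshot argument could be
 * resolved inside systable_beginscan and systable_beginscan_ordered.
 */
static Snapshot
resolve_scan_snapshot(Snapshot snapshot)
{
	/* NULL means "use the shared catalog MVCC snapshot" */
	if (snapshot == NULL)
		return GetCatalogSnapshot();
	return snapshot;			/* caller supplied an explicit snapshot */
}

The same NULL-for-catalog-snapshot pattern recurs in every call-site change below (evtcache.c, relcache.c, ts_cache.c, typcache.c).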
src/backend/utils/cache/evtcache.c

@@ -129,13 +129,11 @@ BuildEventTriggerCache(void)
 									  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
 	/*
-	 * Prepare to scan pg_event_trigger in name order.  We use an MVCC
-	 * snapshot to avoid getting inconsistent results if the table is being
-	 * concurrently updated.
+	 * Prepare to scan pg_event_trigger in name order.
 	 */
 	rel = relation_open(EventTriggerRelationId, AccessShareLock);
 	irel = index_open(EventTriggerNameIndexId, AccessShareLock);
-	scan = systable_beginscan_ordered(rel, irel, GetLatestSnapshot(), 0, NULL);
+	scan = systable_beginscan_ordered(rel, irel, NULL, 0, NULL);
 
 	/*
 	 * Build a cache item for each pg_event_trigger tuple, and append each one
src/backend/utils/cache/inval.c

@@ -9,8 +9,8 @@
  * consider that it is *still valid* so long as we are in the same command,
  * ie, until the next CommandCounterIncrement() or transaction commit.
  * (See utils/time/tqual.c, and note that system catalogs are generally
- * scanned under SnapshotNow rules by the system, or plain user snapshots
- * for user queries.)  At the command boundary, the old tuple stops
+ * scanned under the most current snapshot available, rather than the
+ * transaction snapshot.)  At the command boundary, the old tuple stops
  * being valid and the new version, if any, becomes valid.  Therefore,
  * we cannot simply flush a tuple from the system caches during heap_update()
  * or heap_delete().  The tuple is still good at that point; what's more,

@@ -106,6 +106,7 @@
 #include "utils/memutils.h"
 #include "utils/rel.h"
 #include "utils/relmapper.h"
+#include "utils/snapmgr.h"
 #include "utils/syscache.h"
 
 

@@ -372,6 +373,29 @@ AddRelcacheInvalidationMessage(InvalidationListHeader *hdr,
 	AddInvalidationMessage(&hdr->rclist, &msg);
 }
 
+/*
+ * Add a snapshot inval entry
+ */
+static void
+AddSnapshotInvalidationMessage(InvalidationListHeader *hdr,
+							   Oid dbId, Oid relId)
+{
+	SharedInvalidationMessage msg;
+
+	/* Don't add a duplicate item */
+	/* We assume dbId need not be checked because it will never change */
+	ProcessMessageList(hdr->rclist,
+					   if (msg->sn.id == SHAREDINVALSNAPSHOT_ID &&
+						   msg->sn.relId == relId)
+					   return);
+
+	/* OK, add the item */
+	msg.sn.id = SHAREDINVALSNAPSHOT_ID;
+	msg.sn.dbId = dbId;
+	msg.sn.relId = relId;
+	AddInvalidationMessage(&hdr->rclist, &msg);
+}
+
 /*
  * Append one list of invalidation messages to another, resetting
  * the source list to empty.

@@ -468,6 +492,19 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
 		transInvalInfo->RelcacheInitFileInval = true;
 }
 
+/*
+ * RegisterSnapshotInvalidation
+ *
+ * Register an invalidation event for MVCC scans against a given catalog.
+ * Only needed for catalogs that don't have catcaches.
+ */
+static void
+RegisterSnapshotInvalidation(Oid dbId, Oid relId)
+{
+	AddSnapshotInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs,
+								   dbId, relId);
+}
+
 /*
  * LocalExecuteInvalidationMessage
  *

@@ -482,6 +519,8 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
 	{
 		if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == InvalidOid)
 		{
+			InvalidateCatalogSnapshot();
+
 			CatalogCacheIdInvalidate(msg->cc.id, msg->cc.hashValue);
 
 			CallSyscacheCallbacks(msg->cc.id, msg->cc.hashValue);

@@ -491,6 +530,8 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
 	{
 		if (msg->cat.dbId == MyDatabaseId || msg->cat.dbId == InvalidOid)
 		{
+			InvalidateCatalogSnapshot();
+
 			CatalogCacheFlushCatalog(msg->cat.catId);
 
 			/* CatalogCacheFlushCatalog calls CallSyscacheCallbacks as needed */

@@ -532,6 +573,14 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
 		else if (msg->rm.dbId == MyDatabaseId)
 			RelationMapInvalidate(false);
 	}
+	else if (msg->id == SHAREDINVALSNAPSHOT_ID)
+	{
+		/* We only care about our own database and shared catalogs */
+		if (msg->rm.dbId == InvalidOid)
+			InvalidateCatalogSnapshot();
+		else if (msg->rm.dbId == MyDatabaseId)
+			InvalidateCatalogSnapshot();
+	}
 	else
 		elog(FATAL, "unrecognized SI message ID: %d", msg->id);
 }

@@ -552,6 +601,7 @@ InvalidateSystemCaches(void)
 {
 	int			i;
 
+	InvalidateCatalogSnapshot();
 	ResetCatalogCaches();
 	RelationCacheInvalidate();	/* gets smgr and relmap too */
 

@@ -1006,8 +1056,15 @@ CacheInvalidateHeapTuple(Relation relation,
 	/*
 	 * First let the catcache do its thing
 	 */
-	PrepareToInvalidateCacheTuple(relation, tuple, newtuple,
-								  RegisterCatcacheInvalidation);
+	tupleRelId = RelationGetRelid(relation);
+	if (RelationInvalidatesSnapshotsOnly(tupleRelId))
+	{
+		databaseId = IsSharedRelation(tupleRelId) ? InvalidOid : MyDatabaseId;
+		RegisterSnapshotInvalidation(databaseId, tupleRelId);
+	}
+	else
+		PrepareToInvalidateCacheTuple(relation, tuple, newtuple,
+									  RegisterCatcacheInvalidation);
 
 	/*
 	 * Now, is this tuple one of the primary definers of a relcache entry?

@@ -1015,8 +1072,6 @@ CacheInvalidateHeapTuple(Relation relation,
 	 * Note we ignore newtuple here; we assume an update cannot move a tuple
 	 * from being part of one relcache entry to being part of another.
 	 */
-	tupleRelId = RelationGetRelid(relation);
-
 	if (tupleRelId == RelationRelationId)
 	{
 		Form_pg_class classtup = (Form_pg_class) GETSTRUCT(tuple);
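A note on AddSnapshotInvalidationMessage above: ProcessMessageList is an inval.c macro that takes a code fragment and runs it for each queued message, which is why an if/return appears as an argument. The duplicate check it performs is roughly equivalent to the loop below (a sketch only; the chunked-list field names are assumptions based on inval.c's private InvalidationChunk structure):

/*
 * Rough equivalent of the duplicate check hidden inside
 * ProcessMessageList(hdr->rclist, ...) in AddSnapshotInvalidationMessage.
 */
static bool
snapshot_message_already_queued(InvalidationChunk *list, Oid relId)
{
	InvalidationChunk *chunk;

	for (chunk = list; chunk != NULL; chunk = chunk->next)
	{
		int			i;

		for (i = 0; i < chunk->nitems; i++)
		{
			SharedInvalidationMessage *msg = &chunk->msgs[i];

			if (msg->sn.id == SHAREDINVALSNAPSHOT_ID &&
				msg->sn.relId == relId)
				return true;	/* already queued; no need to add another */
		}
	}
	return false;
}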
src/backend/utils/cache/relcache.c

@@ -265,8 +265,10 @@ static void unlink_initfile(const char *initfilename);
 * This is used by RelationBuildDesc to find a pg_class
 * tuple matching targetRelId.  The caller must hold at least
 * AccessShareLock on the target relid to prevent concurrent-update
- * scenarios --- else our SnapshotNow scan might fail to find any
- * version that it thinks is live.
+ * scenarios; it isn't guaranteed that all scans used to build the
+ * relcache entry will use the same snapshot.  If, for example,
+ * an attribute were to be added after scanning pg_class and before
+ * scanning pg_attribute, relnatts wouldn't match.
 *
 * NB: the returned tuple has been copied into palloc'd storage
 * and must eventually be freed with heap_freetuple.

@@ -305,7 +307,7 @@ ScanPgRelation(Oid targetRelId, bool indexOK)
 	pg_class_desc = heap_open(RelationRelationId, AccessShareLock);
 	pg_class_scan = systable_beginscan(pg_class_desc, ClassOidIndexId,
 									   indexOK && criticalRelcachesBuilt,
-									   SnapshotNow,
+									   NULL,
 									   1, key);
 
 	pg_class_tuple = systable_getnext(pg_class_scan);

@@ -480,7 +482,7 @@ RelationBuildTupleDesc(Relation relation)
 	pg_attribute_scan = systable_beginscan(pg_attribute_desc,
 										   AttributeRelidNumIndexId,
 										   criticalRelcachesBuilt,
-										   SnapshotNow,
+										   NULL,
 										   2, skey);
 
 	/*

@@ -663,7 +665,7 @@ RelationBuildRuleLock(Relation relation)
 	rewrite_tupdesc = RelationGetDescr(rewrite_desc);
 	rewrite_scan = systable_beginscan(rewrite_desc,
 									  RewriteRelRulenameIndexId,
-									  true, SnapshotNow,
+									  true, NULL,
 									  1, &key);
 
 	while (HeapTupleIsValid(rewrite_tuple = systable_getnext(rewrite_scan)))

@@ -1313,7 +1315,7 @@ LookupOpclassInfo(Oid operatorClassOid,
 				ObjectIdGetDatum(operatorClassOid));
 	rel = heap_open(OperatorClassRelationId, AccessShareLock);
 	scan = systable_beginscan(rel, OpclassOidIndexId, indexOK,
-							  SnapshotNow, 1, skey);
+							  NULL, 1, skey);
 
 	if (HeapTupleIsValid(htup = systable_getnext(scan)))
 	{

@@ -1348,7 +1350,7 @@ LookupOpclassInfo(Oid operatorClassOid,
 				ObjectIdGetDatum(opcentry->opcintype));
 	rel = heap_open(AccessMethodProcedureRelationId, AccessShareLock);
 	scan = systable_beginscan(rel, AccessMethodProcedureIndexId, indexOK,
-							  SnapshotNow, 3, skey);
+							  NULL, 3, skey);
 
 	while (HeapTupleIsValid(htup = systable_getnext(scan)))
 	{

@@ -3317,7 +3319,7 @@ AttrDefaultFetch(Relation relation)
 
 	adrel = heap_open(AttrDefaultRelationId, AccessShareLock);
 	adscan = systable_beginscan(adrel, AttrDefaultIndexId, true,
-								SnapshotNow, 1, &skey);
+								NULL, 1, &skey);
 	found = 0;
 
 	while (HeapTupleIsValid(htup = systable_getnext(adscan)))

@@ -3384,7 +3386,7 @@ CheckConstraintFetch(Relation relation)
 
 	conrel = heap_open(ConstraintRelationId, AccessShareLock);
 	conscan = systable_beginscan(conrel, ConstraintRelidIndexId, true,
-								 SnapshotNow, 1, skey);
+								 NULL, 1, skey);
 
 	while (HeapTupleIsValid(htup = systable_getnext(conscan)))
 	{

@@ -3487,7 +3489,7 @@ RelationGetIndexList(Relation relation)
 
 	indrel = heap_open(IndexRelationId, AccessShareLock);
 	indscan = systable_beginscan(indrel, IndexIndrelidIndexId, true,
-								 SnapshotNow, 1, &skey);
+								 NULL, 1, &skey);
 
 	while (HeapTupleIsValid(htup = systable_getnext(indscan)))
 	{

@@ -3938,7 +3940,7 @@ RelationGetExclusionInfo(Relation indexRelation,
 
 	conrel = heap_open(ConstraintRelationId, AccessShareLock);
 	conscan = systable_beginscan(conrel, ConstraintRelidIndexId, true,
-								 SnapshotNow, 1, skey);
+								 NULL, 1, skey);
 	found = false;
 
 	while (HeapTupleIsValid(htup = systable_getnext(conscan)))
src/backend/utils/cache/syscache.c

@@ -33,7 +33,10 @@
 #include "catalog/pg_constraint.h"
 #include "catalog/pg_conversion.h"
 #include "catalog/pg_database.h"
+#include "catalog/pg_db_role_setting.h"
 #include "catalog/pg_default_acl.h"
+#include "catalog/pg_depend.h"
+#include "catalog/pg_description.h"
 #include "catalog/pg_enum.h"
 #include "catalog/pg_event_trigger.h"
 #include "catalog/pg_foreign_data_wrapper.h"

@@ -47,6 +50,10 @@
 #include "catalog/pg_proc.h"
 #include "catalog/pg_range.h"
 #include "catalog/pg_rewrite.h"
+#include "catalog/pg_seclabel.h"
+#include "catalog/pg_shdepend.h"
+#include "catalog/pg_shdescription.h"
+#include "catalog/pg_shseclabel.h"
 #include "catalog/pg_statistic.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_ts_config.h"

@@ -796,6 +803,10 @@ static CatCache *SysCache[
 static int	SysCacheSize = lengthof(cacheinfo);
 static bool CacheInitialized = false;
 
+static Oid	SysCacheRelationOid[lengthof(cacheinfo)];
+static int	SysCacheRelationOidSize;
+
+static int	oid_compare(const void *a, const void *b);
 
 /*
  * InitCatalogCache - initialize the caches

@@ -809,6 +820,8 @@ void
 InitCatalogCache(void)
 {
 	int			cacheId;
+	int			i,
+				j = 0;
 
 	Assert(!CacheInitialized);
 

@@ -825,11 +838,23 @@ InitCatalogCache(void)
 		if (!PointerIsValid(SysCache[cacheId]))
 			elog(ERROR, "could not initialize cache %u (%d)",
 				 cacheinfo[cacheId].reloid, cacheId);
+		SysCacheRelationOid[SysCacheRelationOidSize++] =
+			cacheinfo[cacheId].reloid;
+		/* see comments for RelationInvalidatesSnapshotsOnly */
+		Assert(!RelationInvalidatesSnapshotsOnly(cacheinfo[cacheId].reloid));
 	}
 
+	/* Sort and dedup OIDs. */
+	pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
+			 sizeof(Oid), oid_compare);
+	for (i = 1; i < SysCacheRelationOidSize; ++i)
+		if (SysCacheRelationOid[i] != SysCacheRelationOid[j])
+			SysCacheRelationOid[++j] = SysCacheRelationOid[i];
+	SysCacheRelationOidSize = j + 1;
+
 	CacheInitialized = true;
 }
 
 /*
  * InitCatalogCachePhase2 - finish initializing the caches
  *

@@ -1113,3 +1138,73 @@ SearchSysCacheList(int cacheId, int nkeys,
 	return SearchCatCacheList(SysCache[cacheId], nkeys,
 							  key1, key2, key3, key4);
 }
+
+/*
+ * Certain relations that do not have system caches send snapshot invalidation
+ * messages in lieu of catcache messages.  This is for the benefit of
+ * GetCatalogSnapshot(), which can then reuse its existing MVCC snapshot
+ * for scanning one of those catalogs, rather than taking a new one, if no
+ * invalidation has been received.
+ *
+ * Relations that have syscaches need not (and must not) be listed here.  The
+ * catcache invalidation messages will also flush the snapshot.  If you add a
+ * syscache for one of these relations, remove it from this list.
+ */
+bool
+RelationInvalidatesSnapshotsOnly(Oid relid)
+{
+	switch (relid)
+	{
+		case DbRoleSettingRelationId:
+		case DependRelationId:
+		case SharedDependRelationId:
+		case DescriptionRelationId:
+		case SharedDescriptionRelationId:
+		case SecLabelRelationId:
+		case SharedSecLabelRelationId:
+			return true;
+		default:
+			break;
+	}
+
+	return false;
+}
+
+/*
+ * Test whether a relation has a system cache.
+ */
+bool
+RelationHasSysCache(Oid relid)
+{
+	int			low = 0,
+				high = SysCacheRelationOidSize - 1;
+
+	while (low <= high)
+	{
+		int			middle = low + (high - low) / 2;
+
+		if (SysCacheRelationOid[middle] == relid)
+			return true;
+		if (SysCacheRelationOid[middle] < relid)
+			low = middle + 1;
+		else
+			high = middle - 1;
+	}
+
+	return false;
+}
+
+
+/*
+ * OID comparator for pg_qsort
+ */
+static int
+oid_compare(const void *a, const void *b)
+{
+	Oid			oa = *((Oid *) a);
+	Oid			ob = *((Oid *) b);
+
+	if (oa == ob)
+		return 0;
+	return (oa > ob) ? 1 : -1;
+}
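InitCatalogCache above records each cache's catalog OID in SysCacheRelationOid, then sorts and de-duplicates the array so that RelationHasSysCache can answer membership queries with a binary search. The sort-and-compact idiom is easy to miss on first read; below is a small standalone illustration of the same loop, using example OIDs and the C library's qsort in place of pg_qsort:

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int Oid;

static int
oid_compare(const void *a, const void *b)
{
	Oid			oa = *((const Oid *) a);
	Oid			ob = *((const Oid *) b);

	if (oa == ob)
		return 0;
	return (oa > ob) ? 1 : -1;
}

int
main(void)
{
	/* example OIDs with duplicates, as the cacheinfo[] scan might produce */
	Oid			oids[] = {2617, 1255, 2617, 1259, 1255, 1259};
	int			n = 6;
	int			i,
				j = 0;

	qsort(oids, n, sizeof(Oid), oid_compare);

	/* compact in place: keep the first copy of each distinct OID */
	for (i = 1; i < n; ++i)
		if (oids[i] != oids[j])
			oids[++j] = oids[i];
	n = j + 1;

	for (i = 0; i < n; i++)
		printf("%u\n", oids[i]);	/* prints 1255, 1259, 2617, one per line */
	return 0;
}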
src/backend/utils/cache/ts_cache.c

@@ -484,7 +484,7 @@ lookup_ts_config_cache(Oid cfgId)
 		maprel = heap_open(TSConfigMapRelationId, AccessShareLock);
 		mapidx = index_open(TSConfigMapIndexId, AccessShareLock);
 		mapscan = systable_beginscan_ordered(maprel, mapidx,
-											 SnapshotNow, 1, &mapskey);
+											 NULL, 1, &mapskey);
 
 		while ((maptup = systable_getnext_ordered(mapscan, ForwardScanDirection)) != NULL)
 		{
src/backend/utils/cache/typcache.c

@@ -1082,12 +1082,7 @@ load_enum_cache_data(TypeCacheEntry *tcache)
 	items = (EnumItem *) palloc(sizeof(EnumItem) * maxitems);
 	numitems = 0;
 
-	/*
-	 * Scan pg_enum for the members of the target enum type.  We use a current
-	 * MVCC snapshot, *not* SnapshotNow, so that we see a consistent set of
-	 * rows even if someone commits a renumbering of the enum meanwhile.  See
-	 * comments for RenumberEnumType in catalog/pg_enum.c for more info.
-	 */
+	/* Scan pg_enum for the members of the target enum type. */
 	ScanKeyInit(&skey,
 				Anum_pg_enum_enumtypid,
 				BTEqualStrategyNumber, F_OIDEQ,

@@ -1096,7 +1091,7 @@ load_enum_cache_data(TypeCacheEntry *tcache)
 	enum_rel = heap_open(EnumRelationId, AccessShareLock);
 	enum_scan = systable_beginscan(enum_rel,
 								   EnumTypIdLabelIndexId,
-								   true, GetLatestSnapshot(),
+								   true, NULL,
 								   1, &skey);
 
 	while (HeapTupleIsValid(enum_tuple = systable_getnext(enum_scan)))