
Use an MVCC snapshot, rather than SnapshotNow, for catalog scans.

SnapshotNow scans have the undesirable property that, in the face of
concurrent updates, the scan can fail to see either the old or the new
versions of the row.  In many cases, we work around this by requiring
DDL operations to hold AccessExclusiveLock on the object being
modified; in some cases, the existing locking is inadequate and random
failures occur as a result.  This commit doesn't change anything
related to locking, but will hopefully pave the way to allowing lock
strength reductions in the future.

The major issue that has held us back from making this change in the past
is that taking an MVCC snapshot is significantly more expensive than
using a static special snapshot such as SnapshotNow.  However, testing
of various worst-case scenarios reveals that this problem is not
severe except under fairly extreme workloads.  To mitigate those
problems, we avoid retaking the MVCC snapshot for each new scan;
instead, we take a new snapshot only when invalidation messages have
been processed.  The catcache machinery already requires that
invalidation messages be sent before releasing the related heavyweight
lock; else other backends might rely on locally-cached data rather
than scanning the catalog at all.  Thus, making snapshot reuse
dependent on the same guarantees shouldn't break anything that wasn't
already subtly broken.
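
To make the reuse rule concrete, here is a small standalone C model of the policy described above (illustrative only, not code from this patch): the cached catalog snapshot is handed out repeatedly, and a fresh snapshot is taken only after an invalidation has been processed. The names get_catalog_snapshot and invalidate_catalog_snapshot are stand-ins for the GetCatalogSnapshot() and InvalidateCatalogSnapshot() functions this patch introduces.

/*
 * Toy model of catalog-snapshot reuse; compiles as an ordinary C program.
 */
#include <stdio.h>
#include <stdbool.h>

typedef struct
{
	int		stand_in_xmin;		/* stand-in for the fields of SnapshotData */
} Snapshot;

static Snapshot CatalogSnapshot;
static bool CatalogSnapshotValid = false;
static int	SnapshotsTaken = 0;

/* Stand-in for the work of computing a brand-new MVCC snapshot. */
static Snapshot
take_new_snapshot(void)
{
	Snapshot	snap;

	snap.stand_in_xmin = ++SnapshotsTaken;
	return snap;
}

/* Analogue of GetCatalogSnapshot(): reuse the cached snapshot unless stale. */
static Snapshot
get_catalog_snapshot(void)
{
	if (!CatalogSnapshotValid)
	{
		CatalogSnapshot = take_new_snapshot();
		CatalogSnapshotValid = true;
	}
	return CatalogSnapshot;
}

/* Analogue of InvalidateCatalogSnapshot(): called when invalidations arrive. */
static void
invalidate_catalog_snapshot(void)
{
	CatalogSnapshotValid = false;
}

int
main(void)
{
	get_catalog_snapshot();			/* takes snapshot #1 */
	get_catalog_snapshot();			/* reused; no new snapshot taken */
	invalidate_catalog_snapshot();	/* an invalidation message was processed */
	get_catalog_snapshot();			/* stale, so snapshot #2 is taken */
	printf("snapshots taken: %d\n", SnapshotsTaken);	/* prints 2 */
	return 0;
}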

Patch by me.  Review by Michael Paquier and Andres Freund.
Author: Robert Haas
Date:   2013-07-02 09:47:01 -04:00
Parent: 384f933046
Commit: 568d4138c6

69 changed files with 617 additions and 353 deletions

src/backend/utils/cache/catcache.c

@ -1182,7 +1182,7 @@ SearchCatCache(CatCache *cache,
scandesc = systable_beginscan(relation,
cache->cc_indexoid,
IndexScanOK(cache, cur_skey),
-SnapshotNow,
+NULL,
cache->cc_nkeys,
cur_skey);
@ -1461,7 +1461,7 @@ SearchCatCacheList(CatCache *cache,
scandesc = systable_beginscan(relation,
cache->cc_indexoid,
IndexScanOK(cache, cur_skey),
-SnapshotNow,
+NULL,
nkeys,
cur_skey);
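
For callers, the visible API change is the snapshot argument: a NULL snapshot now tells systable_beginscan() to fall back to the catalog MVCC snapshot machinery (GetCatalogSnapshot; the fallback lives in systable_beginscan() itself in genam.c, which is not shown in this excerpt) instead of SnapshotNow. Below is a sketch of what a typical catalog scan looks like after this commit; it compiles only inside the backend source tree, and the helper name example_fetch_pg_class_tuple is hypothetical, not part of the patch.

/*
 * Sketch only: fetch a pg_class tuple by OID, passing NULL for the snapshot
 * so the scan uses the shared catalog MVCC snapshot.
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/skey.h"
#include "access/sysattr.h"
#include "catalog/indexing.h"
#include "catalog/pg_class.h"
#include "storage/lock.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"

static HeapTuple
example_fetch_pg_class_tuple(Oid relid)
{
	Relation	rel;
	SysScanDesc scan;
	ScanKeyData key;
	HeapTuple	tup;
	HeapTuple	result = NULL;

	ScanKeyInit(&key,
				ObjectIdAttributeNumber,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(relid));

	rel = heap_open(RelationRelationId, AccessShareLock);

	/* NULL snapshot: use the cached catalog MVCC snapshot, not SnapshotNow */
	scan = systable_beginscan(rel, ClassOidIndexId, true,
							  NULL, 1, &key);

	tup = systable_getnext(scan);
	if (HeapTupleIsValid(tup))
		result = heap_copytuple(tup);	/* copy out before ending the scan */

	systable_endscan(scan);
	heap_close(rel, AccessShareLock);

	return result;
}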

src/backend/utils/cache/evtcache.c

@ -129,13 +129,11 @@ BuildEventTriggerCache(void)
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
/*
-* Prepare to scan pg_event_trigger in name order. We use an MVCC
-* snapshot to avoid getting inconsistent results if the table is being
-* concurrently updated.
+* Prepare to scan pg_event_trigger in name order.
*/
rel = relation_open(EventTriggerRelationId, AccessShareLock);
irel = index_open(EventTriggerNameIndexId, AccessShareLock);
-scan = systable_beginscan_ordered(rel, irel, GetLatestSnapshot(), 0, NULL);
+scan = systable_beginscan_ordered(rel, irel, NULL, 0, NULL);
/*
* Build a cache item for each pg_event_trigger tuple, and append each one

src/backend/utils/cache/inval.c

@ -9,8 +9,8 @@
* consider that it is *still valid* so long as we are in the same command,
* ie, until the next CommandCounterIncrement() or transaction commit.
* (See utils/time/tqual.c, and note that system catalogs are generally
-* scanned under SnapshotNow rules by the system, or plain user snapshots
-* for user queries.) At the command boundary, the old tuple stops
+* scanned under the most current snapshot available, rather than the
+* transaction snapshot.) At the command boundary, the old tuple stops
* being valid and the new version, if any, becomes valid. Therefore,
* we cannot simply flush a tuple from the system caches during heap_update()
* or heap_delete(). The tuple is still good at that point; what's more,
@ -106,6 +106,7 @@
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
@ -372,6 +373,29 @@ AddRelcacheInvalidationMessage(InvalidationListHeader *hdr,
AddInvalidationMessage(&hdr->rclist, &msg);
}
/*
* Add a snapshot inval entry
*/
static void
AddSnapshotInvalidationMessage(InvalidationListHeader *hdr,
Oid dbId, Oid relId)
{
SharedInvalidationMessage msg;
/* Don't add a duplicate item */
/* We assume dbId need not be checked because it will never change */
ProcessMessageList(hdr->rclist,
if (msg->sn.id == SHAREDINVALSNAPSHOT_ID &&
msg->sn.relId == relId)
return);
/* OK, add the item */
msg.sn.id = SHAREDINVALSNAPSHOT_ID;
msg.sn.dbId = dbId;
msg.sn.relId = relId;
AddInvalidationMessage(&hdr->rclist, &msg);
}
/*
* Append one list of invalidation messages to another, resetting
* the source list to empty.
@ -468,6 +492,19 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
transInvalInfo->RelcacheInitFileInval = true;
}
/*
* RegisterSnapshotInvalidation
*
* Register an invalidation event for MVCC scans against a given catalog.
* Only needed for catalogs that don't have catcaches.
*/
static void
RegisterSnapshotInvalidation(Oid dbId, Oid relId)
{
AddSnapshotInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs,
dbId, relId);
}
/*
* LocalExecuteInvalidationMessage
*
@ -482,6 +519,8 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
{
if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == InvalidOid)
{
InvalidateCatalogSnapshot();
CatalogCacheIdInvalidate(msg->cc.id, msg->cc.hashValue);
CallSyscacheCallbacks(msg->cc.id, msg->cc.hashValue);
@ -491,6 +530,8 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
{
if (msg->cat.dbId == MyDatabaseId || msg->cat.dbId == InvalidOid)
{
InvalidateCatalogSnapshot();
CatalogCacheFlushCatalog(msg->cat.catId);
/* CatalogCacheFlushCatalog calls CallSyscacheCallbacks as needed */
@ -532,6 +573,14 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
else if (msg->rm.dbId == MyDatabaseId)
RelationMapInvalidate(false);
}
else if (msg->id == SHAREDINVALSNAPSHOT_ID)
{
/* We only care about our own database and shared catalogs */
if (msg->rm.dbId == InvalidOid)
InvalidateCatalogSnapshot();
else if (msg->rm.dbId == MyDatabaseId)
InvalidateCatalogSnapshot();
}
else
elog(FATAL, "unrecognized SI message ID: %d", msg->id);
}
@ -552,6 +601,7 @@ InvalidateSystemCaches(void)
{
int i;
InvalidateCatalogSnapshot();
ResetCatalogCaches();
RelationCacheInvalidate(); /* gets smgr and relmap too */
@ -1006,8 +1056,15 @@ CacheInvalidateHeapTuple(Relation relation,
/*
* First let the catcache do its thing
*/
-PrepareToInvalidateCacheTuple(relation, tuple, newtuple,
-RegisterCatcacheInvalidation);
+tupleRelId = RelationGetRelid(relation);
+if (RelationInvalidatesSnapshotsOnly(tupleRelId))
+{
+databaseId = IsSharedRelation(tupleRelId) ? InvalidOid : MyDatabaseId;
+RegisterSnapshotInvalidation(databaseId, tupleRelId);
+}
+else
+PrepareToInvalidateCacheTuple(relation, tuple, newtuple,
+RegisterCatcacheInvalidation);
/*
* Now, is this tuple one of the primary definers of a relcache entry?
@ -1015,8 +1072,6 @@ CacheInvalidateHeapTuple(Relation relation,
* Note we ignore newtuple here; we assume an update cannot move a tuple
* from being part of one relcache entry to being part of another.
*/
-tupleRelId = RelationGetRelid(relation);
if (tupleRelId == RelationRelationId)
{
Form_pg_class classtup = (Form_pg_class) GETSTRUCT(tuple);

src/backend/utils/cache/relcache.c

@ -265,8 +265,10 @@ static void unlink_initfile(const char *initfilename);
* This is used by RelationBuildDesc to find a pg_class
* tuple matching targetRelId. The caller must hold at least
* AccessShareLock on the target relid to prevent concurrent-update
-* scenarios --- else our SnapshotNow scan might fail to find any
-* version that it thinks is live.
+* scenarios; it isn't guaranteed that all scans used to build the
+* relcache entry will use the same snapshot. If, for example,
+* an attribute were to be added after scanning pg_class and before
+* scanning pg_attribute, relnatts wouldn't match.
*
* NB: the returned tuple has been copied into palloc'd storage
* and must eventually be freed with heap_freetuple.
@ -305,7 +307,7 @@ ScanPgRelation(Oid targetRelId, bool indexOK)
pg_class_desc = heap_open(RelationRelationId, AccessShareLock);
pg_class_scan = systable_beginscan(pg_class_desc, ClassOidIndexId,
indexOK && criticalRelcachesBuilt,
-SnapshotNow,
+NULL,
1, key);
pg_class_tuple = systable_getnext(pg_class_scan);
@ -480,7 +482,7 @@ RelationBuildTupleDesc(Relation relation)
pg_attribute_scan = systable_beginscan(pg_attribute_desc,
AttributeRelidNumIndexId,
criticalRelcachesBuilt,
-SnapshotNow,
+NULL,
2, skey);
/*
@ -663,7 +665,7 @@ RelationBuildRuleLock(Relation relation)
rewrite_tupdesc = RelationGetDescr(rewrite_desc);
rewrite_scan = systable_beginscan(rewrite_desc,
RewriteRelRulenameIndexId,
-true, SnapshotNow,
+true, NULL,
1, &key);
while (HeapTupleIsValid(rewrite_tuple = systable_getnext(rewrite_scan)))
@ -1313,7 +1315,7 @@ LookupOpclassInfo(Oid operatorClassOid,
ObjectIdGetDatum(operatorClassOid));
rel = heap_open(OperatorClassRelationId, AccessShareLock);
scan = systable_beginscan(rel, OpclassOidIndexId, indexOK,
-SnapshotNow, 1, skey);
+NULL, 1, skey);
if (HeapTupleIsValid(htup = systable_getnext(scan)))
{
@ -1348,7 +1350,7 @@ LookupOpclassInfo(Oid operatorClassOid,
ObjectIdGetDatum(opcentry->opcintype));
rel = heap_open(AccessMethodProcedureRelationId, AccessShareLock);
scan = systable_beginscan(rel, AccessMethodProcedureIndexId, indexOK,
-SnapshotNow, 3, skey);
+NULL, 3, skey);
while (HeapTupleIsValid(htup = systable_getnext(scan)))
{
@ -3317,7 +3319,7 @@ AttrDefaultFetch(Relation relation)
adrel = heap_open(AttrDefaultRelationId, AccessShareLock);
adscan = systable_beginscan(adrel, AttrDefaultIndexId, true,
-SnapshotNow, 1, &skey);
+NULL, 1, &skey);
found = 0;
while (HeapTupleIsValid(htup = systable_getnext(adscan)))
@ -3384,7 +3386,7 @@ CheckConstraintFetch(Relation relation)
conrel = heap_open(ConstraintRelationId, AccessShareLock);
conscan = systable_beginscan(conrel, ConstraintRelidIndexId, true,
-SnapshotNow, 1, skey);
+NULL, 1, skey);
while (HeapTupleIsValid(htup = systable_getnext(conscan)))
{
@ -3487,7 +3489,7 @@ RelationGetIndexList(Relation relation)
indrel = heap_open(IndexRelationId, AccessShareLock);
indscan = systable_beginscan(indrel, IndexIndrelidIndexId, true,
-SnapshotNow, 1, &skey);
+NULL, 1, &skey);
while (HeapTupleIsValid(htup = systable_getnext(indscan)))
{
@ -3938,7 +3940,7 @@ RelationGetExclusionInfo(Relation indexRelation,
conrel = heap_open(ConstraintRelationId, AccessShareLock);
conscan = systable_beginscan(conrel, ConstraintRelidIndexId, true,
-SnapshotNow, 1, skey);
+NULL, 1, skey);
found = false;
while (HeapTupleIsValid(htup = systable_getnext(conscan)))

src/backend/utils/cache/syscache.c

@ -33,7 +33,10 @@
#include "catalog/pg_constraint.h"
#include "catalog/pg_conversion.h"
#include "catalog/pg_database.h"
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_default_acl.h"
#include "catalog/pg_depend.h"
#include "catalog/pg_description.h"
#include "catalog/pg_enum.h"
#include "catalog/pg_event_trigger.h"
#include "catalog/pg_foreign_data_wrapper.h"
@ -47,6 +50,10 @@
#include "catalog/pg_proc.h"
#include "catalog/pg_range.h"
#include "catalog/pg_rewrite.h"
#include "catalog/pg_seclabel.h"
#include "catalog/pg_shdepend.h"
#include "catalog/pg_shdescription.h"
#include "catalog/pg_shseclabel.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_tablespace.h"
#include "catalog/pg_ts_config.h"
@ -796,6 +803,10 @@ static CatCache *SysCache[
static int SysCacheSize = lengthof(cacheinfo);
static bool CacheInitialized = false;
static Oid SysCacheRelationOid[lengthof(cacheinfo)];
static int SysCacheRelationOidSize;
static int oid_compare(const void *a, const void *b);
/*
* InitCatalogCache - initialize the caches
@ -809,6 +820,8 @@ void
InitCatalogCache(void)
{
int cacheId;
int i,
j = 0;
Assert(!CacheInitialized);
@ -825,11 +838,23 @@ InitCatalogCache(void)
if (!PointerIsValid(SysCache[cacheId]))
elog(ERROR, "could not initialize cache %u (%d)",
cacheinfo[cacheId].reloid, cacheId);
SysCacheRelationOid[SysCacheRelationOidSize++] =
cacheinfo[cacheId].reloid;
/* see comments for RelationInvalidatesSnapshotsOnly */
Assert(!RelationInvalidatesSnapshotsOnly(cacheinfo[cacheId].reloid));
}
/* Sort and dedup OIDs. */
pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
sizeof(Oid), oid_compare);
for (i = 1; i < SysCacheRelationOidSize; ++i)
if (SysCacheRelationOid[i] != SysCacheRelationOid[j])
SysCacheRelationOid[++j] = SysCacheRelationOid[i];
SysCacheRelationOidSize = j + 1;
CacheInitialized = true;
}
/*
* InitCatalogCachePhase2 - finish initializing the caches
*
@ -1113,3 +1138,73 @@ SearchSysCacheList(int cacheId, int nkeys,
return SearchCatCacheList(SysCache[cacheId], nkeys,
key1, key2, key3, key4);
}
/*
* Certain relations that do not have system caches send snapshot invalidation
* messages in lieu of catcache messages. This is for the benefit of
* GetCatalogSnapshot(), which can then reuse its existing MVCC snapshot
* for scanning one of those catalogs, rather than taking a new one, if no
* invalidation has been received.
*
* Relations that have syscaches need not (and must not) be listed here. The
* catcache invalidation messages will also flush the snapshot. If you add a
* syscache for one of these relations, remove it from this list.
*/
bool
RelationInvalidatesSnapshotsOnly(Oid relid)
{
switch (relid)
{
case DbRoleSettingRelationId:
case DependRelationId:
case SharedDependRelationId:
case DescriptionRelationId:
case SharedDescriptionRelationId:
case SecLabelRelationId:
case SharedSecLabelRelationId:
return true;
default:
break;
}
return false;
}
/*
* Test whether a relation has a system cache.
*/
bool
RelationHasSysCache(Oid relid)
{
int low = 0,
high = SysCacheRelationOidSize - 1;
while (low <= high)
{
int middle = low + (high - low) / 2;
if (SysCacheRelationOid[middle] == relid)
return true;
if (SysCacheRelationOid[middle] < relid)
low = middle + 1;
else
high = middle - 1;
}
return false;
}
/*
* OID comparator for pg_qsort
*/
static int
oid_compare(const void *a, const void *b)
{
Oid oa = *((Oid *) a);
Oid ob = *((Oid *) b);
if (oa == ob)
return 0;
return (oa > ob) ? 1 : -1;
}

src/backend/utils/cache/ts_cache.c

@ -484,7 +484,7 @@ lookup_ts_config_cache(Oid cfgId)
maprel = heap_open(TSConfigMapRelationId, AccessShareLock);
mapidx = index_open(TSConfigMapIndexId, AccessShareLock);
mapscan = systable_beginscan_ordered(maprel, mapidx,
-SnapshotNow, 1, &mapskey);
+NULL, 1, &mapskey);
while ((maptup = systable_getnext_ordered(mapscan, ForwardScanDirection)) != NULL)
{

src/backend/utils/cache/typcache.c

@ -1082,12 +1082,7 @@ load_enum_cache_data(TypeCacheEntry *tcache)
items = (EnumItem *) palloc(sizeof(EnumItem) * maxitems);
numitems = 0;
-/*
-* Scan pg_enum for the members of the target enum type. We use a current
-* MVCC snapshot, *not* SnapshotNow, so that we see a consistent set of
-* rows even if someone commits a renumbering of the enum meanwhile. See
-* comments for RenumberEnumType in catalog/pg_enum.c for more info.
-*/
+/* Scan pg_enum for the members of the target enum type. */
ScanKeyInit(&skey,
Anum_pg_enum_enumtypid,
BTEqualStrategyNumber, F_OIDEQ,
@ -1096,7 +1091,7 @@ load_enum_cache_data(TypeCacheEntry *tcache)
enum_rel = heap_open(EnumRelationId, AccessShareLock);
enum_scan = systable_beginscan(enum_rel,
EnumTypIdLabelIndexId,
-true, GetLatestSnapshot(),
+true, NULL,
1, &skey);
while (HeapTupleIsValid(enum_tuple = systable_getnext(enum_scan)))