mirror of
https://github.com/postgres/postgres.git
synced 2025-07-28 23:42:10 +03:00
snapshot scalability: Don't compute global horizons while building snapshots.
To make GetSnapshotData() more scalable, it cannot look at each proc's xmin: While snapshot contents do not need to change whenever a read-only transaction commits or a snapshot is released, a proc's xmin is modified in those cases. The frequency of xmin modifications leads to, particularly on higher core count systems, many cache misses inside GetSnapshotData(), despite the data underlying a snapshot not changing. That is the most significant source of GetSnapshotData() scaling poorly on larger systems. Without accessing xmins, GetSnapshotData() cannot calculate accurate horizons / thresholds as it has so far. But we don't really have to: The horizons don't actually change that much between GetSnapshotData() calls. Nor are the horizons actually used every time a snapshot is built. The trick this commit introduces is to delay computation of accurate horizons until their use, and to use horizon boundaries to determine whether accurate horizons need to be computed. The use of RecentGlobal[Data]Xmin to decide whether a row version could be removed has been replaced with new GlobalVisTest* functions. These use two thresholds to determine whether a row can be pruned: 1) definitely_needed, indicating that rows deleted by XIDs >= definitely_needed are definitely still visible. 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can definitely be removed. GetSnapshotData() updates definitely_needed to be the xmin of the computed snapshot. When testing whether a row can be removed (with GlobalVisTestIsRemovableXid()) and the tested XID falls in between the two (i.e. XID >= maybe_needed && XID < definitely_needed) the boundaries can be recomputed to be more accurate. As it is not cheap to compute accurate boundaries, we limit the number of times that happens in short succession.
As the boundaries used by GlobalVisTestIsRemovableXid() are never reset (with maybe_needed updated by GetSnapshotData()), it is likely that further tests can benefit from an earlier computation of accurate horizons. To avoid regressing performance when old_snapshot_threshold is set (as that requires an accurate horizon to be computed), heap_page_prune_opt() doesn't unconditionally call TransactionIdLimitedForOldSnapshots() anymore. Both the computation of the limited horizon, and the triggering of errors (with SetOldSnapshotThresholdTimestamp()) are now only done when necessary to remove tuples. This commit just removes the accesses to PGXACT->xmin from GetSnapshotData(), but other members of PGXACT residing in the same cache line are accessed. Therefore this in itself does not result in a significant improvement. Subsequent commits will take advantage of the fact that GetSnapshotData() now does not need to access xmins anymore. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. Author: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Reviewed-By: Thomas Munro <thomas.munro@gmail.com> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
This commit is contained in:
@ -23,12 +23,30 @@
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "utils/snapmgr.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/snapmgr.h"
|
||||
|
||||
/* Working data for heap_page_prune and subroutines */
|
||||
typedef struct
|
||||
{
|
||||
Relation rel;
|
||||
|
||||
/* tuple visibility test, initialized for the relation */
|
||||
GlobalVisState *vistest;
|
||||
|
||||
/*
|
||||
* Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
|
||||
* been computed (done on demand, and only if
|
||||
* OldSnapshotThresholdActive()). The first time a tuple is about to be
|
||||
* removed based on the limited horizon, old_snap_used is set to true, and
|
||||
* SetOldSnapshotThresholdTimestamp() is called. See
|
||||
* heap_prune_satisfies_vacuum().
|
||||
*/
|
||||
TimestampTz old_snap_ts;
|
||||
TransactionId old_snap_xmin;
|
||||
bool old_snap_used;
|
||||
|
||||
TransactionId new_prune_xid; /* new prune hint value for page */
|
||||
TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
|
||||
int nredirected; /* numbers of entries in arrays below */
|
||||
@ -43,9 +61,8 @@ typedef struct
|
||||
} PruneState;
|
||||
|
||||
/* Local functions */
|
||||
static int heap_prune_chain(Relation relation, Buffer buffer,
|
||||
static int heap_prune_chain(Buffer buffer,
|
||||
OffsetNumber rootoffnum,
|
||||
TransactionId OldestXmin,
|
||||
PruneState *prstate);
|
||||
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
|
||||
static void heap_prune_record_redirect(PruneState *prstate,
|
||||
@ -65,16 +82,16 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
|
||||
* if there's not any use in pruning.
|
||||
*
|
||||
* Caller must have pin on the buffer, and must *not* have a lock on it.
|
||||
*
|
||||
* OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
|
||||
* or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
|
||||
*/
|
||||
void
|
||||
heap_page_prune_opt(Relation relation, Buffer buffer)
|
||||
{
|
||||
Page page = BufferGetPage(buffer);
|
||||
TransactionId prune_xid;
|
||||
GlobalVisState *vistest;
|
||||
TransactionId limited_xmin = InvalidTransactionId;
|
||||
TimestampTz limited_ts = 0;
|
||||
Size minfree;
|
||||
TransactionId OldestXmin;
|
||||
|
||||
/*
|
||||
* We can't write WAL in recovery mode, so there's no point trying to
|
||||
@ -85,37 +102,55 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Use the appropriate xmin horizon for this relation. If it's a proper
|
||||
* catalog relation or a user defined, additional, catalog relation, we
|
||||
* need to use the horizon that includes slots, otherwise the data-only
|
||||
* horizon can be used. Note that the toast relation of user defined
|
||||
* relations are *not* considered catalog relations.
|
||||
* XXX: Magic to keep old_snapshot_threshold tests appear "working". They
|
||||
* currently are broken, and discussion of what to do about them is
|
||||
* ongoing. See
|
||||
* https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
|
||||
*/
|
||||
if (old_snapshot_threshold == 0)
|
||||
SnapshotTooOldMagicForTest();
|
||||
|
||||
/*
|
||||
* First check whether there's any chance there's something to prune,
|
||||
* determining the appropriate horizon is a waste if there's no prune_xid
|
||||
* (i.e. no updates/deletes left potentially dead tuples around).
|
||||
*/
|
||||
prune_xid = ((PageHeader) page)->pd_prune_xid;
|
||||
if (!TransactionIdIsValid(prune_xid))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Check whether prune_xid indicates that there may be dead rows that can
|
||||
* be cleaned up.
|
||||
*
|
||||
* It is OK to apply the old snapshot limit before acquiring the cleanup
|
||||
* It is OK to check the old snapshot limit before acquiring the cleanup
|
||||
* lock because the worst that can happen is that we are not quite as
|
||||
* aggressive about the cleanup (by however many transaction IDs are
|
||||
* consumed between this point and acquiring the lock). This allows us to
|
||||
* save significant overhead in the case where the page is found not to be
|
||||
* prunable.
|
||||
*/
|
||||
if (IsCatalogRelation(relation) ||
|
||||
RelationIsAccessibleInLogicalDecoding(relation))
|
||||
OldestXmin = RecentGlobalXmin;
|
||||
else
|
||||
OldestXmin =
|
||||
TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin,
|
||||
relation);
|
||||
|
||||
Assert(TransactionIdIsValid(OldestXmin));
|
||||
|
||||
/*
|
||||
* Let's see if we really need pruning.
|
||||
*
|
||||
* Forget it if page is not hinted to contain something prunable that's
|
||||
* older than OldestXmin.
|
||||
* Even if old_snapshot_threshold is set, we first check whether the page
|
||||
* can be pruned without. Both because
|
||||
* TransactionIdLimitedForOldSnapshots() is not cheap, and because not
|
||||
* unnecessarily relying on old_snapshot_threshold avoids causing
|
||||
* conflicts.
|
||||
*/
|
||||
if (!PageIsPrunable(page, OldestXmin))
|
||||
return;
|
||||
vistest = GlobalVisTestFor(relation);
|
||||
|
||||
if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
|
||||
{
|
||||
if (!OldSnapshotThresholdActive())
|
||||
return;
|
||||
|
||||
if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest),
|
||||
relation,
|
||||
&limited_xmin, &limited_ts))
|
||||
return;
|
||||
|
||||
if (!TransactionIdPrecedes(prune_xid, limited_xmin))
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We prune when a previous UPDATE failed to find enough space on the page
|
||||
@ -151,7 +186,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
|
||||
* needed */
|
||||
|
||||
/* OK to prune */
|
||||
(void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore);
|
||||
(void) heap_page_prune(relation, buffer, vistest,
|
||||
limited_xmin, limited_ts,
|
||||
true, &ignore);
|
||||
}
|
||||
|
||||
/* And release buffer lock */
|
||||
@ -165,8 +202,11 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
|
||||
*
|
||||
* Caller must have pin and buffer cleanup lock on the page.
|
||||
*
|
||||
* OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
|
||||
* or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
|
||||
* vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
|
||||
* (see heap_prune_satisfies_vacuum and
|
||||
* HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to
|
||||
* either have been set by TransactionIdLimitedForOldSnapshots, or
|
||||
* InvalidTransactionId/0 respectively.
|
||||
*
|
||||
* If report_stats is true then we send the number of reclaimed heap-only
|
||||
* tuples to pgstats. (This must be false during vacuum, since vacuum will
|
||||
@ -177,7 +217,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
|
||||
* latestRemovedXid.
|
||||
*/
|
||||
int
|
||||
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
|
||||
heap_page_prune(Relation relation, Buffer buffer,
|
||||
GlobalVisState *vistest,
|
||||
TransactionId old_snap_xmin,
|
||||
TimestampTz old_snap_ts,
|
||||
bool report_stats, TransactionId *latestRemovedXid)
|
||||
{
|
||||
int ndeleted = 0;
|
||||
@ -198,6 +241,11 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
|
||||
* initialize the rest of our working state.
|
||||
*/
|
||||
prstate.new_prune_xid = InvalidTransactionId;
|
||||
prstate.rel = relation;
|
||||
prstate.vistest = vistest;
|
||||
prstate.old_snap_xmin = old_snap_xmin;
|
||||
prstate.old_snap_ts = old_snap_ts;
|
||||
prstate.old_snap_used = false;
|
||||
prstate.latestRemovedXid = *latestRemovedXid;
|
||||
prstate.nredirected = prstate.ndead = prstate.nunused = 0;
|
||||
memset(prstate.marked, 0, sizeof(prstate.marked));
|
||||
@ -220,9 +268,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
|
||||
continue;
|
||||
|
||||
/* Process this item or chain of items */
|
||||
ndeleted += heap_prune_chain(relation, buffer, offnum,
|
||||
OldestXmin,
|
||||
&prstate);
|
||||
ndeleted += heap_prune_chain(buffer, offnum, &prstate);
|
||||
}
|
||||
|
||||
/* Any error while applying the changes is critical */
|
||||
@ -323,6 +369,85 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Perform visibility checks for heap pruning.
|
||||
*
|
||||
* This is more complicated than just using GlobalVisTestIsRemovableXid()
|
||||
* because of old_snapshot_threshold. We only want to increase the threshold
|
||||
* that triggers errors for old snapshots when we actually decide to remove a
|
||||
* row based on the limited horizon.
|
||||
*
|
||||
* Due to its cost we also only want to call
|
||||
* TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
|
||||
* done so in heap_page_prune_opt() if pd_prune_xid was old enough. But we
|
||||
* still want to be able to remove rows that are too new to be removed
|
||||
* according to prstate->vistest, but that can be removed based on
|
||||
* old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on
|
||||
* demand in here, if appropriate.
*
* Returns the result of HeapTupleSatisfiesVacuumHorizon() unchanged, except
* that HEAPTUPLE_RECENTLY_DEAD is upgraded to HEAPTUPLE_DEAD when one of the
* horizon checks below passes. May update prstate->old_snap_xmin,
* prstate->old_snap_ts and prstate->old_snap_used as a side effect.
|
||||
*/
|
||||
static HTSV_Result
|
||||
heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
|
||||
{
|
||||
HTSV_Result res;
|
||||
TransactionId dead_after;
|
||||
|
||||
res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
|
||||
|
||||
/* Only a RECENTLY_DEAD result is refined further; dead_after is used below. */
if (res != HEAPTUPLE_RECENTLY_DEAD)
|
||||
return res;
|
||||
|
||||
/*
|
||||
* If we are already relying on the limited xmin, there is no need to
|
||||
* delay doing so anymore; compare against it directly and return.
|
||||
*/
|
||||
if (prstate->old_snap_used)
|
||||
{
|
||||
Assert(TransactionIdIsValid(prstate->old_snap_xmin));
|
||||
|
||||
if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
|
||||
res = HEAPTUPLE_DEAD;
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
* First check if GlobalVisTestIsRemovableXid() is sufficient to find the
|
||||
* row dead. If not, and old_snapshot_threshold is enabled, try to use the
|
||||
* lowered horizon.
|
||||
*/
|
||||
if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
|
||||
res = HEAPTUPLE_DEAD;
|
||||
else if (OldSnapshotThresholdActive())
|
||||
{
|
||||
/* haven't determined the limited horizon yet, request it now */
|
||||
if (!TransactionIdIsValid(prstate->old_snap_xmin))
|
||||
{
|
||||
TransactionId horizon =
|
||||
GlobalVisTestNonRemovableHorizon(prstate->vistest);
|
||||
|
||||
TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
|
||||
&prstate->old_snap_xmin,
|
||||
&prstate->old_snap_ts);
|
||||
}
|
||||
|
||||
if (TransactionIdIsValid(prstate->old_snap_xmin) &&
|
||||
TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
|
||||
{
|
||||
/*
|
||||
* About to remove row based on snapshot_too_old. Need to raise
|
||||
* the threshold so problematic accesses would error. This is
|
||||
* done once per prune; later calls take the old_snap_used path.
|
||||
*/
|
||||
Assert(!prstate->old_snap_used);
|
||||
SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
|
||||
prstate->old_snap_xmin);
|
||||
prstate->old_snap_used = true;
|
||||
res = HEAPTUPLE_DEAD;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Prune specified line pointer or a HOT chain originating at line pointer.
|
||||
*
|
||||
@ -349,9 +474,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
|
||||
* Returns the number of tuples (to be) deleted from the page.
|
||||
*/
|
||||
static int
|
||||
heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
TransactionId OldestXmin,
|
||||
PruneState *prstate)
|
||||
heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
|
||||
{
|
||||
int ndeleted = 0;
|
||||
Page dp = (Page) BufferGetPage(buffer);
|
||||
@ -366,7 +489,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
i;
|
||||
HeapTupleData tup;
|
||||
|
||||
tup.t_tableOid = RelationGetRelid(relation);
|
||||
tup.t_tableOid = RelationGetRelid(prstate->rel);
|
||||
|
||||
rootlp = PageGetItemId(dp, rootoffnum);
|
||||
|
||||
@ -401,7 +524,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
* either here or while following a chain below. Whichever path
|
||||
* gets there first will mark the tuple unused.
|
||||
*/
|
||||
if (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer)
|
||||
if (heap_prune_satisfies_vacuum(prstate, &tup, buffer)
|
||||
== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
|
||||
{
|
||||
heap_prune_record_unused(prstate, rootoffnum);
|
||||
@ -485,7 +608,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
*/
|
||||
tupdead = recent_dead = false;
|
||||
|
||||
switch (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer))
|
||||
switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer))
|
||||
{
|
||||
case HEAPTUPLE_DEAD:
|
||||
tupdead = true;
|
||||
|
Reference in New Issue
Block a user