mirror of
https://github.com/postgres/postgres.git
synced 2025-12-21 05:21:08 +03:00
The EvalPlanQual recheck for TID Scan wasn't rechecking the TID qual still passed after following update chains. This could result in tuples being updated or deleted by plans using TID Scans where the ctid of the new (updated) tuple no longer matches the clause of the scan. This isn't desired behavior, and isn't consistent with what would happen if the chosen plan had used an Index or Seq Scan, and that could lead to hard to predict behavior for scans that contain TID quals and other quals as the planner has freedom to choose TID or some other scan method for such queries, and the chosen plan could change at any moment. Here we fix this by properly implementing the recheck function for TID Scans. Backpatch to 13, oldest supported version Reported-by: Sophie Alpert <pg@sophiebits.com> Author: Sophie Alpert <pg@sophiebits.com> Author: David Rowley <dgrowleyml@gmail.com> Reviewed-by: David Rowley <dgrowleyml@gmail.com> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Discussion: https://postgr.es/m/4a6268ff-3340-453a-9bf5-c98d51a6f729@app.fastmail.com Backpatch-through: 13
570 lines
14 KiB
C
570 lines
14 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* nodeTidscan.c
|
|
* Routines to support direct tid scans of relations
|
|
*
|
|
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/executor/nodeTidscan.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
/*
|
|
* INTERFACE ROUTINES
|
|
*
|
|
* ExecTidScan scans a relation using tids
|
|
* ExecInitTidScan creates and initializes state info.
|
|
* ExecReScanTidScan rescans the tid relation.
|
|
* ExecEndTidScan releases all storage.
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/sysattr.h"
|
|
#include "access/tableam.h"
|
|
#include "catalog/pg_type.h"
|
|
#include "executor/execdebug.h"
|
|
#include "executor/nodeTidscan.h"
|
|
#include "lib/qunique.h"
|
|
#include "miscadmin.h"
|
|
#include "nodes/nodeFuncs.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "utils/array.h"
|
|
#include "utils/rel.h"
|
|
|
|
|
|
#define IsCTIDVar(node) \
|
|
((node) != NULL && \
|
|
IsA((node), Var) && \
|
|
((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \
|
|
((Var *) (node))->varlevelsup == 0)
|
|
|
|
/* one element in tss_tidexprs */
|
|
typedef struct TidExpr
|
|
{
|
|
ExprState *exprstate; /* ExprState for a TID-yielding subexpr */
|
|
bool isarray; /* if true, it yields tid[] not just tid */
|
|
CurrentOfExpr *cexpr; /* alternatively, we can have CURRENT OF */
|
|
} TidExpr;
|
|
|
|
static void TidExprListCreate(TidScanState *tidstate);
|
|
static void TidListEval(TidScanState *tidstate);
|
|
static int itemptr_comparator(const void *a, const void *b);
|
|
static TupleTableSlot *TidNext(TidScanState *node);
|
|
|
|
|
|
/*
|
|
* Extract the qual subexpressions that yield TIDs to search for,
|
|
* and compile them into ExprStates if they're ordinary expressions.
|
|
*
|
|
* CURRENT OF is a special case that we can't compile usefully;
|
|
* just drop it into the TidExpr list as-is.
|
|
*/
|
|
static void
|
|
TidExprListCreate(TidScanState *tidstate)
|
|
{
|
|
TidScan *node = (TidScan *) tidstate->ss.ps.plan;
|
|
ListCell *l;
|
|
|
|
tidstate->tss_tidexprs = NIL;
|
|
tidstate->tss_isCurrentOf = false;
|
|
|
|
foreach(l, node->tidquals)
|
|
{
|
|
Expr *expr = (Expr *) lfirst(l);
|
|
TidExpr *tidexpr = (TidExpr *) palloc0(sizeof(TidExpr));
|
|
|
|
if (is_opclause(expr))
|
|
{
|
|
Node *arg1;
|
|
Node *arg2;
|
|
|
|
arg1 = get_leftop(expr);
|
|
arg2 = get_rightop(expr);
|
|
if (IsCTIDVar(arg1))
|
|
tidexpr->exprstate = ExecInitExpr((Expr *) arg2,
|
|
&tidstate->ss.ps);
|
|
else if (IsCTIDVar(arg2))
|
|
tidexpr->exprstate = ExecInitExpr((Expr *) arg1,
|
|
&tidstate->ss.ps);
|
|
else
|
|
elog(ERROR, "could not identify CTID variable");
|
|
tidexpr->isarray = false;
|
|
}
|
|
else if (expr && IsA(expr, ScalarArrayOpExpr))
|
|
{
|
|
ScalarArrayOpExpr *saex = (ScalarArrayOpExpr *) expr;
|
|
|
|
Assert(IsCTIDVar(linitial(saex->args)));
|
|
tidexpr->exprstate = ExecInitExpr(lsecond(saex->args),
|
|
&tidstate->ss.ps);
|
|
tidexpr->isarray = true;
|
|
}
|
|
else if (expr && IsA(expr, CurrentOfExpr))
|
|
{
|
|
CurrentOfExpr *cexpr = (CurrentOfExpr *) expr;
|
|
|
|
tidexpr->cexpr = cexpr;
|
|
tidstate->tss_isCurrentOf = true;
|
|
}
|
|
else
|
|
elog(ERROR, "could not identify CTID expression");
|
|
|
|
tidstate->tss_tidexprs = lappend(tidstate->tss_tidexprs, tidexpr);
|
|
}
|
|
|
|
/* CurrentOfExpr could never appear OR'd with something else */
|
|
Assert(list_length(tidstate->tss_tidexprs) == 1 ||
|
|
!tidstate->tss_isCurrentOf);
|
|
}
|
|
|
|
/*
|
|
* Compute the list of TIDs to be visited, by evaluating the expressions
|
|
* for them.
|
|
*
|
|
* (The result is actually an array, not a list.)
|
|
*/
|
|
static void
|
|
TidListEval(TidScanState *tidstate)
|
|
{
|
|
ExprContext *econtext = tidstate->ss.ps.ps_ExprContext;
|
|
TableScanDesc scan;
|
|
ItemPointerData *tidList;
|
|
int numAllocTids;
|
|
int numTids;
|
|
ListCell *l;
|
|
|
|
/*
|
|
* Start scan on-demand - initializing a scan isn't free (e.g. heap stats
|
|
* the size of the table), so it makes sense to delay that until needed -
|
|
* the node might never get executed.
|
|
*/
|
|
if (tidstate->ss.ss_currentScanDesc == NULL)
|
|
tidstate->ss.ss_currentScanDesc =
|
|
table_beginscan_tid(tidstate->ss.ss_currentRelation,
|
|
tidstate->ss.ps.state->es_snapshot);
|
|
scan = tidstate->ss.ss_currentScanDesc;
|
|
|
|
/*
|
|
* We initialize the array with enough slots for the case that all quals
|
|
* are simple OpExprs or CurrentOfExprs. If there are any
|
|
* ScalarArrayOpExprs, we may have to enlarge the array.
|
|
*/
|
|
numAllocTids = list_length(tidstate->tss_tidexprs);
|
|
tidList = (ItemPointerData *)
|
|
palloc(numAllocTids * sizeof(ItemPointerData));
|
|
numTids = 0;
|
|
|
|
foreach(l, tidstate->tss_tidexprs)
|
|
{
|
|
TidExpr *tidexpr = (TidExpr *) lfirst(l);
|
|
ItemPointer itemptr;
|
|
bool isNull;
|
|
|
|
if (tidexpr->exprstate && !tidexpr->isarray)
|
|
{
|
|
itemptr = (ItemPointer)
|
|
DatumGetPointer(ExecEvalExprSwitchContext(tidexpr->exprstate,
|
|
econtext,
|
|
&isNull));
|
|
if (isNull)
|
|
continue;
|
|
|
|
/*
|
|
* We silently discard any TIDs that the AM considers invalid
|
|
* (E.g. for heap, they could be out of range at the time of scan
|
|
* start. Since we hold at least AccessShareLock on the table, it
|
|
* won't be possible for someone to truncate away the blocks we
|
|
* intend to visit.).
|
|
*/
|
|
if (!table_tuple_tid_valid(scan, itemptr))
|
|
continue;
|
|
|
|
if (numTids >= numAllocTids)
|
|
{
|
|
numAllocTids *= 2;
|
|
tidList = (ItemPointerData *)
|
|
repalloc(tidList,
|
|
numAllocTids * sizeof(ItemPointerData));
|
|
}
|
|
tidList[numTids++] = *itemptr;
|
|
}
|
|
else if (tidexpr->exprstate && tidexpr->isarray)
|
|
{
|
|
Datum arraydatum;
|
|
ArrayType *itemarray;
|
|
Datum *ipdatums;
|
|
bool *ipnulls;
|
|
int ndatums;
|
|
int i;
|
|
|
|
arraydatum = ExecEvalExprSwitchContext(tidexpr->exprstate,
|
|
econtext,
|
|
&isNull);
|
|
if (isNull)
|
|
continue;
|
|
itemarray = DatumGetArrayTypeP(arraydatum);
|
|
deconstruct_array(itemarray,
|
|
TIDOID, sizeof(ItemPointerData), false, TYPALIGN_SHORT,
|
|
&ipdatums, &ipnulls, &ndatums);
|
|
if (numTids + ndatums > numAllocTids)
|
|
{
|
|
numAllocTids = numTids + ndatums;
|
|
tidList = (ItemPointerData *)
|
|
repalloc(tidList,
|
|
numAllocTids * sizeof(ItemPointerData));
|
|
}
|
|
for (i = 0; i < ndatums; i++)
|
|
{
|
|
if (ipnulls[i])
|
|
continue;
|
|
|
|
itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]);
|
|
|
|
if (!table_tuple_tid_valid(scan, itemptr))
|
|
continue;
|
|
|
|
tidList[numTids++] = *itemptr;
|
|
}
|
|
pfree(ipdatums);
|
|
pfree(ipnulls);
|
|
}
|
|
else
|
|
{
|
|
ItemPointerData cursor_tid;
|
|
|
|
Assert(tidexpr->cexpr);
|
|
if (execCurrentOf(tidexpr->cexpr, econtext,
|
|
RelationGetRelid(tidstate->ss.ss_currentRelation),
|
|
&cursor_tid))
|
|
{
|
|
if (numTids >= numAllocTids)
|
|
{
|
|
numAllocTids *= 2;
|
|
tidList = (ItemPointerData *)
|
|
repalloc(tidList,
|
|
numAllocTids * sizeof(ItemPointerData));
|
|
}
|
|
tidList[numTids++] = cursor_tid;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Sort the array of TIDs into order, and eliminate duplicates.
|
|
* Eliminating duplicates is necessary since we want OR semantics across
|
|
* the list. Sorting makes it easier to detect duplicates, and as a bonus
|
|
* ensures that we will visit the heap in the most efficient way.
|
|
*/
|
|
if (numTids > 1)
|
|
{
|
|
/* CurrentOfExpr could never appear OR'd with something else */
|
|
Assert(!tidstate->tss_isCurrentOf);
|
|
|
|
qsort((void *) tidList, numTids, sizeof(ItemPointerData),
|
|
itemptr_comparator);
|
|
numTids = qunique(tidList, numTids, sizeof(ItemPointerData),
|
|
itemptr_comparator);
|
|
}
|
|
|
|
tidstate->tss_TidList = tidList;
|
|
tidstate->tss_NumTids = numTids;
|
|
tidstate->tss_TidPtr = -1;
|
|
}
|
|
|
|
/*
|
|
* qsort comparator for ItemPointerData items
|
|
*/
|
|
static int
|
|
itemptr_comparator(const void *a, const void *b)
|
|
{
|
|
const ItemPointerData *ipa = (const ItemPointerData *) a;
|
|
const ItemPointerData *ipb = (const ItemPointerData *) b;
|
|
BlockNumber ba = ItemPointerGetBlockNumber(ipa);
|
|
BlockNumber bb = ItemPointerGetBlockNumber(ipb);
|
|
OffsetNumber oa = ItemPointerGetOffsetNumber(ipa);
|
|
OffsetNumber ob = ItemPointerGetOffsetNumber(ipb);
|
|
|
|
if (ba < bb)
|
|
return -1;
|
|
if (ba > bb)
|
|
return 1;
|
|
if (oa < ob)
|
|
return -1;
|
|
if (oa > ob)
|
|
return 1;
|
|
return 0;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* TidNext
|
|
*
|
|
* Retrieve a tuple from the TidScan node's currentRelation
|
|
* using the tids in the TidScanState information.
|
|
*
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static TupleTableSlot *
|
|
TidNext(TidScanState *node)
|
|
{
|
|
EState *estate;
|
|
ScanDirection direction;
|
|
Snapshot snapshot;
|
|
TableScanDesc scan;
|
|
Relation heapRelation;
|
|
TupleTableSlot *slot;
|
|
ItemPointerData *tidList;
|
|
int numTids;
|
|
bool bBackward;
|
|
|
|
/*
|
|
* extract necessary information from tid scan node
|
|
*/
|
|
estate = node->ss.ps.state;
|
|
direction = estate->es_direction;
|
|
snapshot = estate->es_snapshot;
|
|
heapRelation = node->ss.ss_currentRelation;
|
|
slot = node->ss.ss_ScanTupleSlot;
|
|
|
|
/*
|
|
* First time through, compute the list of TIDs to be visited
|
|
*/
|
|
if (node->tss_TidList == NULL)
|
|
TidListEval(node);
|
|
|
|
scan = node->ss.ss_currentScanDesc;
|
|
tidList = node->tss_TidList;
|
|
numTids = node->tss_NumTids;
|
|
|
|
/*
|
|
* Initialize or advance scan position, depending on direction.
|
|
*/
|
|
bBackward = ScanDirectionIsBackward(direction);
|
|
if (bBackward)
|
|
{
|
|
if (node->tss_TidPtr < 0)
|
|
{
|
|
/* initialize for backward scan */
|
|
node->tss_TidPtr = numTids - 1;
|
|
}
|
|
else
|
|
node->tss_TidPtr--;
|
|
}
|
|
else
|
|
{
|
|
if (node->tss_TidPtr < 0)
|
|
{
|
|
/* initialize for forward scan */
|
|
node->tss_TidPtr = 0;
|
|
}
|
|
else
|
|
node->tss_TidPtr++;
|
|
}
|
|
|
|
while (node->tss_TidPtr >= 0 && node->tss_TidPtr < numTids)
|
|
{
|
|
ItemPointerData tid = tidList[node->tss_TidPtr];
|
|
|
|
/*
|
|
* For WHERE CURRENT OF, the tuple retrieved from the cursor might
|
|
* since have been updated; if so, we should fetch the version that is
|
|
* current according to our snapshot.
|
|
*/
|
|
if (node->tss_isCurrentOf)
|
|
table_tuple_get_latest_tid(scan, &tid);
|
|
|
|
if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot))
|
|
return slot;
|
|
|
|
/* Bad TID or failed snapshot qual; try next */
|
|
if (bBackward)
|
|
node->tss_TidPtr--;
|
|
else
|
|
node->tss_TidPtr++;
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
}
|
|
|
|
/*
|
|
* if we get here it means the tid scan failed so we are at the end of the
|
|
* scan..
|
|
*/
|
|
return ExecClearTuple(slot);
|
|
}
|
|
|
|
/*
|
|
* TidRecheck -- access method routine to recheck a tuple in EvalPlanQual
|
|
*/
|
|
static bool
|
|
TidRecheck(TidScanState *node, TupleTableSlot *slot)
|
|
{
|
|
ItemPointer match;
|
|
|
|
/* WHERE CURRENT OF always intends to resolve to the latest tuple */
|
|
if (node->tss_isCurrentOf)
|
|
return true;
|
|
|
|
if (node->tss_TidList == NULL)
|
|
TidListEval(node);
|
|
|
|
/*
|
|
* Binary search the TidList to see if this ctid is mentioned and return
|
|
* true if it is.
|
|
*/
|
|
match = (ItemPointer) bsearch(&slot->tts_tid, node->tss_TidList,
|
|
node->tss_NumTids, sizeof(ItemPointerData),
|
|
itemptr_comparator);
|
|
return match != NULL;
|
|
}
|
|
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecTidScan(node)
|
|
*
|
|
* Scans the relation using tids and returns
|
|
* the next qualifying tuple in the direction specified.
|
|
* We call the ExecScan() routine and pass it the appropriate
|
|
* access method functions.
|
|
*
|
|
* Conditions:
|
|
* -- the "cursor" maintained by the AMI is positioned at the tuple
|
|
* returned previously.
|
|
*
|
|
* Initial States:
|
|
* -- the relation indicated is opened for scanning so that the
|
|
* "cursor" is positioned before the first qualifying tuple.
|
|
* -- tss_TidPtr is -1.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static TupleTableSlot *
|
|
ExecTidScan(PlanState *pstate)
|
|
{
|
|
TidScanState *node = castNode(TidScanState, pstate);
|
|
|
|
return ExecScan(&node->ss,
|
|
(ExecScanAccessMtd) TidNext,
|
|
(ExecScanRecheckMtd) TidRecheck);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecReScanTidScan(node)
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecReScanTidScan(TidScanState *node)
|
|
{
|
|
if (node->tss_TidList)
|
|
pfree(node->tss_TidList);
|
|
node->tss_TidList = NULL;
|
|
node->tss_NumTids = 0;
|
|
node->tss_TidPtr = -1;
|
|
|
|
/* not really necessary, but seems good form */
|
|
if (node->ss.ss_currentScanDesc)
|
|
table_rescan(node->ss.ss_currentScanDesc, NULL);
|
|
|
|
ExecScanReScan(&node->ss);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecEndTidScan
|
|
*
|
|
* Releases any storage allocated through C routines.
|
|
* Returns nothing.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecEndTidScan(TidScanState *node)
|
|
{
|
|
if (node->ss.ss_currentScanDesc)
|
|
table_endscan(node->ss.ss_currentScanDesc);
|
|
|
|
/*
|
|
* Free the exprcontext
|
|
*/
|
|
ExecFreeExprContext(&node->ss.ps);
|
|
|
|
/*
|
|
* clear out tuple table slots
|
|
*/
|
|
if (node->ss.ps.ps_ResultTupleSlot)
|
|
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
|
|
ExecClearTuple(node->ss.ss_ScanTupleSlot);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecInitTidScan
|
|
*
|
|
* Initializes the tid scan's state information, creates
|
|
* scan keys, and opens the base and tid relations.
|
|
*
|
|
* Parameters:
|
|
* node: TidScan node produced by the planner.
|
|
* estate: the execution state initialized in InitPlan.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
TidScanState *
|
|
ExecInitTidScan(TidScan *node, EState *estate, int eflags)
|
|
{
|
|
TidScanState *tidstate;
|
|
Relation currentRelation;
|
|
|
|
/*
|
|
* create state structure
|
|
*/
|
|
tidstate = makeNode(TidScanState);
|
|
tidstate->ss.ps.plan = (Plan *) node;
|
|
tidstate->ss.ps.state = estate;
|
|
tidstate->ss.ps.ExecProcNode = ExecTidScan;
|
|
|
|
/*
|
|
* Miscellaneous initialization
|
|
*
|
|
* create expression context for node
|
|
*/
|
|
ExecAssignExprContext(estate, &tidstate->ss.ps);
|
|
|
|
/*
|
|
* mark tid list as not computed yet
|
|
*/
|
|
tidstate->tss_TidList = NULL;
|
|
tidstate->tss_NumTids = 0;
|
|
tidstate->tss_TidPtr = -1;
|
|
|
|
/*
|
|
* open the scan relation
|
|
*/
|
|
currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
|
|
|
|
tidstate->ss.ss_currentRelation = currentRelation;
|
|
tidstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */
|
|
|
|
/*
|
|
* get the scan type from the relation descriptor.
|
|
*/
|
|
ExecInitScanTupleSlot(estate, &tidstate->ss,
|
|
RelationGetDescr(currentRelation),
|
|
table_slot_callbacks(currentRelation));
|
|
|
|
/*
|
|
* Initialize result type and projection.
|
|
*/
|
|
ExecInitResultTypeTL(&tidstate->ss.ps);
|
|
ExecAssignScanProjectionInfo(&tidstate->ss);
|
|
|
|
/*
|
|
* initialize child expressions
|
|
*/
|
|
tidstate->ss.ps.qual =
|
|
ExecInitQual(node->scan.plan.qual, (PlanState *) tidstate);
|
|
|
|
TidExprListCreate(tidstate);
|
|
|
|
/*
|
|
* all done.
|
|
*/
|
|
return tidstate;
|
|
}
|