mirror of
https://github.com/postgres/postgres.git
synced 2025-07-08 11:42:09 +03:00
Add TID Range Scans to support efficient scanning ranges of TIDs
This adds a new executor node named TID Range Scan. The query planner will generate paths for TID Range scans when quals are discovered on base relations which search for ranges on the table's ctid column. These ranges may be open at either end. For example, WHERE ctid >= '(10,0)'; will return all tuples on page 10 and over. To support this, two new optional callback functions have been added to table AM. scan_set_tidrange is used to set the scan range to just the given range of TIDs. scan_getnextslot_tidrange fetches the next tuple in the given range. For AMs where scanning ranges of TIDs would not make sense, these functions can be set to NULL in the TableAmRoutine. The query planner won't generate TID Range Scan Paths in that case. Author: Edmund Horner, David Rowley Reviewed-by: David Rowley, Tomas Vondra, Tom Lane, Andres Freund, Zhihong Yu Discussion: https://postgr.es/m/CAMyN-kB-nFTkF=VA_JPwFNo08S0d-Yk0F741S2B7LDmYAi8eyA@mail.gmail.com
This commit is contained in:
@ -67,6 +67,7 @@ OBJS = \
|
||||
nodeSubplan.o \
|
||||
nodeSubqueryscan.o \
|
||||
nodeTableFuncscan.o \
|
||||
nodeTidrangescan.o \
|
||||
nodeTidscan.o \
|
||||
nodeUnique.o \
|
||||
nodeValuesscan.o \
|
||||
|
@ -51,6 +51,7 @@
|
||||
#include "executor/nodeSubplan.h"
|
||||
#include "executor/nodeSubqueryscan.h"
|
||||
#include "executor/nodeTableFuncscan.h"
|
||||
#include "executor/nodeTidrangescan.h"
|
||||
#include "executor/nodeTidscan.h"
|
||||
#include "executor/nodeUnique.h"
|
||||
#include "executor/nodeValuesscan.h"
|
||||
@ -197,6 +198,10 @@ ExecReScan(PlanState *node)
|
||||
ExecReScanTidScan((TidScanState *) node);
|
||||
break;
|
||||
|
||||
case T_TidRangeScanState:
|
||||
ExecReScanTidRangeScan((TidRangeScanState *) node);
|
||||
break;
|
||||
|
||||
case T_SubqueryScanState:
|
||||
ExecReScanSubqueryScan((SubqueryScanState *) node);
|
||||
break;
|
||||
@ -562,6 +567,7 @@ ExecSupportsBackwardScan(Plan *node)
|
||||
|
||||
case T_SeqScan:
|
||||
case T_TidScan:
|
||||
case T_TidRangeScan:
|
||||
case T_FunctionScan:
|
||||
case T_ValuesScan:
|
||||
case T_CteScan:
|
||||
|
@ -336,6 +336,7 @@ search_plan_tree(PlanState *node, Oid table_oid,
|
||||
case T_IndexOnlyScanState:
|
||||
case T_BitmapHeapScanState:
|
||||
case T_TidScanState:
|
||||
case T_TidRangeScanState:
|
||||
case T_ForeignScanState:
|
||||
case T_CustomScanState:
|
||||
{
|
||||
|
@ -109,6 +109,7 @@
|
||||
#include "executor/nodeSubplan.h"
|
||||
#include "executor/nodeSubqueryscan.h"
|
||||
#include "executor/nodeTableFuncscan.h"
|
||||
#include "executor/nodeTidrangescan.h"
|
||||
#include "executor/nodeTidscan.h"
|
||||
#include "executor/nodeUnique.h"
|
||||
#include "executor/nodeValuesscan.h"
|
||||
@ -238,6 +239,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
|
||||
estate, eflags);
|
||||
break;
|
||||
|
||||
case T_TidRangeScan:
|
||||
result = (PlanState *) ExecInitTidRangeScan((TidRangeScan *) node,
|
||||
estate, eflags);
|
||||
break;
|
||||
|
||||
case T_SubqueryScan:
|
||||
result = (PlanState *) ExecInitSubqueryScan((SubqueryScan *) node,
|
||||
estate, eflags);
|
||||
@ -637,6 +643,10 @@ ExecEndNode(PlanState *node)
|
||||
ExecEndTidScan((TidScanState *) node);
|
||||
break;
|
||||
|
||||
case T_TidRangeScanState:
|
||||
ExecEndTidRangeScan((TidRangeScanState *) node);
|
||||
break;
|
||||
|
||||
case T_SubqueryScanState:
|
||||
ExecEndSubqueryScan((SubqueryScanState *) node);
|
||||
break;
|
||||
|
413
src/backend/executor/nodeTidrangescan.c
Normal file
413
src/backend/executor/nodeTidrangescan.c
Normal file
@ -0,0 +1,413 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* nodeTidrangescan.c
|
||||
* Routines to support TID range scans of relations
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/executor/nodeTidrangescan.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/relscan.h"
|
||||
#include "access/sysattr.h"
|
||||
#include "access/tableam.h"
|
||||
#include "catalog/pg_operator.h"
|
||||
#include "executor/execdebug.h"
|
||||
#include "executor/nodeTidrangescan.h"
|
||||
#include "nodes/nodeFuncs.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
|
||||
/*
 * IsCTIDVar
 *		True when 'node' is non-NULL, is a Var, belongs to the current query
 *		level (varlevelsup == 0) and refers to the system attribute
 *		SelfItemPointerAttributeNumber.
 *
 * This is a macro: 'node' is evaluated more than once, so do not pass an
 * expression with side effects.
 */
#define IsCTIDVar(node) \
	((node) != NULL && \
	 IsA((node), Var) && \
	 ((Var *) (node))->varlevelsup == 0 && \
	 ((Var *) (node))->varattno == SelfItemPointerAttributeNumber)
|
||||
|
||||
/*
 * Which end of the TID range a TidOpExpr restricts.
 */
typedef enum TidExprType
{
	TIDEXPR_UPPER_BOUND = 0,	/* expression caps the highest TID scanned */
	TIDEXPR_LOWER_BOUND = 1		/* expression raises the lowest TID scanned */
} TidExprType;
|
||||
|
||||
/* Upper or lower range bound for scan */
|
||||
typedef struct TidOpExpr
|
||||
{
|
||||
TidExprType exprtype; /* type of op; lower or upper */
|
||||
ExprState *exprstate; /* ExprState for a TID-yielding subexpr */
|
||||
bool inclusive; /* whether op is inclusive */
|
||||
} TidOpExpr;
|
||||
|
||||
/*
|
||||
* For the given 'expr', build and return an appropriate TidOpExpr taking into
|
||||
* account the expr's operator and operand order.
|
||||
*/
|
||||
static TidOpExpr *
|
||||
MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate)
|
||||
{
|
||||
Node *arg1 = get_leftop((Expr *) expr);
|
||||
Node *arg2 = get_rightop((Expr *) expr);
|
||||
ExprState *exprstate = NULL;
|
||||
bool invert = false;
|
||||
TidOpExpr *tidopexpr;
|
||||
|
||||
if (IsCTIDVar(arg1))
|
||||
exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps);
|
||||
else if (IsCTIDVar(arg2))
|
||||
{
|
||||
exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps);
|
||||
invert = true;
|
||||
}
|
||||
else
|
||||
elog(ERROR, "could not identify CTID variable");
|
||||
|
||||
tidopexpr = (TidOpExpr *) palloc(sizeof(TidOpExpr));
|
||||
tidopexpr->inclusive = false; /* for now */
|
||||
|
||||
switch (expr->opno)
|
||||
{
|
||||
case TIDLessEqOperator:
|
||||
tidopexpr->inclusive = true;
|
||||
/* fall through */
|
||||
case TIDLessOperator:
|
||||
tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND;
|
||||
break;
|
||||
case TIDGreaterEqOperator:
|
||||
tidopexpr->inclusive = true;
|
||||
/* fall through */
|
||||
case TIDGreaterOperator:
|
||||
tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND;
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "could not identify CTID operator");
|
||||
}
|
||||
|
||||
tidopexpr->exprstate = exprstate;
|
||||
|
||||
return tidopexpr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract the qual subexpressions that yield TIDs to search for,
|
||||
* and compile them into ExprStates if they're ordinary expressions.
|
||||
*/
|
||||
static void
|
||||
TidExprListCreate(TidRangeScanState *tidrangestate)
|
||||
{
|
||||
TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan;
|
||||
List *tidexprs = NIL;
|
||||
ListCell *l;
|
||||
|
||||
foreach(l, node->tidrangequals)
|
||||
{
|
||||
OpExpr *opexpr = lfirst(l);
|
||||
TidOpExpr *tidopexpr;
|
||||
|
||||
if (!IsA(opexpr, OpExpr))
|
||||
elog(ERROR, "could not identify CTID expression");
|
||||
|
||||
tidopexpr = MakeTidOpExpr(opexpr, tidrangestate);
|
||||
tidexprs = lappend(tidexprs, tidopexpr);
|
||||
}
|
||||
|
||||
tidrangestate->trss_tidexprs = tidexprs;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* TidRangeEval
|
||||
*
|
||||
* Compute and set node's block and offset range to scan by evaluating
|
||||
* the trss_tidexprs. Returns false if we detect the range cannot
|
||||
* contain any tuples. Returns true if it's possible for the range to
|
||||
* contain tuples.
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
static bool
|
||||
TidRangeEval(TidRangeScanState *node)
|
||||
{
|
||||
ExprContext *econtext = node->ss.ps.ps_ExprContext;
|
||||
ItemPointerData lowerBound;
|
||||
ItemPointerData upperBound;
|
||||
ListCell *l;
|
||||
|
||||
/*
|
||||
* Set the upper and lower bounds to the absolute limits of the range of
|
||||
* the ItemPointer type. Below we'll try to narrow this range on either
|
||||
* side by looking at the TidOpExprs.
|
||||
*/
|
||||
ItemPointerSet(&lowerBound, 0, 0);
|
||||
ItemPointerSet(&upperBound, InvalidBlockNumber, PG_UINT16_MAX);
|
||||
|
||||
foreach(l, node->trss_tidexprs)
|
||||
{
|
||||
TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l);
|
||||
ItemPointer itemptr;
|
||||
bool isNull;
|
||||
|
||||
/* Evaluate this bound. */
|
||||
itemptr = (ItemPointer)
|
||||
DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate,
|
||||
econtext,
|
||||
&isNull));
|
||||
|
||||
/* If the bound is NULL, *nothing* matches the qual. */
|
||||
if (isNull)
|
||||
return false;
|
||||
|
||||
if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND)
|
||||
{
|
||||
ItemPointerData lb;
|
||||
|
||||
ItemPointerCopy(itemptr, &lb);
|
||||
|
||||
/*
|
||||
* Normalize non-inclusive ranges to become inclusive. The
|
||||
* resulting ItemPointer here may not be a valid item pointer.
|
||||
*/
|
||||
if (!tidopexpr->inclusive)
|
||||
ItemPointerInc(&lb);
|
||||
|
||||
/* Check if we can narrow the range using this qual */
|
||||
if (ItemPointerCompare(&lb, &lowerBound) > 0)
|
||||
ItemPointerCopy(&lb, &lowerBound);
|
||||
}
|
||||
|
||||
else if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND)
|
||||
{
|
||||
ItemPointerData ub;
|
||||
|
||||
ItemPointerCopy(itemptr, &ub);
|
||||
|
||||
/*
|
||||
* Normalize non-inclusive ranges to become inclusive. The
|
||||
* resulting ItemPointer here may not be a valid item pointer.
|
||||
*/
|
||||
if (!tidopexpr->inclusive)
|
||||
ItemPointerDec(&ub);
|
||||
|
||||
/* Check if we can narrow the range using this qual */
|
||||
if (ItemPointerCompare(&ub, &upperBound) < 0)
|
||||
ItemPointerCopy(&ub, &upperBound);
|
||||
}
|
||||
}
|
||||
|
||||
ItemPointerCopy(&lowerBound, &node->trss_mintid);
|
||||
ItemPointerCopy(&upperBound, &node->trss_maxtid);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* TidRangeNext
|
||||
*
|
||||
* Retrieve a tuple from the TidRangeScan node's currentRelation
|
||||
* using the TIDs in the TidRangeScanState information.
|
||||
*
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
static TupleTableSlot *
|
||||
TidRangeNext(TidRangeScanState *node)
|
||||
{
|
||||
TableScanDesc scandesc;
|
||||
EState *estate;
|
||||
ScanDirection direction;
|
||||
TupleTableSlot *slot;
|
||||
|
||||
/*
|
||||
* extract necessary information from TID scan node
|
||||
*/
|
||||
scandesc = node->ss.ss_currentScanDesc;
|
||||
estate = node->ss.ps.state;
|
||||
slot = node->ss.ss_ScanTupleSlot;
|
||||
direction = estate->es_direction;
|
||||
|
||||
if (!node->trss_inScan)
|
||||
{
|
||||
/* First time through, compute TID range to scan */
|
||||
if (!TidRangeEval(node))
|
||||
return NULL;
|
||||
|
||||
if (scandesc == NULL)
|
||||
{
|
||||
scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation,
|
||||
estate->es_snapshot,
|
||||
&node->trss_mintid,
|
||||
&node->trss_maxtid);
|
||||
node->ss.ss_currentScanDesc = scandesc;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* rescan with the updated TID range */
|
||||
table_rescan_tidrange(scandesc, &node->trss_mintid,
|
||||
&node->trss_maxtid);
|
||||
}
|
||||
|
||||
node->trss_inScan = true;
|
||||
}
|
||||
|
||||
/* Fetch the next tuple. */
|
||||
if (!table_scan_getnextslot_tidrange(scandesc, direction, slot))
|
||||
{
|
||||
node->trss_inScan = false;
|
||||
ExecClearTuple(slot);
|
||||
}
|
||||
|
||||
return slot;
|
||||
}
|
||||
|
||||
/*
|
||||
* TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual
|
||||
*/
|
||||
static bool
|
||||
TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecTidRangeScan(node)
|
||||
*
|
||||
* Scans the relation using tids and returns the next qualifying tuple.
|
||||
* We call the ExecScan() routine and pass it the appropriate
|
||||
* access method functions.
|
||||
*
|
||||
* Conditions:
|
||||
* -- the "cursor" maintained by the AMI is positioned at the tuple
|
||||
* returned previously.
|
||||
*
|
||||
* Initial States:
|
||||
* -- the relation indicated is opened for TID range scanning.
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
static TupleTableSlot *
|
||||
ExecTidRangeScan(PlanState *pstate)
|
||||
{
|
||||
TidRangeScanState *node = castNode(TidRangeScanState, pstate);
|
||||
|
||||
return ExecScan(&node->ss,
|
||||
(ExecScanAccessMtd) TidRangeNext,
|
||||
(ExecScanRecheckMtd) TidRangeRecheck);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecReScanTidRangeScan(node)
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
void
|
||||
ExecReScanTidRangeScan(TidRangeScanState *node)
|
||||
{
|
||||
/* mark scan as not in progress, and tid range list as not computed yet */
|
||||
node->trss_inScan = false;
|
||||
|
||||
/*
|
||||
* We must wait until TidRangeNext before calling table_rescan_tidrange.
|
||||
*/
|
||||
ExecScanReScan(&node->ss);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecEndTidRangeScan
|
||||
*
|
||||
* Releases any storage allocated through C routines.
|
||||
* Returns nothing.
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
void
|
||||
ExecEndTidRangeScan(TidRangeScanState *node)
|
||||
{
|
||||
TableScanDesc scan = node->ss.ss_currentScanDesc;
|
||||
|
||||
if (scan != NULL)
|
||||
table_endscan(scan);
|
||||
|
||||
/*
|
||||
* Free the exprcontext
|
||||
*/
|
||||
ExecFreeExprContext(&node->ss.ps);
|
||||
|
||||
/*
|
||||
* clear out tuple table slots
|
||||
*/
|
||||
if (node->ss.ps.ps_ResultTupleSlot)
|
||||
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
|
||||
ExecClearTuple(node->ss.ss_ScanTupleSlot);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecInitTidRangeScan
|
||||
*
|
||||
* Initializes the tid range scan's state information, creates
|
||||
* scan keys, and opens the scan relation.
|
||||
*
|
||||
* Parameters:
|
||||
* node: TidRangeScan node produced by the planner.
|
||||
* estate: the execution state initialized in InitPlan.
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
TidRangeScanState *
|
||||
ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags)
|
||||
{
|
||||
TidRangeScanState *tidrangestate;
|
||||
Relation currentRelation;
|
||||
|
||||
/*
|
||||
* create state structure
|
||||
*/
|
||||
tidrangestate = makeNode(TidRangeScanState);
|
||||
tidrangestate->ss.ps.plan = (Plan *) node;
|
||||
tidrangestate->ss.ps.state = estate;
|
||||
tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan;
|
||||
|
||||
/*
|
||||
* Miscellaneous initialization
|
||||
*
|
||||
* create expression context for node
|
||||
*/
|
||||
ExecAssignExprContext(estate, &tidrangestate->ss.ps);
|
||||
|
||||
/*
|
||||
* mark scan as not in progress, and TID range as not computed yet
|
||||
*/
|
||||
tidrangestate->trss_inScan = false;
|
||||
|
||||
/*
|
||||
* open the scan relation
|
||||
*/
|
||||
currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
|
||||
|
||||
tidrangestate->ss.ss_currentRelation = currentRelation;
|
||||
tidrangestate->ss.ss_currentScanDesc = NULL; /* no table scan here */
|
||||
|
||||
/*
|
||||
* get the scan type from the relation descriptor.
|
||||
*/
|
||||
ExecInitScanTupleSlot(estate, &tidrangestate->ss,
|
||||
RelationGetDescr(currentRelation),
|
||||
table_slot_callbacks(currentRelation));
|
||||
|
||||
/*
|
||||
* Initialize result type and projection.
|
||||
*/
|
||||
ExecInitResultTypeTL(&tidrangestate->ss.ps);
|
||||
ExecAssignScanProjectionInfo(&tidrangestate->ss);
|
||||
|
||||
/*
|
||||
* initialize child expressions
|
||||
*/
|
||||
tidrangestate->ss.ps.qual =
|
||||
ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate);
|
||||
|
||||
TidExprListCreate(tidrangestate);
|
||||
|
||||
/*
|
||||
* all done.
|
||||
*/
|
||||
return tidrangestate;
|
||||
}
|
Reference in New Issue
Block a user