Redesign tablesample method API, and do extensive code review.

The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2025-09-02 04:21:28 +03:00 · 2015-07-25 14:39:00 -04:00
parent 7d4240d6cd
commit 6fcb337fa5
83 changed files with 3184 additions and 2589 deletions
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -18,6 +18,7 @@
 #include <math.h>

 #include "access/sysattr.h"
+#include "access/tsmapi.h"
 #include "catalog/pg_class.h"
 #include "catalog/pg_operator.h"
 #include "foreign/fdwapi.h"
@@ -390,7 +391,7 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
 				}
 				else if (rte->tablesample != NULL)
 				{
-					/* Build sample scan on relation */
+					/* Sampled relation */
 					set_tablesample_rel_pathlist(root, rel, rte);
 				}
 				else
@@ -480,11 +481,40 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)

 /*
 * set_tablesample_rel_size
- *	  Set size estimates for a sampled relation.
+ *	  Set size estimates for a sampled relation
 */
 static void
 set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 {
+	TableSampleClause *tsc = rte->tablesample;
+	TsmRoutine *tsm;
+	BlockNumber pages;
+	double		tuples;
+
+	/*
+	 * Test any partial indexes of rel for applicability.  We must do this
+	 * first since partial unique indexes can affect size estimates.
+	 */
+	check_partial_indexes(root, rel);
+
+	/*
+	 * Call the sampling method's estimation function to estimate the number
+	 * of pages it will read and the number of tuples it will return.  (Note:
+	 * we assume the function returns sane values.)
+	 */
+	tsm = GetTsmRoutine(tsc->tsmhandler);
+	tsm->SampleScanGetSampleSize(root, rel, tsc->args,
+								 &pages, &tuples);
+
+	/*
+	 * For the moment, because we will only consider a SampleScan path for the
+	 * rel, it's okay to just overwrite the pages and tuples estimates for the
+	 * whole relation.  If we ever consider multiple path types for sampled
+	 * rels, we'll need more complication.
+	 */
+	rel->pages = pages;
+	rel->tuples = tuples;
+
 	/* Mark rel with estimated output rows, width, etc */
 	set_baserel_size_estimates(root, rel);
 }
@@ -492,8 +522,6 @@ set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 /*
 * set_tablesample_rel_pathlist
 *	  Build access paths for a sampled relation
- *
- * There is only one possible path - sampling scan
 */
 static void
 set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
@@ -502,15 +530,41 @@ set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *
 	Path	   *path;

 	/*
-	 * We don't support pushing join clauses into the quals of a seqscan, but
-	 * it could still have required parameterization due to LATERAL refs in
-	 * its tlist.
+	 * We don't support pushing join clauses into the quals of a samplescan,
+	 * but it could still have required parameterization due to LATERAL refs
+	 * in its tlist or TABLESAMPLE arguments.
 	 */
 	required_outer = rel->lateral_relids;

-	/* We only do sample scan if it was requested */
+	/* Consider sampled scan */
 	path = create_samplescan_path(root, rel, required_outer);
-	rel->pathlist = list_make1(path);
+
+	/*
+	 * If the sampling method does not support repeatable scans, we must avoid
+	 * plans that would scan the rel multiple times.  Ideally, we'd simply
+	 * avoid putting the rel on the inside of a nestloop join; but adding such
+	 * a consideration to the planner seems like a great deal of complication
+	 * to support an uncommon usage of second-rate sampling methods.  Instead,
+	 * if there is a risk that the query might perform an unsafe join, just
+	 * wrap the SampleScan in a Materialize node.  We can check for joins by
+	 * counting the membership of all_baserels (note that this correctly
+	 * counts inheritance trees as single rels).  If we're inside a subquery,
+	 * we can't easily check whether a join might occur in the outer query, so
+	 * just assume one is possible.
+	 *
+	 * GetTsmRoutine is relatively expensive compared to the other tests here,
+	 * so check repeatable_across_scans last, even though that's a bit odd.
+	 */
+	if ((root->query_level > 1 ||
+		 bms_membership(root->all_baserels) != BMS_SINGLETON) &&
+	 !(GetTsmRoutine(rte->tablesample->tsmhandler)->repeatable_across_scans))
+	{
+		path = (Path *) create_material_path(rel, path);
+	}
+
+	add_path(rel, path);
+
+	/* For the moment, at least, there are no other paths to consider */
 }

 /*
@@ -2450,7 +2504,33 @@ print_path(PlannerInfo *root, Path *path, int indent)
 	switch (nodeTag(path))
 	{
 		case T_Path:
-			ptype = "SeqScan";
+			switch (path->pathtype)
+			{
+				case T_SeqScan:
+					ptype = "SeqScan";
+					break;
+				case T_SampleScan:
+					ptype = "SampleScan";
+					break;
+				case T_SubqueryScan:
+					ptype = "SubqueryScan";
+					break;
+				case T_FunctionScan:
+					ptype = "FunctionScan";
+					break;
+				case T_ValuesScan:
+					ptype = "ValuesScan";
+					break;
+				case T_CteScan:
+					ptype = "CteScan";
+					break;
+				case T_WorkTableScan:
+					ptype = "WorkTableScan";
+					break;
+				default:
+					ptype = "???Path";
+					break;
+			}
 			break;
 		case T_IndexPath:
 			ptype = "IdxScan";
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -74,6 +74,7 @@
 #include <math.h>

 #include "access/htup_details.h"
+#include "access/tsmapi.h"
 #include "executor/executor.h"
 #include "executor/nodeHash.h"
 #include "miscadmin.h"
@@ -223,64 +224,66 @@ cost_seqscan(Path *path, PlannerInfo *root,
 * cost_samplescan
 *	  Determines and returns the cost of scanning a relation using sampling.
 *
- * From planner/optimizer perspective, we don't care all that much about cost
- * itself since there is always only one scan path to consider when sampling
- * scan is present, but number of rows estimation is still important.
- *
 * 'baserel' is the relation to be scanned
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 */
 void
-cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
+cost_samplescan(Path *path, PlannerInfo *root,
+				RelOptInfo *baserel, ParamPathInfo *param_info)
 {
 	Cost		startup_cost = 0;
 	Cost		run_cost = 0;
+	RangeTblEntry *rte;
+	TableSampleClause *tsc;
+	TsmRoutine *tsm;
 	double		spc_seq_page_cost,
 				spc_random_page_cost,
 				spc_page_cost;
 	QualCost	qpqual_cost;
 	Cost		cpu_per_tuple;
-	BlockNumber pages;
-	double		tuples;
-	RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root);
-	TableSampleClause *tablesample = rte->tablesample;

-	/* Should only be applied to base relations */
+	/* Should only be applied to base relations with tablesample clauses */
 	Assert(baserel->relid > 0);
-	Assert(baserel->rtekind == RTE_RELATION);
+	rte = planner_rt_fetch(baserel->relid, root);
+	Assert(rte->rtekind == RTE_RELATION);
+	tsc = rte->tablesample;
+	Assert(tsc != NULL);
+	tsm = GetTsmRoutine(tsc->tsmhandler);

 	/* Mark the path with the correct row estimate */
-	if (path->param_info)
-		path->rows = path->param_info->ppi_rows;
+	if (param_info)
+		path->rows = param_info->ppi_rows;
 	else
 		path->rows = baserel->rows;

-	/* Call the sampling method's costing function. */
-	OidFunctionCall6(tablesample->tsmcost, PointerGetDatum(root),
-					 PointerGetDatum(path), PointerGetDatum(baserel),
-					 PointerGetDatum(tablesample->args),
-					 PointerGetDatum(&pages), PointerGetDatum(&tuples));
-
 	/* fetch estimated page cost for tablespace containing table */
 	get_tablespace_page_costs(baserel->reltablespace,
 							  &spc_random_page_cost,
 							  &spc_seq_page_cost);

-
-	spc_page_cost = tablesample->tsmseqscan ? spc_seq_page_cost :
-		spc_random_page_cost;
+	/* if NextSampleBlock is used, assume random access, else sequential */
+	spc_page_cost = (tsm->NextSampleBlock != NULL) ?
+		spc_random_page_cost : spc_seq_page_cost;

 	/*
-	 * disk costs
+	 * disk costs (recall that baserel->pages has already been set to the
+	 * number of pages the sampling method will visit)
 	 */
-	run_cost += spc_page_cost * pages;
+	run_cost += spc_page_cost * baserel->pages;

-	/* CPU costs */
-	get_restriction_qual_cost(root, baserel, path->param_info, &qpqual_cost);
+	/*
+	 * CPU costs (recall that baserel->tuples has already been set to the
+	 * number of tuples the sampling method will select).  Note that we ignore
+	 * execution cost of the TABLESAMPLE parameter expressions; they will be
+	 * evaluated only once per scan, and in most usages they'll likely be
+	 * simple constants anyway.  We also don't charge anything for the
+	 * calculations the sampling method might do internally.
+	 */
+	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

 	startup_cost += qpqual_cost.startup;
 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
-	run_cost += cpu_per_tuple * tuples;
+	run_cost += cpu_per_tuple * baserel->tuples;

 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -102,7 +102,8 @@ static List *order_qual_clauses(PlannerInfo *root, List *clauses);
 static void copy_path_costsize(Plan *dest, Path *src);
 static void copy_plan_costsize(Plan *dest, Plan *src);
 static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid);
-static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid);
+static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid,
+				TableSampleClause *tsc);
 static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid,
 			   Oid indexid, List *indexqual, List *indexqualorig,
 			   List *indexorderby, List *indexorderbyorig,
@@ -1148,7 +1149,7 @@ create_seqscan_plan(PlannerInfo *root, Path *best_path,

 /*
 * create_samplescan_plan
- *	 Returns a samplecan plan for the base relation scanned by 'best_path'
+ *	 Returns a samplescan plan for the base relation scanned by 'best_path'
 *	 with restriction clauses 'scan_clauses' and targetlist 'tlist'.
 */
 static SampleScan *
@@ -1157,11 +1158,15 @@ create_samplescan_plan(PlannerInfo *root, Path *best_path,
 {
 	SampleScan *scan_plan;
 	Index		scan_relid = best_path->parent->relid;
+	RangeTblEntry *rte;
+	TableSampleClause *tsc;

-	/* it should be a base rel with tablesample clause... */
+	/* it should be a base rel with a tablesample clause... */
 	Assert(scan_relid > 0);
-	Assert(best_path->parent->rtekind == RTE_RELATION);
-	Assert(best_path->pathtype == T_SampleScan);
+	rte = planner_rt_fetch(scan_relid, root);
+	Assert(rte->rtekind == RTE_RELATION);
+	tsc = rte->tablesample;
+	Assert(tsc != NULL);

 	/* Sort clauses into best execution order */
 	scan_clauses = order_qual_clauses(root, scan_clauses);
@@ -1174,13 +1179,16 @@ create_samplescan_plan(PlannerInfo *root, Path *best_path,
 	{
 		scan_clauses = (List *)
 			replace_nestloop_params(root, (Node *) scan_clauses);
+		tsc = (TableSampleClause *)
+			replace_nestloop_params(root, (Node *) tsc);
 	}

 	scan_plan = make_samplescan(tlist,
 								scan_clauses,
-								scan_relid);
+								scan_relid,
+								tsc);

-	copy_path_costsize(&scan_plan->plan, best_path);
+	copy_path_costsize(&scan_plan->scan.plan, best_path);

 	return scan_plan;
 }
@@ -2161,9 +2169,9 @@ create_customscan_plan(PlannerInfo *root, CustomPath *best_path,
 	ListCell   *lc;

 	/* Recursively transform child paths. */
-	foreach (lc, best_path->custom_paths)
+	foreach(lc, best_path->custom_paths)
 	{
-		Plan   *plan = create_plan_recurse(root, (Path *) lfirst(lc));
+		Plan	   *plan = create_plan_recurse(root, (Path *) lfirst(lc));

 		custom_plans = lappend(custom_plans, plan);
 	}
@@ -3437,17 +3445,19 @@ make_seqscan(List *qptlist,
 static SampleScan *
 make_samplescan(List *qptlist,
 				List *qpqual,
-				Index scanrelid)
+				Index scanrelid,
+				TableSampleClause *tsc)
 {
 	SampleScan *node = makeNode(SampleScan);
-	Plan	   *plan = &node->plan;
+	Plan	   *plan = &node->scan.plan;

 	/* cost should be inserted by caller */
 	plan->targetlist = qptlist;
 	plan->qual = qpqual;
 	plan->lefttree = NULL;
 	plan->righttree = NULL;
-	node->scanrelid = scanrelid;
+	node->scan.scanrelid = scanrelid;
+	node->tablesample = tsc;

 	return node;
 }
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -306,7 +306,9 @@ extract_lateral_references(PlannerInfo *root, RelOptInfo *brel, Index rtindex)
 		return;

 	/* Fetch the appropriate variables */
-	if (rte->rtekind == RTE_SUBQUERY)
+	if (rte->rtekind == RTE_RELATION)
+		vars = pull_vars_of_level((Node *) rte->tablesample, 0);
+	else if (rte->rtekind == RTE_SUBQUERY)
 		vars = pull_vars_of_level((Node *) rte->subquery, 1);
 	else if (rte->rtekind == RTE_FUNCTION)
 		vars = pull_vars_of_level((Node *) rte->functions, 0);
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -505,14 +505,10 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
 		if (rte->rtekind == RTE_RELATION)
 		{
 			if (rte->tablesample)
-			{
-				rte->tablesample->args = (List *)
-					preprocess_expression(root, (Node *) rte->tablesample->args,
+				rte->tablesample = (TableSampleClause *)
+					preprocess_expression(root,
+										  (Node *) rte->tablesample,
 										  EXPRKIND_TABLESAMPLE);
-				rte->tablesample->repeatable = (Node *)
-					preprocess_expression(root, rte->tablesample->repeatable,
-										  EXPRKIND_TABLESAMPLE);
-			}
 		}
 		else if (rte->rtekind == RTE_SUBQUERY)
 		{
@@ -697,11 +693,14 @@ preprocess_expression(PlannerInfo *root, Node *expr, int kind)
 	 * If the query has any join RTEs, replace join alias variables with
 	 * base-relation variables.  We must do this before sublink processing,
 	 * else sublinks expanded out from join aliases would not get processed.
-	 * We can skip it in non-lateral RTE functions and VALUES lists, however,
-	 * since they can't contain any Vars of the current query level.
+	 * We can skip it in non-lateral RTE functions, VALUES lists, and
+	 * TABLESAMPLE clauses, however, since they can't contain any Vars of the
+	 * current query level.
 	 */
 	if (root->hasJoinRTEs &&
-		!(kind == EXPRKIND_RTFUNC || kind == EXPRKIND_VALUES))
+		!(kind == EXPRKIND_RTFUNC ||
+		  kind == EXPRKIND_VALUES ||
+		  kind == EXPRKIND_TABLESAMPLE))
 		expr = flatten_join_alias_vars(root, expr);

 	/*
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -372,9 +372,8 @@ flatten_rtes_walker(Node *node, PlannerGlobal *glob)
 *
 * In the flat rangetable, we zero out substructure pointers that are not
 * needed by the executor; this reduces the storage space and copying cost
- * for cached plans.  We keep only the tablesample field (which we'd otherwise
- * have to put in the plan tree, anyway); the ctename, alias and eref Alias
- * fields, which are needed by EXPLAIN; and the selectedCols, insertedCols and
+ * for cached plans.  We keep only the ctename, alias and eref Alias fields,
+ * which are needed by EXPLAIN, and the selectedCols, insertedCols and
 * updatedCols bitmaps, which are needed for executor-startup permissions
 * checking and for trigger event checking.
 */
@@ -388,6 +387,7 @@ add_rte_to_flat_rtable(PlannerGlobal *glob, RangeTblEntry *rte)
 	memcpy(newrte, rte, sizeof(RangeTblEntry));

 	/* zap unneeded sub-structure */
+	newrte->tablesample = NULL;
 	newrte->subquery = NULL;
 	newrte->joinaliasvars = NIL;
 	newrte->functions = NIL;
@@ -456,11 +456,13 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
 			{
 				SampleScan *splan = (SampleScan *) plan;

-				splan->scanrelid += rtoffset;
-				splan->plan.targetlist =
-					fix_scan_list(root, splan->plan.targetlist, rtoffset);
-				splan->plan.qual =
-					fix_scan_list(root, splan->plan.qual, rtoffset);
+				splan->scan.scanrelid += rtoffset;
+				splan->scan.plan.targetlist =
+					fix_scan_list(root, splan->scan.plan.targetlist, rtoffset);
+				splan->scan.plan.qual =
+					fix_scan_list(root, splan->scan.plan.qual, rtoffset);
+				splan->tablesample = (TableSampleClause *)
+					fix_scan_expr(root, (Node *) splan->tablesample, rtoffset);
 			}
 			break;
 		case T_IndexScan:
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -2216,7 +2216,12 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params,
 			break;

 		case T_SeqScan:
+			context.paramids = bms_add_members(context.paramids, scan_params);
+			break;
+
 		case T_SampleScan:
+			finalize_primnode((Node *) ((SampleScan *) plan)->tablesample,
+							  &context);
 			context.paramids = bms_add_members(context.paramids, scan_params);
 			break;

@@ -2384,7 +2389,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params,
 					bms_add_members(context.paramids, scan_params);

 				/* child nodes if any */
-				foreach (lc, cscan->custom_plans)
+				foreach(lc, cscan->custom_plans)
 				{
 					context.paramids =
 						bms_add_members(context.paramids,
--- a/src/backend/optimizer/prep/prepjointree.c
+++ b/src/backend/optimizer/prep/prepjointree.c
@@ -1091,12 +1091,15 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte,

 			switch (child_rte->rtekind)
 			{
+				case RTE_RELATION:
+					if (child_rte->tablesample)
+						child_rte->lateral = true;
+					break;
 				case RTE_SUBQUERY:
 				case RTE_FUNCTION:
 				case RTE_VALUES:
 					child_rte->lateral = true;
 					break;
-				case RTE_RELATION:
 				case RTE_JOIN:
 				case RTE_CTE:
 					/* these can't contain any lateral references */
@@ -1909,6 +1912,13 @@ replace_vars_in_jointree(Node *jtnode,
 			{
 				switch (rte->rtekind)
 				{
+					case RTE_RELATION:
+						/* shouldn't be marked LATERAL unless tablesample */
+						Assert(rte->tablesample);
+						rte->tablesample = (TableSampleClause *)
+							pullup_replace_vars((Node *) rte->tablesample,
+												context);
+						break;
 					case RTE_SUBQUERY:
 						rte->subquery =
 							pullup_replace_vars_subquery(rte->subquery,
@@ -1924,7 +1934,6 @@ replace_vars_in_jointree(Node *jtnode,
 							pullup_replace_vars((Node *) rte->values_lists,
 												context);
 						break;
-					case RTE_RELATION:
 					case RTE_JOIN:
 					case RTE_CTE:
 						/* these shouldn't be marked LATERAL */
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -713,7 +713,7 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer)

 /*
 * create_samplescan_path
- *	  Like seqscan but uses sampling function while scanning.
+ *	  Creates a path node for a sampled table scan.
 */
 Path *
 create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer)
@@ -726,7 +726,7 @@ create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer
 													 required_outer);
 	pathnode->pathkeys = NIL;	/* samplescan has unordered result */

-	cost_samplescan(pathnode, root, rel);
+	cost_samplescan(pathnode, root, rel, pathnode->param_info);

 	return pathnode;
 }
@@ -1773,6 +1773,8 @@ reparameterize_path(PlannerInfo *root, Path *path,
 	{
 		case T_SeqScan:
 			return create_seqscan_path(root, rel, required_outer);
+		case T_SampleScan:
+			return (Path *) create_samplescan_path(root, rel, required_outer);
 		case T_IndexScan:
 		case T_IndexOnlyScan:
 			{
@@ -1805,8 +1807,6 @@ reparameterize_path(PlannerInfo *root, Path *path,
 		case T_SubqueryScan:
 			return create_subqueryscan_path(root, rel, path->pathkeys,
 											required_outer);
-		case T_SampleScan:
-			return (Path *) create_samplescan_path(root, rel, required_outer);
 		default:
 			break;
 	}