Redesign tablesample method API, and do extensive code review.

The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2025-11-24 00:23:06 +03:00 · 2015-07-25 14:39:00 -04:00
parent b26e3d660d
commit dd7a8f66ed
83 changed files with 3184 additions and 2589 deletions
--- a/src/backend/access/tablesample/system.c
+++ b/src/backend/access/tablesample/system.c
@@ -1,186 +1,260 @@
 /*-------------------------------------------------------------------------
 *
 * system.c
- *	  interface routines for system tablesample method
+ *	  support routines for SYSTEM tablesample method
+ *
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
+ *
+ * To achieve that, we proceed by hashing each candidate block number together
+ * with the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
 *
 *
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  src/backend/utils/tablesample/system.c
+ *	  src/backend/access/tablesample/system.c
 *
 *-------------------------------------------------------------------------
 */

 #include "postgres.h"

-#include "fmgr.h"
+#ifdef _MSC_VER
+#include <float.h>				/* for _isnan */
+#endif
+#include <math.h>

-#include "access/tablesample.h"
+#include "access/hash.h"
 #include "access/relscan.h"
-#include "nodes/execnodes.h"
-#include "nodes/relation.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
 #include "optimizer/clauses.h"
-#include "storage/bufmgr.h"
-#include "utils/sampling.h"
+#include "optimizer/cost.h"
+#include "utils/builtins.h"


-/*
- * State
- */
+/* Private state */
 typedef struct
 {
-	BlockSamplerData bs;
+	uint64		cutoff;			/* select blocks with hash less than this */
 	uint32		seed;			/* random seed */
-	BlockNumber nblocks;		/* number of block in relation */
-	int			samplesize;		/* number of blocks to return */
+	BlockNumber nextblock;		/* next block to consider sampling */
 	OffsetNumber lt;			/* last tuple returned from current block */
 } SystemSamplerData;


+static void system_samplescangetsamplesize(PlannerInfo *root,
+							   RelOptInfo *baserel,
+							   List *paramexprs,
+							   BlockNumber *pages,
+							   double *tuples);
+static void system_initsamplescan(SampleScanState *node,
+					  int eflags);
+static void system_beginsamplescan(SampleScanState *node,
+					   Datum *params,
+					   int nparams,
+					   uint32 seed);
+static BlockNumber system_nextsampleblock(SampleScanState *node);
+static OffsetNumber system_nextsampletuple(SampleScanState *node,
+					   BlockNumber blockno,
+					   OffsetNumber maxoffset);
+
+
 /*
- * Initializes the state.
+ * Create a TsmRoutine descriptor for the SYSTEM method.
 */
 Datum
-tsm_system_init(PG_FUNCTION_ARGS)
+tsm_system_handler(PG_FUNCTION_ARGS)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	uint32		seed = PG_GETARG_UINT32(1);
-	float4		percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2);
-	HeapScanDesc scan = tsdesc->heapScan;
-	SystemSamplerData *sampler;
+	TsmRoutine *tsm = makeNode(TsmRoutine);

-	if (percent < 0 || percent > 100)
+	tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+	tsm->repeatable_across_queries = true;
+	tsm->repeatable_across_scans = true;
+	tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;
+	tsm->InitSampleScan = system_initsamplescan;
+	tsm->BeginSampleScan = system_beginsamplescan;
+	tsm->NextSampleBlock = system_nextsampleblock;
+	tsm->NextSampleTuple = system_nextsampletuple;
+	tsm->EndSampleScan = NULL;
+
+	PG_RETURN_POINTER(tsm);
+}
+
+/*
+ * Sample size estimation.
+ */
+static void
+system_samplescangetsamplesize(PlannerInfo *root,
+							   RelOptInfo *baserel,
+							   List *paramexprs,
+							   BlockNumber *pages,
+							   double *tuples)
+{
+	Node	   *pctnode;
+	float4		samplefract;
+
+	/* Try to extract an estimate for the sample percentage */
+	pctnode = (Node *) linitial(paramexprs);
+	pctnode = estimate_expression_value(root, pctnode);
+
+	if (IsA(pctnode, Const) &&
+		!((Const *) pctnode)->constisnull)
+	{
+		samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+		if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+			samplefract /= 100.0f;
+		else
+		{
+			/* Default samplefract if the value is bogus */
+			samplefract = 0.1f;
+		}
+	}
+	else
+	{
+		/* Default samplefract if we didn't obtain a non-null Const */
+		samplefract = 0.1f;
+	}
+
+	/* We'll visit a sample of the pages ... */
+	*pages = clamp_row_est(baserel->pages * samplefract);
+
+	/* ... and hopefully get a representative number of tuples from them */
+	*tuples = clamp_row_est(baserel->tuples * samplefract);
+}
+
+/*
+ * Initialize during executor setup.
+ */
+static void
+system_initsamplescan(SampleScanState *node, int eflags)
+{
+	node->tsm_state = palloc0(sizeof(SystemSamplerData));
+}
+
+/*
+ * Examine parameters and prepare for a sample scan.
+ */
+static void
+system_beginsamplescan(SampleScanState *node,
+					   Datum *params,
+					   int nparams,
+					   uint32 seed)
+{
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	double		percent = DatumGetFloat4(params[0]);
+
+	if (percent < 0 || percent > 100 || isnan(percent))
 		ereport(ERROR,
-				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-				 errmsg("invalid sample size"),
-				 errhint("Sample size must be numeric value between 0 and 100 (inclusive).")));
+				(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+				 errmsg("sample percentage must be between 0 and 100")));

-	sampler = palloc0(sizeof(SystemSamplerData));
-
-	/* Remember initial values for reinit */
+	/*
+	 * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+	 * store that as a uint64, of course.  Note that this gives strictly
+	 * correct behavior at the limits of zero or one probability.
+	 */
+	sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
 	sampler->seed = seed;
-	sampler->nblocks = scan->rs_nblocks;
-	sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0));
+	sampler->nextblock = 0;
 	sampler->lt = InvalidOffsetNumber;

-	BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
-					  sampler->seed);
-
-	tsdesc->tsmdata = (void *) sampler;
-
-	PG_RETURN_VOID();
+	/*
+	 * Bulkread buffer access strategy probably makes sense unless we're
+	 * scanning a very small fraction of the table.  The 1% cutoff here is a
+	 * guess.  We should use pagemode visibility checking, since we scan all
+	 * tuples on each selected page.
+	 */
+	node->use_bulkread = (percent >= 1);
+	node->use_pagemode = true;
 }

 /*
- * Get next block number or InvalidBlockNumber when we're done.
+ * Select next block to sample.
+ */
+static BlockNumber
+system_nextsampleblock(SampleScanState *node)
+{
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	HeapScanDesc scan = node->ss.ss_currentScanDesc;
+	BlockNumber nextblock = sampler->nextblock;
+	uint32		hashinput[2];
+
+	/*
+	 * We compute the hash by applying hash_any to an array of 2 uint32's
+	 * containing the block number and seed.  This is efficient to set up, and
+	 * with the current implementation of hash_any, it gives
+	 * machine-independent results, which is a nice property for regression
+	 * testing.
+	 *
+	 * These words in the hash input are the same throughout the block:
+	 */
+	hashinput[1] = sampler->seed;
+
+	/*
+	 * Loop over block numbers until finding suitable block or reaching end of
+	 * relation.
+	 */
+	for (; nextblock < scan->rs_nblocks; nextblock++)
+	{
+		uint32		hash;
+
+		hashinput[0] = nextblock;
+
+		hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+									   (int) sizeof(hashinput)));
+		if (hash < sampler->cutoff)
+			break;
+	}
+
+	if (nextblock < scan->rs_nblocks)
+	{
+		/* Found a suitable block; remember where we should start next time */
+		sampler->nextblock = nextblock + 1;
+		return nextblock;
+	}
+
+	/* Done, but let's reset nextblock to 0 for safety. */
+	sampler->nextblock = 0;
+	return InvalidBlockNumber;
+}
+
+/*
+ * Select next sampled tuple in current block.
 *
- * Uses the same logic as ANALYZE for picking the random blocks.
+ * In block sampling, we just want to sample all the tuples in each selected
+ * block.
+ *
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists); nodeSamplescan.c will deal with that.
+ *
+ * When we reach end of the block, return InvalidOffsetNumber which tells
+ * SampleScan to go to next block.
 */
-Datum
-tsm_system_nextblock(PG_FUNCTION_ARGS)
+static OffsetNumber
+system_nextsampletuple(SampleScanState *node,
+					   BlockNumber blockno,
+					   OffsetNumber maxoffset)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
-	BlockNumber blockno;
-
-	if (!BlockSampler_HasMore(&sampler->bs))
-		PG_RETURN_UINT32(InvalidBlockNumber);
-
-	blockno = BlockSampler_Next(&sampler->bs);
-
-	PG_RETURN_UINT32(blockno);
-}
-
-/*
- * Get next tuple offset in current block or InvalidOffsetNumber if we are done
- * with this block.
- */
-Datum
-tsm_system_nexttuple(PG_FUNCTION_ARGS)
-{
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	OffsetNumber maxoffset = PG_GETARG_UINT16(2);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
 	OffsetNumber tupoffset = sampler->lt;

+	/* Advance to next possible offset on page */
 	if (tupoffset == InvalidOffsetNumber)
 		tupoffset = FirstOffsetNumber;
 	else
 		tupoffset++;

+	/* Done? */
 	if (tupoffset > maxoffset)
 		tupoffset = InvalidOffsetNumber;

 	sampler->lt = tupoffset;

-	PG_RETURN_UINT16(tupoffset);
-}
-
-/*
- * Cleanup method.
- */
-Datum
-tsm_system_end(PG_FUNCTION_ARGS)
-{
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-
-	pfree(tsdesc->tsmdata);
-
-	PG_RETURN_VOID();
-}
-
-/*
- * Reset state (called by ReScan).
- */
-Datum
-tsm_system_reset(PG_FUNCTION_ARGS)
-{
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
-
-	sampler->lt = InvalidOffsetNumber;
-	BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
-					  sampler->seed);
-
-	PG_RETURN_VOID();
-}
-
-/*
- * Costing function.
- */
-Datum
-tsm_system_cost(PG_FUNCTION_ARGS)
-{
-	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-	Path	   *path = (Path *) PG_GETARG_POINTER(1);
-	RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2);
-	List	   *args = (List *) PG_GETARG_POINTER(3);
-	BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4);
-	double	   *tuples = (double *) PG_GETARG_POINTER(5);
-	Node	   *pctnode;
-	float4		samplesize;
-
-	pctnode = linitial(args);
-	pctnode = estimate_expression_value(root, pctnode);
-
-	if (IsA(pctnode, RelabelType))
-		pctnode = (Node *) ((RelabelType *) pctnode)->arg;
-
-	if (IsA(pctnode, Const))
-	{
-		samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue);
-		samplesize /= 100.0;
-	}
-	else
-	{
-		/* Default samplesize if the estimation didn't return Const. */
-		samplesize = 0.1f;
-	}
-
-	*pages = baserel->pages * samplesize;
-	*tuples = path->rows * samplesize;
-	path->rows = *tuples;
-
-	PG_RETURN_VOID();
+	return tupoffset;
 }