
Redesign tablesample method API, and do extensive code review.

The original implementation of TABLESAMPLE modeled the tablesample method
API on index access methods, which wasn't a good choice because, without
specialized DDL commands, there's no way to build an extension that can
implement a TSM.  (Raw inserts into system catalogs are not an acceptable
thing to do, because we can't undo them during DROP EXTENSION, nor will
pg_upgrade behave sanely.)  Instead adopt an API more like procedural
language handlers or foreign data wrappers, wherein the only SQL-level
support object needed is a single handler function identified by having
a special return type.  This lets us get rid of the supporting catalog
altogether, so that no custom DDL support is needed for the feature.
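
As a sketch of what the new model means for extension authors (the method,
symbol, and table names below are purely illustrative), the only SQL-level
object an extension must create is the handler function itself, declared with
the special tsm_handler return type; the name of that SQL function is what
gets written in the TABLESAMPLE clause:

    CREATE FUNCTION my_sample_method(internal)
        RETURNS tsm_handler
        AS 'MODULE_PATHNAME', 'my_sample_method_handler'
        LANGUAGE C STRICT;

    -- usable immediately, no supporting catalog rows beyond the function:
    SELECT * FROM my_table TABLESAMPLE my_sample_method (10);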

Adjust the API so that it can support non-constant tablesample arguments
(the original coding assumed we could evaluate the argument expressions at
ExecInitSampleScan time, which is undesirable even if it weren't outright
unsafe), and discourage sampling methods from looking at invisible tuples.
Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable
within and across queries, as required by the SQL standard, and deal more
honestly with methods that can't support that requirement.
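
To make the repeatability requirement concrete (table name and seed value are
illustrative): with a fixed REPEATABLE seed and an unmodified table, repeated
scans must produce the same sample, and the sample-size argument no longer has
to be evaluable at scan initialization, so a query parameter, for example,
should work:

    -- both queries must return the same set of rows:
    SELECT * FROM my_table TABLESAMPLE BERNOULLI (10) REPEATABLE (42);
    SELECT * FROM my_table TABLESAMPLE BERNOULLI (10) REPEATABLE (42);

    -- sample percentage supplied at execution time rather than plan time:
    PREPARE sample_pct(float4) AS
        SELECT * FROM my_table TABLESAMPLE BERNOULLI ($1) REPEATABLE (42);
    EXECUTE sample_pct(10);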

Make a full code-review pass over the tablesample additions, and fix
assorted bugs, omissions, infelicities, and cosmetic issues (such as
failure to put the added code stanzas in a consistent ordering).
Improve EXPLAIN's output of tablesample plans, too.
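
For instance, the plan for a sampled scan can be inspected with something
along these lines (table name illustrative; exact output format not
reproduced here), and should now show the sample scan node together with the
sampling method and its arguments:

    EXPLAIN (COSTS OFF)
      SELECT * FROM my_table TABLESAMPLE BERNOULLI (50) REPEATABLE (2);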

Back-patch to 9.5 so that we don't have to support the original API
in production.
Tom Lane
2015-07-25 14:39:00 -04:00
parent b26e3d660d
commit dd7a8f66ed
83 changed files with 3184 additions and 2589 deletions

src/backend/utils/tablesample/bernoulli.c → src/backend/access/tablesample/bernoulli.c

@@ -1,233 +1,231 @@
/*-------------------------------------------------------------------------
*
* bernoulli.c
* interface routines for BERNOULLI tablesample method
* support routines for BERNOULLI tablesample method
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* To ensure repeatability of samples, it is necessary that selection of a
* given tuple be history-independent; otherwise syncscanning would break
* repeatability, to say nothing of logically-irrelevant maintenance such
* as physical extension or shortening of the relation.
*
* To achieve that, we proceed by hashing each candidate TID together with
* the active seed, and then selecting it if the hash is less than the
* cutoff value computed from the selection probability by BeginSampleScan.
*
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/utils/tablesample/bernoulli.c
* src/backend/access/tablesample/bernoulli.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#ifdef _MSC_VER
#include <float.h> /* for _isnan */
#endif
#include <math.h>
#include "access/tablesample.h"
#include "access/relscan.h"
#include "nodes/execnodes.h"
#include "nodes/relation.h"
#include "access/hash.h"
#include "access/tsmapi.h"
#include "catalog/pg_type.h"
#include "optimizer/clauses.h"
#include "storage/bufmgr.h"
#include "utils/sampling.h"
#include "optimizer/cost.h"
#include "utils/builtins.h"
/* tsdesc */
/* Private state */
typedef struct
{
uint64 cutoff; /* select tuples with hash less than this */
uint32 seed; /* random seed */
BlockNumber startblock; /* starting block, we use this for syncscan
* support */
BlockNumber nblocks; /* number of blocks */
BlockNumber blockno; /* current block */
float4 probability; /* probability that tuple will be returned
* (0.0-1.0) */
OffsetNumber lt; /* last tuple returned from current block */
SamplerRandomState randstate; /* random generator state */
} BernoulliSamplerData;
static void bernoulli_samplescangetsamplesize(PlannerInfo *root,
RelOptInfo *baserel,
List *paramexprs,
BlockNumber *pages,
double *tuples);
static void bernoulli_initsamplescan(SampleScanState *node,
int eflags);
static void bernoulli_beginsamplescan(SampleScanState *node,
Datum *params,
int nparams,
uint32 seed);
static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node,
BlockNumber blockno,
OffsetNumber maxoffset);
/*
* Initialize the state.
* Create a TsmRoutine descriptor for the BERNOULLI method.
*/
Datum
tsm_bernoulli_init(PG_FUNCTION_ARGS)
tsm_bernoulli_handler(PG_FUNCTION_ARGS)
{
TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
uint32 seed = PG_GETARG_UINT32(1);
float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2);
HeapScanDesc scan = tsdesc->heapScan;
BernoulliSamplerData *sampler;
TsmRoutine *tsm = makeNode(TsmRoutine);
if (percent < 0 || percent > 100)
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("invalid sample size"),
errhint("Sample size must be numeric value between 0 and 100 (inclusive).")));
tsm->parameterTypes = list_make1_oid(FLOAT4OID);
tsm->repeatable_across_queries = true;
tsm->repeatable_across_scans = true;
tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize;
tsm->InitSampleScan = bernoulli_initsamplescan;
tsm->BeginSampleScan = bernoulli_beginsamplescan;
tsm->NextSampleBlock = NULL;
tsm->NextSampleTuple = bernoulli_nextsampletuple;
tsm->EndSampleScan = NULL;
sampler = palloc0(sizeof(BernoulliSamplerData));
/* Remember initial values for reinit */
sampler->seed = seed;
sampler->startblock = scan->rs_startblock;
sampler->nblocks = scan->rs_nblocks;
sampler->blockno = InvalidBlockNumber;
sampler->probability = percent / 100;
sampler->lt = InvalidOffsetNumber;
sampler_random_init_state(sampler->seed, sampler->randstate);
tsdesc->tsmdata = (void *) sampler;
PG_RETURN_VOID();
PG_RETURN_POINTER(tsm);
}
/*
* Get next block number to read or InvalidBlockNumber if we are at the
* end of the relation.
* Sample size estimation.
*/
Datum
tsm_bernoulli_nextblock(PG_FUNCTION_ARGS)
static void
bernoulli_samplescangetsamplesize(PlannerInfo *root,
RelOptInfo *baserel,
List *paramexprs,
BlockNumber *pages,
double *tuples)
{
TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
Node *pctnode;
float4 samplefract;
/*
* Bernoulli sampling scans all blocks of the table and supports syncscan,
* so loop from startblock around to startblock instead of from 0 to nblocks.
*/
if (sampler->blockno == InvalidBlockNumber)
sampler->blockno = sampler->startblock;
/* Try to extract an estimate for the sample percentage */
pctnode = (Node *) linitial(paramexprs);
pctnode = estimate_expression_value(root, pctnode);
if (IsA(pctnode, Const) &&
!((Const *) pctnode)->constisnull)
{
samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
samplefract /= 100.0f;
else
{
/* Default samplefract if the value is bogus */
samplefract = 0.1f;
}
}
else
{
sampler->blockno++;
if (sampler->blockno >= sampler->nblocks)
sampler->blockno = 0;
if (sampler->blockno == sampler->startblock)
PG_RETURN_UINT32(InvalidBlockNumber);
/* Default samplefract if we didn't obtain a non-null Const */
samplefract = 0.1f;
}
PG_RETURN_UINT32(sampler->blockno);
/* We'll visit all pages of the baserel */
*pages = baserel->pages;
*tuples = clamp_row_est(baserel->tuples * samplefract);
}
/*
* Get next tuple from current block.
* Initialize during executor setup.
*/
static void
bernoulli_initsamplescan(SampleScanState *node, int eflags)
{
node->tsm_state = palloc0(sizeof(BernoulliSamplerData));
}
/*
* Examine parameters and prepare for a sample scan.
*/
static void
bernoulli_beginsamplescan(SampleScanState *node,
Datum *params,
int nparams,
uint32 seed)
{
BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
double percent = DatumGetFloat4(params[0]);
if (percent < 0 || percent > 100 || isnan(percent))
ereport(ERROR,
(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
errmsg("sample percentage must be between 0 and 100")));
/*
* The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
* store that as a uint64, of course. Note that this gives strictly
* correct behavior at the limits of zero or one probability.
*/
sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
sampler->seed = seed;
sampler->lt = InvalidOffsetNumber;
/*
* Use bulkread, since we're scanning all pages. But pagemode visibility
* checking is a win only at larger sampling fractions. The 25% cutoff
* here is based on very limited experimentation.
*/
node->use_bulkread = true;
node->use_pagemode = (percent >= 25);
}
/*
* Select next sampled tuple in current block.
*
* This method implements the main logic in bernoulli sampling.
* The algorithm simply generates a new random number (in the 0.0-1.0 range)
* and, if it falls within the user-specified probability (in the same range),
* returns the tuple offset.
* It is OK here to return an offset without knowing if the tuple is visible
* (or even exists). The reason is that we do the coinflip for every tuple
* offset in the table. Since all tuples have the same probability of being
* returned, it doesn't matter if we do extra coinflips for invisible tuples.
*
* It is OK here to return a tuple offset without knowing whether the tuple is
* visible, and without checking it via examinetuple. The reason is that we do
* the coinflip (random number generation) for every tuple in the table. Since
* all tuples have the same probability of being returned, visible and
* invisible tuples will be returned in the same ratio as they occur in the
* actual table. This means that there is no skew towards either visible or
* invisible tuples, and the number of visible tuples returned from the
* executor node should match the fraction of visible tuples specified by the
* user.
*
* This is faster than doing the coinflip in examinetuple because we don't
* have to do visibility checks on uninteresting tuples.
*
* If we reach end of the block return InvalidOffsetNumber which tells
* When we reach end of the block, return InvalidOffsetNumber which tells
* SampleScan to go to next block.
*/
Datum
tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS)
static OffsetNumber
bernoulli_nextsampletuple(SampleScanState *node,
BlockNumber blockno,
OffsetNumber maxoffset)
{
TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
OffsetNumber maxoffset = PG_GETARG_UINT16(2);
BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
OffsetNumber tupoffset = sampler->lt;
float4 probability = sampler->probability;
uint32 hashinput[3];
/* Advance to first/next tuple in block */
if (tupoffset == InvalidOffsetNumber)
tupoffset = FirstOffsetNumber;
else
tupoffset++;
/*
* Loop over tuple offsets until the random generator returns a value that
* is within the probability of returning the tuple, or until we reach the
* end of the block.
* We compute the hash by applying hash_any to an array of 3 uint32's
* containing the block, offset, and seed. This is efficient to set up,
* and with the current implementation of hash_any, it gives
* machine-independent results, which is a nice property for regression
* testing.
*
* (This is our implementation of bernoulli trial)
* These words in the hash input are the same throughout the block:
*/
while (sampler_random_fract(sampler->randstate) > probability)
{
tupoffset++;
hashinput[0] = blockno;
hashinput[2] = sampler->seed;
if (tupoffset > maxoffset)
/*
* Loop over tuple offsets until finding suitable TID or reaching end of
* block.
*/
for (; tupoffset <= maxoffset; tupoffset++)
{
uint32 hash;
hashinput[1] = tupoffset;
hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
(int) sizeof(hashinput)));
if (hash < sampler->cutoff)
break;
}
if (tupoffset > maxoffset)
/* Tell SampleScan that we want next block. */
tupoffset = InvalidOffsetNumber;
sampler->lt = tupoffset;
PG_RETURN_UINT16(tupoffset);
}
/*
* Cleanup method.
*/
Datum
tsm_bernoulli_end(PG_FUNCTION_ARGS)
{
TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
pfree(tsdesc->tsmdata);
PG_RETURN_VOID();
}
/*
* Reset tsdesc (called by ReScan).
*/
Datum
tsm_bernoulli_reset(PG_FUNCTION_ARGS)
{
TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
sampler->blockno = InvalidBlockNumber;
sampler->lt = InvalidOffsetNumber;
sampler_random_init_state(sampler->seed, sampler->randstate);
PG_RETURN_VOID();
}
/*
* Costing function.
*/
Datum
tsm_bernoulli_cost(PG_FUNCTION_ARGS)
{
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
Path *path = (Path *) PG_GETARG_POINTER(1);
RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2);
List *args = (List *) PG_GETARG_POINTER(3);
BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4);
double *tuples = (double *) PG_GETARG_POINTER(5);
Node *pctnode;
float4 samplesize;
*pages = baserel->pages;
pctnode = linitial(args);
pctnode = estimate_expression_value(root, pctnode);
if (IsA(pctnode, RelabelType))
pctnode = (Node *) ((RelabelType *) pctnode)->arg;
if (IsA(pctnode, Const))
{
samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue);
samplesize /= 100.0;
}
else
{
/* Default samplesize if the estimation didn't return Const. */
samplesize = 0.1f;
}
*tuples = path->rows * samplesize;
path->rows = *tuples;
PG_RETURN_VOID();
return tupoffset;
}