mirror of
https://github.com/postgres/postgres.git
synced 2025-04-20 00:42:27 +03:00
Separate block sampling functions
Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek
This commit is contained in:
parent
5a3022fde0
commit
83e176ec18
@ -34,6 +34,7 @@
|
|||||||
#include "optimizer/var.h"
|
#include "optimizer/var.h"
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
#include "utils/rel.h"
|
#include "utils/rel.h"
|
||||||
|
#include "utils/sampling.h"
|
||||||
|
|
||||||
PG_MODULE_MAGIC;
|
PG_MODULE_MAGIC;
|
||||||
|
|
||||||
@ -1006,7 +1007,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
{
|
{
|
||||||
int numrows = 0;
|
int numrows = 0;
|
||||||
double rowstoskip = -1; /* -1 means not set yet */
|
double rowstoskip = -1; /* -1 means not set yet */
|
||||||
double rstate;
|
ReservoirStateData rstate;
|
||||||
TupleDesc tupDesc;
|
TupleDesc tupDesc;
|
||||||
Datum *values;
|
Datum *values;
|
||||||
bool *nulls;
|
bool *nulls;
|
||||||
@ -1044,7 +1045,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
ALLOCSET_DEFAULT_MAXSIZE);
|
ALLOCSET_DEFAULT_MAXSIZE);
|
||||||
|
|
||||||
/* Prepare for sampling rows */
|
/* Prepare for sampling rows */
|
||||||
rstate = anl_init_selection_state(targrows);
|
reservoir_init_selection_state(&rstate, targrows);
|
||||||
|
|
||||||
/* Set up callback to identify error line number. */
|
/* Set up callback to identify error line number. */
|
||||||
errcallback.callback = CopyFromErrorCallback;
|
errcallback.callback = CopyFromErrorCallback;
|
||||||
@ -1088,7 +1089,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
* not-yet-incremented value of totalrows as t.
|
* not-yet-incremented value of totalrows as t.
|
||||||
*/
|
*/
|
||||||
if (rowstoskip < 0)
|
if (rowstoskip < 0)
|
||||||
rowstoskip = anl_get_next_S(*totalrows, targrows, &rstate);
|
rowstoskip = reservoir_get_next_S(&rstate, *totalrows, targrows);
|
||||||
|
|
||||||
if (rowstoskip <= 0)
|
if (rowstoskip <= 0)
|
||||||
{
|
{
|
||||||
@ -1096,7 +1097,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
* Found a suitable tuple, so save it, replacing one old tuple
|
* Found a suitable tuple, so save it, replacing one old tuple
|
||||||
* at random
|
* at random
|
||||||
*/
|
*/
|
||||||
int k = (int) (targrows * anl_random_fract());
|
int k = (int) (targrows * sampler_random_fract());
|
||||||
|
|
||||||
Assert(k >= 0 && k < targrows);
|
Assert(k >= 0 && k < targrows);
|
||||||
heap_freetuple(rows[k]);
|
heap_freetuple(rows[k]);
|
||||||
|
@ -37,6 +37,7 @@
|
|||||||
#include "utils/lsyscache.h"
|
#include "utils/lsyscache.h"
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
#include "utils/rel.h"
|
#include "utils/rel.h"
|
||||||
|
#include "utils/sampling.h"
|
||||||
|
|
||||||
PG_MODULE_MAGIC;
|
PG_MODULE_MAGIC;
|
||||||
|
|
||||||
@ -202,7 +203,7 @@ typedef struct PgFdwAnalyzeState
|
|||||||
/* for random sampling */
|
/* for random sampling */
|
||||||
double samplerows; /* # of rows fetched */
|
double samplerows; /* # of rows fetched */
|
||||||
double rowstoskip; /* # of rows to skip before next sample */
|
double rowstoskip; /* # of rows to skip before next sample */
|
||||||
double rstate; /* random state */
|
ReservoirStateData rstate; /* state for reservoir sampling*/
|
||||||
|
|
||||||
/* working memory contexts */
|
/* working memory contexts */
|
||||||
MemoryContext anl_cxt; /* context for per-analyze lifespan data */
|
MemoryContext anl_cxt; /* context for per-analyze lifespan data */
|
||||||
@ -2411,7 +2412,7 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
|
|||||||
astate.numrows = 0;
|
astate.numrows = 0;
|
||||||
astate.samplerows = 0;
|
astate.samplerows = 0;
|
||||||
astate.rowstoskip = -1; /* -1 means not set yet */
|
astate.rowstoskip = -1; /* -1 means not set yet */
|
||||||
astate.rstate = anl_init_selection_state(targrows);
|
reservoir_init_selection_state(&astate.rstate, targrows);
|
||||||
|
|
||||||
/* Remember ANALYZE context, and create a per-tuple temp context */
|
/* Remember ANALYZE context, and create a per-tuple temp context */
|
||||||
astate.anl_cxt = CurrentMemoryContext;
|
astate.anl_cxt = CurrentMemoryContext;
|
||||||
@ -2551,13 +2552,12 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
|
|||||||
* analyze.c; see Jeff Vitter's paper.
|
* analyze.c; see Jeff Vitter's paper.
|
||||||
*/
|
*/
|
||||||
if (astate->rowstoskip < 0)
|
if (astate->rowstoskip < 0)
|
||||||
astate->rowstoskip = anl_get_next_S(astate->samplerows, targrows,
|
astate->rowstoskip = reservoir_get_next_S(&astate->rstate, astate->samplerows, targrows);
|
||||||
&astate->rstate);
|
|
||||||
|
|
||||||
if (astate->rowstoskip <= 0)
|
if (astate->rowstoskip <= 0)
|
||||||
{
|
{
|
||||||
/* Choose a random reservoir element to replace. */
|
/* Choose a random reservoir element to replace. */
|
||||||
pos = (int) (targrows * anl_random_fract());
|
pos = (int) (targrows * sampler_random_fract());
|
||||||
Assert(pos >= 0 && pos < targrows);
|
Assert(pos >= 0 && pos < targrows);
|
||||||
heap_freetuple(astate->rows[pos]);
|
heap_freetuple(astate->rows[pos]);
|
||||||
}
|
}
|
||||||
|
@ -50,23 +50,13 @@
|
|||||||
#include "utils/lsyscache.h"
|
#include "utils/lsyscache.h"
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
#include "utils/pg_rusage.h"
|
#include "utils/pg_rusage.h"
|
||||||
|
#include "utils/sampling.h"
|
||||||
#include "utils/sortsupport.h"
|
#include "utils/sortsupport.h"
|
||||||
#include "utils/syscache.h"
|
#include "utils/syscache.h"
|
||||||
#include "utils/timestamp.h"
|
#include "utils/timestamp.h"
|
||||||
#include "utils/tqual.h"
|
#include "utils/tqual.h"
|
||||||
|
|
||||||
|
|
||||||
/* Data structure for Algorithm S from Knuth 3.4.2 */
|
|
||||||
typedef struct
|
|
||||||
{
|
|
||||||
BlockNumber N; /* number of blocks, known in advance */
|
|
||||||
int n; /* desired sample size */
|
|
||||||
BlockNumber t; /* current block number */
|
|
||||||
int m; /* blocks selected so far */
|
|
||||||
} BlockSamplerData;
|
|
||||||
|
|
||||||
typedef BlockSamplerData *BlockSampler;
|
|
||||||
|
|
||||||
/* Per-index data for ANALYZE */
|
/* Per-index data for ANALYZE */
|
||||||
typedef struct AnlIndexData
|
typedef struct AnlIndexData
|
||||||
{
|
{
|
||||||
@ -89,10 +79,6 @@ static void do_analyze_rel(Relation onerel, int options,
|
|||||||
VacuumParams *params, List *va_cols,
|
VacuumParams *params, List *va_cols,
|
||||||
AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
|
AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
|
||||||
bool inh, bool in_outer_xact, int elevel);
|
bool inh, bool in_outer_xact, int elevel);
|
||||||
static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
|
|
||||||
int samplesize);
|
|
||||||
static bool BlockSampler_HasMore(BlockSampler bs);
|
|
||||||
static BlockNumber BlockSampler_Next(BlockSampler bs);
|
|
||||||
static void compute_index_stats(Relation onerel, double totalrows,
|
static void compute_index_stats(Relation onerel, double totalrows,
|
||||||
AnlIndexData *indexdata, int nindexes,
|
AnlIndexData *indexdata, int nindexes,
|
||||||
HeapTuple *rows, int numrows,
|
HeapTuple *rows, int numrows,
|
||||||
@ -950,94 +936,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
|
|||||||
return stats;
|
return stats;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* BlockSampler_Init -- prepare for random sampling of blocknumbers
|
|
||||||
*
|
|
||||||
* BlockSampler is used for stage one of our new two-stage tuple
|
|
||||||
* sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
|
|
||||||
* "Large DB"). It selects a random sample of samplesize blocks out of
|
|
||||||
* the nblocks blocks in the table. If the table has less than
|
|
||||||
* samplesize blocks, all blocks are selected.
|
|
||||||
*
|
|
||||||
* Since we know the total number of blocks in advance, we can use the
|
|
||||||
* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
|
|
||||||
* algorithm.
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
|
|
||||||
{
|
|
||||||
bs->N = nblocks; /* measured table size */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If we decide to reduce samplesize for tables that have less or not much
|
|
||||||
* more than samplesize blocks, here is the place to do it.
|
|
||||||
*/
|
|
||||||
bs->n = samplesize;
|
|
||||||
bs->t = 0; /* blocks scanned so far */
|
|
||||||
bs->m = 0; /* blocks selected so far */
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
BlockSampler_HasMore(BlockSampler bs)
|
|
||||||
{
|
|
||||||
return (bs->t < bs->N) && (bs->m < bs->n);
|
|
||||||
}
|
|
||||||
|
|
||||||
static BlockNumber
|
|
||||||
BlockSampler_Next(BlockSampler bs)
|
|
||||||
{
|
|
||||||
BlockNumber K = bs->N - bs->t; /* remaining blocks */
|
|
||||||
int k = bs->n - bs->m; /* blocks still to sample */
|
|
||||||
double p; /* probability to skip block */
|
|
||||||
double V; /* random */
|
|
||||||
|
|
||||||
Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
|
|
||||||
|
|
||||||
if ((BlockNumber) k >= K)
|
|
||||||
{
|
|
||||||
/* need all the rest */
|
|
||||||
bs->m++;
|
|
||||||
return bs->t++;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*----------
|
|
||||||
* It is not obvious that this code matches Knuth's Algorithm S.
|
|
||||||
* Knuth says to skip the current block with probability 1 - k/K.
|
|
||||||
* If we are to skip, we should advance t (hence decrease K), and
|
|
||||||
* repeat the same probabilistic test for the next block. The naive
|
|
||||||
* implementation thus requires an anl_random_fract() call for each block
|
|
||||||
* number. But we can reduce this to one anl_random_fract() call per
|
|
||||||
* selected block, by noting that each time the while-test succeeds,
|
|
||||||
* we can reinterpret V as a uniform random number in the range 0 to p.
|
|
||||||
* Therefore, instead of choosing a new V, we just adjust p to be
|
|
||||||
* the appropriate fraction of its former value, and our next loop
|
|
||||||
* makes the appropriate probabilistic test.
|
|
||||||
*
|
|
||||||
* We have initially K > k > 0. If the loop reduces K to equal k,
|
|
||||||
* the next while-test must fail since p will become exactly zero
|
|
||||||
* (we assume there will not be roundoff error in the division).
|
|
||||||
* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
|
|
||||||
* to be doubly sure about roundoff error.) Therefore K cannot become
|
|
||||||
* less than k, which means that we cannot fail to select enough blocks.
|
|
||||||
*----------
|
|
||||||
*/
|
|
||||||
V = anl_random_fract();
|
|
||||||
p = 1.0 - (double) k / (double) K;
|
|
||||||
while (V < p)
|
|
||||||
{
|
|
||||||
/* skip */
|
|
||||||
bs->t++;
|
|
||||||
K--; /* keep K == N - t */
|
|
||||||
|
|
||||||
/* adjust p to be new cutoff point in reduced range */
|
|
||||||
p *= 1.0 - (double) k / (double) K;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* select */
|
|
||||||
bs->m++;
|
|
||||||
return bs->t++;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* acquire_sample_rows -- acquire a random sample of rows from the table
|
* acquire_sample_rows -- acquire a random sample of rows from the table
|
||||||
*
|
*
|
||||||
@ -1084,7 +982,7 @@ acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
BlockNumber totalblocks;
|
BlockNumber totalblocks;
|
||||||
TransactionId OldestXmin;
|
TransactionId OldestXmin;
|
||||||
BlockSamplerData bs;
|
BlockSamplerData bs;
|
||||||
double rstate;
|
ReservoirStateData rstate;
|
||||||
|
|
||||||
Assert(targrows > 0);
|
Assert(targrows > 0);
|
||||||
|
|
||||||
@ -1094,9 +992,9 @@ acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
OldestXmin = GetOldestXmin(onerel, true);
|
OldestXmin = GetOldestXmin(onerel, true);
|
||||||
|
|
||||||
/* Prepare for sampling block numbers */
|
/* Prepare for sampling block numbers */
|
||||||
BlockSampler_Init(&bs, totalblocks, targrows);
|
BlockSampler_Init(&bs, totalblocks, targrows, random());
|
||||||
/* Prepare for sampling rows */
|
/* Prepare for sampling rows */
|
||||||
rstate = anl_init_selection_state(targrows);
|
reservoir_init_selection_state(&rstate, targrows);
|
||||||
|
|
||||||
/* Outer loop over blocks to sample */
|
/* Outer loop over blocks to sample */
|
||||||
while (BlockSampler_HasMore(&bs))
|
while (BlockSampler_HasMore(&bs))
|
||||||
@ -1244,8 +1142,7 @@ acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
* t.
|
* t.
|
||||||
*/
|
*/
|
||||||
if (rowstoskip < 0)
|
if (rowstoskip < 0)
|
||||||
rowstoskip = anl_get_next_S(samplerows, targrows,
|
rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows);
|
||||||
&rstate);
|
|
||||||
|
|
||||||
if (rowstoskip <= 0)
|
if (rowstoskip <= 0)
|
||||||
{
|
{
|
||||||
@ -1253,7 +1150,7 @@ acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
* Found a suitable tuple, so save it, replacing one
|
* Found a suitable tuple, so save it, replacing one
|
||||||
* old tuple at random
|
* old tuple at random
|
||||||
*/
|
*/
|
||||||
int k = (int) (targrows * anl_random_fract());
|
int k = (int) (targrows * sampler_random_fract());
|
||||||
|
|
||||||
Assert(k >= 0 && k < targrows);
|
Assert(k >= 0 && k < targrows);
|
||||||
heap_freetuple(rows[k]);
|
heap_freetuple(rows[k]);
|
||||||
@ -1312,116 +1209,6 @@ acquire_sample_rows(Relation onerel, int elevel,
|
|||||||
return numrows;
|
return numrows;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Select a random value R uniformly distributed in (0 - 1) */
|
|
||||||
double
|
|
||||||
anl_random_fract(void)
|
|
||||||
{
|
|
||||||
return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* These two routines embody Algorithm Z from "Random sampling with a
|
|
||||||
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
|
|
||||||
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
|
|
||||||
* of the count S of records to skip before processing another record.
|
|
||||||
* It is computed primarily based on t, the number of records already read.
|
|
||||||
* The only extra state needed between calls is W, a random state variable.
|
|
||||||
*
|
|
||||||
* anl_init_selection_state computes the initial W value.
|
|
||||||
*
|
|
||||||
* Given that we've already read t records (t >= n), anl_get_next_S
|
|
||||||
* determines the number of records to skip before the next record is
|
|
||||||
* processed.
|
|
||||||
*/
|
|
||||||
double
|
|
||||||
anl_init_selection_state(int n)
|
|
||||||
{
|
|
||||||
/* Initial value of W (for use when Algorithm Z is first applied) */
|
|
||||||
return exp(-log(anl_random_fract()) / n);
|
|
||||||
}
|
|
||||||
|
|
||||||
double
|
|
||||||
anl_get_next_S(double t, int n, double *stateptr)
|
|
||||||
{
|
|
||||||
double S;
|
|
||||||
|
|
||||||
/* The magic constant here is T from Vitter's paper */
|
|
||||||
if (t <= (22.0 * n))
|
|
||||||
{
|
|
||||||
/* Process records using Algorithm X until t is large enough */
|
|
||||||
double V,
|
|
||||||
quot;
|
|
||||||
|
|
||||||
V = anl_random_fract(); /* Generate V */
|
|
||||||
S = 0;
|
|
||||||
t += 1;
|
|
||||||
/* Note: "num" in Vitter's code is always equal to t - n */
|
|
||||||
quot = (t - (double) n) / t;
|
|
||||||
/* Find min S satisfying (4.1) */
|
|
||||||
while (quot > V)
|
|
||||||
{
|
|
||||||
S += 1;
|
|
||||||
t += 1;
|
|
||||||
quot *= (t - (double) n) / t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* Now apply Algorithm Z */
|
|
||||||
double W = *stateptr;
|
|
||||||
double term = t - (double) n + 1;
|
|
||||||
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
double numer,
|
|
||||||
numer_lim,
|
|
||||||
denom;
|
|
||||||
double U,
|
|
||||||
X,
|
|
||||||
lhs,
|
|
||||||
rhs,
|
|
||||||
y,
|
|
||||||
tmp;
|
|
||||||
|
|
||||||
/* Generate U and X */
|
|
||||||
U = anl_random_fract();
|
|
||||||
X = t * (W - 1.0);
|
|
||||||
S = floor(X); /* S is tentatively set to floor(X) */
|
|
||||||
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
|
|
||||||
tmp = (t + 1) / term;
|
|
||||||
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
|
|
||||||
rhs = (((t + X) / (term + S)) * term) / t;
|
|
||||||
if (lhs <= rhs)
|
|
||||||
{
|
|
||||||
W = rhs / lhs;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/* Test if U <= f(S)/cg(X) */
|
|
||||||
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
|
|
||||||
if ((double) n < S)
|
|
||||||
{
|
|
||||||
denom = t;
|
|
||||||
numer_lim = term + S;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
denom = t - (double) n + S;
|
|
||||||
numer_lim = t + 1;
|
|
||||||
}
|
|
||||||
for (numer = t + S; numer >= numer_lim; numer -= 1)
|
|
||||||
{
|
|
||||||
y *= numer / denom;
|
|
||||||
denom -= 1;
|
|
||||||
}
|
|
||||||
W = exp(-log(anl_random_fract()) / n); /* Generate W in advance */
|
|
||||||
if (exp(log(y) / n) <= (t + X) / t)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
*stateptr = W;
|
|
||||||
}
|
|
||||||
return S;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* qsort comparator for sorting rows[] array
|
* qsort comparator for sorting rows[] array
|
||||||
*/
|
*/
|
||||||
|
@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global
|
|||||||
override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
|
override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
|
||||||
|
|
||||||
OBJS = guc.o help_config.o pg_rusage.o ps_status.o rls.o \
|
OBJS = guc.o help_config.o pg_rusage.o ps_status.o rls.o \
|
||||||
superuser.o timeout.o tzparser.o
|
sampling.o superuser.o timeout.o tzparser.o
|
||||||
|
|
||||||
# This location might depend on the installation directories. Therefore
|
# This location might depend on the installation directories. Therefore
|
||||||
# we can't subsitute it into pg_config.h.
|
# we can't subsitute it into pg_config.h.
|
||||||
|
226
src/backend/utils/misc/sampling.c
Normal file
226
src/backend/utils/misc/sampling.c
Normal file
@ -0,0 +1,226 @@
|
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* sampling.c
|
||||||
|
* Relation block sampling routines.
|
||||||
|
*
|
||||||
|
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* IDENTIFICATION
|
||||||
|
* src/backend/utils/misc/sampling.c
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "utils/sampling.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BlockSampler_Init -- prepare for random sampling of blocknumbers
|
||||||
|
*
|
||||||
|
* BlockSampler provides algorithm for block level sampling of a relation
|
||||||
|
* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
|
||||||
|
* It selects a random sample of samplesize blocks out of
|
||||||
|
* the nblocks blocks in the table. If the table has less than
|
||||||
|
* samplesize blocks, all blocks are selected.
|
||||||
|
*
|
||||||
|
* Since we know the total number of blocks in advance, we can use the
|
||||||
|
* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
|
||||||
|
* algorithm.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
|
||||||
|
long randseed)
|
||||||
|
{
|
||||||
|
bs->N = nblocks; /* measured table size */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we decide to reduce samplesize for tables that have less or not much
|
||||||
|
* more than samplesize blocks, here is the place to do it.
|
||||||
|
*/
|
||||||
|
bs->n = samplesize;
|
||||||
|
bs->t = 0; /* blocks scanned so far */
|
||||||
|
bs->m = 0; /* blocks selected so far */
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
BlockSampler_HasMore(BlockSampler bs)
|
||||||
|
{
|
||||||
|
return (bs->t < bs->N) && (bs->m < bs->n);
|
||||||
|
}
|
||||||
|
|
||||||
|
BlockNumber
|
||||||
|
BlockSampler_Next(BlockSampler bs)
|
||||||
|
{
|
||||||
|
BlockNumber K = bs->N - bs->t; /* remaining blocks */
|
||||||
|
int k = bs->n - bs->m; /* blocks still to sample */
|
||||||
|
double p; /* probability to skip block */
|
||||||
|
double V; /* random */
|
||||||
|
|
||||||
|
Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
|
||||||
|
|
||||||
|
if ((BlockNumber) k >= K)
|
||||||
|
{
|
||||||
|
/* need all the rest */
|
||||||
|
bs->m++;
|
||||||
|
return bs->t++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*----------
|
||||||
|
* It is not obvious that this code matches Knuth's Algorithm S.
|
||||||
|
* Knuth says to skip the current block with probability 1 - k/K.
|
||||||
|
* If we are to skip, we should advance t (hence decrease K), and
|
||||||
|
* repeat the same probabilistic test for the next block. The naive
|
||||||
|
* implementation thus requires an sampler_random_fract() call for each
|
||||||
|
* block number. But we can reduce this to one sampler_random_fract()
|
||||||
|
* call per selected block, by noting that each time the while-test
|
||||||
|
* succeeds, we can reinterpret V as a uniform random number in the range
|
||||||
|
* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
|
||||||
|
* the appropriate fraction of its former value, and our next loop
|
||||||
|
* makes the appropriate probabilistic test.
|
||||||
|
*
|
||||||
|
* We have initially K > k > 0. If the loop reduces K to equal k,
|
||||||
|
* the next while-test must fail since p will become exactly zero
|
||||||
|
* (we assume there will not be roundoff error in the division).
|
||||||
|
* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
|
||||||
|
* to be doubly sure about roundoff error.) Therefore K cannot become
|
||||||
|
* less than k, which means that we cannot fail to select enough blocks.
|
||||||
|
*----------
|
||||||
|
*/
|
||||||
|
V = sampler_random_fract();
|
||||||
|
p = 1.0 - (double) k / (double) K;
|
||||||
|
while (V < p)
|
||||||
|
{
|
||||||
|
/* skip */
|
||||||
|
bs->t++;
|
||||||
|
K--; /* keep K == N - t */
|
||||||
|
|
||||||
|
/* adjust p to be new cutoff point in reduced range */
|
||||||
|
p *= 1.0 - (double) k / (double) K;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* select */
|
||||||
|
bs->m++;
|
||||||
|
return bs->t++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* These two routines embody Algorithm Z from "Random sampling with a
|
||||||
|
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
|
||||||
|
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
|
||||||
|
* of the count S of records to skip before processing another record.
|
||||||
|
* It is computed primarily based on t, the number of records already read.
|
||||||
|
* The only extra state needed between calls is W, a random state variable.
|
||||||
|
*
|
||||||
|
* reservoir_init_selection_state computes the initial W value.
|
||||||
|
*
|
||||||
|
* Given that we've already read t records (t >= n), reservoir_get_next_S
|
||||||
|
* determines the number of records to skip before the next record is
|
||||||
|
* processed.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
reservoir_init_selection_state(ReservoirState rs, int n)
|
||||||
|
{
|
||||||
|
/* Initial value of W (for use when Algorithm Z is first applied) */
|
||||||
|
*rs = exp(-log(sampler_random_fract()) / n);
|
||||||
|
}
|
||||||
|
|
||||||
|
double
|
||||||
|
reservoir_get_next_S(ReservoirState rs, double t, int n)
|
||||||
|
{
|
||||||
|
double S;
|
||||||
|
|
||||||
|
/* The magic constant here is T from Vitter's paper */
|
||||||
|
if (t <= (22.0 * n))
|
||||||
|
{
|
||||||
|
/* Process records using Algorithm X until t is large enough */
|
||||||
|
double V,
|
||||||
|
quot;
|
||||||
|
|
||||||
|
V = sampler_random_fract(); /* Generate V */
|
||||||
|
S = 0;
|
||||||
|
t += 1;
|
||||||
|
/* Note: "num" in Vitter's code is always equal to t - n */
|
||||||
|
quot = (t - (double) n) / t;
|
||||||
|
/* Find min S satisfying (4.1) */
|
||||||
|
while (quot > V)
|
||||||
|
{
|
||||||
|
S += 1;
|
||||||
|
t += 1;
|
||||||
|
quot *= (t - (double) n) / t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Now apply Algorithm Z */
|
||||||
|
double W = *rs;
|
||||||
|
double term = t - (double) n + 1;
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
double numer,
|
||||||
|
numer_lim,
|
||||||
|
denom;
|
||||||
|
double U,
|
||||||
|
X,
|
||||||
|
lhs,
|
||||||
|
rhs,
|
||||||
|
y,
|
||||||
|
tmp;
|
||||||
|
|
||||||
|
/* Generate U and X */
|
||||||
|
U = sampler_random_fract();
|
||||||
|
X = t * (W - 1.0);
|
||||||
|
S = floor(X); /* S is tentatively set to floor(X) */
|
||||||
|
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
|
||||||
|
tmp = (t + 1) / term;
|
||||||
|
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
|
||||||
|
rhs = (((t + X) / (term + S)) * term) / t;
|
||||||
|
if (lhs <= rhs)
|
||||||
|
{
|
||||||
|
W = rhs / lhs;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* Test if U <= f(S)/cg(X) */
|
||||||
|
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
|
||||||
|
if ((double) n < S)
|
||||||
|
{
|
||||||
|
denom = t;
|
||||||
|
numer_lim = term + S;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
denom = t - (double) n + S;
|
||||||
|
numer_lim = t + 1;
|
||||||
|
}
|
||||||
|
for (numer = t + S; numer >= numer_lim; numer -= 1)
|
||||||
|
{
|
||||||
|
y *= numer / denom;
|
||||||
|
denom -= 1;
|
||||||
|
}
|
||||||
|
W = exp(-log(sampler_random_fract()) / n); /* Generate W in advance */
|
||||||
|
if (exp(log(y) / n) <= (t + X) / t)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
*rs = W;
|
||||||
|
}
|
||||||
|
return S;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*----------
|
||||||
|
* Random number generator used by sampling
|
||||||
|
*----------
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Select a random value R uniformly distributed in (0 - 1) */
|
||||||
|
double
|
||||||
|
sampler_random_fract()
|
||||||
|
{
|
||||||
|
return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
|
||||||
|
}
|
@ -197,8 +197,5 @@ extern void analyze_rel(Oid relid, RangeVar *relation, int options,
|
|||||||
VacuumParams *params, List *va_cols, bool in_outer_xact,
|
VacuumParams *params, List *va_cols, bool in_outer_xact,
|
||||||
BufferAccessStrategy bstrategy);
|
BufferAccessStrategy bstrategy);
|
||||||
extern bool std_typanalyze(VacAttrStats *stats);
|
extern bool std_typanalyze(VacAttrStats *stats);
|
||||||
extern double anl_random_fract(void);
|
|
||||||
extern double anl_init_selection_state(int n);
|
|
||||||
extern double anl_get_next_S(double t, int n, double *stateptr);
|
|
||||||
|
|
||||||
#endif /* VACUUM_H */
|
#endif /* VACUUM_H */
|
||||||
|
44
src/include/utils/sampling.h
Normal file
44
src/include/utils/sampling.h
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* sampling.h
|
||||||
|
* definitions for sampling functions
|
||||||
|
*
|
||||||
|
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
|
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
|
*
|
||||||
|
* src/include/utils/sampling.h
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
#ifndef SAMPLING_H
|
||||||
|
#define SAMPLING_H
|
||||||
|
|
||||||
|
#include "storage/bufmgr.h"
|
||||||
|
|
||||||
|
extern double sampler_random_fract(void);
|
||||||
|
|
||||||
|
/* Block sampling methods */
|
||||||
|
/* Data structure for Algorithm S from Knuth 3.4.2 */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
BlockNumber N; /* number of blocks, known in advance */
|
||||||
|
int n; /* desired sample size */
|
||||||
|
BlockNumber t; /* current block number */
|
||||||
|
int m; /* blocks selected so far */
|
||||||
|
} BlockSamplerData;
|
||||||
|
|
||||||
|
typedef BlockSamplerData *BlockSampler;
|
||||||
|
|
||||||
|
extern void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
|
||||||
|
int samplesize, long randseed);
|
||||||
|
extern bool BlockSampler_HasMore(BlockSampler bs);
|
||||||
|
extern BlockNumber BlockSampler_Next(BlockSampler bs);
|
||||||
|
|
||||||
|
/* Reservoid sampling methods */
|
||||||
|
typedef double ReservoirStateData;
|
||||||
|
typedef ReservoirStateData *ReservoirState;
|
||||||
|
|
||||||
|
extern void reservoir_init_selection_state(ReservoirState rs, int n);
|
||||||
|
extern double reservoir_get_next_S(ReservoirState rs, double t, int n);
|
||||||
|
|
||||||
|
#endif /* SAMPLING_H */
|
Loading…
x
Reference in New Issue
Block a user