mirror of
https://github.com/postgres/postgres.git
synced 2025-09-08 00:47:37 +03:00
Separate block sampling functions
Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek
This commit is contained in:
226
src/backend/utils/misc/sampling.c
Normal file
226
src/backend/utils/misc/sampling.c
Normal file
@@ -0,0 +1,226 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* sampling.c
|
||||
* Relation block sampling routines.
|
||||
*
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/utils/misc/sampling.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "utils/sampling.h"
|
||||
|
||||
|
||||
/*
|
||||
* BlockSampler_Init -- prepare for random sampling of blocknumbers
|
||||
*
|
||||
* BlockSampler provides algorithm for block level sampling of a relation
|
||||
* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
|
||||
* It selects a random sample of samplesize blocks out of
|
||||
* the nblocks blocks in the table. If the table has less than
|
||||
* samplesize blocks, all blocks are selected.
|
||||
*
|
||||
* Since we know the total number of blocks in advance, we can use the
|
||||
* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
|
||||
* algorithm.
|
||||
*/
|
||||
void
|
||||
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
|
||||
long randseed)
|
||||
{
|
||||
bs->N = nblocks; /* measured table size */
|
||||
|
||||
/*
|
||||
* If we decide to reduce samplesize for tables that have less or not much
|
||||
* more than samplesize blocks, here is the place to do it.
|
||||
*/
|
||||
bs->n = samplesize;
|
||||
bs->t = 0; /* blocks scanned so far */
|
||||
bs->m = 0; /* blocks selected so far */
|
||||
}
|
||||
|
||||
bool
|
||||
BlockSampler_HasMore(BlockSampler bs)
|
||||
{
|
||||
return (bs->t < bs->N) && (bs->m < bs->n);
|
||||
}
|
||||
|
||||
BlockNumber
|
||||
BlockSampler_Next(BlockSampler bs)
|
||||
{
|
||||
BlockNumber K = bs->N - bs->t; /* remaining blocks */
|
||||
int k = bs->n - bs->m; /* blocks still to sample */
|
||||
double p; /* probability to skip block */
|
||||
double V; /* random */
|
||||
|
||||
Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
|
||||
|
||||
if ((BlockNumber) k >= K)
|
||||
{
|
||||
/* need all the rest */
|
||||
bs->m++;
|
||||
return bs->t++;
|
||||
}
|
||||
|
||||
/*----------
|
||||
* It is not obvious that this code matches Knuth's Algorithm S.
|
||||
* Knuth says to skip the current block with probability 1 - k/K.
|
||||
* If we are to skip, we should advance t (hence decrease K), and
|
||||
* repeat the same probabilistic test for the next block. The naive
|
||||
* implementation thus requires an sampler_random_fract() call for each
|
||||
* block number. But we can reduce this to one sampler_random_fract()
|
||||
* call per selected block, by noting that each time the while-test
|
||||
* succeeds, we can reinterpret V as a uniform random number in the range
|
||||
* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
|
||||
* the appropriate fraction of its former value, and our next loop
|
||||
* makes the appropriate probabilistic test.
|
||||
*
|
||||
* We have initially K > k > 0. If the loop reduces K to equal k,
|
||||
* the next while-test must fail since p will become exactly zero
|
||||
* (we assume there will not be roundoff error in the division).
|
||||
* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
|
||||
* to be doubly sure about roundoff error.) Therefore K cannot become
|
||||
* less than k, which means that we cannot fail to select enough blocks.
|
||||
*----------
|
||||
*/
|
||||
V = sampler_random_fract();
|
||||
p = 1.0 - (double) k / (double) K;
|
||||
while (V < p)
|
||||
{
|
||||
/* skip */
|
||||
bs->t++;
|
||||
K--; /* keep K == N - t */
|
||||
|
||||
/* adjust p to be new cutoff point in reduced range */
|
||||
p *= 1.0 - (double) k / (double) K;
|
||||
}
|
||||
|
||||
/* select */
|
||||
bs->m++;
|
||||
return bs->t++;
|
||||
}
|
||||
|
||||
/*
|
||||
* These two routines embody Algorithm Z from "Random sampling with a
|
||||
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
|
||||
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
|
||||
* of the count S of records to skip before processing another record.
|
||||
* It is computed primarily based on t, the number of records already read.
|
||||
* The only extra state needed between calls is W, a random state variable.
|
||||
*
|
||||
* reservoir_init_selection_state computes the initial W value.
|
||||
*
|
||||
* Given that we've already read t records (t >= n), reservoir_get_next_S
|
||||
* determines the number of records to skip before the next record is
|
||||
* processed.
|
||||
*/
|
||||
void
|
||||
reservoir_init_selection_state(ReservoirState rs, int n)
|
||||
{
|
||||
/* Initial value of W (for use when Algorithm Z is first applied) */
|
||||
*rs = exp(-log(sampler_random_fract()) / n);
|
||||
}
|
||||
|
||||
double
|
||||
reservoir_get_next_S(ReservoirState rs, double t, int n)
|
||||
{
|
||||
double S;
|
||||
|
||||
/* The magic constant here is T from Vitter's paper */
|
||||
if (t <= (22.0 * n))
|
||||
{
|
||||
/* Process records using Algorithm X until t is large enough */
|
||||
double V,
|
||||
quot;
|
||||
|
||||
V = sampler_random_fract(); /* Generate V */
|
||||
S = 0;
|
||||
t += 1;
|
||||
/* Note: "num" in Vitter's code is always equal to t - n */
|
||||
quot = (t - (double) n) / t;
|
||||
/* Find min S satisfying (4.1) */
|
||||
while (quot > V)
|
||||
{
|
||||
S += 1;
|
||||
t += 1;
|
||||
quot *= (t - (double) n) / t;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Now apply Algorithm Z */
|
||||
double W = *rs;
|
||||
double term = t - (double) n + 1;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
double numer,
|
||||
numer_lim,
|
||||
denom;
|
||||
double U,
|
||||
X,
|
||||
lhs,
|
||||
rhs,
|
||||
y,
|
||||
tmp;
|
||||
|
||||
/* Generate U and X */
|
||||
U = sampler_random_fract();
|
||||
X = t * (W - 1.0);
|
||||
S = floor(X); /* S is tentatively set to floor(X) */
|
||||
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
|
||||
tmp = (t + 1) / term;
|
||||
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
|
||||
rhs = (((t + X) / (term + S)) * term) / t;
|
||||
if (lhs <= rhs)
|
||||
{
|
||||
W = rhs / lhs;
|
||||
break;
|
||||
}
|
||||
/* Test if U <= f(S)/cg(X) */
|
||||
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
|
||||
if ((double) n < S)
|
||||
{
|
||||
denom = t;
|
||||
numer_lim = term + S;
|
||||
}
|
||||
else
|
||||
{
|
||||
denom = t - (double) n + S;
|
||||
numer_lim = t + 1;
|
||||
}
|
||||
for (numer = t + S; numer >= numer_lim; numer -= 1)
|
||||
{
|
||||
y *= numer / denom;
|
||||
denom -= 1;
|
||||
}
|
||||
W = exp(-log(sampler_random_fract()) / n); /* Generate W in advance */
|
||||
if (exp(log(y) / n) <= (t + X) / t)
|
||||
break;
|
||||
}
|
||||
*rs = W;
|
||||
}
|
||||
return S;
|
||||
}
|
||||
|
||||
|
||||
/*----------
|
||||
* Random number generator used by sampling
|
||||
*----------
|
||||
*/
|
||||
|
||||
/* Select a random value R uniformly distributed in (0 - 1) */
|
||||
double
|
||||
sampler_random_fract()
|
||||
{
|
||||
return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
|
||||
}
|
Reference in New Issue
Block a user