Separate block sampling functions

Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek
2025-07-08 11:42:09 +03:00 · 2015-05-15 04:02:54 +02:00
parent 5a3022fde0
commit 83e176ec18
7 changed files with 287 additions and 232 deletions
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@ -50,23 +50,13 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/pg_rusage.h"
+#include "utils/sampling.h"
 #include "utils/sortsupport.h"
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
 #include "utils/tqual.h"


-/* Data structure for Algorithm S from Knuth 3.4.2 */
-typedef struct
-{
-	BlockNumber N;				/* number of blocks, known in advance */
-	int			n;				/* desired sample size */
-	BlockNumber t;				/* current block number */
-	int			m;				/* blocks selected so far */
-} BlockSamplerData;
-
-typedef BlockSamplerData *BlockSampler;
-
 /* Per-index data for ANALYZE */
 typedef struct AnlIndexData
 {
@ -89,10 +79,6 @@ static void do_analyze_rel(Relation onerel, int options,
 			   VacuumParams *params, List *va_cols,
 			   AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
 			   bool inh, bool in_outer_xact, int elevel);
-static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
-				  int samplesize);
-static bool BlockSampler_HasMore(BlockSampler bs);
-static BlockNumber BlockSampler_Next(BlockSampler bs);
 static void compute_index_stats(Relation onerel, double totalrows,
 					AnlIndexData *indexdata, int nindexes,
 					HeapTuple *rows, int numrows,
@ -950,94 +936,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
 	return stats;
 }

-/*
- * BlockSampler_Init -- prepare for random sampling of blocknumbers
- *
- * BlockSampler is used for stage one of our new two-stage tuple
- * sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
- * "Large DB").  It selects a random sample of samplesize blocks out of
- * the nblocks blocks in the table.  If the table has less than
- * samplesize blocks, all blocks are selected.
- *
- * Since we know the total number of blocks in advance, we can use the
- * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
- * algorithm.
- */
-static void
-BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
-{
-	bs->N = nblocks;			/* measured table size */
-
-	/*
-	 * If we decide to reduce samplesize for tables that have less or not much
-	 * more than samplesize blocks, here is the place to do it.
-	 */
-	bs->n = samplesize;
-	bs->t = 0;					/* blocks scanned so far */
-	bs->m = 0;					/* blocks selected so far */
-}
-
-static bool
-BlockSampler_HasMore(BlockSampler bs)
-{
-	return (bs->t < bs->N) && (bs->m < bs->n);
-}
-
-static BlockNumber
-BlockSampler_Next(BlockSampler bs)
-{
-	BlockNumber K = bs->N - bs->t;		/* remaining blocks */
-	int			k = bs->n - bs->m;		/* blocks still to sample */
-	double		p;				/* probability to skip block */
-	double		V;				/* random */
-
-	Assert(BlockSampler_HasMore(bs));	/* hence K > 0 and k > 0 */
-
-	if ((BlockNumber) k >= K)
-	{
-		/* need all the rest */
-		bs->m++;
-		return bs->t++;
-	}
-
-	/*----------
-	 * It is not obvious that this code matches Knuth's Algorithm S.
-	 * Knuth says to skip the current block with probability 1 - k/K.
-	 * If we are to skip, we should advance t (hence decrease K), and
-	 * repeat the same probabilistic test for the next block.  The naive
-	 * implementation thus requires an anl_random_fract() call for each block
-	 * number.  But we can reduce this to one anl_random_fract() call per
-	 * selected block, by noting that each time the while-test succeeds,
-	 * we can reinterpret V as a uniform random number in the range 0 to p.
-	 * Therefore, instead of choosing a new V, we just adjust p to be
-	 * the appropriate fraction of its former value, and our next loop
-	 * makes the appropriate probabilistic test.
-	 *
-	 * We have initially K > k > 0.  If the loop reduces K to equal k,
-	 * the next while-test must fail since p will become exactly zero
-	 * (we assume there will not be roundoff error in the division).
-	 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
-	 * to be doubly sure about roundoff error.)  Therefore K cannot become
-	 * less than k, which means that we cannot fail to select enough blocks.
-	 *----------
-	 */
-	V = anl_random_fract();
-	p = 1.0 - (double) k / (double) K;
-	while (V < p)
-	{
-		/* skip */
-		bs->t++;
-		K--;					/* keep K == N - t */
-
-		/* adjust p to be new cutoff point in reduced range */
-		p *= 1.0 - (double) k / (double) K;
-	}
-
-	/* select */
-	bs->m++;
-	return bs->t++;
-}
-
 /*
 * acquire_sample_rows -- acquire a random sample of rows from the table
 *
@ -1084,7 +982,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 	BlockNumber totalblocks;
 	TransactionId OldestXmin;
 	BlockSamplerData bs;
-	double		rstate;
+	ReservoirStateData rstate;

 	Assert(targrows > 0);

@ -1094,9 +992,9 @@ acquire_sample_rows(Relation onerel, int elevel,
 	OldestXmin = GetOldestXmin(onerel, true);

 	/* Prepare for sampling block numbers */
-	BlockSampler_Init(&bs, totalblocks, targrows);
+	BlockSampler_Init(&bs, totalblocks, targrows, random());
 	/* Prepare for sampling rows */
-	rstate = anl_init_selection_state(targrows);
+	reservoir_init_selection_state(&rstate, targrows);

 	/* Outer loop over blocks to sample */
 	while (BlockSampler_HasMore(&bs))
@ -1244,8 +1142,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 					 * t.
 					 */
 					if (rowstoskip < 0)
-						rowstoskip = anl_get_next_S(samplerows, targrows,
-													&rstate);
+						rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows);

 					if (rowstoskip <= 0)
 					{
@ -1253,7 +1150,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 						 * Found a suitable tuple, so save it, replacing one
 						 * old tuple at random
 						 */
-						int			k = (int) (targrows * anl_random_fract());
+						int			k = (int) (targrows * sampler_random_fract());

 						Assert(k >= 0 && k < targrows);
 						heap_freetuple(rows[k]);
@ -1312,116 +1209,6 @@ acquire_sample_rows(Relation onerel, int elevel,
 	return numrows;
 }

-/* Select a random value R uniformly distributed in (0 - 1) */
-double
-anl_random_fract(void)
-{
-	return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
-}
-
-/*
- * These two routines embody Algorithm Z from "Random sampling with a
- * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
- * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
- * of the count S of records to skip before processing another record.
- * It is computed primarily based on t, the number of records already read.
- * The only extra state needed between calls is W, a random state variable.
- *
- * anl_init_selection_state computes the initial W value.
- *
- * Given that we've already read t records (t >= n), anl_get_next_S
- * determines the number of records to skip before the next record is
- * processed.
- */
-double
-anl_init_selection_state(int n)
-{
-	/* Initial value of W (for use when Algorithm Z is first applied) */
-	return exp(-log(anl_random_fract()) / n);
-}
-
-double
-anl_get_next_S(double t, int n, double *stateptr)
-{
-	double		S;
-
-	/* The magic constant here is T from Vitter's paper */
-	if (t <= (22.0 * n))
-	{
-		/* Process records using Algorithm X until t is large enough */
-		double		V,
-					quot;
-
-		V = anl_random_fract(); /* Generate V */
-		S = 0;
-		t += 1;
-		/* Note: "num" in Vitter's code is always equal to t - n */
-		quot = (t - (double) n) / t;
-		/* Find min S satisfying (4.1) */
-		while (quot > V)
-		{
-			S += 1;
-			t += 1;
-			quot *= (t - (double) n) / t;
-		}
-	}
-	else
-	{
-		/* Now apply Algorithm Z */
-		double		W = *stateptr;
-		double		term = t - (double) n + 1;
-
-		for (;;)
-		{
-			double		numer,
-						numer_lim,
-						denom;
-			double		U,
-						X,
-						lhs,
-						rhs,
-						y,
-						tmp;
-
-			/* Generate U and X */
-			U = anl_random_fract();
-			X = t * (W - 1.0);
-			S = floor(X);		/* S is tentatively set to floor(X) */
-			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
-			tmp = (t + 1) / term;
-			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
-			rhs = (((t + X) / (term + S)) * term) / t;
-			if (lhs <= rhs)
-			{
-				W = rhs / lhs;
-				break;
-			}
-			/* Test if U <= f(S)/cg(X) */
-			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
-			if ((double) n < S)
-			{
-				denom = t;
-				numer_lim = term + S;
-			}
-			else
-			{
-				denom = t - (double) n + S;
-				numer_lim = t + 1;
-			}
-			for (numer = t + S; numer >= numer_lim; numer -= 1)
-			{
-				y *= numer / denom;
-				denom -= 1;
-			}
-			W = exp(-log(anl_random_fract()) / n);		/* Generate W in advance */
-			if (exp(log(y) / n) <= (t + X) / t)
-				break;
-		}
-		*stateptr = W;
-	}
-	return S;
-}
-
 /*
 * qsort comparator for sorting rows[] array
 */