Fix BRIN cost estimation

The original code was overly optimistic about the cost of scanning a BRIN index, leading to BRIN indexes being selected when they'd be a worse choice than some other index. This complete rewrite should be more accurate. Author: David Rowley, based on an earlier patch by Emre Hasegeli Reviewed-by: Emre Hasegeli Discussion: https://postgr.es/m/CAKJS1f9n-Wapop5Xz1dtGdpdqmzeGqQK4sV2MK-zZugfC14Xtw@mail.gmail.com
2025-08-28 18:48:04 +03:00 · 2017-04-06 17:49:26 -03:00
parent b2ff37d43c
commit 7e534adcdc
5 changed files with 251 additions and 29 deletions
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -1049,6 +1049,27 @@ brin_free_desc(BrinDesc *bdesc)
 	MemoryContextDelete(bdesc->bd_context);
 }
 /*
 * Fetch index's statistical data into *stats
 */
 void
 brinGetStats(Relation index, BrinStatsData *stats)
 {
 	Buffer		metabuffer;
 	Page		metapage;
 	BrinMetaPageData *metadata;
 	metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
 	LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
 	metapage = BufferGetPage(metabuffer);
 	metadata = (BrinMetaPageData *) PageGetContents(metapage);
 	stats->pagesPerRange = metadata->pagesPerRange;
 	stats->revmapNumPages = metadata->lastRevmapPage - 1;
 	UnlockReleaseBuffer(metabuffer);
 }
 /*
 * Initialize a BrinBuildState appropriate to create tuples on the given index.
 */
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -101,6 +101,7 @@
 #include <float.h>
 #include <math.h>
 #include "access/brin.h"
 #include "access/gin.h"
 #include "access/htup_details.h"
 #include "access/sysattr.h"
@@ -7721,56 +7722,200 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 {
 	IndexOptInfo *index = path->indexinfo;
 	List	   *indexQuals = path->indexquals;
 	List	   *indexOrderBys = path->indexorderbys;
 	double		numPages = index->pages;
-	double		numTuples = index->tuples;
+	RelOptInfo *baserel = index->rel;
 	RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root);
 	List	   *qinfos;
 	Cost		spc_seq_page_cost;
 	Cost		spc_random_page_cost;
 	double		qual_op_cost;
 	double		qual_arg_cost;
 	double		qualSelectivity;
 	BrinStatsData statsData;
 	double		indexRanges;
 	double		minimalRanges;
 	double		estimatedRanges;
 	double		selec;
 	Relation	indexRel;
 	ListCell   *l;
 	VariableStatData vardata;
-	/* Do preliminary analysis of indexquals */
+	Assert(rte->rtekind == RTE_RELATION);
 	qinfos = deconstruct_indexquals(path);
-	/* fetch estimated page cost for tablespace containing index */
+	/* fetch estimated page cost for the tablespace containing the index */
 	get_tablespace_page_costs(index->reltablespace,
 							  &spc_random_page_cost,
 							  &spc_seq_page_cost);
 	/*
-	 * BRIN indexes are always read in full; use that as startup cost.
+	 * Obtain some data from the index itself.
 	 *
 	 * XXX maybe only include revmap pages here?
 	 */
-	*indexStartupCost = spc_seq_page_cost * numPages * loop_count;
+	indexRel = index_open(index->indexoid, AccessShareLock);
 	brinGetStats(indexRel, &statsData);
 	index_close(indexRel, AccessShareLock);
 	/*
 	 * Compute index correlation
 	 *
 	 * Because we can use all index quals equally when scanning, we can use
 	 * the largest correlation (in absolute value) among columns used by the
 	 * query.  Start at zero, the worst possible case.  If we cannot find
 	 * any correlation statistics, we will keep it as 0.
 	 */
 	*indexCorrelation = 0;
 	qinfos = deconstruct_indexquals(path);
 	foreach(l, qinfos)
 	{
 		IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(l);
 		AttrNumber	attnum = index->indexkeys[qinfo->indexcol];
 		/* attempt to lookup stats in relation for this index column */
 		if (attnum != 0)
 		{
 			/* Simple variable -- look to stats for the underlying table */
 			if (get_relation_stats_hook &&
 				(*get_relation_stats_hook) (root, rte, attnum, &vardata))
 			{
 				/*
 				 * The hook took control of acquiring a stats tuple.  If it
 				 * did supply a tuple, it'd better have supplied a freefunc.
 				 */
 				if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc)
 					elog(ERROR,
 						 "no function provided to release variable stats with");
 			}
 			else
 			{
 				vardata.statsTuple =
 					SearchSysCache3(STATRELATTINH,
 									ObjectIdGetDatum(rte->relid),
 									Int16GetDatum(attnum),
 									BoolGetDatum(false));
 				vardata.freefunc = ReleaseSysCache;
 			}
 		}
 		else
 		{
 			/*
 			 * Looks like we've found an expression column in the index. Let's
 			 * see if there's any stats for it.
 			 */
 			/* get the attnum from the 0-based index. */
 			attnum = qinfo->indexcol + 1;
 			if (get_index_stats_hook &&
 				(*get_index_stats_hook) (root, index->indexoid, attnum, &vardata))
 			{
 				/*
 				 * The hook took control of acquiring a stats tuple.  If it did
 				 * supply a tuple, it'd better have supplied a freefunc.
 				 */
 				if (HeapTupleIsValid(vardata.statsTuple) &&
 					!vardata.freefunc)
 					elog(ERROR, "no function provided to release variable stats with");
 			}
 			else
 			{
 				vardata.statsTuple = SearchSysCache3(STATRELATTINH,
 													 ObjectIdGetDatum(index->indexoid),
 													 Int16GetDatum(attnum),
 													 BoolGetDatum(false));
 				vardata.freefunc = ReleaseSysCache;
 			}
 		}
 		if (HeapTupleIsValid(vardata.statsTuple))
 		{
 			float4	   *numbers;
 			int			nnumbers;
 			if (get_attstatsslot(vardata.statsTuple, InvalidOid, 0,
 								 STATISTIC_KIND_CORRELATION,
 								 InvalidOid,
 								 NULL,
 								 NULL, NULL,
 								 &numbers, &nnumbers))
 			{
 				double		varCorrelation = 0.0;
 				if (nnumbers > 0)
 					varCorrelation = Abs(numbers[0]);
 				if (varCorrelation > *indexCorrelation)
 					*indexCorrelation = varCorrelation;
 				free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
 			}
 		}
 		ReleaseVariableStats(vardata);
 	}
 	qualSelectivity = clauselist_selectivity(root, indexQuals,
 											 baserel->relid,
 											 JOIN_INNER, NULL,
 											 baserel);
 	/* work out the actual number of ranges in the index */
 	indexRanges = Max(ceil((double) baserel->pages / statsData.pagesPerRange),
 					  1.0);
 	/*
 	 * Now calculate the minimum possible ranges we could match with if all of
 	 * the rows were in the perfect order in the table's heap.
 	 */
 	minimalRanges = ceil(indexRanges * qualSelectivity);
 	/*
 	 * Now estimate the number of ranges that we'll touch by using the
 	 * indexCorrelation from the stats. Careful not to divide by zero
 	 * (note we're using the absolute value of the correlation).
 	 */
 	if (*indexCorrelation < 1.0e-10)
 		estimatedRanges = indexRanges;
 	else
 		estimatedRanges = Min(minimalRanges / *indexCorrelation, indexRanges);
 	/* we expect to visit this portion of the table */
 	selec = estimatedRanges / indexRanges;
 	CLAMP_PROBABILITY(selec);
 	*indexSelectivity = selec;
 	/*
 	 * Compute the index qual costs, much as in genericcostestimate, to add
 	 * to the index costs.
 	 */
 	qual_arg_cost = other_operands_eval_cost(root, qinfos) +
 		orderby_operands_eval_cost(root, path);
 	/*
 	 * Compute the startup cost as the cost to read the whole revmap
 	 * sequentially, including the cost to execute the index quals.
 	 */
 	*indexStartupCost =
 		spc_seq_page_cost * statsData.revmapNumPages * loop_count;
 	*indexStartupCost += qual_arg_cost;
 	/*
 	 * To read a BRIN index there might be a bit of back and forth over
 	 * regular pages, as revmap might point to them out of sequential order;
-	 * calculate this as reading the whole index in random order.
+	 * calculate the total cost as reading the whole index in random order.
 	 */
-	*indexTotalCost = spc_random_page_cost * numPages * loop_count;
+	*indexTotalCost = *indexStartupCost +
-
+		spc_random_page_cost * (numPages - statsData.revmapNumPages) * loop_count;
 	*indexSelectivity =
 		clauselist_selectivity(root, indexQuals,
 							   path->indexinfo->rel->relid,
 							   JOIN_INNER, NULL,
 							   path->indexinfo->rel);
 	*indexCorrelation = 1;
 	/*
-	 * Add on index qual eval costs, much as in genericcostestimate.
+	 * Charge a small amount per range tuple which we expect to match to. This
 	 * is meant to reflect the costs of manipulating the bitmap. The BRIN scan
 	 * will set a bit for each page in the range when we find a matching
 	 * range, so we must multiply the charge by the number of pages in the
 	 * range.
 	 */
-	qual_arg_cost = other_operands_eval_cost(root, qinfos) +
+	*indexTotalCost += 0.1 * cpu_operator_cost * estimatedRanges *
-		orderby_operands_eval_cost(root, path);
+		statsData.pagesPerRange;
 	qual_op_cost = cpu_operator_cost *
 		(list_length(indexQuals) + list_length(indexOrderBys));
 	*indexStartupCost += qual_arg_cost;
 	*indexTotalCost += qual_arg_cost;
 	*indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
 	*indexPages = index->pages;
 	/* XXX what about pages_per_range? */
 }
--- a/src/include/access/brin.h
+++ b/src/include/access/brin.h
@@ -25,6 +25,17 @@ typedef struct BrinOptions
 	bool		autosummarize;
 } BrinOptions;
 /*
 * BrinStatsData represents stats data for planner use
 */
 typedef struct BrinStatsData
 {
 	BlockNumber	pagesPerRange;
 	BlockNumber	revmapNumPages;
 } BrinStatsData;
 #define BRIN_DEFAULT_PAGES_PER_RANGE	128
 #define BrinGetPagesPerRange(relation) \
 	((relation)->rd_options ? \
@@ -35,4 +46,7 @@ typedef struct BrinOptions
 	 ((BrinOptions *) (relation)->rd_options)->autosummarize : \
 	  false)
 extern void brinGetStats(Relation index, BrinStatsData *stats);
 #endif   /* BRIN_H */
--- a/src/test/regress/expected/brin.out
+++ b/src/test/regress/expected/brin.out
@@ -363,6 +363,8 @@ BEGIN
 	END LOOP;
 END;
 $x$;
 RESET enable_seqscan;
 RESET enable_bitmapscan;
 INSERT INTO brintest SELECT
 	repeat(stringu1, 42)::bytea,
 	substr(stringu1, 1, 1)::"char",
@@ -481,3 +483,27 @@ SELECT brin_summarize_range('brin_summarize_idx', -1);
 ERROR:  block number out of range: -1
 SELECT brin_summarize_range('brin_summarize_idx', 4294967296);
 ERROR:  block number out of range: 4294967296
 -- test brin cost estimates behave sanely based on correlation of values
 CREATE TABLE brin_test (a INT, b INT);
 INSERT INTO brin_test SELECT x/100,x%100 FROM generate_series(1,10000) x(x);
 CREATE INDEX brin_test_a_idx ON brin_test USING brin (a) WITH (pages_per_range = 2);
 CREATE INDEX brin_test_b_idx ON brin_test USING brin (b) WITH (pages_per_range = 2);
 VACUUM ANALYZE brin_test;
 -- Ensure brin index is used when columns are perfectly correlated
 EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1;
                 QUERY PLAN                 
 --------------------------------------------
 Bitmap Heap Scan on brin_test
   Recheck Cond: (a = 1)
   ->  Bitmap Index Scan on brin_test_a_idx
         Index Cond: (a = 1)
 (4 rows)
 -- Ensure brin index is not used when values are not correlated
 EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
      QUERY PLAN       
 -----------------------
 Seq Scan on brin_test
   Filter: (b = 1)
 (2 rows)
--- a/src/test/regress/sql/brin.sql
+++ b/src/test/regress/sql/brin.sql
@@ -370,6 +370,9 @@ BEGIN
 END;
 $x$;
 RESET enable_seqscan;
 RESET enable_bitmapscan;
 INSERT INTO brintest SELECT
 	repeat(stringu1, 42)::bytea,
 	substr(stringu1, 1, 1)::"char",
@@ -444,3 +447,16 @@ SELECT brin_summarize_range('brin_summarize_idx', 4294967295);
 -- invalid block number values
 SELECT brin_summarize_range('brin_summarize_idx', -1);
 SELECT brin_summarize_range('brin_summarize_idx', 4294967296);
 -- test brin cost estimates behave sanely based on correlation of values
 CREATE TABLE brin_test (a INT, b INT);
 INSERT INTO brin_test SELECT x/100,x%100 FROM generate_series(1,10000) x(x);
 CREATE INDEX brin_test_a_idx ON brin_test USING brin (a) WITH (pages_per_range = 2);
 CREATE INDEX brin_test_b_idx ON brin_test USING brin (b) WITH (pages_per_range = 2);
 VACUUM ANALYZE brin_test;
 -- Ensure brin index is used when columns are perfectly correlated
 EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1;
 -- Ensure brin index is not used when values are not correlated
 EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;