diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 6612948769b..9013f41727b 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -411,6 +411,82 @@ _bt_gettrueroot(Relation rel)
 	return rootbuf;
 }
 
+/*
+ *	_bt_getrootheight() -- Get the height of the btree search tree.
+ *
+ *		We return the level (counting from zero) of the current fast root.
+ *		This represents the number of tree levels we'd have to descend
+ *		through to start any btree index search.
+ *
+ *		This is used by the planner for cost-estimation purposes.  Since it's
+ *		only an estimate, slightly-stale data is fine, hence we don't worry
+ *		about updating previously cached data.
+ */
+int
+_bt_getrootheight(Relation rel)
+{
+	BTMetaPageData *metad;
+
+	/*
+	 * We can get what we need from the cached metapage data.  If it's not
+	 * cached yet, load it.  Sanity checks here must match _bt_getroot().
+	 */
+	if (rel->rd_amcache == NULL)
+	{
+		Buffer		metabuf;
+		Page		metapg;
+		BTPageOpaque metaopaque;
+
+		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+		metapg = BufferGetPage(metabuf);
+		metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+		metad = BTPageGetMeta(metapg);
+
+		/* sanity-check the metapage */
+		if (!(metaopaque->btpo_flags & BTP_META) ||
+			metad->btm_magic != BTREE_MAGIC)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index \"%s\" is not a btree",
+							RelationGetRelationName(rel))));
+
+		if (metad->btm_version != BTREE_VERSION)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+							RelationGetRelationName(rel),
+							metad->btm_version, BTREE_VERSION)));
+
+		/*
+		 * If there's no root page yet, _bt_getroot() doesn't expect a cache
+		 * to be made, so just stop here and report the index height is zero.
+		 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
+		 */
+		if (metad->btm_root == P_NONE)
+		{
+			_bt_relbuf(rel, metabuf);
+			return 0;
+		}
+
+		/*
+		 * Cache the metapage data for next time
+		 */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(BTMetaPageData));
+		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+
+		_bt_relbuf(rel, metabuf);
+	}
+
+	metad = (BTMetaPageData *) rel->rd_amcache;
+	/* We shouldn't have cached it if any of these fail */
+	Assert(metad->btm_magic == BTREE_MAGIC);
+	Assert(metad->btm_version == BTREE_VERSION);
+	Assert(metad->btm_fastroot != P_NONE);
+
+	return metad->btm_fastlevel;
+}
+
 /*
  *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
  */
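
Note: `btm_fastlevel` counts levels from zero at the leaves, so a fast root that is itself a leaf page yields height 0. For intuition about the magnitudes this returns, here is a standalone sketch (not part of the patch; the function name and fanout constant are illustrative assumptions) of the height one would expect for a given leaf-page count:

```c
#include <math.h>

/* Assumed average number of downlinks per internal page; illustrative only. */
#define ASSUMED_FANOUT	256.0

/*
 * Expected root level (leaf level = 0) of a btree with the given number of
 * leaf pages: each internal level shrinks the page count by roughly the
 * fanout.
 */
static int
expected_root_level(double leaf_pages)
{
	if (leaf_pages <= 1.0)
		return 0;				/* a single leaf page is also the root */
	return (int) ceil(log(leaf_pages) / log(ASSUMED_FANOUT));
}
```

With a million leaf pages this gives ceil(log256(1e6)) = 3, which is why the cost model below can afford to treat upper-level I/O as nearly free: even very large btrees are only a few levels deep.
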
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 33740ad6be0..3cce5f517e6 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -1772,7 +1772,9 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
 	/* Do NOT print rel field, else infinite recursion */
 	WRITE_UINT_FIELD(pages);
 	WRITE_FLOAT_FIELD(tuples, "%.0f");
+	WRITE_INT_FIELD(tree_height);
 	WRITE_INT_FIELD(ncolumns);
+	/* array fields aren't really worth the trouble to print */
 	WRITE_OID_FIELD(relam);
 	/* indexprs is redundant since we print indextlist */
 	WRITE_NODE_FIELD(indpred);
@@ -1781,6 +1783,7 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
 	WRITE_BOOL_FIELD(unique);
 	WRITE_BOOL_FIELD(immediate);
 	WRITE_BOOL_FIELD(hypothetical);
+	/* we don't bother with fields copied from the pg_am entry */
 }
 
 static void
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 4dd24642346..01f988f7c39 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -20,6 +20,7 @@
 #include "access/genam.h"
 #include "access/heapam.h"
 #include "access/htup_details.h"
+#include "access/nbtree.h"
 #include "access/sysattr.h"
 #include "access/transam.h"
 #include "access/xlog.h"
@@ -352,6 +353,17 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->tuples = rel->tuples;
 			}
 
+			if (info->relam == BTREE_AM_OID)
+			{
+				/* For btrees, get tree height while we have the index open */
+				info->tree_height = _bt_getrootheight(indexRelation);
+			}
+			else
+			{
+				/* For other index types, just set it to "unknown" for now */
+				info->tree_height = -1;
+			}
+
 			index_close(indexRelation, NoLock);
 
 			indexinfos = lcons(info, indexinfos);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index bd63377ada7..72c2c30ad40 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -201,6 +201,7 @@ static Selectivity regex_selectivity(const char *patt, int pattlen,
 static Datum string_to_datum(const char *str, Oid datatype);
 static Const *string_to_const(const char *str, Oid datatype);
 static Const *string_to_bytea_const(const char *str, size_t str_len);
+static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
 
 
 /*
@@ -5916,76 +5917,55 @@ string_to_bytea_const(const char *str, size_t str_len)
  */
 
 /*
- * If the index is partial, add its predicate to the given qual list.
+ * genericcostestimate is a general-purpose estimator that can be used for
+ * most index types.  In some cases we use genericcostestimate as the base
+ * code and then incorporate additional index-type-specific knowledge in
+ * the type-specific calling function.  To avoid code duplication, we make
+ * genericcostestimate return a number of intermediate values as well as
+ * its preliminary estimates of the output cost values.  The GenericCosts
+ * struct includes all these values.
  *
- * ANDing the index predicate with the explicitly given indexquals produces
- * a more accurate idea of the index's selectivity.  However, we need to be
- * careful not to insert redundant clauses, because clauselist_selectivity()
- * is easily fooled into computing a too-low selectivity estimate.  Our
- * approach is to add only the predicate clause(s) that cannot be proven to
- * be implied by the given indexquals.  This successfully handles cases such
- * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
- * There are many other cases where we won't detect redundancy, leading to a
- * too-low selectivity estimate, which will bias the system in favor of using
- * partial indexes where possible.  That is not necessarily bad though.
- *
- * Note that indexQuals contains RestrictInfo nodes while the indpred
- * does not, so the output list will be mixed.  This is OK for both
- * predicate_implied_by() and clauselist_selectivity(), but might be
- * problematic if the result were passed to other things.
+ * Callers should initialize all fields of GenericCosts to zero.  In addition,
+ * they can set numIndexTuples to some positive value if they have a better
+ * than default way of estimating the number of leaf index tuples visited.
  */
-static List *
-add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
+typedef struct
 {
-	List	   *predExtraQuals = NIL;
-	ListCell   *lc;
+	/* These are the values the cost estimator must return to the planner */
+	Cost		indexStartupCost;	/* index-related startup cost */
+	Cost		indexTotalCost;		/* total index-related scan cost */
+	Selectivity indexSelectivity;	/* selectivity of index */
+	double		indexCorrelation;	/* order correlation of index */
 
-	if (index->indpred == NIL)
-		return indexQuals;
+	/* Intermediate values we obtain along the way */
+	double		numIndexPages;	/* number of leaf pages visited */
+	double		numIndexTuples; /* number of leaf tuples visited */
+	double		spc_random_page_cost;	/* relevant random_page_cost value */
+	double		num_sa_scans;	/* # indexscans from ScalarArrayOps */
+} GenericCosts;
 
-	foreach(lc, index->indpred)
-	{
-		Node	   *predQual = (Node *) lfirst(lc);
-		List	   *oneQual = list_make1(predQual);
-
-		if (!predicate_implied_by(oneQual, indexQuals))
-			predExtraQuals = list_concat(predExtraQuals, oneQual);
-	}
-	/* list_concat avoids modifying the passed-in indexQuals list */
-	return list_concat(predExtraQuals, indexQuals);
-}
-
-/*
- * genericcostestimate is a general-purpose estimator for use when we
- * don't have any better idea about how to estimate.  Index-type-specific
- * knowledge can be incorporated in the type-specific routines.
- *
- * One bit of index-type-specific knowledge we can relatively easily use
- * in genericcostestimate is the estimate of the number of index tuples
- * visited.  If numIndexTuples is not 0 then it is used as the estimate,
- * otherwise we compute a generic estimate.
- */
 static void
 genericcostestimate(PlannerInfo *root,
 					IndexPath *path,
 					double loop_count,
-					double numIndexTuples,
-					Cost *indexStartupCost,
-					Cost *indexTotalCost,
-					Selectivity *indexSelectivity,
-					double *indexCorrelation)
+					GenericCosts *costs)
 {
 	IndexOptInfo *index = path->indexinfo;
 	List	   *indexQuals = path->indexquals;
 	List	   *indexOrderBys = path->indexorderbys;
+	Cost		indexStartupCost;
+	Cost		indexTotalCost;
+	Selectivity indexSelectivity;
+	double		indexCorrelation;
 	double		numIndexPages;
+	double		numIndexTuples;
+	double		spc_random_page_cost;
 	double		num_sa_scans;
 	double		num_outer_scans;
 	double		num_scans;
 	QualCost	index_qual_cost;
 	double		qual_op_cost;
 	double		qual_arg_cost;
-	double		spc_random_page_cost;
 	List	   *selectivityQuals;
 	ListCell   *l;
@@ -6016,19 +5996,20 @@ genericcostestimate(PlannerInfo *root,
 	}
 
 	/* Estimate the fraction of main-table tuples that will be visited */
-	*indexSelectivity = clauselist_selectivity(root, selectivityQuals,
-											   index->rel->relid,
-											   JOIN_INNER,
-											   NULL);
+	indexSelectivity = clauselist_selectivity(root, selectivityQuals,
+											  index->rel->relid,
+											  JOIN_INNER,
+											  NULL);
 
 	/*
 	 * If caller didn't give us an estimate, estimate the number of index
 	 * tuples that will be visited.  We do it in this rather peculiar-looking
 	 * way in order to get the right answer for partial indexes.
 	 */
+	numIndexTuples = costs->numIndexTuples;
 	if (numIndexTuples <= 0.0)
 	{
-		numIndexTuples = *indexSelectivity * index->rel->tuples;
+		numIndexTuples = indexSelectivity * index->rel->tuples;
 
 		/*
 		 * The above calculation counts all the tuples visited across all
@@ -6055,9 +6036,12 @@ genericcostestimate(PlannerInfo *root,
 	 *
 	 * We use the simplistic method of taking a pro-rata fraction of the total
	 * number of index pages.  In effect, this counts only leaf pages and not
-	 * any overhead such as index metapage or upper tree levels.  In practice
-	 * this seems a better approximation than charging for access to the upper
-	 * levels, perhaps because those tend to stay in cache under load.
+	 * any overhead such as index metapage or upper tree levels.
+	 *
+	 * In practice access to upper index levels is often nearly free because
+	 * those tend to stay in cache under load; moreover, the cost involved is
+	 * highly dependent on index type.  We therefore ignore such costs here
+	 * and leave it to the caller to add a suitable charge if needed.
 	 */
 	if (index->pages > 1 && index->tuples > 1)
 		numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
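
Note: to make the pro-rata arithmetic in the hunk above concrete, a small standalone sketch (the function name and all numbers are invented for illustration). An index of 10,000 pages and 1,000,000 tuples, with a scan visiting roughly 1000 tuples, is charged ceil(1000 * 10000 / 1000000) = 10 leaf pages:

```c
#include <math.h>

/*
 * Pro-rata leaf-page estimate, mirroring the logic in genericcostestimate():
 * charge the same fraction of the index's pages as of its tuples.
 */
static double
prorata_leaf_pages(double numIndexTuples, double pages, double tuples)
{
	if (pages > 1 && tuples > 1)
		return ceil(numIndexTuples * pages / tuples);
	return 1.0;					/* degenerate index: charge one page */
}

/* prorata_leaf_pages(1000.0, 10000.0, 1000000.0) == 10.0 */
```
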
@@ -6107,7 +6091,7 @@ genericcostestimate(PlannerInfo *root,
 		 * share for each outer scan.  (Don't pro-rate for ScalarArrayOpExpr,
 		 * since that's internal to the indexscan.)
 		 */
-		*indexTotalCost = (pages_fetched * spc_random_page_cost)
+		indexTotalCost = (pages_fetched * spc_random_page_cost)
 			/ num_outer_scans;
 	}
 	else
@@ -6116,29 +6100,9 @@ genericcostestimate(PlannerInfo *root,
 		 * For a single index scan, we just charge spc_random_page_cost per
 		 * page touched.
 		 */
-		*indexTotalCost = numIndexPages * spc_random_page_cost;
+		indexTotalCost = numIndexPages * spc_random_page_cost;
 	}
 
-	/*
-	 * A difficulty with the leaf-pages-only cost approach is that for small
-	 * selectivities (eg, single index tuple fetched) all indexes will look
-	 * equally attractive because we will estimate exactly 1 leaf page to be
-	 * fetched.  All else being equal, we should prefer physically smaller
-	 * indexes over larger ones.  (An index might be smaller because it is
-	 * partial or because it contains fewer columns; presumably the other
-	 * columns in the larger index aren't useful to the query, or the larger
-	 * index would have better selectivity.)
-	 *
-	 * We can deal with this by adding a very small "fudge factor" that
-	 * depends on the index size, so that indexes of different sizes won't
-	 * look exactly equally attractive.  To ensure the fudge factor stays
-	 * small even for very large indexes, use a log function.  (We previously
-	 * used a factor of one spc_random_page_cost per 10000 index pages, which
-	 * grew too large for large indexes.  This expression has about the same
-	 * growth rate for small indexes, but tails off quickly.)
-	 */
-	*indexTotalCost += log(1.0 + index->pages / 10000.0) * spc_random_page_cost;
-
 	/*
 	 * CPU cost: any complex expressions in the indexquals will need to be
 	 * evaluated once at the start of the scan to reduce them to runtime keys
@@ -6149,10 +6113,9 @@ genericcostestimate(PlannerInfo *root,
 	 * for ScalarArrayOpExpr cases.  Similarly add in costs for any index
 	 * ORDER BY expressions.
 	 *
-	 * Note: this neglects the possible costs of rechecking lossy operators
-	 * and OR-clause expressions.  Detecting that that might be needed seems
-	 * more expensive than it's worth, though, considering all the other
-	 * inaccuracies here ...
+	 * Note: this neglects the possible costs of rechecking lossy operators.
+	 * Detecting that that might be needed seems more expensive than it's
+	 * worth, though, considering all the other inaccuracies here ...
 	 */
 	cost_qual_eval(&index_qual_cost, indexQuals, root);
 	qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
@@ -6164,29 +6127,66 @@ genericcostestimate(PlannerInfo *root,
 	if (qual_arg_cost < 0)		/* just in case... */
 		qual_arg_cost = 0;
 
-	*indexStartupCost = qual_arg_cost;
-	*indexTotalCost += qual_arg_cost;
-	*indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
-
-	/*
-	 * We also add a CPU-cost component to represent the general costs of
-	 * starting an indexscan, such as analysis of btree index keys and initial
-	 * tree descent.  This is estimated at 100x cpu_operator_cost, which is a
-	 * bit arbitrary but seems the right order of magnitude.  (As noted above,
-	 * we don't charge any I/O for touching upper tree levels, but charging
-	 * nothing at all has been found too optimistic.)
-	 *
-	 * Although this is startup cost with respect to any one scan, we add it
-	 * to the "total" cost component because it's only very interesting in the
-	 * many-ScalarArrayOpExpr-scan case, and there it will be paid over the
-	 * life of the scan node.
-	 */
-	*indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
+	indexStartupCost = qual_arg_cost;
+	indexTotalCost += qual_arg_cost;
+	indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
 
 	/*
 	 * Generic assumption about index correlation: there isn't any.
 	 */
-	*indexCorrelation = 0.0;
+	indexCorrelation = 0.0;
+
+	/*
+	 * Return everything to caller.
+	 */
+	costs->indexStartupCost = indexStartupCost;
+	costs->indexTotalCost = indexTotalCost;
+	costs->indexSelectivity = indexSelectivity;
+	costs->indexCorrelation = indexCorrelation;
+	costs->numIndexPages = numIndexPages;
+	costs->numIndexTuples = numIndexTuples;
+	costs->spc_random_page_cost = spc_random_page_cost;
+	costs->num_sa_scans = num_sa_scans;
+}
+
+/*
+ * If the index is partial, add its predicate to the given qual list.
+ *
+ * ANDing the index predicate with the explicitly given indexquals produces
+ * a more accurate idea of the index's selectivity.  However, we need to be
+ * careful not to insert redundant clauses, because clauselist_selectivity()
+ * is easily fooled into computing a too-low selectivity estimate.  Our
+ * approach is to add only the predicate clause(s) that cannot be proven to
+ * be implied by the given indexquals.  This successfully handles cases such
+ * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
+ * There are many other cases where we won't detect redundancy, leading to a
+ * too-low selectivity estimate, which will bias the system in favor of using
+ * partial indexes where possible.  That is not necessarily bad though.
+ *
+ * Note that indexQuals contains RestrictInfo nodes while the indpred
+ * does not, so the output list will be mixed.  This is OK for both
+ * predicate_implied_by() and clauselist_selectivity(), but might be
+ * problematic if the result were passed to other things.
+ */
+static List *
+add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
+{
+	List	   *predExtraQuals = NIL;
+	ListCell   *lc;
+
+	if (index->indpred == NIL)
+		return indexQuals;
+
+	foreach(lc, index->indpred)
+	{
+		Node	   *predQual = (Node *) lfirst(lc);
+		List	   *oneQual = list_make1(predQual);
+
+		if (!predicate_implied_by(oneQual, indexQuals))
+			predExtraQuals = list_concat(predExtraQuals, oneQual);
+	}
+	/* list_concat avoids modifying the passed-in indexQuals list */
+	return list_concat(predExtraQuals, indexQuals);
+}
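
Note: the calling convention described in the GenericCosts comment above looks like this in one place. The sketch below is a skeleton for a hypothetical index AM's estimator (the function name is made up; the structure mirrors the real callers that follow): zero the struct, optionally seed numIndexTuples, run the generic estimator, add AM-specific charges, and copy the four output fields back to the planner.

```c
static void
mycostestimate_skeleton(PlannerInfo *root, IndexPath *path, double loop_count,
						Cost *indexStartupCost, Cost *indexTotalCost,
						Selectivity *indexSelectivity,
						double *indexCorrelation)
{
	GenericCosts costs;

	/* All fields must start at zero, per the GenericCosts contract. */
	MemSet(&costs, 0, sizeof(costs));

	/*
	 * If the AM has a better-than-default estimate of the number of leaf
	 * tuples visited, seed it here; otherwise leave it zero and
	 * genericcostestimate derives it from the computed selectivity.
	 */
	/* costs.numIndexTuples = ...; */

	genericcostestimate(root, path, loop_count, &costs);

	/* AM-specific extras (descent charges etc.) would adjust costs here. */

	*indexStartupCost = costs.indexStartupCost;
	*indexTotalCost = costs.indexTotalCost;
	*indexSelectivity = costs.indexSelectivity;
	*indexCorrelation = costs.indexCorrelation;
}
```
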
@@ -6201,10 +6201,12 @@ btcostestimate(PG_FUNCTION_ARGS)
 	Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
 	double	   *indexCorrelation = (double *) PG_GETARG_POINTER(6);
 	IndexOptInfo *index = path->indexinfo;
+	GenericCosts costs;
 	Oid			relid;
 	AttrNumber	colnum;
 	VariableStatData vardata;
 	double		numIndexTuples;
+	Cost		descentCost;
 	List	   *indexBoundQuals;
 	int			indexcol;
 	bool		eqQualHere;
@@ -6379,10 +6381,45 @@ btcostestimate(PG_FUNCTION_ARGS)
 		numIndexTuples = rint(numIndexTuples / num_sa_scans);
 	}
 
-	genericcostestimate(root, path, loop_count,
-						numIndexTuples,
-						indexStartupCost, indexTotalCost,
-						indexSelectivity, indexCorrelation);
+	/*
+	 * Now do generic index cost estimation.
+	 */
+	MemSet(&costs, 0, sizeof(costs));
+	costs.numIndexTuples = numIndexTuples;
+
+	genericcostestimate(root, path, loop_count, &costs);
+
+	/*
+	 * Add a CPU-cost component to represent the costs of initial btree
+	 * descent.  We don't charge any I/O cost for touching upper btree levels,
+	 * since they tend to stay in cache, but we still have to do about log2(N)
+	 * comparisons to descend a btree of N leaf tuples.  We charge one
+	 * cpu_operator_cost per comparison.
+	 *
+	 * If there are ScalarArrayOpExprs, charge this once per SA scan.  The
+	 * ones after the first one are not startup cost so far as the overall
+	 * plan is concerned, so add them only to "total" cost.
+	 */
+	if (index->tuples > 1)		/* avoid computing log(0) */
+	{
+		descentCost = ceil(log(index->tuples) / log(2.0)) * cpu_operator_cost;
+		costs.indexStartupCost += descentCost;
+		costs.indexTotalCost += costs.num_sa_scans * descentCost;
+	}
+
+	/*
+	 * Even though we're not charging I/O cost for touching upper btree pages,
+	 * it's still reasonable to charge some CPU cost per page descended
+	 * through.  Moreover, if we had no such charge at all, bloated indexes
+	 * would appear to have the same search cost as unbloated ones, at least
+	 * in cases where only a single leaf page is expected to be visited.
+	 * This cost is somewhat arbitrarily set at 50x cpu_operator_cost per page
+	 * touched.  The number of such pages is btree tree height plus one (ie,
+	 * we charge for the leaf page too).  As above, charge once per SA scan.
+	 */
+	descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+	costs.indexStartupCost += descentCost;
+	costs.indexTotalCost += costs.num_sa_scans * descentCost;
 
 	/*
 	 * If we can get an estimate of the first column's ordering correlation C
@@ -6478,9 +6515,9 @@ btcostestimate(PG_FUNCTION_ARGS)
 			varCorrelation = -varCorrelation;
 
 		if (index->ncolumns > 1)
-			*indexCorrelation = varCorrelation * 0.75;
+			costs.indexCorrelation = varCorrelation * 0.75;
 		else
-			*indexCorrelation = varCorrelation;
+			costs.indexCorrelation = varCorrelation;
 
 		free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
 	}
@@ -6488,6 +6525,11 @@ btcostestimate(PG_FUNCTION_ARGS)
 
 	ReleaseVariableStats(vardata);
 
+	*indexStartupCost = costs.indexStartupCost;
+	*indexTotalCost = costs.indexTotalCost;
+	*indexSelectivity = costs.indexSelectivity;
+	*indexCorrelation = costs.indexCorrelation;
+
 	PG_RETURN_VOID();
 }
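
Note: as a concrete check on the two descent charges just added (standalone sketch; the function name and numbers are invented for illustration): for a btree with 1,000,000 tuples and tree_height 2, the comparison charge is ceil(log2(1e6)) = 20 operator costs and the page charge is (2 + 1) * 50 = 150 operator costs, so one descent costs 170 * cpu_operator_cost, or 0.425 at the default cpu_operator_cost of 0.0025.

```c
#include <math.h>

/*
 * Startup charge for one btree descent, as computed in btcostestimate()
 * above.  cpu_op_cost stands in for the cpu_operator_cost GUC.
 */
static double
btree_descent_cost(double ntuples, int tree_height, double cpu_op_cost)
{
	double		cost = 0.0;

	if (ntuples > 1)			/* avoid computing log(0) */
		cost += ceil(log(ntuples) / log(2.0)) * cpu_op_cost;	/* comparisons */
	cost += (tree_height + 1) * 50.0 * cpu_op_cost;		/* pages, incl. leaf */
	return cost;
}

/* btree_descent_cost(1000000.0, 2, 0.0025) == (20 + 150) * 0.0025 = 0.425 */
```
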
@@ -6501,10 +6543,41 @@ hashcostestimate(PG_FUNCTION_ARGS)
 	Cost	   *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
 	Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
 	double	   *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+	GenericCosts costs;
 
-	genericcostestimate(root, path, loop_count, 0.0,
-						indexStartupCost, indexTotalCost,
-						indexSelectivity, indexCorrelation);
+	MemSet(&costs, 0, sizeof(costs));
+
+	genericcostestimate(root, path, loop_count, &costs);
+
+	/*
+	 * A hash index has no descent costs as such, since the index AM can go
+	 * directly to the target bucket after computing the hash value.  There
+	 * are a couple of other hash-specific costs that we could conceivably add
+	 * here, though:
+	 *
+	 * Ideally we'd charge spc_random_page_cost for each page in the target
+	 * bucket, not just the numIndexPages pages that genericcostestimate
+	 * thought we'd visit.  However in most cases we don't know which bucket
+	 * that will be.  There's no point in considering the average bucket size
+	 * because the hash AM makes sure that's always one page.
+	 *
+	 * Likewise, we could consider charging some CPU for each index tuple in
+	 * the bucket, if we knew how many there were.  But the per-tuple cost is
+	 * just a hash value comparison, not a general datatype-dependent
+	 * comparison, so any such charge ought to be quite a bit less than
+	 * cpu_operator_cost; which makes it probably not worth worrying about.
+	 *
+	 * A bigger issue is that chance hash-value collisions will result in
+	 * wasted probes into the heap.  We don't currently attempt to model this
+	 * cost on the grounds that it's rare, but maybe it's not rare enough.
+	 * (Any fix for this ought to consider the generic lossy-operator problem,
+	 * though; it's not entirely hash-specific.)
+	 */
+
+	*indexStartupCost = costs.indexStartupCost;
+	*indexTotalCost = costs.indexTotalCost;
+	*indexSelectivity = costs.indexSelectivity;
+	*indexCorrelation = costs.indexCorrelation;
 
 	PG_RETURN_VOID();
 }
@@ -6519,10 +6592,54 @@ gistcostestimate(PG_FUNCTION_ARGS)
 	Cost	   *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
 	Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
 	double	   *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+	IndexOptInfo *index = path->indexinfo;
+	GenericCosts costs;
+	Cost		descentCost;
 
-	genericcostestimate(root, path, loop_count, 0.0,
-						indexStartupCost, indexTotalCost,
-						indexSelectivity, indexCorrelation);
+	MemSet(&costs, 0, sizeof(costs));
+
+	genericcostestimate(root, path, loop_count, &costs);
+
+	/*
+	 * We model index descent costs similarly to those for btree, but to do
+	 * that we first need an idea of the tree height.  We somewhat arbitrarily
+	 * assume that the fanout is 100, meaning the tree height is at most
+	 * log100(index->pages).
+	 *
+	 * Although this computation isn't really expensive enough to require
+	 * caching, we might as well use index->tree_height to cache it.
+	 */
+	if (index->tree_height < 0) /* unknown? */
+	{
+		if (index->pages > 1)	/* avoid computing log(0) */
+			index->tree_height = (int) (log(index->pages) / log(100.0));
+		else
+			index->tree_height = 0;
+	}
+
+	/*
+	 * Add a CPU-cost component to represent the costs of initial descent.
+	 * We just use log(N) here not log2(N) since the branching factor isn't
+	 * necessarily two anyway.  As for btree, charge once per SA scan.
+	 */
+	if (index->tuples > 1)		/* avoid computing log(0) */
+	{
+		descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
+		costs.indexStartupCost += descentCost;
+		costs.indexTotalCost += costs.num_sa_scans * descentCost;
+	}
+
+	/*
+	 * Likewise add a per-page charge, calculated the same as for btrees.
+	 */
+	descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+	costs.indexStartupCost += descentCost;
+	costs.indexTotalCost += costs.num_sa_scans * descentCost;
+
+	*indexStartupCost = costs.indexStartupCost;
+	*indexTotalCost = costs.indexTotalCost;
+	*indexSelectivity = costs.indexSelectivity;
+	*indexCorrelation = costs.indexCorrelation;
 
 	PG_RETURN_VOID();
 }
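
Note: the fanout-100 guess above (reused verbatim for SP-GiST below) implies heights like these. A standalone sketch mirroring the code, with invented numbers: 50,000 pages gives (int)(log(50000)/log(100)) ≈ (int) 2.35 = 2, and the log(N) comparison charge for 1,000,000 tuples is ceil(ln(1e6)) = 14 operator costs.

```c
#include <math.h>

/* Tree-height guess used when index->tree_height is unknown (-1). */
static int
assumed_tree_height(double pages)
{
	if (pages > 1)				/* avoid computing log(0) */
		return (int) (log(pages) / log(100.0));	/* fanout assumed to be 100 */
	return 0;
}

/* assumed_tree_height(50000.0) == 2: a 100-way tree reaches 1e4 pages in 2 levels */
```
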
@@ -6537,10 +6654,54 @@ spgcostestimate(PG_FUNCTION_ARGS)
 	Cost	   *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
 	Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
 	double	   *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+	IndexOptInfo *index = path->indexinfo;
+	GenericCosts costs;
+	Cost		descentCost;
 
-	genericcostestimate(root, path, loop_count, 0.0,
-						indexStartupCost, indexTotalCost,
-						indexSelectivity, indexCorrelation);
+	MemSet(&costs, 0, sizeof(costs));
+
+	genericcostestimate(root, path, loop_count, &costs);
+
+	/*
+	 * We model index descent costs similarly to those for btree, but to do
+	 * that we first need an idea of the tree height.  We somewhat arbitrarily
+	 * assume that the fanout is 100, meaning the tree height is at most
+	 * log100(index->pages).
+	 *
+	 * Although this computation isn't really expensive enough to require
+	 * caching, we might as well use index->tree_height to cache it.
+	 */
+	if (index->tree_height < 0) /* unknown? */
+	{
+		if (index->pages > 1)	/* avoid computing log(0) */
+			index->tree_height = (int) (log(index->pages) / log(100.0));
+		else
+			index->tree_height = 0;
+	}
+
+	/*
+	 * Add a CPU-cost component to represent the costs of initial descent.
+	 * We just use log(N) here not log2(N) since the branching factor isn't
+	 * necessarily two anyway.  As for btree, charge once per SA scan.
+	 */
+	if (index->tuples > 1)		/* avoid computing log(0) */
+	{
+		descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
+		costs.indexStartupCost += descentCost;
+		costs.indexTotalCost += costs.num_sa_scans * descentCost;
+	}
+
+	/*
+	 * Likewise add a per-page charge, calculated the same as for btrees.
+	 */
+	descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+	costs.indexStartupCost += descentCost;
+	costs.indexTotalCost += costs.num_sa_scans * descentCost;
+
+	*indexStartupCost = costs.indexStartupCost;
+	*indexTotalCost = costs.indexTotalCost;
+	*indexSelectivity = costs.indexSelectivity;
+	*indexCorrelation = costs.indexCorrelation;
 
 	PG_RETURN_VOID();
 }
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index eef67f54b50..0e35d7ad25b 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -626,6 +626,7 @@ extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
 extern Buffer _bt_getroot(Relation rel, int access);
 extern Buffer _bt_gettrueroot(Relation rel);
+extern int	_bt_getrootheight(Relation rel);
 extern void _bt_checkpage(Relation rel, Buffer buf);
 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index d680c620999..f34f6d5a991 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -487,9 +487,10 @@ typedef struct IndexOptInfo
 	Oid			reltablespace;	/* tablespace of index (not table) */
 	RelOptInfo *rel;			/* back-link to index's table */
 
-	/* statistics from pg_class */
+	/* index-size statistics (from pg_class and elsewhere) */
 	BlockNumber pages;			/* number of disk pages in index */
 	double		tuples;			/* number of index tuples in index */
+	int			tree_height;	/* index tree height, or -1 if unknown */
 
 	/* index descriptor information */
 	int			ncolumns;		/* number of columns in index */