mirror of
https://github.com/postgres/postgres.git
synced 2025-06-22 02:52:08 +03:00
Minor improvements for the multivariate MCV lists
The MCV build should always call get_mincount_for_mcv_list(), as
there is no other logic to decide whether the MCV list represents all
the data. So just remove the (ngroups > nitems) condition.
Also, when building MCV lists, the number of items was limited by the
statistics target (i.e. up to 10000). But when deserializing the MCV
list, a different value (8192) was used to check the input, causing
an error. Simply ensure that the same value is used in both places.
This should have been included in 7300a69950, but I forgot to include it
in that commit.
This commit is contained in:
@ -155,15 +155,17 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
|
||||
numattrs,
|
||||
ngroups,
|
||||
nitems;
|
||||
|
||||
AttrNumber *attnums = build_attnums_array(attrs, &numattrs);
|
||||
|
||||
AttrNumber *attnums;
|
||||
double mincount;
|
||||
SortItem *items;
|
||||
SortItem *groups;
|
||||
MCVList *mcvlist = NULL;
|
||||
MultiSortSupport mss;
|
||||
|
||||
attnums = build_attnums_array(attrs, &numattrs);
|
||||
|
||||
/* comparator for all the columns */
|
||||
MultiSortSupport mss = build_mss(stats, numattrs);
|
||||
mss = build_mss(stats, numattrs);
|
||||
|
||||
/* sort the rows */
|
||||
items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
|
||||
@ -196,19 +198,15 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
|
||||
* per-column frequencies, as if the columns were independent).
|
||||
*
|
||||
* Using the same algorithm might exclude items that are close to the
|
||||
* "average" frequency. But it does not say whether the frequency is
|
||||
* close to base frequency or not. We also need to consider unexpectedly
|
||||
* uncommon items (compared to base frequency), and the single-column
|
||||
* algorithm ignores that entirely.
|
||||
* "average" frequency of the sample. But that does not say whether the
|
||||
* observed frequency is close to the base frequency or not. We also
|
||||
* need to consider unexpectedly uncommon items (again, compared to the
|
||||
* base frequency), and the single-column algorithm does not have to.
|
||||
*
|
||||
* If we can fit all the items onto the MCV list, do that. Otherwise
|
||||
* use get_mincount_for_mcv_list to decide which items to keep in the
|
||||
* MCV list, based on the number of occurrences in the sample.
|
||||
* We simply decide how many items to keep by computing minimum count
|
||||
* using get_mincount_for_mcv_list() and then keep all items that seem
|
||||
* to be more common than that.
|
||||
*/
|
||||
if (ngroups > nitems)
|
||||
{
|
||||
double mincount;
|
||||
|
||||
mincount = get_mincount_for_mcv_list(numrows, totalrows);
|
||||
|
||||
/*
|
||||
@ -224,7 +222,6 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* At this point we know the number of items for the MCV list. There might
|
||||
@ -469,11 +466,12 @@ statext_mcv_load(Oid mvoid)
|
||||
* Each attribute has to be processed separately, as we may be mixing different
|
||||
* datatypes, with different sort operators, etc.
|
||||
*
|
||||
* We use uint16 values for the indexes in step (3), as we currently don't allow
|
||||
* more than 8k MCV items anyway, although that's mostly arbitrary limit. We might
|
||||
* increase this to 65k and still fit into uint16. Furthermore, this limit is on
|
||||
* the number of distinct values per column, and we usually have few of those
|
||||
* (and various combinations of them for the those MCV list). So uint16 seems fine.
|
||||
* We use uint16 values for the indexes in step (3), as the number of MCV items
|
||||
* is limited by the statistics target (which is capped to 10k at the moment).
|
||||
* We might increase this to 65k and still fit into uint16, so there's a bit of
|
||||
* slack. Furthermore, this limit is on the number of distinct values per column,
|
||||
* and we usually have few of those (and various combinations of them for the
|
||||
* MCV list). So uint16 seems fine for now.
|
||||
*
|
||||
* We don't really expect the serialization to save as much space as for
|
||||
* histograms, as we are not doing any bucket splits (which is the source
|
||||
@ -1322,7 +1320,7 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
|
||||
* somewhat wasteful as we could do with just a single bit, thus reducing
|
||||
* the size to ~1/8. It would also allow us to combine bitmaps simply using
|
||||
* & and |, which should be faster than min/max. The bitmaps are fairly
|
||||
* small, though (as we cap the MCV list size to 8k items).
|
||||
* small, though (thanks to the cap on the MCV list size).
|
||||
*/
|
||||
static bool *
|
||||
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
|
||||
|
@ -82,8 +82,8 @@ typedef struct MVDependencies
|
||||
#define STATS_MCV_MAGIC 0xE1A651C2 /* marks serialized bytea */
|
||||
#define STATS_MCV_TYPE_BASIC 1 /* basic MCV list type */
|
||||
|
||||
/* max items in MCV list (mostly arbitrary number) */
|
||||
#define STATS_MCVLIST_MAX_ITEMS 8192
|
||||
/* max items in MCV list (should be equal to max default_statistics_target) */
|
||||
#define STATS_MCVLIST_MAX_ITEMS 10000
|
||||
|
||||
/*
|
||||
* Multivariate MCV (most-common value) lists
|
||||
|
Reference in New Issue
Block a user