mirror of
https://github.com/postgres/postgres.git
synced 2025-06-22 02:52:08 +03:00
Minor improvements for the multivariate MCV lists
The MCV build should always call get_mincount_for_mcv_list(), as the
there is no other logic to decide whether the MCV list represents all
the data. So just remove the (ngroups > nitems) condition.
Also, when building MCV lists, the number of items was limited by the
statistics target (i.e. up to 10000). But when deserializing the MCV
list, a different value (8192) was used to check the input, causing
an error. Simply ensure that the same value is used in both places.
This should have been included in 7300a69950
, but I forgot to include it
in that commit.
This commit is contained in:
@ -155,15 +155,17 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
|
|||||||
numattrs,
|
numattrs,
|
||||||
ngroups,
|
ngroups,
|
||||||
nitems;
|
nitems;
|
||||||
|
AttrNumber *attnums;
|
||||||
AttrNumber *attnums = build_attnums_array(attrs, &numattrs);
|
double mincount;
|
||||||
|
|
||||||
SortItem *items;
|
SortItem *items;
|
||||||
SortItem *groups;
|
SortItem *groups;
|
||||||
MCVList *mcvlist = NULL;
|
MCVList *mcvlist = NULL;
|
||||||
|
MultiSortSupport mss;
|
||||||
|
|
||||||
|
attnums = build_attnums_array(attrs, &numattrs);
|
||||||
|
|
||||||
/* comparator for all the columns */
|
/* comparator for all the columns */
|
||||||
MultiSortSupport mss = build_mss(stats, numattrs);
|
mss = build_mss(stats, numattrs);
|
||||||
|
|
||||||
/* sort the rows */
|
/* sort the rows */
|
||||||
items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
|
items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
|
||||||
@ -196,33 +198,28 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
|
|||||||
* per-column frequencies, as if the columns were independent).
|
* per-column frequencies, as if the columns were independent).
|
||||||
*
|
*
|
||||||
* Using the same algorithm might exclude items that are close to the
|
* Using the same algorithm might exclude items that are close to the
|
||||||
* "average" frequency. But it does not say whether the frequency is
|
* "average" frequency of the sample. But that does not say whether the
|
||||||
* close to base frequency or not. We also need to consider unexpectedly
|
* observed frequency is close to the base frequency or not. We also
|
||||||
* uncommon items (compared to base frequency), and the single-column
|
* need to consider unexpectedly uncommon items (again, compared to the
|
||||||
* algorithm ignores that entirely.
|
* base frequency), and the single-column algorithm does not have to.
|
||||||
*
|
*
|
||||||
* If we can fit all the items onto the MCV list, do that. Otherwise
|
* We simply decide how many items to keep by computing minimum count
|
||||||
* use get_mincount_for_mcv_list to decide which items to keep in the
|
* using get_mincount_for_mcv_list() and then keep all items that seem
|
||||||
* MCV list, based on the number of occurrences in the sample.
|
* to be more common than that.
|
||||||
*/
|
*/
|
||||||
if (ngroups > nitems)
|
mincount = get_mincount_for_mcv_list(numrows, totalrows);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Walk the groups until we find the first group with a count below
|
||||||
|
* the mincount threshold (the index of that group is the number of
|
||||||
|
* groups we want to keep).
|
||||||
|
*/
|
||||||
|
for (i = 0; i < nitems; i++)
|
||||||
{
|
{
|
||||||
double mincount;
|
if (groups[i].count < mincount)
|
||||||
|
|
||||||
mincount = get_mincount_for_mcv_list(numrows, totalrows);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Walk the groups until we find the first group with a count below
|
|
||||||
* the mincount threshold (the index of that group is the number of
|
|
||||||
* groups we want to keep).
|
|
||||||
*/
|
|
||||||
for (i = 0; i < nitems; i++)
|
|
||||||
{
|
{
|
||||||
if (groups[i].count < mincount)
|
nitems = i;
|
||||||
{
|
break;
|
||||||
nitems = i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -469,11 +466,12 @@ statext_mcv_load(Oid mvoid)
|
|||||||
* Each attribute has to be processed separately, as we may be mixing different
|
* Each attribute has to be processed separately, as we may be mixing different
|
||||||
* datatypes, with different sort operators, etc.
|
* datatypes, with different sort operators, etc.
|
||||||
*
|
*
|
||||||
* We use uint16 values for the indexes in step (3), as we currently don't allow
|
* We use uint16 values for the indexes in step (3), as the number of MCV items
|
||||||
* more than 8k MCV items anyway, although that's mostly arbitrary limit. We might
|
* is limited by the statistics target (which is capped to 10k at the moment).
|
||||||
* increase this to 65k and still fit into uint16. Furthermore, this limit is on
|
* We might increase this to 65k and still fit into uint16, so there's a bit of
|
||||||
* the number of distinct values per column, and we usually have few of those
|
* slack. Furthermore, this limit is on the number of distinct values per column,
|
||||||
* (and various combinations of them for the those MCV list). So uint16 seems fine.
|
* and we usually have few of those (and various combinations of them for the
|
||||||
|
* those MCV list). So uint16 seems fine for now.
|
||||||
*
|
*
|
||||||
* We don't really expect the serialization to save as much space as for
|
* We don't really expect the serialization to save as much space as for
|
||||||
* histograms, as we are not doing any bucket splits (which is the source
|
* histograms, as we are not doing any bucket splits (which is the source
|
||||||
@ -1322,7 +1320,7 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
|
|||||||
* somewhat wasteful as we could do with just a single bit, thus reducing
|
* somewhat wasteful as we could do with just a single bit, thus reducing
|
||||||
* the size to ~1/8. It would also allow us to combine bitmaps simply using
|
* the size to ~1/8. It would also allow us to combine bitmaps simply using
|
||||||
* & and |, which should be faster than min/max. The bitmaps are fairly
|
* & and |, which should be faster than min/max. The bitmaps are fairly
|
||||||
* small, though (as we cap the MCV list size to 8k items).
|
* small, though (thanks to the cap on the MCV list size).
|
||||||
*/
|
*/
|
||||||
static bool *
|
static bool *
|
||||||
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
|
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
|
||||||
|
@ -82,8 +82,8 @@ typedef struct MVDependencies
|
|||||||
#define STATS_MCV_MAGIC 0xE1A651C2 /* marks serialized bytea */
|
#define STATS_MCV_MAGIC 0xE1A651C2 /* marks serialized bytea */
|
||||||
#define STATS_MCV_TYPE_BASIC 1 /* basic MCV list type */
|
#define STATS_MCV_TYPE_BASIC 1 /* basic MCV list type */
|
||||||
|
|
||||||
/* max items in MCV list (mostly arbitrary number) */
|
/* max items in MCV list (should be equal to max default_statistics_target) */
|
||||||
#define STATS_MCVLIST_MAX_ITEMS 8192
|
#define STATS_MCVLIST_MAX_ITEMS 10000
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Multivariate MCV (most-common value) lists
|
* Multivariate MCV (most-common value) lists
|
||||||
|
Reference in New Issue
Block a user