mirror of
https://github.com/postgres/postgres.git
synced 2025-05-21 15:54:08 +03:00
Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
bright, but it beats assuming that a prefix match behaves identically to an exact match, which is what the code was doing before :-(. Noted while experimenting with Artur Dobrowski's example.
This commit is contained in:
parent
d4fe61b083
commit
97532f7c29
@ -7,7 +7,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
|
||||
*
|
||||
* 1 - select(oper) in NOT nodes
|
||||
*
|
||||
* freq[val] in VAL nodes, if the value is in MCELEM
|
||||
* MCELEM-based estimation in prefix VAL nodes (cf. histogram_selectivity())
|
||||
*
|
||||
* freq[val] in exact VAL nodes, if the value is in MCELEM
|
||||
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
|
||||
*
|
||||
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
|
||||
* binary search for determining freq[MCELEM].
|
||||
*
|
||||
* If we don't have stats for the tsvector, we still use this logic,
|
||||
* except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case
|
||||
* is signaled by lookup == NULL.
|
||||
* except we use default estimates for VAL nodes. This case is signaled
|
||||
* by lookup == NULL.
|
||||
*/
|
||||
static Selectivity
|
||||
tsquery_opr_selec(QueryItem *item, char *operand,
|
||||
TextFreq *lookup, int length, float4 minfreq)
|
||||
{
|
||||
LexemeKey key;
|
||||
TextFreq *searchres;
|
||||
Selectivity selec,
|
||||
s1,
|
||||
s2;
|
||||
Selectivity selec;
|
||||
|
||||
/* since this function recurses, it could be driven to stack overflow */
|
||||
check_stack_depth();
|
||||
@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
||||
if (item->type == QI_VAL)
|
||||
{
|
||||
QueryOperand *oper = (QueryOperand *) item;
|
||||
|
||||
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
|
||||
if (lookup == NULL)
|
||||
return (Selectivity) DEFAULT_TS_MATCH_SEL;
|
||||
LexemeKey key;
|
||||
|
||||
/*
|
||||
* Prepare the key for bsearch().
|
||||
@ -294,6 +289,59 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
||||
key.lexeme = operand + oper->distance;
|
||||
key.length = oper->length;
|
||||
|
||||
if (oper->prefix)
|
||||
{
|
||||
/* Prefix match, ie the query item is lexeme:* */
|
||||
Selectivity matched,
|
||||
allmcvs;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Our strategy is to scan through the MCELEM list and add up the
|
||||
* frequencies of the ones that match the prefix, thereby
|
||||
* assuming that the MCELEMs are representative of the whole lexeme
|
||||
* population in this respect. Compare histogram_selectivity().
|
||||
*
|
||||
* This is only a good plan if we have a pretty fair number of
|
||||
* MCELEMs available; we set the threshold at 100. If no stats or
|
||||
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
|
||||
*/
|
||||
if (lookup == NULL || length < 100)
|
||||
return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
|
||||
|
||||
matched = allmcvs = 0;
|
||||
for (i = 0; i < length; i++)
|
||||
{
|
||||
TextFreq *t = lookup + i;
|
||||
int tlen = VARSIZE_ANY_EXHDR(t->element);
|
||||
|
||||
if (tlen >= key.length &&
|
||||
strncmp(key.lexeme, VARDATA_ANY(t->element),
|
||||
key.length) == 0)
|
||||
matched += t->frequency;
|
||||
allmcvs += t->frequency;
|
||||
}
|
||||
|
||||
if (allmcvs > 0) /* paranoia about zero divide */
|
||||
selec = matched / allmcvs;
|
||||
else
|
||||
selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
|
||||
|
||||
/*
|
||||
* In any case, never believe that a prefix match has selectivity
|
||||
* less than DEFAULT_TS_MATCH_SEL.
|
||||
*/
|
||||
selec = Max(DEFAULT_TS_MATCH_SEL, selec);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Regular exact lexeme match */
|
||||
TextFreq *searchres;
|
||||
|
||||
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
|
||||
if (lookup == NULL)
|
||||
return (Selectivity) DEFAULT_TS_MATCH_SEL;
|
||||
|
||||
searchres = (TextFreq *) bsearch(&key, lookup, length,
|
||||
sizeof(TextFreq),
|
||||
compare_lexeme_textfreq);
|
||||
@ -301,10 +349,10 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
||||
if (searchres)
|
||||
{
|
||||
/*
|
||||
* The element is in MCELEM. Return precise selectivity (or at
|
||||
* least as precise as ANALYZE could find out).
|
||||
* The element is in MCELEM. Return precise selectivity (or
|
||||
* at least as precise as ANALYZE could find out).
|
||||
*/
|
||||
return (Selectivity) searchres->frequency;
|
||||
selec = searchres->frequency;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -312,11 +360,16 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
||||
* The element is not in MCELEM. Punt, but assume that the
|
||||
* selectivity cannot be more than minfreq / 2.
|
||||
*/
|
||||
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
|
||||
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Current TSQuery node is an operator */
|
||||
Selectivity s1,
|
||||
s2;
|
||||
|
||||
switch (item->qoperator.oper)
|
||||
{
|
||||
case OP_NOT:
|
||||
@ -345,6 +398,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
||||
selec = 0; /* keep compiler quiet */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Clamp intermediate results to stay sane despite roundoff error */
|
||||
CLAMP_PROBABILITY(selec);
|
||||
|
Loading…
x
Reference in New Issue
Block a user