mirror of
https://github.com/postgres/postgres.git
synced 2025-05-20 05:13:53 +03:00
Add some knowledge about prefix matches to tsmatchsel(). It's not terribly bright, but it beats assuming that a prefix match behaves identically to an exact match, which is what the code was doing before :-(. Noted while experimenting with Artur Dobrowski's example.
This commit is contained in:
parent
d4fe61b083
commit
97532f7c29
@ -7,7 +7,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
|
|||||||
*
|
*
|
||||||
* 1 - select(oper) in NOT nodes
|
* 1 - select(oper) in NOT nodes
|
||||||
*
|
*
|
||||||
* freq[val] in VAL nodes, if the value is in MCELEM
|
* histogram-based estimation in prefix VAL nodes
|
||||||
|
*
|
||||||
|
* freq[val] in exact VAL nodes, if the value is in MCELEM
|
||||||
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
|
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
|
||||||
*
|
*
|
||||||
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
|
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
|
||||||
* binary search for determining freq[MCELEM].
|
* binary search for determining freq[MCELEM].
|
||||||
*
|
*
|
||||||
* If we don't have stats for the tsvector, we still use this logic,
|
* If we don't have stats for the tsvector, we still use this logic,
|
||||||
* except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case
|
* except we use default estimates for VAL nodes. This case is signaled
|
||||||
* is signaled by lookup == NULL.
|
* by lookup == NULL.
|
||||||
*/
|
*/
|
||||||
static Selectivity
|
static Selectivity
|
||||||
tsquery_opr_selec(QueryItem *item, char *operand,
|
tsquery_opr_selec(QueryItem *item, char *operand,
|
||||||
TextFreq *lookup, int length, float4 minfreq)
|
TextFreq *lookup, int length, float4 minfreq)
|
||||||
{
|
{
|
||||||
LexemeKey key;
|
Selectivity selec;
|
||||||
TextFreq *searchres;
|
|
||||||
Selectivity selec,
|
|
||||||
s1,
|
|
||||||
s2;
|
|
||||||
|
|
||||||
/* since this function recurses, it could be driven to stack overflow */
|
/* since this function recurses, it could be driven to stack overflow */
|
||||||
check_stack_depth();
|
check_stack_depth();
|
||||||
@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
|||||||
if (item->type == QI_VAL)
|
if (item->type == QI_VAL)
|
||||||
{
|
{
|
||||||
QueryOperand *oper = (QueryOperand *) item;
|
QueryOperand *oper = (QueryOperand *) item;
|
||||||
|
LexemeKey key;
|
||||||
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
|
|
||||||
if (lookup == NULL)
|
|
||||||
return (Selectivity) DEFAULT_TS_MATCH_SEL;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Prepare the key for bsearch().
|
* Prepare the key for bsearch().
|
||||||
@ -294,56 +289,115 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
|||||||
key.lexeme = operand + oper->distance;
|
key.lexeme = operand + oper->distance;
|
||||||
key.length = oper->length;
|
key.length = oper->length;
|
||||||
|
|
||||||
searchres = (TextFreq *) bsearch(&key, lookup, length,
|
if (oper->prefix)
|
||||||
sizeof(TextFreq),
|
|
||||||
compare_lexeme_textfreq);
|
|
||||||
|
|
||||||
if (searchres)
|
|
||||||
{
|
{
|
||||||
|
/* Prefix match, ie the query item is lexeme:* */
|
||||||
|
Selectivity matched,
|
||||||
|
allmcvs;
|
||||||
|
int i;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The element is in MCELEM. Return precise selectivity (or at
|
* Our strategy is to scan through the MCV list and add up the
|
||||||
* least as precise as ANALYZE could find out).
|
* frequencies of the ones that match the prefix, thereby
|
||||||
|
* assuming that the MCVs are representative of the whole lexeme
|
||||||
|
* population in this respect. Compare histogram_selectivity().
|
||||||
|
*
|
||||||
|
* This is only a good plan if we have a pretty fair number of
|
||||||
|
* MCVs available; we set the threshold at 100. If no stats or
|
||||||
|
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
|
||||||
*/
|
*/
|
||||||
return (Selectivity) searchres->frequency;
|
if (lookup == NULL || length < 100)
|
||||||
|
return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
|
||||||
|
|
||||||
|
matched = allmcvs = 0;
|
||||||
|
for (i = 0; i < length; i++)
|
||||||
|
{
|
||||||
|
TextFreq *t = lookup + i;
|
||||||
|
int tlen = VARSIZE_ANY_EXHDR(t->element);
|
||||||
|
|
||||||
|
if (tlen >= key.length &&
|
||||||
|
strncmp(key.lexeme, VARDATA_ANY(t->element),
|
||||||
|
key.length) == 0)
|
||||||
|
matched += t->frequency;
|
||||||
|
allmcvs += t->frequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (allmcvs > 0) /* paranoia about zero divide */
|
||||||
|
selec = matched / allmcvs;
|
||||||
|
else
|
||||||
|
selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In any case, never believe that a prefix match has selectivity
|
||||||
|
* less than DEFAULT_TS_MATCH_SEL.
|
||||||
|
*/
|
||||||
|
selec = Max(DEFAULT_TS_MATCH_SEL, selec);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/*
|
/* Regular exact lexeme match */
|
||||||
* The element is not in MCELEM. Punt, but assume that the
|
TextFreq *searchres;
|
||||||
* selectivity cannot be more than minfreq / 2.
|
|
||||||
*/
|
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
|
||||||
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
|
if (lookup == NULL)
|
||||||
|
return (Selectivity) DEFAULT_TS_MATCH_SEL;
|
||||||
|
|
||||||
|
searchres = (TextFreq *) bsearch(&key, lookup, length,
|
||||||
|
sizeof(TextFreq),
|
||||||
|
compare_lexeme_textfreq);
|
||||||
|
|
||||||
|
if (searchres)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* The element is in MCELEM. Return precise selectivity (or
|
||||||
|
* at least as precise as ANALYZE could find out).
|
||||||
|
*/
|
||||||
|
selec = searchres->frequency;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* The element is not in MCELEM. Punt, but assume that the
|
||||||
|
* selectivity cannot be more than minfreq / 2.
|
||||||
|
*/
|
||||||
|
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
/* Current TSQuery node is an operator */
|
|
||||||
switch (item->qoperator.oper)
|
|
||||||
{
|
{
|
||||||
case OP_NOT:
|
/* Current TSQuery node is an operator */
|
||||||
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
|
Selectivity s1,
|
||||||
lookup, length, minfreq);
|
s2;
|
||||||
break;
|
|
||||||
|
|
||||||
case OP_AND:
|
switch (item->qoperator.oper)
|
||||||
s1 = tsquery_opr_selec(item + 1, operand,
|
{
|
||||||
lookup, length, minfreq);
|
case OP_NOT:
|
||||||
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
|
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
|
||||||
lookup, length, minfreq);
|
lookup, length, minfreq);
|
||||||
selec = s1 * s2;
|
break;
|
||||||
break;
|
|
||||||
|
|
||||||
case OP_OR:
|
case OP_AND:
|
||||||
s1 = tsquery_opr_selec(item + 1, operand,
|
s1 = tsquery_opr_selec(item + 1, operand,
|
||||||
lookup, length, minfreq);
|
lookup, length, minfreq);
|
||||||
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
|
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
|
||||||
lookup, length, minfreq);
|
lookup, length, minfreq);
|
||||||
selec = s1 + s2 - s1 * s2;
|
selec = s1 * s2;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
case OP_OR:
|
||||||
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
|
s1 = tsquery_opr_selec(item + 1, operand,
|
||||||
selec = 0; /* keep compiler quiet */
|
lookup, length, minfreq);
|
||||||
break;
|
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
|
||||||
|
lookup, length, minfreq);
|
||||||
|
selec = s1 + s2 - s1 * s2;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
|
||||||
|
selec = 0; /* keep compiler quiet */
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Clamp intermediate results to stay sane despite roundoff error */
|
/* Clamp intermediate results to stay sane despite roundoff error */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user