Add some knowledge about prefix matches to tsmatchsel().

It's not terribly bright, but it beats assuming that a prefix match behaves identically to an exact match, which is what the code was doing before :-(. Noted while experimenting with Artur Dobrowski's example.
2025-12-09 02:08:45 +03:00 · 2010-08-01 21:31:08 +00:00
parent d4fe61b083
commit 97532f7c29
1 changed files with 105 additions and 51 deletions
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
 *
 *	 1 - select(oper) in NOT nodes
 *
- *	 freq[val] in VAL nodes, if the value is in MCELEM
+ *	 histogram-based estimation in prefix VAL nodes
+ *
+ *	 freq[val] in exact VAL nodes, if the value is in MCELEM
 *	 min(freq[MCELEM]) / 2 in VAL nodes, if it is not
 *
 * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
 * binary search for determining freq[MCELEM].
 *
 * If we don't have stats for the tsvector, we still use this logic,
- * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes.  This case
- * is signaled by lookup == NULL.
+ * except we use default estimates for VAL nodes.  This case is signaled
+ * by lookup == NULL.
 */
 static Selectivity
 tsquery_opr_selec(QueryItem *item, char *operand,
 				  TextFreq *lookup, int length, float4 minfreq)
 {
-	LexemeKey	key;
-	TextFreq   *searchres;
-	Selectivity selec,
-				s1,
-				s2;
+	Selectivity selec;

 	/* since this function recurses, it could be driven to stack overflow */
 	check_stack_depth();
@@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 	if (item->type == QI_VAL)
 	{
 		QueryOperand *oper = (QueryOperand *) item;
-
-		/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
-		if (lookup == NULL)
-			return (Selectivity) DEFAULT_TS_MATCH_SEL;
+		LexemeKey	key;

 		/*
 		 * Prepare the key for bsearch().
@@ -294,6 +289,59 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 		key.lexeme = operand + oper->distance;
 		key.length = oper->length;

+		if (oper->prefix)
+		{
+			/* Prefix match, ie the query item is lexeme:* */
+			Selectivity matched,
+						allmcvs;
+			int			i;
+
+			/*
+			 * Our strategy is to scan through the MCV list and add up the
+			 * frequencies of the ones that match the prefix, thereby
+			 * assuming that the MCVs are representative of the whole lexeme
+			 * population in this respect.  Compare histogram_selectivity().
+			 *
+			 * This is only a good plan if we have a pretty fair number of
+			 * MCVs available; we set the threshold at 100.  If no stats or
+			 * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
+			 */
+			if (lookup == NULL || length < 100)
+				return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
+			matched = allmcvs = 0;
+			for (i = 0; i < length; i++)
+			{
+				TextFreq   *t = lookup + i;
+				int			tlen = VARSIZE_ANY_EXHDR(t->element);
+
+				if (tlen >= key.length &&
+					strncmp(key.lexeme, VARDATA_ANY(t->element),
+							key.length) == 0)
+					matched += t->frequency;
+				allmcvs += t->frequency;
+			}
+
+			if (allmcvs > 0)	/* paranoia about zero divide */
+				selec = matched / allmcvs;
+			else
+				selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
+			/*
+			 * In any case, never believe that a prefix match has selectivity
+			 * less than DEFAULT_TS_MATCH_SEL.
+			 */
+			selec = Max(DEFAULT_TS_MATCH_SEL, selec);
+		}
+		else
+		{
+			/* Regular exact lexeme match */
+			TextFreq   *searchres;
+
+			/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
+			if (lookup == NULL)
+				return (Selectivity) DEFAULT_TS_MATCH_SEL;
+
 			searchres = (TextFreq *) bsearch(&key, lookup, length,
 											 sizeof(TextFreq),
 											 compare_lexeme_textfreq);
@@ -301,10 +349,10 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 			if (searchres)
 			{
 				/*
-			 * The element is in MCELEM.  Return precise selectivity (or at
-			 * least as precise as ANALYZE could find out).
+				 * The element is in MCELEM.  Return precise selectivity (or
+				 * at least as precise as ANALYZE could find out).
 				 */
-			return (Selectivity) searchres->frequency;
+				selec = searchres->frequency;
 			}
 			else
 			{
@@ -312,11 +360,16 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 				 * The element is not in MCELEM.  Punt, but assume that the
 				 * selectivity cannot be more than minfreq / 2.
 				 */
-			return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+				selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
 			}
 		}
-
+	}
+	else
+	{
 		/* Current TSQuery node is an operator */
+		Selectivity s1,
+					s2;
+
 		switch (item->qoperator.oper)
 		{
 			case OP_NOT:
@@ -345,6 +398,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 				selec = 0;			/* keep compiler quiet */
 				break;
 		}
+	}

 	/* Clamp intermediate results to stay sane despite roundoff error */
 	CLAMP_PROBABILITY(selec);