Support LIKE and ILIKE index searches via contrib/pg_trgm indexes.

Unlike the B-tree-based LIKE optimization, this works for search patterns that are not left-anchored. The effectiveness of the search depends on how many trigrams can be extracted from the pattern. (The worst case, with no extractable trigrams, degrades to a full-table scan, so this isn't a panacea. But it can be very useful.) Patch by Alexander Korotkov, reviewed by Jan Urbanski.
2026-01-05 23:38:41 +03:00 · 2011-01-31 21:33:55 -05:00
parent 6238473adb
commit 6e2f3ae884
9 changed files with 639 additions and 49 deletions
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -18,6 +18,23 @@ float4		trgm_limit = 0.3f;

 PG_FUNCTION_INFO_V1(set_limit);
 Datum		set_limit(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(show_limit);
+Datum		show_limit(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(show_trgm);
+Datum		show_trgm(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity);
+Datum		similarity(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity_dist);
+Datum		similarity_dist(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity_op);
+Datum		similarity_op(PG_FUNCTION_ARGS);
+
+
 Datum
 set_limit(PG_FUNCTION_ARGS)
 {
@@ -29,8 +46,6 @@ set_limit(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT4(trgm_limit);
 }

-PG_FUNCTION_INFO_V1(show_limit);
-Datum		show_limit(PG_FUNCTION_ARGS);
 Datum
 show_limit(PG_FUNCTION_ARGS)
 {
@@ -120,7 +135,7 @@ cnt_trigram(trgm *tptr, char *str, int bytelen)
 #endif

 /*
- * Adds trigramm from words (already padded).
+ * Adds trigrams from words (already padded).
 */
 static trgm *
 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
@@ -236,6 +251,225 @@ generate_trgm(char *str, int slen)
 	return trg;
 }

+/*
+ * Extract the next non-wildcard part of a search string, ie, a word bounded
+ * by '_' or '%' meta-characters, non-word characters or string end.
+ *
+ * str: source string, of length lenstr bytes (need not be null-terminated)
+ * buf: where to return the substring (must be long enough)
+ * *bytelen: receives byte length of the found substring
+ * *charlen: receives character length of the found substring
+ *
+ * Returns a pointer into the source string at which the next call should
+ * resume scanning.  Normally that is end+1 of the found word; if the word
+ * was terminated by an escaped non-word character, it is the position of
+ * the escape character, so that the next call still sees the escape.
+ * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
+ *
+ * If the found word is bounded by non-word characters or string boundaries
+ * then this function will include corresponding padding spaces into buf.
+ * Escape characters are stripped from the copy; an escaped wildcard
+ * character is treated as an ordinary non-word character.
+ */
+static const char *
+get_wildcard_part(const char *str, int lenstr,
+				  char *buf, int *bytelen, int *charlen)
+{
+	const char *beginword = str;
+	const char *endword;
+	const char *escape_pos = NULL;	/* most recent escape char in copy loop */
+	char	   *s = buf;
+	bool		in_wildcard_meta = false;
+	bool		in_escape = false;
+	int			clen;
+
+	/*
+	 * Find the first word character, remembering whether the character just
+	 * before it was an (unescaped) wildcard meta-character.
+	 */
+	while (beginword - str < lenstr)
+	{
+		if (in_escape)
+		{
+			/* An escaped character is never a wildcard. */
+			in_escape = false;
+			in_wildcard_meta = false;
+			if (iswordchr(beginword))
+				break;
+		}
+		else
+		{
+			if (ISESCAPECHAR(beginword))
+				in_escape = true;
+			else if (ISWILDCARDCHAR(beginword))
+				in_wildcard_meta = true;
+			else if (iswordchr(beginword))
+				break;
+			else
+				in_wildcard_meta = false;
+		}
+		beginword += pg_mblen(beginword);
+	}
+
+	/*
+	 * Handle string end.
+	 */
+	if (beginword - str >= lenstr)
+		return NULL;
+
+	/*
+	 * Add left padding spaces if the preceding character wasn't a wildcard
+	 * meta-character.  (At most two padding spaces are ever emitted.)
+	 */
+	*charlen = 0;
+	if (!in_wildcard_meta)
+	{
+		if (LPADDING > 0)
+		{
+			*s++ = ' ';
+			(*charlen)++;
+			if (LPADDING > 1)
+			{
+				*s++ = ' ';
+				(*charlen)++;
+			}
+		}
+	}
+
+	/*
+	 * Copy data into buf until wildcard meta-character, non-word character or
+	 * string boundary.  Strip escapes during copy.
+	 *
+	 * From here on, in_wildcard_meta records whether the word was terminated
+	 * by an unescaped wildcard; it only becomes true on that exit path.
+	 */
+	endword = beginword;
+	in_wildcard_meta = false;
+	in_escape = false;
+	while (endword - str < lenstr)
+	{
+		clen = pg_mblen(endword);
+		if (in_escape)
+		{
+			if (!iswordchr(endword))
+			{
+				/*
+				 * The word ends at an escaped non-word character.  Back up
+				 * to the escape character itself, so that the next call
+				 * restarts there and still knows the following character is
+				 * escaped.  Otherwise an escaped '%' or '_' would be taken
+				 * for a real wildcard, and an escaped escape character would
+				 * wrongly escape whatever follows it (e.g. the '%' in
+				 * "a\\%b"), yielding trigrams the pattern doesn't require.
+				 */
+				endword = escape_pos;
+				break;
+			}
+			memcpy(s, endword, clen);
+			(*charlen)++;
+			s += clen;
+			in_escape = false;
+		}
+		else if (ISESCAPECHAR(endword))
+		{
+			escape_pos = endword;
+			in_escape = true;
+		}
+		else if (ISWILDCARDCHAR(endword))
+		{
+			in_wildcard_meta = true;
+			break;
+		}
+		else if (iswordchr(endword))
+		{
+			memcpy(s, endword, clen);
+			(*charlen)++;
+			s += clen;
+		}
+		else
+			break;
+		endword += clen;
+	}
+
+	/*
+	 * Add right padding spaces if the word wasn't terminated by a wildcard
+	 * meta-character.
+	 */
+	if (!in_wildcard_meta)
+	{
+		if (RPADDING > 0)
+		{
+			*s++ = ' ';
+			(*charlen)++;
+			if (RPADDING > 1)
+			{
+				*s++ = ' ';
+				(*charlen)++;
+			}
+		}
+	}
+
+	*bytelen = s - buf;
+	return endword;
+}
+
+/*
+ * Build the trigram set implied by a LIKE-style wildcard pattern.
+ *
+ * str/slen: the pattern and its length in bytes.
+ *
+ * The result is a palloc'd TRGM array (flag ARRKEY) holding, sorted and
+ * de-duplicated, the trigrams of every word fragment that
+ * get_wildcard_part() finds in the pattern; the array is empty when the
+ * pattern is too short or yields no fragments.  For example, for the
+ * pattern "a%bcd%" the fragment "a" is left-padded (it starts the string)
+ * but not right-padded (a wildcard follows), and "bcd" is not padded at all.
+ */
+TRGM *
+generate_wildcard_trgm(const char *str, int slen)
+{
+	TRGM	   *result;
+	trgm	   *next_trgm;
+	char	   *partbuf;
+	const char *cursor;
+	int			ntrgm;
+
+	/* Start out with an empty array sized by the same bound as before. */
+	result = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
+	result->flag = ARRKEY;
+	SET_VARSIZE(result, TRGMHDRSIZE);
+
+	/* Nothing can be extracted from an empty or too-short pattern. */
+	if (slen == 0 || slen + LPADDING + RPADDING < 3)
+		return result;
+
+	next_trgm = GETARR(result);
+
+	/* Scratch space for one fragment; the extra 4 bytes leave room for padding. */
+	partbuf = palloc(sizeof(char) * (slen + 4));
+
+	/*
+	 * Walk the pattern fragment by fragment, appending each fragment's
+	 * trigrams to the output array.
+	 */
+	cursor = str;
+	for (;;)
+	{
+		char	   *word;
+		int			nbytes;
+		int			nchars;
+
+		cursor = get_wildcard_part(cursor, slen - (cursor - str),
+								   partbuf, &nbytes, &nchars);
+		if (cursor == NULL)
+			break;
+
+#ifdef IGNORECASE
+		/* Lower-casing may change the byte length, so re-measure it. */
+		word = lowerstr_with_len(partbuf, nbytes);
+		nbytes = strlen(word);
+#else
+		word = partbuf;
+#endif
+
+		next_trgm = make_trigrams(next_trgm, word, nbytes, nchars);
+
+#ifdef IGNORECASE
+		pfree(word);
+#endif
+	}
+
+	pfree(partbuf);
+
+	ntrgm = next_trgm - GETARR(result);
+	if (ntrgm == 0)
+		return result;
+
+	/* Sort, then squeeze out duplicates. */
+	qsort((void *) GETARR(result), ntrgm, sizeof(trgm), comp_trgm);
+	ntrgm = unique_array(GETARR(result), ntrgm);
+
+	SET_VARSIZE(result, CALCGTSIZE(ARRKEY, ntrgm));
+
+	return result;
+}
+
 uint32
 trgm2int(trgm *ptr)
 {
@@ -250,8 +484,6 @@ trgm2int(trgm *ptr)
 	return val;
 }

-PG_FUNCTION_INFO_V1(show_trgm);
-Datum		show_trgm(PG_FUNCTION_ARGS);
 Datum
 show_trgm(PG_FUNCTION_ARGS)
 {
@@ -340,8 +572,44 @@ cnt_sml(TRGM *trg1, TRGM *trg2)

 }

-PG_FUNCTION_INFO_V1(similarity);
-Datum		similarity(PG_FUNCTION_ARGS);
+/*
+ * Test whether every trigram of trg1 also appears in trg2.
+ *
+ * Both arrays must be sorted in CMPTRGM order: the check is a single merge
+ * pass that advances through trg2 looking for each trg1 element in turn.
+ * An empty trg1 is trivially contained.
+ */
+bool
+trgm_contained_by(TRGM *trg1, TRGM *trg2)
+{
+	trgm	   *want = GETARR(trg1);
+	trgm	   *have = GETARR(trg2);
+	trgm	   *want_end = want + ARRNELEM(trg1);
+	trgm	   *have_end = have + ARRNELEM(trg2);
+
+	while (want < want_end && have < have_end)
+	{
+		int			cmp = CMPTRGM(want, have);
+
+		/*
+		 * The wanted trigram sorts before everything left in trg2, so it
+		 * cannot be present.
+		 */
+		if (cmp < 0)
+			return false;
+
+		/* On a match move on to the next wanted trigram as well. */
+		if (cmp == 0)
+			want++;
+		have++;
+	}
+
+	/* Contained only if trg1 was fully consumed before trg2 ran out. */
+	return want >= want_end;
+}
+
 Datum
 similarity(PG_FUNCTION_ARGS)
 {
@@ -364,8 +632,6 @@ similarity(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT4(res);
 }

-PG_FUNCTION_INFO_V1(similarity_dist);
-Datum		similarity_dist(PG_FUNCTION_ARGS);
 Datum
 similarity_dist(PG_FUNCTION_ARGS)
 {
@@ -375,8 +641,6 @@ similarity_dist(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT4(1.0 - res);
 }

-PG_FUNCTION_INFO_V1(similarity_op);
-Datum		similarity_op(PG_FUNCTION_ARGS);
 Datum
 similarity_op(PG_FUNCTION_ARGS)
 {