mirror of
https://github.com/postgres/postgres.git
synced 2025-07-30 11:03:19 +03:00
Support LIKE and ILIKE index searches via contrib/pg_trgm indexes.
Unlike Btree-based LIKE optimization, this works for non-left-anchored search patterns. The effectiveness of the search depends on how many trigrams can be extracted from the pattern. (The worst case, with no trigrams, degrades to a full-table scan, so this isn't a panacea. But it can be very useful.) Alexander Korotkov, reviewed by Jan Urbanski
This commit is contained in:
@ -18,6 +18,23 @@ float4 trgm_limit = 0.3f;
|
||||
|
||||
PG_FUNCTION_INFO_V1(set_limit);
|
||||
Datum set_limit(PG_FUNCTION_ARGS);
|
||||
|
||||
PG_FUNCTION_INFO_V1(show_limit);
|
||||
Datum show_limit(PG_FUNCTION_ARGS);
|
||||
|
||||
PG_FUNCTION_INFO_V1(show_trgm);
|
||||
Datum show_trgm(PG_FUNCTION_ARGS);
|
||||
|
||||
PG_FUNCTION_INFO_V1(similarity);
|
||||
Datum similarity(PG_FUNCTION_ARGS);
|
||||
|
||||
PG_FUNCTION_INFO_V1(similarity_dist);
|
||||
Datum similarity_dist(PG_FUNCTION_ARGS);
|
||||
|
||||
PG_FUNCTION_INFO_V1(similarity_op);
|
||||
Datum similarity_op(PG_FUNCTION_ARGS);
|
||||
|
||||
|
||||
Datum
|
||||
set_limit(PG_FUNCTION_ARGS)
|
||||
{
|
||||
@ -29,8 +46,6 @@ set_limit(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_FLOAT4(trgm_limit);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(show_limit);
|
||||
Datum show_limit(PG_FUNCTION_ARGS);
|
||||
Datum
|
||||
show_limit(PG_FUNCTION_ARGS)
|
||||
{
|
||||
@ -120,7 +135,7 @@ cnt_trigram(trgm *tptr, char *str, int bytelen)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Adds trigramm from words (already padded).
|
||||
* Adds trigrams from words (already padded).
|
||||
*/
|
||||
static trgm *
|
||||
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
|
||||
@ -236,6 +251,225 @@ generate_trgm(char *str, int slen)
|
||||
return trg;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract the next non-wildcard part of a search string, ie, a word bounded
|
||||
* by '_' or '%' meta-characters, non-word characters or string end.
|
||||
*
|
||||
* str: source string, of length lenstr bytes (need not be null-terminated)
|
||||
* buf: where to return the substring (must be long enough)
|
||||
* *bytelen: receives byte length of the found substring
|
||||
* *charlen: receives character length of the found substring
|
||||
*
|
||||
* Returns pointer to end+1 of the found substring in the source string.
|
||||
* Returns NULL if no word found (in which case buf, bytelen, charlen not set)
|
||||
*
|
||||
* If the found word is bounded by non-word characters or string boundaries
|
||||
* then this function will include corresponding padding spaces into buf.
|
||||
*/
|
||||
static const char *
|
||||
get_wildcard_part(const char *str, int lenstr,
|
||||
char *buf, int *bytelen, int *charlen)
|
||||
{
|
||||
const char *beginword = str;
|
||||
const char *endword;
|
||||
char *s = buf;
|
||||
bool in_wildcard_meta = false;
|
||||
bool in_escape = false;
|
||||
int clen;
|
||||
|
||||
/*
|
||||
* Find the first word character remembering whether last character was
|
||||
* wildcard meta-character.
|
||||
*/
|
||||
while (beginword - str < lenstr)
|
||||
{
|
||||
if (in_escape)
|
||||
{
|
||||
in_escape = false;
|
||||
in_wildcard_meta = false;
|
||||
if (iswordchr(beginword))
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ISESCAPECHAR(beginword))
|
||||
in_escape = true;
|
||||
else if (ISWILDCARDCHAR(beginword))
|
||||
in_wildcard_meta = true;
|
||||
else if (iswordchr(beginword))
|
||||
break;
|
||||
else
|
||||
in_wildcard_meta = false;
|
||||
}
|
||||
beginword += pg_mblen(beginword);
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle string end.
|
||||
*/
|
||||
if (beginword - str >= lenstr)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Add left padding spaces if last character wasn't wildcard
|
||||
* meta-character.
|
||||
*/
|
||||
*charlen = 0;
|
||||
if (!in_wildcard_meta)
|
||||
{
|
||||
if (LPADDING > 0)
|
||||
{
|
||||
*s++ = ' ';
|
||||
(*charlen)++;
|
||||
if (LPADDING > 1)
|
||||
{
|
||||
*s++ = ' ';
|
||||
(*charlen)++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy data into buf until wildcard meta-character, non-word character or
|
||||
* string boundary. Strip escapes during copy.
|
||||
*/
|
||||
endword = beginword;
|
||||
in_wildcard_meta = false;
|
||||
in_escape = false;
|
||||
while (endword - str < lenstr)
|
||||
{
|
||||
clen = pg_mblen(endword);
|
||||
if (in_escape)
|
||||
{
|
||||
in_escape = false;
|
||||
in_wildcard_meta = false;
|
||||
if (iswordchr(endword))
|
||||
{
|
||||
memcpy(s, endword, clen);
|
||||
(*charlen)++;
|
||||
s += clen;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ISESCAPECHAR(endword))
|
||||
in_escape = true;
|
||||
else if (ISWILDCARDCHAR(endword))
|
||||
{
|
||||
in_wildcard_meta = true;
|
||||
break;
|
||||
}
|
||||
else if (iswordchr(endword))
|
||||
{
|
||||
memcpy(s, endword, clen);
|
||||
(*charlen)++;
|
||||
s += clen;
|
||||
}
|
||||
else
|
||||
{
|
||||
in_wildcard_meta = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
endword += clen;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add right padding spaces if last character wasn't wildcard
|
||||
* meta-character.
|
||||
*/
|
||||
if (!in_wildcard_meta)
|
||||
{
|
||||
if (RPADDING > 0)
|
||||
{
|
||||
*s++ = ' ';
|
||||
(*charlen)++;
|
||||
if (RPADDING > 1)
|
||||
{
|
||||
*s++ = ' ';
|
||||
(*charlen)++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*bytelen = s - buf;
|
||||
return endword;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generates trigrams for wildcard search string.
|
||||
*
|
||||
* Returns array of trigrams that must occur in any string that matches the
|
||||
* wildcard string. For example, given pattern "a%bcd%" the trigrams
|
||||
* " a", "bcd" would be extracted.
|
||||
*/
|
||||
TRGM *
|
||||
generate_wildcard_trgm(const char *str, int slen)
|
||||
{
|
||||
TRGM *trg;
|
||||
char *buf,
|
||||
*buf2;
|
||||
trgm *tptr;
|
||||
int len,
|
||||
charlen,
|
||||
bytelen;
|
||||
const char *eword;
|
||||
|
||||
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
|
||||
trg->flag = ARRKEY;
|
||||
SET_VARSIZE(trg, TRGMHDRSIZE);
|
||||
|
||||
if (slen + LPADDING + RPADDING < 3 || slen == 0)
|
||||
return trg;
|
||||
|
||||
tptr = GETARR(trg);
|
||||
|
||||
buf = palloc(sizeof(char) * (slen + 4));
|
||||
|
||||
/*
|
||||
* Extract trigrams from each substring extracted by get_wildcard_part.
|
||||
*/
|
||||
eword = str;
|
||||
while ((eword = get_wildcard_part(eword, slen - (eword - str),
|
||||
buf, &bytelen, &charlen)) != NULL)
|
||||
{
|
||||
#ifdef IGNORECASE
|
||||
buf2 = lowerstr_with_len(buf, bytelen);
|
||||
bytelen = strlen(buf2);
|
||||
#else
|
||||
buf2 = buf;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* count trigrams
|
||||
*/
|
||||
tptr = make_trigrams(tptr, buf2, bytelen, charlen);
|
||||
#ifdef IGNORECASE
|
||||
pfree(buf2);
|
||||
#endif
|
||||
}
|
||||
|
||||
pfree(buf);
|
||||
|
||||
if ((len = tptr - GETARR(trg)) == 0)
|
||||
return trg;
|
||||
|
||||
/*
|
||||
* Make trigrams unique.
|
||||
*/
|
||||
if (len > 0)
|
||||
{
|
||||
qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
|
||||
len = unique_array(GETARR(trg), len);
|
||||
}
|
||||
|
||||
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
|
||||
|
||||
return trg;
|
||||
}
|
||||
|
||||
uint32
|
||||
trgm2int(trgm *ptr)
|
||||
{
|
||||
@ -250,8 +484,6 @@ trgm2int(trgm *ptr)
|
||||
return val;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(show_trgm);
|
||||
Datum show_trgm(PG_FUNCTION_ARGS);
|
||||
Datum
|
||||
show_trgm(PG_FUNCTION_ARGS)
|
||||
{
|
||||
@ -340,8 +572,44 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
|
||||
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(similarity);
|
||||
Datum similarity(PG_FUNCTION_ARGS);
|
||||
/*
|
||||
* Returns whether trg2 contains all trigrams in trg1.
|
||||
* This relies on the trigram arrays being sorted.
|
||||
*/
|
||||
bool
|
||||
trgm_contained_by(TRGM *trg1, TRGM *trg2)
|
||||
{
|
||||
trgm *ptr1,
|
||||
*ptr2;
|
||||
int len1,
|
||||
len2;
|
||||
|
||||
ptr1 = GETARR(trg1);
|
||||
ptr2 = GETARR(trg2);
|
||||
|
||||
len1 = ARRNELEM(trg1);
|
||||
len2 = ARRNELEM(trg2);
|
||||
|
||||
while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
|
||||
{
|
||||
int res = CMPTRGM(ptr1, ptr2);
|
||||
|
||||
if (res < 0)
|
||||
return false;
|
||||
else if (res > 0)
|
||||
ptr2++;
|
||||
else
|
||||
{
|
||||
ptr1++;
|
||||
ptr2++;
|
||||
}
|
||||
}
|
||||
if (ptr1 - GETARR(trg1) < len1)
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
|
||||
Datum
|
||||
similarity(PG_FUNCTION_ARGS)
|
||||
{
|
||||
@ -364,8 +632,6 @@ similarity(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_FLOAT4(res);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(similarity_dist);
|
||||
Datum similarity_dist(PG_FUNCTION_ARGS);
|
||||
Datum
|
||||
similarity_dist(PG_FUNCTION_ARGS)
|
||||
{
|
||||
@ -375,8 +641,6 @@ similarity_dist(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_FLOAT4(1.0 - res);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(similarity_op);
|
||||
Datum similarity_op(PG_FUNCTION_ARGS);
|
||||
Datum
|
||||
similarity_op(PG_FUNCTION_ARGS)
|
||||
{
|
||||
|
Reference in New Issue
Block a user