From 6f5b8beb64d481c28a483090d10099c8d619c797 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Apr 2013 13:30:14 -0400 Subject: [PATCH] Make contrib/pg_trgm also support regex searches with GiST indexes. This wasn't addressed in the original patch, but it doesn't take very much additional code to cover the case, so let's get it done. Since pg_trgm 1.1 hasn't been released yet, I just changed the definition of what's in it, rather than inventing a 1.2. --- contrib/pg_trgm/expected/pg_trgm.out | 132 ++++++++++++++++++++++ contrib/pg_trgm/pg_trgm--1.0--1.1.sql | 4 + contrib/pg_trgm/pg_trgm--1.1.sql | 6 + contrib/pg_trgm/sql/pg_trgm.sql | 23 ++++ contrib/pg_trgm/trgm.h | 5 +- contrib/pg_trgm/trgm_gin.c | 3 +- contrib/pg_trgm/trgm_gist.c | 152 ++++++++++++++++++++++---- contrib/pg_trgm/trgm_op.c | 44 ++++++++ contrib/pg_trgm/trgm_regexp.c | 14 ++- doc/src/sgml/pgtrgm.sgml | 4 +- 10 files changed, 352 insertions(+), 35 deletions(-) diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index 0ba44fa6a05..13b1fde1b8b 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -3706,3 +3706,135 @@ select * from test2 where t ilike 'qua%'; quark (1 row) +select * from test2 where t like '%z foo bar%'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t like ' z foo%'; + t +------------- + z foo bar +(1 row) + +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~ '[abc]{3}'::text) +(2 rows) + +explain (costs off) + select * from test2 where t ~* 'DEF'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~* 'DEF'::text) +(2 rows) + +select * from test2 where t ~ '[abc]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'a[bc]+d'; + t +-------- + abcdef +(1 row) + +select * 
from test2 where t ~ '(abc)*$'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* 'DEF'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'dEf'; + t +--- +(0 rows) + +select * from test2 where t ~* '^q'; + t +------- + quark +(1 row) + +select * from test2 where t ~* '[abc]{3}[def]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* 'ab[a-z]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* '(^| )qua'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q.*rk$'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q'; + t +------- + quark +(1 row) + +select * from test2 where t ~ '[a-z]{3}'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; + t +--- +(0 rows) + +select * from test2 where t ~ 'z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo'; + t +------------- + z foo bar +(1 row) + diff --git a/contrib/pg_trgm/pg_trgm--1.0--1.1.sql b/contrib/pg_trgm/pg_trgm--1.0--1.1.sql index 449f840084b..65bbb1cfde6 100644 --- a/contrib/pg_trgm/pg_trgm--1.0--1.1.sql +++ b/contrib/pg_trgm/pg_trgm--1.0--1.1.sql @@ -3,6 +3,10 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. 
\quit +ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); + ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD OPERATOR 5 pg_catalog.~ (text, text), OPERATOR 6 pg_catalog.~* (text, text); diff --git a/contrib/pg_trgm/pg_trgm--1.1.sql b/contrib/pg_trgm/pg_trgm--1.1.sql index 5d28339738e..1fff7af2c48 100644 --- a/contrib/pg_trgm/pg_trgm--1.1.sql +++ b/contrib/pg_trgm/pg_trgm--1.1.sql @@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD OPERATOR 4 pg_catalog.~~* (text, text), FUNCTION 8 (text, text) gtrgm_distance (internal, text, int, oid); +-- Add operators that are new in 9.3. + +ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); + -- support functions for gin CREATE FUNCTION gin_extract_value_trgm(text, internal) RETURNS internal diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index 37a4c247056..7b02d988187 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -90,3 +90,26 @@ select * from test2 where t like '%bcd%'; select * from test2 where t like E'%\\bcd%'; select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike 'qua%'; +select * from test2 where t like '%z foo bar%'; +select * from test2 where t like ' z foo%'; +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; +explain (costs off) + select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ '[abc]{3}'; +select * from test2 where t ~ 'a[bc]+d'; +select * from test2 where t ~ '(abc)*$'; +select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ 'dEf'; +select * from test2 where t ~* '^q'; +select * from test2 where t ~* '[abc]{3}[def]{3}'; +select * from test2 where t ~* 'ab[a-z]{3}'; +select * from test2 where t ~* '(^| )qua'; +select * from test2 where t ~ 'q.*rk$'; +select * from test2 where t ~ 'q'; +select * from 
test2 where t ~ '[a-z]{3}'; +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; +select * from test2 where t ~ 'z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo'; diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 15e7bebb001..ed649b8dccd 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -113,8 +113,9 @@ extern TRGM *generate_trgm(char *str, int slen); extern TRGM *generate_wildcard_trgm(const char *str, int slen); extern float4 cnt_sml(TRGM *trg1, TRGM *trg2); extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2); -extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph, - Oid collation); +extern bool *trgm_presence_map(TRGM *query, TRGM *key); +extern TRGM *createTrgmNFA(text *text_re, Oid collation, + TrgmPackedGraph **graph, MemoryContext rcontext); extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check); #endif /* __TRGM_H__ */ diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index e8285715631..1fbbd9ca35c 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -115,7 +115,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) #endif /* FALL THRU */ case RegExpStrategyNumber: - trg = createTrgmNFA(val, &graph, PG_GET_COLLATION()); + trg = createTrgmNFA(val, PG_GET_COLLATION(), + &graph, CurrentMemoryContext); if (trg && ARRNELEM(trg) > 0) { /* diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index 605d7ea3569..178f073755b 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -8,6 +8,25 @@ #include "access/skey.h" +typedef struct +{ + /* most recent inputs to gtrgm_consistent */ + StrategyNumber strategy; + text *query; + /* extracted trigrams for query */ + TRGM *trigrams; + /* if a regex operator, the extracted graph */ + TrgmPackedGraph *graph; + + /* + * The "query" and "trigrams" are stored in the same palloc block as this + * 
cache struct, at MAXALIGN'ed offsets. The graph however isn't. + */ +} gtrgm_consistent_cache; + +#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key)) + + PG_FUNCTION_INFO_V1(gtrgm_in); Datum gtrgm_in(PG_FUNCTION_ARGS); @@ -38,8 +57,6 @@ Datum gtrgm_penalty(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(gtrgm_picksplit); Datum gtrgm_picksplit(PG_FUNCTION_ARGS); -#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key)) - /* Number of one-bits in an unsigned byte */ static const uint8 number_of_ones[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, @@ -191,24 +208,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS) TRGM *qtrg; bool res; Size querysize = VARSIZE(query); - char *cache = (char *) fcinfo->flinfo->fn_extra, - *cachedQuery = cache + MAXALIGN(sizeof(StrategyNumber)); + gtrgm_consistent_cache *cache; /* - * Store both the strategy number and extracted trigrams in cache, because - * trigram extraction is relatively CPU-expensive. We must include - * strategy number because trigram extraction depends on strategy. + * We keep the extracted trigrams in cache, because trigram extraction is + * relatively CPU-expensive. When trying to reuse a cached value, check + * strategy number not just query itself, because trigram extraction + * depends on strategy. * - * The cached structure contains the strategy number, then the input query - * (starting at a MAXALIGN boundary), then the TRGM value (also starting - * at a MAXALIGN boundary). + * The cached structure is a single palloc chunk containing the + * gtrgm_consistent_cache header, then the input query (starting at a + * MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN + * boundary). However we don't try to include the regex graph (if any) in + * that struct. (XXX currently, this approach can leak regex graphs + * across index rescans. Not clear if that's worth fixing.) 
*/ + cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra; if (cache == NULL || - strategy != *((StrategyNumber *) cache) || - VARSIZE(cachedQuery) != querysize || - memcmp(cachedQuery, query, querysize) != 0) + cache->strategy != strategy || + VARSIZE(cache->query) != querysize || + memcmp((char *) cache->query, (char *) query, querysize) != 0) { - char *newcache; + gtrgm_consistent_cache *newcache; + TrgmPackedGraph *graph = NULL; + Size qtrgsize; switch (strategy) { @@ -225,28 +248,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS) qtrg = generate_wildcard_trgm(VARDATA(query), querysize - VARHDRSZ); break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + qtrg = createTrgmNFA(query, PG_GET_COLLATION(), + &graph, fcinfo->flinfo->fn_mcxt); + /* just in case an empty array is returned ... */ + if (qtrg && ARRNELEM(qtrg) <= 0) + { + pfree(qtrg); + qtrg = NULL; + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); qtrg = NULL; /* keep compiler quiet */ break; } - newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, - MAXALIGN(sizeof(StrategyNumber)) + - MAXALIGN(querysize) + - VARSIZE(qtrg)); - cachedQuery = newcache + MAXALIGN(sizeof(StrategyNumber)); + qtrgsize = qtrg ? 
VARSIZE(qtrg) : 0; - *((StrategyNumber *) newcache) = strategy; - memcpy(cachedQuery, query, querysize); - memcpy(cachedQuery + MAXALIGN(querysize), qtrg, VARSIZE(qtrg)); + newcache = (gtrgm_consistent_cache *) + MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + MAXALIGN(sizeof(gtrgm_consistent_cache)) + + MAXALIGN(querysize) + + qtrgsize); + + newcache->strategy = strategy; + newcache->query = (text *) + ((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache))); + memcpy((char *) newcache->query, (char *) query, querysize); + if (qtrg) + { + newcache->trigrams = (TRGM *) + ((char *) newcache->query + MAXALIGN(querysize)); + memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize); + /* release qtrg in case it was made in fn_mcxt */ + pfree(qtrg); + } + else + newcache->trigrams = NULL; + newcache->graph = graph; if (cache) pfree(cache); - fcinfo->flinfo->fn_extra = newcache; + fcinfo->flinfo->fn_extra = (void *) newcache; + cache = newcache; } - qtrg = (TRGM *) (cachedQuery + MAXALIGN(querysize)); + qtrg = cache->trigrams; switch (strategy) { @@ -317,6 +370,57 @@ gtrgm_consistent(PG_FUNCTION_ARGS) } } break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + /* Regexp search is inexact */ + *recheck = true; + + /* Check regex match as much as we can with available info */ + if (qtrg) + { + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + bool *check; + + check = trgm_presence_map(qtrg, key); + res = trigramsMatchGraph(cache->graph, check); + pfree(check); + } + else if (ISALLTRUE(key)) + { /* non-leaf contains signature */ + res = true; + } + else + { /* non-leaf contains signature */ + int32 k, + tmp = 0, + len = ARRNELEM(qtrg); + trgm *ptr = GETARR(qtrg); + BITVECP sign = GETSIGN(key); + + /* descend only if at least one trigram is present */ + res = false; + for (k = 0; k < len; k++) + { + CPTRGM(((char *) &tmp), ptr + 
k); + if (GETBIT(sign, HASHVAL(tmp))) + { + res = true; + break; + } + } + } + } + else + { + /* trigram-free query must be rechecked everywhere */ + res = true; + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); res = false; /* keep compiler quiet */ diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 49e94f57a84..76e470c7785 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -616,6 +616,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2) return true; } +/* + * Return a palloc'd boolean array showing, for each trigram in "query", + * whether it is present in the trigram array "key". + * This relies on the "key" array being sorted, but "query" need not be. + */ +bool * +trgm_presence_map(TRGM *query, TRGM *key) +{ + bool *result; + trgm *ptrq = GETARR(query), + *ptrk = GETARR(key); + int lenq = ARRNELEM(query), + lenk = ARRNELEM(key), + i; + + result = (bool *) palloc0(lenq * sizeof(bool)); + + /* for each query trigram, do a binary search in the key array */ + for (i = 0; i < lenq; i++) + { + int lo = 0; + int hi = lenk; + + while (lo < hi) + { + int mid = (lo + hi) / 2; + int res = CMPTRGM(ptrq, ptrk + mid); + + if (res < 0) + hi = mid; + else if (res > 0) + lo = mid + 1; + else + { + result[i] = true; + break; + } + } + ptrq++; + } + + return result; +} + Datum similarity(PG_FUNCTION_ARGS) { diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 72e9b208fe5..772fc44b3c4 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -476,10 +476,13 @@ static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams); * * Returns an array of trigrams required by the regular expression, or NULL if * the regular expression was too complex to analyze. In addition, a packed - * graph representation of the regex is returned into *graph. + * graph representation of the regex is returned into *graph. 
The results + * must be allocated in rcontext (which might or might not be the current + * context). */ TRGM * -createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) +createTrgmNFA(text *text_re, Oid collation, + TrgmPackedGraph **graph, MemoryContext rcontext) { TRGM *trg; regex_t regex; @@ -488,10 +491,9 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) /* * This processing generates a great deal of cruft, which we'd like to - * clean up before returning (since this function is normally called in a + * clean up before returning (since this function may be called in a * query-lifespan memory context). Make a temp context we can work in so - * that cleanup is easy. Note that the returned data structures must be - * allocated in caller's context, however. + * that cleanup is easy. */ tmpcontext = AllocSetContextCreate(CurrentMemoryContext, "createTrgmNFA temporary context", @@ -516,7 +518,7 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) */ PG_TRY(); { - trg = createTrgmNFAInternal(&regex, graph, oldcontext); + trg = createTrgmNFAInternal(&regex, graph, rcontext); } PG_CATCH(); { diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index 4572750f4d7..9039f03e8b9 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -216,8 +216,8 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar'; - Beginning in PostgreSQL 9.3, pg_trgm - GIN indexes also support index searches for regular-expression matches + Beginning in PostgreSQL 9.3, these index types also support + index searches for regular-expression matches (~ and ~* operators), for example SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';