mirror of
https://github.com/postgres/postgres.git
synced 2025-04-25 21:42:33 +03:00
Make contrib/pg_trgm also support regex searches with GiST indexes.
This wasn't addressed in the original patch, but it doesn't take very much additional code to cover the case, so let's get it done. Since pg_trgm 1.1 hasn't been released yet, I just changed the definition of what's in it, rather than inventing a 1.2.
This commit is contained in:
parent
e543631f3c
commit
6f5b8beb64
@ -3706,3 +3706,135 @@ select * from test2 where t ilike 'qua%';
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t like '%z foo bar%';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t like '  z foo%';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
explain (costs off)
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
QUERY PLAN
|
||||
------------------------------------------
|
||||
Index Scan using test2_idx_gist on test2
|
||||
Index Cond: (t ~ '[abc]{3}'::text)
|
||||
(2 rows)
|
||||
|
||||
explain (costs off)
|
||||
select * from test2 where t ~* 'DEF';
|
||||
QUERY PLAN
|
||||
------------------------------------------
|
||||
Index Scan using test2_idx_gist on test2
|
||||
Index Cond: (t ~* 'DEF'::text)
|
||||
(2 rows)
|
||||
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'a[bc]+d';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ '(abc)*$';
|
||||
t
|
||||
-------------
|
||||
abcdef
|
||||
quark
|
||||
z foo bar
|
||||
(3 rows)
|
||||
|
||||
select * from test2 where t ~* 'DEF';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'dEf';
|
||||
t
|
||||
---
|
||||
(0 rows)
|
||||
|
||||
select * from test2 where t ~* '^q';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~* 'ab[a-z]{3}';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~* '(^| )qua';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'q.*rk$';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'q';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ '[a-z]{3}';
|
||||
t
|
||||
-------------
|
||||
abcdef
|
||||
quark
|
||||
z foo bar
|
||||
(3 rows)
|
||||
|
||||
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
||||
t
|
||||
---
|
||||
(0 rows)
|
||||
|
||||
select * from test2 where t ~ 'z foo bar';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ ' z foo bar';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ '  z foo bar';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ ' z foo';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
|
@ -3,6 +3,10 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit
|
||||
|
||||
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
|
||||
OPERATOR 5 pg_catalog.~ (text, text),
|
||||
OPERATOR 6 pg_catalog.~* (text, text);
|
||||
|
||||
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
|
||||
OPERATOR 5 pg_catalog.~ (text, text),
|
||||
OPERATOR 6 pg_catalog.~* (text, text);
|
||||
|
@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
|
||||
OPERATOR 4 pg_catalog.~~* (text, text),
|
||||
FUNCTION 8 (text, text) gtrgm_distance (internal, text, int, oid);
|
||||
|
||||
-- Add operators that are new in 9.3.
|
||||
|
||||
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
|
||||
OPERATOR 5 pg_catalog.~ (text, text),
|
||||
OPERATOR 6 pg_catalog.~* (text, text);
|
||||
|
||||
-- support functions for gin
|
||||
CREATE FUNCTION gin_extract_value_trgm(text, internal)
|
||||
RETURNS internal
|
||||
|
@ -90,3 +90,26 @@ select * from test2 where t like '%bcd%';
|
||||
select * from test2 where t like E'%\\bcd%';
|
||||
select * from test2 where t ilike '%BCD%';
|
||||
select * from test2 where t ilike 'qua%';
|
||||
select * from test2 where t like '%z foo bar%';
|
||||
select * from test2 where t like '  z foo%';
|
||||
explain (costs off)
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
explain (costs off)
|
||||
select * from test2 where t ~* 'DEF';
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
select * from test2 where t ~ 'a[bc]+d';
|
||||
select * from test2 where t ~ '(abc)*$';
|
||||
select * from test2 where t ~* 'DEF';
|
||||
select * from test2 where t ~ 'dEf';
|
||||
select * from test2 where t ~* '^q';
|
||||
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
||||
select * from test2 where t ~* 'ab[a-z]{3}';
|
||||
select * from test2 where t ~* '(^| )qua';
|
||||
select * from test2 where t ~ 'q.*rk$';
|
||||
select * from test2 where t ~ 'q';
|
||||
select * from test2 where t ~ '[a-z]{3}';
|
||||
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
||||
select * from test2 where t ~ 'z foo bar';
|
||||
select * from test2 where t ~ ' z foo bar';
|
||||
select * from test2 where t ~ '  z foo bar';
|
||||
select * from test2 where t ~ ' z foo';
|
||||
|
@ -113,8 +113,9 @@ extern TRGM *generate_trgm(char *str, int slen);
|
||||
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
|
||||
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
|
||||
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
|
||||
extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph,
|
||||
Oid collation);
|
||||
extern bool *trgm_presence_map(TRGM *query, TRGM *key);
|
||||
extern TRGM *createTrgmNFA(text *text_re, Oid collation,
|
||||
TrgmPackedGraph **graph, MemoryContext rcontext);
|
||||
extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check);
|
||||
|
||||
#endif /* __TRGM_H__ */
|
||||
|
@ -115,7 +115,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
|
||||
#endif
|
||||
/* FALL THRU */
|
||||
case RegExpStrategyNumber:
|
||||
trg = createTrgmNFA(val, &graph, PG_GET_COLLATION());
|
||||
trg = createTrgmNFA(val, PG_GET_COLLATION(),
|
||||
&graph, CurrentMemoryContext);
|
||||
if (trg && ARRNELEM(trg) > 0)
|
||||
{
|
||||
/*
|
||||
|
@ -8,6 +8,25 @@
|
||||
#include "access/skey.h"
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/* most recent inputs to gtrgm_consistent */
|
||||
StrategyNumber strategy;
|
||||
text *query;
|
||||
/* extracted trigrams for query */
|
||||
TRGM *trigrams;
|
||||
/* if a regex operator, the extracted graph */
|
||||
TrgmPackedGraph *graph;
|
||||
|
||||
/*
|
||||
* The "query" and "trigrams" are stored in the same palloc block as this
|
||||
* cache struct, at MAXALIGN'ed offsets. The graph however isn't.
|
||||
*/
|
||||
} gtrgm_consistent_cache;
|
||||
|
||||
#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
|
||||
|
||||
|
||||
PG_FUNCTION_INFO_V1(gtrgm_in);
|
||||
Datum gtrgm_in(PG_FUNCTION_ARGS);
|
||||
|
||||
@ -38,8 +57,6 @@ Datum gtrgm_penalty(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(gtrgm_picksplit);
|
||||
Datum gtrgm_picksplit(PG_FUNCTION_ARGS);
|
||||
|
||||
#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
|
||||
|
||||
/* Number of one-bits in an unsigned byte */
|
||||
static const uint8 number_of_ones[256] = {
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
@ -191,24 +208,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
|
||||
TRGM *qtrg;
|
||||
bool res;
|
||||
Size querysize = VARSIZE(query);
|
||||
char *cache = (char *) fcinfo->flinfo->fn_extra,
|
||||
*cachedQuery = cache + MAXALIGN(sizeof(StrategyNumber));
|
||||
gtrgm_consistent_cache *cache;
|
||||
|
||||
/*
|
||||
* Store both the strategy number and extracted trigrams in cache, because
|
||||
* trigram extraction is relatively CPU-expensive. We must include
|
||||
* strategy number because trigram extraction depends on strategy.
|
||||
* We keep the extracted trigrams in cache, because trigram extraction is
|
||||
* relatively CPU-expensive. When trying to reuse a cached value, check
|
||||
* strategy number not just query itself, because trigram extraction
|
||||
* depends on strategy.
|
||||
*
|
||||
* The cached structure contains the strategy number, then the input query
|
||||
* (starting at a MAXALIGN boundary), then the TRGM value (also starting
|
||||
* at a MAXALIGN boundary).
|
||||
* The cached structure is a single palloc chunk containing the
|
||||
* gtrgm_consistent_cache header, then the input query (starting at a
|
||||
* MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN
|
||||
* boundary). However we don't try to include the regex graph (if any) in
|
||||
* that struct. (XXX currently, this approach can leak regex graphs
|
||||
* across index rescans. Not clear if that's worth fixing.)
|
||||
*/
|
||||
cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra;
|
||||
if (cache == NULL ||
|
||||
strategy != *((StrategyNumber *) cache) ||
|
||||
VARSIZE(cachedQuery) != querysize ||
|
||||
memcmp(cachedQuery, query, querysize) != 0)
|
||||
cache->strategy != strategy ||
|
||||
VARSIZE(cache->query) != querysize ||
|
||||
memcmp((char *) cache->query, (char *) query, querysize) != 0)
|
||||
{
|
||||
char *newcache;
|
||||
gtrgm_consistent_cache *newcache;
|
||||
TrgmPackedGraph *graph = NULL;
|
||||
Size qtrgsize;
|
||||
|
||||
switch (strategy)
|
||||
{
|
||||
@ -225,28 +248,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
|
||||
qtrg = generate_wildcard_trgm(VARDATA(query),
|
||||
querysize - VARHDRSZ);
|
||||
break;
|
||||
case RegExpICaseStrategyNumber:
|
||||
#ifndef IGNORECASE
|
||||
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
|
||||
#endif
|
||||
/* FALL THRU */
|
||||
case RegExpStrategyNumber:
|
||||
qtrg = createTrgmNFA(query, PG_GET_COLLATION(),
|
||||
&graph, fcinfo->flinfo->fn_mcxt);
|
||||
/* just in case an empty array is returned ... */
|
||||
if (qtrg && ARRNELEM(qtrg) <= 0)
|
||||
{
|
||||
pfree(qtrg);
|
||||
qtrg = NULL;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
||||
qtrg = NULL; /* keep compiler quiet */
|
||||
break;
|
||||
}
|
||||
|
||||
newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
|
||||
MAXALIGN(sizeof(StrategyNumber)) +
|
||||
MAXALIGN(querysize) +
|
||||
VARSIZE(qtrg));
|
||||
cachedQuery = newcache + MAXALIGN(sizeof(StrategyNumber));
|
||||
qtrgsize = qtrg ? VARSIZE(qtrg) : 0;
|
||||
|
||||
*((StrategyNumber *) newcache) = strategy;
|
||||
memcpy(cachedQuery, query, querysize);
|
||||
memcpy(cachedQuery + MAXALIGN(querysize), qtrg, VARSIZE(qtrg));
|
||||
newcache = (gtrgm_consistent_cache *)
|
||||
MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
|
||||
MAXALIGN(sizeof(gtrgm_consistent_cache)) +
|
||||
MAXALIGN(querysize) +
|
||||
qtrgsize);
|
||||
|
||||
newcache->strategy = strategy;
|
||||
newcache->query = (text *)
|
||||
((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache)));
|
||||
memcpy((char *) newcache->query, (char *) query, querysize);
|
||||
if (qtrg)
|
||||
{
|
||||
newcache->trigrams = (TRGM *)
|
||||
((char *) newcache->query + MAXALIGN(querysize));
|
||||
memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize);
|
||||
/* release qtrg in case it was made in fn_mcxt */
|
||||
pfree(qtrg);
|
||||
}
|
||||
else
|
||||
newcache->trigrams = NULL;
|
||||
newcache->graph = graph;
|
||||
|
||||
if (cache)
|
||||
pfree(cache);
|
||||
fcinfo->flinfo->fn_extra = newcache;
|
||||
fcinfo->flinfo->fn_extra = (void *) newcache;
|
||||
cache = newcache;
|
||||
}
|
||||
|
||||
qtrg = (TRGM *) (cachedQuery + MAXALIGN(querysize));
|
||||
qtrg = cache->trigrams;
|
||||
|
||||
switch (strategy)
|
||||
{
|
||||
@ -317,6 +370,57 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
|
||||
}
|
||||
}
|
||||
break;
|
||||
case RegExpICaseStrategyNumber:
|
||||
#ifndef IGNORECASE
|
||||
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
|
||||
#endif
|
||||
/* FALL THRU */
|
||||
case RegExpStrategyNumber:
|
||||
/* Regexp search is inexact */
|
||||
*recheck = true;
|
||||
|
||||
/* Check regex match as much as we can with available info */
|
||||
if (qtrg)
|
||||
{
|
||||
if (GIST_LEAF(entry))
|
||||
{ /* all leafs contains orig trgm */
|
||||
bool *check;
|
||||
|
||||
check = trgm_presence_map(qtrg, key);
|
||||
res = trigramsMatchGraph(cache->graph, check);
|
||||
pfree(check);
|
||||
}
|
||||
else if (ISALLTRUE(key))
|
||||
{ /* non-leaf contains signature */
|
||||
res = true;
|
||||
}
|
||||
else
|
||||
{ /* non-leaf contains signature */
|
||||
int32 k,
|
||||
tmp = 0,
|
||||
len = ARRNELEM(qtrg);
|
||||
trgm *ptr = GETARR(qtrg);
|
||||
BITVECP sign = GETSIGN(key);
|
||||
|
||||
/* descend only if at least one trigram is present */
|
||||
res = false;
|
||||
for (k = 0; k < len; k++)
|
||||
{
|
||||
CPTRGM(((char *) &tmp), ptr + k);
|
||||
if (GETBIT(sign, HASHVAL(tmp)))
|
||||
{
|
||||
res = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* trigram-free query must be rechecked everywhere */
|
||||
res = true;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
||||
res = false; /* keep compiler quiet */
|
||||
|
@ -616,6 +616,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a palloc'd boolean array showing, for each trigram in "query",
|
||||
* whether it is present in the trigram array "key".
|
||||
* This relies on the "key" array being sorted, but "query" need not be.
|
||||
*/
|
||||
bool *
|
||||
trgm_presence_map(TRGM *query, TRGM *key)
|
||||
{
|
||||
bool *result;
|
||||
trgm *ptrq = GETARR(query),
|
||||
*ptrk = GETARR(key);
|
||||
int lenq = ARRNELEM(query),
|
||||
lenk = ARRNELEM(key),
|
||||
i;
|
||||
|
||||
result = (bool *) palloc0(lenq * sizeof(bool));
|
||||
|
||||
/* for each query trigram, do a binary search in the key array */
|
||||
for (i = 0; i < lenq; i++)
|
||||
{
|
||||
int lo = 0;
|
||||
int hi = lenk;
|
||||
|
||||
while (lo < hi)
|
||||
{
|
||||
int mid = (lo + hi) / 2;
|
||||
int res = CMPTRGM(ptrq, ptrk + mid);
|
||||
|
||||
if (res < 0)
|
||||
hi = mid;
|
||||
else if (res > 0)
|
||||
lo = mid + 1;
|
||||
else
|
||||
{
|
||||
result[i] = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ptrq++;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
Datum
|
||||
similarity(PG_FUNCTION_ARGS)
|
||||
{
|
||||
|
@ -476,10 +476,13 @@ static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
|
||||
*
|
||||
* Returns an array of trigrams required by the regular expression, or NULL if
|
||||
* the regular expression was too complex to analyze. In addition, a packed
|
||||
* graph representation of the regex is returned into *graph.
|
||||
* graph representation of the regex is returned into *graph. The results
|
||||
* must be allocated in rcontext (which might or might not be the current
|
||||
* context).
|
||||
*/
|
||||
TRGM *
|
||||
createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
|
||||
createTrgmNFA(text *text_re, Oid collation,
|
||||
TrgmPackedGraph **graph, MemoryContext rcontext)
|
||||
{
|
||||
TRGM *trg;
|
||||
regex_t regex;
|
||||
@ -488,10 +491,9 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
|
||||
|
||||
/*
|
||||
* This processing generates a great deal of cruft, which we'd like to
|
||||
* clean up before returning (since this function is normally called in a
|
||||
* clean up before returning (since this function may be called in a
|
||||
* query-lifespan memory context). Make a temp context we can work in so
|
||||
* that cleanup is easy. Note that the returned data structures must be
|
||||
* allocated in caller's context, however.
|
||||
* that cleanup is easy.
|
||||
*/
|
||||
tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
|
||||
"createTrgmNFA temporary context",
|
||||
@ -516,7 +518,7 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
|
||||
*/
|
||||
PG_TRY();
|
||||
{
|
||||
trg = createTrgmNFAInternal(&regex, graph, oldcontext);
|
||||
trg = createTrgmNFAInternal(&regex, graph, rcontext);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
|
@ -216,8 +216,8 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Beginning in <productname>PostgreSQL</> 9.3, <filename>pg_trgm</filename>
|
||||
GIN indexes also support index searches for regular-expression matches
|
||||
Beginning in <productname>PostgreSQL</> 9.3, these index types also support
|
||||
index searches for regular-expression matches
|
||||
(<literal>~</> and <literal>~*</> operators), for example
|
||||
<programlisting>
|
||||
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
|
||||
|
Loading…
x
Reference in New Issue
Block a user