mirror of
https://github.com/postgres/postgres.git
synced 2025-08-09 17:03:00 +03:00
Support indexing of regular-expression searches in contrib/pg_trgm.
This works by extracting trigrams from the given regular expression, in much the same spirit as the previously-existing support for LIKE searches, though of course the details are far more complicated. Currently, only GIN indexes are supported; we might be able to make it work with GiST indexes later. The implementation includes adding API functions to backend/regex/ that provide a view of the search NFA created from a regular expression. These functions are meant to be generic enough to be supportable in a standalone version of the regex library, should that ever happen. Alexander Korotkov, reviewed by Heikki Linnakangas and Tom Lane
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
# contrib/pg_trgm/Makefile
|
||||
|
||||
MODULE_big = pg_trgm
|
||||
OBJS = trgm_op.o trgm_gist.o trgm_gin.o
|
||||
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o
|
||||
|
||||
EXTENSION = pg_trgm
|
||||
DATA = pg_trgm--1.0.sql pg_trgm--unpackaged--1.0.sql
|
||||
DATA = pg_trgm--1.1.sql pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
|
||||
|
||||
REGRESS = pg_trgm
|
||||
|
||||
|
@@ -60,7 +60,7 @@ select similarity('---', '####---');
|
||||
(1 row)
|
||||
|
||||
CREATE TABLE test_trgm(t text);
|
||||
\copy test_trgm from 'data/trgm.data
|
||||
\copy test_trgm from 'data/trgm.data'
|
||||
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
||||
t | sml
|
||||
-------------+----------
|
||||
@@ -3470,6 +3470,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
|
||||
create table test2(t text);
|
||||
insert into test2 values ('abcdef');
|
||||
insert into test2 values ('quark');
|
||||
insert into test2 values (' z foo bar');
|
||||
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
|
||||
set enable_seqscan=off;
|
||||
explain (costs off)
|
||||
@@ -3521,6 +3522,142 @@ select * from test2 where t ilike 'qua%';
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t like '%z foo bar%';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t like ' z foo%';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
explain (costs off)
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
QUERY PLAN
|
||||
--------------------------------------------
|
||||
Bitmap Heap Scan on test2
|
||||
Recheck Cond: (t ~ '[abc]{3}'::text)
|
||||
-> Bitmap Index Scan on test2_idx_gin
|
||||
Index Cond: (t ~ '[abc]{3}'::text)
|
||||
(4 rows)
|
||||
|
||||
explain (costs off)
|
||||
select * from test2 where t ~* 'DEF';
|
||||
QUERY PLAN
|
||||
------------------------------------------
|
||||
Bitmap Heap Scan on test2
|
||||
Recheck Cond: (t ~* 'DEF'::text)
|
||||
-> Bitmap Index Scan on test2_idx_gin
|
||||
Index Cond: (t ~* 'DEF'::text)
|
||||
(4 rows)
|
||||
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'a[bc]+d';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ '(abc)*$';
|
||||
t
|
||||
-------------
|
||||
abcdef
|
||||
quark
|
||||
z foo bar
|
||||
(3 rows)
|
||||
|
||||
select * from test2 where t ~* 'DEF';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'dEf';
|
||||
t
|
||||
---
|
||||
(0 rows)
|
||||
|
||||
select * from test2 where t ~* '^q';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~* 'ab[a-z]{3}';
|
||||
t
|
||||
--------
|
||||
abcdef
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~* '(^| )qua';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'q.*rk$';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ 'q';
|
||||
t
|
||||
-------
|
||||
quark
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ '[a-z]{3}';
|
||||
t
|
||||
-------------
|
||||
abcdef
|
||||
quark
|
||||
z foo bar
|
||||
(3 rows)
|
||||
|
||||
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
||||
t
|
||||
---
|
||||
(0 rows)
|
||||
|
||||
select * from test2 where t ~ 'z foo bar';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ ' z foo bar';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ ' z foo bar';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
select * from test2 where t ~ ' z foo';
|
||||
t
|
||||
-------------
|
||||
z foo bar
|
||||
(1 row)
|
||||
|
||||
drop index test2_idx_gin;
|
||||
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
|
||||
set enable_seqscan=off;
|
||||
|
8
contrib/pg_trgm/pg_trgm--1.0--1.1.sql
Normal file
8
contrib/pg_trgm/pg_trgm--1.0--1.1.sql
Normal file
@@ -0,0 +1,8 @@
|
||||
/* contrib/pg_trgm/pg_trgm--1.0--1.1.sql */
|
||||
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit
|
||||
|
||||
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
|
||||
OPERATOR 5 pg_catalog.~ (text, text),
|
||||
OPERATOR 6 pg_catalog.~* (text, text);
|
@@ -1,4 +1,4 @@
|
||||
/* contrib/pg_trgm/pg_trgm--1.0.sql */
|
||||
/* contrib/pg_trgm/pg_trgm--1.1.sql */
|
||||
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION pg_trgm" to load this file. \quit
|
||||
@@ -164,3 +164,9 @@ AS
|
||||
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
|
||||
OPERATOR 3 pg_catalog.~~ (text, text),
|
||||
OPERATOR 4 pg_catalog.~~* (text, text);
|
||||
|
||||
-- Add operators that are new in 9.3.
|
||||
|
||||
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
|
||||
OPERATOR 5 pg_catalog.~ (text, text),
|
||||
OPERATOR 6 pg_catalog.~* (text, text);
|
@@ -1,5 +1,5 @@
|
||||
# pg_trgm extension
|
||||
comment = 'text similarity measurement and index searching based on trigrams'
|
||||
default_version = '1.0'
|
||||
default_version = '1.1'
|
||||
module_pathname = '$libdir/pg_trgm'
|
||||
relocatable = true
|
||||
|
@@ -15,7 +15,7 @@ select similarity('---', '####---');
|
||||
|
||||
CREATE TABLE test_trgm(t text);
|
||||
|
||||
\copy test_trgm from 'data/trgm.data
|
||||
\copy test_trgm from 'data/trgm.data'
|
||||
|
||||
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
||||
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
||||
@@ -43,6 +43,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
|
||||
create table test2(t text);
|
||||
insert into test2 values ('abcdef');
|
||||
insert into test2 values ('quark');
|
||||
insert into test2 values (' z foo bar');
|
||||
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
|
||||
set enable_seqscan=off;
|
||||
explain (costs off)
|
||||
@@ -54,6 +55,29 @@ select * from test2 where t like '%bcd%';
|
||||
select * from test2 where t like E'%\\bcd%';
|
||||
select * from test2 where t ilike '%BCD%';
|
||||
select * from test2 where t ilike 'qua%';
|
||||
select * from test2 where t like '%z foo bar%';
|
||||
select * from test2 where t like ' z foo%';
|
||||
explain (costs off)
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
explain (costs off)
|
||||
select * from test2 where t ~* 'DEF';
|
||||
select * from test2 where t ~ '[abc]{3}';
|
||||
select * from test2 where t ~ 'a[bc]+d';
|
||||
select * from test2 where t ~ '(abc)*$';
|
||||
select * from test2 where t ~* 'DEF';
|
||||
select * from test2 where t ~ 'dEf';
|
||||
select * from test2 where t ~* '^q';
|
||||
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
||||
select * from test2 where t ~* 'ab[a-z]{3}';
|
||||
select * from test2 where t ~* '(^| )qua';
|
||||
select * from test2 where t ~ 'q.*rk$';
|
||||
select * from test2 where t ~ 'q';
|
||||
select * from test2 where t ~ '[a-z]{3}';
|
||||
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
||||
select * from test2 where t ~ 'z foo bar';
|
||||
select * from test2 where t ~ ' z foo bar';
|
||||
select * from test2 where t ~ ' z foo bar';
|
||||
select * from test2 where t ~ ' z foo';
|
||||
drop index test2_idx_gin;
|
||||
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
|
||||
set enable_seqscan=off;
|
||||
|
@@ -7,18 +7,20 @@
|
||||
#include "access/gist.h"
|
||||
#include "access/itup.h"
|
||||
#include "storage/bufpage.h"
|
||||
#include "utils/builtins.h"
|
||||
|
||||
/* options */
|
||||
/*
|
||||
* Options ... but note that trgm_regexp.c effectively assumes these values
|
||||
* of LPADDING and RPADDING.
|
||||
*/
|
||||
#define LPADDING 2
|
||||
#define RPADDING 1
|
||||
#define KEEPONLYALNUM
|
||||
/*
|
||||
* Caution: IGNORECASE macro means that trigrams are case-insensitive.
|
||||
* If this macro is disabled, the ~~* operator must be removed from the
|
||||
* operator classes, because we can't handle case-insensitive wildcard search
|
||||
* with case-sensitive trigrams. Failure to do this will result in "cannot
|
||||
* handle ~~* with case-sensitive trigrams" errors.
|
||||
* If this macro is disabled, the ~* and ~~* operators must be removed from
|
||||
* the operator classes, because we can't handle case-insensitive regex or
|
||||
* wildcard search with case-sensitive trigrams. Failure to do this will
|
||||
* result in "cannot handle ~* (or ~~*) with case-sensitive trigrams" errors.
|
||||
*/
|
||||
#define IGNORECASE
|
||||
#define DIVUNION
|
||||
@@ -28,6 +30,8 @@
|
||||
#define DistanceStrategyNumber 2
|
||||
#define LikeStrategyNumber 3
|
||||
#define ILikeStrategyNumber 4
|
||||
#define RegExpStrategyNumber 5
|
||||
#define RegExpICaseStrategyNumber 6
|
||||
|
||||
|
||||
typedef char trgm[3];
|
||||
@@ -42,11 +46,11 @@ typedef char trgm[3];
|
||||
*(((char*)(a))+2) = *(((char*)(b))+2); \
|
||||
} while(0);
|
||||
|
||||
uint32 trgm2int(trgm *ptr);
|
||||
|
||||
#ifdef KEEPONLYALNUM
|
||||
#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c))
|
||||
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
|
||||
#else
|
||||
#define ISWORDCHR(c) (!t_isspace(c))
|
||||
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
|
||||
#endif
|
||||
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
|
||||
@@ -99,11 +103,18 @@ typedef char *BITVECP;
|
||||
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
|
||||
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
|
||||
|
||||
typedef struct TrgmPackedGraph TrgmPackedGraph;
|
||||
|
||||
extern float4 trgm_limit;
|
||||
|
||||
TRGM *generate_trgm(char *str, int slen);
|
||||
TRGM *generate_wildcard_trgm(const char *str, int slen);
|
||||
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
|
||||
bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
|
||||
extern uint32 trgm2int(trgm *ptr);
|
||||
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
|
||||
extern TRGM *generate_trgm(char *str, int slen);
|
||||
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
|
||||
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
|
||||
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
|
||||
extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph,
|
||||
Oid collation);
|
||||
extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check);
|
||||
|
||||
#endif /* __TRGM_H__ */
|
||||
|
@@ -80,13 +80,15 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
|
||||
StrategyNumber strategy = PG_GETARG_UINT16(2);
|
||||
|
||||
/* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
|
||||
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
|
||||
Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4);
|
||||
|
||||
/* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
|
||||
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
|
||||
Datum *entries = NULL;
|
||||
TRGM *trg;
|
||||
int32 trglen;
|
||||
trgm *ptr;
|
||||
TrgmPackedGraph *graph;
|
||||
int32 i;
|
||||
|
||||
switch (strategy)
|
||||
@@ -107,6 +109,33 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
|
||||
*/
|
||||
trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
|
||||
break;
|
||||
case RegExpICaseStrategyNumber:
|
||||
#ifndef IGNORECASE
|
||||
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
|
||||
#endif
|
||||
/* FALL THRU */
|
||||
case RegExpStrategyNumber:
|
||||
trg = createTrgmNFA(val, &graph, PG_GET_COLLATION());
|
||||
if (trg && ARRNELEM(trg) > 0)
|
||||
{
|
||||
/*
|
||||
* Successful regex processing: store NFA-like graph as
|
||||
* extra_data. GIN API requires an array of nentries
|
||||
* Pointers, but we just put the same value in each element.
|
||||
*/
|
||||
trglen = ARRNELEM(trg);
|
||||
*extra_data = (Pointer *) palloc(sizeof(Pointer) * trglen);
|
||||
for (i = 0; i < trglen; i++)
|
||||
(*extra_data)[i] = (Pointer) graph;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* No result: have to do full index scan. */
|
||||
*nentries = 0;
|
||||
*searchMode = GIN_SEARCH_MODE_ALL;
|
||||
PG_RETURN_POINTER(entries);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
||||
trg = NULL; /* keep compiler quiet */
|
||||
@@ -146,8 +175,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
|
||||
|
||||
/* text *query = PG_GETARG_TEXT_P(2); */
|
||||
int32 nkeys = PG_GETARG_INT32(3);
|
||||
|
||||
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
|
||||
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
|
||||
bool *recheck = (bool *) PG_GETARG_POINTER(5);
|
||||
bool res;
|
||||
int32 i,
|
||||
@@ -189,6 +217,21 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
|
||||
}
|
||||
}
|
||||
break;
|
||||
case RegExpICaseStrategyNumber:
|
||||
#ifndef IGNORECASE
|
||||
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
|
||||
#endif
|
||||
/* FALL THRU */
|
||||
case RegExpStrategyNumber:
|
||||
if (nkeys < 1)
|
||||
{
|
||||
/* Regex processing gave no result: do full index scan */
|
||||
res = true;
|
||||
}
|
||||
else
|
||||
res = trigramsMatchGraph((TrgmPackedGraph *) extra_data[0],
|
||||
check);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
||||
res = false; /* keep compiler quiet */
|
||||
|
@@ -77,12 +77,6 @@ unique_array(trgm *a, int len)
|
||||
return curend + 1 - a;
|
||||
}
|
||||
|
||||
#ifdef KEEPONLYALNUM
|
||||
#define iswordchr(c) (t_isalpha(c) || t_isdigit(c))
|
||||
#else
|
||||
#define iswordchr(c) (!t_isspace(c))
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Finds first word in string, returns pointer to the word,
|
||||
* endword points to the character after word
|
||||
@@ -92,7 +86,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
|
||||
{
|
||||
char *beginword = str;
|
||||
|
||||
while (beginword - str < lenstr && !iswordchr(beginword))
|
||||
while (beginword - str < lenstr && !ISWORDCHR(beginword))
|
||||
beginword += pg_mblen(beginword);
|
||||
|
||||
if (beginword - str >= lenstr)
|
||||
@@ -100,7 +94,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
|
||||
|
||||
*endword = beginword;
|
||||
*charlen = 0;
|
||||
while (*endword - str < lenstr && iswordchr(*endword))
|
||||
while (*endword - str < lenstr && ISWORDCHR(*endword))
|
||||
{
|
||||
*endword += pg_mblen(*endword);
|
||||
(*charlen)++;
|
||||
@@ -114,7 +108,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
|
||||
* which is always exactly three bytes. If we have three single-byte
|
||||
* characters, we just use them as-is; otherwise we form a hash value.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
compact_trigram(trgm *tptr, char *str, int bytelen)
|
||||
{
|
||||
if (bytelen == 3)
|
||||
@@ -290,7 +284,7 @@ get_wildcard_part(const char *str, int lenstr,
|
||||
{
|
||||
if (in_escape)
|
||||
{
|
||||
if (iswordchr(beginword))
|
||||
if (ISWORDCHR(beginword))
|
||||
break;
|
||||
in_escape = false;
|
||||
in_leading_wildcard_meta = false;
|
||||
@@ -301,7 +295,7 @@ get_wildcard_part(const char *str, int lenstr,
|
||||
in_escape = true;
|
||||
else if (ISWILDCARDCHAR(beginword))
|
||||
in_leading_wildcard_meta = true;
|
||||
else if (iswordchr(beginword))
|
||||
else if (ISWORDCHR(beginword))
|
||||
break;
|
||||
else
|
||||
in_leading_wildcard_meta = false;
|
||||
@@ -344,7 +338,7 @@ get_wildcard_part(const char *str, int lenstr,
|
||||
clen = pg_mblen(endword);
|
||||
if (in_escape)
|
||||
{
|
||||
if (iswordchr(endword))
|
||||
if (ISWORDCHR(endword))
|
||||
{
|
||||
memcpy(s, endword, clen);
|
||||
(*charlen)++;
|
||||
@@ -372,7 +366,7 @@ get_wildcard_part(const char *str, int lenstr,
|
||||
in_trailing_wildcard_meta = true;
|
||||
break;
|
||||
}
|
||||
else if (iswordchr(endword))
|
||||
else if (ISWORDCHR(endword))
|
||||
{
|
||||
memcpy(s, endword, clen);
|
||||
(*charlen)++;
|
||||
|
2197
contrib/pg_trgm/trgm_regexp.c
Normal file
2197
contrib/pg_trgm/trgm_regexp.c
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user