1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-14 18:42:34 +03:00

Replace regular expression package with Henry Spencer's latest version

(extracted from Tcl 8.4.1 release, as Henry still hasn't got round to
making it a separate library).  This solves a performance problem for
multibyte, as well as upgrading our regexp support to match recent Tcl
and nearly match recent Perl.
This commit is contained in:
Tom Lane
2003-02-05 17:41:33 +00:00
parent 32c3db0f86
commit 7bcc6d98fb
29 changed files with 10704 additions and 5215 deletions

View File

@ -1,14 +1,14 @@
/*-------------------------------------------------------------------------
*
* regexp.c
* regular expression handling code.
* Postgres' interface to the regular expression package.
*
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.44 2003/02/05 17:41:32 tgl Exp $
*
* Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there's a good chance
@ -30,171 +30,189 @@
#include "postgres.h"
#include "regex/regex.h"
#include "mb/pg_wchar.h"
#include "utils/builtins.h"
#if defined(DISABLE_XOPEN_NLS)
#undef _XOPEN_SOURCE
#endif /* DISABLE_XOPEN_NLS */
/* this is the number of cached regular expressions held. */
/*
* We cache precompiled regular expressions using a "self organizing list"
* structure, in which recently-used items tend to be near the front.
* Whenever we use an entry, it's moved up to the front of the list.
* Over time, an item's average position corresponds to its frequency of use.
*
* When we first create an entry, it's inserted at the front of
* the array, dropping the entry at the end of the array if necessary to
* make room. (This might seem to be weighting the new entry too heavily,
* but if we insert new entries further back, we'll be unable to adjust to
* a sudden shift in the query mix where we are presented with MAX_CACHED_RES
* never-before-seen items used circularly. We ought to be able to handle
* that case, so we have to insert at the front.)
*
* Knuth mentions a variant strategy in which a used item is moved up just
* one place in the list. Although he says this uses fewer comparisons on
* average, it seems not to adapt very well to the situation where you have
* both some reusable patterns and a steady stream of non-reusable patterns.
* A reusable pattern that isn't used at least as often as non-reusable
* patterns are seen will "fail to keep up" and will drop off the end of the
* cache. With move-to-front, a reusable pattern is guaranteed to stay in
* the cache as long as it's used at least once in every MAX_CACHED_RES uses.
*/
/* this is the maximum number of cached regular expressions */
#ifndef MAX_CACHED_RES
#define MAX_CACHED_RES 32
#endif
/* this structure describes a cached regular expression */
struct cached_re_str
/* this structure describes one cached regular expression */
typedef struct cached_re_str
{
char *cre_s; /* pattern as null-terminated string */
int cre_type; /* compiled-type: extended,icase etc */
text *cre_pat; /* original RE (untoasted TEXT form) */
int cre_flags; /* compile flags: extended,icase etc */
regex_t cre_re; /* the compiled regular expression */
unsigned long cre_lru; /* lru tag */
};
} cached_re_str;
static int rec = 0; /* # of cached re's */
static struct cached_re_str rev[MAX_CACHED_RES]; /* cached re's */
static unsigned long lru; /* system lru tag */
static int pg_lastrec = 0;
/* attempt to compile `re' as an re, then match it against text */
/* cflags - flag to regcomp indicates case sensitivity */
static bool
RE_compile_and_execute(text *text_re, char *text, int cflags,
int nmatch, regmatch_t *pmatch)
{
char *re;
int oldest;
int i;
int regcomp_result;
/* Convert 'text' pattern to null-terminated string */
re = DatumGetCString(DirectFunctionCall1(textout,
PointerGetDatum(text_re)));
/*
* Find a previously compiled regular expression. Run the cache as a
* ring buffer, starting the search from the previous match if any.
*/
i = pg_lastrec;
while (i < rec)
{
if (rev[i].cre_s != NULL)
{
if (strcmp(rev[i].cre_s, re) == 0 &&
rev[i].cre_type == cflags)
{
pg_lastrec = i;
rev[i].cre_lru = ++lru;
pfree(re);
return (pg_regexec(&rev[i].cre_re,
text, nmatch,
pmatch, 0) == 0);
}
}
i++;
/*
* If we were not at the first slot to start, then think about
* wrapping if necessary.
*/
if (pg_lastrec != 0)
{
if (i >= rec)
i = 0;
else if (i == pg_lastrec)
break;
}
}
/* we didn't find it - make room in the cache for it */
if (rec >= MAX_CACHED_RES)
{
/* cache is full - find the oldest entry */
for (oldest = 0, i = 1; i < rec; i++)
{
if (rev[i].cre_lru < rev[oldest].cre_lru)
oldest = i;
}
}
else
oldest = rec++;
/* if there was an old re, then de-allocate the space it used */
if (rev[oldest].cre_s != (char *) NULL)
{
for (lru = i = 0; i < rec; i++)
{
/* downweight all of the other cached entries */
rev[i].cre_lru = (rev[i].cre_lru - rev[oldest].cre_lru) / 2;
if (rev[i].cre_lru > lru)
lru = rev[i].cre_lru;
}
pg_regfree(&rev[oldest].cre_re);
/*
* use malloc/free for the cre_s field because the storage has to
* persist across transactions
*/
free(rev[oldest].cre_s);
rev[oldest].cre_s = (char *) NULL;
}
/* compile the re */
regcomp_result = pg_regcomp(&rev[oldest].cre_re, re, cflags);
if (regcomp_result == 0)
{
pg_lastrec = oldest;
/*
* use malloc/free for the cre_s field because the storage has to
* persist across transactions
*/
rev[oldest].cre_s = strdup(re);
rev[oldest].cre_lru = ++lru;
rev[oldest].cre_type = cflags;
pfree(re);
/* agc - fixed an old typo here */
return (pg_regexec(&rev[oldest].cre_re, text,
nmatch, pmatch, 0) == 0);
}
else
{
char errMsg[1000];
/* re didn't compile */
pg_regerror(regcomp_result, &rev[oldest].cre_re, errMsg,
sizeof(errMsg));
elog(ERROR, "Invalid regular expression: %s", errMsg);
}
/* not reached */
return false;
}
static int num_res = 0; /* # of cached re's */
static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
/*
fixedlen_regexeq:
a generic fixed length regexp routine
s - the string to match against (not necessarily null-terminated)
p - the pattern (as a text*)
charlen - the length of the string
*/
* RE_compile_and_execute - compile and execute a RE, caching if possible
*
* Returns TRUE on match, FALSE on no match
*
* text_re --- the pattern, expressed as an *untoasted* TEXT object
* dat --- the data to match against (need not be null-terminated)
* dat_len --- the length of the data string
* cflags --- compile options for the pattern
* nmatch, pmatch --- optional return area for match details
*
* Both pattern and data are given in the database encoding. We internally
* convert to array of pg_wchar which is what Spencer's regex package wants.
*/
static bool
fixedlen_regexeq(char *s, text *p, int charlen, int cflags)
RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
int cflags, int nmatch, regmatch_t *pmatch)
{
char *sterm;
bool result;
int text_re_len = VARSIZE(text_re);
pg_wchar *data;
size_t data_len;
pg_wchar *pattern;
size_t pattern_len;
int i;
int regcomp_result;
int regexec_result;
cached_re_str re_temp;
/* be sure sterm is null-terminated */
sterm = (char *) palloc(charlen + 1);
memcpy(sterm, s, charlen);
sterm[charlen] = '\0';
/* Convert data string to wide characters */
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(dat, data, dat_len);
result = RE_compile_and_execute(p, sterm, cflags, 0, NULL);
/*
* Look for a match among previously compiled REs. Since the data
* structure is self-organizing with most-used entries at the front,
* our search strategy can just be to scan from the front.
*/
for (i = 0; i < num_res; i++)
{
if (memcmp(re_array[i].cre_pat, text_re, text_re_len) == 0 &&
re_array[i].cre_flags == cflags)
{
/*
* Found a match; move it to front if not there already.
*/
if (i > 0)
{
re_temp = re_array[i];
memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
re_array[0] = re_temp;
}
pfree(sterm);
/* Perform RE match and return result */
regexec_result = pg_regexec(&re_array[0].cre_re,
data,
data_len,
NULL, /* no details */
nmatch,
pmatch,
0);
return result;
pfree(data);
return (regexec_result == 0);
}
}
/*
* Couldn't find it, so try to compile the new RE. To avoid leaking
* resources on failure, we build into the re_temp local.
*/
/* Convert pattern string to wide characters */
pattern = (pg_wchar *) palloc((text_re_len - VARHDRSZ + 1) * sizeof(pg_wchar));
pattern_len = pg_mb2wchar_with_len((unsigned char *) VARDATA(text_re),
pattern,
text_re_len - VARHDRSZ);
regcomp_result = pg_regcomp(&re_temp.cre_re,
pattern,
pattern_len,
cflags);
pfree(pattern);
if (regcomp_result != 0)
{
/* re didn't compile */
char errMsg[100];
pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
/* XXX should we pg_regfree here? */
elog(ERROR, "Invalid regular expression: %s", errMsg);
}
/*
* use malloc/free for the cre_pat field because the storage has to
* persist across transactions
*/
re_temp.cre_pat = malloc(text_re_len);
if (re_temp.cre_pat == NULL)
{
pg_regfree(&re_temp.cre_re);
elog(ERROR, "Out of memory");
}
memcpy(re_temp.cre_pat, text_re, text_re_len);
re_temp.cre_flags = cflags;
/*
* Okay, we have a valid new item in re_temp; insert it into the
* storage array. Discard last entry if needed.
*/
if (num_res >= MAX_CACHED_RES)
{
--num_res;
Assert(num_res < MAX_CACHED_RES);
pg_regfree(&re_array[num_res].cre_re);
free(re_array[num_res].cre_pat);
}
if (num_res > 0)
memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
re_array[0] = re_temp;
num_res++;
/* Perform RE match and return result */
regexec_result = pg_regexec(&re_array[0].cre_re,
data,
data_len,
NULL, /* no details */
nmatch,
pmatch,
0);
pfree(data);
return (regexec_result == 0);
}
@ -208,10 +226,11 @@ nameregexeq(PG_FUNCTION_ARGS)
Name n = PG_GETARG_NAME(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(fixedlen_regexeq(NameStr(*n),
p,
strlen(NameStr(*n)),
REG_EXTENDED));
PG_RETURN_BOOL(RE_compile_and_execute(p,
(unsigned char *) NameStr(*n),
strlen(NameStr(*n)),
REG_ADVANCED,
0, NULL));
}
Datum
@ -220,10 +239,11 @@ nameregexne(PG_FUNCTION_ARGS)
Name n = PG_GETARG_NAME(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!fixedlen_regexeq(NameStr(*n),
p,
strlen(NameStr(*n)),
REG_EXTENDED));
PG_RETURN_BOOL(!RE_compile_and_execute(p,
(unsigned char *) NameStr(*n),
strlen(NameStr(*n)),
REG_ADVANCED,
0, NULL));
}
Datum
@ -232,10 +252,11 @@ textregexeq(PG_FUNCTION_ARGS)
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(fixedlen_regexeq(VARDATA(s),
p,
VARSIZE(s) - VARHDRSZ,
REG_EXTENDED));
PG_RETURN_BOOL(RE_compile_and_execute(p,
(unsigned char *) VARDATA(s),
VARSIZE(s) - VARHDRSZ,
REG_ADVANCED,
0, NULL));
}
Datum
@ -244,10 +265,11 @@ textregexne(PG_FUNCTION_ARGS)
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!fixedlen_regexeq(VARDATA(s),
p,
VARSIZE(s) - VARHDRSZ,
REG_EXTENDED));
PG_RETURN_BOOL(!RE_compile_and_execute(p,
(unsigned char *) VARDATA(s),
VARSIZE(s) - VARHDRSZ,
REG_ADVANCED,
0, NULL));
}
@ -257,40 +279,17 @@ textregexne(PG_FUNCTION_ARGS)
*/
Datum
texticregexeq(PG_FUNCTION_ARGS)
{
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(fixedlen_regexeq(VARDATA(s),
p,
VARSIZE(s) - VARHDRSZ,
REG_ICASE | REG_EXTENDED));
}
Datum
texticregexne(PG_FUNCTION_ARGS)
{
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!fixedlen_regexeq(VARDATA(s),
p,
VARSIZE(s) - VARHDRSZ,
REG_ICASE | REG_EXTENDED));
}
Datum
nameicregexeq(PG_FUNCTION_ARGS)
{
Name n = PG_GETARG_NAME(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(fixedlen_regexeq(NameStr(*n),
p,
strlen(NameStr(*n)),
REG_ICASE | REG_EXTENDED));
PG_RETURN_BOOL(RE_compile_and_execute(p,
(unsigned char *) NameStr(*n),
strlen(NameStr(*n)),
REG_ICASE | REG_ADVANCED,
0, NULL));
}
Datum
@ -299,41 +298,63 @@ nameicregexne(PG_FUNCTION_ARGS)
Name n = PG_GETARG_NAME(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!fixedlen_regexeq(NameStr(*n),
p,
strlen(NameStr(*n)),
REG_ICASE | REG_EXTENDED));
PG_RETURN_BOOL(!RE_compile_and_execute(p,
(unsigned char *) NameStr(*n),
strlen(NameStr(*n)),
REG_ICASE | REG_ADVANCED,
0, NULL));
}
Datum
texticregexeq(PG_FUNCTION_ARGS)
{
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(RE_compile_and_execute(p,
(unsigned char *) VARDATA(s),
VARSIZE(s) - VARHDRSZ,
REG_ICASE | REG_ADVANCED,
0, NULL));
}
Datum
texticregexne(PG_FUNCTION_ARGS)
{
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!RE_compile_and_execute(p,
(unsigned char *) VARDATA(s),
VARSIZE(s) - VARHDRSZ,
REG_ICASE | REG_ADVANCED,
0, NULL));
}
/* textregexsubstr()
* Return a substring matched by a regular expression.
/*
* textregexsubstr()
* Return a substring matched by a regular expression.
*/
Datum
textregexsubstr(PG_FUNCTION_ARGS)
{
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
char *sterm;
int len;
bool match;
regmatch_t pmatch[2];
/* be sure sterm is null-terminated */
len = VARSIZE(s) - VARHDRSZ;
sterm = (char *) palloc(len + 1);
memcpy(sterm, VARDATA(s), len);
sterm[len] = '\0';
/*
* We pass two regmatch_t structs to get info about the overall match
* and the match for the first parenthesized subexpression (if any).
* If there is a parenthesized subexpression, we return what it matched;
* else return what the whole regexp matched.
*/
match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch);
pfree(sterm);
match = RE_compile_and_execute(p,
(unsigned char *) VARDATA(s),
VARSIZE(s) - VARHDRSZ,
REG_ADVANCED,
2, pmatch);
/* match? then return the substring matching the pattern */
if (match)