mirror of
https://github.com/postgres/postgres.git
synced 2025-08-27 07:42:10 +03:00
Back-patch fix for extraction of fixed prefixes from regular expressions.
Back-patch of commits628cbb50ba
andc6aae3042b
. This has been broken since 7.3, so back-patch to all supported branches.
This commit is contained in:
@@ -12,7 +12,7 @@ subdir = src/backend/regex
|
||||
top_builddir = ../../..
|
||||
include $(top_builddir)/src/Makefile.global
|
||||
|
||||
OBJS = regcomp.o regerror.o regexec.o regfree.o
|
||||
OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
|
||||
|
||||
include $(top_srcdir)/src/backend/common.mk
|
||||
|
||||
|
@@ -66,8 +66,9 @@ initcm(struct vars * v,
|
||||
cd = cm->cd; /* cm->cd[WHITE] */
|
||||
cd->sub = NOSUB;
|
||||
cd->arcs = NULL;
|
||||
cd->flags = 0;
|
||||
cd->firstchr = CHR_MIN;
|
||||
cd->nchrs = CHR_MAX - CHR_MIN + 1;
|
||||
cd->flags = 0;
|
||||
|
||||
/* upper levels of tree */
|
||||
for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--)
|
||||
@@ -272,6 +273,7 @@ newcolor(struct colormap * cm)
|
||||
cd->nchrs = 0;
|
||||
cd->sub = NOSUB;
|
||||
cd->arcs = NULL;
|
||||
cd->firstchr = CHR_MIN; /* in case never set otherwise */
|
||||
cd->flags = 0;
|
||||
cd->block = NULL;
|
||||
|
||||
@@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c)
|
||||
if (co == sco) /* already in an open subcolor */
|
||||
return co; /* rest is redundant */
|
||||
cm->cd[co].nchrs--;
|
||||
if (cm->cd[sco].nchrs == 0)
|
||||
cm->cd[sco].firstchr = c;
|
||||
cm->cd[sco].nchrs++;
|
||||
setcolor(cm, c, sco);
|
||||
return sco;
|
||||
@@ -438,6 +442,11 @@ subrange(struct vars * v,
|
||||
|
||||
/*
|
||||
* subblock - allocate new subcolors for one tree block of chrs, fill in arcs
|
||||
*
|
||||
* Note: subcolors that are created during execution of this function
|
||||
* will not be given a useful value of firstchr; it'll be left as CHR_MIN.
|
||||
* For the current usage of firstchr in pg_regprefix, this does not matter
|
||||
* because such subcolors won't occur in the common prefix of a regex.
|
||||
*/
|
||||
static void
|
||||
subblock(struct vars * v,
|
||||
|
@@ -1330,14 +1330,16 @@ compact(struct nfa * nfa,
|
||||
for (s = nfa->states; s != NULL; s = s->next)
|
||||
{
|
||||
nstates++;
|
||||
narcs += 1 + s->nouts + 1;
|
||||
/* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
|
||||
narcs += s->nouts + 1; /* need one extra for endmarker */
|
||||
}
|
||||
|
||||
cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
|
||||
cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
|
||||
cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
|
||||
if (cnfa->states == NULL || cnfa->arcs == NULL)
|
||||
if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
|
||||
{
|
||||
if (cnfa->stflags != NULL)
|
||||
FREE(cnfa->stflags);
|
||||
if (cnfa->states != NULL)
|
||||
FREE(cnfa->states);
|
||||
if (cnfa->arcs != NULL)
|
||||
@@ -1359,9 +1361,8 @@ compact(struct nfa * nfa,
|
||||
for (s = nfa->states; s != NULL; s = s->next)
|
||||
{
|
||||
assert((size_t) s->no < nstates);
|
||||
cnfa->stflags[s->no] = 0;
|
||||
cnfa->states[s->no] = ca;
|
||||
ca->co = 0; /* clear and skip flags "arc" */
|
||||
ca++;
|
||||
first = ca;
|
||||
for (a = s->outs; a != NULL; a = a->outchain)
|
||||
switch (a->type)
|
||||
@@ -1392,8 +1393,8 @@ compact(struct nfa * nfa,
|
||||
|
||||
/* mark no-progress states */
|
||||
for (a = nfa->pre->outs; a != NULL; a = a->outchain)
|
||||
cnfa->states[a->to->no]->co = 1;
|
||||
cnfa->states[nfa->pre->no]->co = 1;
|
||||
cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
|
||||
cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa)
|
||||
{
|
||||
assert(cnfa->nstates != 0); /* not empty already */
|
||||
cnfa->nstates = 0;
|
||||
FREE(cnfa->stflags);
|
||||
FREE(cnfa->states);
|
||||
FREE(cnfa->arcs);
|
||||
}
|
||||
@@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa,
|
||||
fprintf(f, ", haslacons");
|
||||
fprintf(f, "\n");
|
||||
for (st = 0; st < cnfa->nstates; st++)
|
||||
dumpcstate(st, cnfa->states[st], cnfa, f);
|
||||
dumpcstate(st, cnfa, f);
|
||||
fflush(f);
|
||||
}
|
||||
#endif
|
||||
@@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa,
|
||||
*/
|
||||
static void
|
||||
dumpcstate(int st,
|
||||
struct carc * ca,
|
||||
struct cnfa * cnfa,
|
||||
FILE *f)
|
||||
{
|
||||
int i;
|
||||
struct carc * ca;
|
||||
int pos;
|
||||
|
||||
fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
|
||||
fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
|
||||
pos = 1;
|
||||
for (i = 1; ca[i].co != COLORLESS; i++)
|
||||
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
||||
{
|
||||
if (ca[i].co < cnfa->ncolors)
|
||||
fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
|
||||
if (ca->co < cnfa->ncolors)
|
||||
fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
|
||||
else
|
||||
fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
|
||||
ca[i].to);
|
||||
fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
|
||||
if (pos == 5)
|
||||
{
|
||||
fprintf(f, "\n");
|
||||
@@ -1653,7 +1653,7 @@ dumpcstate(int st,
|
||||
else
|
||||
pos++;
|
||||
}
|
||||
if (i == 1 || pos != 1)
|
||||
if (ca == cnfa->states[st] || pos != 1)
|
||||
fprintf(f, "\n");
|
||||
fflush(f);
|
||||
}
|
||||
|
@@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *);
|
||||
static int dumprarcs(struct arc *, struct state *, FILE *, int);
|
||||
static void dumparc(struct arc *, struct state *, FILE *);
|
||||
static void dumpcnfa(struct cnfa *, FILE *);
|
||||
static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
|
||||
static void dumpcstate(int, struct cnfa *, FILE *);
|
||||
#endif
|
||||
/* === regc_cvec.c === */
|
||||
static struct cvec *newcvec(int, int);
|
||||
|
@@ -458,14 +458,14 @@ miss(struct vars * v, /* used only for debug flags */
|
||||
gotstate = 0;
|
||||
for (i = 0; i < d->nstates; i++)
|
||||
if (ISBSET(css->states, i))
|
||||
for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
|
||||
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
|
||||
if (ca->co == co)
|
||||
{
|
||||
BSET(d->work, ca->to);
|
||||
gotstate = 1;
|
||||
if (ca->to == cnfa->post)
|
||||
ispost = 1;
|
||||
if (!cnfa->states[ca->to]->co)
|
||||
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
|
||||
noprogress = 0;
|
||||
FDEBUG(("%d -> %d\n", i, ca->to));
|
||||
}
|
||||
@@ -476,10 +476,9 @@ miss(struct vars * v, /* used only for debug flags */
|
||||
dolacons = 0;
|
||||
for (i = 0; i < d->nstates; i++)
|
||||
if (ISBSET(d->work, i))
|
||||
for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
|
||||
ca++)
|
||||
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
|
||||
{
|
||||
if (ca->co <= cnfa->ncolors)
|
||||
if (ca->co < cnfa->ncolors)
|
||||
continue; /* NOTE CONTINUE */
|
||||
sawlacons = 1;
|
||||
if (ISBSET(d->work, ca->to))
|
||||
@@ -490,7 +489,7 @@ miss(struct vars * v, /* used only for debug flags */
|
||||
dolacons = 1;
|
||||
if (ca->to == cnfa->post)
|
||||
ispost = 1;
|
||||
if (!cnfa->states[ca->to]->co)
|
||||
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
|
||||
noprogress = 0;
|
||||
FDEBUG(("%d :> %d\n", i, ca->to));
|
||||
}
|
||||
|
259
src/backend/regex/regprefix.c
Normal file
259
src/backend/regex/regprefix.c
Normal file
@@ -0,0 +1,259 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* regprefix.c
|
||||
* Extract a common prefix, if any, from a compiled regex.
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1998, 1999 Henry Spencer
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/regex/regprefix.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "regex/regguts.h"
|
||||
|
||||
|
||||
/*
|
||||
* forward declarations
|
||||
*/
|
||||
static int findprefix(struct cnfa * cnfa, struct colormap * cm,
|
||||
chr *string, size_t *slength);
|
||||
|
||||
|
||||
/*
|
||||
* pg_regprefix - get common prefix for regular expression
|
||||
*
|
||||
* Returns one of:
|
||||
* REG_NOMATCH: there is no common prefix of strings matching the regex
|
||||
* REG_PREFIX: there is a common prefix of strings matching the regex
|
||||
* REG_EXACT: all strings satisfying the regex must match the same string
|
||||
* or a REG_XXX error code
|
||||
*
|
||||
* In the non-failure cases, *string is set to a malloc'd string containing
|
||||
* the common prefix or exact value, of length *slength (measured in chrs
|
||||
* not bytes!).
|
||||
*
|
||||
* This function does not analyze all complex cases (such as lookahead
|
||||
* constraints) exactly. Therefore it is possible that some strings matching
|
||||
* the reported prefix or exact-match string do not satisfy the regex. But
|
||||
* it should never be the case that a string satisfying the regex does not
|
||||
* match the reported prefix or exact-match string.
|
||||
*/
|
||||
int
|
||||
pg_regprefix(regex_t *re,
|
||||
chr **string,
|
||||
size_t *slength)
|
||||
{
|
||||
struct guts *g;
|
||||
struct cnfa *cnfa;
|
||||
int st;
|
||||
|
||||
/* sanity checks */
|
||||
if (string == NULL || slength == NULL)
|
||||
return REG_INVARG;
|
||||
*string = NULL; /* initialize for failure cases */
|
||||
*slength = 0;
|
||||
if (re == NULL || re->re_magic != REMAGIC)
|
||||
return REG_INVARG;
|
||||
if (re->re_csize != sizeof(chr))
|
||||
return REG_MIXED;
|
||||
|
||||
/* Initialize locale-dependent support */
|
||||
pg_set_regex_collation(re->re_collation);
|
||||
|
||||
/* setup */
|
||||
g = (struct guts *) re->re_guts;
|
||||
if (g->info & REG_UIMPOSSIBLE)
|
||||
return REG_NOMATCH;
|
||||
|
||||
/*
|
||||
* This implementation considers only the search NFA for the topmost regex
|
||||
* tree node. Therefore, constraints such as backrefs are not fully
|
||||
* applied, which is allowed per the function's API spec.
|
||||
*/
|
||||
assert(g->tree != NULL);
|
||||
cnfa = &g->tree->cnfa;
|
||||
|
||||
/*
|
||||
* Since a correct NFA should never contain any exit-free loops, it should
|
||||
* not be possible for our traversal to return to a previously visited
|
||||
* NFA state. Hence we need at most nstates chrs in the output string.
|
||||
*/
|
||||
*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
|
||||
if (*string == NULL)
|
||||
return REG_ESPACE;
|
||||
|
||||
/* do it */
|
||||
st = findprefix(cnfa, &g->cmap, *string, slength);
|
||||
|
||||
assert(*slength <= cnfa->nstates);
|
||||
|
||||
/* clean up */
|
||||
if (st != REG_PREFIX && st != REG_EXACT)
|
||||
{
|
||||
FREE(*string);
|
||||
*string = NULL;
|
||||
*slength = 0;
|
||||
}
|
||||
|
||||
return st;
|
||||
}
|
||||
|
||||
/*
|
||||
* findprefix - extract common prefix from cNFA
|
||||
*
|
||||
* Results are returned into the preallocated chr array string[], with
|
||||
* *slength (which must be preset to zero) incremented for each chr.
|
||||
*/
|
||||
static int /* regprefix return code */
|
||||
findprefix(struct cnfa * cnfa,
|
||||
struct colormap * cm,
|
||||
chr *string,
|
||||
size_t *slength)
|
||||
{
|
||||
int st;
|
||||
int nextst;
|
||||
color thiscolor;
|
||||
chr c;
|
||||
struct carc *ca;
|
||||
|
||||
/*
|
||||
* The "pre" state must have only BOS/BOL outarcs, else pattern isn't
|
||||
* anchored left. If we have both BOS and BOL, they must go to the
|
||||
* same next state.
|
||||
*/
|
||||
st = cnfa->pre;
|
||||
nextst = -1;
|
||||
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
||||
{
|
||||
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
|
||||
{
|
||||
if (nextst == -1)
|
||||
nextst = ca->to;
|
||||
else if (nextst != ca->to)
|
||||
return REG_NOMATCH;
|
||||
}
|
||||
else
|
||||
return REG_NOMATCH;
|
||||
}
|
||||
if (nextst == -1)
|
||||
return REG_NOMATCH;
|
||||
|
||||
/*
|
||||
* Scan through successive states, stopping as soon as we find one with
|
||||
* more than one acceptable transition character (either multiple colors
|
||||
* on out-arcs, or a color with more than one member chr).
|
||||
*
|
||||
* We could find a state with multiple out-arcs that are all labeled with
|
||||
* the same singleton color; this comes from patterns like "^ab(cde|cxy)".
|
||||
* In that case we add the chr "c" to the output string but then exit the
|
||||
* loop with nextst == -1. This leaves a little bit on the table: if the
|
||||
* pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
|
||||
* to the prefix. But chasing multiple parallel state chains doesn't seem
|
||||
* worth the trouble.
|
||||
*/
|
||||
do
|
||||
{
|
||||
st = nextst;
|
||||
nextst = -1;
|
||||
thiscolor = COLORLESS;
|
||||
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
||||
{
|
||||
/* We ignore lookahead constraints */
|
||||
if (ca->co >= cnfa->ncolors)
|
||||
continue;
|
||||
/* We can also ignore BOS/BOL arcs */
|
||||
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
|
||||
continue;
|
||||
/* ... but EOS/EOL arcs terminate the search */
|
||||
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
|
||||
{
|
||||
thiscolor = COLORLESS;
|
||||
break;
|
||||
}
|
||||
if (thiscolor == COLORLESS)
|
||||
{
|
||||
/* First plain outarc */
|
||||
thiscolor = ca->co;
|
||||
nextst = ca->to;
|
||||
}
|
||||
else if (thiscolor == ca->co)
|
||||
{
|
||||
/* Another plain outarc for same color */
|
||||
nextst = -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* More than one plain outarc color terminates the search */
|
||||
thiscolor = COLORLESS;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Done if we didn't find exactly one color on plain outarcs */
|
||||
if (thiscolor == COLORLESS)
|
||||
break;
|
||||
/* The color must be a singleton */
|
||||
if (cm->cd[thiscolor].nchrs != 1)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Identify the color's sole member chr and add it to the prefix
|
||||
* string. In general the colormap data structure doesn't provide a
|
||||
* way to find color member chrs, except by trying GETCOLOR() on each
|
||||
* possible chr value, which won't do at all. However, for the cases
|
||||
* we care about it should be sufficient to test the "firstchr" value,
|
||||
* that is the first chr ever added to the color. There are cases
|
||||
* where this might no longer be a member of the color (so we do need
|
||||
* to test), but none of them are likely to arise for a character that
|
||||
* is a member of a common prefix. If we do hit such a corner case,
|
||||
* we just fall out without adding anything to the prefix string.
|
||||
*/
|
||||
c = cm->cd[thiscolor].firstchr;
|
||||
if (GETCOLOR(cm, c) != thiscolor)
|
||||
break;
|
||||
|
||||
string[(*slength)++] = c;
|
||||
|
||||
/* Advance to next state, but only if we have a unique next state */
|
||||
} while (nextst != -1);
|
||||
|
||||
/*
|
||||
* If we ended at a state that only has EOS/EOL outarcs leading to the
|
||||
* "post" state, then we have an exact-match string. Note this is true
|
||||
* even if the string is of zero length.
|
||||
*/
|
||||
nextst = -1;
|
||||
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
||||
{
|
||||
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
|
||||
{
|
||||
if (nextst == -1)
|
||||
nextst = ca->to;
|
||||
else if (nextst != ca->to)
|
||||
{
|
||||
nextst = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
nextst = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (nextst == cnfa->post)
|
||||
return REG_EXACT;
|
||||
|
||||
/*
|
||||
* Otherwise, if we were unable to identify any prefix characters, say
|
||||
* NOMATCH --- the pattern is anchored left, but doesn't specify any
|
||||
* particular first character.
|
||||
*/
|
||||
if (*slength > 0)
|
||||
return REG_PREFIX;
|
||||
|
||||
return REG_NOMATCH;
|
||||
}
|
Reference in New Issue
Block a user