mirror of
https://github.com/postgres/postgres.git
synced 2025-04-27 22:56:53 +03:00
Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
933 lines
23 KiB
C
933 lines
23 KiB
C
/*
|
|
* DFA routines
|
|
* This file is #included by regexec.c.
|
|
*
|
|
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
|
*
|
|
* Development of this software was funded, in part, by Cray Research Inc.,
|
|
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
|
* Corporation, none of whom are responsible for the results. The author
|
|
* thanks all of them.
|
|
*
|
|
* Redistribution and use in source and binary forms -- with or without
|
|
* modification -- are permitted for any purpose, provided that
|
|
* redistributions in source form retain this entire copyright notice and
|
|
* indicate the origin and nature of any modifications.
|
|
*
|
|
* I'd appreciate being given credit for this package in the documentation
|
|
* of software which uses it, but that is not a requirement.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
|
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
|
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* src/backend/regex/rege_dfa.c
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* longest - longest-preferred matching engine
|
|
*
|
|
* On success, returns match endpoint address. Returns NULL on no match.
|
|
* Internal errors also return NULL, with v->err set.
|
|
*/
|
|
static chr *
|
|
longest(struct vars *v,
|
|
struct dfa *d,
|
|
chr *start, /* where the match should start */
|
|
chr *stop, /* match must end at or before here */
|
|
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
|
{
|
|
chr *cp;
|
|
chr *realstop = (stop == v->stop) ? stop : stop + 1;
|
|
color co;
|
|
struct sset *css;
|
|
struct sset *ss;
|
|
chr *post;
|
|
int i;
|
|
struct colormap *cm = d->cm;
|
|
|
|
/* prevent "uninitialized variable" warnings */
|
|
if (hitstopp != NULL)
|
|
*hitstopp = 0;
|
|
|
|
/* initialize */
|
|
css = initialize(v, d, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
cp = start;
|
|
|
|
/* startup */
|
|
FDEBUG(("+++ startup +++\n"));
|
|
if (cp == v->start)
|
|
{
|
|
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
}
|
|
else
|
|
{
|
|
co = GETCOLOR(cm, *(cp - 1));
|
|
FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
|
|
}
|
|
css = miss(v, d, css, co, cp, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
css->lastseen = cp;
|
|
|
|
/*
|
|
* This is the main text-scanning loop. It seems worth having two copies
|
|
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
|
|
* builds, when you're not actively tracing.
|
|
*/
|
|
#ifdef REG_DEBUG
|
|
if (v->eflags & REG_FTRACE)
|
|
{
|
|
while (cp < realstop)
|
|
{
|
|
FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
while (cp < realstop)
|
|
{
|
|
co = GETCOLOR(cm, *cp);
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
|
|
if (ISERR())
|
|
return NULL;
|
|
|
|
/* shutdown */
|
|
FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
|
|
if (cp == v->stop && stop == v->stop)
|
|
{
|
|
if (hitstopp != NULL)
|
|
*hitstopp = 1;
|
|
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
ss = miss(v, d, css, co, cp, start);
|
|
if (ISERR())
|
|
return NULL;
|
|
/* special case: match ended at eol? */
|
|
if (ss != NULL && (ss->flags & POSTSTATE))
|
|
return cp;
|
|
else if (ss != NULL)
|
|
ss->lastseen = cp; /* to be tidy */
|
|
}
|
|
|
|
/* find last match, if any */
|
|
post = d->lastpost;
|
|
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
|
if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
|
|
(post == NULL || post < ss->lastseen))
|
|
post = ss->lastseen;
|
|
if (post != NULL) /* found one */
|
|
return post - 1;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* shortest - shortest-preferred matching engine
|
|
*
|
|
* On success, returns match endpoint address. Returns NULL on no match.
|
|
* Internal errors also return NULL, with v->err set.
|
|
*/
|
|
static chr *
|
|
shortest(struct vars *v,
|
|
struct dfa *d,
|
|
chr *start, /* where the match should start */
|
|
chr *min, /* match must end at or after here */
|
|
chr *max, /* match must end at or before here */
|
|
chr **coldp, /* store coldstart pointer here, if non-NULL */
|
|
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
|
{
|
|
chr *cp;
|
|
chr *realmin = (min == v->stop) ? min : min + 1;
|
|
chr *realmax = (max == v->stop) ? max : max + 1;
|
|
color co;
|
|
struct sset *css;
|
|
struct sset *ss;
|
|
struct colormap *cm = d->cm;
|
|
|
|
/* prevent "uninitialized variable" warnings */
|
|
if (coldp != NULL)
|
|
*coldp = NULL;
|
|
if (hitstopp != NULL)
|
|
*hitstopp = 0;
|
|
|
|
/* initialize */
|
|
css = initialize(v, d, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
cp = start;
|
|
|
|
/* startup */
|
|
FDEBUG(("--- startup ---\n"));
|
|
if (cp == v->start)
|
|
{
|
|
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
}
|
|
else
|
|
{
|
|
co = GETCOLOR(cm, *(cp - 1));
|
|
FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
|
|
}
|
|
css = miss(v, d, css, co, cp, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
css->lastseen = cp;
|
|
ss = css;
|
|
|
|
/*
|
|
* This is the main text-scanning loop. It seems worth having two copies
|
|
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
|
|
* builds, when you're not actively tracing.
|
|
*/
|
|
#ifdef REG_DEBUG
|
|
if (v->eflags & REG_FTRACE)
|
|
{
|
|
while (cp < realmax)
|
|
{
|
|
FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
if ((ss->flags & POSTSTATE) && cp >= realmin)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
while (cp < realmax)
|
|
{
|
|
co = GETCOLOR(cm, *cp);
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
if ((ss->flags & POSTSTATE) && cp >= realmin)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
}
|
|
|
|
if (ss == NULL)
|
|
return NULL;
|
|
|
|
if (coldp != NULL) /* report last no-progress state set, if any */
|
|
*coldp = lastcold(v, d);
|
|
|
|
if ((ss->flags & POSTSTATE) && cp > min)
|
|
{
|
|
assert(cp >= realmin);
|
|
cp--;
|
|
}
|
|
else if (cp == v->stop && max == v->stop)
|
|
{
|
|
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
ss = miss(v, d, css, co, cp, start);
|
|
/* match might have ended at eol */
|
|
if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
|
|
*hitstopp = 1;
|
|
}
|
|
|
|
if (ss == NULL || !(ss->flags & POSTSTATE))
|
|
return NULL;
|
|
|
|
return cp;
|
|
}
|
|
|
|
/*
|
|
* matchuntil - incremental matching engine
|
|
*
|
|
* This is meant for use with a search-style NFA (that is, the pattern is
|
|
* known to act as though it had a leading .*). We determine whether a
|
|
* match exists starting at v->start and ending at probe. Multiple calls
|
|
* require only O(N) time not O(N^2) so long as the probe values are
|
|
* nondecreasing. *lastcss and *lastcp must be initialized to NULL before
|
|
* starting a series of calls.
|
|
*
|
|
* Returns 1 if a match exists, 0 if not.
|
|
* Internal errors also return 0, with v->err set.
|
|
*/
|
|
static int
|
|
matchuntil(struct vars *v,
|
|
struct dfa *d,
|
|
chr *probe, /* we want to know if a match ends here */
|
|
struct sset **lastcss, /* state storage across calls */
|
|
chr **lastcp) /* state storage across calls */
|
|
{
|
|
chr *cp = *lastcp;
|
|
color co;
|
|
struct sset *css = *lastcss;
|
|
struct sset *ss;
|
|
struct colormap *cm = d->cm;
|
|
|
|
/* initialize and startup, or restart, if necessary */
|
|
if (cp == NULL || cp > probe)
|
|
{
|
|
cp = v->start;
|
|
css = initialize(v, d, cp);
|
|
if (css == NULL)
|
|
return 0;
|
|
|
|
FDEBUG((">>> startup >>>\n"));
|
|
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
|
|
css = miss(v, d, css, co, cp, v->start);
|
|
if (css == NULL)
|
|
return 0;
|
|
css->lastseen = cp;
|
|
}
|
|
else if (css == NULL)
|
|
{
|
|
/* we previously found that no match is possible beyond *lastcp */
|
|
return 0;
|
|
}
|
|
ss = css;
|
|
|
|
/*
|
|
* This is the main text-scanning loop. It seems worth having two copies
|
|
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
|
|
* builds, when you're not actively tracing.
|
|
*/
|
|
#ifdef REG_DEBUG
|
|
if (v->eflags & REG_FTRACE)
|
|
{
|
|
while (cp < probe)
|
|
{
|
|
FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, v->start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
while (cp < probe)
|
|
{
|
|
co = GETCOLOR(cm, *cp);
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, v->start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
|
|
*lastcss = ss;
|
|
*lastcp = cp;
|
|
|
|
if (ss == NULL)
|
|
return 0; /* impossible match, or internal error */
|
|
|
|
/* We need to process one more chr, or the EOS symbol, to check match */
|
|
if (cp < v->stop)
|
|
{
|
|
FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
ss = miss(v, d, css, co, cp + 1, v->start);
|
|
}
|
|
else
|
|
{
|
|
assert(cp == v->stop);
|
|
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
ss = miss(v, d, css, co, cp, v->start);
|
|
}
|
|
|
|
if (ss == NULL || !(ss->flags & POSTSTATE))
|
|
return 0;
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* lastcold - determine last point at which no progress had been made
|
|
*/
|
|
static chr * /* endpoint, or NULL */
|
|
lastcold(struct vars *v,
|
|
struct dfa *d)
|
|
{
|
|
struct sset *ss;
|
|
chr *nopr;
|
|
int i;
|
|
|
|
nopr = d->lastnopr;
|
|
if (nopr == NULL)
|
|
nopr = v->start;
|
|
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
|
if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
|
|
nopr = ss->lastseen;
|
|
return nopr;
|
|
}
|
|
|
|
/*
|
|
* newdfa - set up a fresh DFA
|
|
*/
|
|
static struct dfa *
|
|
newdfa(struct vars *v,
|
|
struct cnfa *cnfa,
|
|
struct colormap *cm,
|
|
struct smalldfa *sml) /* preallocated space, may be NULL */
|
|
{
|
|
struct dfa *d;
|
|
size_t nss = cnfa->nstates * 2;
|
|
int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
|
|
struct smalldfa *smallwas = sml;
|
|
|
|
assert(cnfa != NULL && cnfa->nstates != 0);
|
|
|
|
if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
|
|
{
|
|
assert(wordsper == 1);
|
|
if (sml == NULL)
|
|
{
|
|
sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
|
|
if (sml == NULL)
|
|
{
|
|
ERR(REG_ESPACE);
|
|
return NULL;
|
|
}
|
|
}
|
|
d = &sml->dfa;
|
|
d->ssets = sml->ssets;
|
|
d->statesarea = sml->statesarea;
|
|
d->work = &d->statesarea[nss];
|
|
d->outsarea = sml->outsarea;
|
|
d->incarea = sml->incarea;
|
|
d->cptsmalloced = 0;
|
|
d->mallocarea = (smallwas == NULL) ? (char *) sml : NULL;
|
|
}
|
|
else
|
|
{
|
|
d = (struct dfa *) MALLOC(sizeof(struct dfa));
|
|
if (d == NULL)
|
|
{
|
|
ERR(REG_ESPACE);
|
|
return NULL;
|
|
}
|
|
d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
|
|
d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
|
|
sizeof(unsigned));
|
|
d->work = &d->statesarea[nss * wordsper];
|
|
d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
|
|
sizeof(struct sset *));
|
|
d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
|
|
sizeof(struct arcp));
|
|
d->cptsmalloced = 1;
|
|
d->mallocarea = (char *) d;
|
|
if (d->ssets == NULL || d->statesarea == NULL ||
|
|
d->outsarea == NULL || d->incarea == NULL)
|
|
{
|
|
freedfa(d);
|
|
ERR(REG_ESPACE);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
|
|
d->nssused = 0;
|
|
d->nstates = cnfa->nstates;
|
|
d->ncolors = cnfa->ncolors;
|
|
d->wordsper = wordsper;
|
|
d->cnfa = cnfa;
|
|
d->cm = cm;
|
|
d->lastpost = NULL;
|
|
d->lastnopr = NULL;
|
|
d->search = d->ssets;
|
|
|
|
/* initialization of sset fields is done as needed */
|
|
|
|
return d;
|
|
}
|
|
|
|
/*
|
|
* freedfa - free a DFA
|
|
*/
|
|
static void
|
|
freedfa(struct dfa *d)
|
|
{
|
|
if (d->cptsmalloced)
|
|
{
|
|
if (d->ssets != NULL)
|
|
FREE(d->ssets);
|
|
if (d->statesarea != NULL)
|
|
FREE(d->statesarea);
|
|
if (d->outsarea != NULL)
|
|
FREE(d->outsarea);
|
|
if (d->incarea != NULL)
|
|
FREE(d->incarea);
|
|
}
|
|
|
|
if (d->mallocarea != NULL)
|
|
FREE(d->mallocarea);
|
|
}
|
|
|
|
/*
|
|
* hash - construct a hash code for a bitvector
|
|
*
|
|
* There are probably better ways, but they're more expensive.
|
|
*/
|
|
static unsigned
|
|
hash(unsigned *uv,
|
|
int n)
|
|
{
|
|
int i;
|
|
unsigned h;
|
|
|
|
h = 0;
|
|
for (i = 0; i < n; i++)
|
|
h ^= uv[i];
|
|
return h;
|
|
}
|
|
|
|
/*
|
|
* initialize - hand-craft a cache entry for startup, otherwise get ready
|
|
*/
|
|
static struct sset *
|
|
initialize(struct vars *v,
|
|
struct dfa *d,
|
|
chr *start)
|
|
{
|
|
struct sset *ss;
|
|
int i;
|
|
|
|
/* is previous one still there? */
|
|
if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
|
|
ss = &d->ssets[0];
|
|
else
|
|
{ /* no, must (re)build it */
|
|
ss = getvacant(v, d, start, start);
|
|
if (ss == NULL)
|
|
return NULL;
|
|
for (i = 0; i < d->wordsper; i++)
|
|
ss->states[i] = 0;
|
|
BSET(ss->states, d->cnfa->pre);
|
|
ss->hash = HASH(ss->states, d->wordsper);
|
|
assert(d->cnfa->pre != d->cnfa->post);
|
|
ss->flags = STARTER | LOCKED | NOPROGRESS;
|
|
/* lastseen dealt with below */
|
|
}
|
|
|
|
for (i = 0; i < d->nssused; i++)
|
|
d->ssets[i].lastseen = NULL;
|
|
ss->lastseen = start; /* maybe untrue, but harmless */
|
|
d->lastpost = NULL;
|
|
d->lastnopr = NULL;
|
|
return ss;
|
|
}
|
|
|
|
/*
|
|
* miss - handle a stateset cache miss
|
|
*
|
|
* css is the current stateset, co is the color of the current input character,
|
|
* cp points to the character after that (which is where we may need to test
|
|
* LACONs). start does not affect matching behavior but is needed for pickss'
|
|
* heuristics about which stateset cache entry to replace.
|
|
*
|
|
* Ordinarily, returns the address of the next stateset (the one that is
|
|
* valid after consuming the input character). Returns NULL if no valid
|
|
* NFA states remain, ie we have a certain match failure.
|
|
* Internal errors also return NULL, with v->err set.
|
|
*/
|
|
static struct sset *
|
|
miss(struct vars *v,
|
|
struct dfa *d,
|
|
struct sset *css,
|
|
color co,
|
|
chr *cp, /* next chr */
|
|
chr *start) /* where the attempt got started */
|
|
{
|
|
struct cnfa *cnfa = d->cnfa;
|
|
int i;
|
|
unsigned h;
|
|
struct carc *ca;
|
|
struct sset *p;
|
|
int ispseudocolor;
|
|
int ispost;
|
|
int noprogress;
|
|
int gotstate;
|
|
int dolacons;
|
|
int sawlacons;
|
|
|
|
/* for convenience, we can be called even if it might not be a miss */
|
|
if (css->outs[co] != NULL)
|
|
{
|
|
FDEBUG(("hit\n"));
|
|
return css->outs[co];
|
|
}
|
|
FDEBUG(("miss\n"));
|
|
|
|
/*
|
|
* Checking for operation cancel in the inner text search loop seems
|
|
* unduly expensive. As a compromise, check during cache misses.
|
|
*/
|
|
if (CANCEL_REQUESTED(v->re))
|
|
{
|
|
ERR(REG_CANCEL);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* What set of states would we end up in after consuming the co character?
|
|
* We first consider PLAIN arcs that consume the character, and then look
|
|
* to see what LACON arcs could be traversed after consuming it.
|
|
*/
|
|
for (i = 0; i < d->wordsper; i++)
|
|
d->work[i] = 0; /* build new stateset bitmap in d->work */
|
|
ispseudocolor = d->cm->cd[co].flags & PSEUDO;
|
|
ispost = 0;
|
|
noprogress = 1;
|
|
gotstate = 0;
|
|
for (i = 0; i < d->nstates; i++)
|
|
if (ISBSET(css->states, i))
|
|
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
|
|
if (ca->co == co ||
|
|
(ca->co == RAINBOW && !ispseudocolor))
|
|
{
|
|
BSET(d->work, ca->to);
|
|
gotstate = 1;
|
|
if (ca->to == cnfa->post)
|
|
ispost = 1;
|
|
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
|
|
noprogress = 0;
|
|
FDEBUG(("%d -> %d\n", i, ca->to));
|
|
}
|
|
if (!gotstate)
|
|
return NULL; /* character cannot reach any new state */
|
|
dolacons = (cnfa->flags & HASLACONS);
|
|
sawlacons = 0;
|
|
/* outer loop handles transitive closure of reachable-by-LACON states */
|
|
while (dolacons)
|
|
{
|
|
dolacons = 0;
|
|
for (i = 0; i < d->nstates; i++)
|
|
if (ISBSET(d->work, i))
|
|
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
|
|
{
|
|
if (ca->co < cnfa->ncolors)
|
|
continue; /* not a LACON arc */
|
|
if (ISBSET(d->work, ca->to))
|
|
continue; /* arc would be a no-op anyway */
|
|
sawlacons = 1; /* this LACON affects our result */
|
|
if (!lacon(v, cnfa, cp, ca->co))
|
|
{
|
|
if (ISERR())
|
|
return NULL;
|
|
continue; /* LACON arc cannot be traversed */
|
|
}
|
|
if (ISERR())
|
|
return NULL;
|
|
BSET(d->work, ca->to);
|
|
dolacons = 1;
|
|
if (ca->to == cnfa->post)
|
|
ispost = 1;
|
|
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
|
|
noprogress = 0;
|
|
FDEBUG(("%d :> %d\n", i, ca->to));
|
|
}
|
|
}
|
|
h = HASH(d->work, d->wordsper);
|
|
|
|
/* Is this stateset already in the cache? */
|
|
for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
|
|
if (HIT(h, d->work, p, d->wordsper))
|
|
{
|
|
FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
if (i == 0)
|
|
{ /* nope, need a new cache entry */
|
|
p = getvacant(v, d, cp, start);
|
|
if (p == NULL)
|
|
return NULL;
|
|
assert(p != css);
|
|
for (i = 0; i < d->wordsper; i++)
|
|
p->states[i] = d->work[i];
|
|
p->hash = h;
|
|
p->flags = (ispost) ? POSTSTATE : 0;
|
|
if (noprogress)
|
|
p->flags |= NOPROGRESS;
|
|
/* lastseen to be dealt with by caller */
|
|
}
|
|
|
|
/*
|
|
* Link new stateset to old, unless a LACON affected the result, in which
|
|
* case we don't create the link. That forces future transitions across
|
|
* this same arc (same prior stateset and character color) to come through
|
|
* miss() again, so that we can recheck the LACON(s), which might or might
|
|
* not pass since context will be different.
|
|
*/
|
|
if (!sawlacons)
|
|
{
|
|
FDEBUG(("c%d[%d]->c%d\n",
|
|
(int) (css - d->ssets), co, (int) (p - d->ssets)));
|
|
css->outs[co] = p;
|
|
css->inchain[co] = p->ins;
|
|
p->ins.ss = css;
|
|
p->ins.co = co;
|
|
}
|
|
return p;
|
|
}
|
|
|
|
/*
|
|
* lacon - lookaround-constraint checker for miss()
|
|
*/
|
|
static int /* predicate: constraint satisfied? */
|
|
lacon(struct vars *v,
|
|
struct cnfa *pcnfa, /* parent cnfa */
|
|
chr *cp,
|
|
color co) /* "color" of the lookaround constraint */
|
|
{
|
|
int n;
|
|
struct subre *sub;
|
|
struct dfa *d;
|
|
chr *end;
|
|
int satisfied;
|
|
|
|
/* Since this is recursive, it could be driven to stack overflow */
|
|
if (STACK_TOO_DEEP(v->re))
|
|
{
|
|
ERR(REG_ETOOBIG);
|
|
return 0;
|
|
}
|
|
|
|
n = co - pcnfa->ncolors;
|
|
assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
|
|
FDEBUG(("=== testing lacon %d\n", n));
|
|
sub = &v->g->lacons[n];
|
|
d = getladfa(v, n);
|
|
if (d == NULL)
|
|
return 0;
|
|
if (LATYPE_IS_AHEAD(sub->subno))
|
|
{
|
|
/* used to use longest() here, but shortest() could be much cheaper */
|
|
end = shortest(v, d, cp, cp, v->stop,
|
|
(chr **) NULL, (int *) NULL);
|
|
satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* To avoid doing O(N^2) work when repeatedly testing a lookbehind
|
|
* constraint in an N-character string, we use matchuntil() which can
|
|
* cache the DFA state across calls. We only need to restart if the
|
|
* probe point decreases, which is not common. The NFA we're using is
|
|
* a search NFA, so it doesn't mind scanning over stuff before the
|
|
* nominal match.
|
|
*/
|
|
satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
|
|
if (!LATYPE_IS_POS(sub->subno))
|
|
satisfied = !satisfied;
|
|
}
|
|
FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
|
|
return satisfied;
|
|
}
|
|
|
|
/*
|
|
* getvacant - get a vacant state set
|
|
*
|
|
* This routine clears out the inarcs and outarcs, but does not otherwise
|
|
* clear the innards of the state set -- that's up to the caller.
|
|
*/
|
|
static struct sset *
|
|
getvacant(struct vars *v,
|
|
struct dfa *d,
|
|
chr *cp,
|
|
chr *start)
|
|
{
|
|
int i;
|
|
struct sset *ss;
|
|
struct sset *p;
|
|
struct arcp ap;
|
|
color co;
|
|
|
|
ss = pickss(v, d, cp, start);
|
|
if (ss == NULL)
|
|
return NULL;
|
|
assert(!(ss->flags & LOCKED));
|
|
|
|
/* clear out its inarcs, including self-referential ones */
|
|
ap = ss->ins;
|
|
while ((p = ap.ss) != NULL)
|
|
{
|
|
co = ap.co;
|
|
FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
|
|
p->outs[co] = NULL;
|
|
ap = p->inchain[co];
|
|
p->inchain[co].ss = NULL; /* paranoia */
|
|
}
|
|
ss->ins.ss = NULL;
|
|
|
|
/* take it off the inarc chains of the ssets reached by its outarcs */
|
|
for (i = 0; i < d->ncolors; i++)
|
|
{
|
|
p = ss->outs[i];
|
|
assert(p != ss); /* not self-referential */
|
|
if (p == NULL)
|
|
continue; /* NOTE CONTINUE */
|
|
FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
|
|
if (p->ins.ss == ss && p->ins.co == i)
|
|
p->ins = ss->inchain[i];
|
|
else
|
|
{
|
|
struct arcp lastap = {NULL, 0};
|
|
|
|
assert(p->ins.ss != NULL);
|
|
for (ap = p->ins; ap.ss != NULL &&
|
|
!(ap.ss == ss && ap.co == i);
|
|
ap = ap.ss->inchain[ap.co])
|
|
lastap = ap;
|
|
assert(ap.ss != NULL);
|
|
lastap.ss->inchain[lastap.co] = ss->inchain[i];
|
|
}
|
|
ss->outs[i] = NULL;
|
|
ss->inchain[i].ss = NULL;
|
|
}
|
|
|
|
/* if ss was a success state, may need to remember location */
|
|
if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
|
|
(d->lastpost == NULL || d->lastpost < ss->lastseen))
|
|
d->lastpost = ss->lastseen;
|
|
|
|
/* likewise for a no-progress state */
|
|
if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
|
|
(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
|
|
d->lastnopr = ss->lastseen;
|
|
|
|
return ss;
|
|
}
|
|
|
|
/*
|
|
* pickss - pick the next stateset to be used
|
|
*/
|
|
static struct sset *
|
|
pickss(struct vars *v,
|
|
struct dfa *d,
|
|
chr *cp,
|
|
chr *start)
|
|
{
|
|
int i;
|
|
struct sset *ss;
|
|
struct sset *end;
|
|
chr *ancient;
|
|
|
|
/* shortcut for cases where cache isn't full */
|
|
if (d->nssused < d->nssets)
|
|
{
|
|
i = d->nssused;
|
|
d->nssused++;
|
|
ss = &d->ssets[i];
|
|
FDEBUG(("new c%d\n", i));
|
|
/* set up innards */
|
|
ss->states = &d->statesarea[i * d->wordsper];
|
|
ss->flags = 0;
|
|
ss->ins.ss = NULL;
|
|
ss->ins.co = WHITE; /* give it some value */
|
|
ss->outs = &d->outsarea[i * d->ncolors];
|
|
ss->inchain = &d->incarea[i * d->ncolors];
|
|
for (i = 0; i < d->ncolors; i++)
|
|
{
|
|
ss->outs[i] = NULL;
|
|
ss->inchain[i].ss = NULL;
|
|
}
|
|
return ss;
|
|
}
|
|
|
|
/* look for oldest, or old enough anyway */
|
|
if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
|
|
ancient = cp - d->nssets * 2 / 3;
|
|
else
|
|
ancient = start;
|
|
for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
|
|
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
|
!(ss->flags & LOCKED))
|
|
{
|
|
d->search = ss + 1;
|
|
FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
|
|
return ss;
|
|
}
|
|
for (ss = d->ssets, end = d->search; ss < end; ss++)
|
|
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
|
!(ss->flags & LOCKED))
|
|
{
|
|
d->search = ss + 1;
|
|
FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
|
|
return ss;
|
|
}
|
|
|
|
/* nobody's old enough?!? -- something's really wrong */
|
|
FDEBUG(("cannot find victim to replace!\n"));
|
|
ERR(REG_ASSERT);
|
|
return NULL;
|
|
}
|