mirror of
https://github.com/postgres/postgres.git
synced 2025-11-06 07:49:08 +03:00
Previously, a PostgreSQL-specific callback checked by the regex engine had a way to trigger a special error code REG_CANCEL if it detected that the next call to CHECK_FOR_INTERRUPTS() would certainly throw via ereport(). A later proposed bugfix aims to move some complex logic out of signal handlers, so that it won't run until the next CHECK_FOR_INTERRUPTS(), which makes the above design impossible unless we split CHECK_FOR_INTERRUPTS() into two phases, one to run logic and another to ereport(). We may develop such a system in the future, but for the regex code it is no longer necessary. An earlier commit moved regex memory management over to our MemoryContext system. Given that the purpose of the two-phase interrupt checking was to free memory before throwing, something we don't need to worry about anymore, it seems simpler to inject CHECK_FOR_INTERRUPTS() directly into cancelation points, and just let it throw. Since the plan is to keep PostgreSQL-specific concerns separate from the main regex engine code (with a view to bein able to stay in sync with other projects), do this with a new macro INTERRUPT(), customizable in regcustom.h and defaulting to nothing. Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/CA%2BhUKGK3PGKwcKqzoosamn36YW-fsuTdOPPF1i_rtEO%3DnEYKSg%40mail.gmail.com
1103 lines
27 KiB
C
1103 lines
27 KiB
C
/*
|
|
* DFA routines
|
|
* This file is #included by regexec.c.
|
|
*
|
|
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
|
*
|
|
* Development of this software was funded, in part, by Cray Research Inc.,
|
|
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
|
* Corporation, none of whom are responsible for the results. The author
|
|
* thanks all of them.
|
|
*
|
|
* Redistribution and use in source and binary forms -- with or without
|
|
* modification -- are permitted for any purpose, provided that
|
|
* redistributions in source form retain this entire copyright notice and
|
|
* indicate the origin and nature of any modifications.
|
|
*
|
|
* I'd appreciate being given credit for this package in the documentation
|
|
* of software which uses it, but that is not a requirement.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
|
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
|
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* src/backend/regex/rege_dfa.c
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* longest - longest-preferred matching engine
|
|
*
|
|
* On success, returns match endpoint address. Returns NULL on no match.
|
|
* Internal errors also return NULL, with v->err set.
|
|
*/
|
|
static chr *
|
|
longest(struct vars *v,
|
|
struct dfa *d,
|
|
chr *start, /* where the match should start */
|
|
chr *stop, /* match must end at or before here */
|
|
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
|
{
|
|
chr *cp;
|
|
chr *realstop = (stop == v->stop) ? stop : stop + 1;
|
|
color co;
|
|
struct sset *css;
|
|
struct sset *ss;
|
|
chr *post;
|
|
int i;
|
|
struct colormap *cm = d->cm;
|
|
|
|
/* prevent "uninitialized variable" warnings */
|
|
if (hitstopp != NULL)
|
|
*hitstopp = 0;
|
|
|
|
/* if this is a backref to a known string, just match against that */
|
|
if (d->backno >= 0)
|
|
{
|
|
assert((size_t) d->backno < v->nmatch);
|
|
if (v->pmatch[d->backno].rm_so >= 0)
|
|
{
|
|
cp = dfa_backref(v, d, start, start, stop, false);
|
|
if (cp == v->stop && stop == v->stop && hitstopp != NULL)
|
|
*hitstopp = 1;
|
|
return cp;
|
|
}
|
|
}
|
|
|
|
/* fast path for matchall NFAs */
|
|
if (d->cnfa->flags & MATCHALL)
|
|
{
|
|
size_t nchr = stop - start;
|
|
size_t maxmatchall = d->cnfa->maxmatchall;
|
|
|
|
if (nchr < d->cnfa->minmatchall)
|
|
return NULL;
|
|
if (maxmatchall == DUPINF)
|
|
{
|
|
if (stop == v->stop && hitstopp != NULL)
|
|
*hitstopp = 1;
|
|
}
|
|
else
|
|
{
|
|
if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
|
|
*hitstopp = 1;
|
|
if (nchr > maxmatchall)
|
|
return start + maxmatchall;
|
|
}
|
|
return stop;
|
|
}
|
|
|
|
/* initialize */
|
|
css = initialize(v, d, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
cp = start;
|
|
|
|
/* startup */
|
|
FDEBUG(("+++ startup +++\n"));
|
|
if (cp == v->start)
|
|
{
|
|
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
}
|
|
else
|
|
{
|
|
co = GETCOLOR(cm, *(cp - 1));
|
|
FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
|
|
}
|
|
css = miss(v, d, css, co, cp, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
css->lastseen = cp;
|
|
|
|
/*
|
|
* This is the main text-scanning loop. It seems worth having two copies
|
|
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
|
|
* builds, when you're not actively tracing.
|
|
*/
|
|
#ifdef REG_DEBUG
|
|
if (v->eflags & REG_FTRACE)
|
|
{
|
|
while (cp < realstop)
|
|
{
|
|
FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
while (cp < realstop)
|
|
{
|
|
co = GETCOLOR(cm, *cp);
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
|
|
if (ISERR())
|
|
return NULL;
|
|
|
|
/* shutdown */
|
|
FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
|
|
if (cp == v->stop && stop == v->stop)
|
|
{
|
|
if (hitstopp != NULL)
|
|
*hitstopp = 1;
|
|
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
ss = miss(v, d, css, co, cp, start);
|
|
if (ISERR())
|
|
return NULL;
|
|
/* special case: match ended at eol? */
|
|
if (ss != NULL && (ss->flags & POSTSTATE))
|
|
return cp;
|
|
else if (ss != NULL)
|
|
ss->lastseen = cp; /* to be tidy */
|
|
}
|
|
|
|
/* find last match, if any */
|
|
post = d->lastpost;
|
|
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
|
if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
|
|
(post == NULL || post < ss->lastseen))
|
|
post = ss->lastseen;
|
|
if (post != NULL) /* found one */
|
|
return post - 1;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* shortest - shortest-preferred matching engine
|
|
*
|
|
* On success, returns match endpoint address. Returns NULL on no match.
|
|
* Internal errors also return NULL, with v->err set.
|
|
*/
|
|
static chr *
|
|
shortest(struct vars *v,
|
|
struct dfa *d,
|
|
chr *start, /* where the match should start */
|
|
chr *min, /* match must end at or after here */
|
|
chr *max, /* match must end at or before here */
|
|
chr **coldp, /* store coldstart pointer here, if non-NULL */
|
|
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
|
{
|
|
chr *cp;
|
|
chr *realmin = (min == v->stop) ? min : min + 1;
|
|
chr *realmax = (max == v->stop) ? max : max + 1;
|
|
color co;
|
|
struct sset *css;
|
|
struct sset *ss;
|
|
struct colormap *cm = d->cm;
|
|
|
|
/* prevent "uninitialized variable" warnings */
|
|
if (coldp != NULL)
|
|
*coldp = NULL;
|
|
if (hitstopp != NULL)
|
|
*hitstopp = 0;
|
|
|
|
/* if this is a backref to a known string, just match against that */
|
|
if (d->backno >= 0)
|
|
{
|
|
assert((size_t) d->backno < v->nmatch);
|
|
if (v->pmatch[d->backno].rm_so >= 0)
|
|
{
|
|
cp = dfa_backref(v, d, start, min, max, true);
|
|
if (cp != NULL && coldp != NULL)
|
|
*coldp = start;
|
|
/* there is no case where we should set *hitstopp */
|
|
return cp;
|
|
}
|
|
}
|
|
|
|
/* fast path for matchall NFAs */
|
|
if (d->cnfa->flags & MATCHALL)
|
|
{
|
|
size_t nchr = min - start;
|
|
|
|
if (d->cnfa->maxmatchall != DUPINF &&
|
|
nchr > d->cnfa->maxmatchall)
|
|
return NULL;
|
|
if ((max - start) < d->cnfa->minmatchall)
|
|
return NULL;
|
|
if (nchr < d->cnfa->minmatchall)
|
|
min = start + d->cnfa->minmatchall;
|
|
if (coldp != NULL)
|
|
*coldp = start;
|
|
/* there is no case where we should set *hitstopp */
|
|
return min;
|
|
}
|
|
|
|
/* initialize */
|
|
css = initialize(v, d, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
cp = start;
|
|
|
|
/* startup */
|
|
FDEBUG(("--- startup ---\n"));
|
|
if (cp == v->start)
|
|
{
|
|
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
}
|
|
else
|
|
{
|
|
co = GETCOLOR(cm, *(cp - 1));
|
|
FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
|
|
}
|
|
css = miss(v, d, css, co, cp, start);
|
|
if (css == NULL)
|
|
return NULL;
|
|
css->lastseen = cp;
|
|
ss = css;
|
|
|
|
/*
|
|
* This is the main text-scanning loop. It seems worth having two copies
|
|
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
|
|
* builds, when you're not actively tracing.
|
|
*/
|
|
#ifdef REG_DEBUG
|
|
if (v->eflags & REG_FTRACE)
|
|
{
|
|
while (cp < realmax)
|
|
{
|
|
FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
if ((ss->flags & POSTSTATE) && cp >= realmin)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
while (cp < realmax)
|
|
{
|
|
co = GETCOLOR(cm, *cp);
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
if ((ss->flags & POSTSTATE) && cp >= realmin)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
}
|
|
|
|
if (ss == NULL)
|
|
return NULL;
|
|
|
|
if (coldp != NULL) /* report last no-progress state set, if any */
|
|
*coldp = lastcold(v, d);
|
|
|
|
if ((ss->flags & POSTSTATE) && cp > min)
|
|
{
|
|
assert(cp >= realmin);
|
|
cp--;
|
|
}
|
|
else if (cp == v->stop && max == v->stop)
|
|
{
|
|
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
ss = miss(v, d, css, co, cp, start);
|
|
/* match might have ended at eol */
|
|
if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
|
|
*hitstopp = 1;
|
|
}
|
|
|
|
if (ss == NULL || !(ss->flags & POSTSTATE))
|
|
return NULL;
|
|
|
|
return cp;
|
|
}
|
|
|
|
/*
|
|
* matchuntil - incremental matching engine
|
|
*
|
|
* This is meant for use with a search-style NFA (that is, the pattern is
|
|
* known to act as though it had a leading .*). We determine whether a
|
|
* match exists starting at v->start and ending at probe. Multiple calls
|
|
* require only O(N) time not O(N^2) so long as the probe values are
|
|
* nondecreasing. *lastcss and *lastcp must be initialized to NULL before
|
|
* starting a series of calls.
|
|
*
|
|
* Returns 1 if a match exists, 0 if not.
|
|
* Internal errors also return 0, with v->err set.
|
|
*/
|
|
static int
|
|
matchuntil(struct vars *v,
|
|
struct dfa *d,
|
|
chr *probe, /* we want to know if a match ends here */
|
|
struct sset **lastcss, /* state storage across calls */
|
|
chr **lastcp) /* state storage across calls */
|
|
{
|
|
chr *cp = *lastcp;
|
|
color co;
|
|
struct sset *css = *lastcss;
|
|
struct sset *ss;
|
|
struct colormap *cm = d->cm;
|
|
|
|
/* fast path for matchall NFAs */
|
|
if (d->cnfa->flags & MATCHALL)
|
|
{
|
|
size_t nchr = probe - v->start;
|
|
|
|
if (nchr < d->cnfa->minmatchall)
|
|
return 0;
|
|
/* maxmatchall will always be infinity, cf. makesearch() */
|
|
assert(d->cnfa->maxmatchall == DUPINF);
|
|
return 1;
|
|
}
|
|
|
|
/* initialize and startup, or restart, if necessary */
|
|
if (cp == NULL || cp > probe)
|
|
{
|
|
cp = v->start;
|
|
css = initialize(v, d, cp);
|
|
if (css == NULL)
|
|
return 0;
|
|
|
|
FDEBUG((">>> startup >>>\n"));
|
|
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
|
|
css = miss(v, d, css, co, cp, v->start);
|
|
if (css == NULL)
|
|
return 0;
|
|
css->lastseen = cp;
|
|
}
|
|
else if (css == NULL)
|
|
{
|
|
/* we previously found that no match is possible beyond *lastcp */
|
|
return 0;
|
|
}
|
|
ss = css;
|
|
|
|
/*
|
|
* This is the main text-scanning loop. It seems worth having two copies
|
|
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
|
|
* builds, when you're not actively tracing.
|
|
*/
|
|
#ifdef REG_DEBUG
|
|
if (v->eflags & REG_FTRACE)
|
|
{
|
|
while (cp < probe)
|
|
{
|
|
FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, v->start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
while (cp < probe)
|
|
{
|
|
co = GETCOLOR(cm, *cp);
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
{
|
|
ss = miss(v, d, css, co, cp + 1, v->start);
|
|
if (ss == NULL)
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
cp++;
|
|
ss->lastseen = cp;
|
|
css = ss;
|
|
}
|
|
}
|
|
|
|
*lastcss = ss;
|
|
*lastcp = cp;
|
|
|
|
if (ss == NULL)
|
|
return 0; /* impossible match, or internal error */
|
|
|
|
/* We need to process one more chr, or the EOS symbol, to check match */
|
|
if (cp < v->stop)
|
|
{
|
|
FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
|
|
co = GETCOLOR(cm, *cp);
|
|
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
|
ss = css->outs[co];
|
|
if (ss == NULL)
|
|
ss = miss(v, d, css, co, cp + 1, v->start);
|
|
}
|
|
else
|
|
{
|
|
assert(cp == v->stop);
|
|
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
|
FDEBUG(("color %ld\n", (long) co));
|
|
ss = miss(v, d, css, co, cp, v->start);
|
|
}
|
|
|
|
if (ss == NULL || !(ss->flags & POSTSTATE))
|
|
return 0;
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* dfa_backref - find best match length for a known backref string
|
|
*
|
|
* When the backref's referent is already available, we can deliver an exact
|
|
* answer with considerably less work than running the backref node's NFA.
|
|
*
|
|
* Return match endpoint for longest or shortest valid repeated match,
|
|
* or NULL if there is no valid match.
|
|
*
|
|
* Should be in sync with cbrdissect(), although that has the different task
|
|
* of checking a match to a predetermined section of the string.
|
|
*/
|
|
static chr *
|
|
dfa_backref(struct vars *v,
|
|
struct dfa *d,
|
|
chr *start, /* where the match should start */
|
|
chr *min, /* match must end at or after here */
|
|
chr *max, /* match must end at or before here */
|
|
bool shortest)
|
|
{
|
|
int n = d->backno;
|
|
int backmin = d->backmin;
|
|
int backmax = d->backmax;
|
|
size_t numreps;
|
|
size_t minreps;
|
|
size_t maxreps;
|
|
size_t brlen;
|
|
chr *brstring;
|
|
chr *p;
|
|
|
|
/* get the backreferenced string (caller should have checked this) */
|
|
if (v->pmatch[n].rm_so == -1)
|
|
return NULL;
|
|
brstring = v->start + v->pmatch[n].rm_so;
|
|
brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
|
|
|
|
/* special-case zero-length backreference to avoid divide by zero */
|
|
if (brlen == 0)
|
|
{
|
|
/*
|
|
* matches only a zero-length string, but any number of repetitions
|
|
* can be considered to be present
|
|
*/
|
|
if (min == start && backmin <= backmax)
|
|
return start;
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* convert min and max into numbers of possible repetitions of the backref
|
|
* string, rounding appropriately
|
|
*/
|
|
if (min <= start)
|
|
minreps = 0;
|
|
else
|
|
minreps = (min - start - 1) / brlen + 1;
|
|
maxreps = (max - start) / brlen;
|
|
|
|
/* apply bounds, then see if there is any allowed match length */
|
|
if (minreps < backmin)
|
|
minreps = backmin;
|
|
if (backmax != DUPINF && maxreps > backmax)
|
|
maxreps = backmax;
|
|
if (maxreps < minreps)
|
|
return NULL;
|
|
|
|
/* quick exit if zero-repetitions match is valid and preferred */
|
|
if (shortest && minreps == 0)
|
|
return start;
|
|
|
|
/* okay, compare the actual string contents */
|
|
p = start;
|
|
numreps = 0;
|
|
while (numreps < maxreps)
|
|
{
|
|
if ((*v->g->compare) (brstring, p, brlen) != 0)
|
|
break;
|
|
p += brlen;
|
|
numreps++;
|
|
if (shortest && numreps >= minreps)
|
|
break;
|
|
}
|
|
|
|
if (numreps >= minreps)
|
|
return p;
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* lastcold - determine last point at which no progress had been made
|
|
*/
|
|
static chr * /* endpoint, or NULL */
|
|
lastcold(struct vars *v,
|
|
struct dfa *d)
|
|
{
|
|
struct sset *ss;
|
|
chr *nopr;
|
|
int i;
|
|
|
|
nopr = d->lastnopr;
|
|
if (nopr == NULL)
|
|
nopr = v->start;
|
|
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
|
if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
|
|
nopr = ss->lastseen;
|
|
return nopr;
|
|
}
|
|
|
|
/*
|
|
* newdfa - set up a fresh DFA
|
|
*
|
|
* Returns NULL (and sets v->err) on failure.
|
|
*/
|
|
static struct dfa *
|
|
newdfa(struct vars *v,
|
|
struct cnfa *cnfa,
|
|
struct colormap *cm,
|
|
struct smalldfa *sml) /* preallocated space, may be NULL */
|
|
{
|
|
struct dfa *d;
|
|
size_t nss = cnfa->nstates * 2;
|
|
int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
|
|
bool ismalloced = false;
|
|
|
|
assert(cnfa != NULL && cnfa->nstates != 0);
|
|
|
|
if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
|
|
{
|
|
assert(wordsper == 1);
|
|
if (sml == NULL)
|
|
{
|
|
sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
|
|
if (sml == NULL)
|
|
{
|
|
ERR(REG_ESPACE);
|
|
return NULL;
|
|
}
|
|
ismalloced = true;
|
|
}
|
|
d = &sml->dfa;
|
|
d->ssets = sml->ssets;
|
|
d->statesarea = sml->statesarea;
|
|
d->work = &d->statesarea[nss];
|
|
d->outsarea = sml->outsarea;
|
|
d->incarea = sml->incarea;
|
|
d->ismalloced = ismalloced;
|
|
d->arraysmalloced = false; /* not separately allocated, anyway */
|
|
}
|
|
else
|
|
{
|
|
d = (struct dfa *) MALLOC(sizeof(struct dfa));
|
|
if (d == NULL)
|
|
{
|
|
ERR(REG_ESPACE);
|
|
return NULL;
|
|
}
|
|
d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
|
|
d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
|
|
sizeof(unsigned));
|
|
d->work = &d->statesarea[nss * wordsper];
|
|
d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
|
|
sizeof(struct sset *));
|
|
d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
|
|
sizeof(struct arcp));
|
|
d->ismalloced = true;
|
|
d->arraysmalloced = true;
|
|
/* now freedfa() will behave sanely */
|
|
if (d->ssets == NULL || d->statesarea == NULL ||
|
|
d->outsarea == NULL || d->incarea == NULL)
|
|
{
|
|
freedfa(d);
|
|
ERR(REG_ESPACE);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
|
|
d->nssused = 0;
|
|
d->nstates = cnfa->nstates;
|
|
d->ncolors = cnfa->ncolors;
|
|
d->wordsper = wordsper;
|
|
d->cnfa = cnfa;
|
|
d->cm = cm;
|
|
d->lastpost = NULL;
|
|
d->lastnopr = NULL;
|
|
d->search = d->ssets;
|
|
d->backno = -1; /* may be set by caller */
|
|
d->backmin = d->backmax = 0;
|
|
|
|
/* initialization of sset fields is done as needed */
|
|
|
|
return d;
|
|
}
|
|
|
|
/*
|
|
* freedfa - free a DFA
|
|
*/
|
|
static void
|
|
freedfa(struct dfa *d)
|
|
{
|
|
if (d->arraysmalloced)
|
|
{
|
|
if (d->ssets != NULL)
|
|
FREE(d->ssets);
|
|
if (d->statesarea != NULL)
|
|
FREE(d->statesarea);
|
|
if (d->outsarea != NULL)
|
|
FREE(d->outsarea);
|
|
if (d->incarea != NULL)
|
|
FREE(d->incarea);
|
|
}
|
|
|
|
if (d->ismalloced)
|
|
FREE(d);
|
|
}
|
|
|
|
/*
|
|
* hash - construct a hash code for a bitvector
|
|
*
|
|
* There are probably better ways, but they're more expensive.
|
|
*/
|
|
static unsigned
|
|
hash(unsigned *uv,
|
|
int n)
|
|
{
|
|
int i;
|
|
unsigned h;
|
|
|
|
h = 0;
|
|
for (i = 0; i < n; i++)
|
|
h ^= uv[i];
|
|
return h;
|
|
}
|
|
|
|
/*
|
|
* initialize - hand-craft a cache entry for startup, otherwise get ready
|
|
*/
|
|
static struct sset *
|
|
initialize(struct vars *v,
|
|
struct dfa *d,
|
|
chr *start)
|
|
{
|
|
struct sset *ss;
|
|
int i;
|
|
|
|
/* is previous one still there? */
|
|
if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
|
|
ss = &d->ssets[0];
|
|
else
|
|
{ /* no, must (re)build it */
|
|
ss = getvacant(v, d, start, start);
|
|
if (ss == NULL)
|
|
return NULL;
|
|
for (i = 0; i < d->wordsper; i++)
|
|
ss->states[i] = 0;
|
|
BSET(ss->states, d->cnfa->pre);
|
|
ss->hash = HASH(ss->states, d->wordsper);
|
|
assert(d->cnfa->pre != d->cnfa->post);
|
|
ss->flags = STARTER | LOCKED | NOPROGRESS;
|
|
/* lastseen dealt with below */
|
|
}
|
|
|
|
for (i = 0; i < d->nssused; i++)
|
|
d->ssets[i].lastseen = NULL;
|
|
ss->lastseen = start; /* maybe untrue, but harmless */
|
|
d->lastpost = NULL;
|
|
d->lastnopr = NULL;
|
|
return ss;
|
|
}
|
|
|
|
/*
|
|
* miss - handle a stateset cache miss
|
|
*
|
|
* css is the current stateset, co is the color of the current input character,
|
|
* cp points to the character after that (which is where we may need to test
|
|
* LACONs). start does not affect matching behavior but is needed for pickss'
|
|
* heuristics about which stateset cache entry to replace.
|
|
*
|
|
* Ordinarily, returns the address of the next stateset (the one that is
|
|
* valid after consuming the input character). Returns NULL if no valid
|
|
* NFA states remain, ie we have a certain match failure.
|
|
* Internal errors also return NULL, with v->err set.
|
|
*/
|
|
static struct sset *
|
|
miss(struct vars *v,
|
|
struct dfa *d,
|
|
struct sset *css,
|
|
color co,
|
|
chr *cp, /* next chr */
|
|
chr *start) /* where the attempt got started */
|
|
{
|
|
struct cnfa *cnfa = d->cnfa;
|
|
int i;
|
|
unsigned h;
|
|
struct carc *ca;
|
|
struct sset *p;
|
|
int ispseudocolor;
|
|
int ispost;
|
|
int noprogress;
|
|
int gotstate;
|
|
int dolacons;
|
|
int sawlacons;
|
|
|
|
/* for convenience, we can be called even if it might not be a miss */
|
|
if (css->outs[co] != NULL)
|
|
{
|
|
FDEBUG(("hit\n"));
|
|
return css->outs[co];
|
|
}
|
|
FDEBUG(("miss\n"));
|
|
|
|
/*
|
|
* Checking for operation cancel in the inner text search loop seems
|
|
* unduly expensive. As a compromise, check during cache misses.
|
|
*/
|
|
INTERRUPT(v->re);
|
|
|
|
/*
|
|
* What set of states would we end up in after consuming the co character?
|
|
* We first consider PLAIN arcs that consume the character, and then look
|
|
* to see what LACON arcs could be traversed after consuming it.
|
|
*/
|
|
for (i = 0; i < d->wordsper; i++)
|
|
d->work[i] = 0; /* build new stateset bitmap in d->work */
|
|
ispseudocolor = d->cm->cd[co].flags & PSEUDO;
|
|
ispost = 0;
|
|
noprogress = 1;
|
|
gotstate = 0;
|
|
for (i = 0; i < d->nstates; i++)
|
|
if (ISBSET(css->states, i))
|
|
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
|
|
if (ca->co == co ||
|
|
(ca->co == RAINBOW && !ispseudocolor))
|
|
{
|
|
BSET(d->work, ca->to);
|
|
gotstate = 1;
|
|
if (ca->to == cnfa->post)
|
|
ispost = 1;
|
|
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
|
|
noprogress = 0;
|
|
FDEBUG(("%d -> %d\n", i, ca->to));
|
|
}
|
|
if (!gotstate)
|
|
return NULL; /* character cannot reach any new state */
|
|
dolacons = (cnfa->flags & HASLACONS);
|
|
sawlacons = 0;
|
|
/* outer loop handles transitive closure of reachable-by-LACON states */
|
|
while (dolacons)
|
|
{
|
|
dolacons = 0;
|
|
for (i = 0; i < d->nstates; i++)
|
|
if (ISBSET(d->work, i))
|
|
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
|
|
{
|
|
if (ca->co < cnfa->ncolors)
|
|
continue; /* not a LACON arc */
|
|
if (ISBSET(d->work, ca->to))
|
|
continue; /* arc would be a no-op anyway */
|
|
sawlacons = 1; /* this LACON affects our result */
|
|
if (!lacon(v, cnfa, cp, ca->co))
|
|
{
|
|
if (ISERR())
|
|
return NULL;
|
|
continue; /* LACON arc cannot be traversed */
|
|
}
|
|
if (ISERR())
|
|
return NULL;
|
|
BSET(d->work, ca->to);
|
|
dolacons = 1;
|
|
if (ca->to == cnfa->post)
|
|
ispost = 1;
|
|
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
|
|
noprogress = 0;
|
|
FDEBUG(("%d :> %d\n", i, ca->to));
|
|
}
|
|
}
|
|
h = HASH(d->work, d->wordsper);
|
|
|
|
/* Is this stateset already in the cache? */
|
|
for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
|
|
if (HIT(h, d->work, p, d->wordsper))
|
|
{
|
|
FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
if (i == 0)
|
|
{ /* nope, need a new cache entry */
|
|
p = getvacant(v, d, cp, start);
|
|
if (p == NULL)
|
|
return NULL;
|
|
assert(p != css);
|
|
for (i = 0; i < d->wordsper; i++)
|
|
p->states[i] = d->work[i];
|
|
p->hash = h;
|
|
p->flags = (ispost) ? POSTSTATE : 0;
|
|
if (noprogress)
|
|
p->flags |= NOPROGRESS;
|
|
/* lastseen to be dealt with by caller */
|
|
}
|
|
|
|
/*
|
|
* Link new stateset to old, unless a LACON affected the result, in which
|
|
* case we don't create the link. That forces future transitions across
|
|
* this same arc (same prior stateset and character color) to come through
|
|
* miss() again, so that we can recheck the LACON(s), which might or might
|
|
* not pass since context will be different.
|
|
*/
|
|
if (!sawlacons)
|
|
{
|
|
FDEBUG(("c%d[%d]->c%d\n",
|
|
(int) (css - d->ssets), co, (int) (p - d->ssets)));
|
|
css->outs[co] = p;
|
|
css->inchain[co] = p->ins;
|
|
p->ins.ss = css;
|
|
p->ins.co = co;
|
|
}
|
|
return p;
|
|
}
|
|
|
|
/*
|
|
* lacon - lookaround-constraint checker for miss()
|
|
*/
|
|
static int /* predicate: constraint satisfied? */
|
|
lacon(struct vars *v,
|
|
struct cnfa *pcnfa, /* parent cnfa */
|
|
chr *cp,
|
|
color co) /* "color" of the lookaround constraint */
|
|
{
|
|
int n;
|
|
struct subre *sub;
|
|
struct dfa *d;
|
|
chr *end;
|
|
int satisfied;
|
|
|
|
/* Since this is recursive, it could be driven to stack overflow */
|
|
if (STACK_TOO_DEEP(v->re))
|
|
{
|
|
ERR(REG_ETOOBIG);
|
|
return 0;
|
|
}
|
|
|
|
n = co - pcnfa->ncolors;
|
|
assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
|
|
FDEBUG(("=== testing lacon %d\n", n));
|
|
sub = &v->g->lacons[n];
|
|
d = getladfa(v, n);
|
|
if (d == NULL)
|
|
return 0;
|
|
if (LATYPE_IS_AHEAD(sub->latype))
|
|
{
|
|
/* used to use longest() here, but shortest() could be much cheaper */
|
|
end = shortest(v, d, cp, cp, v->stop,
|
|
(chr **) NULL, (int *) NULL);
|
|
satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* To avoid doing O(N^2) work when repeatedly testing a lookbehind
|
|
* constraint in an N-character string, we use matchuntil() which can
|
|
* cache the DFA state across calls. We only need to restart if the
|
|
* probe point decreases, which is not common. The NFA we're using is
|
|
* a search NFA, so it doesn't mind scanning over stuff before the
|
|
* nominal match.
|
|
*/
|
|
satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
|
|
if (!LATYPE_IS_POS(sub->latype))
|
|
satisfied = !satisfied;
|
|
}
|
|
FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
|
|
return satisfied;
|
|
}
|
|
|
|
/*
|
|
* getvacant - get a vacant state set
|
|
*
|
|
* This routine clears out the inarcs and outarcs, but does not otherwise
|
|
* clear the innards of the state set -- that's up to the caller.
|
|
*/
|
|
static struct sset *
|
|
getvacant(struct vars *v,
|
|
struct dfa *d,
|
|
chr *cp,
|
|
chr *start)
|
|
{
|
|
int i;
|
|
struct sset *ss;
|
|
struct sset *p;
|
|
struct arcp ap;
|
|
color co;
|
|
|
|
ss = pickss(v, d, cp, start);
|
|
if (ss == NULL)
|
|
return NULL;
|
|
assert(!(ss->flags & LOCKED));
|
|
|
|
/* clear out its inarcs, including self-referential ones */
|
|
ap = ss->ins;
|
|
while ((p = ap.ss) != NULL)
|
|
{
|
|
co = ap.co;
|
|
FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
|
|
p->outs[co] = NULL;
|
|
ap = p->inchain[co];
|
|
p->inchain[co].ss = NULL; /* paranoia */
|
|
}
|
|
ss->ins.ss = NULL;
|
|
|
|
/* take it off the inarc chains of the ssets reached by its outarcs */
|
|
for (i = 0; i < d->ncolors; i++)
|
|
{
|
|
p = ss->outs[i];
|
|
assert(p != ss); /* not self-referential */
|
|
if (p == NULL)
|
|
continue; /* NOTE CONTINUE */
|
|
FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
|
|
if (p->ins.ss == ss && p->ins.co == i)
|
|
p->ins = ss->inchain[i];
|
|
else
|
|
{
|
|
struct arcp lastap = {NULL, 0};
|
|
|
|
assert(p->ins.ss != NULL);
|
|
for (ap = p->ins; ap.ss != NULL &&
|
|
!(ap.ss == ss && ap.co == i);
|
|
ap = ap.ss->inchain[ap.co])
|
|
lastap = ap;
|
|
assert(ap.ss != NULL);
|
|
lastap.ss->inchain[lastap.co] = ss->inchain[i];
|
|
}
|
|
ss->outs[i] = NULL;
|
|
ss->inchain[i].ss = NULL;
|
|
}
|
|
|
|
/* if ss was a success state, may need to remember location */
|
|
if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
|
|
(d->lastpost == NULL || d->lastpost < ss->lastseen))
|
|
d->lastpost = ss->lastseen;
|
|
|
|
/* likewise for a no-progress state */
|
|
if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
|
|
(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
|
|
d->lastnopr = ss->lastseen;
|
|
|
|
return ss;
|
|
}
|
|
|
|
/*
|
|
* pickss - pick the next stateset to be used
|
|
*/
|
|
static struct sset *
|
|
pickss(struct vars *v,
|
|
struct dfa *d,
|
|
chr *cp,
|
|
chr *start)
|
|
{
|
|
int i;
|
|
struct sset *ss;
|
|
struct sset *end;
|
|
chr *ancient;
|
|
|
|
/* shortcut for cases where cache isn't full */
|
|
if (d->nssused < d->nssets)
|
|
{
|
|
i = d->nssused;
|
|
d->nssused++;
|
|
ss = &d->ssets[i];
|
|
FDEBUG(("new c%d\n", i));
|
|
/* set up innards */
|
|
ss->states = &d->statesarea[i * d->wordsper];
|
|
ss->flags = 0;
|
|
ss->ins.ss = NULL;
|
|
ss->ins.co = WHITE; /* give it some value */
|
|
ss->outs = &d->outsarea[i * d->ncolors];
|
|
ss->inchain = &d->incarea[i * d->ncolors];
|
|
for (i = 0; i < d->ncolors; i++)
|
|
{
|
|
ss->outs[i] = NULL;
|
|
ss->inchain[i].ss = NULL;
|
|
}
|
|
return ss;
|
|
}
|
|
|
|
/* look for oldest, or old enough anyway */
|
|
if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
|
|
ancient = cp - d->nssets * 2 / 3;
|
|
else
|
|
ancient = start;
|
|
for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
|
|
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
|
!(ss->flags & LOCKED))
|
|
{
|
|
d->search = ss + 1;
|
|
FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
|
|
return ss;
|
|
}
|
|
for (ss = d->ssets, end = d->search; ss < end; ss++)
|
|
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
|
!(ss->flags & LOCKED))
|
|
{
|
|
d->search = ss + 1;
|
|
FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
|
|
return ss;
|
|
}
|
|
|
|
/* nobody's old enough?!? -- something's really wrong */
|
|
FDEBUG(("cannot find victim to replace!\n"));
|
|
ERR(REG_ASSERT);
|
|
return NULL;
|
|
}
|