mirror of
https://github.com/postgres/postgres.git
synced 2025-07-30 11:03:19 +03:00
Improve performance of regular expression back-references.
In some cases, at the time that we're doing an NFA-based precheck of whether a backref subexpression can match at a particular place in the string, we already know which substring the referenced subexpression matched. If so, we might as well forget about the NFA and just compare the substring; this is faster and it gives an exact rather than approximate answer. In general, this optimization can help while we are prechecking within the second child expression of a concat node, while the capture was within the first child expression; then the substring was saved during cdissect() of the first child and will be available to NFA checks done while cdissect() recurses into the second child. It can help quite a lot if the tree looks like concat / \ capture concat / \ expensive stuff backref as we will be able to avoid recursively dissecting the "expensive stuff" before discovering that the backref isn't satisfied with a particular midpoint that the lower concat node is testing. This doesn't help if the concat tree is left-deep, as the capture node won't get set soon enough (and it's hard to fix that without changing the engine's match behavior). Fortunately, right-deep concat trees are the common case. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/661609.1614560029@sss.pgh.pa.us
This commit is contained in:
@ -58,6 +58,19 @@ longest(struct vars *v,
|
|||||||
if (hitstopp != NULL)
|
if (hitstopp != NULL)
|
||||||
*hitstopp = 0;
|
*hitstopp = 0;
|
||||||
|
|
||||||
|
/* if this is a backref to a known string, just match against that */
|
||||||
|
if (d->backno >= 0)
|
||||||
|
{
|
||||||
|
assert((size_t) d->backno < v->nmatch);
|
||||||
|
if (v->pmatch[d->backno].rm_so >= 0)
|
||||||
|
{
|
||||||
|
cp = dfa_backref(v, d, start, start, stop, false);
|
||||||
|
if (cp == v->stop && stop == v->stop && hitstopp != NULL)
|
||||||
|
*hitstopp = 1;
|
||||||
|
return cp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* fast path for matchall NFAs */
|
/* fast path for matchall NFAs */
|
||||||
if (d->cnfa->flags & MATCHALL)
|
if (d->cnfa->flags & MATCHALL)
|
||||||
{
|
{
|
||||||
@ -210,6 +223,20 @@ shortest(struct vars *v,
|
|||||||
if (hitstopp != NULL)
|
if (hitstopp != NULL)
|
||||||
*hitstopp = 0;
|
*hitstopp = 0;
|
||||||
|
|
||||||
|
/* if this is a backref to a known string, just match against that */
|
||||||
|
if (d->backno >= 0)
|
||||||
|
{
|
||||||
|
assert((size_t) d->backno < v->nmatch);
|
||||||
|
if (v->pmatch[d->backno].rm_so >= 0)
|
||||||
|
{
|
||||||
|
cp = dfa_backref(v, d, start, min, max, true);
|
||||||
|
if (cp != NULL && coldp != NULL)
|
||||||
|
*coldp = start;
|
||||||
|
/* there is no case where we should set *hitstopp */
|
||||||
|
return cp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* fast path for matchall NFAs */
|
/* fast path for matchall NFAs */
|
||||||
if (d->cnfa->flags & MATCHALL)
|
if (d->cnfa->flags & MATCHALL)
|
||||||
{
|
{
|
||||||
@ -467,6 +494,94 @@ matchuntil(struct vars *v,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* dfa_backref - find best match length for a known backref string
|
||||||
|
*
|
||||||
|
* When the backref's referent is already available, we can deliver an exact
|
||||||
|
* answer with considerably less work than running the backref node's NFA.
|
||||||
|
*
|
||||||
|
* Return match endpoint for longest or shortest valid repeated match,
|
||||||
|
* or NULL if there is no valid match.
|
||||||
|
*
|
||||||
|
* Should be in sync with cbrdissect(), although that has the different task
|
||||||
|
* of checking a match to a predetermined section of the string.
|
||||||
|
*/
|
||||||
|
static chr *
|
||||||
|
dfa_backref(struct vars *v,
|
||||||
|
struct dfa *d,
|
||||||
|
chr *start, /* where the match should start */
|
||||||
|
chr *min, /* match must end at or after here */
|
||||||
|
chr *max, /* match must end at or before here */
|
||||||
|
bool shortest)
|
||||||
|
{
|
||||||
|
int n = d->backno;
|
||||||
|
int backmin = d->backmin;
|
||||||
|
int backmax = d->backmax;
|
||||||
|
size_t numreps;
|
||||||
|
size_t minreps;
|
||||||
|
size_t maxreps;
|
||||||
|
size_t brlen;
|
||||||
|
chr *brstring;
|
||||||
|
chr *p;
|
||||||
|
|
||||||
|
/* get the backreferenced string (caller should have checked this) */
|
||||||
|
if (v->pmatch[n].rm_so == -1)
|
||||||
|
return NULL;
|
||||||
|
brstring = v->start + v->pmatch[n].rm_so;
|
||||||
|
brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
|
||||||
|
|
||||||
|
/* special-case zero-length backreference to avoid divide by zero */
|
||||||
|
if (brlen == 0)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* matches only a zero-length string, but any number of repetitions
|
||||||
|
* can be considered to be present
|
||||||
|
*/
|
||||||
|
if (min == start && backmin <= backmax)
|
||||||
|
return start;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* convert min and max into numbers of possible repetitions of the backref
|
||||||
|
* string, rounding appropriately
|
||||||
|
*/
|
||||||
|
if (min <= start)
|
||||||
|
minreps = 0;
|
||||||
|
else
|
||||||
|
minreps = (min - start - 1) / brlen + 1;
|
||||||
|
maxreps = (max - start) / brlen;
|
||||||
|
|
||||||
|
/* apply bounds, then see if there is any allowed match length */
|
||||||
|
if (minreps < backmin)
|
||||||
|
minreps = backmin;
|
||||||
|
if (backmax != DUPINF && maxreps > backmax)
|
||||||
|
maxreps = backmax;
|
||||||
|
if (maxreps < minreps)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* quick exit if zero-repetitions match is valid and preferred */
|
||||||
|
if (shortest && minreps == 0)
|
||||||
|
return start;
|
||||||
|
|
||||||
|
/* okay, compare the actual string contents */
|
||||||
|
p = start;
|
||||||
|
numreps = 0;
|
||||||
|
while (numreps < maxreps)
|
||||||
|
{
|
||||||
|
if ((*v->g->compare) (brstring, p, brlen) != 0)
|
||||||
|
break;
|
||||||
|
p += brlen;
|
||||||
|
numreps++;
|
||||||
|
if (shortest && numreps >= minreps)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numreps >= minreps)
|
||||||
|
return p;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* lastcold - determine last point at which no progress had been made
|
* lastcold - determine last point at which no progress had been made
|
||||||
*/
|
*/
|
||||||
@ -563,6 +678,8 @@ newdfa(struct vars *v,
|
|||||||
d->lastpost = NULL;
|
d->lastpost = NULL;
|
||||||
d->lastnopr = NULL;
|
d->lastnopr = NULL;
|
||||||
d->search = d->ssets;
|
d->search = d->ssets;
|
||||||
|
d->backno = -1; /* may be set by caller */
|
||||||
|
d->backmin = d->backmax = 0;
|
||||||
|
|
||||||
/* initialization of sset fields is done as needed */
|
/* initialization of sset fields is done as needed */
|
||||||
|
|
||||||
|
@ -77,6 +77,9 @@ struct dfa
|
|||||||
chr *lastpost; /* location of last cache-flushed success */
|
chr *lastpost; /* location of last cache-flushed success */
|
||||||
chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
|
chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
|
||||||
struct sset *search; /* replacement-search-pointer memory */
|
struct sset *search; /* replacement-search-pointer memory */
|
||||||
|
int backno; /* if DFA for a backref, subno it refers to */
|
||||||
|
short backmin; /* min repetitions for backref */
|
||||||
|
short backmax; /* max repetitions for backref */
|
||||||
bool ismalloced; /* should this struct dfa be freed? */
|
bool ismalloced; /* should this struct dfa be freed? */
|
||||||
bool arraysmalloced; /* should its subsidiary arrays be freed? */
|
bool arraysmalloced; /* should its subsidiary arrays be freed? */
|
||||||
};
|
};
|
||||||
@ -154,6 +157,7 @@ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
|
|||||||
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
|
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
|
||||||
static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
|
static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
|
||||||
static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
|
static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
|
||||||
|
static chr *dfa_backref(struct vars *, struct dfa *, chr *, chr *, chr *, bool);
|
||||||
static chr *lastcold(struct vars *, struct dfa *);
|
static chr *lastcold(struct vars *, struct dfa *);
|
||||||
static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
|
static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
|
||||||
static void freedfa(struct dfa *);
|
static void freedfa(struct dfa *);
|
||||||
@ -342,13 +346,23 @@ static struct dfa *
|
|||||||
getsubdfa(struct vars *v,
|
getsubdfa(struct vars *v,
|
||||||
struct subre *t)
|
struct subre *t)
|
||||||
{
|
{
|
||||||
if (v->subdfas[t->id] == NULL)
|
struct dfa *d = v->subdfas[t->id];
|
||||||
|
|
||||||
|
if (d == NULL)
|
||||||
{
|
{
|
||||||
v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
|
d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
|
||||||
if (ISERR())
|
if (ISERR())
|
||||||
return NULL;
|
return NULL;
|
||||||
|
/* set up additional info if this is a backref node */
|
||||||
|
if (t->op == 'b')
|
||||||
|
{
|
||||||
|
d->backno = t->backno;
|
||||||
|
d->backmin = t->min;
|
||||||
|
d->backmax = t->max;
|
||||||
|
}
|
||||||
|
v->subdfas[t->id] = d;
|
||||||
}
|
}
|
||||||
return v->subdfas[t->id];
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -369,6 +383,7 @@ getladfa(struct vars *v,
|
|||||||
v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
|
v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
|
||||||
if (ISERR())
|
if (ISERR())
|
||||||
return NULL;
|
return NULL;
|
||||||
|
/* a LACON can't contain a backref, so nothing else to do */
|
||||||
}
|
}
|
||||||
return v->ladfas[n];
|
return v->ladfas[n];
|
||||||
}
|
}
|
||||||
@ -927,6 +942,9 @@ crevcondissect(struct vars *v,
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* cbrdissect - dissect match for backref node
|
* cbrdissect - dissect match for backref node
|
||||||
|
*
|
||||||
|
* The backref match might already have been verified by dfa_backref(),
|
||||||
|
* but we don't know that for sure so must check it here.
|
||||||
*/
|
*/
|
||||||
static int /* regexec return code */
|
static int /* regexec return code */
|
||||||
cbrdissect(struct vars *v,
|
cbrdissect(struct vars *v,
|
||||||
|
Reference in New Issue
Block a user