mirror of
https://github.com/postgres/postgres.git
synced 2025-09-05 02:22:28 +03:00
Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has *only* capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug.
This commit is contained in:
@@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *);
|
||||
static int dissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int condissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int altdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int iterdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int reviterdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int cdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int ccondissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int crevdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int caltdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int citerdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
|
||||
|
||||
/* === rege_dfa.c === */
|
||||
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
|
||||
@@ -563,14 +567,17 @@ dissect(struct vars * v,
|
||||
case '=': /* terminal node */
|
||||
assert(t->left == NULL && t->right == NULL);
|
||||
return REG_OKAY; /* no action, parent did the work */
|
||||
case '|': /* alternation */
|
||||
assert(t->left != NULL);
|
||||
return altdissect(v, t, begin, end);
|
||||
case 'b': /* back ref -- shouldn't be calling us! */
|
||||
return REG_ASSERT;
|
||||
case '.': /* concatenation */
|
||||
assert(t->left != NULL && t->right != NULL);
|
||||
return condissect(v, t, begin, end);
|
||||
case '|': /* alternation */
|
||||
assert(t->left != NULL);
|
||||
return altdissect(v, t, begin, end);
|
||||
case '*': /* iteration */
|
||||
assert(t->left != NULL);
|
||||
return iterdissect(v, t, begin, end);
|
||||
case '(': /* capturing */
|
||||
assert(t->left != NULL && t->right == NULL);
|
||||
assert(t->subno > 0);
|
||||
@@ -696,6 +703,375 @@ altdissect(struct vars * v,
|
||||
return REG_ASSERT; /* none of them matched?!? */
|
||||
}
|
||||
|
||||
/*
|
||||
* iterdissect - iteration subexpression matches (uncomplicated)
|
||||
*/
|
||||
static int /* regexec return code */
|
||||
iterdissect(struct vars * v,
|
||||
struct subre * t,
|
||||
chr *begin, /* beginning of relevant substring */
|
||||
chr *end) /* end of same */
|
||||
{
|
||||
struct dfa *d;
|
||||
chr **endpts;
|
||||
chr *limit;
|
||||
int min_matches;
|
||||
size_t max_matches;
|
||||
int nverified;
|
||||
int k;
|
||||
int i;
|
||||
int er;
|
||||
|
||||
assert(t->op == '*');
|
||||
assert(t->left != NULL && t->left->cnfa.nstates > 0);
|
||||
assert(begin <= end);
|
||||
|
||||
if (t->left->flags & SHORTER) /* reverse scan */
|
||||
return reviterdissect(v, t, begin, end);
|
||||
|
||||
/*
|
||||
* If zero matches are allowed, and target string is empty, just declare
|
||||
* victory. OTOH, if target string isn't empty, zero matches can't work
|
||||
* so we pretend the min is 1.
|
||||
*/
|
||||
min_matches = t->min;
|
||||
if (min_matches <= 0)
|
||||
{
|
||||
if (begin == end)
|
||||
return REG_OKAY;
|
||||
min_matches = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need workspace to track the endpoints of each sub-match. Normally
|
||||
* we consider only nonzero-length sub-matches, so there can be at most
|
||||
* end-begin of them. However, if min is larger than that, we will also
|
||||
* consider zero-length sub-matches in order to find enough matches.
|
||||
*
|
||||
* For convenience, endpts[0] contains the "begin" pointer and we store
|
||||
* sub-match endpoints in endpts[1..max_matches].
|
||||
*/
|
||||
max_matches = end - begin;
|
||||
if (max_matches > t->max && t->max != INFINITY)
|
||||
max_matches = t->max;
|
||||
if (max_matches < min_matches)
|
||||
max_matches = min_matches;
|
||||
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
|
||||
if (endpts == NULL)
|
||||
return REG_ESPACE;
|
||||
endpts[0] = begin;
|
||||
|
||||
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
|
||||
if (ISERR())
|
||||
{
|
||||
FREE(endpts);
|
||||
return v->err;
|
||||
}
|
||||
MDEBUG(("iter %d\n", t->retry));
|
||||
|
||||
/*
|
||||
* Our strategy is to first find a set of sub-match endpoints that are
|
||||
* valid according to the child node's DFA, and then recursively dissect
|
||||
* each sub-match to confirm validity. If any validity check fails,
|
||||
* backtrack the last sub-match and try again. And, when we next try for
|
||||
* a validity check, we need not recheck any successfully verified
|
||||
* sub-matches that we didn't move the endpoints of. nverified remembers
|
||||
* how many sub-matches are currently known okay.
|
||||
*/
|
||||
|
||||
/* initialize to consider first sub-match */
|
||||
nverified = 0;
|
||||
k = 1;
|
||||
limit = end;
|
||||
|
||||
/* iterate until satisfaction or failure */
|
||||
while (k > 0)
|
||||
{
|
||||
/* try to find an endpoint for the k'th sub-match */
|
||||
endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
|
||||
if (endpts[k] == NULL)
|
||||
{
|
||||
/* no match possible, so see if we can shorten previous one */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
MDEBUG(("%d: working endpoint %d: %ld\n",
|
||||
t->retry, k, LOFF(endpts[k])));
|
||||
|
||||
/* k'th sub-match can no longer be considered verified */
|
||||
if (nverified >= k)
|
||||
nverified = k - 1;
|
||||
|
||||
if (endpts[k] != end)
|
||||
{
|
||||
/* haven't reached end yet, try another iteration if allowed */
|
||||
if (k >= max_matches)
|
||||
{
|
||||
/* must try to shorten some previous match */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
|
||||
/* reject zero-length match unless necessary to achieve min */
|
||||
if (endpts[k] == endpts[k - 1] &&
|
||||
(k >= min_matches || min_matches - k < end - endpts[k]))
|
||||
goto backtrack;
|
||||
|
||||
k++;
|
||||
limit = end;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've identified a way to divide the string into k sub-matches
|
||||
* that works so far as the child DFA can tell. If k is an allowed
|
||||
* number of matches, start the slow part: recurse to verify each
|
||||
* sub-match. We always have k <= max_matches, needn't check that.
|
||||
*/
|
||||
if (k < min_matches)
|
||||
goto backtrack;
|
||||
|
||||
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
|
||||
|
||||
for (i = nverified + 1; i <= k; i++)
|
||||
{
|
||||
er = dissect(v, t->left, endpts[i - 1], endpts[i]);
|
||||
if (er == REG_OKAY)
|
||||
{
|
||||
nverified = i;
|
||||
continue;
|
||||
}
|
||||
if (er == REG_NOMATCH)
|
||||
break;
|
||||
/* oops, something failed */
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return er;
|
||||
}
|
||||
|
||||
if (i > k)
|
||||
{
|
||||
/* satisfaction */
|
||||
MDEBUG(("%d successful\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_OKAY;
|
||||
}
|
||||
|
||||
/* match failed to verify, so backtrack */
|
||||
|
||||
backtrack:
|
||||
/*
|
||||
* Must consider shorter versions of the current sub-match. However,
|
||||
* we'll only ask for a zero-length match if necessary.
|
||||
*/
|
||||
while (k > 0)
|
||||
{
|
||||
chr *prev_end = endpts[k - 1];
|
||||
|
||||
if (endpts[k] > prev_end)
|
||||
{
|
||||
limit = endpts[k] - 1;
|
||||
if (limit > prev_end ||
|
||||
(k < min_matches && min_matches - k >= end - prev_end))
|
||||
{
|
||||
/* break out of backtrack loop, continue the outer one */
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* can't shorten k'th sub-match any more, consider previous one */
|
||||
k--;
|
||||
}
|
||||
}
|
||||
|
||||
/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
|
||||
MDEBUG(("%d failed\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_ASSERT;
|
||||
}
|
||||
|
||||
/*
|
||||
* reviterdissect - shortest-first iteration subexpression matches
|
||||
*/
|
||||
static int /* regexec return code */
|
||||
reviterdissect(struct vars * v,
|
||||
struct subre * t,
|
||||
chr *begin, /* beginning of relevant substring */
|
||||
chr *end) /* end of same */
|
||||
{
|
||||
struct dfa *d;
|
||||
chr **endpts;
|
||||
chr *limit;
|
||||
int min_matches;
|
||||
size_t max_matches;
|
||||
int nverified;
|
||||
int k;
|
||||
int i;
|
||||
int er;
|
||||
|
||||
assert(t->op == '*');
|
||||
assert(t->left != NULL && t->left->cnfa.nstates > 0);
|
||||
assert(t->left->flags & SHORTER);
|
||||
assert(begin <= end);
|
||||
|
||||
/*
|
||||
* If zero matches are allowed, and target string is empty, just declare
|
||||
* victory. OTOH, if target string isn't empty, zero matches can't work
|
||||
* so we pretend the min is 1.
|
||||
*/
|
||||
min_matches = t->min;
|
||||
if (min_matches <= 0)
|
||||
{
|
||||
if (begin == end)
|
||||
return REG_OKAY;
|
||||
min_matches = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need workspace to track the endpoints of each sub-match. Normally
|
||||
* we consider only nonzero-length sub-matches, so there can be at most
|
||||
* end-begin of them. However, if min is larger than that, we will also
|
||||
* consider zero-length sub-matches in order to find enough matches.
|
||||
*
|
||||
* For convenience, endpts[0] contains the "begin" pointer and we store
|
||||
* sub-match endpoints in endpts[1..max_matches].
|
||||
*/
|
||||
max_matches = end - begin;
|
||||
if (max_matches > t->max && t->max != INFINITY)
|
||||
max_matches = t->max;
|
||||
if (max_matches < min_matches)
|
||||
max_matches = min_matches;
|
||||
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
|
||||
if (endpts == NULL)
|
||||
return REG_ESPACE;
|
||||
endpts[0] = begin;
|
||||
|
||||
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
|
||||
if (ISERR())
|
||||
{
|
||||
FREE(endpts);
|
||||
return v->err;
|
||||
}
|
||||
MDEBUG(("reviter %d\n", t->retry));
|
||||
|
||||
/*
|
||||
* Our strategy is to first find a set of sub-match endpoints that are
|
||||
* valid according to the child node's DFA, and then recursively dissect
|
||||
* each sub-match to confirm validity. If any validity check fails,
|
||||
* backtrack the last sub-match and try again. And, when we next try for
|
||||
* a validity check, we need not recheck any successfully verified
|
||||
* sub-matches that we didn't move the endpoints of. nverified remembers
|
||||
* how many sub-matches are currently known okay.
|
||||
*/
|
||||
|
||||
/* initialize to consider first sub-match */
|
||||
nverified = 0;
|
||||
k = 1;
|
||||
limit = begin;
|
||||
|
||||
/* iterate until satisfaction or failure */
|
||||
while (k > 0)
|
||||
{
|
||||
/* disallow zero-length match unless necessary to achieve min */
|
||||
if (limit == endpts[k - 1] &&
|
||||
limit != end &&
|
||||
(k >= min_matches || min_matches - k < end - limit))
|
||||
limit++;
|
||||
|
||||
/* try to find an endpoint for the k'th sub-match */
|
||||
endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
|
||||
(chr **) NULL, (int *) NULL);
|
||||
if (endpts[k] == NULL)
|
||||
{
|
||||
/* no match possible, so see if we can lengthen previous one */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
MDEBUG(("%d: working endpoint %d: %ld\n",
|
||||
t->retry, k, LOFF(endpts[k])));
|
||||
|
||||
/* k'th sub-match can no longer be considered verified */
|
||||
if (nverified >= k)
|
||||
nverified = k - 1;
|
||||
|
||||
if (endpts[k] != end)
|
||||
{
|
||||
/* haven't reached end yet, try another iteration if allowed */
|
||||
if (k >= max_matches)
|
||||
{
|
||||
/* must try to lengthen some previous match */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
|
||||
k++;
|
||||
limit = endpts[k - 1];
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've identified a way to divide the string into k sub-matches
|
||||
* that works so far as the child DFA can tell. If k is an allowed
|
||||
* number of matches, start the slow part: recurse to verify each
|
||||
* sub-match. We always have k <= max_matches, needn't check that.
|
||||
*/
|
||||
if (k < min_matches)
|
||||
goto backtrack;
|
||||
|
||||
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
|
||||
|
||||
for (i = nverified + 1; i <= k; i++)
|
||||
{
|
||||
er = dissect(v, t->left, endpts[i - 1], endpts[i]);
|
||||
if (er == REG_OKAY)
|
||||
{
|
||||
nverified = i;
|
||||
continue;
|
||||
}
|
||||
if (er == REG_NOMATCH)
|
||||
break;
|
||||
/* oops, something failed */
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return er;
|
||||
}
|
||||
|
||||
if (i > k)
|
||||
{
|
||||
/* satisfaction */
|
||||
MDEBUG(("%d successful\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_OKAY;
|
||||
}
|
||||
|
||||
/* match failed to verify, so backtrack */
|
||||
|
||||
backtrack:
|
||||
/*
|
||||
* Must consider longer versions of the current sub-match.
|
||||
*/
|
||||
while (k > 0)
|
||||
{
|
||||
if (endpts[k] < end)
|
||||
{
|
||||
limit = endpts[k] + 1;
|
||||
/* break out of backtrack loop, continue the outer one */
|
||||
break;
|
||||
}
|
||||
/* can't lengthen k'th sub-match any more, consider previous one */
|
||||
k--;
|
||||
}
|
||||
}
|
||||
|
||||
/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
|
||||
MDEBUG(("%d failed\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_ASSERT;
|
||||
}
|
||||
|
||||
/*
|
||||
* cdissect - determine subexpression matches (with complications)
|
||||
* The retry memory stores the offset of the trial midpoint from begin,
|
||||
@@ -717,15 +1093,18 @@ cdissect(struct vars * v,
|
||||
case '=': /* terminal node */
|
||||
assert(t->left == NULL && t->right == NULL);
|
||||
return REG_OKAY; /* no action, parent did the work */
|
||||
case '|': /* alternation */
|
||||
assert(t->left != NULL);
|
||||
return caltdissect(v, t, begin, end);
|
||||
case 'b': /* back reference */
|
||||
assert(t->left == NULL && t->right == NULL);
|
||||
return cbrdissect(v, t, begin, end);
|
||||
case '.': /* concatenation */
|
||||
assert(t->left != NULL && t->right != NULL);
|
||||
return ccondissect(v, t, begin, end);
|
||||
case '|': /* alternation */
|
||||
assert(t->left != NULL);
|
||||
return caltdissect(v, t, begin, end);
|
||||
case '*': /* iteration */
|
||||
assert(t->left != NULL);
|
||||
return citerdissect(v, t, begin, end);
|
||||
case '(': /* capturing */
|
||||
assert(t->left != NULL && t->right == NULL);
|
||||
assert(t->subno > 0);
|
||||
@@ -847,7 +1226,7 @@ ccondissect(struct vars * v,
|
||||
}
|
||||
|
||||
/*
|
||||
* crevdissect - determine backref shortest-first subexpression matches
|
||||
* crevdissect - shortest-first concatenation subexpression matches
|
||||
* The retry memory stores the offset of the trial midpoint from begin,
|
||||
* plus 1 so that 0 uniquely means "clean slate".
|
||||
*/
|
||||
@@ -1088,6 +1467,377 @@ caltdissect(struct vars * v,
|
||||
return caltdissect(v, t->right, begin, end);
|
||||
}
|
||||
|
||||
/*
|
||||
* citerdissect - iteration subexpression matches (with complications)
|
||||
*/
|
||||
static int /* regexec return code */
|
||||
citerdissect(struct vars * v,
|
||||
struct subre * t,
|
||||
chr *begin, /* beginning of relevant substring */
|
||||
chr *end) /* end of same */
|
||||
{
|
||||
struct dfa *d;
|
||||
chr **endpts;
|
||||
chr *limit;
|
||||
int min_matches;
|
||||
size_t max_matches;
|
||||
int nverified;
|
||||
int k;
|
||||
int i;
|
||||
int er;
|
||||
|
||||
assert(t->op == '*');
|
||||
assert(t->left != NULL && t->left->cnfa.nstates > 0);
|
||||
assert(begin <= end);
|
||||
|
||||
if (t->left->flags & SHORTER) /* reverse scan */
|
||||
return creviterdissect(v, t, begin, end);
|
||||
|
||||
/*
|
||||
* If zero matches are allowed, and target string is empty, just declare
|
||||
* victory. OTOH, if target string isn't empty, zero matches can't work
|
||||
* so we pretend the min is 1.
|
||||
*/
|
||||
min_matches = t->min;
|
||||
if (min_matches <= 0)
|
||||
{
|
||||
if (begin == end)
|
||||
return REG_OKAY;
|
||||
min_matches = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need workspace to track the endpoints of each sub-match. Normally
|
||||
* we consider only nonzero-length sub-matches, so there can be at most
|
||||
* end-begin of them. However, if min is larger than that, we will also
|
||||
* consider zero-length sub-matches in order to find enough matches.
|
||||
*
|
||||
* For convenience, endpts[0] contains the "begin" pointer and we store
|
||||
* sub-match endpoints in endpts[1..max_matches].
|
||||
*/
|
||||
max_matches = end - begin;
|
||||
if (max_matches > t->max && t->max != INFINITY)
|
||||
max_matches = t->max;
|
||||
if (max_matches < min_matches)
|
||||
max_matches = min_matches;
|
||||
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
|
||||
if (endpts == NULL)
|
||||
return REG_ESPACE;
|
||||
endpts[0] = begin;
|
||||
|
||||
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
|
||||
if (ISERR())
|
||||
{
|
||||
FREE(endpts);
|
||||
return v->err;
|
||||
}
|
||||
MDEBUG(("citer %d\n", t->retry));
|
||||
|
||||
/*
|
||||
* Our strategy is to first find a set of sub-match endpoints that are
|
||||
* valid according to the child node's DFA, and then recursively dissect
|
||||
* each sub-match to confirm validity. If any validity check fails,
|
||||
* backtrack the last sub-match and try again. And, when we next try for
|
||||
* a validity check, we need not recheck any successfully verified
|
||||
* sub-matches that we didn't move the endpoints of. nverified remembers
|
||||
* how many sub-matches are currently known okay.
|
||||
*/
|
||||
|
||||
/* initialize to consider first sub-match */
|
||||
nverified = 0;
|
||||
k = 1;
|
||||
limit = end;
|
||||
|
||||
/* iterate until satisfaction or failure */
|
||||
while (k > 0)
|
||||
{
|
||||
/* try to find an endpoint for the k'th sub-match */
|
||||
endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
|
||||
if (endpts[k] == NULL)
|
||||
{
|
||||
/* no match possible, so see if we can shorten previous one */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
MDEBUG(("%d: working endpoint %d: %ld\n",
|
||||
t->retry, k, LOFF(endpts[k])));
|
||||
|
||||
/* k'th sub-match can no longer be considered verified */
|
||||
if (nverified >= k)
|
||||
nverified = k - 1;
|
||||
|
||||
if (endpts[k] != end)
|
||||
{
|
||||
/* haven't reached end yet, try another iteration if allowed */
|
||||
if (k >= max_matches)
|
||||
{
|
||||
/* must try to shorten some previous match */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
|
||||
/* reject zero-length match unless necessary to achieve min */
|
||||
if (endpts[k] == endpts[k - 1] &&
|
||||
(k >= min_matches || min_matches - k < end - endpts[k]))
|
||||
goto backtrack;
|
||||
|
||||
k++;
|
||||
limit = end;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've identified a way to divide the string into k sub-matches
|
||||
* that works so far as the child DFA can tell. If k is an allowed
|
||||
* number of matches, start the slow part: recurse to verify each
|
||||
* sub-match. We always have k <= max_matches, needn't check that.
|
||||
*/
|
||||
if (k < min_matches)
|
||||
goto backtrack;
|
||||
|
||||
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
|
||||
|
||||
for (i = nverified + 1; i <= k; i++)
|
||||
{
|
||||
zapmem(v, t->left);
|
||||
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
|
||||
if (er == REG_OKAY)
|
||||
{
|
||||
nverified = i;
|
||||
continue;
|
||||
}
|
||||
if (er == REG_NOMATCH)
|
||||
break;
|
||||
/* oops, something failed */
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return er;
|
||||
}
|
||||
|
||||
if (i > k)
|
||||
{
|
||||
/* satisfaction */
|
||||
MDEBUG(("%d successful\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_OKAY;
|
||||
}
|
||||
|
||||
/* match failed to verify, so backtrack */
|
||||
|
||||
backtrack:
|
||||
/*
|
||||
* Must consider shorter versions of the current sub-match. However,
|
||||
* we'll only ask for a zero-length match if necessary.
|
||||
*/
|
||||
while (k > 0)
|
||||
{
|
||||
chr *prev_end = endpts[k - 1];
|
||||
|
||||
if (endpts[k] > prev_end)
|
||||
{
|
||||
limit = endpts[k] - 1;
|
||||
if (limit > prev_end ||
|
||||
(k < min_matches && min_matches - k >= end - prev_end))
|
||||
{
|
||||
/* break out of backtrack loop, continue the outer one */
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* can't shorten k'th sub-match any more, consider previous one */
|
||||
k--;
|
||||
}
|
||||
}
|
||||
|
||||
/* all possibilities exhausted */
|
||||
MDEBUG(("%d failed\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_NOMATCH;
|
||||
}
|
||||
|
||||
/*
|
||||
* creviterdissect - shortest-first iteration subexpression matches
|
||||
*/
|
||||
static int /* regexec return code */
|
||||
creviterdissect(struct vars * v,
|
||||
struct subre * t,
|
||||
chr *begin, /* beginning of relevant substring */
|
||||
chr *end) /* end of same */
|
||||
{
|
||||
struct dfa *d;
|
||||
chr **endpts;
|
||||
chr *limit;
|
||||
int min_matches;
|
||||
size_t max_matches;
|
||||
int nverified;
|
||||
int k;
|
||||
int i;
|
||||
int er;
|
||||
|
||||
assert(t->op == '*');
|
||||
assert(t->left != NULL && t->left->cnfa.nstates > 0);
|
||||
assert(t->left->flags & SHORTER);
|
||||
assert(begin <= end);
|
||||
|
||||
/*
|
||||
* If zero matches are allowed, and target string is empty, just declare
|
||||
* victory. OTOH, if target string isn't empty, zero matches can't work
|
||||
* so we pretend the min is 1.
|
||||
*/
|
||||
min_matches = t->min;
|
||||
if (min_matches <= 0)
|
||||
{
|
||||
if (begin == end)
|
||||
return REG_OKAY;
|
||||
min_matches = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need workspace to track the endpoints of each sub-match. Normally
|
||||
* we consider only nonzero-length sub-matches, so there can be at most
|
||||
* end-begin of them. However, if min is larger than that, we will also
|
||||
* consider zero-length sub-matches in order to find enough matches.
|
||||
*
|
||||
* For convenience, endpts[0] contains the "begin" pointer and we store
|
||||
* sub-match endpoints in endpts[1..max_matches].
|
||||
*/
|
||||
max_matches = end - begin;
|
||||
if (max_matches > t->max && t->max != INFINITY)
|
||||
max_matches = t->max;
|
||||
if (max_matches < min_matches)
|
||||
max_matches = min_matches;
|
||||
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
|
||||
if (endpts == NULL)
|
||||
return REG_ESPACE;
|
||||
endpts[0] = begin;
|
||||
|
||||
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
|
||||
if (ISERR())
|
||||
{
|
||||
FREE(endpts);
|
||||
return v->err;
|
||||
}
|
||||
MDEBUG(("creviter %d\n", t->retry));
|
||||
|
||||
/*
|
||||
* Our strategy is to first find a set of sub-match endpoints that are
|
||||
* valid according to the child node's DFA, and then recursively dissect
|
||||
* each sub-match to confirm validity. If any validity check fails,
|
||||
* backtrack the last sub-match and try again. And, when we next try for
|
||||
* a validity check, we need not recheck any successfully verified
|
||||
* sub-matches that we didn't move the endpoints of. nverified remembers
|
||||
* how many sub-matches are currently known okay.
|
||||
*/
|
||||
|
||||
/* initialize to consider first sub-match */
|
||||
nverified = 0;
|
||||
k = 1;
|
||||
limit = begin;
|
||||
|
||||
/* iterate until satisfaction or failure */
|
||||
while (k > 0)
|
||||
{
|
||||
/* disallow zero-length match unless necessary to achieve min */
|
||||
if (limit == endpts[k - 1] &&
|
||||
limit != end &&
|
||||
(k >= min_matches || min_matches - k < end - limit))
|
||||
limit++;
|
||||
|
||||
/* try to find an endpoint for the k'th sub-match */
|
||||
endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
|
||||
(chr **) NULL, (int *) NULL);
|
||||
if (endpts[k] == NULL)
|
||||
{
|
||||
/* no match possible, so see if we can lengthen previous one */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
MDEBUG(("%d: working endpoint %d: %ld\n",
|
||||
t->retry, k, LOFF(endpts[k])));
|
||||
|
||||
/* k'th sub-match can no longer be considered verified */
|
||||
if (nverified >= k)
|
||||
nverified = k - 1;
|
||||
|
||||
if (endpts[k] != end)
|
||||
{
|
||||
/* haven't reached end yet, try another iteration if allowed */
|
||||
if (k >= max_matches)
|
||||
{
|
||||
/* must try to lengthen some previous match */
|
||||
k--;
|
||||
goto backtrack;
|
||||
}
|
||||
|
||||
k++;
|
||||
limit = endpts[k - 1];
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've identified a way to divide the string into k sub-matches
|
||||
* that works so far as the child DFA can tell. If k is an allowed
|
||||
* number of matches, start the slow part: recurse to verify each
|
||||
* sub-match. We always have k <= max_matches, needn't check that.
|
||||
*/
|
||||
if (k < min_matches)
|
||||
goto backtrack;
|
||||
|
||||
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
|
||||
|
||||
for (i = nverified + 1; i <= k; i++)
|
||||
{
|
||||
zapmem(v, t->left);
|
||||
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
|
||||
if (er == REG_OKAY)
|
||||
{
|
||||
nverified = i;
|
||||
continue;
|
||||
}
|
||||
if (er == REG_NOMATCH)
|
||||
break;
|
||||
/* oops, something failed */
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return er;
|
||||
}
|
||||
|
||||
if (i > k)
|
||||
{
|
||||
/* satisfaction */
|
||||
MDEBUG(("%d successful\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_OKAY;
|
||||
}
|
||||
|
||||
/* match failed to verify, so backtrack */
|
||||
|
||||
backtrack:
|
||||
/*
|
||||
* Must consider longer versions of the current sub-match.
|
||||
*/
|
||||
while (k > 0)
|
||||
{
|
||||
if (endpts[k] < end)
|
||||
{
|
||||
limit = endpts[k] + 1;
|
||||
/* break out of backtrack loop, continue the outer one */
|
||||
break;
|
||||
}
|
||||
/* can't lengthen k'th sub-match any more, consider previous one */
|
||||
k--;
|
||||
}
|
||||
}
|
||||
|
||||
/* all possibilities exhausted */
|
||||
MDEBUG(("%d failed\n", t->retry));
|
||||
freedfa(d);
|
||||
FREE(endpts);
|
||||
return REG_NOMATCH;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#include "rege_dfa.c"
|
||||
|
Reference in New Issue
Block a user