diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index d3e850a8699..c9c73e978b6 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -138,14 +138,9 @@ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa static void zapallsubs(regmatch_t *, size_t); static void zaptreesubs(struct vars *, struct subre *); static void subset(struct vars *, struct subre *, chr *, chr *); -static int dissect(struct vars *, struct subre *, chr *, chr *); -static int condissect(struct vars *, struct subre *, chr *, chr *); -static int altdissect(struct vars *, struct subre *, chr *, chr *); -static int iterdissect(struct vars *, struct subre *, chr *, chr *); -static int reviterdissect(struct vars *, struct subre *, chr *, chr *); static int cdissect(struct vars *, struct subre *, chr *, chr *); static int ccondissect(struct vars *, struct subre *, chr *, chr *); -static int crevdissect(struct vars *, struct subre *, chr *, chr *); +static int crevcondissect(struct vars *, struct subre *, chr *, chr *); static int cbrdissect(struct vars *, struct subre *, chr *, chr *); static int caltdissect(struct vars *, struct subre *, chr *, chr *); static int citerdissect(struct vars *, struct subre *, chr *, chr *); @@ -376,9 +371,9 @@ find(struct vars * v, if (v->nmatch == 1) /* no need for submatches */ return REG_OKAY; - /* submatches */ + /* find submatches */ zapallsubs(v->pmatch, v->nmatch); - return dissect(v, v->g->tree, begin, end); + return cdissect(v, v->g->tree, begin, end); } /* @@ -568,505 +563,19 @@ subset(struct vars * v, } /* - * dissect - determine subexpression matches (uncomplicated case) - */ -static int /* regexec return code */ -dissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - assert(t != NULL); - MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); - - switch (t->op) - { - case '=': /* terminal node */ - assert(t->left == NULL && t->right == NULL); - return REG_OKAY; /* no action, parent did the work */ - case 'b': /* back ref -- shouldn't be calling us! */ - return REG_ASSERT; - case '.': /* concatenation */ - assert(t->left != NULL && t->right != NULL); - return condissect(v, t, begin, end); - case '|': /* alternation */ - assert(t->left != NULL); - return altdissect(v, t, begin, end); - case '*': /* iteration */ - assert(t->left != NULL); - return iterdissect(v, t, begin, end); - case '(': /* capturing */ - assert(t->left != NULL && t->right == NULL); - assert(t->subno > 0); - subset(v, t, begin, end); - return dissect(v, t->left, begin, end); - default: - return REG_ASSERT; - } -} - -/* - * condissect - determine concatenation subexpression matches (uncomplicated) - */ -static int /* regexec return code */ -condissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - struct dfa *d2; - chr *mid; - int i; - int shorter = (t->left->flags & SHORTER) ? 1 : 0; - chr *stop = (shorter) ? end : begin; - - assert(t->op == '.'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->right != NULL && t->right->cnfa.nstates > 0); - - d = getsubdfa(v, t->left); - NOERR(); - d2 = getsubdfa(v, t->right); - NOERR(); - - /* pick a tentative midpoint */ - if (shorter) - mid = shortest(v, d, begin, begin, end, (chr **) NULL, - (int *) NULL); - else - mid = longest(v, d, begin, end, (int *) NULL); - if (mid == NULL) - return REG_ASSERT; - MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); - - /* iterate until satisfaction or failure */ - while (longest(v, d2, mid, end, (int *) NULL) != end) - { - /* that midpoint didn't work, find a new one */ - if (mid == stop) - { - /* all possibilities exhausted! */ - MDEBUG(("no midpoint!\n")); - return REG_ASSERT; - } - if (shorter) - mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, - (int *) NULL); - else - mid = longest(v, d, begin, mid - 1, (int *) NULL); - if (mid == NULL) - { - /* failed to find a new one! */ - MDEBUG(("failed midpoint!\n")); - return REG_ASSERT; - } - MDEBUG(("new midpoint %ld\n", LOFF(mid))); - } - - /* satisfaction */ - MDEBUG(("successful\n")); - i = dissect(v, t->left, begin, mid); - if (i != REG_OKAY) - return i; - return dissect(v, t->right, mid, end); -} - -/* - * altdissect - determine alternative subexpression matches (uncomplicated) - */ -static int /* regexec return code */ -altdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - int i; - - assert(t != NULL); - assert(t->op == '|'); - - for (i = 0; t != NULL; t = t->right, i++) - { - MDEBUG(("trying %dth\n", i)); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - d = getsubdfa(v, t->left); - NOERR(); - if (longest(v, d, begin, end, (int *) NULL) == end) - { - MDEBUG(("success\n")); - return dissect(v, t->left, begin, end); - } - } - return REG_ASSERT; /* none of them matched?!? */ -} - -/* - * iterdissect - iteration subexpression matches (uncomplicated) - */ -static int /* regexec return code */ -iterdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - chr **endpts; - chr *limit; - int min_matches; - size_t max_matches; - int nverified; - int k; - int i; - int er; - - assert(t->op == '*'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(begin <= end); - - if (t->left->flags & SHORTER) /* reverse scan */ - return reviterdissect(v, t, begin, end); - - /* - * If zero matches are allowed, and target string is empty, just declare - * victory. OTOH, if target string isn't empty, zero matches can't work - * so we pretend the min is 1. - */ - min_matches = t->min; - if (min_matches <= 0) - { - if (begin == end) - return REG_OKAY; - min_matches = 1; - } - - /* - * We need workspace to track the endpoints of each sub-match. Normally - * we consider only nonzero-length sub-matches, so there can be at most - * end-begin of them. However, if min is larger than that, we will also - * consider zero-length sub-matches in order to find enough matches. - * - * For convenience, endpts[0] contains the "begin" pointer and we store - * sub-match endpoints in endpts[1..max_matches]. - */ - max_matches = end - begin; - if (max_matches > t->max && t->max != INFINITY) - max_matches = t->max; - if (max_matches < min_matches) - max_matches = min_matches; - endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); - if (endpts == NULL) - return REG_ESPACE; - endpts[0] = begin; - - d = getsubdfa(v, t->left); - if (ISERR()) - { - FREE(endpts); - return v->err; - } - MDEBUG(("iter %d\n", t->id)); - - /* - * Our strategy is to first find a set of sub-match endpoints that are - * valid according to the child node's DFA, and then recursively dissect - * each sub-match to confirm validity. If any validity check fails, - * backtrack the last sub-match and try again. And, when we next try for - * a validity check, we need not recheck any successfully verified - * sub-matches that we didn't move the endpoints of. nverified remembers - * how many sub-matches are currently known okay. - */ - - /* initialize to consider first sub-match */ - nverified = 0; - k = 1; - limit = end; - - /* iterate until satisfaction or failure */ - while (k > 0) - { - /* try to find an endpoint for the k'th sub-match */ - endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); - if (endpts[k] == NULL) - { - /* no match possible, so see if we can shorten previous one */ - k--; - goto backtrack; - } - MDEBUG(("%d: working endpoint %d: %ld\n", - t->id, k, LOFF(endpts[k]))); - - /* k'th sub-match can no longer be considered verified */ - if (nverified >= k) - nverified = k - 1; - - if (endpts[k] != end) - { - /* haven't reached end yet, try another iteration if allowed */ - if (k >= max_matches) - { - /* must try to shorten some previous match */ - k--; - goto backtrack; - } - - /* reject zero-length match unless necessary to achieve min */ - if (endpts[k] == endpts[k - 1] && - (k >= min_matches || min_matches - k < end - endpts[k])) - goto backtrack; - - k++; - limit = end; - continue; - } - - /* - * We've identified a way to divide the string into k sub-matches - * that works so far as the child DFA can tell. If k is an allowed - * number of matches, start the slow part: recurse to verify each - * sub-match. We always have k <= max_matches, needn't check that. - */ - if (k < min_matches) - goto backtrack; - - MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); - - for (i = nverified + 1; i <= k; i++) - { - er = dissect(v, t->left, endpts[i - 1], endpts[i]); - if (er == REG_OKAY) - { - nverified = i; - continue; - } - if (er == REG_NOMATCH) - break; - /* oops, something failed */ - FREE(endpts); - return er; - } - - if (i > k) - { - /* satisfaction */ - MDEBUG(("%d successful\n", t->id)); - FREE(endpts); - return REG_OKAY; - } - - /* match failed to verify, so backtrack */ - -backtrack: - /* - * Must consider shorter versions of the current sub-match. However, - * we'll only ask for a zero-length match if necessary. - */ - while (k > 0) - { - chr *prev_end = endpts[k - 1]; - - if (endpts[k] > prev_end) - { - limit = endpts[k] - 1; - if (limit > prev_end || - (k < min_matches && min_matches - k >= end - prev_end)) - { - /* break out of backtrack loop, continue the outer one */ - break; - } - } - /* can't shorten k'th sub-match any more, consider previous one */ - k--; - } - } - - /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ - MDEBUG(("%d failed\n", t->id)); - FREE(endpts); - return REG_ASSERT; -} - -/* - * reviterdissect - shortest-first iteration subexpression matches - */ -static int /* regexec return code */ -reviterdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ -{ - struct dfa *d; - chr **endpts; - chr *limit; - int min_matches; - size_t max_matches; - int nverified; - int k; - int i; - int er; - - assert(t->op == '*'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->left->flags & SHORTER); - assert(begin <= end); - - /* - * If zero matches are allowed, and target string is empty, just declare - * victory. OTOH, if target string isn't empty, zero matches can't work - * so we pretend the min is 1. - */ - min_matches = t->min; - if (min_matches <= 0) - { - if (begin == end) - return REG_OKAY; - min_matches = 1; - } - - /* - * We need workspace to track the endpoints of each sub-match. Normally - * we consider only nonzero-length sub-matches, so there can be at most - * end-begin of them. However, if min is larger than that, we will also - * consider zero-length sub-matches in order to find enough matches. - * - * For convenience, endpts[0] contains the "begin" pointer and we store - * sub-match endpoints in endpts[1..max_matches]. - */ - max_matches = end - begin; - if (max_matches > t->max && t->max != INFINITY) - max_matches = t->max; - if (max_matches < min_matches) - max_matches = min_matches; - endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); - if (endpts == NULL) - return REG_ESPACE; - endpts[0] = begin; - - d = getsubdfa(v, t->left); - if (ISERR()) - { - FREE(endpts); - return v->err; - } - MDEBUG(("reviter %d\n", t->id)); - - /* - * Our strategy is to first find a set of sub-match endpoints that are - * valid according to the child node's DFA, and then recursively dissect - * each sub-match to confirm validity. If any validity check fails, - * backtrack the last sub-match and try again. And, when we next try for - * a validity check, we need not recheck any successfully verified - * sub-matches that we didn't move the endpoints of. nverified remembers - * how many sub-matches are currently known okay. - */ - - /* initialize to consider first sub-match */ - nverified = 0; - k = 1; - limit = begin; - - /* iterate until satisfaction or failure */ - while (k > 0) - { - /* disallow zero-length match unless necessary to achieve min */ - if (limit == endpts[k - 1] && - limit != end && - (k >= min_matches || min_matches - k < end - limit)) - limit++; - - /* try to find an endpoint for the k'th sub-match */ - endpts[k] = shortest(v, d, endpts[k - 1], limit, end, - (chr **) NULL, (int *) NULL); - if (endpts[k] == NULL) - { - /* no match possible, so see if we can lengthen previous one */ - k--; - goto backtrack; - } - MDEBUG(("%d: working endpoint %d: %ld\n", - t->id, k, LOFF(endpts[k]))); - - /* k'th sub-match can no longer be considered verified */ - if (nverified >= k) - nverified = k - 1; - - if (endpts[k] != end) - { - /* haven't reached end yet, try another iteration if allowed */ - if (k >= max_matches) - { - /* must try to lengthen some previous match */ - k--; - goto backtrack; - } - - k++; - limit = endpts[k - 1]; - continue; - } - - /* - * We've identified a way to divide the string into k sub-matches - * that works so far as the child DFA can tell. If k is an allowed - * number of matches, start the slow part: recurse to verify each - * sub-match. We always have k <= max_matches, needn't check that. - */ - if (k < min_matches) - goto backtrack; - - MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); - - for (i = nverified + 1; i <= k; i++) - { - er = dissect(v, t->left, endpts[i - 1], endpts[i]); - if (er == REG_OKAY) - { - nverified = i; - continue; - } - if (er == REG_NOMATCH) - break; - /* oops, something failed */ - FREE(endpts); - return er; - } - - if (i > k) - { - /* satisfaction */ - MDEBUG(("%d successful\n", t->id)); - FREE(endpts); - return REG_OKAY; - } - - /* match failed to verify, so backtrack */ - -backtrack: - /* - * Must consider longer versions of the current sub-match. - */ - while (k > 0) - { - if (endpts[k] < end) - { - limit = endpts[k] + 1; - /* break out of backtrack loop, continue the outer one */ - break; - } - /* can't lengthen k'th sub-match any more, consider previous one */ - k--; - } - } - - /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ - MDEBUG(("%d failed\n", t->id)); - FREE(endpts); - return REG_ASSERT; -} - -/* - * cdissect - determine subexpression matches (with complications) + * cdissect - check backrefs and determine subexpression matches + * + * cdissect recursively processes a subre tree to check matching of backrefs + * and/or identify submatch boundaries for capture nodes. The proposed match + * runs from "begin" to "end" (not including "end"), and we are basically + * "dissecting" it to see where the submatches are. + * + * Before calling any level of cdissect, the caller must have run the node's + * DFA and found that the proposed substring satisfies the DFA. (We make + * the caller do that because in concatenation and iteration nodes, it's + * much faster to check all the substrings against the child DFAs before we + * recurse.) Also, caller must have cleared subexpression match data via + * zaptreesubs (or zapallsubs at the top level). */ static int /* regexec return code */ cdissect(struct vars * v, @@ -1083,33 +592,54 @@ cdissect(struct vars * v, { case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); - return REG_OKAY; /* no action, parent did the work */ + er = REG_OKAY; /* no action, parent did the work */ + break; case 'b': /* back reference */ assert(t->left == NULL && t->right == NULL); - return cbrdissect(v, t, begin, end); + er = cbrdissect(v, t, begin, end); + break; case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); - return ccondissect(v, t, begin, end); + if (t->left->flags & SHORTER) /* reverse scan */ + er = crevcondissect(v, t, begin, end); + else + er = ccondissect(v, t, begin, end); + break; case '|': /* alternation */ assert(t->left != NULL); - return caltdissect(v, t, begin, end); + er = caltdissect(v, t, begin, end); + break; case '*': /* iteration */ assert(t->left != NULL); - return citerdissect(v, t, begin, end); + if (t->left->flags & SHORTER) /* reverse scan */ + er = creviterdissect(v, t, begin, end); + else + er = citerdissect(v, t, begin, end); + break; case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); er = cdissect(v, t->left, begin, end); if (er == REG_OKAY) subset(v, t, begin, end); - return er; + break; default: - return REG_ASSERT; + er = REG_ASSERT; + break; } + + /* + * We should never have a match failure unless backrefs lurk below; + * otherwise, either caller failed to check the DFA, or there's some + * inconsistency between the DFA and the node's innards. + */ + assert(er != REG_NOMATCH || (t->flags & BACKR)); + + return er; } /* - * ccondissect - concatenation subexpression matches (with complications) + * ccondissect - dissect match for concatenation node */ static int /* regexec return code */ ccondissect(struct vars * v, @@ -1125,9 +655,7 @@ ccondissect(struct vars * v, assert(t->op == '.'); assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(t->right != NULL && t->right->cnfa.nstates > 0); - - if (t->left->flags & SHORTER) /* reverse scan */ - return crevdissect(v, t, begin, end); + assert(!(t->left->flags & SHORTER)); d = getsubdfa(v, t->left); NOERR(); @@ -1158,7 +686,7 @@ ccondissect(struct vars * v, return REG_OKAY; } } - if (er != REG_OKAY && er != REG_NOMATCH) + if (er != REG_NOMATCH) return er; } @@ -1186,13 +714,13 @@ ccondissect(struct vars * v, } /* - * crevdissect - shortest-first concatenation subexpression matches + * crevcondissect - dissect match for concatenation node, shortest-first */ static int /* regexec return code */ -crevdissect(struct vars * v, - struct subre * t, - chr *begin, /* beginning of relevant substring */ - chr *end) /* end of same */ +crevcondissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ { struct dfa *d; struct dfa *d2; @@ -1204,12 +732,11 @@ crevdissect(struct vars * v, assert(t->right != NULL && t->right->cnfa.nstates > 0); assert(t->left->flags & SHORTER); - /* concatenation -- need to split the substring between parts */ d = getsubdfa(v, t->left); NOERR(); d2 = getsubdfa(v, t->right); NOERR(); - MDEBUG(("crev %d\n", t->id)); + MDEBUG(("crevcon %d\n", t->id)); /* pick a tentative midpoint */ mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); @@ -1234,7 +761,7 @@ crevdissect(struct vars * v, return REG_OKAY; } } - if (er != REG_OKAY && er != REG_NOMATCH) + if (er != REG_NOMATCH) return er; } @@ -1262,7 +789,7 @@ crevdissect(struct vars * v, } /* - * cbrdissect - determine backref subexpression matches + * cbrdissect - dissect match for backref node */ static int /* regexec return code */ cbrdissect(struct vars * v, @@ -1343,7 +870,7 @@ cbrdissect(struct vars * v, } /* - * caltdissect - determine alternative subexpression matches (w. complications) + * caltdissect - dissect match for alternation node */ static int /* regexec return code */ caltdissect(struct vars * v, @@ -1354,29 +881,32 @@ caltdissect(struct vars * v, struct dfa *d; int er; - if (t == NULL) - return REG_NOMATCH; + /* We loop, rather than tail-recurse, to handle a chain of alternatives */ + while (t != NULL) + { + assert(t->op == '|'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->op == '|'); - assert(t->left != NULL); + MDEBUG(("calt n%d\n", t->id)); - MDEBUG(("calt n%d\n", t->id)); + d = getsubdfa(v, t->left); + NOERR(); + if (longest(v, d, begin, end, (int *) NULL) == end) + { + MDEBUG(("calt matched\n")); + er = cdissect(v, t->left, begin, end); + if (er != REG_NOMATCH) + return er; + } - d = getsubdfa(v, t->left); - NOERR(); - if (longest(v, d, begin, end, (int *) NULL) != end) - return caltdissect(v, t->right, begin, end); - MDEBUG(("calt matched\n")); + t = t->right; + } - er = cdissect(v, t->left, begin, end); - if (er != REG_NOMATCH) - return er; - - return caltdissect(v, t->right, begin, end); + return REG_NOMATCH; } /* - * citerdissect - iteration subexpression matches (with complications) + * citerdissect - dissect match for iteration node */ static int /* regexec return code */ citerdissect(struct vars * v, @@ -1396,11 +926,9 @@ citerdissect(struct vars * v, assert(t->op == '*'); assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(!(t->left->flags & SHORTER)); assert(begin <= end); - if (t->left->flags & SHORTER) /* reverse scan */ - return creviterdissect(v, t, begin, end); - /* * If zero matches are allowed, and target string is empty, just declare * victory. OTOH, if target string isn't empty, zero matches can't work @@ -1562,7 +1090,7 @@ backtrack: } /* - * creviterdissect - shortest-first iteration subexpression matches + * creviterdissect - dissect match for iteration node, shortest-first */ static int /* regexec return code */ creviterdissect(struct vars * v,