diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 4f9da5b0468..6b80140e909 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1088,8 +1088,12 @@ parseqatom(struct vars * v, NOERR(); } - /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */ - if (m == 0) + /* + * It's quantifier time. If the atom is just a BACKREF, we'll let it deal + * with quantifiers internally. Otherwise, the first step is to turn + * x{0,...} into x{1,...}|empty + */ + if (m == 0 && atomtype != BACKREF) { EMPTYARC(s2, atom->end); /* the bypass */ assert(PREF(qprefer) != 0); diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index f8e31f8f4ad..224da5064b6 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -720,7 +720,7 @@ cdissect(struct vars * v, case '|': /* alternation */ assert(t->left != NULL); return caltdissect(v, t, begin, end); - case 'b': /* back ref -- shouldn't be calling us! */ + case 'b': /* back reference */ assert(t->left == NULL && t->right == NULL); return cbrdissect(v, t, begin, end); case '.': /* concatenation */ @@ -962,12 +962,12 @@ cbrdissect(struct vars * v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { - int i; int n = t->subno; - size_t len; - chr *paren; + size_t numreps; + size_t tlen; + size_t brlen; + chr *brstring; chr *p; - chr *stop; int min = t->min; int max = t->max; @@ -978,46 +978,65 @@ cbrdissect(struct vars * v, MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); + /* get the backreferenced string */ if (v->pmatch[n].rm_so == -1) return REG_NOMATCH; - paren = v->start + v->pmatch[n].rm_so; - len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; + brstring = v->start + v->pmatch[n].rm_so; + brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; /* no room to maneuver -- retries are pointless */ if (v->mem[t->retry]) return REG_NOMATCH; v->mem[t->retry] = 1; - /* special-case zero-length string */ - if (len == 0) + /* special cases for zero-length strings */ + if (brlen == 0) { - if (begin == end) + /* + * matches only if target is zero length, but any number of + * repetitions can be considered to be present + */ + if (begin == end && min <= max) + { + MDEBUG(("cbackref matched trivially\n")); return REG_OKAY; + } return REG_NOMATCH; } - - /* and too-short string */ - assert(end >= begin); - if ((size_t) (end - begin) < len) - return REG_NOMATCH; - stop = end - len; - - /* count occurrences */ - i = 0; - for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) + if (begin == end) { - if ((*v->g->compare) (paren, p, len) != 0) - break; - i++; - } - MDEBUG(("cbackref found %d\n", i)); - - /* and sort it out */ - if (p != end) /* didn't consume all of it */ + /* matches only if zero repetitions are okay */ + if (min == 0) + { + MDEBUG(("cbackref matched trivially\n")); + return REG_OKAY; + } return REG_NOMATCH; - if (min <= i && (i <= max || max == INFINITY)) - return REG_OKAY; - return REG_NOMATCH; /* out of range */ + } + + /* + * check target length to see if it could possibly be an allowed number of + * repetitions of brstring + */ + assert(end > begin); + tlen = end - begin; + if (tlen % brlen != 0) + return REG_NOMATCH; + numreps = tlen / brlen; + if (numreps < min || (numreps > max && max != INFINITY)) + return REG_NOMATCH; + + /* okay, compare the actual string contents */ + p = begin; + while (numreps-- > 0) + { + if ((*v->g->compare) (brstring, p, brlen) != 0) + return REG_NOMATCH; + p += brlen; + } + + MDEBUG(("cbackref matched\n")); + return REG_OKAY; } /* diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out new file mode 100644 index 00000000000..5694908163a --- /dev/null +++ b/src/test/regress/expected/regex.out @@ -0,0 +1,36 @@ +-- +-- Regular expression tests +-- +-- Don't want to have to double backslashes in regexes +set standard_conforming_strings = on; +-- Test simple quantified backrefs +select 'bbbbb' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + +select 'ccc' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + +select 'xxx' ~ '^([bc])\1*$' as f; + f +--- + f +(1 row) + +select 'bbc' ~ '^([bc])\1*$' as f; + f +--- + f +(1 row) + +select 'b' ~ '^([bc])\1*$' as t; + t +--- + t +(1 row) + diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 862f5b20077..8852e0a40fc 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -30,7 +30,7 @@ test: point lseg box path polygon circle date time timetz timestamp timestamptz # geometry depends on point, lseg, box, path, polygon and circle # horology depends on interval, timetz, timestamp, timestamptz, reltime and abstime # ---------- -test: geometry horology oidjoins type_sanity opr_sanity +test: geometry horology regex oidjoins type_sanity opr_sanity # ---------- # These four each depend on the previous one diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 142fc9cf0d1..0bc5df7fe73 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -42,6 +42,7 @@ test: tstypes test: comments test: geometry test: horology +test: regex test: oidjoins test: type_sanity test: opr_sanity diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql new file mode 100644 index 00000000000..242a81ef329 --- /dev/null +++ b/src/test/regress/sql/regex.sql @@ -0,0 +1,13 @@ +-- +-- Regular expression tests +-- + +-- Don't want to have to double backslashes in regexes +set standard_conforming_strings = on; + +-- Test simple quantified backrefs +select 'bbbbb' ~ '^([bc])\1*$' as t; +select 'ccc' ~ '^([bc])\1*$' as t; +select 'xxx' ~ '^([bc])\1*$' as f; +select 'bbc' ~ '^([bc])\1*$' as f; +select 'b' ~ '^([bc])\1*$' as t;