1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-19 13:42:17 +03:00

Avoid determining regexp subexpression matches, when possible.

Identifying the precise match locations for parenthesized subexpressions
is a fairly expensive task given the way our regexp engine works, both
at regexp compile time (where we must create an optimized NFA for each
parenthesized subexpression) and at runtime (where determining exact
match locations requires laborious search).

Up to now we've made little attempt to optimize this situation.  This
patch identifies cases where we know at compile time that we won't
need to know subexpression match locations, and teaches the regexp
compiler to not bother creating per-subexpression regexps for
parenthesis pairs that are not referenced by backrefs elsewhere in
the regexp.  (To preserve semantics, we obviously still have to
pin down the match locations of subexpressions that backrefs refer to.)  Users could
have obtained the same results before this by being careful to
write "non-capturing" parentheses wherever possible, but few people
bother with that.

Discussion: https://postgr.es/m/2219936.1628115334@sss.pgh.pa.us
This commit is contained in:
Tom Lane
2021-08-09 11:26:34 -04:00
parent 76ad24400d
commit 0e6aa8747d
10 changed files with 154 additions and 45 deletions

View File

@@ -106,7 +106,7 @@ typedef struct
/*
 * Option flag bits, written in octal; each distinct flag below occupies its
 * own bit, and REG_NOSPEC is simply another name for REG_QUOTE.
 * NOTE(review): presumably these are the compile-time flags of the regex
 * header touched by this commit -- confirm against the full file.
 */
#define REG_QUOTE 000004 /* no special characters, none */
#define REG_NOSPEC REG_QUOTE /* historical synonym */
#define REG_ICASE 000010 /* ignore case */
/*
 * NOTE(review): REG_NOSUB is listed twice because this is a diff hunk shown
 * without its +/- markers (the hunk header gives 7 lines before and 7 after,
 * while 8 are displayed).  The first line appears to be the pre-commit text
 * and the second its replacement; the value 000020 is the same in both, only
 * the comment wording differs.
 */
#define REG_NOSUB 000020 /* don't care about subexpressions */
#define REG_NOSUB 000020 /* caller doesn't need subexpr match data */
#define REG_EXPANDED 000040 /* expanded format, white space & comments */
#define REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */
#define REG_NLANCH 000200 /* ^ matches after \n, $ before */

View File

@@ -477,13 +477,14 @@ struct subre
#define MIXED 04 /* mixed preference below */
#define CAP 010 /* capturing parens here or below */
#define BACKR 020 /* back reference here or below */
#define BRUSE 040 /* is referenced by a back reference */
#define INUSE 0100 /* in use in final tree */
#define NOPROP 03 /* bits which may not propagate up */
#define UPPROP (MIXED|CAP|BACKR) /* flags which should propagate up */
#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */
#define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */
#define UP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
#define UP(f) (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
#define MESSY(f) ((f)&(MIXED|CAP|BACKR))
#define PREF(f) ((f)&NOPROP)
#define PREF(f) ((f)&(LONGER|SHORTER))
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
char latype; /* LATYPE code, if lookaround constraint */