Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors involved.

* Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
2025-07-31 22:04:40 +03:00 · 2021-02-25 13:00:40 -05:00
parent 6b40d9bdbd
commit 2a0af7fe46
10 changed files with 677 additions and 276 deletions
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -193,83 +193,6 @@ prefixes(struct vars *v)
 	}
 }

-/*
- * lexnest - "call a subroutine", interpolating string at the lexical level
- *
- * Note, this is not a very general facility.  There are a number of
- * implicit assumptions about what sorts of strings can be subroutines.
- */
-static void
-lexnest(struct vars *v,
-		const chr *beginp,		/* start of interpolation */
-		const chr *endp)		/* one past end of interpolation */
-{
-	assert(v->savenow == NULL); /* only one level of nesting */
-	v->savenow = v->now;
-	v->savestop = v->stop;
-	v->now = beginp;
-	v->stop = endp;
-}
-
-/*
- * string constants to interpolate as expansions of things like \d
- */
-static const chr backd[] = {	/* \d */
-	CHR('['), CHR('['), CHR(':'),
-	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr backD[] = {	/* \D */
-	CHR('['), CHR('^'), CHR('['), CHR(':'),
-	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr brbackd[] = {	/* \d within brackets */
-	CHR('['), CHR(':'),
-	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
-	CHR(':'), CHR(']')
-};
-static const chr backs[] = {	/* \s */
-	CHR('['), CHR('['), CHR(':'),
-	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr backS[] = {	/* \S */
-	CHR('['), CHR('^'), CHR('['), CHR(':'),
-	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr brbacks[] = {	/* \s within brackets */
-	CHR('['), CHR(':'),
-	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
-	CHR(':'), CHR(']')
-};
-static const chr backw[] = {	/* \w */
-	CHR('['), CHR('['), CHR(':'),
-	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-	CHR(':'), CHR(']'), CHR('_'), CHR(']')
-};
-static const chr backW[] = {	/* \W */
-	CHR('['), CHR('^'), CHR('['), CHR(':'),
-	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-	CHR(':'), CHR(']'), CHR('_'), CHR(']')
-};
-static const chr brbackw[] = {	/* \w within brackets */
-	CHR('['), CHR(':'),
-	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-	CHR(':'), CHR(']'), CHR('_')
-};
-
-/*
- * lexword - interpolate a bracket expression for word characters
- * Possibly ought to inquire whether there is a "word" character class.
- */
-static void
-lexword(struct vars *v)
-{
-	lexnest(v, backw, ENDOF(backw));
-}
-
 /*
 * next - get next token
 */
@@ -292,14 +215,6 @@ next(struct vars *v)
 		RETV(SBEGIN, 0);		/* same as \A */
 	}

-	/* if we're nested and we've hit end, return to outer level */
-	if (v->savenow != NULL && ATEOS())
-	{
-		v->now = v->savenow;
-		v->stop = v->savestop;
-		v->savenow = v->savestop = NULL;
-	}
-
 	/* skip white space etc. if appropriate (not in literal or []) */
 	if (v->cflags & REG_EXPANDED)
 		switch (v->lexcon)
@@ -420,32 +335,15 @@ next(struct vars *v)
 					NOTE(REG_UNONPOSIX);
 					if (ATEOS())
 						FAILW(REG_EESCAPE);
-					(DISCARD) lexescape(v);
+					if (!lexescape(v))
+						return 0;
 					switch (v->nexttype)
 					{			/* not all escapes okay here */
 						case PLAIN:
+						case CCLASSS:
+						case CCLASSC:
 							return 1;
 							break;
-						case CCLASS:
-							switch (v->nextvalue)
-							{
-								case 'd':
-									lexnest(v, brbackd, ENDOF(brbackd));
-									break;
-								case 's':
-									lexnest(v, brbacks, ENDOF(brbacks));
-									break;
-								case 'w':
-									lexnest(v, brbackw, ENDOF(brbackw));
-									break;
-								default:
-									FAILW(REG_EESCAPE);
-									break;
-							}
-							/* lexnest done, back up and try again */
-							v->nexttype = v->lasttype;
-							return next(v);
-							break;
 					}
 					/* not one of the acceptable escapes */
 					FAILW(REG_EESCAPE);
@@ -691,49 +589,17 @@ next(struct vars *v)
 		}
 		RETV(PLAIN, *v->now++);
 	}
-	(DISCARD) lexescape(v);
-	if (ISERR())
-		FAILW(REG_EESCAPE);
-	if (v->nexttype == CCLASS)
-	{							/* fudge at lexical level */
-		switch (v->nextvalue)
-		{
-			case 'd':
-				lexnest(v, backd, ENDOF(backd));
-				break;
-			case 'D':
-				lexnest(v, backD, ENDOF(backD));
-				break;
-			case 's':
-				lexnest(v, backs, ENDOF(backs));
-				break;
-			case 'S':
-				lexnest(v, backS, ENDOF(backS));
-				break;
-			case 'w':
-				lexnest(v, backw, ENDOF(backw));
-				break;
-			case 'W':
-				lexnest(v, backW, ENDOF(backW));
-				break;
-			default:
-				assert(NOTREACHED);
-				FAILW(REG_ASSERT);
-				break;
-		}
-		/* lexnest done, back up and try again */
-		v->nexttype = v->lasttype;
-		return next(v);
-	}
-	/* otherwise, lexescape has already done the work */
-	return !ISERR();
+	return lexescape(v);
 }

 /*
 * lexescape - parse an ARE backslash escape (backslash already eaten)
- * Note slightly nonstandard use of the CCLASS type code.
+ *
+ * This is used for ARE backslashes both normally and inside bracket
+ * expressions.  In the latter case, not all escape types are allowed,
+ * but the caller must reject unwanted ones after we return.
 */
-static int						/* not actually used, but convenient for RETV */
+static int
 lexescape(struct vars *v)
 {
 	chr			c;
@@ -775,11 +641,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('d'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'd');
+			RETV(CCLASSS, CC_DIGIT);
 			break;
 		case CHR('D'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'D');
+			RETV(CCLASSC, CC_DIGIT);
 			break;
 		case CHR('e'):
 			NOTE(REG_UUNPORT);
@@ -802,11 +668,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('s'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 's');
+			RETV(CCLASSS, CC_SPACE);
 			break;
 		case CHR('S'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'S');
+			RETV(CCLASSC, CC_SPACE);
 			break;
 		case CHR('t'):
 			RETV(PLAIN, CHR('\t'));
@@ -828,11 +694,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('w'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'w');
+			RETV(CCLASSS, CC_WORD);
 			break;
 		case CHR('W'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'W');
+			RETV(CCLASSC, CC_WORD);
 			break;
 		case CHR('x'):
 			NOTE(REG_UUNPORT);