Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors involved.

* Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
2025-09-06 13:46:51 +03:00 · 2021-02-25 13:00:40 -05:00
parent 6b40d9bdbd
commit 2a0af7fe46
10 changed files with 677 additions and 276 deletions
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -350,17 +350,13 @@ static const struct cname
 };

 /*
- * The following arrays define the valid character class names.
+ * The following array defines the valid character class names.
+ * The entries must match enum char_classes in regguts.h.
 */
 static const char *const classNames[NUM_CCLASSES + 1] = {
 	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
-	"lower", "print", "punct", "space", "upper", "xdigit", NULL
-};
-
-enum classes
-{
-	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
-	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
+	"lower", "print", "punct", "space", "upper", "xdigit", "word",
+	NULL
 };

 /*
@@ -536,7 +532,36 @@ eclass(struct vars *v,			/* context */
 }

 /*
- * cclass - supply cvec for a character class
+ * lookupcclass - look up a character class identified by name
+ *
+ * On failure, sets an error code in *v; the result is then garbage.
+ */
+static enum char_classes
+lookupcclass(struct vars *v,	/* context (for returning errors) */
+			 const chr *startp, /* where the name starts */
+			 const chr *endp)	/* just past the end of the name */
+{
+	size_t		len;
+	const char *const *namePtr;
+	int			i;
+
+	/*
+	 * Map the name to the corresponding enumerated value.
+	 */
+	len = endp - startp;
+	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
+	{
+		if (strlen(*namePtr) == len &&
+			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
+			return (enum char_classes) i;
+	}
+
+	ERR(REG_ECTYPE);
+	return (enum char_classes) 0;
+}
+
+/*
+ * cclasscvec - supply cvec for a character class
 *
 * Must include case counterparts if "cases" is true.
 *
@@ -545,45 +570,20 @@ eclass(struct vars *v,			/* context */
 * because callers are not supposed to explicitly free the result either way.
 */
 static struct cvec *
-cclass(struct vars *v,			/* context */
-	   const chr *startp,		/* where the name starts */
-	   const chr *endp,			/* just past the end of the name */
-	   int cases)				/* case-independent? */
+cclasscvec(struct vars *v,		/* context */
+		   enum char_classes cclasscode,	/* class to build a cvec for */
+		   int cases)			/* case-independent? */
 {
-	size_t		len;
 	struct cvec *cv = NULL;
-	const char *const *namePtr;
-	int			i,
-				index;
-
-	/*
-	 * Map the name to the corresponding enumerated value.
-	 */
-	len = endp - startp;
-	index = -1;
-	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
-	{
-		if (strlen(*namePtr) == len &&
-			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
-		{
-			index = i;
-			break;
-		}
-	}
-	if (index == -1)
-	{
-		ERR(REG_ECTYPE);
-		return NULL;
-	}

 	/*
 	 * Remap lower and upper to alpha if the match is case insensitive.
 	 */

 	if (cases &&
-		((enum classes) index == CC_LOWER ||
-		 (enum classes) index == CC_UPPER))
-		index = (int) CC_ALPHA;
+		(cclasscode == CC_LOWER ||
+		 cclasscode == CC_UPPER))
+		cclasscode = CC_ALPHA;

 	/*
 	 * Now compute the character class contents.  For classes that are based
@@ -595,16 +595,19 @@ cclass(struct vars *v,			/* context */
 	 * NB: keep this code in sync with cclass_column_index(), below.
 	 */

-	switch ((enum classes) index)
+	switch (cclasscode)
 	{
 		case CC_PRINT:
-			cv = pg_ctype_get_cache(pg_wc_isprint, index);
+			cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
 			break;
 		case CC_ALNUM:
-			cv = pg_ctype_get_cache(pg_wc_isalnum, index);
+			cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
 			break;
 		case CC_ALPHA:
-			cv = pg_ctype_get_cache(pg_wc_isalpha, index);
+			cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
+			break;
+		case CC_WORD:
+			cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
 			break;
 		case CC_ASCII:
 			/* hard-wired meaning */
@@ -625,10 +628,10 @@ cclass(struct vars *v,			/* context */
 			addrange(cv, 0x7f, 0x9f);
 			break;
 		case CC_DIGIT:
-			cv = pg_ctype_get_cache(pg_wc_isdigit, index);
+			cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
 			break;
 		case CC_PUNCT:
-			cv = pg_ctype_get_cache(pg_wc_ispunct, index);
+			cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
 			break;
 		case CC_XDIGIT:

@@ -646,16 +649,16 @@ cclass(struct vars *v,			/* context */
 			}
 			break;
 		case CC_SPACE:
-			cv = pg_ctype_get_cache(pg_wc_isspace, index);
+			cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
 			break;
 		case CC_LOWER:
-			cv = pg_ctype_get_cache(pg_wc_islower, index);
+			cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
 			break;
 		case CC_UPPER:
-			cv = pg_ctype_get_cache(pg_wc_isupper, index);
+			cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
 			break;
 		case CC_GRAPH:
-			cv = pg_ctype_get_cache(pg_wc_isgraph, index);
+			cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
 			break;
 	}

@@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c)

 	/*
 	 * Note: we should not see requests to consider cclasses that are not
-	 * treated as locale-specific by cclass(), above.
+	 * treated as locale-specific by cclasscvec(), above.
 	 */
 	if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
 		colnum |= cm->classbits[CC_PRINT];
@@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c)
 		colnum |= cm->classbits[CC_ALNUM];
 	if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
 		colnum |= cm->classbits[CC_ALPHA];
+	if (cm->classbits[CC_WORD] && pg_wc_isword(c))
+		colnum |= cm->classbits[CC_WORD];
 	assert(cm->classbits[CC_ASCII] == 0);
 	assert(cm->classbits[CC_BLANK] == 0);
 	assert(cm->classbits[CC_CNTRL] == 0);