1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-06 13:46:51 +03:00

Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within
bracket expressions.  There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets.  POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.

* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does?  Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
This commit is contained in:
Tom Lane
2021-02-25 13:00:40 -05:00
parent 6b40d9bdbd
commit 2a0af7fe46
10 changed files with 677 additions and 276 deletions

View File

@@ -350,17 +350,13 @@ static const struct cname
};
/*
* The following arrays define the valid character class names.
* The following array defines the valid character class names.
* The entries must match enum char_classes in regguts.h.
*/
static const char *const classNames[NUM_CCLASSES + 1] = {
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
"lower", "print", "punct", "space", "upper", "xdigit", NULL
};
enum classes
{
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
"lower", "print", "punct", "space", "upper", "xdigit", "word",
NULL
};
/*
@@ -536,7 +532,36 @@ eclass(struct vars *v, /* context */
}
/*
* cclass - supply cvec for a character class
* lookupcclass - lookup a character class identified by name
*
* On failure, sets an error code in *v; the result is then garbage.
*/
static enum char_classes
lookupcclass(struct vars *v, /* context (for returning errors) */
const chr *startp, /* where the name starts */
const chr *endp) /* just past the end of the name */
{
size_t len;
const char *const *namePtr;
int i;
/*
* Map the name to the corresponding enumerated value.
*/
len = endp - startp;
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
{
if (strlen(*namePtr) == len &&
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
return (enum char_classes) i;
}
ERR(REG_ECTYPE);
return (enum char_classes) 0;
}
/*
* cclasscvec - supply cvec for a character class
*
* Must include case counterparts if "cases" is true.
*
@@ -545,45 +570,20 @@ eclass(struct vars *v, /* context */
* because callers are not supposed to explicitly free the result either way.
*/
static struct cvec *
cclass(struct vars *v, /* context */
const chr *startp, /* where the name starts */
const chr *endp, /* just past the end of the name */
int cases) /* case-independent? */
cclasscvec(struct vars *v, /* context */
enum char_classes cclasscode, /* class to build a cvec for */
int cases) /* case-independent? */
{
size_t len;
struct cvec *cv = NULL;
const char *const *namePtr;
int i,
index;
/*
* Map the name to the corresponding enumerated value.
*/
len = endp - startp;
index = -1;
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
{
if (strlen(*namePtr) == len &&
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
{
index = i;
break;
}
}
if (index == -1)
{
ERR(REG_ECTYPE);
return NULL;
}
/*
* Remap lower and upper to alpha if the match is case insensitive.
*/
if (cases &&
((enum classes) index == CC_LOWER ||
(enum classes) index == CC_UPPER))
index = (int) CC_ALPHA;
(cclasscode == CC_LOWER ||
cclasscode == CC_UPPER))
cclasscode = CC_ALPHA;
/*
* Now compute the character class contents. For classes that are based
@@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */
* NB: keep this code in sync with cclass_column_index(), below.
*/
switch ((enum classes) index)
switch (cclasscode)
{
case CC_PRINT:
cv = pg_ctype_get_cache(pg_wc_isprint, index);
cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
break;
case CC_ALNUM:
cv = pg_ctype_get_cache(pg_wc_isalnum, index);
cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
break;
case CC_ALPHA:
cv = pg_ctype_get_cache(pg_wc_isalpha, index);
cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
break;
case CC_WORD:
cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
break;
case CC_ASCII:
/* hard-wired meaning */
@@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
cv = pg_ctype_get_cache(pg_wc_isdigit, index);
cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
break;
case CC_PUNCT:
cv = pg_ctype_get_cache(pg_wc_ispunct, index);
cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
break;
case CC_XDIGIT:
@@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */
}
break;
case CC_SPACE:
cv = pg_ctype_get_cache(pg_wc_isspace, index);
cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
break;
case CC_LOWER:
cv = pg_ctype_get_cache(pg_wc_islower, index);
cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
break;
case CC_UPPER:
cv = pg_ctype_get_cache(pg_wc_isupper, index);
cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
break;
case CC_GRAPH:
cv = pg_ctype_get_cache(pg_wc_isgraph, index);
cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
break;
}
@@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c)
/*
* Note: we should not see requests to consider cclasses that are not
* treated as locale-specific by cclass(), above.
* treated as locale-specific by cclasscvec(), above.
*/
if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
colnum |= cm->classbits[CC_PRINT];
@@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c)
colnum |= cm->classbits[CC_ALNUM];
if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
colnum |= cm->classbits[CC_ALPHA];
if (cm->classbits[CC_WORD] && pg_wc_isword(c))
colnum |= cm->classbits[CC_WORD];
assert(cm->classbits[CC_ASCII] == 0);
assert(cm->classbits[CC_BLANK] == 0);
assert(cm->classbits[CC_CNTRL] == 0);