1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-19 13:42:17 +03:00

Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within
bracket expressions.  There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.

Also, invent "word" as an allowed character class name, so that "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets.  POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.

* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does?  Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
This commit is contained in:
Tom Lane
2021-02-25 13:00:40 -05:00
parent 6b40d9bdbd
commit 2a0af7fe46
10 changed files with 677 additions and 276 deletions

View File

@@ -127,6 +127,18 @@
/* Nonzero iff bit number sn is set in uv, an array of UBITS-bit words. */
#define ISBSET(uv, sn) \
	((uv)[(sn) / UBITS] & (1U << ((sn) % UBITS)))
/*
* known character classes
*/
enum char_classes
{
	CC_ALNUM,
	CC_ALPHA,
	CC_ASCII,
	CC_BLANK,
	CC_CNTRL,
	CC_DIGIT,
	CC_GRAPH,
	CC_LOWER,
	CC_PRINT,
	CC_PUNCT,
	CC_SPACE,
	CC_UPPER,
	CC_XDIGIT,
	CC_WORD						/* last enumerator; keep NUM_CCLASSES in sync */
};

/* Number of enumerators in enum char_classes (CC_ALNUM .. CC_WORD). */
#define NUM_CCLASSES 14
/*
* As soon as possible, we map chrs into equivalence classes -- "colors" --
* which are of much more manageable number.
@@ -164,12 +176,14 @@ struct colordesc
#define NOSUB COLORLESS /* value of "sub" when no open subcolor */
struct arc *arcs; /* chain of all arcs of this color */
chr firstchr; /* simple char first assigned to this color */
int flags; /* bit values defined next */
int flags; /* bitmask of the following flags: */
#define FREECOL 01 /* currently free */
#define PSEUDO 02 /* pseudocolor, no real chars */
#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
#define COLMARK 04 /* temporary marker used in some functions */
};
#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
/*
* The color map itself
*
@@ -199,8 +213,6 @@ struct colordesc
* appear in increasing chr-value order.
*/
#define NUM_CCLASSES 13 /* must match data in regc_locale.c */
typedef struct colormaprange
{
chr cmin; /* range represents cmin..cmax inclusive */