1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-31 22:04:40 +03:00

Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within
bracket expressions.  There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets.  POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.

* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does?  Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
This commit is contained in:
Tom Lane
2021-02-25 13:00:40 -05:00
parent 6b40d9bdbd
commit 2a0af7fe46
10 changed files with 677 additions and 276 deletions

View File

@ -193,83 +193,6 @@ prefixes(struct vars *v)
}
}
/*
 * lexnest - "call a subroutine", interpolating string at the lexical level
 *
 * Redirects the scan window of v (v->now .. v->stop) to the interpolated
 * text, after stashing the current window in v->savenow / v->savestop.
 *
 *	v		lexer state to redirect
 *	beginp	start of interpolation
 *	endp	one past end of interpolation
 *
 * Note, this is not a very general facility.  There are a number of
 * implicit assumptions about what sorts of strings can be subroutines.
 */
static void
lexnest(struct vars *v,
		const chr *beginp,
		const chr *endp)
{
	/* only one level of nesting: no saved window may already be pending */
	assert(v->savenow == NULL);

	/* remember where the current scan stands ... */
	v->savestop = v->stop;
	v->savenow = v->now;

	/* ... then point the scan at the interpolated text */
	v->stop = endp;
	v->now = beginp;
}
/*
 * string constants to interpolate as expansions of things like \d
 *
 * Each array spells out, one chr at a time, the text that is handed to
 * lexnest() in place of a class escape.  The arrays carry no terminator;
 * callers pass ENDOF(array) as the end pointer.
 *
 * The back* arrays hold a complete bracket expression; the brback* arrays
 * hold only the inner part, for an escape that already sits inside brackets.
 * There are no brback* variants for the complemented escapes \D, \S, \W.
 */
/* "[[:digit:]]" */
static const chr backd[] = { /* \d */
CHR('['), CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']'), CHR(']')
};
/* "[^[:digit:]]" */
static const chr backD[] = { /* \D */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']'), CHR(']')
};
/* "[:digit:]" */
static const chr brbackd[] = { /* \d within brackets */
CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']')
};
/* "[[:space:]]" */
static const chr backs[] = { /* \s */
CHR('['), CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']'), CHR(']')
};
/* "[^[:space:]]" */
static const chr backS[] = { /* \S */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']'), CHR(']')
};
/* "[:space:]" */
static const chr brbacks[] = { /* \s within brackets */
CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']')
};
/* "[[:alnum:]_]" -- alphanumerics plus underscore */
static const chr backw[] = { /* \w */
CHR('['), CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
CHR(':'), CHR(']'), CHR('_'), CHR(']')
};
/* "[^[:alnum:]_]" */
static const chr backW[] = { /* \W */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
CHR(':'), CHR(']'), CHR('_'), CHR(']')
};
/* "[:alnum:]_" -- the underscore follows the class as a separate element */
static const chr brbackw[] = { /* \w within brackets */
CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
CHR(':'), CHR(']'), CHR('_')
};
/*
 * lexword - interpolate a bracket expression for word characters
 *
 * Hands the backw[] constant, i.e. the text "[[:alnum:]_]", to lexnest()
 * so that it is interpolated at the lexical level.  Takes the lexer
 * state v and returns nothing.
 *
 * Possibly ought to inquire whether there is a "word" character class.
 */
static void
lexword(struct vars *v)
{
/* ENDOF() is presumably the one-past-the-end pointer lexnest() expects */
lexnest(v, backw, ENDOF(backw));
}
/*
* next - get next token
*/
@ -292,14 +215,6 @@ next(struct vars *v)
RETV(SBEGIN, 0); /* same as \A */
}
/* if we're nested and we've hit end, return to outer level */
if (v->savenow != NULL && ATEOS())
{
v->now = v->savenow;
v->stop = v->savestop;
v->savenow = v->savestop = NULL;
}
/* skip white space etc. if appropriate (not in literal or []) */
if (v->cflags & REG_EXPANDED)
switch (v->lexcon)
@ -420,32 +335,15 @@ next(struct vars *v)
NOTE(REG_UNONPOSIX);
if (ATEOS())
FAILW(REG_EESCAPE);
(DISCARD) lexescape(v);
if (!lexescape(v))
return 0;
switch (v->nexttype)
{ /* not all escapes okay here */
case PLAIN:
case CCLASSS:
case CCLASSC:
return 1;
break;
case CCLASS:
switch (v->nextvalue)
{
case 'd':
lexnest(v, brbackd, ENDOF(brbackd));
break;
case 's':
lexnest(v, brbacks, ENDOF(brbacks));
break;
case 'w':
lexnest(v, brbackw, ENDOF(brbackw));
break;
default:
FAILW(REG_EESCAPE);
break;
}
/* lexnest done, back up and try again */
v->nexttype = v->lasttype;
return next(v);
break;
}
/* not one of the acceptable escapes */
FAILW(REG_EESCAPE);
@ -691,49 +589,17 @@ next(struct vars *v)
}
RETV(PLAIN, *v->now++);
}
(DISCARD) lexescape(v);
if (ISERR())
FAILW(REG_EESCAPE);
if (v->nexttype == CCLASS)
{ /* fudge at lexical level */
switch (v->nextvalue)
{
case 'd':
lexnest(v, backd, ENDOF(backd));
break;
case 'D':
lexnest(v, backD, ENDOF(backD));
break;
case 's':
lexnest(v, backs, ENDOF(backs));
break;
case 'S':
lexnest(v, backS, ENDOF(backS));
break;
case 'w':
lexnest(v, backw, ENDOF(backw));
break;
case 'W':
lexnest(v, backW, ENDOF(backW));
break;
default:
assert(NOTREACHED);
FAILW(REG_ASSERT);
break;
}
/* lexnest done, back up and try again */
v->nexttype = v->lasttype;
return next(v);
}
/* otherwise, lexescape has already done the work */
return !ISERR();
return lexescape(v);
}
/*
* lexescape - parse an ARE backslash escape (backslash already eaten)
* Note slightly nonstandard use of the CCLASS type code.
*
* This is used for ARE backslashes both normally and inside bracket
* expressions. In the latter case, not all escape types are allowed,
* but the caller must reject unwanted ones after we return.
*/
static int /* not actually used, but convenient for RETV */
static int
lexescape(struct vars *v)
{
chr c;
@ -775,11 +641,11 @@ lexescape(struct vars *v)
break;
case CHR('d'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'd');
RETV(CCLASSS, CC_DIGIT);
break;
case CHR('D'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'D');
RETV(CCLASSC, CC_DIGIT);
break;
case CHR('e'):
NOTE(REG_UUNPORT);
@ -802,11 +668,11 @@ lexescape(struct vars *v)
break;
case CHR('s'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 's');
RETV(CCLASSS, CC_SPACE);
break;
case CHR('S'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'S');
RETV(CCLASSC, CC_SPACE);
break;
case CHR('t'):
RETV(PLAIN, CHR('\t'));
@ -828,11 +694,11 @@ lexescape(struct vars *v)
break;
case CHR('w'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'w');
RETV(CCLASSS, CC_WORD);
break;
case CHR('W'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'W');
RETV(CCLASSC, CC_WORD);
break;
case CHR('x'):
NOTE(REG_UUNPORT);