mirror of
https://github.com/postgres/postgres.git
synced 2025-07-31 22:04:40 +03:00
Allow complemented character class escapes within regex brackets.
The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer.
* Fix colorcomplement() to not be O(N^2) in the number of colors involved.
* Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
This commit is contained in:
@ -193,83 +193,6 @@ prefixes(struct vars *v)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* lexnest - "call a subroutine", interpolating string at the lexical level
|
||||
*
|
||||
* Note, this is not a very general facility. There are a number of
|
||||
* implicit assumptions about what sorts of strings can be subroutines.
|
||||
*
* The current scan window (v->now, v->stop) is stashed in v->savenow and
* v->savestop, and the scanner is redirected to [beginp, endp).  Nothing
* is restored here; next() puts the saved window back when it finds
* savenow set and the interpolated text exhausted.  Only one interpolation
* can be pending at a time, which the assertion below enforces.
*/
|
||||
static void
|
||||
lexnest(struct vars *v,
|
||||
const chr *beginp, /* start of interpolation */
|
||||
const chr *endp) /* one past end of interpolation */
|
||||
{
|
||||
assert(v->savenow == NULL); /* only one level of nesting */
|
||||
/* remember where the outer input stood so scanning can resume there */
v->savenow = v->now;
|
||||
v->savestop = v->stop;
|
||||
/* from here on the lexer reads the interpolated text instead */
v->now = beginp;
|
||||
v->stop = endp;
|
||||
}
|
||||
|
||||
/*
|
||||
* string constants to interpolate as expansions of things like \d
|
||||
*
* Naming: back<c> spells out a complete bracket expression, and the
* upper-case variants are the same expression negated with '^'.
* brback<c> is only the class text, without enclosing brackets, for
* splicing into a bracket expression that is already open.  Only the
* lower-case (non-negated) brback variants are defined.
*/
|
||||
static const chr backd[] = { /* \d */
|
||||
CHR('['), CHR('['), CHR(':'),
|
||||
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr backD[] = { /* \D */
|
||||
CHR('['), CHR('^'), CHR('['), CHR(':'),
|
||||
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr brbackd[] = { /* \d within brackets */
|
||||
CHR('['), CHR(':'),
|
||||
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
|
||||
CHR(':'), CHR(']')
|
||||
};
|
||||
static const chr backs[] = { /* \s */
|
||||
CHR('['), CHR('['), CHR(':'),
|
||||
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr backS[] = { /* \S */
|
||||
CHR('['), CHR('^'), CHR('['), CHR(':'),
|
||||
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
|
||||
CHR(':'), CHR(']'), CHR(']')
|
||||
};
|
||||
static const chr brbacks[] = { /* \s within brackets */
|
||||
CHR('['), CHR(':'),
|
||||
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
|
||||
CHR(':'), CHR(']')
|
||||
};
|
||||
/* the \w family is [:alnum:] plus underscore, hence the extra CHR('_') */
static const chr backw[] = { /* \w */
|
||||
CHR('['), CHR('['), CHR(':'),
|
||||
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
|
||||
CHR(':'), CHR(']'), CHR('_'), CHR(']')
|
||||
};
|
||||
static const chr backW[] = { /* \W */
|
||||
CHR('['), CHR('^'), CHR('['), CHR(':'),
|
||||
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
|
||||
CHR(':'), CHR(']'), CHR('_'), CHR(']')
|
||||
};
|
||||
/*
* No enclosing brackets here, so the trailing '_' ends up as an ordinary
* member of whatever bracket expression this text is spliced into.
*/
static const chr brbackw[] = { /* \w within brackets */
|
||||
CHR('['), CHR(':'),
|
||||
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
|
||||
CHR(':'), CHR(']'), CHR('_')
|
||||
};
|
||||
|
||||
/*
|
||||
* lexword - interpolate a bracket expression for word characters
|
||||
* Possibly ought to inquire whether there is a "word" character class.
|
||||
*
* Thin wrapper: hands the \w expansion (backw) to lexnest(), so lexnest's
* one-level-of-nesting restriction applies to callers of this too.
*/
|
||||
static void
|
||||
lexword(struct vars *v)
|
||||
{
|
||||
lexnest(v, backw, ENDOF(backw));
|
||||
}
|
||||
|
||||
/*
|
||||
* next - get next token
|
||||
*/
|
||||
@ -292,14 +215,6 @@ next(struct vars *v)
|
||||
RETV(SBEGIN, 0); /* same as \A */
|
||||
}
|
||||
|
||||
/* if we're nested and we've hit end, return to outer level */
|
||||
if (v->savenow != NULL && ATEOS())
|
||||
{
|
||||
v->now = v->savenow;
|
||||
v->stop = v->savestop;
|
||||
v->savenow = v->savestop = NULL;
|
||||
}
|
||||
|
||||
/* skip white space etc. if appropriate (not in literal or []) */
|
||||
if (v->cflags & REG_EXPANDED)
|
||||
switch (v->lexcon)
|
||||
@ -420,32 +335,15 @@ next(struct vars *v)
|
||||
NOTE(REG_UNONPOSIX);
|
||||
if (ATEOS())
|
||||
FAILW(REG_EESCAPE);
|
||||
(DISCARD) lexescape(v);
|
||||
if (!lexescape(v))
|
||||
return 0;
|
||||
switch (v->nexttype)
|
||||
{ /* not all escapes okay here */
|
||||
case PLAIN:
|
||||
case CCLASSS:
|
||||
case CCLASSC:
|
||||
return 1;
|
||||
break;
|
||||
case CCLASS:
|
||||
switch (v->nextvalue)
|
||||
{
|
||||
case 'd':
|
||||
lexnest(v, brbackd, ENDOF(brbackd));
|
||||
break;
|
||||
case 's':
|
||||
lexnest(v, brbacks, ENDOF(brbacks));
|
||||
break;
|
||||
case 'w':
|
||||
lexnest(v, brbackw, ENDOF(brbackw));
|
||||
break;
|
||||
default:
|
||||
FAILW(REG_EESCAPE);
|
||||
break;
|
||||
}
|
||||
/* lexnest done, back up and try again */
|
||||
v->nexttype = v->lasttype;
|
||||
return next(v);
|
||||
break;
|
||||
}
|
||||
/* not one of the acceptable escapes */
|
||||
FAILW(REG_EESCAPE);
|
||||
@ -691,49 +589,17 @@ next(struct vars *v)
|
||||
}
|
||||
RETV(PLAIN, *v->now++);
|
||||
}
|
||||
(DISCARD) lexescape(v);
|
||||
if (ISERR())
|
||||
FAILW(REG_EESCAPE);
|
||||
if (v->nexttype == CCLASS)
|
||||
{ /* fudge at lexical level */
|
||||
switch (v->nextvalue)
|
||||
{
|
||||
case 'd':
|
||||
lexnest(v, backd, ENDOF(backd));
|
||||
break;
|
||||
case 'D':
|
||||
lexnest(v, backD, ENDOF(backD));
|
||||
break;
|
||||
case 's':
|
||||
lexnest(v, backs, ENDOF(backs));
|
||||
break;
|
||||
case 'S':
|
||||
lexnest(v, backS, ENDOF(backS));
|
||||
break;
|
||||
case 'w':
|
||||
lexnest(v, backw, ENDOF(backw));
|
||||
break;
|
||||
case 'W':
|
||||
lexnest(v, backW, ENDOF(backW));
|
||||
break;
|
||||
default:
|
||||
assert(NOTREACHED);
|
||||
FAILW(REG_ASSERT);
|
||||
break;
|
||||
}
|
||||
/* lexnest done, back up and try again */
|
||||
v->nexttype = v->lasttype;
|
||||
return next(v);
|
||||
}
|
||||
/* otherwise, lexescape has already done the work */
|
||||
return !ISERR();
|
||||
return lexescape(v);
|
||||
}
|
||||
|
||||
/*
|
||||
* lexescape - parse an ARE backslash escape (backslash already eaten)
|
||||
* Note slightly nonstandard use of the CCLASS type code.
|
||||
*
|
||||
* This is used for ARE backslashes both normally and inside bracket
|
||||
* expressions. In the latter case, not all escape types are allowed,
|
||||
* but the caller must reject unwanted ones after we return.
|
||||
*/
|
||||
static int /* not actually used, but convenient for RETV */
|
||||
static int
|
||||
lexescape(struct vars *v)
|
||||
{
|
||||
chr c;
|
||||
@ -775,11 +641,11 @@ lexescape(struct vars *v)
|
||||
break;
|
||||
case CHR('d'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'd');
|
||||
RETV(CCLASSS, CC_DIGIT);
|
||||
break;
|
||||
case CHR('D'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'D');
|
||||
RETV(CCLASSC, CC_DIGIT);
|
||||
break;
|
||||
case CHR('e'):
|
||||
NOTE(REG_UUNPORT);
|
||||
@ -802,11 +668,11 @@ lexescape(struct vars *v)
|
||||
break;
|
||||
case CHR('s'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 's');
|
||||
RETV(CCLASSS, CC_SPACE);
|
||||
break;
|
||||
case CHR('S'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'S');
|
||||
RETV(CCLASSC, CC_SPACE);
|
||||
break;
|
||||
case CHR('t'):
|
||||
RETV(PLAIN, CHR('\t'));
|
||||
@ -828,11 +694,11 @@ lexescape(struct vars *v)
|
||||
break;
|
||||
case CHR('w'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'w');
|
||||
RETV(CCLASSS, CC_WORD);
|
||||
break;
|
||||
case CHR('W'):
|
||||
NOTE(REG_ULOCALE);
|
||||
RETV(CCLASS, 'W');
|
||||
RETV(CCLASSC, CC_WORD);
|
||||
break;
|
||||
case CHR('x'):
|
||||
NOTE(REG_UUNPORT);
|
||||
|
Reference in New Issue
Block a user