mirror of
https://github.com/postgres/postgres.git
synced 2025-06-25 01:02:05 +03:00
Improve parser's one-extra-token lookahead mechanism.
There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way.

In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple of extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster.

I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs.

I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, e.g., "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than the previous one.
This commit is contained in:
@ -42,10 +42,8 @@ my %replace_token = (
|
||||
|
||||
# or in the block
|
||||
my %replace_string = (
|
||||
'WITH_TIME' => 'with time',
|
||||
'WITH_ORDINALITY' => 'with ordinality',
|
||||
'NULLS_FIRST' => 'nulls first',
|
||||
'NULLS_LAST' => 'nulls last',
|
||||
'NULLS_LA' => 'nulls',
|
||||
'WITH_LA' => 'with',
|
||||
'TYPECAST' => '::',
|
||||
'DOT_DOT' => '..',
|
||||
'COLON_EQUALS' => ':=',);
|
||||
|
@ -3,11 +3,8 @@
|
||||
* parser.c
|
||||
* Main entry point/driver for PostgreSQL grammar
|
||||
*
|
||||
* Note that the grammar is not allowed to perform any table access
|
||||
* (since we need to be able to do basic parsing even while inside an
|
||||
* aborted transaction). Therefore, the data structures returned by
|
||||
* the grammar are "raw" parsetrees that still need to be analyzed by
|
||||
* analyze.c and related files.
|
||||
* This should match src/backend/parser/parser.c, except that we do not
|
||||
* need to bother with re-entrant interfaces.
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
|
||||
@ -29,18 +26,21 @@ static bool have_lookahead; /* is lookahead info valid? */
|
||||
static int lookahead_token; /* one-token lookahead */
|
||||
static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
|
||||
static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
|
||||
static char *lookahead_yytext; /* saved yytext (start) of the lookahead token */
|
||||
static char *lookahead_end; /* end of current token */
|
||||
static char lookahead_hold_char; /* to be put back at *lookahead_end */
|
||||
|
||||
|
||||
/*
|
||||
* Intermediate filter between parser and base lexer (base_yylex in scan.l).
|
||||
*
|
||||
* The filter is needed because in some cases the standard SQL grammar
|
||||
* This filter is needed because in some cases the standard SQL grammar
|
||||
* requires more than one token lookahead. We reduce these cases to one-token
|
||||
* lookahead by combining tokens here, in order to keep the grammar LALR(1).
|
||||
* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
|
||||
*
|
||||
* Using a filter is simpler than trying to recognize multiword tokens
|
||||
* directly in scan.l, because we'd have to allow for comments between the
|
||||
* words. Furthermore it's not clear how to do it without re-introducing
|
||||
* words. Furthermore it's not clear how to do that without re-introducing
|
||||
* scanner backtrack, which would cost more performance than this filter
|
||||
* layer does.
|
||||
*/
|
||||
@ -49,8 +49,10 @@ filtered_base_yylex(void)
|
||||
{
|
||||
int cur_token;
|
||||
int next_token;
|
||||
int cur_token_length;
|
||||
YYSTYPE cur_yylval;
|
||||
YYLTYPE cur_yylloc;
|
||||
char *cur_yytext;
|
||||
|
||||
/* Get next token --- we might already have it */
|
||||
if (have_lookahead)
|
||||
@ -58,74 +60,86 @@ filtered_base_yylex(void)
|
||||
cur_token = lookahead_token;
|
||||
base_yylval = lookahead_yylval;
|
||||
base_yylloc = lookahead_yylloc;
|
||||
yytext = lookahead_yytext;
|
||||
*lookahead_end = lookahead_hold_char;
|
||||
have_lookahead = false;
|
||||
}
|
||||
else
|
||||
cur_token = base_yylex();
|
||||
|
||||
/* Do we need to look ahead for a possible multiword token? */
|
||||
/*
|
||||
* If this token isn't one that requires lookahead, just return it. If it
|
||||
* does, determine the token length. (We could get that via strlen(), but
|
||||
* since we have such a small set of possibilities, hardwiring seems
|
||||
* feasible and more efficient.)
|
||||
*/
|
||||
switch (cur_token)
|
||||
{
|
||||
case NULLS_P:
|
||||
cur_token_length = 5;
|
||||
break;
|
||||
case WITH:
|
||||
cur_token_length = 4;
|
||||
break;
|
||||
default:
|
||||
return cur_token;
|
||||
}
|
||||
|
||||
/*
|
||||
* NULLS FIRST and NULLS LAST must be reduced to one token
|
||||
*/
|
||||
cur_yylval = base_yylval;
|
||||
cur_yylloc = base_yylloc;
|
||||
next_token = base_yylex();
|
||||
/*
|
||||
* Identify end+1 of current token. base_yylex() has temporarily stored a
|
||||
* '\0' here, and will undo that when we call it again. We need to redo
|
||||
* it to fully revert the lookahead call for error reporting purposes.
|
||||
*/
|
||||
lookahead_end = yytext + cur_token_length;
|
||||
Assert(*lookahead_end == '\0');
|
||||
|
||||
/* Save and restore lexer output variables around the call */
|
||||
cur_yylval = base_yylval;
|
||||
cur_yylloc = base_yylloc;
|
||||
cur_yytext = yytext;
|
||||
|
||||
/* Get next token, saving outputs into lookahead variables */
|
||||
next_token = base_yylex();
|
||||
|
||||
lookahead_token = next_token;
|
||||
lookahead_yylval = base_yylval;
|
||||
lookahead_yylloc = base_yylloc;
|
||||
lookahead_yytext = yytext;
|
||||
|
||||
base_yylval = cur_yylval;
|
||||
base_yylloc = cur_yylloc;
|
||||
yytext = cur_yytext;
|
||||
|
||||
/* Now revert the un-truncation of the current token */
|
||||
lookahead_hold_char = *lookahead_end;
|
||||
*lookahead_end = '\0';
|
||||
|
||||
have_lookahead = true;
|
||||
|
||||
/* Replace cur_token if needed, based on lookahead */
|
||||
switch (cur_token)
|
||||
{
|
||||
case NULLS_P:
|
||||
/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
|
||||
switch (next_token)
|
||||
{
|
||||
case FIRST_P:
|
||||
cur_token = NULLS_FIRST;
|
||||
break;
|
||||
case LAST_P:
|
||||
cur_token = NULLS_LAST;
|
||||
break;
|
||||
default:
|
||||
/* save the lookahead token for next time */
|
||||
lookahead_token = next_token;
|
||||
lookahead_yylval = base_yylval;
|
||||
lookahead_yylloc = base_yylloc;
|
||||
have_lookahead = true;
|
||||
/* and back up the output info to cur_token */
|
||||
base_yylval = cur_yylval;
|
||||
base_yylloc = cur_yylloc;
|
||||
cur_token = NULLS_LA;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case WITH:
|
||||
|
||||
/*
|
||||
* WITH TIME must be reduced to one token
|
||||
*/
|
||||
cur_yylval = base_yylval;
|
||||
cur_yylloc = base_yylloc;
|
||||
next_token = base_yylex();
|
||||
/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
|
||||
switch (next_token)
|
||||
{
|
||||
case TIME:
|
||||
cur_token = WITH_TIME;
|
||||
break;
|
||||
case ORDINALITY:
|
||||
cur_token = WITH_ORDINALITY;
|
||||
break;
|
||||
default:
|
||||
/* save the lookahead token for next time */
|
||||
lookahead_token = next_token;
|
||||
lookahead_yylval = base_yylval;
|
||||
lookahead_yylloc = base_yylloc;
|
||||
have_lookahead = true;
|
||||
/* and back up the output info to cur_token */
|
||||
base_yylval = cur_yylval;
|
||||
base_yylloc = cur_yylloc;
|
||||
cur_token = WITH_LA;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return cur_token;
|
||||
|
Reference in New Issue
Block a user