1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-02 09:02:37 +03:00

Reduce size of backend scanner's tables.

Previously, the core scanner's yy_transition[] array had 37045 elements.
Since that number is larger than INT16_MAX, Flex generated the array to
contain 32-bit integers.  By reimplementing some of the bulkier scanner
rules, this patch reduces the array to 20495 elements.  The much smaller
total length, combined with the consequent use of 16-bit integers for
the array elements, reduces the binary size by over 200kB.  This was
accomplished in two ways:

1. Consolidate handling of quote continuations into a new start condition,
rather than duplicating that logic for five different string types.

2. Treat Unicode strings and identifiers followed by a UESCAPE sequence
as three separate tokens, rather than one.  The logic to de-escape
Unicode strings is moved to the filter code in parser.c, which already
had the ability to provide special processing for token sequences.
While we could have implemented the conversion in the grammar, that
approach was rejected for performance and maintainability reasons.

Performance in microbenchmarks of raw parsing seems equal or slightly
faster in most cases, and it's reasonable to expect that in real-world
usage (with more competition for the CPU cache) there will be a larger
win.  The exception is UESCAPE sequences; lexing those is about 10%
slower, primarily because the scanner now has to be called three times
rather than one.  This seems acceptable since that feature is very
rarely used.

The psql and ecpg lexers are likewise modified, primarily because we
want to keep them all in sync.  Since those lexers don't use the
space-hogging -CF option, the space savings is much less, but it's
still good for perhaps 10kB apiece.

While at it, merge the ecpg lexer's handling of C-style comments used
in SQL and in C.  Those have different rules regarding nested comments,
but since we already have the ability to keep track of the previous
start condition, we can use that to handle both cases within a single
start condition.  This matches the core scanner more closely.

John Naylor

Discussion: https://postgr.es/m/CACPNZCvaoa3EgVWm5yZhcSTX6RAtaLgniCPcBVOCwm8h3xpWkw@mail.gmail.com
This commit is contained in:
Tom Lane
2020-01-13 15:04:31 -05:00
parent 259bbe1778
commit 7f380c59f8
19 changed files with 676 additions and 624 deletions

View File

@ -114,12 +114,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
* <xd> delimited identifiers (double-quoted identifiers)
* <xh> hexadecimal numeric string
* <xq> standard quoted strings
* <xqs> quote stop (detect continued strings)
* <xe> extended quoted strings (support backslash escape sequences)
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
* <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
* <xus> quoted string with Unicode escapes
* <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
*
* Note: we intentionally don't mimic the backend's <xeu> state; we have
* no need to distinguish it from <xe> state, and no good way to get out
@ -132,12 +131,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
%x xd
%x xh
%x xq
%x xqs
%x xe
%x xdolq
%x xui
%x xuiend
%x xus
%x xusend
/*
* In order to make the world safe for Windows and Mac clients as well as
@ -177,19 +175,18 @@ special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
/*
* To ensure that {quotecontinue} can be scanned without having to back up
* if the full pattern isn't matched, we include trailing whitespace in
* {quotestop}. This matches all cases where {quotecontinue} fails to match,
* except for {quote} followed by whitespace and just one "-" (not two,
* which would start a {comment}). To cover that we have {quotefail}.
* The actions for {quotestop} and {quotefail} must throw back characters
* beyond the quote proper.
*/
quote '
quotestop {quote}{whitespace}*
quotecontinue {quote}{whitespace_with_newline}{quote}
quotefail {quote}{whitespace}*"-"
/* If we see {quote} then {quotecontinue}, the quoted string continues */
quotecontinue {whitespace_with_newline}{quote}
/*
* {quotecontinuefail} is needed to avoid lexer backup when we fail to match
* {quotecontinue}. It might seem that this could just be {whitespace}*,
* but if there's a dash after {whitespace_with_newline}, it must be consumed
* to see if there's another dash --- which would start a {comment} and thus
* allow continuation of the {quotecontinue} token.
*/
quotecontinuefail {whitespace}*"-"?
/* Bit string
* It is tempting to scan the string for only those characters
@ -250,21 +247,12 @@ xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+
/* Unicode escapes */
uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
/* error rule to avoid backup */
uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
/* Quoted identifier with Unicode escapes */
xuistart [uU]&{dquote}
/* Quoted string with Unicode escapes */
xusstart [uU]&{quote}
/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
xustop1 {uescapefail}?
xustop2 {uescape}
/* error rule to avoid backup */
xufailed [uU]&
@ -438,20 +426,10 @@ other .
BEGIN(xb);
ECHO;
}
<xb>{quotestop} |
<xb>{quotefail} {
yyless(1);
BEGIN(INITIAL);
ECHO;
}
<xh>{xhinside} |
<xb>{xbinside} {
ECHO;
}
<xh>{quotecontinue} |
<xb>{quotecontinue} {
ECHO;
}
{xhstart} {
/* Hexadecimal bit type.
@ -463,12 +441,6 @@ other .
BEGIN(xh);
ECHO;
}
<xh>{quotestop} |
<xh>{quotefail} {
yyless(1);
BEGIN(INITIAL);
ECHO;
}
{xnstart} {
yyless(1); /* eat only 'n' this time */
@ -490,32 +462,41 @@ other .
BEGIN(xus);
ECHO;
}
<xq,xe>{quotestop} |
<xq,xe>{quotefail} {
yyless(1);
BEGIN(INITIAL);
<xb,xh,xq,xe,xus>{quote} {
/*
* When we are scanning a quoted string and see an end
* quote, we must look ahead for a possible continuation.
* If we don't see one, we know the end quote was in fact
* the end of the string. To reduce the lexer table size,
* we use a single "xqs" state to do the lookahead for all
* types of strings.
*/
cur_state->state_before_str_stop = YYSTATE;
BEGIN(xqs);
ECHO;
}
<xus>{quotestop} |
<xus>{quotefail} {
/* throw back all but the quote */
yyless(1);
BEGIN(xusend);
<xqs>{quotecontinue} {
/*
* Found a quote continuation, so return to the in-quote
* state and continue scanning the literal. Nothing is
* added to the literal's contents.
*/
BEGIN(cur_state->state_before_str_stop);
ECHO;
}
<xusend>{whitespace} {
ECHO;
}
<xusend>{other} |
<xusend>{xustop1} {
<xqs>{quotecontinuefail} |
<xqs>{other} {
/*
* Failed to see a quote continuation. Throw back
* everything after the end quote, and handle the string
* according to the state we were in previously.
*/
yyless(0);
BEGIN(INITIAL);
ECHO;
}
<xusend>{xustop2} {
BEGIN(INITIAL);
ECHO;
/* There's nothing to echo ... */
}
<xq,xe,xus>{xqdouble} {
ECHO;
}
@ -540,9 +521,6 @@ other .
<xe>{xehexesc} {
ECHO;
}
<xq,xe,xus>{quotecontinue} {
ECHO;
}
<xe>. {
/* This is only needed for \ just before EOF */
ECHO;
@ -599,21 +577,7 @@ other .
BEGIN(INITIAL);
ECHO;
}
<xui>{dquote} {
yyless(1);
BEGIN(xuiend);
ECHO;
}
<xuiend>{whitespace} {
ECHO;
}
<xuiend>{other} |
<xuiend>{xustop1} {
yyless(0);
BEGIN(INITIAL);
ECHO;
}
<xuiend>{xustop2} {
<xui>{dquote} {
BEGIN(INITIAL);
ECHO;
}
@ -1084,8 +1048,7 @@ psql_scan(PsqlScanState state,
switch (state->start_state)
{
case INITIAL:
case xuiend: /* we treat these like INITIAL */
case xusend:
case xqs: /* we treat this like INITIAL */
if (state->paren_depth > 0)
{
result = PSCAN_INCOMPLETE;
@ -1240,7 +1203,8 @@ psql_scan_reselect_sql_lexer(PsqlScanState state)
bool
psql_scan_in_quote(PsqlScanState state)
{
return state->start_state != INITIAL;
return state->start_state != INITIAL &&
state->start_state != xqs;
}
/*