Reduce size of backend scanner's tables.

Previously, the core scanner's yy_transition[] array had 37045 elements. Since that number is larger than INT16_MAX, Flex generated the array to contain 32-bit integers. By reimplementing some of the bulkier scanner rules, this patch reduces the array to 20495 elements. The much smaller total length, combined with the consequent use of 16-bit integers for the array elements reduces the binary size by over 200kB. This was accomplished in two ways: 1. Consolidate handling of quote continuations into a new start condition, rather than duplicating that logic for five different string types. 2. Treat Unicode strings and identifiers followed by a UESCAPE sequence as three separate tokens, rather than one. The logic to de-escape Unicode strings is moved to the filter code in parser.c, which already had the ability to provide special processing for token sequences. While we could have implemented the conversion in the grammar, that approach was rejected for performance and maintainability reasons. Performance in microbenchmarks of raw parsing seems equal or slightly faster in most cases, and it's reasonable to expect that in real-world usage (with more competition for the CPU cache) there will be a larger win. The exception is UESCAPE sequences; lexing those is about 10% slower, primarily because the scanner now has to be called three times rather than one. This seems acceptable since that feature is very rarely used. The psql and epcg lexers are likewise modified, primarily because we want to keep them all in sync. Since those lexers don't use the space-hogging -CF option, the space savings is much less, but it's still good for perhaps 10kB apiece. While at it, merge the ecpg lexer's handling of C-style comments used in SQL and in C. Those have different rules regarding nested comments, but since we already have the ability to keep track of the previous start condition, we can use that to handle both cases within a single start condition. This matches the core scanner more closely. John Naylor Discussion: https://postgr.es/m/CACPNZCvaoa3EgVWm5yZhcSTX6RAtaLgniCPcBVOCwm8h3xpWkw@mail.gmail.com
2025-07-02 09:02:37 +03:00 · 2020-01-13 15:04:31 -05:00
parent 259bbe1778
commit 7f380c59f8
19 changed files with 676 additions and 624 deletions
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@ -114,12 +114,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
 *  <xd> delimited identifiers (double-quoted identifiers)
 *  <xh> hexadecimal numeric string
 *  <xq> standard quoted strings
+ *  <xqs> quote stop (detect continued strings)
 *  <xe> extended quoted strings (support backslash escape sequences)
 *  <xdolq> $foo$ quoted strings
 *  <xui> quoted identifier with Unicode escapes
- *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
 *  <xus> quoted string with Unicode escapes
- *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
 *
 * Note: we intentionally don't mimic the backend's <xeu> state; we have
 * no need to distinguish it from <xe> state, and no good way to get out
@ -132,12 +131,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
 %x xd
 %x xh
 %x xq
+%x xqs
 %x xe
 %x xdolq
 %x xui
-%x xuiend
 %x xus
-%x xusend

 /*
 * In order to make the world safe for Windows and Mac clients as well as
@ -177,19 +175,18 @@ special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

-/*
- * To ensure that {quotecontinue} can be scanned without having to back up
- * if the full pattern isn't matched, we include trailing whitespace in
- * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
- * except for {quote} followed by whitespace and just one "-" (not two,
- * which would start a {comment}).  To cover that we have {quotefail}.
- * The actions for {quotestop} and {quotefail} must throw back characters
- * beyond the quote proper.
- */
 quote			'
-quotestop		{quote}{whitespace}*
-quotecontinue	{quote}{whitespace_with_newline}{quote}
-quotefail		{quote}{whitespace}*"-"
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue	{whitespace_with_newline}{quote}
+
+/*
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}.  It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
+ */
+quotecontinuefail	{whitespace}*"-"?

 /* Bit string
 * It is tempting to scan the string for only those characters
@ -250,21 +247,12 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+

-/* Unicode escapes */
-uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
-/* error rule to avoid backup */
-uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
-
 /* Quoted identifier with Unicode escapes */
 xuistart		[uU]&{dquote}

 /* Quoted string with Unicode escapes */
 xusstart		[uU]&{quote}

-/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
-xustop1		{uescapefail}?
-xustop2		{uescape}
-
 /* error rule to avoid backup */
 xufailed		[uU]&

@ -438,20 +426,10 @@ other			.
 					BEGIN(xb);
 					ECHO;
 				}
-<xb>{quotestop}	|
-<xb>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					ECHO;
-				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					ECHO;
 				}
-<xh>{quotecontinue}	|
-<xb>{quotecontinue}	{
-					ECHO;
-				}

 {xhstart}		{
 					/* Hexadecimal bit type.
@ -463,12 +441,6 @@ other			.
 					BEGIN(xh);
 					ECHO;
 				}
-<xh>{quotestop}	|
-<xh>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
-					ECHO;
-				}

 {xnstart}		{
 					yyless(1);	/* eat only 'n' this time */
@ -490,32 +462,41 @@ other			.
 					BEGIN(xus);
 					ECHO;
 				}
-<xq,xe>{quotestop}	|
-<xq,xe>{quotefail} {
-					yyless(1);
-					BEGIN(INITIAL);
+
+<xb,xh,xq,xe,xus>{quote} {
+					/*
+					 * When we are scanning a quoted string and see an end
+					 * quote, we must look ahead for a possible continuation.
+					 * If we don't see one, we know the end quote was in fact
+					 * the end of the string.  To reduce the lexer table size,
+					 * we use a single "xqs" state to do the lookahead for all
+					 * types of strings.
+					 */
+					cur_state->state_before_str_stop = YYSTATE;
+					BEGIN(xqs);
 					ECHO;
 				}
-<xus>{quotestop} |
-<xus>{quotefail} {
-					/* throw back all but the quote */
-					yyless(1);
-					BEGIN(xusend);
+<xqs>{quotecontinue} {
+					/*
+					 * Found a quote continuation, so return to the in-quote
+					 * state and continue scanning the literal.  Nothing is
+					 * added to the literal's contents.
+					 */
+					BEGIN(cur_state->state_before_str_stop);
 					ECHO;
 				}
-<xusend>{whitespace} {
-					ECHO;
-				}
-<xusend>{other} |
-<xusend>{xustop1} {
+<xqs>{quotecontinuefail} |
+<xqs>{other}	{
+					/*
+					 * Failed to see a quote continuation.  Throw back
+					 * everything after the end quote, and handle the string
+					 * according to the state we were in previously.
+					 */
 					yyless(0);
 					BEGIN(INITIAL);
-					ECHO;
-				}
-<xusend>{xustop2} {
-					BEGIN(INITIAL);
-					ECHO;
+					/* There's nothing to echo ... */
 				}
+
 <xq,xe,xus>{xqdouble} {
 					ECHO;
 				}
@ -540,9 +521,6 @@ other			.
 <xe>{xehexesc}  {
 					ECHO;
 				}
-<xq,xe,xus>{quotecontinue} {
-					ECHO;
-				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					ECHO;
@ -599,21 +577,7 @@ other			.
 					BEGIN(INITIAL);
 					ECHO;
 				}
-<xui>{dquote} {
-					yyless(1);
-					BEGIN(xuiend);
-					ECHO;
-				}
-<xuiend>{whitespace} {
-					ECHO;
-				}
-<xuiend>{other} |
-<xuiend>{xustop1} {
-					yyless(0);
-					BEGIN(INITIAL);
-					ECHO;
-				}
-<xuiend>{xustop2}	{
+<xui>{dquote}	{
 					BEGIN(INITIAL);
 					ECHO;
 				}
@ -1084,8 +1048,7 @@ psql_scan(PsqlScanState state,
 			switch (state->start_state)
 			{
 				case INITIAL:
-				case xuiend:	/* we treat these like INITIAL */
-				case xusend:
+				case xqs:		/* we treat this like INITIAL */
 					if (state->paren_depth > 0)
 					{
 						result = PSCAN_INCOMPLETE;
@ -1240,7 +1203,8 @@ psql_scan_reselect_sql_lexer(PsqlScanState state)
 bool
 psql_scan_in_quote(PsqlScanState state)
 {
-	return state->start_state != INITIAL;
+	return state->start_state != INITIAL &&
+			state->start_state != xqs;
 }

 /*