Reduce size of backend scanner's tables.

Previously, the core scanner's yy_transition[] array had 37045 elements. Since that number is larger than INT16_MAX, Flex generated the array to contain 32-bit integers. By reimplementing some of the bulkier scanner rules, this patch reduces the array to 20495 elements. The much smaller total length, combined with the consequent use of 16-bit integers for the array elements, reduces the binary size by over 200kB. This was accomplished in two ways: 1. Consolidate handling of quote continuations into a new start condition, rather than duplicating that logic for five different string types. 2. Treat Unicode strings and identifiers followed by a UESCAPE sequence as three separate tokens, rather than one. The logic to de-escape Unicode strings is moved to the filter code in parser.c, which already had the ability to provide special processing for token sequences. While we could have implemented the conversion in the grammar, that approach was rejected for performance and maintainability reasons. Performance in microbenchmarks of raw parsing seems equal or slightly faster in most cases, and it's reasonable to expect that in real-world usage (with more competition for the CPU cache) there will be a larger win. The exception is UESCAPE sequences; lexing those is about 10% slower, primarily because the scanner now has to be called three times rather than once. This seems acceptable since that feature is very rarely used. The psql and ecpg lexers are likewise modified, primarily because we want to keep them all in sync. Since those lexers don't use the space-hogging -CF option, the space savings are much smaller, but they are still good for perhaps 10kB apiece. While at it, merge the ecpg lexer's handling of C-style comments used in SQL and in C. Those have different rules regarding nested comments, but since we already have the ability to keep track of the previous start condition, we can use that to handle both cases within a single start condition.
This matches the core scanner more closely. John Naylor Discussion: https://postgr.es/m/CACPNZCvaoa3EgVWm5yZhcSTX6RAtaLgniCPcBVOCwm8h3xpWkw@mail.gmail.com
2025-11-21 00:42:43 +03:00 · 2020-01-13 15:04:31 -05:00
parent 259bbe1778
commit 7f380c59f8
19 changed files with 676 additions and 624 deletions
--- a/src/interfaces/ecpg/preproc/ecpg.tokens
+++ b/src/interfaces/ecpg/preproc/ecpg.tokens
@@ -24,4 +24,3 @@
                S_TYPEDEF

 %token CSTRING CVARIABLE CPP_LINE IP
-%token DOLCONST ECONST NCONST UCONST UIDENT
--- a/src/interfaces/ecpg/preproc/ecpg.trailer
+++ b/src/interfaces/ecpg/preproc/ecpg.trailer
@@ -1719,46 +1719,12 @@ ecpg_bconst:	BCONST		{ $$ = make_name(); } ;

 ecpg_fconst:	FCONST		{ $$ = make_name(); } ;

-ecpg_sconst:
-		SCONST
-		{
-			/* could have been input as '' or $$ */
-			$$ = (char *)mm_alloc(strlen($1) + 3);
-			$$[0]='\'';
-			strcpy($$+1, $1);
-			$$[strlen($1)+1]='\'';
-			$$[strlen($1)+2]='\0';
-			free($1);
-		}
-		| ECONST
-		{
-			$$ = (char *)mm_alloc(strlen($1) + 4);
-			$$[0]='E';
-			$$[1]='\'';
-			strcpy($$+2, $1);
-			$$[strlen($1)+2]='\'';
-			$$[strlen($1)+3]='\0';
-			free($1);
-		}
-		| NCONST
-		{
-			$$ = (char *)mm_alloc(strlen($1) + 4);
-			$$[0]='N';
-			$$[1]='\'';
-			strcpy($$+2, $1);
-			$$[strlen($1)+2]='\'';
-			$$[strlen($1)+3]='\0';
-			free($1);
-		}
-		| UCONST	{ $$ = $1; }
-		| DOLCONST	{ $$ = $1; }
-		;
+ecpg_sconst:	SCONST		{ $$ = $1; } ;

 ecpg_xconst:	XCONST		{ $$ = make_name(); } ;

-ecpg_ident:	IDENT		{ $$ = make_name(); }
+ecpg_ident:	IDENT		{ $$ = $1; }
 		| CSTRING	{ $$ = make3_str(mm_strdup("\""), $1, mm_strdup("\"")); }
-		| UIDENT	{ $$ = $1; }
 		;

 quoted_ident_stringvar: name
--- a/src/interfaces/ecpg/preproc/ecpg.type
+++ b/src/interfaces/ecpg/preproc/ecpg.type
@@ -122,12 +122,8 @@
 %type <str> CSTRING
 %type <str> CPP_LINE
 %type <str> CVARIABLE
-%type <str> DOLCONST
-%type <str> ECONST
-%type <str> NCONST
 %type <str> SCONST
-%type <str> UCONST
-%type <str> UIDENT
+%type <str> IDENT

 %type  <struct_union> s_struct_union_symbol

--- a/src/interfaces/ecpg/preproc/parse.pl
+++ b/src/interfaces/ecpg/preproc/parse.pl
@@ -218,8 +218,8 @@ sub main
 				if ($a eq 'IDENT' && $prior eq '%nonassoc')
 				{

-					# add two more tokens to the list
-					$str = $str . "\n%nonassoc CSTRING\n%nonassoc UIDENT";
+					# add more tokens to the list
+					$str = $str . "\n%nonassoc CSTRING";
 				}
 				$prior = $a;
 			}
--- a/src/interfaces/ecpg/preproc/parser.c
+++ b/src/interfaces/ecpg/preproc/parser.c
@@ -6,6 +6,9 @@
 * This should match src/backend/parser/parser.c, except that we do not
 * need to bother with re-entrant interfaces.
 *
+ * Note: ECPG doesn't report error location like the backend does.
+ * This file will need work if we ever want it to.
+ *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -27,8 +30,9 @@ static int	lookahead_token;	/* one-token lookahead */
 static YYSTYPE lookahead_yylval;	/* yylval for lookahead token */
 static YYLTYPE lookahead_yylloc;	/* yylloc for lookahead token */
 static char *lookahead_yytext;	/* start current token */
-static char *lookahead_end;		/* end of current token */
-static char lookahead_hold_char;	/* to be put back at *lookahead_end */
+
+static bool check_uescapechar(unsigned char escape);
+static bool ecpg_isspace(char ch);


 /*
@@ -43,13 +47,16 @@ static char lookahead_hold_char;	/* to be put back at *lookahead_end */
 * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
+ *
+ * We also use this filter to convert UIDENT and USCONST sequences into
+ * plain IDENT and SCONST tokens.  While that could be handled by additional
+ * productions in the main grammar, it's more efficient to do it like this.
 */
 int
 filtered_base_yylex(void)
 {
 	int			cur_token;
 	int			next_token;
-	int			cur_token_length;
 	YYSTYPE		cur_yylval;
 	YYLTYPE		cur_yylloc;
 	char	   *cur_yytext;
@@ -61,41 +68,26 @@ filtered_base_yylex(void)
 		base_yylval = lookahead_yylval;
 		base_yylloc = lookahead_yylloc;
 		base_yytext = lookahead_yytext;
-		*lookahead_end = lookahead_hold_char;
 		have_lookahead = false;
 	}
 	else
 		cur_token = base_yylex();

 	/*
-	 * If this token isn't one that requires lookahead, just return it.  If it
-	 * does, determine the token length.  (We could get that via strlen(), but
-	 * since we have such a small set of possibilities, hardwiring seems
-	 * feasible and more efficient.)
+	 * If this token isn't one that requires lookahead, just return it.
 	 */
 	switch (cur_token)
 	{
 		case NOT:
-			cur_token_length = 3;
-			break;
 		case NULLS_P:
-			cur_token_length = 5;
-			break;
 		case WITH:
-			cur_token_length = 4;
+		case UIDENT:
+		case USCONST:
 			break;
 		default:
 			return cur_token;
 	}

-	/*
-	 * Identify end+1 of current token.  base_yylex() has temporarily stored a
-	 * '\0' here, and will undo that when we call it again.  We need to redo
-	 * it to fully revert the lookahead call for error reporting purposes.
-	 */
-	lookahead_end = base_yytext + cur_token_length;
-	Assert(*lookahead_end == '\0');
-
 	/* Save and restore lexer output variables around the call */
 	cur_yylval = base_yylval;
 	cur_yylloc = base_yylloc;
@@ -113,10 +105,6 @@ filtered_base_yylex(void)
 	base_yylloc = cur_yylloc;
 	base_yytext = cur_yytext;

-	/* Now revert the un-truncation of the current token */
-	lookahead_hold_char = *lookahead_end;
-	*lookahead_end = '\0';
-
 	have_lookahead = true;

 	/* Replace cur_token if needed, based on lookahead */
@@ -157,7 +145,87 @@ filtered_base_yylex(void)
 					break;
 			}
 			break;
+		case UIDENT:
+		case USCONST:
+			/* Look ahead for UESCAPE */
+			if (next_token == UESCAPE)
+			{
+				/* Yup, so get third token, which had better be SCONST */
+				const char *escstr;
+
+				/*
+				 * Again save and restore lexer output variables around the
+				 * call
+				 */
+				cur_yylval = base_yylval;
+				cur_yylloc = base_yylloc;
+				cur_yytext = base_yytext;
+
+				/* Get third token */
+				next_token = base_yylex();
+
+				if (next_token != SCONST)
+					mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
+
+				/*
+				 * Save and check escape string, which the scanner returns
+				 * with quotes
+				 */
+				escstr = base_yylval.str;
+				if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
+					mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
+
+				base_yylval = cur_yylval;
+				base_yylloc = cur_yylloc;
+				base_yytext = cur_yytext;
+
+				/* Combine 3 tokens into 1 */
+				base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
+
+				/* Clear have_lookahead, thereby consuming all three tokens */
+				have_lookahead = false;
+			}
+
+			if (cur_token == UIDENT)
+				cur_token = IDENT;
+			else if (cur_token == USCONST)
+				cur_token = SCONST;
+			break;
 	}

 	return cur_token;
 }
+
+/*
+ * check_uescapechar() and ecpg_isspace() should match their equivalents
+ * in pgc.l.
+ */
+
+/* Is 'escape' acceptable as a UESCAPE character?  Returns false for a hex digit, a plus sign, a single or double quote, or anything ecpg_isspace() accepts; true otherwise. */
+static bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| ecpg_isspace(escape))
+		return false;
+	else
+		return true;
+}
+
+/*
+ * ecpg_isspace() --- return true if ch is a space, tab, newline, carriage return or form feed, i.e. the characters the flex scanner considers whitespace
+ */
+static bool
+ecpg_isspace(char ch)
+{
+	if (ch == ' ' ||
+		ch == '\t' ||
+		ch == '\n' ||
+		ch == '\r' ||
+		ch == '\f')
+		return true;
+	return false;
+}
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -6,6 +6,9 @@
 *
 * This is a modified version of src/backend/parser/scan.l
 *
+ * The ecpg scanner is not backup-free, so the fail rules are
+ * only here to simplify syncing this file with scan.l.
+ *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -61,7 +64,10 @@ static bool isdefine(void);
 static bool isinformixdefine(void);

 char *token_start;
-static int state_before;
+
+/* vars to keep track of start conditions when scanning literals */
+static int state_before_str_start;
+static int state_before_str_stop;

 struct _yy_buffer
 {
@@ -105,13 +111,13 @@ static struct _if_value
 * and to eliminate parsing troubles for numeric strings.
 * Exclusive states:
 *  <xb> bit string literal
- *  <xcc> extended C-style comments in C
- *  <xcsql> extended C-style comments in SQL
+ *  <xc> extended C-style comments
 *  <xd> delimited identifiers (double-quoted identifiers)
 *  <xdc> double-quoted strings in C
 *  <xh> hexadecimal numeric string
 *  <xn> national character quoted strings
 *  <xq> standard quoted strings
+ *  <xqs> quote stop (detect continued strings)
 *  <xe> extended quoted strings (support backslash escape sequences)
 *  <xqc> single-quoted strings in C
 *  <xdolq> $foo$ quoted strings
@@ -120,18 +126,21 @@ static struct _if_value
 *  <xcond> condition of an EXEC SQL IFDEF construct
 *  <xskip> skipping the inactive part of an EXEC SQL IFDEF construct
 *
+ * Note: we intentionally don't mimic the backend's <xeu> state; we have
+ * no need to distinguish it from <xe> state.
+ *
 * Remember to add an <<EOF>> case whenever you add a new exclusive state!
 * The default one is probably not the right thing.
 */

 %x xb
-%x xcc
-%x xcsql
+%x xc
 %x xd
 %x xdc
 %x xh
 %x xn
 %x xq
+%x xqs
 %x xe
 %x xqc
 %x xdolq
@@ -181,9 +190,17 @@ horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{whitespace}*)

 quote			'
-quotestop		{quote}{whitespace}*
-quotecontinue	{quote}{whitespace_with_newline}{quote}
-quotefail		{quote}{whitespace}*"-"
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue	{whitespace_with_newline}{quote}
+
+/*
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}.  It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
+ */
+quotecontinuefail	{whitespace}*"-"?

 /* Bit string
 */
@@ -237,19 +254,11 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+

-/* Unicode escapes */
-/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are
- * not needed here, but could be added if desired.)
- */
-uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
-
 /* Quoted identifier with Unicode escapes */
 xuistart		[uU]&{dquote}
-xuistop			{dquote}({whitespace}*{uescape})?

 /* Quoted string with Unicode escapes */
 xusstart		[uU]&{quote}
-xusstop			{quote}({whitespace}*{uescape})?

 /* special stuff for C strings */
 xdcqq			\\\\
@@ -408,54 +417,58 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 {whitespace}	{
 					/* ignore */
 				}
-
-{xcstart}		{
-					token_start = yytext;
-					state_before = YYSTATE;
-					xcdepth = 0;
-					BEGIN(xcsql);
-					/* Put back any characters past slash-star; see above */
-					yyless(2);
-					fputs("/*", yyout);
-				}
 } /* <SQL> */

-<C>{xcstart}	{
+<C,SQL>{
+{xcstart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					xcdepth = 0;
-					BEGIN(xcc);
+					BEGIN(xc);
 					/* Put back any characters past slash-star; see above */
 					yyless(2);
 					fputs("/*", yyout);
 				}
-<xcc>{xcstart}	{ ECHO; }
-<xcsql>{xcstart}	{
-					xcdepth++;
-					/* Put back any characters past slash-star; see above */
-					yyless(2);
-					fputs("/_*", yyout);
-				}
-<xcsql>{xcstop}	{
-					if (xcdepth <= 0)
+} /* <C,SQL> */
+
+<xc>{
+{xcstart}		{
+					if (state_before_str_start == SQL)
+					{
+						xcdepth++;
+						/* Put back any characters past slash-star; see above */
+						yyless(2);
+						fputs("/_*", yyout);
+					}
+					else if (state_before_str_start == C)
 					{
 						ECHO;
-						BEGIN(state_before);
-						token_start = NULL;
 					}
-					else
-					{
-						xcdepth--;
-						fputs("*_/", yyout);
-					}
-				}
-<xcc>{xcstop}	{
-					ECHO;
-					BEGIN(state_before);
-					token_start = NULL;
 				}

-<xcc,xcsql>{
+{xcstop}		{
+					if (state_before_str_start == SQL)
+					{
+						if (xcdepth <= 0)
+						{
+							ECHO;
+							BEGIN(SQL);
+							token_start = NULL;
+						}
+						else
+						{
+							xcdepth--;
+							fputs("*_/", yyout);
+						}
+					}
+					else if (state_before_str_start == C)
+					{
+						ECHO;
+						BEGIN(C);
+						token_start = NULL;
+					}
+				}
+
 {xcinside}		{
 					ECHO;
 				}
@@ -471,7 +484,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 <<EOF>>			{
 					mmfatal(PARSE_ERROR, "unterminated /* comment");
 				}
-} /* <xcc,xcsql> */
+} /* <xc> */

 <SQL>{
 {xbstart}		{
@@ -482,23 +495,10 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 				}
 } /* <SQL> */

-<xb>{quotestop}	|
-<xb>{quotefail} {
-					yyless(1);
-					BEGIN(SQL);
-					if (literalbuf[strspn(literalbuf, "01") + 1] != '\0')
-						mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal");
-					base_yylval.str = mm_strdup(literalbuf);
-					return BCONST;
-				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
 					addlit(yytext, yyleng);
 				}
-<xh>{quotecontinue}	|
-<xb>{quotecontinue}	{
-					/* ignore */
-				}
 <xb><<EOF>>		{ mmfatal(PARSE_ERROR, "unterminated bit string literal"); }

 <SQL>{xhstart}	{
@@ -507,19 +507,11 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 					startlit();
 					addlitchar('x');
 				}
-<xh>{quotestop}	|
-<xh>{quotefail} {
-					yyless(1);
-					BEGIN(SQL);
-					base_yylval.str = mm_strdup(literalbuf);
-					return XCONST;
-				}
-
 <xh><<EOF>>		{ mmfatal(PARSE_ERROR, "unterminated hexadecimal string literal"); }

 <C>{xqstart}	{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xqc);
 					startlit();
 				}
@@ -530,59 +522,91 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 					 * Transfer it as-is to the backend.
 					 */
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xn);
 					startlit();
 				}

 {xqstart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xq);
 					startlit();
 				}
 {xestart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xe);
 					startlit();
 				}
 {xusstart}		{
 					token_start = yytext;
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xus);
 					startlit();
-					addlit(yytext, yyleng);
 				}
 } /* <SQL> */

-<xq,xqc>{quotestop} |
-<xq,xqc>{quotefail} {
-					yyless(1);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return SCONST;
+<xb,xh,xq,xqc,xe,xn,xus>{quote} {
+					/*
+					 * When we are scanning a quoted string and see an end
+					 * quote, we must look ahead for a possible continuation.
+					 * If we don't see one, we know the end quote was in fact
+					 * the end of the string.  To reduce the lexer table size,
+					 * we use a single "xqs" state to do the lookahead for all
+					 * types of strings.
+					 */
+					state_before_str_stop = YYSTATE;
+					BEGIN(xqs);
 				}
-<xe>{quotestop} |
-<xe>{quotefail} {
-					yyless(1);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return ECONST;
+<xqs>{quotecontinue} {
+					/*
+					 * Found a quote continuation, so return to the in-quote
+					 * state and continue scanning the literal.  Nothing is
+					 * added to the literal's contents.
+					 */
+					BEGIN(state_before_str_stop);
 				}
-<xn>{quotestop} |
-<xn>{quotefail} {
-					yyless(1);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return NCONST;
-				}
-<xus>{xusstop} {
-					addlit(yytext, yyleng);
-					BEGIN(state_before);
-					base_yylval.str = mm_strdup(literalbuf);
-					return UCONST;
+<xqs>{quotecontinuefail} |
+<xqs>{other} |
+<xqs><<EOF>>	{
+					/*
+					 * Failed to see a quote continuation.  Throw back
+					 * everything after the end quote, and handle the string
+					 * according to the state we were in previously.
+					 */
+					yyless(0);
+					BEGIN(state_before_str_start);
+
+					switch (state_before_str_stop)
+					{
+						case xb:
+							if (literalbuf[strspn(literalbuf, "01") + 1] != '\0')
+								mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal");
+							base_yylval.str = mm_strdup(literalbuf);
+							return BCONST;
+						case xh:
+							base_yylval.str = mm_strdup(literalbuf);
+							return XCONST;
+						case xq:
+							/* fallthrough */
+						case xqc:
+							base_yylval.str = psprintf("'%s'", literalbuf);
+							return SCONST;
+						case xe:
+							base_yylval.str = psprintf("E'%s'", literalbuf);
+							return SCONST;
+						case xn:
+							base_yylval.str = psprintf("N'%s'", literalbuf);
+							return SCONST;
+						case xus:
+							base_yylval.str = psprintf("U&'%s'", literalbuf);
+							return USCONST;
+						default:
+							mmfatal(PARSE_ERROR, "unhandled previous state in xqs\n");
+					}
 				}
+
 <xq,xe,xn,xus>{xqdouble}	{ addlitchar('\''); }
 <xqc>{xqcquote}	{
 					addlitchar('\\');
@@ -604,9 +628,6 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 <xe>{xehexesc}  {
 					addlit(yytext, yyleng);
 				}
-<xq,xqc,xe,xn,xus>{quotecontinue}	{
-					/* ignore */
-				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					addlitchar(yytext[0]);
@@ -639,7 +660,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 						dolqstart = NULL;
 						BEGIN(SQL);
 						base_yylval.str = mm_strdup(literalbuf);
-						return DOLCONST;
+						return SCONST;
 					}
 					else
 					{
@@ -666,20 +687,19 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+

 <SQL>{
 {xdstart}		{
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xd);
 					startlit();
 				}
 {xuistart}		{
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xui);
 					startlit();
-					addlit(yytext, yyleng);
 				}
 } /* <SQL> */

 <xd>{xdstop}	{
-					BEGIN(state_before);
+					BEGIN(state_before_str_start);
 					if (literallen == 0)
 						mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
 					/* The backend will truncate the identifier here. We do not as it does not change the result. */
@@ -687,17 +707,16 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 					return CSTRING;
 				}
 <xdc>{xdstop}	{
-					BEGIN(state_before);
+					BEGIN(state_before_str_start);
 					base_yylval.str = mm_strdup(literalbuf);
 					return CSTRING;
 				}
-<xui>{xuistop}	{
-					BEGIN(state_before);
+<xui>{dquote}	{
+					BEGIN(state_before_str_start);
-					if (literallen == 2) /* "U&" */
+					if (literallen == 0)	/* the U& prefix is no longer stored in literalbuf, so an empty identifier has length 0 */
 						mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
 					/* The backend will truncate the identifier here. We do not as it does not change the result. */
-					addlit(yytext, yyleng);
-					base_yylval.str = mm_strdup(literalbuf);
+					base_yylval.str = psprintf("U&\"%s\"", literalbuf);
 					return UIDENT;
 				}
 <xd,xui>{xddouble}	{
@@ -708,7 +727,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 				}
 <xd,xui><<EOF>>	{ mmfatal(PARSE_ERROR, "unterminated quoted identifier"); }
 <C>{xdstart}	{
-					state_before = YYSTATE;
+					state_before_str_start = YYSTATE;
 					BEGIN(xdc);
 					startlit();
 				}
--- a/src/interfaces/ecpg/test/expected/preproc-strings.c
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.c
@@ -45,7 +45,7 @@ int main(void)
 #line 13 "strings.pgc"


-  { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select 'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$", ECPGt_EOIT, 
+  { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select 'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' UESCAPE '!' , $foo$abc$def$foo$", ECPGt_EOIT, 
 	ECPGt_char,&(s1),(long)0,(long)1,(1)*sizeof(char), 
 	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
 	ECPGt_char,&(s2),(long)0,(long)1,(1)*sizeof(char), 
--- a/src/interfaces/ecpg/test/expected/preproc-strings.stderr
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stderr
@@ -8,7 +8,7 @@
 [NO_PID]: sqlca: code: 0, state: 00000
 [NO_PID]: ecpg_process_output on line 13: OK: SET
 [NO_PID]: sqlca: code: 0, state: 00000
-[NO_PID]: ecpg_execute on line 15: query: select 'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$; with 0 parameter(s) on connection ecpg1_regression
+[NO_PID]: ecpg_execute on line 15: query: select 'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' UESCAPE '!' , $foo$abc$def$foo$; with 0 parameter(s) on connection ecpg1_regression
 [NO_PID]: sqlca: code: 0, state: 00000
 [NO_PID]: ecpg_execute on line 15: using PQexec
 [NO_PID]: sqlca: code: 0, state: 00000