Unicode escapes in strings and identifiers

2025-11-21 00:42:43 +03:00 · 2008-10-29 08:04:54 +00:00
parent 05bba3d176
commit 06735e3256
18 changed files with 638 additions and 59 deletions
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -12,7 +12,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.166 2008/05/20 23:17:32 meskes Exp $
+ *	  $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.167 2008/10/29 08:04:53 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -103,6 +103,8 @@ static struct _if_value
 *	<xe> extended quoted strings (support backslash escape sequences)
 *	<xn> national character quoted strings
 *  <xdolq> $foo$ quoted strings
+ *  <xui> quoted identifier with Unicode escapes
+ *  <xus> quoted string with Unicode escapes
 */

 %x xb
@@ -117,6 +119,8 @@ static struct _if_value
 %x xdolq
 %x xcond
 %x xskip
+%x xui
+%x xus

 /* Bit string
 */
@@ -172,6 +176,18 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+

+/* Unicode escapes */
+/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are not needed here, but could be added if desired.) */
+uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+
+/* Quoted identifier with Unicode escapes */
+xuistart		[uU]&{dquote}
+xuistop			{dquote}({whitespace}*{uescape})?
+
+/* Quoted string with Unicode escapes */
+xusstart		[uU]&{quote}
+xusstop			{quote}({whitespace}*{uescape})?
+
 /* special stuff for C strings */
 xdcqq			\\\\
 xdcqdq			\\\"
@@ -433,6 +449,13 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 				BEGIN(xe);
 				startlit();
 			}
+<SQL>{xusstart}	{
+				token_start = yytext;
+				state_before = YYSTATE;
+				BEGIN(xus);
+				startlit();
+				addlit(yytext, yyleng);
+			}
 <xq,xqc>{quotestop} |
 <xq,xqc>{quotefail} {
 				yyless(1);
@@ -454,22 +477,28 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 				yylval.str = mm_strdup(literalbuf);
 				return NCONST;
 			}
-<xq,xe,xn>{xqdouble}	{ addlitchar('\''); }
+<xus>{xusstop} {
+				addlit(yytext, yyleng);
+				BEGIN(state_before);
+				yylval.str = mm_strdup(literalbuf);
+				return UCONST;
+			}
+<xq,xe,xn,xus>{xqdouble}	{ addlitchar('\''); }
 <xqc>{xqcquote}		{
 				addlitchar('\\');
 				addlitchar('\'');
 			}
-<xq,xqc,xn>{xqinside}	{ addlit(yytext, yyleng); }
+<xq,xqc,xn,xus>{xqinside}	{ addlit(yytext, yyleng); }
 <xe>{xeinside}		{ addlit(yytext, yyleng); }
 <xe>{xeescape}  	{ addlit(yytext, yyleng); }
 <xe>{xeoctesc}		{ addlit(yytext, yyleng); }
 <xe>{xehexesc}		{ addlit(yytext, yyleng); }
-<xq,xqc,xe,xn>{quotecontinue}	{ /* ignore */ }
+<xq,xqc,xe,xn,xus>{quotecontinue}	{ /* ignore */ }
 <xe>.                   {
 			   /* This is only needed for \ just before EOF */
 			   addlitchar(yytext[0]);
 			}
-<xq,xqc,xe,xn><<EOF>>	{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); }
+<xq,xqc,xe,xn,xus><<EOF>>	{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); }
 <SQL>{dolqfailed}	{
 				/* throw back all but the initial "$" */
 				yyless(1);
@@ -515,6 +544,12 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 						BEGIN(xd);
 						startlit();
 					}
+<SQL>{xuistart}		{
+						state_before = YYSTATE;
+						BEGIN(xui);
+						startlit();
+						addlit(yytext, yyleng);
+					}
 <xd>{xdstop}		{
 						BEGIN(state_before);
 						if (literallen == 0)
@@ -528,9 +563,18 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 						yylval.str = mm_strdup(literalbuf);
 						return CSTRING;
 					}
-<xd>{xddouble}		{ addlitchar('"'); }
-<xd>{xdinside}		{ addlit(yytext, yyleng); }
-<xd,xdc><<EOF>>		{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); }
+<xui>{xuistop}		{
+						BEGIN(state_before);
+						if (literallen == 2) /* "U&" */
+							mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
+						/* The backend will truncate the identifier here. We do not, as it does not change the result. */
+						addlit(yytext, yyleng);
+						yylval.str = mm_strdup(literalbuf);
+						return UIDENT;
+					}
+<xd,xui>{xddouble}		{ addlitchar('"'); }
+<xd,xui>{xdinside}		{ addlit(yytext, yyleng); }
+<xd,xdc,xui><<EOF>>		{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); }
 <C,SQL>{xdstart}	{
 						state_before = YYSTATE;
 						BEGIN(xdc);
--- a/src/interfaces/ecpg/preproc/preproc.y
+++ b/src/interfaces/ecpg/preproc/preproc.y
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/preproc.y,v 1.379 2008/10/28 14:09:45 petere Exp $ */
+/* $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/preproc.y,v 1.380 2008/10/29 08:04:53 petere Exp $ */

 /* Copyright comment */
 %{
@@ -509,7 +509,7 @@ add_typedef(char *name, char * dimension, char * length, enum ECPGttype type_enu

 /* Special token types, not actually keywords - see the "lex" file */
 %token <str>	IDENT SCONST Op CSTRING CVARIABLE CPP_LINE IP BCONST
-%token <str>	XCONST DOLCONST ECONST NCONST
+%token <str>	XCONST DOLCONST ECONST NCONST UCONST UIDENT
 %token <ival>	ICONST PARAM
 %token <dval>	FCONST

@@ -4966,6 +4966,10 @@ Sconst:  SCONST
 			$$[strlen($1)+3]='\0';
 			free($1);
 		}
+	| UCONST
+		{
+			$$ = $1; 
+		}
 	| DOLCONST
 		{
 			$$ = $1; 
@@ -7013,6 +7017,7 @@ cvariable:	CVARIABLE
 		;
 ident: IDENT				{ $$ = $1; }
 		| CSTRING		{ $$ = make3_str(make_str("\""), $1, make_str("\"")); }
+		| UIDENT		{ $$ = $1; }
 		;

 quoted_ident_stringvar: name
--- a/src/interfaces/ecpg/test/ecpg_schedule
+++ b/src/interfaces/ecpg/test/ecpg_schedule
@@ -18,6 +18,7 @@ test: preproc/autoprep
 test: preproc/comment
 test: preproc/define
 test: preproc/init
+test: preproc/strings
 test: preproc/type
 test: preproc/variable
 test: preproc/whenever
--- a/src/interfaces/ecpg/test/ecpg_schedule_tcp
+++ b/src/interfaces/ecpg/test/ecpg_schedule_tcp
@@ -18,6 +18,7 @@ test: preproc/autoprep
 test: preproc/comment
 test: preproc/define
 test: preproc/init
+test: preproc/strings
 test: preproc/type
 test: preproc/variable
 test: preproc/whenever
--- a/src/interfaces/ecpg/test/expected/preproc-strings.c
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.c
@@ -0,0 +1,62 @@
+/* Processed by ecpg (regression mode) */
+/* These include files are added by the preprocessor */
+#include <ecpglib.h>
+#include <ecpgerrno.h>
+#include <sqlca.h>
+/* End of automatic include section */
+#define ECPGdebug(X,Y) ECPGdebug((X)+100,(Y))
+
+#line 1 "strings.pgc"
+#include <stdlib.h>
+
+
+#line 1 "regression.h"
+
+
+
+
+
+
+#line 3 "strings.pgc"
+
+
+/* exec sql begin declare section */
+      
+
+#line 6 "strings.pgc"
+ char * s1    , * s2    , * s3    , * s4    , * s5    , * s6    ;
+/* exec sql end declare section */
+#line 7 "strings.pgc"
+
+
+int main(void)
+{
+  ECPGdebug(1, stderr);
+
+  { ECPGconnect(__LINE__, 0, "regress1" , NULL, NULL , NULL, 0); }
+#line 13 "strings.pgc"
+
+
+  { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select  'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$     ", ECPGt_EOIT, 
+	ECPGt_char,&(s1),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s2),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s3),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s4),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s5),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s6),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, ECPGt_EORT);}
+#line 21 "strings.pgc"
+
+
+  printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6);
+
+  { ECPGdisconnect(__LINE__, "CURRENT");}
+#line 25 "strings.pgc"
+
+  exit (0);
+}
--- a/src/interfaces/ecpg/test/expected/preproc-strings.stderr
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stderr
@@ -0,0 +1,36 @@
+[NO_PID]: ECPGdebug: set to 1
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ECPGconnect: opening database regress1 on <DEFAULT> port <DEFAULT>  
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: query: select  'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$     ; with 0 parameter(s) on connection regress1
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: using PQexec
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: correctly got 1 tuples with 6 fields
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abc$def offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_finish: connection regress1 closed
+[NO_PID]: sqlca: code: 0, state: 00000
--- a/src/interfaces/ecpg/test/expected/preproc-strings.stdout
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stdout
@@ -0,0 +1 @@
+abcdef abcdef abcdef data data abc$def
--- a/src/interfaces/ecpg/test/preproc/Makefile
+++ b/src/interfaces/ecpg/test/preproc/Makefile
@@ -9,6 +9,7 @@ TESTS = array_of_struct array_of_struct.c \
 	comment comment.c \
 	define define.c \
 	init init.c \
+	strings strings.c \
 	type type.c \
 	variable variable.c \
 	whenever whenever.c
--- a/src/interfaces/ecpg/test/preproc/strings.pgc
+++ b/src/interfaces/ecpg/test/preproc/strings.pgc
@@ -0,0 +1,27 @@
+#include <stdlib.h>
+
+exec sql include ../regression;
+
+exec sql begin declare section;
+char *s1, *s2, *s3, *s4, *s5, *s6;
+exec sql end declare section;
+
+int main(void)
+{
+  ECPGdebug(1, stderr);
+
+  exec sql connect to REGRESSDB1;
+
+  exec sql select 'abcdef',
+                  N'abcdef' AS foo,
+                  E'abc\bdef' AS "foo",
+                  U&'d\0061t\0061' AS U&"foo",
+                  U&'d!+000061t!+000061' uescape '!',
+                  $foo$abc$def$foo$
+                  into :s1, :s2, :s3, :s4, :s5, :s6;
+
+  printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6);
+
+  exec sql disconnect;
+  exit (0);
+}