Unicode escapes in E'...' strings

Author: Marko Kreen <markokr@gmail.com>
2025-07-03 20:02:46 +03:00 · 2009-09-22 23:52:53 +00:00
parent 9048b73184
commit c2bb0378cf
3 changed files with 98 additions and 9 deletions
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.136 2009/09/22 23:52:53 petere Exp $ -->
 <chapter id="sql-syntax">
 <title>SQL Syntax</title>
@ -398,6 +398,14 @@ SELECT 'foo'      'bar';
        </entry>
        <entry>hexadecimal byte value</entry>
       </row>
       <row>
        <entry>
         <literal>\u<replaceable>xxxx</replaceable></literal>,
         <literal>\U<replaceable>xxxxxxxx</replaceable></literal>
         (<replaceable>x</replaceable> = 0 - 9, A - F)
        </entry>
        <entry>16 or 32-bit hexadecimal Unicode character value</entry>
       </row>
      </tbody>
      </tgroup>
     </table>
@ -411,13 +419,25 @@ SELECT 'foo'      'bar';
    </para>
    <para>
-     It is your responsibility that the byte sequences you create are
+     It is your responsibility that the byte sequences you create,
     especially when using the octal or hexadecimal escapes, compose
     valid characters in the server character set encoding.  When the
-     server encoding is UTF-8, then the alternative Unicode escape
+     server encoding is UTF-8, then the Unicode escapes or the
-     syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
+     alternative Unicode escape syntax, explained
-     should be used instead.  (The alternative would be doing the
+     in <xref linkend="sql-syntax-strings-uescape">, should be used
-     UTF-8 encoding by hand and writing out the bytes, which would be
+     instead.  (The alternative would be doing the UTF-8 encoding by
-     very cumbersome.)
+     hand and writing out the bytes, which would be very cumbersome.)
    </para>
    <para>
     The Unicode escape syntax works fully only when the server
     encoding is UTF-8.  When other server encodings are used, only
     code points in the ASCII range (up to <literal>\u007F</>) can be
     specified.  Both the 4-digit and the 8-digit form can be used to
     specify UTF-16 surrogate pairs to compose characters with code
     points larger than U+FFFF (although the
     availability of the 8-digit form technically makes this
     unnecessary).
    </para>
    <caution>
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@ -24,7 +24,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
 static char *litbufdup(base_yyscan_t yyscanner);
 static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
 static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
 static bool is_utf16_surrogate_first(pg_wchar c);
 static bool is_utf16_surrogate_second(pg_wchar c);
 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
 extern int	base_yyget_column(yyscan_t yyscanner);
 extern void base_yyset_column(int column_no, yyscan_t yyscanner);
 static void addunicode(pg_wchar c, yyscan_t yyscanner);
 %}
 %option reentrant
@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
 *  <xdolq> $foo$ quoted strings
 *  <xui> quoted identifier with Unicode escapes
 *  <xus> quoted string with Unicode escapes
 *  <xeu> Unicode surrogate pair in extended quoted string
 */
 %x xb
@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
 %x xdolq
 %x xui
 %x xus
 %x xeu
 /*
 * In order to make the world safe for Windows and Mac clients as well as
@ -223,6 +230,8 @@ xeinside		[^\\']+
 xeescape		[\\][^0-7]
 xeoctesc		[\\][0-7]{1,3}
 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
 xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
 xeunicodebad	[\\]([uU])
 /* Extended quote
 * xqdouble implements embedded quote, ''''
@ -535,6 +544,45 @@ other			.
 <xe>{xeinside}  {
 					addlit(yytext, yyleng, yyscanner);
 				}
 <xe>{xeunicode} {
 					/*
 					 * \uXXXX or \UXXXXXXXX inside an E'...' string.  yytext+2
 					 * skips the backslash and the u/U letter so that only the
 					 * hex digits are handed to strtoul().
 					 */
 					pg_wchar c = strtoul(yytext+2, NULL, 16);
 					check_escape_warning(yyscanner);
 					if (is_utf16_surrogate_first(c))
 					{
 						/* save the first half; the second half is expected next, in <xeu> */
 						yyextra->utf16_first_part = c;
 						BEGIN(xeu);
 					}
 					else if (is_utf16_surrogate_second(c))
 						/* a second half with no first half saved before it */
 						yyerror("invalid Unicode surrogate pair");
 					else
 						addunicode(c, yyscanner);
 				}
 <xeu>{xeunicode} {
 					/*
 					 * In <xeu> a first surrogate half has already been saved in
 					 * yyextra->utf16_first_part, so this escape must supply the
 					 * second half.  Any other input in this state falls through
 					 * to the catch-all patterns below this rule and is rejected
 					 * with the same message.
 					 */
 					pg_wchar c = strtoul(yytext+2, NULL, 16);
 					if (!is_utf16_surrogate_second(c))
 						yyerror("invalid Unicode surrogate pair");
 					/*
 					 * NOTE(review): the lines below assume yyerror() does not
 					 * return -- confirm against scanner_yyerror().
 					 */
 					c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
 					addunicode(c, yyscanner);
 					/* pair consumed; resume normal extended-string scanning */
 					BEGIN(xe);
 				}
 <xeu>.			|
 <xeu>\n			|
 <xeu><<EOF>>	{ yyerror("invalid Unicode surrogate pair"); }
 <xe>{xeunicodebad}	{
 						/*
 						 * A backslash followed by u or U that did not match
 						 * {xeunicode}, i.e. without the full 4 or 8 hex digits
 						 * (flex prefers the longer {xeunicode} match when one
 						 * exists).  This rule must stay ahead of {xeescape},
 						 * which matches the same two characters; on a tie flex
 						 * picks the rule listed first.
 						 */
 						ereport(ERROR,
 								(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 								 errmsg("invalid Unicode escape"),
 								 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
 								 lexer_errposition()));
 					}
 <xe>{xeescape}  {
 					if (yytext[1] == '\'')
 					{
@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
 	if (ptr)
 		pfree(ptr);
 }
 /*
  * addunicode
  *		Append the Unicode code point c to the literal buffer, encoded
  *		as UTF-8.
  *
  * Zero and values above 0x10FFFF are reported through yyerror().  Code
  * points above 0x7F are also reported through yyerror() unless the database
  * encoding is PG_UTF8, and they set yyextra->saw_non_ascii.
  *
  * NOTE(review): the checks are written as if yyerror() does not return --
  * confirm against scanner_yyerror().
  */
 static void
 addunicode(pg_wchar c, base_yyscan_t yyscanner)
 {
 	char		utf8[8];
 	int			nbytes;

 	if (c == 0 || c > 0x10FFFF)
 		yyerror("invalid Unicode escape value");

 	if (c >= 0x80)
 	{
 		/* non-ASCII code points can only be represented in a UTF8 database */
 		if (GetDatabaseEncoding() != PG_UTF8)
 			yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
 		yyextra->saw_non_ascii = true;
 	}

 	/* encode, then let pg_mblen() report how many bytes were produced */
 	unicode_to_utf8(c, (unsigned char *) utf8);
 	nbytes = pg_mblen(utf8);
 	addlit(utf8, nbytes, yyscanner);
 }
--- a/src/include/parser/gramparse.h
+++ b/src/include/parser/gramparse.h
@ -11,7 +11,7 @@
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
 	int			xcdepth;		/* depth of nesting in slash-star comments */
 	char	   *dolqstart;		/* current $foo$ quote start string */
 	/* first part of UTF16 surrogate pair for Unicode escapes */
 	int32		utf16_first_part;
 	/* state variables for literal-lexing warnings */
 	bool		warn_on_first_escape;
 	bool		saw_non_ascii;