mirror of
https://github.com/postgres/postgres.git
synced 2025-04-25 21:42:33 +03:00
Unicode escapes in E'...' strings
Author: Marko Kreen <markokr@gmail.com>
This commit is contained in:
parent
9048b73184
commit
c2bb0378cf
@ -1,4 +1,4 @@
|
|||||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
|
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.136 2009/09/22 23:52:53 petere Exp $ -->
|
||||||
|
|
||||||
<chapter id="sql-syntax">
|
<chapter id="sql-syntax">
|
||||||
<title>SQL Syntax</title>
|
<title>SQL Syntax</title>
|
||||||
@ -398,6 +398,14 @@ SELECT 'foo' 'bar';
|
|||||||
</entry>
|
</entry>
|
||||||
<entry>hexadecimal byte value</entry>
|
<entry>hexadecimal byte value</entry>
|
||||||
</row>
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry>
|
||||||
|
<literal>\u<replaceable>xxxx</replaceable></literal>,
|
||||||
|
<literal>\U<replaceable>xxxxxxxx</replaceable></literal>
|
||||||
|
(<replaceable>x</replaceable> = 0 - 9, A - F)
|
||||||
|
</entry>
|
||||||
|
<entry>16- or 32-bit hexadecimal Unicode character value</entry>
|
||||||
|
</row>
|
||||||
</tbody>
|
</tbody>
|
||||||
</tgroup>
|
</tgroup>
|
||||||
</table>
|
</table>
|
||||||
@ -411,13 +419,25 @@ SELECT 'foo' 'bar';
|
|||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
It is your responsibility that the byte sequences you create are
|
It is your responsibility that the byte sequences you create,
|
||||||
|
especially when using the octal or hexadecimal escapes, compose
|
||||||
valid characters in the server character set encoding. When the
|
valid characters in the server character set encoding. When the
|
||||||
server encoding is UTF-8, then the alternative Unicode escape
|
server encoding is UTF-8, then the Unicode escapes or the
|
||||||
syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
|
alternative Unicode escape syntax, explained
|
||||||
should be used instead. (The alternative would be doing the
|
in <xref linkend="sql-syntax-strings-uescape">, should be used
|
||||||
UTF-8 encoding by hand and writing out the bytes, which would be
|
instead. (The alternative would be doing the UTF-8 encoding by
|
||||||
very cumbersome.)
|
hand and writing out the bytes, which would be very cumbersome.)
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
The Unicode escape syntax works fully only when the server
|
||||||
|
encoding is UTF-8. When other server encodings are used, only
|
||||||
|
code points in the ASCII range (up to <literal>\u007F</>) can be
|
||||||
|
specified. Both the 4-digit and the 8-digit form can be used to
|
||||||
|
specify UTF-16 surrogate pairs to compose characters with code
|
||||||
|
points larger than <literal>\uFFFF</literal> (although the
|
||||||
|
availability of the 8-digit form technically makes this
|
||||||
|
unnecessary).
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<caution>
|
<caution>
|
||||||
|
@ -24,7 +24,7 @@
|
|||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
|
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
|
|||||||
static char *litbufdup(base_yyscan_t yyscanner);
|
static char *litbufdup(base_yyscan_t yyscanner);
|
||||||
static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
|
static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
|
||||||
static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
|
static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
|
||||||
|
static bool is_utf16_surrogate_first(pg_wchar c);
|
||||||
|
static bool is_utf16_surrogate_second(pg_wchar c);
|
||||||
|
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
|
||||||
|
|
||||||
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
|
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
|
||||||
|
|
||||||
@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
|
|||||||
extern int base_yyget_column(yyscan_t yyscanner);
|
extern int base_yyget_column(yyscan_t yyscanner);
|
||||||
extern void base_yyset_column(int column_no, yyscan_t yyscanner);
|
extern void base_yyset_column(int column_no, yyscan_t yyscanner);
|
||||||
|
|
||||||
|
static void addunicode(pg_wchar c, yyscan_t yyscanner);
|
||||||
|
|
||||||
%}
|
%}
|
||||||
|
|
||||||
%option reentrant
|
%option reentrant
|
||||||
@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
|
|||||||
* <xdolq> $foo$ quoted strings
|
* <xdolq> $foo$ quoted strings
|
||||||
* <xui> quoted identifier with Unicode escapes
|
* <xui> quoted identifier with Unicode escapes
|
||||||
* <xus> quoted string with Unicode escapes
|
* <xus> quoted string with Unicode escapes
|
||||||
|
* <xeu> Unicode surrogate pair in extended quoted string
|
||||||
*/
|
*/
|
||||||
|
|
||||||
%x xb
|
%x xb
|
||||||
@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
|
|||||||
%x xdolq
|
%x xdolq
|
||||||
%x xui
|
%x xui
|
||||||
%x xus
|
%x xus
|
||||||
|
%x xeu
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In order to make the world safe for Windows and Mac clients as well as
|
* In order to make the world safe for Windows and Mac clients as well as
|
||||||
@ -223,6 +230,8 @@ xeinside [^\\']+
|
|||||||
xeescape [\\][^0-7]
|
xeescape [\\][^0-7]
|
||||||
xeoctesc [\\][0-7]{1,3}
|
xeoctesc [\\][0-7]{1,3}
|
||||||
xehexesc [\\]x[0-9A-Fa-f]{1,2}
|
xehexesc [\\]x[0-9A-Fa-f]{1,2}
|
||||||
|
xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
|
||||||
|
xeunicodebad [\\]([uU])
|
||||||
|
|
||||||
/* Extended quote
|
/* Extended quote
|
||||||
* xqdouble implements embedded quote, ''''
|
* xqdouble implements embedded quote, ''''
|
||||||
@ -535,6 +544,45 @@ other .
|
|||||||
<xe>{xeinside} {
|
<xe>{xeinside} {
|
||||||
addlit(yytext, yyleng, yyscanner);
|
addlit(yytext, yyleng, yyscanner);
|
||||||
}
|
}
|
||||||
|
<xe>{xeunicode} {
|
||||||
|
pg_wchar c = strtoul(yytext+2, NULL, 16);
|
||||||
|
|
||||||
|
check_escape_warning(yyscanner);
|
||||||
|
|
||||||
|
if (is_utf16_surrogate_first(c))
|
||||||
|
{
|
||||||
|
yyextra->utf16_first_part = c;
|
||||||
|
BEGIN(xeu);
|
||||||
|
}
|
||||||
|
else if (is_utf16_surrogate_second(c))
|
||||||
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
else
|
||||||
|
addunicode(c, yyscanner);
|
||||||
|
}
|
||||||
|
<xeu>{xeunicode} {
/*
 * xeu state: a leading surrogate was stored in
 * yyextra->utf16_first_part and a second Unicode escape is
 * expected next.  This rule handles that second escape; the
 * catch-all rules after it report an error for any other
 * character, a newline, or end of input.
 */
|
||||||
|
pg_wchar c = strtoul(yytext+2, NULL, 16);
|
||||||
|
|
||||||
|
/* The second escape must be a trailing surrogate. */
if (!is_utf16_surrogate_second(c))
|
||||||
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
|
||||||
|
/* Combine the stored first half with this value into one code point. */
c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
|
||||||
|
|
||||||
|
addunicode(c, yyscanner);
|
||||||
|
|
||||||
|
/* Pair handled: return to the extended-string state. */
BEGIN(xe);
|
||||||
|
}
|
||||||
|
<xeu>. |
|
||||||
|
<xeu>\n |
|
||||||
|
<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
|
||||||
|
|
||||||
|
<xe>{xeunicodebad} {
/*
 * xeunicodebad matches a backslash followed by just 'u' or 'U'.
 * Presumably this rule only fires when the longer xeunicode
 * pattern did not match, i.e. the escape lacks the required 4 or
 * 8 hex digits (relies on flex preferring the longest match --
 * confirm).  The error carries a hint showing the accepted forms.
 */
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
|
||||||
|
errmsg("invalid Unicode escape"),
|
||||||
|
errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
|
||||||
|
lexer_errposition()));
|
||||||
|
}
|
||||||
|
|
||||||
<xe>{xeescape} {
|
<xe>{xeescape} {
|
||||||
if (yytext[1] == '\'')
|
if (yytext[1] == '\'')
|
||||||
{
|
{
|
||||||
@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
|
|||||||
if (ptr)
|
if (ptr)
|
||||||
pfree(ptr);
|
pfree(ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * addunicode
 *		Append the Unicode code point "c" to the literal being scanned.
 *
 * c is rejected via yyerror() if it is zero or greater than 0x10FFFF.
 * For values above 0x7F, yyerror() is also called unless the database
 * encoding is PG_UTF8, and yyextra->saw_non_ascii is set.  The code
 * point is then converted with unicode_to_utf8() and the resulting
 * bytes are appended with addlit().
 *
 * NOTE(review): the code appears to rely on yyerror() not returning;
 * confirm against its definition.
 */
static void
|
||||||
|
addunicode(pg_wchar c, base_yyscan_t yyscanner)
|
||||||
|
{
|
||||||
|
char buf[8];
|
||||||
|
|
||||||
|
/* Reject zero and anything above 0x10FFFF. */
if (c == 0 || c > 0x10FFFF)
|
||||||
|
yyerror("invalid Unicode escape value");
|
||||||
|
/* Values above 0x7F are accepted only when the database encoding is PG_UTF8. */
if (c > 0x7F)
|
||||||
|
{
|
||||||
|
if (GetDatabaseEncoding() != PG_UTF8)
|
||||||
|
yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
|
||||||
|
yyextra->saw_non_ascii = true;
|
||||||
|
}
|
||||||
|
/* Convert into buf, then append as many bytes as pg_mblen() reports. */
unicode_to_utf8(c, (unsigned char *)buf);
|
||||||
|
addlit(buf, pg_mblen(buf), yyscanner);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $
|
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
|
|||||||
int xcdepth; /* depth of nesting in slash-star comments */
|
int xcdepth; /* depth of nesting in slash-star comments */
|
||||||
char *dolqstart; /* current $foo$ quote start string */
|
char *dolqstart; /* current $foo$ quote start string */
|
||||||
|
|
||||||
|
/* first part of UTF16 surrogate pair for Unicode escapes */
|
||||||
|
int32 utf16_first_part;
|
||||||
|
|
||||||
/* state variables for literal-lexing warnings */
|
/* state variables for literal-lexing warnings */
|
||||||
bool warn_on_first_escape;
|
bool warn_on_first_escape;
|
||||||
bool saw_non_ascii;
|
bool saw_non_ascii;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user