1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-23 14:01:44 +03:00

Change type "char"'s I/O format for non-ASCII characters.

Previously, a byte with the high bit set was just transmitted
as-is by charin() and charout().  This is problematic if the
database encoding is multibyte, because the result of charout()
won't be validly encoded, which breaks various code that
expects all text strings to be validly encoded.  We've
previously decided to enforce encoding validity rather than try
to individually harden each place that might have a problem with
such strings, so it's time to do something about "char".

To fix, represent high-bit-set characters as \ooo (backslash
and three octal digits), following the ancient "escape" format
for bytea.  charin() will continue to accept a raw high-bit-set byte
as well, though that is only reachable in single-byte encodings.

Add some test cases just so there is coverage for this code.
We'll otherwise leave this question undocumented as it was before,
because we don't really want to encourage end-user use of "char".

For the moment, back-patch into v15 so that this change appears
in 15beta3.  If there's no great pushback, we should consider
absorbing this change into the older branches.

Discussion: https://postgr.es/m/2318797.1638558730@sss.pgh.pa.us
This commit is contained in:
Tom Lane
2022-08-02 10:29:35 -04:00
parent 5b94d3ccb7
commit c034b629cc
6 changed files with 263 additions and 28 deletions

View File

@ -1338,9 +1338,10 @@ SELECT b, char_length(b) FROM test2;
<para> <para>
There are two other fixed-length character types in There are two other fixed-length character types in
<productname>PostgreSQL</productname>, shown in <xref <productname>PostgreSQL</productname>, shown in <xref
linkend="datatype-character-special-table"/>. The <type>name</type> linkend="datatype-character-special-table"/>.
type exists <emphasis>only</emphasis> for the storage of identifiers These are not intended for general-purpose use, only for use
in the internal system catalogs and is not intended for use by the general user. Its in the internal system catalogs.
The <type>name</type> type is used to store identifiers. Its
length is currently defined as 64 bytes (63 usable characters plus length is currently defined as 64 bytes (63 usable characters plus
terminator) but should be referenced using the constant terminator) but should be referenced using the constant
<symbol>NAMEDATALEN</symbol> in <literal>C</literal> source code. <symbol>NAMEDATALEN</symbol> in <literal>C</literal> source code.
@ -1348,7 +1349,8 @@ SELECT b, char_length(b) FROM test2;
is therefore adjustable for special uses); the default maximum is therefore adjustable for special uses); the default maximum
length might change in a future release. The type <type>"char"</type> length might change in a future release. The type <type>"char"</type>
(note the quotes) is different from <type>char(1)</type> in that it (note the quotes) is different from <type>char(1)</type> in that it
only uses one byte of storage. It is internally used in the system only uses one byte of storage, and therefore can store only a single
ASCII character. It is used in the system
catalogs as a simplistic enumeration type. catalogs as a simplistic enumeration type.
</para> </para>

View File

@ -20,6 +20,11 @@
#include "libpq/pqformat.h" #include "libpq/pqformat.h"
#include "utils/builtins.h" #include "utils/builtins.h"
#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
#define TOOCTAL(c) ((c) + '0')
#define FROMOCTAL(c) ((unsigned char) (c) - '0')
/***************************************************************************** /*****************************************************************************
* USER I/O ROUTINES * * USER I/O ROUTINES *
*****************************************************************************/ *****************************************************************************/
@ -27,31 +32,53 @@
/* /*
* charin - converts "x" to 'x' * charin - converts "x" to 'x'
* *
* Note that an empty input string will implicitly be converted to \0. * This accepts the formats charout produces. If we have multibyte input
* that is not in the form '\ooo', then we take its first byte as the value
* and silently discard the rest; this is a backwards-compatibility provision.
*/ */
Datum Datum
charin(PG_FUNCTION_ARGS) charin(PG_FUNCTION_ARGS)
{ {
char *ch = PG_GETARG_CSTRING(0); char *ch = PG_GETARG_CSTRING(0);
if (strlen(ch) == 4 && ch[0] == '\\' &&
ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) +
(FROMOCTAL(ch[2]) << 3) +
FROMOCTAL(ch[3]));
/* This will do the right thing for a zero-length input string */
PG_RETURN_CHAR(ch[0]); PG_RETURN_CHAR(ch[0]);
} }
/* /*
* charout - converts 'x' to "x" * charout - converts 'x' to "x"
* *
* Note that if the char value is \0, the resulting string will appear * The possible output formats are:
* to be empty (null-terminated after zero characters). So this is the * 1. 0x00 is represented as an empty string.
* inverse of the charin() function for such data. * 2. 0x01..0x7F are represented as a single ASCII byte.
* 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits).
* Case 3 is meant to match the traditional "escape" format of bytea.
*/ */
Datum Datum
charout(PG_FUNCTION_ARGS) charout(PG_FUNCTION_ARGS)
{ {
char ch = PG_GETARG_CHAR(0); char ch = PG_GETARG_CHAR(0);
char *result = (char *) palloc(2); char *result = (char *) palloc(5);
if (IS_HIGHBIT_SET(ch))
{
result[0] = '\\';
result[1] = TOOCTAL(((unsigned char) ch) >> 6);
result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07);
result[3] = TOOCTAL(((unsigned char) ch) & 07);
result[4] = '\0';
}
else
{
/* This produces acceptable results for 0x00 as well */
result[0] = ch; result[0] = ch;
result[1] = '\0'; result[1] = '\0';
}
PG_RETURN_CSTRING(result); PG_RETURN_CSTRING(result);
} }
@ -176,15 +203,20 @@ Datum
text_char(PG_FUNCTION_ARGS) text_char(PG_FUNCTION_ARGS)
{ {
text *arg1 = PG_GETARG_TEXT_PP(0); text *arg1 = PG_GETARG_TEXT_PP(0);
char *ch = VARDATA_ANY(arg1);
char result; char result;
/* /*
* An empty input string is converted to \0 (for consistency with charin). * Conversion rules are the same as in charin(), but here we need to
* If the input is longer than one character, the excess data is silently * handle the empty-string case honestly.
* discarded.
*/ */
if (VARSIZE_ANY_EXHDR(arg1) > 0) if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' &&
result = *(VARDATA_ANY(arg1)); ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
result = (FROMOCTAL(ch[1]) << 6) +
(FROMOCTAL(ch[2]) << 3) +
FROMOCTAL(ch[3]);
else if (VARSIZE_ANY_EXHDR(arg1) > 0)
result = ch[0];
else else
result = '\0'; result = '\0';
@ -195,13 +227,21 @@ Datum
char_text(PG_FUNCTION_ARGS) char_text(PG_FUNCTION_ARGS)
{ {
char arg1 = PG_GETARG_CHAR(0); char arg1 = PG_GETARG_CHAR(0);
text *result = palloc(VARHDRSZ + 1); text *result = palloc(VARHDRSZ + 4);
/* /*
* Convert \0 to an empty string, for consistency with charout (and * Conversion rules are the same as in charout(), but here we need to be
* because the text stuff doesn't like embedded nulls all that well). * honest about converting 0x00 to an empty string.
*/ */
if (arg1 != '\0') if (IS_HIGHBIT_SET(arg1))
{
SET_VARSIZE(result, VARHDRSZ + 4);
(VARDATA(result))[0] = '\\';
(VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6);
(VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07);
(VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07);
}
else if (arg1 != '\0')
{ {
SET_VARSIZE(result, VARHDRSZ + 1); SET_VARSIZE(result, VARHDRSZ + 1);
*(VARDATA(result)) = arg1; *(VARDATA(result)) = arg1;

View File

@ -1,8 +1,8 @@
-- --
-- CHAR -- CHAR
-- --
-- fixed-length by value -- Per SQL standard, CHAR means character(1), that is a varlena type
-- internally passed by value if <= 4 bytes in storage -- with a constraint restricting it to one character (not byte)
SELECT char 'c' = char 'c' AS true; SELECT char 'c' = char 'c' AS true;
true true
------ ------
@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
abcd abcd
(4 rows) (4 rows)
--
-- Also test "char", which is an ad-hoc one-byte type. It can only
-- really store ASCII characters, but we allow high-bit-set characters
-- to be accessed via bytea-like escapes.
--
SELECT 'a'::"char";
char
------
a
(1 row)
SELECT '\101'::"char";
char
------
A
(1 row)
SELECT '\377'::"char";
char
------
\377
(1 row)
SELECT 'a'::"char"::text;
text
------
a
(1 row)
SELECT '\377'::"char"::text;
text
------
\377
(1 row)
SELECT '\000'::"char"::text;
text
------
(1 row)
SELECT 'a'::text::"char";
char
------
a
(1 row)
SELECT '\377'::text::"char";
char
------
\377
(1 row)
SELECT ''::text::"char";
char
------
(1 row)

View File

@ -1,8 +1,8 @@
-- --
-- CHAR -- CHAR
-- --
-- fixed-length by value -- Per SQL standard, CHAR means character(1), that is a varlena type
-- internally passed by value if <= 4 bytes in storage -- with a constraint restricting it to one character (not byte)
SELECT char 'c' = char 'c' AS true; SELECT char 'c' = char 'c' AS true;
true true
------ ------
@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
abcd abcd
(4 rows) (4 rows)
--
-- Also test "char", which is an ad-hoc one-byte type. It can only
-- really store ASCII characters, but we allow high-bit-set characters
-- to be accessed via bytea-like escapes.
--
SELECT 'a'::"char";
char
------
a
(1 row)
SELECT '\101'::"char";
char
------
A
(1 row)
SELECT '\377'::"char";
char
------
\377
(1 row)
SELECT 'a'::"char"::text;
text
------
a
(1 row)
SELECT '\377'::"char"::text;
text
------
\377
(1 row)
SELECT '\000'::"char"::text;
text
------
(1 row)
SELECT 'a'::text::"char";
char
------
a
(1 row)
SELECT '\377'::text::"char";
char
------
\377
(1 row)
SELECT ''::text::"char";
char
------
(1 row)

View File

@ -1,8 +1,8 @@
-- --
-- CHAR -- CHAR
-- --
-- fixed-length by value -- Per SQL standard, CHAR means character(1), that is a varlena type
-- internally passed by value if <= 4 bytes in storage -- with a constraint restricting it to one character (not byte)
SELECT char 'c' = char 'c' AS true; SELECT char 'c' = char 'c' AS true;
true true
------ ------
@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
abcd abcd
(4 rows) (4 rows)
--
-- Also test "char", which is an ad-hoc one-byte type. It can only
-- really store ASCII characters, but we allow high-bit-set characters
-- to be accessed via bytea-like escapes.
--
SELECT 'a'::"char";
char
------
a
(1 row)
SELECT '\101'::"char";
char
------
A
(1 row)
SELECT '\377'::"char";
char
------
\377
(1 row)
SELECT 'a'::"char"::text;
text
------
a
(1 row)
SELECT '\377'::"char"::text;
text
------
\377
(1 row)
SELECT '\000'::"char"::text;
text
------
(1 row)
SELECT 'a'::text::"char";
char
------
a
(1 row)
SELECT '\377'::text::"char";
char
------
\377
(1 row)
SELECT ''::text::"char";
char
------
(1 row)

View File

@ -2,8 +2,8 @@
-- CHAR -- CHAR
-- --
-- fixed-length by value -- Per SQL standard, CHAR means character(1), that is a varlena type
-- internally passed by value if <= 4 bytes in storage -- with a constraint restricting it to one character (not byte)
SELECT char 'c' = char 'c' AS true; SELECT char 'c' = char 'c' AS true;
@ -71,3 +71,19 @@ DROP TABLE CHAR_TBL;
INSERT INTO CHAR_TBL (f1) VALUES ('abcde'); INSERT INTO CHAR_TBL (f1) VALUES ('abcde');
SELECT * FROM CHAR_TBL; SELECT * FROM CHAR_TBL;
--
-- Also test "char", which is an ad-hoc one-byte type. It can only
-- really store ASCII characters, but we allow high-bit-set characters
-- to be accessed via bytea-like escapes.
--
SELECT 'a'::"char";
SELECT '\101'::"char";
SELECT '\377'::"char";
SELECT 'a'::"char"::text;
SELECT '\377'::"char"::text;
SELECT '\000'::"char"::text;
SELECT 'a'::text::"char";
SELECT '\377'::text::"char";
SELECT ''::text::"char";