diff --git a/src/common/wchar.c b/src/common/wchar.c index 76b7dfdfcb6..40588beb48d 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -16,6 +16,25 @@ #include "utils/ascii.h" +/* + * In today's multibyte encodings other than UTF8, this two-byte sequence + * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0. + * + * For historical reasons, several verifychar implementations opt to reject + * this pair specifically. Byte pair range constraints, in encoding + * originator documentation, always excluded this pair. No core conversion + * could translate it. However, longstanding verifychar implementations + * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate + * pairs not valid per encoding originator documentation. To avoid tightening + * core or non-core conversions in a security patch, we sought this one pair. + * + * PQescapeString() historically used spaces for BYTE1; many other values + * could suffice for BYTE1. + */ +#define NONUTF8_INVALID_BYTE0 (0x8d) +#define NONUTF8_INVALID_BYTE1 (' ') + + /* * Operations on multi-byte encodings are driven by a table of helper * functions. @@ -1465,6 +1484,11 @@ pg_big5_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -1514,6 +1538,11 @@ pg_gbk_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -1563,6 +1592,11 @@ pg_uhc_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -2007,6 +2041,19 @@ pg_utf8_islegal(const unsigned char *source, int length) } +/* + * Fills the provided buffer with two bytes such that: + * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0 + */ +void +pg_encoding_set_invalid(int encoding, char *dst) +{ + Assert(pg_encoding_max_length(encoding) > 1); + + dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0); + dst[1] = NONUTF8_INVALID_BYTE1; +} + /* *------------------------------------------------------------------- * encoding info table @@ -2128,5 +2175,11 @@ pg_encoding_max_length(int encoding) { Assert(PG_VALID_ENCODING(encoding)); - return pg_wchar_table[encoding].maxmblen; + /* + * Check for the encoding despite the assert, due to some mingw versions + * otherwise issuing bogus warnings. + */ + return PG_VALID_ENCODING(encoding) ? + pg_wchar_table[encoding].maxmblen : + pg_wchar_table[PG_SQL_ASCII].maxmblen; } diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 249cd18a357..08f6fa6e085 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -662,6 +662,7 @@ extern int pg_valid_server_encoding_id(int encoding); * (in addition to the ones just above). The constant tables declared * earlier in this file are also available from libpgcommon. */ +extern void pg_encoding_set_invalid(int encoding, char *dst); extern int pg_encoding_mblen(int encoding, const char *mbstr); extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr); extern int pg_encoding_dsplen(int encoding, const char *mbstr); diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index 442e7aff2b2..d785f92561e 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -5,6 +5,13 @@ \getenv libdir PG_LIBDIR \getenv dlsuffix PG_DLSUFFIX \set regresslib :libdir '/regress' :dlsuffix +CREATE FUNCTION test_enc_setup() RETURNS void + AS :'regresslib', 'test_enc_setup' + LANGUAGE C STRICT; +SELECT FROM test_enc_setup(); +-- +(1 row) + CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index bd6fc206734..423add66502 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -1105,6 +1105,56 @@ test_opclass_options_func(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +/* one-time tests for encoding infrastructure */ +PG_FUNCTION_INFO_V1(test_enc_setup); +Datum +test_enc_setup(PG_FUNCTION_ARGS) +{ + /* Test pg_encoding_set_invalid() */ + for (int i = 0; i < _PG_LAST_ENCODING_; i++) + { + char buf[2], + bigbuf[16]; + int len, + mblen, + valid; + + if (pg_encoding_max_length(i) == 1) + continue; + pg_encoding_set_invalid(i, buf); + len = strnlen(buf, 2); + if (len != 2) + elog(WARNING, + "official invalid string for encoding \"%s\" has length %d", + pg_enc2name_tbl[i].name, len); + mblen = pg_encoding_mblen(i, buf); + if (mblen != 2) + elog(WARNING, + "official invalid string for encoding \"%s\" has mblen %d", + pg_enc2name_tbl[i].name, mblen); + valid = pg_encoding_verifymbstr(i, buf, len); + if (valid != 0) + elog(WARNING, + "official invalid string for encoding \"%s\" has valid prefix of length %d", + pg_enc2name_tbl[i].name, valid); + valid = pg_encoding_verifymbstr(i, buf, 1); + if (valid != 0) + elog(WARNING, + "first byte of official invalid string for encoding \"%s\" has valid prefix of length %d", + pg_enc2name_tbl[i].name, valid); + memset(bigbuf, ' ', sizeof(bigbuf)); + bigbuf[0] = buf[0]; + bigbuf[1] = buf[1]; + valid = pg_encoding_verifymbstr(i, bigbuf, sizeof(bigbuf)); + if (valid != 0) + elog(WARNING, + "trailing data changed official invalid string for encoding \"%s\" to have valid prefix of length %d", + pg_enc2name_tbl[i].name, valid); + } + + PG_RETURN_VOID(); +} + /* * Call an encoding conversion or verification function. * diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index 9a65fca91fb..b567a1a5721 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -8,6 +8,11 @@ \set regresslib :libdir '/regress' :dlsuffix +CREATE FUNCTION test_enc_setup() RETURNS void + AS :'regresslib', 'test_enc_setup' + LANGUAGE C STRICT; +SELECT FROM test_enc_setup(); + CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT;