mirror of
https://github.com/postgres/postgres.git
synced 2025-11-24 00:23:06 +03:00
Rationalize code placement between wchar.c, encnames.c, and mbutils.c.
Move all the backend-only code that'd crept into wchar.c and encnames.c into mbutils.c.

To remove the last few #ifdef dependencies from wchar.c and encnames.c, also make the following changes:

* Adjust get_encoding_name_for_icu to return NULL, not throw an error, for unsupported encodings. Its sole caller can perfectly well throw an error instead. (While at it, I also made this function and its sibling is_encoding_supported_by_icu proof against out-of-range encoding IDs.)

* Remove the overlength-name error condition from pg_char_to_encoding. It's completely silly not to treat that just like any other the-name-is-not-in-the-table case.

Also, get rid of pg_mic_mblen --- there's no obvious reason why conv.c shouldn't call pg_mule_mblen instead.

Other than that, this is just code movement and comment-polishing with no functional changes. Notably, I reordered declarations in pg_wchar.h to show which functions are frontend-accessible and which are not.

Discussion: https://postgr.es/m/CA+TgmoYO8oq-iy8E02rD8eX25T-9SmyxKWqqks5OMHxKvGXpXQ@mail.gmail.com
This commit is contained in:
@@ -1066,6 +1066,23 @@ pg_client_encoding(PG_FUNCTION_ARGS)
|
||||
return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
|
||||
}
|
||||
|
||||
Datum
|
||||
PG_char_to_encoding(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Name s = PG_GETARG_NAME(0);
|
||||
|
||||
PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
|
||||
}
|
||||
|
||||
Datum
|
||||
PG_encoding_to_char(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 encoding = PG_GETARG_INT32(0);
|
||||
const char *encoding_name = pg_encoding_to_char(encoding);
|
||||
|
||||
return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
|
||||
}
|
||||
|
||||
/*
|
||||
* gettext() returns messages in this encoding. This often matches the
|
||||
* database encoding, but it differs for SQL_ASCII databases, for processes
|
||||
@@ -1078,6 +1095,438 @@ GetMessageEncoding(void)
|
||||
return MessageEncoding->encoding;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Generic character incrementer function.
|
||||
*
|
||||
* Not knowing anything about the properties of the encoding in use, we just
|
||||
* keep incrementing the last byte until we get a validly-encoded result,
|
||||
* or we run out of values to try. We don't bother to try incrementing
|
||||
* higher-order bytes, so there's no growth in runtime for wider characters.
|
||||
* (If we did try to do that, we'd need to consider the likelihood that 255
|
||||
* is not a valid final byte in the encoding.)
|
||||
*/
|
||||
static bool
|
||||
pg_generic_charinc(unsigned char *charptr, int len)
|
||||
{
|
||||
unsigned char *lastbyte = charptr + len - 1;
|
||||
mbverifier mbverify;
|
||||
|
||||
/* We can just invoke the character verifier directly. */
|
||||
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
|
||||
|
||||
while (*lastbyte < (unsigned char) 255)
|
||||
{
|
||||
(*lastbyte)++;
|
||||
if ((*mbverify) (charptr, len) == len)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * charptr is modified in place; length is the character's byte length.
 * Returns true if some byte was incremented, false if no increment was
 * possible (the character is then left untouched).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char a;
	unsigned char limit;

	switch (length)
	{
		default:
			/* reject lengths 5 and 6 for now */
			return false;
		case 4:
			a = charptr[3];
			if (a < 0xBF)
			{
				charptr[3]++;
				break;
			}
			/* FALL THRU */
		case 3:
			a = charptr[2];
			if (a < 0xBF)
			{
				charptr[2]++;
				break;
			}
			/* FALL THRU */
		case 2:
			a = charptr[1];

			/*
			 * The ceiling for the second byte depends on the lead byte:
			 * after 0xED anything above 0x9F would land in the surrogate
			 * range, and after 0xF4 anything above 0x8F would exceed the
			 * highest code point.
			 */
			switch (*charptr)
			{
				case 0xED:
					limit = 0x9F;
					break;
				case 0xF4:
					limit = 0x8F;
					break;
				default:
					limit = 0xBF;
					break;
			}
			if (a < limit)
			{
				charptr[1]++;
				break;
			}
			/* FALL THRU */
		case 1:
			a = *charptr;

			/*
			 * Refuse to step the lead byte past the top of its length
			 * class (0x7F, 0xDF, 0xEF) or past 0xF4, since that would
			 * change the character's length or make it invalid.
			 */
			if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
				return false;
			charptr[0]++;
			break;
	}

	return true;
}
|
||||
|
||||
/*
|
||||
* EUC-JP character incrementer function.
|
||||
*
|
||||
* If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
|
||||
* representing JIS X 0201 characters with the second byte ranging between
|
||||
* 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
|
||||
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
|
||||
*
|
||||
* If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
|
||||
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
|
||||
* is incremented if possible, otherwise the second-to-last byte.
|
||||
*
|
||||
* If the sequence starts with a value other than the above and its MSB
|
||||
* is set, it must be a two-byte sequence representing JIS X 0208 characters
|
||||
* with both bytes ranging between 0xa1 and 0xfe. The last byte is
|
||||
* incremented if possible, otherwise the second-to-last byte.
|
||||
*
|
||||
* Otherwise, the sequence is a single-byte ASCII character. It is
|
||||
* incremented up to 0x7f.
|
||||
*/
|
||||
static bool
|
||||
pg_eucjp_increment(unsigned char *charptr, int length)
|
||||
{
|
||||
unsigned char c1,
|
||||
c2;
|
||||
int i;
|
||||
|
||||
c1 = *charptr;
|
||||
|
||||
switch (c1)
|
||||
{
|
||||
case SS2: /* JIS X 0201 */
|
||||
if (length != 2)
|
||||
return false;
|
||||
|
||||
c2 = charptr[1];
|
||||
|
||||
if (c2 >= 0xdf)
|
||||
charptr[0] = charptr[1] = 0xa1;
|
||||
else if (c2 < 0xa1)
|
||||
charptr[1] = 0xa1;
|
||||
else
|
||||
charptr[1]++;
|
||||
break;
|
||||
|
||||
case SS3: /* JIS X 0212 */
|
||||
if (length != 3)
|
||||
return false;
|
||||
|
||||
for (i = 2; i > 0; i--)
|
||||
{
|
||||
c2 = charptr[i];
|
||||
if (c2 < 0xa1)
|
||||
{
|
||||
charptr[i] = 0xa1;
|
||||
return true;
|
||||
}
|
||||
else if (c2 < 0xfe)
|
||||
{
|
||||
charptr[i]++;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Out of 3-byte code region */
|
||||
return false;
|
||||
|
||||
default:
|
||||
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
||||
{
|
||||
if (length != 2)
|
||||
return false;
|
||||
|
||||
for (i = 1; i >= 0; i--)
|
||||
{
|
||||
c2 = charptr[i];
|
||||
if (c2 < 0xa1)
|
||||
{
|
||||
charptr[i] = 0xa1;
|
||||
return true;
|
||||
}
|
||||
else if (c2 < 0xfe)
|
||||
{
|
||||
charptr[i]++;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Out of 2 byte code region */
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{ /* ASCII, single byte */
|
||||
if (c1 > 0x7e)
|
||||
return false;
|
||||
(*charptr)++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the character incrementer for the encoding for the current database
|
||||
*/
|
||||
mbcharacter_incrementer
|
||||
pg_database_encoding_character_incrementer(void)
|
||||
{
|
||||
/*
|
||||
* Eventually it might be best to add a field to pg_wchar_table[], but for
|
||||
* now we just use a switch.
|
||||
*/
|
||||
switch (GetDatabaseEncoding())
|
||||
{
|
||||
case PG_UTF8:
|
||||
return pg_utf8_increment;
|
||||
|
||||
case PG_EUC_JP:
|
||||
return pg_eucjp_increment;
|
||||
|
||||
default:
|
||||
return pg_generic_charinc;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* fetch maximum length of the encoding for the current database
|
||||
*/
|
||||
int
|
||||
pg_database_encoding_max_length(void)
|
||||
{
|
||||
return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
|
||||
}
|
||||
|
||||
/*
 * Verify mbstr to make sure that it is validly encoded in the current
 * database encoding.  Otherwise same as pg_verify_mbstr().
 *
 * Returns true when pg_verify_mbstr_len() reports a non-negative length,
 * false when it reports failure (which it only does if noError is true).
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	return
		pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
}
|
||||
|
||||
/*
 * Verify mbstr to make sure that it is validly encoded in the specified
 * encoding.
 *
 * Thin boolean wrapper over pg_verify_mbstr_len(): returns true when that
 * function reports a non-negative length, false otherwise.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
}
|
||||
|
||||
/*
|
||||
* Verify mbstr to make sure that it is validly encoded in the specified
|
||||
* encoding.
|
||||
*
|
||||
* mbstr is not necessarily zero terminated; length of mbstr is
|
||||
* specified by len.
|
||||
*
|
||||
* If OK, return length of string in the encoding.
|
||||
* If a problem is found, return -1 when noError is
|
||||
* true; when noError is false, ereport() a descriptive message.
|
||||
*/
|
||||
int
|
||||
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
|
||||
{
|
||||
mbverifier mbverify;
|
||||
int mb_len;
|
||||
|
||||
Assert(PG_VALID_ENCODING(encoding));
|
||||
|
||||
/*
|
||||
* In single-byte encodings, we need only reject nulls (\0).
|
||||
*/
|
||||
if (pg_encoding_max_length(encoding) <= 1)
|
||||
{
|
||||
const char *nullpos = memchr(mbstr, 0, len);
|
||||
|
||||
if (nullpos == NULL)
|
||||
return len;
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, nullpos, 1);
|
||||
}
|
||||
|
||||
/* fetch function pointer just once */
|
||||
mbverify = pg_wchar_table[encoding].mbverify;
|
||||
|
||||
mb_len = 0;
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
int l;
|
||||
|
||||
/* fast path for ASCII-subset characters */
|
||||
if (!IS_HIGHBIT_SET(*mbstr))
|
||||
{
|
||||
if (*mbstr != '\0')
|
||||
{
|
||||
mb_len++;
|
||||
mbstr++;
|
||||
len--;
|
||||
continue;
|
||||
}
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, mbstr, len);
|
||||
}
|
||||
|
||||
l = (*mbverify) ((const unsigned char *) mbstr, len);
|
||||
|
||||
if (l < 0)
|
||||
{
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, mbstr, len);
|
||||
}
|
||||
|
||||
mbstr += l;
|
||||
len -= l;
|
||||
mb_len++;
|
||||
}
|
||||
return mb_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* check_encoding_conversion_args: check arguments of a conversion function
|
||||
*
|
||||
* "expected" arguments can be either an encoding ID or -1 to indicate that
|
||||
* the caller will check whether it accepts the ID.
|
||||
*
|
||||
* Note: the errors here are not really user-facing, so elog instead of
|
||||
* ereport seems sufficient. Also, we trust that the "expected" encoding
|
||||
* arguments are valid encoding IDs, but we don't trust the actuals.
|
||||
*/
|
||||
void
|
||||
check_encoding_conversion_args(int src_encoding,
|
||||
int dest_encoding,
|
||||
int len,
|
||||
int expected_src_encoding,
|
||||
int expected_dest_encoding)
|
||||
{
|
||||
if (!PG_VALID_ENCODING(src_encoding))
|
||||
elog(ERROR, "invalid source encoding ID: %d", src_encoding);
|
||||
if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
|
||||
elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
|
||||
pg_enc2name_tbl[expected_src_encoding].name,
|
||||
pg_enc2name_tbl[src_encoding].name);
|
||||
if (!PG_VALID_ENCODING(dest_encoding))
|
||||
elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
|
||||
if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
|
||||
elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
|
||||
pg_enc2name_tbl[expected_dest_encoding].name,
|
||||
pg_enc2name_tbl[dest_encoding].name);
|
||||
if (len < 0)
|
||||
elog(ERROR, "encoding conversion length must not be negative");
|
||||
}
|
||||
|
||||
/*
|
||||
* report_invalid_encoding: complain about invalid multibyte character
|
||||
*
|
||||
* note: len is remaining length of string, not length of character;
|
||||
* len must be greater than zero, as we always examine the first byte.
|
||||
*/
|
||||
void
|
||||
report_invalid_encoding(int encoding, const char *mbstr, int len)
|
||||
{
|
||||
int l = pg_encoding_mblen(encoding, mbstr);
|
||||
char buf[8 * 5 + 1];
|
||||
char *p = buf;
|
||||
int j,
|
||||
jlimit;
|
||||
|
||||
jlimit = Min(l, len);
|
||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||
|
||||
for (j = 0; j < jlimit; j++)
|
||||
{
|
||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||
if (j < jlimit - 1)
|
||||
p += sprintf(p, " ");
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid byte sequence for encoding \"%s\": %s",
|
||||
pg_enc2name_tbl[encoding].name,
|
||||
buf)));
|
||||
}
|
||||
|
||||
/*
|
||||
* report_untranslatable_char: complain about untranslatable character
|
||||
*
|
||||
* note: len is remaining length of string, not length of character;
|
||||
* len must be greater than zero, as we always examine the first byte.
|
||||
*/
|
||||
void
|
||||
report_untranslatable_char(int src_encoding, int dest_encoding,
|
||||
const char *mbstr, int len)
|
||||
{
|
||||
int l = pg_encoding_mblen(src_encoding, mbstr);
|
||||
char buf[8 * 5 + 1];
|
||||
char *p = buf;
|
||||
int j,
|
||||
jlimit;
|
||||
|
||||
jlimit = Min(l, len);
|
||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||
|
||||
for (j = 0; j < jlimit; j++)
|
||||
{
|
||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||
if (j < jlimit - 1)
|
||||
p += sprintf(p, " ");
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
|
||||
errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
|
||||
buf,
|
||||
pg_enc2name_tbl[src_encoding].name,
|
||||
pg_enc2name_tbl[dest_encoding].name)));
|
||||
}
|
||||
|
||||
|
||||
#ifdef WIN32
|
||||
/*
|
||||
* Convert from MessageEncoding to a palloc'ed, null-terminated utf16
|
||||
@@ -1149,4 +1598,4 @@ pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
|
||||
return utf16;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif /* WIN32 */
|
||||
|
||||
Reference in New Issue
Block a user