mirror of
https://github.com/postgres/postgres.git
synced 2025-07-08 11:42:09 +03:00
Fix char2wchar/wchar2char to support collations properly.
These functions should take a pg_locale_t, not a collation OID, and should call mbstowcs_l/wcstombs_l where available. Where those functions are not available, temporarily select the correct locale with uselocale(). This change removes the bogus assumption that all locales selectable in a given database have the same wide-character conversion method; in particular, the collate.linux.utf8 regression test now passes with LC_CTYPE=C, so long as the database encoding is UTF8. I decided to move the char2wchar/wchar2char functions out of mbutils.c and into pg_locale.c, because they work on wchar_t not pg_wchar_t and thus don't really belong with the mbutils.c functions. Keeping them where they were would have required importing pg_locale_t into pg_wchar.h somehow, which did not seem like a good plan.
This commit is contained in:
@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid)
|
||||
|
||||
return cache_entry->locale;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
|
||||
* Therefore we keep them here rather than with the mbutils code.
|
||||
*/
|
||||
|
||||
#ifdef USE_WIDE_UPPER_LOWER
|
||||
|
||||
/*
|
||||
* wchar2char --- convert wide characters to multibyte format
|
||||
*
|
||||
* This has the same API as the standard wcstombs_l() function; in particular,
|
||||
* tolen is the maximum number of bytes to store at *to, and *from must be
|
||||
* zero-terminated. The output will be zero-terminated iff there is room.
|
||||
*/
|
||||
size_t
|
||||
wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
|
||||
{
|
||||
size_t result;
|
||||
|
||||
if (tolen == 0)
|
||||
return 0;
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
/*
|
||||
* On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
|
||||
* for some reason mbstowcs and wcstombs won't do this for us, so we use
|
||||
* MultiByteToWideChar().
|
||||
*/
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
|
||||
NULL, NULL);
|
||||
/* A zero return is failure */
|
||||
if (result <= 0)
|
||||
result = -1;
|
||||
else
|
||||
{
|
||||
Assert(result <= tolen);
|
||||
/* Microsoft counts the zero terminator in the result */
|
||||
result--;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif /* WIN32 */
|
||||
if (locale == (pg_locale_t) 0)
|
||||
{
|
||||
/* Use wcstombs directly for the default locale */
|
||||
result = wcstombs(to, from, tolen);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef HAVE_LOCALE_T
|
||||
#ifdef HAVE_WCSTOMBS_L
|
||||
/* Use wcstombs_l for nondefault locales */
|
||||
result = wcstombs_l(to, from, tolen, locale);
|
||||
#else /* !HAVE_WCSTOMBS_L */
|
||||
/* We have to temporarily set the locale as current ... ugh */
|
||||
locale_t save_locale = uselocale(locale);
|
||||
|
||||
result = wcstombs(to, from, tolen);
|
||||
|
||||
uselocale(save_locale);
|
||||
#endif /* HAVE_WCSTOMBS_L */
|
||||
#else /* !HAVE_LOCALE_T */
|
||||
/* Can't have locale != 0 without HAVE_LOCALE_T */
|
||||
elog(ERROR, "wcstombs_l is not available");
|
||||
result = 0; /* keep compiler quiet */
|
||||
#endif /* HAVE_LOCALE_T */
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* char2wchar --- convert multibyte characters to wide characters
|
||||
*
|
||||
* This has almost the API of mbstowcs_l(), except that *from need not be
|
||||
* null-terminated; instead, the number of input bytes is specified as
|
||||
* fromlen. Also, we ereport() rather than returning -1 for invalid
|
||||
* input encoding. tolen is the maximum number of wchar_t's to store at *to.
|
||||
* The output will be zero-terminated iff there is room.
|
||||
*/
|
||||
size_t
|
||||
char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
size_t result;
|
||||
|
||||
if (tolen == 0)
|
||||
return 0;
|
||||
|
||||
#ifdef WIN32
|
||||
/* See WIN32 "Unicode" comment above */
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
/* Win32 API does not work for zero-length input */
|
||||
if (fromlen == 0)
|
||||
result = 0;
|
||||
else
|
||||
{
|
||||
result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
|
||||
/* A zero return is failure */
|
||||
if (result == 0)
|
||||
result = -1;
|
||||
}
|
||||
|
||||
if (result != -1)
|
||||
{
|
||||
Assert(result < tolen);
|
||||
/* Append trailing null wchar (MultiByteToWideChar() does not) */
|
||||
to[result] = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif /* WIN32 */
|
||||
{
|
||||
/* mbstowcs requires ending '\0' */
|
||||
char *str = pnstrdup(from, fromlen);
|
||||
|
||||
if (locale == (pg_locale_t) 0)
|
||||
{
|
||||
/* Use mbstowcs directly for the default locale */
|
||||
result = mbstowcs(to, str, tolen);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef HAVE_LOCALE_T
|
||||
#ifdef HAVE_WCSTOMBS_L
|
||||
/* Use mbstowcs_l for nondefault locales */
|
||||
result = mbstowcs_l(to, str, tolen, locale);
|
||||
#else /* !HAVE_WCSTOMBS_L */
|
||||
/* We have to temporarily set the locale as current ... ugh */
|
||||
locale_t save_locale = uselocale(locale);
|
||||
|
||||
result = mbstowcs(to, str, tolen);
|
||||
|
||||
uselocale(save_locale);
|
||||
#endif /* HAVE_WCSTOMBS_L */
|
||||
#else /* !HAVE_LOCALE_T */
|
||||
/* Can't have locale != 0 without HAVE_LOCALE_T */
|
||||
elog(ERROR, "mbstowcs_l is not available");
|
||||
result = 0; /* keep compiler quiet */
|
||||
#endif /* HAVE_LOCALE_T */
|
||||
}
|
||||
|
||||
pfree(str);
|
||||
}
|
||||
|
||||
if (result == -1)
|
||||
{
|
||||
/*
|
||||
* Invalid multibyte character encountered. We try to give a useful
|
||||
* error message by letting pg_verifymbstr check the string. But it's
|
||||
* possible that the string is OK to us, and not OK to mbstowcs ---
|
||||
* this suggests that the LC_CTYPE locale is different from the
|
||||
* database encoding. Give a generic error message if verifymbstr
|
||||
* can't find anything wrong.
|
||||
*/
|
||||
pg_verifymbstr(from, fromlen, false); /* might not return */
|
||||
/* but if it does ... */
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid multibyte character for locale"),
|
||||
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif /* USE_WIDE_UPPER_LOWER */
|
||||
|
Reference in New Issue
Block a user