diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index e3e40d6c21a..2deba44abd3 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -292,21 +292,24 @@ hashtext(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; + const char *keydata = VARDATA_ANY(key); + size_t keylen = VARSIZE_ANY_EXHDR(key); - ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize + 1); - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); - buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); + rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); - result = hash_any(buf, bsize); + /* + * In principle, there's no reason to include the terminating NUL + * character in the hash, but it was done before and the behavior + * must be preserved. 
+ */ + result = hash_any((uint8_t *) buf, bsize + 1); pfree(buf); } @@ -350,21 +353,25 @@ hashtextextended(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; + const char *keydata = VARDATA_ANY(key); + size_t keylen = VARSIZE_ANY_EXHDR(key); - ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize + 1); - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); - buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); + rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); - result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); + /* + * In principle, there's no reason to include the terminating NUL + * character in the hash, but it was done before and the behavior + * must be preserved. + */ + result = hash_any_extended((uint8_t *) buf, bsize + 1, + PG_GETARG_INT64(1)); pfree(buf); } diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 059e4fd79f0..ef9efb4a7c9 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -79,6 +79,12 @@ #include #endif +/* + * This should be large enough that most strings will fit, but small enough + * that we feel comfortable putting it on the stack + */ +#define TEXTBUFLEN 1024 + #define MAX_L10N_DATA 80 @@ -123,6 +129,19 @@ static char *IsoLocaleName(const char *); #endif #ifdef USE_ICU +/* + * Converter object for converting between ICU's UChar strings and C strings + * in database encoding. Since the database encoding doesn't change, we only + * need one of these per session. 
+ */
+static UConverter *icu_converter = NULL;
+
+static void init_icu_converter(void);
+static size_t uchar_length(UConverter *converter,
+						   const char *str, int32_t len);
+static int32_t uchar_convert(UConverter *converter,
+							 UChar *dest, int32_t destlen,
+							 const char *str, int32_t srclen);
 static void icu_set_collation_attributes(UCollator *collator, const char *loc);
 #endif
@@ -1731,15 +1750,705 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 	return collversion;
 }
+/*
+ * pg_strncoll_libc_win32_utf8
+ *
+ * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
+ * invoke wcscoll() or wcscoll_l().
+ * Inputs need not be nul-terminated; len1/len2 give their lengths in bytes.
+ */
+#ifdef WIN32
+static int
+pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
+							size_t len2, pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	char	   *a1p,
+			   *a2p;
+	int			a1len = len1 * 2 + 2;
+	int			a2len = len2 * 2 + 2;
+	int			r;
+	int			result;
+
+	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+
+	/* both UTF-16 copies share one buffer; palloc if sbuf is too small */
+	if (a1len + a2len > TEXTBUFLEN)
+		buf = palloc(a1len + a2len);
+
+	a1p = buf;
+	a2p = buf + a1len;
+
+	/* API does not work for zero-length input */
+	if (len1 == 0)
+		r = 0;
+	else
+	{
+		/* a1len is in bytes; the API takes the output capacity in WCHARs */
+		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
+								(LPWSTR) a1p, a1len / 2);
+		if (!r)
+			ereport(ERROR,
+					(errmsg("could not convert string to UTF-16: error code %lu",
+							GetLastError())));
+	}
+	((LPWSTR) a1p)[r] = 0;
+
+	if (len2 == 0)
+		r = 0;
+	else
+	{
+		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
+								(LPWSTR) a2p, a2len / 2);
+		if (!r)
+			ereport(ERROR,
+					(errmsg("could not convert string to UTF-16: error code %lu",
+							GetLastError())));
+	}
+	((LPWSTR) a2p)[r] = 0;
+
+	errno = 0;
+#ifdef HAVE_LOCALE_T
+	if (locale)
+		result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
+	else
+#endif
+		result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
+	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
+								 * headers */
+		ereport(ERROR,
+				(errmsg("could not compare Unicode strings: %m")));
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result;
+}
+#endif							/* WIN32 */
+
+/*
+ * pg_strcoll_libc
+ *
+ * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for
+ * the given locale, platform, and database encoding. If the locale is NULL,
+ * use the database collation.
+ *
+ * Arguments must be encoded in the database encoding and nul-terminated.
+ */
+static int
+pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
+{
+	int			result;
+
+	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+#ifdef WIN32
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		size_t		len1 = strlen(arg1);
+		size_t		len2 = strlen(arg2);
+		result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
+	}
+	else
+#endif							/* WIN32 */
+	if (locale)
+	{
+#ifdef HAVE_LOCALE_T
+		result = strcoll_l(arg1, arg2, locale->info.lt);
+#else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+#endif
+	}
+	else
+		result = strcoll(arg1, arg2);
+
+	return result;
+}
+
+/*
+ * pg_strncoll_libc
+ *
+ * Nul-terminate the arguments and call pg_strcoll_libc().
+ */
+static int
+pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
+				 pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	size_t		bufsize1 = len1 + 1;
+	size_t		bufsize2 = len2 + 1;
+	char	   *arg1n;
+	char	   *arg2n;
+	int			result;
+
+	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+
+#ifdef WIN32
+	/* check for this case before doing the work for nul-termination */
+	if (GetDatabaseEncoding() == PG_UTF8)
+		return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
+#endif							/* WIN32 */
+
+	/* one buffer holds both terminated copies; palloc if sbuf is too small */
+	if (bufsize1 + bufsize2 > TEXTBUFLEN)
+		buf = palloc(bufsize1 + bufsize2);
+
+	/* the second copy starts right after the first copy's terminator */
+	arg1n = buf;
+	arg2n = buf + bufsize1;
+
+	/* nul-terminate arguments */
+	memcpy(arg1n, arg1, len1);
+	arg1n[len1] = '\0';
+	memcpy(arg2n, arg2, len2);
+	arg2n[len2] = '\0';
+
+	result = pg_strcoll_libc(arg1n, arg2n, locale);
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result;
+}
 #ifdef USE_ICU
-/*
- * Converter object for converting between ICU's UChar strings and C strings
- * in database encoding.  Since the database encoding doesn't change, we only
- * need one of these per session.
- */
-static UConverter *icu_converter = NULL;
+/*
+ * pg_strncoll_icu_no_utf8
+ *
+ * Convert the arguments from the database encoding to UChar strings, then
+ * call ucol_strcoll(). An argument length of -1 means that the string is
+ * NUL-terminated.
+ *
+ * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
+ * caller should call that instead.
+ */
+static int
+pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
+						const char *arg2, int32_t len2, pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	int32_t		ulen1;
+	int32_t		ulen2;
+	size_t		bufsize1;
+	size_t		bufsize2;
+	UChar	   *uchar1,
+			   *uchar2;
+	int			result;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+#ifdef HAVE_UCOL_STRCOLLUTF8
+	Assert(GetDatabaseEncoding() != PG_UTF8);
+#endif
+
+	init_icu_converter();
+
+	/* first pass: measure both converted strings, in UChars */
+	ulen1 = uchar_length(icu_converter, arg1, len1);
+	ulen2 = uchar_length(icu_converter, arg2, len2);
+
+	/* byte sizes, each with room for one extra (terminating) UChar */
+	bufsize1 = (ulen1 + 1) * sizeof(UChar);
+	bufsize2 = (ulen2 + 1) * sizeof(UChar);
+
+	if (bufsize1 + bufsize2 > TEXTBUFLEN)
+		buf = palloc(bufsize1 + bufsize2);
+
+	/*
+	 * NOTE(review): sbuf is declared as a char array but is accessed through
+	 * UChar pointers here; confirm the alignment is adequate on all
+	 * supported platforms.
+	 */
+	uchar1 = (UChar *) buf;
+	uchar2 = (UChar *) (buf + bufsize1);
+
+	/* second pass: do the actual conversion into the shared buffer */
+	ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
+	ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
+
+	result = ucol_strcoll(locale->info.icu.ucol,
+						  uchar1, ulen1,
+						  uchar2, ulen2);
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result;
+}
+
+/*
+ * pg_strncoll_icu
+ *
+ * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
+ * database encoding. An argument length of -1 means the string is
+ * NUL-terminated.
+ *
+ * Arguments must be encoded in the database encoding.
+ */
+static int
+pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
+				pg_locale_t locale)
+{
+	int			result;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+
+#ifdef HAVE_UCOL_STRCOLLUTF8
+	/* UTF-8 input is handed to ICU directly, without a UChar conversion */
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UErrorCode	status;
+
+		status = U_ZERO_ERROR;
+		result = ucol_strcollUTF8(locale->info.icu.ucol,
+								  arg1, len1,
+								  arg2, len2,
+								  &status);
+		if (U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("collation failed: %s", u_errorName(status))));
+	}
+	else
+#endif
+	{
+		/* other encodings (or no ucol_strcollUTF8()) go through UChar */
+		result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
+	}
+
+	return result;
+}
+
+#endif							/* USE_ICU */
+
+/*
+ * pg_strcoll
+ *
+ * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
+ * or wcscoll_l() as appropriate for the given locale, platform, and database
+ * encoding. If the locale is not specified, use the database collation.
+ *
+ * Arguments must be encoded in the database encoding and nul-terminated.
+ *
+ * The caller is responsible for breaking ties if the collation is
+ * deterministic; this maintains consistency with pg_strxfrm(), which cannot
+ * easily account for deterministic collations.
+ */
+int
+pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
+{
+	int			result;
+
+	/* a NULL locale is routed to the libc implementation */
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+		result = pg_strcoll_libc(arg1, arg2, locale);
+#ifdef USE_ICU
+	else if (locale->provider == COLLPROVIDER_ICU)
+		/* length -1 tells the ICU path the strings are nul-terminated */
+		result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
+#endif
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return result;
+}
+
+/*
+ * pg_strncoll
+ *
+ * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
+ * or wcscoll_l() as appropriate for the given locale, platform, and database
+ * encoding. If the locale is not specified, use the database collation.
+ *
+ * Arguments must be encoded in the database encoding.
+ *
+ * This function may need to nul-terminate the arguments for libc functions;
+ * so if the caller already has nul-terminated strings, it should call
+ * pg_strcoll() instead.
+ *
+ * The caller is responsible for breaking ties if the collation is
+ * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
+ * easily account for deterministic collations.
+ */
+int
+pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
+			pg_locale_t locale)
+{
+	int			result;
+
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+		result = pg_strncoll_libc(arg1, len1, arg2, len2, locale);
+#ifdef USE_ICU
+	else if (locale->provider == COLLPROVIDER_ICU)
+		result = pg_strncoll_icu(arg1, len1, arg2, len2, locale);
+#endif
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return result;
+}
+
+
+/*
+ * pg_strxfrm_libc
+ *
+ * Call strxfrm_l() or strxfrm() on the nul-terminated 'src'. A NULL locale
+ * selects plain strxfrm(). Only usable when TRUST_STRXFRM is defined;
+ * otherwise an error is raised.
+ */
+static size_t
+pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
+				pg_locale_t locale)
+{
+	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+
+#ifdef TRUST_STRXFRM
+#ifdef HAVE_LOCALE_T
+	if (locale)
+		return strxfrm_l(dest, src, destsize, locale->info.lt);
+	else
+#endif
+		return strxfrm(dest, src, destsize);
+#else
+	/*
+	 * shouldn't happen.  'locale' may be NULL here, so report the provider
+	 * this function serves instead of dereferencing the pointer.
+	 */
+	elog(ERROR, "unsupported collprovider: %c", COLLPROVIDER_LIBC);
+	return 0;					/* keep compiler quiet */
+#endif
+}
+
+/*
+ * pg_strnxfrm_libc
+ *
+ * Nul-terminate a copy of 'src' (length 'srclen') and call
+ * pg_strxfrm_libc().
+ */
+static size_t
+pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
+				 pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	size_t		bufsize = srclen + 1;
+	size_t		result;
+
+	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
+
+	if (bufsize > TEXTBUFLEN)
+		buf = palloc(bufsize);
+
+	/* nul-terminate arguments */
+	memcpy(buf, src, srclen);
+	buf[srclen] = '\0';
+
+	result = pg_strxfrm_libc(dest, buf, destsize, locale);
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	/* if dest is defined, it should be nul-terminated */
+	Assert(result >= destsize || dest[result] == '\0');
+
+	return result;
+}
+
+#ifdef USE_ICU
+
+/* 'srclen'
of -1 means the strings are NUL-terminated */
+static size_t
+pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
+				pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	UChar	   *uchar;
+	int32_t		ulen;
+	size_t		uchar_bsize;
+	Size		result_bsize;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+
+	init_icu_converter();
+
+	/* measure first, then convert into a buffer of exactly that size */
+	ulen = uchar_length(icu_converter, src, srclen);
+
+	uchar_bsize = (ulen + 1) * sizeof(UChar);
+
+	if (uchar_bsize > TEXTBUFLEN)
+		buf = palloc(uchar_bsize);
+
+	uchar = (UChar *) buf;
+
+	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
+
+	result_bsize = ucol_getSortKey(locale->info.icu.ucol,
+								   uchar, ulen,
+								   (uint8_t *) dest, destsize);
+
+	/*
+	 * ucol_getSortKey() counts the nul-terminator in the result length, but
+	 * this function should not.
+	 */
+	Assert(result_bsize > 0);
+	result_bsize--;
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	/* if dest is defined, it should be nul-terminated */
+	Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
+
+	return result_bsize;
+}
+
+/* 'srclen' of -1 means the strings are NUL-terminated */
+static size_t
+pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
+							   int32_t destsize, pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	UCharIterator iter;
+	uint32_t	state[2];
+	UErrorCode	status;
+	int32_t		ulen = -1;
+	UChar	   *uchar = NULL;
+	size_t		uchar_bsize;
+	Size		result_bsize;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+	Assert(GetDatabaseEncoding() != PG_UTF8);
+
+	init_icu_converter();
+
+	ulen = uchar_length(icu_converter, src, srclen);
+
+	uchar_bsize = (ulen + 1) * sizeof(UChar);
+
+	if (uchar_bsize > TEXTBUFLEN)
+		buf = palloc(uchar_bsize);
+
+	uchar = (UChar *) buf;
+
+	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
+
+	uiter_setString(&iter, uchar, ulen);
+	state[0] = state[1] = 0;	/* won't need that again */
+	status = U_ZERO_ERROR;
+	result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
+										&iter,
+										state,
+										(uint8_t *) dest,
+										destsize,
+										&status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("sort key generation failed: %s",
+						u_errorName(status))));
+
+	/*
+	 * Release the conversion buffer if it did not fit in sbuf; the iterator
+	 * that referenced it is not used past this point.
+	 */
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result_bsize;
+}
+
+/* 'srclen' of -1 means the strings are NUL-terminated */
+static size_t
+pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
+					   int32_t destsize, pg_locale_t locale)
+{
+	size_t		result;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+
+	/* UTF-8 input can be iterated in place; other encodings are converted */
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UCharIterator iter;
+		uint32_t	state[2];
+		UErrorCode	status;
+
+		uiter_setUTF8(&iter, src, srclen);
+		state[0] = state[1] = 0;	/* won't need that again */
+		status = U_ZERO_ERROR;
+		result = ucol_nextSortKeyPart(locale->info.icu.ucol,
+									  &iter,
+									  state,
+									  (uint8_t *) dest,
+									  destsize,
+									  &status);
+		if (U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("sort key generation failed: %s",
+							u_errorName(status))));
+	}
+	else
+		result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
+												locale);
+
+	return result;
+}
+
+#endif
+
+/*
+ * Return true if the collation provider supports pg_strxfrm() and
+ * pg_strnxfrm(); otherwise false.
+ *
+ * Unfortunately, it seems that strxfrm() for non-C collations is broken on
+ * many common platforms; testing of multiple versions of glibc reveals that,
+ * for many locales, strcoll() and strxfrm() do not return consistent
+ * results. While no other libc other than Cygwin has so far been shown to
+ * have a problem, we take the conservative course of action for right now and
+ * disable this categorically.  (Users who are certain this isn't a problem on
+ * their system can define TRUST_STRXFRM.)
+ *
+ * No similar problem is known for the ICU provider.
+ */
+bool
+pg_strxfrm_enabled(pg_locale_t locale)
+{
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+#ifdef TRUST_STRXFRM
+		return true;
+#else
+		return false;
+#endif
+	else if (locale->provider == COLLPROVIDER_ICU)
+		return true;
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return false;				/* keep compiler quiet */
+}
+
+/*
+ * pg_strxfrm
+ *
+ * Transforms 'src' to a nul-terminated string stored in 'dest' such that
+ * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
+ * untransformed strings.
+ *
+ * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
+ * may be NULL.
+ *
+ * Returns the number of bytes needed to store the transformed string,
+ * excluding the terminating nul byte. If the value returned is 'destsize' or
+ * greater, the resulting contents of 'dest' are undefined.
+ */
+size_t
+pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
+{
+	size_t		result = 0;		/* keep compiler quiet */
+
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+		result = pg_strxfrm_libc(dest, src, destsize, locale);
+#ifdef USE_ICU
+	else if (locale->provider == COLLPROVIDER_ICU)
+		/* length -1 tells the ICU path that 'src' is nul-terminated */
+		result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
+#endif
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return result;
+}
+
+/*
+ * pg_strnxfrm
+ *
+ * Transforms 'src' to a nul-terminated string stored in 'dest' such that
+ * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
+ * untransformed strings.
+ *
+ * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
+ * be NULL.
+ *
+ * Returns the number of bytes needed to store the transformed string,
+ * excluding the terminating nul byte. If the value returned is 'destsize' or
+ * greater, the resulting contents of 'dest' are undefined.
+ *
+ * This function may need to nul-terminate the argument for libc functions;
+ * so if the caller already has a nul-terminated string, it should call
+ * pg_strxfrm() instead.
+ */
+size_t
+pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
+			pg_locale_t locale)
+{
+	size_t		result = 0;		/* keep compiler quiet */
+
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+		result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
+#ifdef USE_ICU
+	else if (locale->provider == COLLPROVIDER_ICU)
+		result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
+#endif
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return result;
+}
+
+/*
+ * Return true if the collation provider supports pg_strxfrm_prefix() and
+ * pg_strnxfrm_prefix(); otherwise false.
+ */
+bool
+pg_strxfrm_prefix_enabled(pg_locale_t locale)
+{
+	/* a NULL locale is treated like the libc provider: no prefix support */
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+		return false;
+	else if (locale->provider == COLLPROVIDER_ICU)
+		return true;
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return false;				/* keep compiler quiet */
+}
+
+/*
+ * pg_strxfrm_prefix
+ *
+ * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
+ * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * untransformed strings. The result is not nul-terminated.
+ *
+ * The provided 'src' must be nul-terminated.
+ *
+ * If destsize is not large enough to hold the resulting byte sequence, stores
+ * only the first destsize bytes in 'dest'. Returns the number of bytes
+ * actually copied to 'dest'.
+ */
+size_t
+pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
+				  pg_locale_t locale)
+{
+	size_t		result = 0;		/* keep compiler quiet */
+
+	/*
+	 * 'locale' may be NULL on this path, so name the libc provider directly
+	 * instead of dereferencing the pointer for the message.
+	 */
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+		elog(ERROR, "collprovider '%c' does not support pg_strxfrm_prefix()",
+			 COLLPROVIDER_LIBC);
+#ifdef USE_ICU
+	else if (locale->provider == COLLPROVIDER_ICU)
+		/* length -1 tells the ICU path that 'src' is nul-terminated */
+		result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
+#endif
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return result;
+}
+
+/*
+ * pg_strnxfrm_prefix
+ *
+ * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
+ * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * untransformed strings. The result is not nul-terminated.
+ *
+ * 'src' does not need to be nul-terminated; 'srclen' is its length in bytes.
+ *
+ * If destsize is not large enough to hold the resulting byte sequence, stores
+ * only the first destsize bytes in 'dest'. Returns the number of bytes
+ * actually copied to 'dest'.
+ *
+ * This function may need to nul-terminate the argument for libc functions;
+ * so if the caller already has a nul-terminated string, it should call
+ * pg_strxfrm_prefix() instead.
+ */
+size_t
+pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
+				   size_t srclen, pg_locale_t locale)
+{
+	size_t		result = 0;		/* keep compiler quiet */
+
+	/*
+	 * 'locale' may be NULL on this path, so name the libc provider directly
+	 * instead of dereferencing the pointer for the message.
+	 */
+	if (!locale || locale->provider == COLLPROVIDER_LIBC)
+		elog(ERROR, "collprovider '%c' does not support pg_strnxfrm_prefix()",
+			 COLLPROVIDER_LIBC);
+#ifdef USE_ICU
+	else if (locale->provider == COLLPROVIDER_ICU)
+		/* honor the caller's length; 'src' is not necessarily terminated */
+		result = pg_strnxfrm_prefix_icu(dest, src, srclen, destsize, locale);
+#endif
+	else
+		/* shouldn't happen */
+		elog(ERROR, "unsupported collprovider: %c", locale->provider);
+
+	return result;
+}
+
+#ifdef USE_ICU
 static void
 init_icu_converter(void)
 {
@@ -1767,6 +2476,39 @@ init_icu_converter(void)
 	icu_converter = conv;
 }
+/*
+ * Find length, in UChars, of given string if converted to UChar string.
+ */
+static size_t
+uchar_length(UConverter *converter, const char *str, int32_t len)
+{
+	UErrorCode	status = U_ZERO_ERROR;
+	int32_t		ulen;
+	/* NULL/0 destination: only the required length is computed */
+	ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
+	/* a buffer-overflow status is expected with a zero-size destination */
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	return ulen;
+}
+
+/*
+ * Convert the given source string into a UChar string, stored in dest, and
+ * return the length (in UChars).
+ */
+static int32_t
+uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
+			  const char *src, int32_t srclen)
+{
+	UErrorCode	status = U_ZERO_ERROR;
+	int32_t		ulen;
+	status = U_ZERO_ERROR;
+	ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
+	/* unlike uchar_length(), any failure status is an error here */
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	return ulen;
+}
+
 /*
  * Convert a string in the database encoding into a string of UChars.
  *
@@ -1782,26 +2524,15 @@ init_icu_converter(void)
 int32_t
 icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
 {
-	UErrorCode	status;
-	int32_t		len_uchar;
+	int32_t len_uchar;
 	init_icu_converter();
-	status = U_ZERO_ERROR;
-	len_uchar = ucnv_toUChars(icu_converter, NULL, 0,
-							  buff, nbytes, &status);
-	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	len_uchar = uchar_length(icu_converter, buff, nbytes);
 	*buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
-
-	status = U_ZERO_ERROR;
-	len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,
-							  buff, nbytes, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	len_uchar = uchar_convert(icu_converter,
+							  *buff_uchar, len_uchar + 1, buff, nbytes);
 	return len_uchar;
 }
diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 8ddbae8f51d..9ff3bcbdb75
100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -1024,21 +1024,22 @@ hashbpchar(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; - ulen = icu_to_uchar(&uchar, keydata, keylen); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize + 1); - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); - buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); + rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); - result = hash_any(buf, bsize); + /* + * In principle, there's no reason to include the terminating NUL + * character in the hash, but it was done before and the behavior + * must be preserved. + */ + result = hash_any((uint8_t *) buf, bsize + 1); pfree(buf); } @@ -1086,21 +1087,23 @@ hashbpcharextended(PG_FUNCTION_ARGS) #ifdef USE_ICU if (mylocale->provider == COLLPROVIDER_ICU) { - int32_t ulen = -1; - UChar *uchar = NULL; - Size bsize; - uint8_t *buf; + Size bsize, rsize; + char *buf; - ulen = icu_to_uchar(&uchar, keydata, keylen); + bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); + buf = palloc(bsize + 1); - bsize = ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, NULL, 0); - buf = palloc(bsize); - ucol_getSortKey(mylocale->info.icu.ucol, - uchar, ulen, buf, bsize); - pfree(uchar); + rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); + if (rsize != bsize) + elog(ERROR, "pg_strnxfrm() returned unexpected result"); - result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); + /* + * In principle, there's no reason to include the terminating NUL + * character in the hash, but it was done before and the behavior + * must be preserved. 
+ */ + result = hash_any_extended((uint8_t *) buf, bsize + 1, + PG_GETARG_INT64(1)); pfree(buf); } diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 170b3a3820b..4ca823ca7b1 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1553,10 +1553,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) } else { - char a1buf[TEXTBUFLEN]; - char a2buf[TEXTBUFLEN]; - char *a1p, - *a2p; pg_locale_t mylocale; mylocale = pg_newlocale_from_collation(collid); @@ -1573,171 +1569,16 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) if (len1 == len2 && memcmp(arg1, arg2, len1) == 0) return 0; -#ifdef WIN32 - /* Win32 does not have UTF-8, so we need to map to UTF-16 */ - if (GetDatabaseEncoding() == PG_UTF8 - && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC)) - { - int a1len; - int a2len; - int r; - - if (len1 >= TEXTBUFLEN / 2) - { - a1len = len1 * 2 + 2; - a1p = palloc(a1len); - } - else - { - a1len = TEXTBUFLEN; - a1p = a1buf; - } - if (len2 >= TEXTBUFLEN / 2) - { - a2len = len2 * 2 + 2; - a2p = palloc(a2len); - } - else - { - a2len = TEXTBUFLEN; - a2p = a2buf; - } - - /* stupid Microsloth API does not work for zero-length input */ - if (len1 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, - (LPWSTR) a1p, a1len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a1p)[r] = 0; - - if (len2 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, - (LPWSTR) a2p, a2len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a2p)[r] = 0; - - errno = 0; -#ifdef HAVE_LOCALE_T - if (mylocale) - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt); - else -#endif - result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); - if (result == 2147483647) 
/* _NLSCMPERROR; missing from mingw - * headers */ - ereport(ERROR, - (errmsg("could not compare Unicode strings: %m"))); - - /* Break tie if necessary. */ - if (result == 0 && - (!mylocale || mylocale->deterministic)) - { - result = memcmp(arg1, arg2, Min(len1, len2)); - if ((result == 0) && (len1 != len2)) - result = (len1 < len2) ? -1 : 1; - } - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); - - return result; - } -#endif /* WIN32 */ - - if (len1 >= TEXTBUFLEN) - a1p = (char *) palloc(len1 + 1); - else - a1p = a1buf; - if (len2 >= TEXTBUFLEN) - a2p = (char *) palloc(len2 + 1); - else - a2p = a2buf; - - memcpy(a1p, arg1, len1); - a1p[len1] = '\0'; - memcpy(a2p, arg2, len2); - a2p[len2] = '\0'; - - if (mylocale) - { - if (mylocale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(mylocale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, arg1, len1); - ulen2 = icu_to_uchar(&uchar2, arg2, len2); - - result = ucol_strcoll(mylocale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(a1p, a2p, mylocale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif - } - } - else - result = strcoll(a1p, a2p); + result = pg_strncoll(arg1, len1, arg2, len2, mylocale); /* Break tie if necessary. 
*/ if (result == 0 && (!mylocale || mylocale->deterministic)) - result = strcmp(a1p, a2p); - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); + { + result = memcmp(arg1, arg2, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? -1 : 1; + } } return result; @@ -2073,20 +1914,6 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) */ locale = pg_newlocale_from_collation(collid); - /* - * There is a further exception on Windows. When the database - * encoding is UTF-8 and we are not using the C collation, complex - * hacks are required. We don't currently have a comparator that - * handles that case, so we fall back on the slow method of having the - * sort code invoke bttextcmp() (in the case of text) via the fmgr - * trampoline. ICU locales work just the same on Windows, however. - */ -#ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8 && - !(locale && locale->provider == COLLPROVIDER_ICU)) - return; -#endif - /* * We use varlenafastcmp_locale except for type NAME. */ @@ -2102,13 +1929,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) /* * Unfortunately, it seems that abbreviation for non-C collations is - * broken on many common platforms; testing of multiple versions of glibc - * reveals that, for many locales, strcoll() and strxfrm() do not return - * consistent results, which is fatal to this optimization. While no - * other libc other than Cygwin has so far been shown to have a problem, - * we take the conservative course of action for right now and disable - * this categorically. (Users who are certain this isn't a problem on - * their system can define TRUST_STRXFRM.) + * broken on many common platforms; see pg_strxfrm_enabled(). 
* * Even apart from the risk of broken locales, it's possible that there * are platforms where the use of abbreviated keys should be disabled at @@ -2121,10 +1942,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) * categorically, we may still want or need to disable it for particular * platforms. */ -#ifndef TRUST_STRXFRM - if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU)) + if (!collate_c && !pg_strxfrm_enabled(locale)) abbreviate = false; -#endif /* * If we're using abbreviated keys, or if we're using a locale-aware @@ -2395,60 +2214,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) return sss->last_returned; } - if (sss->locale) - { - if (sss->locale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(sss->locale->info.icu.ucol, - a1p, len1, - a2p, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, a1p, len1); - ulen2 = icu_to_uchar(&uchar2, a2p, len2); - - result = ucol_strcoll(sss->locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif - } - } - else - result = strcoll(sss->buf1, sss->buf2); + result = pg_strcoll(sss->buf1, sss->buf2, sss->locale); /* Break tie if necessary. 
*/ if (result == 0 && @@ -2471,6 +2237,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) static Datum varstr_abbrev_convert(Datum original, SortSupport ssup) { + const size_t max_prefix_bytes = sizeof(Datum); VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; VarString *authoritative = DatumGetVarStringPP(original); char *authoritative_data = VARDATA_ANY(authoritative); @@ -2483,7 +2250,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) pres = (char *) &res; /* memset(), so any non-overwritten bytes are NUL */ - memset(pres, 0, sizeof(Datum)); + memset(pres, 0, max_prefix_bytes); len = VARSIZE_ANY_EXHDR(authoritative); /* Get number of bytes, ignoring trailing spaces */ @@ -2518,14 +2285,10 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * thing: explicitly consider string length. */ if (sss->collate_c) - memcpy(pres, authoritative_data, Min(len, sizeof(Datum))); + memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); else { Size bsize; -#ifdef USE_ICU - int32_t ulen = -1; - UChar *uchar = NULL; -#endif /* * We're not using the C collation, so fall back on strxfrm or ICU @@ -2543,7 +2306,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) if (sss->last_len1 == len && sss->cache_blob && memcmp(sss->buf1, authoritative_data, len) == 0) { - memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2)); + memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2)); /* No change affecting cardinality, so no hashing required */ goto done; } @@ -2551,81 +2314,49 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) memcpy(sss->buf1, authoritative_data, len); /* - * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not - * necessary for ICU, but doesn't hurt. + * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated + * strings. */ sss->buf1[len] = '\0'; sss->last_len1 = len; -#ifdef USE_ICU - /* When using ICU and not UTF8, convert string to UChar. 
*/ - if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU && - GetDatabaseEncoding() != PG_UTF8) - ulen = icu_to_uchar(&uchar, sss->buf1, len); -#endif - - /* - * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer, - * and try again. Both of these functions have the result buffer - * content undefined if the result did not fit, so we need to retry - * until everything fits, even though we only need the first few bytes - * in the end. When using ucol_nextSortKeyPart(), however, we only - * ask for as many bytes as we actually need. - */ - for (;;) + if (pg_strxfrm_prefix_enabled(sss->locale)) { -#ifdef USE_ICU - if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU) + if (sss->buflen2 < max_prefix_bytes) { - /* - * When using UTF8, use the iteration interface so we only - * need to produce as many bytes as we actually need. - */ - if (GetDatabaseEncoding() == PG_UTF8) - { - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; - - uiter_setUTF8(&iter, sss->buf1, len); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol, - &iter, - state, - (uint8_t *) sss->buf2, - Min(sizeof(Datum), sss->buflen2), - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - } - else - bsize = ucol_getSortKey(sss->locale->info.icu.ucol, - uchar, ulen, - (uint8_t *) sss->buf2, sss->buflen2); + sss->buflen2 = Max(max_prefix_bytes, + Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = repalloc(sss->buf2, sss->buflen2); } - else -#endif -#ifdef HAVE_LOCALE_T - if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC) - bsize = strxfrm_l(sss->buf2, sss->buf1, - sss->buflen2, sss->locale->info.lt); - else -#endif - bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2); - - sss->last_len2 = bsize; - if (bsize < sss->buflen2) - break; + bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1, + 
max_prefix_bytes, sss->locale); + } + else + { /* - * Grow buffer and retry. + * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try + * again. The pg_strxfrm() function leaves the result buffer + * content undefined if the result did not fit, so we need to + * retry until everything fits, even though we only need the first + * few bytes in the end. */ - sss->buflen2 = Max(bsize + 1, - Min(sss->buflen2 * 2, MaxAllocSize)); - sss->buf2 = repalloc(sss->buf2, sss->buflen2); + for (;;) + { + bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2, + sss->locale); + + sss->last_len2 = bsize; + if (bsize < sss->buflen2) + break; + + /* + * Grow buffer and retry. + */ + sss->buflen2 = Max(bsize + 1, + Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = repalloc(sss->buf2, sss->buflen2); + } } /* @@ -2637,12 +2368,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * (Actually, even if there were NUL bytes in the blob it would be * okay. See remarks on bytea case above.) */ - memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize)); - -#ifdef USE_ICU - if (uchar) - pfree(uchar); -#endif + memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); } /* diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index cede43440b5..def2b55f941 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -100,6 +100,19 @@ extern void make_icu_collator(const char *iculocstr, extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); +extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); +extern int pg_strncoll(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); +extern bool pg_strxfrm_enabled(pg_locale_t locale); +extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize, + pg_locale_t locale); +extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, + size_t srclen, 
pg_locale_t locale); +extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale); +extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, + pg_locale_t locale); +extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, + size_t srclen, pg_locale_t locale); #ifdef USE_ICU extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);