mirror of
https://github.com/postgres/postgres.git
synced 2025-11-19 13:42:17 +03:00
Refactor to add pg_strcoll(), pg_strxfrm(), and variants.
Offers a generally better separation of responsibilities for collation code. Also, a step towards multi-lib ICU, which should be based on a clean separation of the routines required for collation providers. Callers with NUL-terminated strings should call pg_strcoll() or pg_strxfrm(); callers with strings and their length should call the variants pg_strncoll() or pg_strnxfrm(). Reviewed-by: Peter Eisentraut, Peter Geoghegan Discussion: https://postgr.es/m/a581136455c940d7bd0ff482d3a2bd51af25a94f.camel%40j-davis.com
This commit is contained in:
@@ -1553,10 +1553,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
|
||||
}
|
||||
else
|
||||
{
|
||||
char a1buf[TEXTBUFLEN];
|
||||
char a2buf[TEXTBUFLEN];
|
||||
char *a1p,
|
||||
*a2p;
|
||||
pg_locale_t mylocale;
|
||||
|
||||
mylocale = pg_newlocale_from_collation(collid);
|
||||
@@ -1573,171 +1569,16 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
|
||||
if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
|
||||
return 0;
|
||||
|
||||
#ifdef WIN32
|
||||
/* Win32 does not have UTF-8, so we need to map to UTF-16 */
|
||||
if (GetDatabaseEncoding() == PG_UTF8
|
||||
&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
|
||||
{
|
||||
int a1len;
|
||||
int a2len;
|
||||
int r;
|
||||
|
||||
if (len1 >= TEXTBUFLEN / 2)
|
||||
{
|
||||
a1len = len1 * 2 + 2;
|
||||
a1p = palloc(a1len);
|
||||
}
|
||||
else
|
||||
{
|
||||
a1len = TEXTBUFLEN;
|
||||
a1p = a1buf;
|
||||
}
|
||||
if (len2 >= TEXTBUFLEN / 2)
|
||||
{
|
||||
a2len = len2 * 2 + 2;
|
||||
a2p = palloc(a2len);
|
||||
}
|
||||
else
|
||||
{
|
||||
a2len = TEXTBUFLEN;
|
||||
a2p = a2buf;
|
||||
}
|
||||
|
||||
/* stupid Microsloth API does not work for zero-length input */
|
||||
if (len1 == 0)
|
||||
r = 0;
|
||||
else
|
||||
{
|
||||
r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
|
||||
(LPWSTR) a1p, a1len / 2);
|
||||
if (!r)
|
||||
ereport(ERROR,
|
||||
(errmsg("could not convert string to UTF-16: error code %lu",
|
||||
GetLastError())));
|
||||
}
|
||||
((LPWSTR) a1p)[r] = 0;
|
||||
|
||||
if (len2 == 0)
|
||||
r = 0;
|
||||
else
|
||||
{
|
||||
r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
|
||||
(LPWSTR) a2p, a2len / 2);
|
||||
if (!r)
|
||||
ereport(ERROR,
|
||||
(errmsg("could not convert string to UTF-16: error code %lu",
|
||||
GetLastError())));
|
||||
}
|
||||
((LPWSTR) a2p)[r] = 0;
|
||||
|
||||
errno = 0;
|
||||
#ifdef HAVE_LOCALE_T
|
||||
if (mylocale)
|
||||
result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
|
||||
else
|
||||
#endif
|
||||
result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
|
||||
if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
|
||||
* headers */
|
||||
ereport(ERROR,
|
||||
(errmsg("could not compare Unicode strings: %m")));
|
||||
|
||||
/* Break tie if necessary. */
|
||||
if (result == 0 &&
|
||||
(!mylocale || mylocale->deterministic))
|
||||
{
|
||||
result = memcmp(arg1, arg2, Min(len1, len2));
|
||||
if ((result == 0) && (len1 != len2))
|
||||
result = (len1 < len2) ? -1 : 1;
|
||||
}
|
||||
|
||||
if (a1p != a1buf)
|
||||
pfree(a1p);
|
||||
if (a2p != a2buf)
|
||||
pfree(a2p);
|
||||
|
||||
return result;
|
||||
}
|
||||
#endif /* WIN32 */
|
||||
|
||||
if (len1 >= TEXTBUFLEN)
|
||||
a1p = (char *) palloc(len1 + 1);
|
||||
else
|
||||
a1p = a1buf;
|
||||
if (len2 >= TEXTBUFLEN)
|
||||
a2p = (char *) palloc(len2 + 1);
|
||||
else
|
||||
a2p = a2buf;
|
||||
|
||||
memcpy(a1p, arg1, len1);
|
||||
a1p[len1] = '\0';
|
||||
memcpy(a2p, arg2, len2);
|
||||
a2p[len2] = '\0';
|
||||
|
||||
if (mylocale)
|
||||
{
|
||||
if (mylocale->provider == COLLPROVIDER_ICU)
|
||||
{
|
||||
#ifdef USE_ICU
|
||||
#ifdef HAVE_UCOL_STRCOLLUTF8
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
UErrorCode status;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
result = ucol_strcollUTF8(mylocale->info.icu.ucol,
|
||||
arg1, len1,
|
||||
arg2, len2,
|
||||
&status);
|
||||
if (U_FAILURE(status))
|
||||
ereport(ERROR,
|
||||
(errmsg("collation failed: %s", u_errorName(status))));
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
int32_t ulen1,
|
||||
ulen2;
|
||||
UChar *uchar1,
|
||||
*uchar2;
|
||||
|
||||
ulen1 = icu_to_uchar(&uchar1, arg1, len1);
|
||||
ulen2 = icu_to_uchar(&uchar2, arg2, len2);
|
||||
|
||||
result = ucol_strcoll(mylocale->info.icu.ucol,
|
||||
uchar1, ulen1,
|
||||
uchar2, ulen2);
|
||||
|
||||
pfree(uchar1);
|
||||
pfree(uchar2);
|
||||
}
|
||||
#else /* not USE_ICU */
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
|
||||
#endif /* not USE_ICU */
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef HAVE_LOCALE_T
|
||||
result = strcoll_l(a1p, a2p, mylocale->info.lt);
|
||||
#else
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
result = strcoll(a1p, a2p);
|
||||
result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
|
||||
|
||||
/* Break tie if necessary. */
|
||||
if (result == 0 &&
|
||||
(!mylocale || mylocale->deterministic))
|
||||
result = strcmp(a1p, a2p);
|
||||
|
||||
if (a1p != a1buf)
|
||||
pfree(a1p);
|
||||
if (a2p != a2buf)
|
||||
pfree(a2p);
|
||||
{
|
||||
result = memcmp(arg1, arg2, Min(len1, len2));
|
||||
if ((result == 0) && (len1 != len2))
|
||||
result = (len1 < len2) ? -1 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
@@ -2073,20 +1914,6 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
|
||||
*/
|
||||
locale = pg_newlocale_from_collation(collid);
|
||||
|
||||
/*
|
||||
* There is a further exception on Windows. When the database
|
||||
* encoding is UTF-8 and we are not using the C collation, complex
|
||||
* hacks are required. We don't currently have a comparator that
|
||||
* handles that case, so we fall back on the slow method of having the
|
||||
* sort code invoke bttextcmp() (in the case of text) via the fmgr
|
||||
* trampoline. ICU locales work just the same on Windows, however.
|
||||
*/
|
||||
#ifdef WIN32
|
||||
if (GetDatabaseEncoding() == PG_UTF8 &&
|
||||
!(locale && locale->provider == COLLPROVIDER_ICU))
|
||||
return;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We use varlenafastcmp_locale except for type NAME.
|
||||
*/
|
||||
@@ -2102,13 +1929,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
|
||||
|
||||
/*
|
||||
* Unfortunately, it seems that abbreviation for non-C collations is
|
||||
* broken on many common platforms; testing of multiple versions of glibc
|
||||
* reveals that, for many locales, strcoll() and strxfrm() do not return
|
||||
* consistent results, which is fatal to this optimization. While no
|
||||
* other libc other than Cygwin has so far been shown to have a problem,
|
||||
* we take the conservative course of action for right now and disable
|
||||
* this categorically. (Users who are certain this isn't a problem on
|
||||
* their system can define TRUST_STRXFRM.)
|
||||
* broken on many common platforms; see pg_strxfrm_enabled().
|
||||
*
|
||||
* Even apart from the risk of broken locales, it's possible that there
|
||||
* are platforms where the use of abbreviated keys should be disabled at
|
||||
@@ -2121,10 +1942,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
|
||||
* categorically, we may still want or need to disable it for particular
|
||||
* platforms.
|
||||
*/
|
||||
#ifndef TRUST_STRXFRM
|
||||
if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
|
||||
if (!collate_c && !pg_strxfrm_enabled(locale))
|
||||
abbreviate = false;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If we're using abbreviated keys, or if we're using a locale-aware
|
||||
@@ -2395,60 +2214,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
|
||||
return sss->last_returned;
|
||||
}
|
||||
|
||||
if (sss->locale)
|
||||
{
|
||||
if (sss->locale->provider == COLLPROVIDER_ICU)
|
||||
{
|
||||
#ifdef USE_ICU
|
||||
#ifdef HAVE_UCOL_STRCOLLUTF8
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
UErrorCode status;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
|
||||
a1p, len1,
|
||||
a2p, len2,
|
||||
&status);
|
||||
if (U_FAILURE(status))
|
||||
ereport(ERROR,
|
||||
(errmsg("collation failed: %s", u_errorName(status))));
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
int32_t ulen1,
|
||||
ulen2;
|
||||
UChar *uchar1,
|
||||
*uchar2;
|
||||
|
||||
ulen1 = icu_to_uchar(&uchar1, a1p, len1);
|
||||
ulen2 = icu_to_uchar(&uchar2, a2p, len2);
|
||||
|
||||
result = ucol_strcoll(sss->locale->info.icu.ucol,
|
||||
uchar1, ulen1,
|
||||
uchar2, ulen2);
|
||||
|
||||
pfree(uchar1);
|
||||
pfree(uchar2);
|
||||
}
|
||||
#else /* not USE_ICU */
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
|
||||
#endif /* not USE_ICU */
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef HAVE_LOCALE_T
|
||||
result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
|
||||
#else
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
result = strcoll(sss->buf1, sss->buf2);
|
||||
result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
|
||||
|
||||
/* Break tie if necessary. */
|
||||
if (result == 0 &&
|
||||
@@ -2471,6 +2237,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
|
||||
static Datum
|
||||
varstr_abbrev_convert(Datum original, SortSupport ssup)
|
||||
{
|
||||
const size_t max_prefix_bytes = sizeof(Datum);
|
||||
VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
|
||||
VarString *authoritative = DatumGetVarStringPP(original);
|
||||
char *authoritative_data = VARDATA_ANY(authoritative);
|
||||
@@ -2483,7 +2250,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
|
||||
|
||||
pres = (char *) &res;
|
||||
/* memset(), so any non-overwritten bytes are NUL */
|
||||
memset(pres, 0, sizeof(Datum));
|
||||
memset(pres, 0, max_prefix_bytes);
|
||||
len = VARSIZE_ANY_EXHDR(authoritative);
|
||||
|
||||
/* Get number of bytes, ignoring trailing spaces */
|
||||
@@ -2518,14 +2285,10 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
|
||||
* thing: explicitly consider string length.
|
||||
*/
|
||||
if (sss->collate_c)
|
||||
memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
|
||||
memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
|
||||
else
|
||||
{
|
||||
Size bsize;
|
||||
#ifdef USE_ICU
|
||||
int32_t ulen = -1;
|
||||
UChar *uchar = NULL;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We're not using the C collation, so fall back on strxfrm or ICU
|
||||
@@ -2543,7 +2306,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
|
||||
if (sss->last_len1 == len && sss->cache_blob &&
|
||||
memcmp(sss->buf1, authoritative_data, len) == 0)
|
||||
{
|
||||
memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
|
||||
memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
|
||||
/* No change affecting cardinality, so no hashing required */
|
||||
goto done;
|
||||
}
|
||||
@@ -2551,81 +2314,49 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
|
||||
memcpy(sss->buf1, authoritative_data, len);
|
||||
|
||||
/*
|
||||
* Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
|
||||
* necessary for ICU, but doesn't hurt.
|
||||
* pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated
|
||||
* strings.
|
||||
*/
|
||||
sss->buf1[len] = '\0';
|
||||
sss->last_len1 = len;
|
||||
|
||||
#ifdef USE_ICU
|
||||
/* When using ICU and not UTF8, convert string to UChar. */
|
||||
if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
|
||||
GetDatabaseEncoding() != PG_UTF8)
|
||||
ulen = icu_to_uchar(&uchar, sss->buf1, len);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
|
||||
* and try again. Both of these functions have the result buffer
|
||||
* content undefined if the result did not fit, so we need to retry
|
||||
* until everything fits, even though we only need the first few bytes
|
||||
* in the end. When using ucol_nextSortKeyPart(), however, we only
|
||||
* ask for as many bytes as we actually need.
|
||||
*/
|
||||
for (;;)
|
||||
if (pg_strxfrm_prefix_enabled(sss->locale))
|
||||
{
|
||||
#ifdef USE_ICU
|
||||
if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
|
||||
if (sss->buflen2 < max_prefix_bytes)
|
||||
{
|
||||
/*
|
||||
* When using UTF8, use the iteration interface so we only
|
||||
* need to produce as many bytes as we actually need.
|
||||
*/
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
UCharIterator iter;
|
||||
uint32_t state[2];
|
||||
UErrorCode status;
|
||||
|
||||
uiter_setUTF8(&iter, sss->buf1, len);
|
||||
state[0] = state[1] = 0; /* won't need that again */
|
||||
status = U_ZERO_ERROR;
|
||||
bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
|
||||
&iter,
|
||||
state,
|
||||
(uint8_t *) sss->buf2,
|
||||
Min(sizeof(Datum), sss->buflen2),
|
||||
&status);
|
||||
if (U_FAILURE(status))
|
||||
ereport(ERROR,
|
||||
(errmsg("sort key generation failed: %s",
|
||||
u_errorName(status))));
|
||||
}
|
||||
else
|
||||
bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
|
||||
uchar, ulen,
|
||||
(uint8_t *) sss->buf2, sss->buflen2);
|
||||
sss->buflen2 = Max(max_prefix_bytes,
|
||||
Min(sss->buflen2 * 2, MaxAllocSize));
|
||||
sss->buf2 = repalloc(sss->buf2, sss->buflen2);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef HAVE_LOCALE_T
|
||||
if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
|
||||
bsize = strxfrm_l(sss->buf2, sss->buf1,
|
||||
sss->buflen2, sss->locale->info.lt);
|
||||
else
|
||||
#endif
|
||||
bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
|
||||
|
||||
sss->last_len2 = bsize;
|
||||
if (bsize < sss->buflen2)
|
||||
break;
|
||||
|
||||
bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
|
||||
max_prefix_bytes, sss->locale);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Grow buffer and retry.
|
||||
* Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
|
||||
* again. The pg_strxfrm() function leaves the result buffer
|
||||
* content undefined if the result did not fit, so we need to
|
||||
* retry until everything fits, even though we only need the first
|
||||
* few bytes in the end.
|
||||
*/
|
||||
sss->buflen2 = Max(bsize + 1,
|
||||
Min(sss->buflen2 * 2, MaxAllocSize));
|
||||
sss->buf2 = repalloc(sss->buf2, sss->buflen2);
|
||||
for (;;)
|
||||
{
|
||||
bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
|
||||
sss->locale);
|
||||
|
||||
sss->last_len2 = bsize;
|
||||
if (bsize < sss->buflen2)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Grow buffer and retry.
|
||||
*/
|
||||
sss->buflen2 = Max(bsize + 1,
|
||||
Min(sss->buflen2 * 2, MaxAllocSize));
|
||||
sss->buf2 = repalloc(sss->buf2, sss->buflen2);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2637,12 +2368,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
|
||||
* (Actually, even if there were NUL bytes in the blob it would be
|
||||
* okay. See remarks on bytea case above.)
|
||||
*/
|
||||
memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
|
||||
|
||||
#ifdef USE_ICU
|
||||
if (uchar)
|
||||
pfree(uchar);
|
||||
#endif
|
||||
memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user