1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-06 07:49:08 +03:00

Control ctype behavior internally with a method table.

Previously, pattern matching and case mapping behavior branched based
on the provider. Refactor to use a method table, which is less
error-prone.

This is also a step toward multiple provider versions, which we may
want to support in the future.

Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/2830211e1b6e6a2e26d845780b03e125281ea17b.camel%40j-davis.com
This commit is contained in:
Jeff Davis
2025-07-01 07:42:39 -07:00
parent d81dcc8d62
commit 5a38104b36
9 changed files with 686 additions and 508 deletions

View File

@@ -20,58 +20,13 @@
#include "common/unicode_category.h"
#include "utils/pg_locale.h"
/*
* For the libc provider, to provide as much functionality as possible on a
* variety of platforms without going so far as to implement everything from
* scratch, we use several implementation strategies depending on the
* situation:
*
* 1. In C/POSIX collations, we use hard-wired code. We can't depend on
* the <ctype.h> functions since those will obey LC_CTYPE. Note that these
* collations don't give a fig about multibyte characters.
*
* 2. When working in UTF8 encoding, we use the <wctype.h> functions.
* This assumes that every platform uses Unicode codepoints directly
* as the wchar_t representation of Unicode. (XXX: ICU makes this assumption
* even for non-UTF8 encodings, which may be a problem.) On some platforms
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
*
* 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
* values up to 255, and punt for values above that. This is 100% correct
* only in single-byte encodings such as LATINn. However, non-Unicode
* multibyte encodings are mostly Far Eastern character sets for which the
* properties being tested here aren't very relevant for higher code values
* anyway. The difficulty with using the <wctype.h> functions with
* non-Unicode multibyte encodings is that we can have no certainty that
* the platform's wchar_t representation matches what we do in pg_wchar
* conversions.
*
* As a special case, in the "default" collation, (2) and (3) force ASCII
* letters to follow ASCII upcase/downcase rules, while in a non-default
* collation we just let the library functions do what they will. The case
* where this matters is treatment of I/i in Turkish, and the behavior is
* meant to match the upper()/lower() SQL functions.
*
* We store the active collation setting in static variables. In principle
* it could be passed down to here via the regex library's "struct vars" data
* structure; but that would require somewhat invasive changes in the regex
* library, and right now there's no real benefit to be gained from that.
*
* NB: the coding here assumes pg_wchar is an unsigned type.
*/
/*
 * Selector for how the regex character-class probes classify a pg_wchar.
 * pg_set_regex_collation() picks one value and the switch statements in the
 * pg_wc_* functions dispatch on it.
 *
 * NOTE(review): this text is a diff rendered without +/- markers.  The hunk
 * header above shrinks 58 lines to 13, so this enum is presumably on the
 * removed side -- confirm against the commit.
 */
typedef enum
{
PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */
PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */
PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */
PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */
PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */
} PG_Locale_Strategy;
/*
 * File-level state describing the collation currently used by the regex
 * character-class probes.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so removed
 * and added declarations appear together.  Presumably pg_regex_strategy is
 * removed and dummy_c_locale is added -- confirm against the commit.
 */
/* Strategy tag read by the switch-based bodies of the pg_wc_* functions. */
static PG_Locale_Strategy pg_regex_strategy;
/* Locale object assigned by pg_set_regex_collation(); dereferenced by pg_wc_*. */
static pg_locale_t pg_regex_locale;
/*
 * Stand-in locale with both the collate and ctype "is C" flags set;
 * pg_set_regex_collation() assigns its address to its local "locale".
 */
static struct pg_locale_struct dummy_c_locale = {
.collate_is_c = true,
.ctype_is_c = true,
};
/*
* Hard-wired character properties for C locale
*/
@@ -228,7 +183,6 @@ void
pg_set_regex_collation(Oid collation)
{
pg_locale_t locale = 0;
PG_Locale_Strategy strategy;
if (!OidIsValid(collation))
{
@@ -249,8 +203,7 @@ pg_set_regex_collation(Oid collation)
* catalog access is available, so we can't call
* pg_newlocale_from_collation().
*/
strategy = PG_REGEX_STRATEGY_C;
locale = 0;
locale = &dummy_c_locale;
}
else
{
@@ -267,113 +220,41 @@ pg_set_regex_collation(Oid collation)
* C/POSIX collations use this path regardless of database
* encoding
*/
strategy = PG_REGEX_STRATEGY_C;
locale = 0;
}
else if (locale->provider == COLLPROVIDER_BUILTIN)
{
Assert(GetDatabaseEncoding() == PG_UTF8);
strategy = PG_REGEX_STRATEGY_BUILTIN;
}
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
{
strategy = PG_REGEX_STRATEGY_ICU;
}
#endif
else
{
Assert(locale->provider == COLLPROVIDER_LIBC);
if (GetDatabaseEncoding() == PG_UTF8)
strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
else
strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
locale = &dummy_c_locale;
}
}
pg_regex_strategy = strategy;
pg_regex_locale = locale;
}
/*
 * pg_wc_isdigit: test whether pg_wchar c is a digit under the regex
 * collation recorded in the file-level statics.  Returns nonzero if so,
 * 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_isdigit(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISDIGIT));
case PG_REGEX_STRATEGY_BUILTIN:
/* second argument is the negation of info.builtin.casemap_full */
return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_isdigit(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_isdigit method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISDIGIT));
else
return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale);
}
/*
 * pg_wc_isalpha: test whether pg_wchar c is alphabetic under the regex
 * collation recorded in the file-level statics.  Returns nonzero if so,
 * 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_isalpha(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALPHA));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_isalpha(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_isalpha(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_isalpha method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALPHA));
else
return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale);
}
/*
 * pg_wc_isalnum: test whether pg_wchar c is alphanumeric under the regex
 * collation recorded in the file-level statics.  Returns nonzero if so,
 * 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_isalnum(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALNUM));
case PG_REGEX_STRATEGY_BUILTIN:
/* second argument is the negation of info.builtin.casemap_full */
return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_isalnum(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_isalnum method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALNUM));
else
return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale);
}
static int
@@ -388,231 +269,87 @@ pg_wc_isword(pg_wchar c)
/*
 * pg_wc_isupper: test whether pg_wchar c is an uppercase character under
 * the regex collation recorded in the file-level statics.  Returns nonzero
 * if so, 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_isupper(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISUPPER));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_isupper(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
isupper_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_isupper(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_isupper method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISUPPER));
else
return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale);
}
/*
 * pg_wc_islower: test whether pg_wchar c is a lowercase character under
 * the regex collation recorded in the file-level statics.  Returns nonzero
 * if so, 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_islower(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISLOWER));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_islower(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
islower_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_islower(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_islower method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISLOWER));
else
return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale);
}
/*
 * pg_wc_isgraph: test whether pg_wchar c belongs to the "graph" character
 * class under the regex collation recorded in the file-level statics.
 * Returns nonzero if so, 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_isgraph(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISGRAPH));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_isgraph(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_isgraph(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_isgraph method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISGRAPH));
else
return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale);
}
/*
 * pg_wc_isprint: test whether pg_wchar c belongs to the "print" character
 * class under the regex collation recorded in the file-level statics.
 * Returns nonzero if so, 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_isprint(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPRINT));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_isprint(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
isprint_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_isprint(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_isprint method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPRINT));
else
return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale);
}
/*
 * pg_wc_ispunct: test whether pg_wchar c is punctuation under the regex
 * collation recorded in the file-level statics.  Returns nonzero if so,
 * 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_ispunct(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPUNCT));
case PG_REGEX_STRATEGY_BUILTIN:
/* second argument is the negation of info.builtin.casemap_full */
return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_ispunct(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_ispunct method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPUNCT));
else
return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale);
}
/*
 * pg_wc_isspace: test whether pg_wchar c is whitespace under the regex
 * collation recorded in the file-level statics.  Returns nonzero if so,
 * 0 otherwise.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, so two
 * bodies appear in sequence.  Presumably the switch on pg_regex_strategy is
 * the removed code and the trailing ctype_is_c test is its replacement --
 * confirm against the commit before treating this as compilable C.
 */
static int
pg_wc_isspace(pg_wchar c)
{
/* Strategy-based dispatch (first of the two bodies, see note above). */
switch (pg_regex_strategy)
{
case PG_REGEX_STRATEGY_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISSPACE));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_isspace(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
return (c <= (pg_wchar) UCHAR_MAX &&
isspace_l((unsigned char) c, pg_regex_locale->info.lt));
break;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_isspace(c);
#endif
break;
}
return 0; /* can't get here, but keep compiler quiet */
/*
 * Second body: when ctype_is_c is set, look c up in pg_char_properties
 * (codes above 127 yield 0); otherwise call the locale's wc_isspace method.
 */
if (pg_regex_locale->ctype_is_c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISSPACE));
else
return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale);
}
/*
 * pg_wc_toupper: map pg_wchar c to its uppercase form under the regex
 * collation recorded in the file-level statics.  Several paths hand c back
 * unchanged when it is outside the range they handle.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, and here
 * the two versions are interleaved inside a single pair of braces (a
 * "switch" line directly followed by an "if" line, and an "else" after
 * "return 0;").  Presumably the switch/case lines are the removed code and
 * the ctype_is_c / method-table lines are the replacement -- confirm against
 * the commit; as shown this is not compilable C.
 */
static pg_wchar
pg_wc_toupper(pg_wchar c)
{
/* The next two lines are the old and new openers of the same block. */
switch (pg_regex_strategy)
if (pg_regex_locale->ctype_is_c)
{
case PG_REGEX_STRATEGY_C:
/* ASCII-only mapping; anything above 127 is returned as is */
if (c <= (pg_wchar) 127)
return pg_ascii_toupper((unsigned char) c);
return c;
case PG_REGEX_STRATEGY_BUILTIN:
return unicode_uppercase_simple(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* force C behavior for ASCII characters, per comments above */
if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
return pg_ascii_toupper((unsigned char) c);
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return towupper_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
/* force C behavior for ASCII characters, per comments above */
if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
return pg_ascii_toupper((unsigned char) c);
if (c <= (pg_wchar) UCHAR_MAX)
return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
return c;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_toupper(c);
#endif
break;
/* Body of the ctype_is_c branch: ASCII-only mapping, else c unchanged. */
if (c <= (pg_wchar) 127)
return pg_ascii_toupper((unsigned char) c);
return c;
}
return 0; /* can't get here, but keep compiler quiet */
/* Non-C ctype: defer to the locale's wc_toupper method. */
else
return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
}
/*
 * pg_wc_tolower: map pg_wchar c to its lowercase form under the regex
 * collation recorded in the file-level statics.  Several paths hand c back
 * unchanged when it is outside the range they handle.
 *
 * NOTE(review): this text is a diff rendered without +/- markers, and here
 * the two versions are interleaved inside a single pair of braces (a
 * "switch" line directly followed by an "if" line, and an "else" after
 * "return 0;").  Presumably the switch/case lines are the removed code and
 * the ctype_is_c / method-table lines are the replacement -- confirm against
 * the commit; as shown this is not compilable C.
 */
static pg_wchar
pg_wc_tolower(pg_wchar c)
{
/* The next two lines are the old and new openers of the same block. */
switch (pg_regex_strategy)
if (pg_regex_locale->ctype_is_c)
{
case PG_REGEX_STRATEGY_C:
/* ASCII-only mapping; anything above 127 is returned as is */
if (c <= (pg_wchar) 127)
return pg_ascii_tolower((unsigned char) c);
return c;
case PG_REGEX_STRATEGY_BUILTIN:
return unicode_lowercase_simple(c);
case PG_REGEX_STRATEGY_LIBC_WIDE:
/* force C behavior for ASCII characters, per comments above */
if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
return pg_ascii_tolower((unsigned char) c);
/* wide call only if wchar_t has >= 4 bytes or c fits in 16 bits; else continue into the 1-byte case */
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return towlower_l((wint_t) c, pg_regex_locale->info.lt);
/* FALL THRU */
case PG_REGEX_STRATEGY_LIBC_1BYTE:
/* force C behavior for ASCII characters, per comments above */
if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
return pg_ascii_tolower((unsigned char) c);
if (c <= (pg_wchar) UCHAR_MAX)
return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
return c;
case PG_REGEX_STRATEGY_ICU:
#ifdef USE_ICU
return u_tolower(c);
#endif
break;
/* Body of the ctype_is_c branch: ASCII-only mapping, else c unchanged. */
if (c <= (pg_wchar) 127)
return pg_ascii_tolower((unsigned char) c);
return c;
}
return 0; /* can't get here, but keep compiler quiet */
/* Non-C ctype: defer to the locale's wc_tolower method. */
else
return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
}
@@ -738,37 +475,25 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
* would always be true for production values of MAX_SIMPLE_CHR, but it's
* useful to allow it to be small for testing purposes.)
*/
switch (pg_regex_strategy)
if (pg_regex_locale->ctype_is_c)
{
case PG_REGEX_STRATEGY_C:
#if MAX_SIMPLE_CHR >= 127
max_chr = (pg_wchar) 127;
pcc->cv.cclasscode = -1;
max_chr = (pg_wchar) 127;
pcc->cv.cclasscode = -1;
#else
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
#endif
break;
case PG_REGEX_STRATEGY_BUILTIN:
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
break;
case PG_REGEX_STRATEGY_LIBC_WIDE:
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
break;
case PG_REGEX_STRATEGY_LIBC_1BYTE:
#if MAX_SIMPLE_CHR >= UCHAR_MAX
max_chr = (pg_wchar) UCHAR_MAX;
}
else
{
if (pg_regex_locale->ctype->max_chr != 0 &&
pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
{
max_chr = pg_regex_locale->ctype->max_chr;
pcc->cv.cclasscode = -1;
#else
}
else
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
#endif
break;
case PG_REGEX_STRATEGY_ICU:
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
break;
default:
Assert(false);
max_chr = 0; /* can't get here, but keep compiler quiet */
break;
}
/*