mirror of
https://github.com/postgres/postgres.git
synced 2025-06-29 10:41:53 +03:00
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
This commit is contained in:
@ -78,7 +78,7 @@ size_t
|
|||||||
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||||
pg_locale_t locale)
|
pg_locale_t locale)
|
||||||
{
|
{
|
||||||
return unicode_strlower(dest, destsize, src, srclen);
|
return unicode_strlower(dest, destsize, src, srclen, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
size_t
|
||||||
@ -93,7 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
|||||||
.prev_alnum = false,
|
.prev_alnum = false,
|
||||||
};
|
};
|
||||||
|
|
||||||
return unicode_strtitle(dest, destsize, src, srclen,
|
return unicode_strtitle(dest, destsize, src, srclen, false,
|
||||||
initcap_wbnext, &wbstate);
|
initcap_wbnext, &wbstate);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -101,7 +101,7 @@ size_t
|
|||||||
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||||
pg_locale_t locale)
|
pg_locale_t locale)
|
||||||
{
|
{
|
||||||
return unicode_strupper(dest, destsize, src, srclen);
|
return unicode_strupper(dest, destsize, src, srclen, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
pg_locale_t
|
pg_locale_t
|
||||||
|
@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
|
|||||||
# These files are part of the Unicode Character Database. Download
|
# These files are part of the Unicode Character Database. Download
|
||||||
# them on demand. The dependency on Makefile.global is for
|
# them on demand. The dependency on Makefile.global is for
|
||||||
# UNICODE_VERSION.
|
# UNICODE_VERSION.
|
||||||
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
|
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
|
||||||
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
||||||
|
|
||||||
unicode_version.h: generate-unicode_version.pl
|
unicode_version.h: generate-unicode_version.pl
|
||||||
@ -91,4 +91,4 @@ clean:
|
|||||||
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
|
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
|
||||||
|
|
||||||
distclean: clean
|
distclean: clean
|
||||||
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
|
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
|
||||||
|
@ -18,12 +18,61 @@
|
|||||||
#include <wctype.h>
|
#include <wctype.h>
|
||||||
|
|
||||||
#ifdef USE_ICU
|
#ifdef USE_ICU
|
||||||
|
#include <unicode/ucasemap.h>
|
||||||
#include <unicode/uchar.h>
|
#include <unicode/uchar.h>
|
||||||
#endif
|
#endif
|
||||||
#include "common/unicode_case.h"
|
#include "common/unicode_case.h"
|
||||||
#include "common/unicode_category.h"
|
#include "common/unicode_category.h"
|
||||||
#include "common/unicode_version.h"
|
#include "common/unicode_version.h"
|
||||||
|
|
||||||
|
/* enough to hold largest source or result string, including NUL */
|
||||||
|
#define BUFSZ 256
|
||||||
|
|
||||||
|
#ifdef USE_ICU
|
||||||
|
static UCaseMap * casemap = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
|
||||||
|
ssize_t srclen);
|
||||||
|
|
||||||
|
/* simple boundary iterator copied from pg_locale_builtin.c */
|
||||||
|
struct WordBoundaryState
|
||||||
|
{
|
||||||
|
const char *str;
|
||||||
|
size_t len;
|
||||||
|
size_t offset;
|
||||||
|
bool init;
|
||||||
|
bool prev_alnum;
|
||||||
|
};
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
initcap_wbnext(void *state)
|
||||||
|
{
|
||||||
|
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
|
||||||
|
|
||||||
|
while (wbstate->offset < wbstate->len &&
|
||||||
|
wbstate->str[wbstate->offset] != '\0')
|
||||||
|
{
|
||||||
|
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
|
||||||
|
wbstate->offset);
|
||||||
|
bool curr_alnum = pg_u_isalnum(u, true);
|
||||||
|
|
||||||
|
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
|
||||||
|
{
|
||||||
|
size_t prev_offset = wbstate->offset;
|
||||||
|
|
||||||
|
wbstate->init = true;
|
||||||
|
wbstate->offset += unicode_utf8len(u);
|
||||||
|
wbstate->prev_alnum = curr_alnum;
|
||||||
|
return prev_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
wbstate->offset += unicode_utf8len(u);
|
||||||
|
}
|
||||||
|
|
||||||
|
return wbstate->len;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef USE_ICU
|
#ifdef USE_ICU
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
icu_test_full(char *str)
|
||||||
|
{
|
||||||
|
char lower[BUFSZ];
|
||||||
|
char title[BUFSZ];
|
||||||
|
char upper[BUFSZ];
|
||||||
|
char icu_lower[BUFSZ];
|
||||||
|
char icu_title[BUFSZ];
|
||||||
|
char icu_upper[BUFSZ];
|
||||||
|
UErrorCode status;
|
||||||
|
struct WordBoundaryState wbstate = {
|
||||||
|
.str = str,
|
||||||
|
.len = strlen(str),
|
||||||
|
.offset = 0,
|
||||||
|
.init = false,
|
||||||
|
.prev_alnum = false,
|
||||||
|
};
|
||||||
|
|
||||||
|
unicode_strlower(lower, BUFSZ, str, -1, true);
|
||||||
|
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
|
||||||
|
unicode_strupper(upper, BUFSZ, str, -1, true);
|
||||||
|
status = U_ZERO_ERROR;
|
||||||
|
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
|
||||||
|
status = U_ZERO_ERROR;
|
||||||
|
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
|
||||||
|
status = U_ZERO_ERROR;
|
||||||
|
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
|
||||||
|
|
||||||
|
if (strcmp(lower, icu_lower) != 0)
|
||||||
|
{
|
||||||
|
printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
|
||||||
|
icu_lower);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (strcmp(title, icu_title) != 0)
|
||||||
|
{
|
||||||
|
printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
|
||||||
|
icu_title);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (strcmp(upper, icu_upper) != 0)
|
||||||
|
{
|
||||||
|
printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
|
||||||
|
icu_upper);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Exhaustively compare case mappings with the results from ICU.
|
* Exhaustively compare case mappings with the results from ICU.
|
||||||
*/
|
*/
|
||||||
@ -64,6 +161,7 @@ test_icu(void)
|
|||||||
if (category != PG_U_UNASSIGNED)
|
if (category != PG_U_UNASSIGNED)
|
||||||
{
|
{
|
||||||
uint8_t icu_category = u_charType(code);
|
uint8_t icu_category = u_charType(code);
|
||||||
|
char code_str[5] = {0};
|
||||||
|
|
||||||
if (icu_category == PG_U_UNASSIGNED)
|
if (icu_category == PG_U_UNASSIGNED)
|
||||||
{
|
{
|
||||||
@ -72,6 +170,9 @@ test_icu(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
icu_test_simple(code);
|
icu_test_simple(code);
|
||||||
|
unicode_to_utf8(code, (unsigned char *) code_str);
|
||||||
|
icu_test_full(code_str);
|
||||||
|
|
||||||
successful++;
|
successful++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -86,7 +187,7 @@ test_icu(void)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_strlower(const char *test_string, const char *expected)
|
test_convert(TestFunc tfunc, const char *test_string, const char *expected)
|
||||||
{
|
{
|
||||||
size_t src1len = strlen(test_string);
|
size_t src1len = strlen(test_string);
|
||||||
size_t src2len = -1; /* NUL-terminated */
|
size_t src2len = -1; /* NUL-terminated */
|
||||||
@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
|
|||||||
|
|
||||||
/* neither source nor destination are NUL-terminated */
|
/* neither source nor destination are NUL-terminated */
|
||||||
memset(dst1, 0x7F, dst1len);
|
memset(dst1, 0x7F, dst1len);
|
||||||
needed = unicode_strlower(dst1, dst1len, src1, src1len);
|
needed = tfunc(dst1, dst1len, src1, src1len);
|
||||||
if (needed != strlen(expected))
|
if (needed != strlen(expected))
|
||||||
{
|
{
|
||||||
printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
|
printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
|
||||||
|
test_string, needed, strlen(expected));
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (memcmp(dst1, expected, dst1len) != 0)
|
if (memcmp(dst1, expected, dst1len) != 0)
|
||||||
@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
|
|||||||
|
|
||||||
/* destination is NUL-terminated and source is not */
|
/* destination is NUL-terminated and source is not */
|
||||||
memset(dst2, 0x7F, dst2len);
|
memset(dst2, 0x7F, dst2len);
|
||||||
needed = unicode_strlower(dst2, dst2len, src1, src1len);
|
needed = tfunc(dst2, dst2len, src1, src1len);
|
||||||
if (needed != strlen(expected))
|
if (needed != strlen(expected))
|
||||||
{
|
{
|
||||||
printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
|
printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
|
||||||
|
test_string, needed, strlen(expected));
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (strcmp(dst2, expected) != 0)
|
if (strcmp(dst2, expected) != 0)
|
||||||
@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
|
|||||||
|
|
||||||
/* source is NUL-terminated and destination is not */
|
/* source is NUL-terminated and destination is not */
|
||||||
memset(dst1, 0x7F, dst1len);
|
memset(dst1, 0x7F, dst1len);
|
||||||
needed = unicode_strlower(dst1, dst1len, src2, src2len);
|
needed = tfunc(dst1, dst1len, src2, src2len);
|
||||||
if (needed != strlen(expected))
|
if (needed != strlen(expected))
|
||||||
{
|
{
|
||||||
|
printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
|
||||||
|
test_string, needed, strlen(expected));
|
||||||
printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
|
printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
|
|||||||
|
|
||||||
/* both source and destination are NUL-terminated */
|
/* both source and destination are NUL-terminated */
|
||||||
memset(dst2, 0x7F, dst2len);
|
memset(dst2, 0x7F, dst2len);
|
||||||
needed = unicode_strlower(dst2, dst2len, src2, src2len);
|
needed = tfunc(dst2, dst2len, src2, src2len);
|
||||||
if (needed != strlen(expected))
|
if (needed != strlen(expected))
|
||||||
{
|
{
|
||||||
printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
|
printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
|
||||||
|
test_string, needed, strlen(expected));
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (strcmp(dst2, expected) != 0)
|
if (strcmp(dst2, expected) != 0)
|
||||||
@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected)
|
|||||||
free(dst2);
|
free(dst2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * TestFunc adapter: lowercase 'src' into 'dst' with full case mapping
 * enabled, returning whatever unicode_strlower() returns.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	full = true;
	size_t		needed;

	needed = unicode_strlower(dst, dstsize, src, srclen, full);

	return needed;
}
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
tfunc_title(char *dst, size_t dstsize, const char *src,
|
||||||
|
ssize_t srclen)
|
||||||
|
{
|
||||||
|
struct WordBoundaryState wbstate = {
|
||||||
|
.str = src,
|
||||||
|
.len = srclen,
|
||||||
|
.offset = 0,
|
||||||
|
.init = false,
|
||||||
|
.prev_alnum = false,
|
||||||
|
};
|
||||||
|
|
||||||
|
return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
|
||||||
|
&wbstate);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * TestFunc adapter: uppercase 'src' into 'dst' with full case mapping
 * enabled, returning whatever unicode_strupper() returns.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	full = true;
	size_t		needed;

	needed = unicode_strupper(dst, dstsize, src, srclen, full);

	return needed;
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_convert_case()
|
test_convert_case()
|
||||||
{
|
{
|
||||||
/* test string with no case changes */
|
/* test string with no case changes */
|
||||||
test_strlower("√∞", "√∞");
|
test_convert(tfunc_lower, "√∞", "√∞");
|
||||||
|
/* test adjust-to-cased behavior */
|
||||||
|
test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
|
||||||
/* test string with case changes */
|
/* test string with case changes */
|
||||||
test_strlower("ABC", "abc");
|
test_convert(tfunc_upper, "abc", "ABC");
|
||||||
/* test string with case changes and byte length changes */
|
/* test string with case changes and byte length changes */
|
||||||
test_strlower("ȺȺȺ", "ⱥⱥⱥ");
|
test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
|
||||||
|
/* test special case conversions */
|
||||||
|
test_convert(tfunc_upper, "ß", "SS");
|
||||||
|
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
|
||||||
|
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
|
||||||
|
/* test final sigma */
|
||||||
|
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
|
||||||
|
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
|
||||||
|
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
|
||||||
|
|
||||||
|
#ifdef USE_ICU
|
||||||
|
icu_test_full("");
|
||||||
|
icu_test_full("ȺȺȺ");
|
||||||
|
icu_test_full("ßßß");
|
||||||
|
icu_test_full("√∞");
|
||||||
|
icu_test_full("a b");
|
||||||
|
icu_test_full("abc 123xyz");
|
||||||
|
icu_test_full("σςΣ ΣΣΣ");
|
||||||
|
icu_test_full("ıiIİ");
|
||||||
|
/* test <alpha><iota_subscript><acute> */
|
||||||
|
icu_test_full("\u0391\u0345\u0301");
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("case_test: convert_case: success\n");
|
printf("case_test: convert_case: success\n");
|
||||||
}
|
}
|
||||||
@ -182,6 +342,22 @@ test_convert_case()
|
|||||||
int
|
int
|
||||||
main(int argc, char **argv)
|
main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
|
#ifdef USE_ICU
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Disable ICU's word break adjustment for titlecase to match the expected
|
||||||
|
* behavior of unicode_strtitle().
|
||||||
|
*/
|
||||||
|
casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
|
||||||
|
if (U_FAILURE(status))
|
||||||
|
{
|
||||||
|
printf("case_test: failure opening UCaseMap: %s\n",
|
||||||
|
u_errorName(status));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
|
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
|
||||||
#ifdef USE_ICU
|
#ifdef USE_ICU
|
||||||
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
|
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
|
||||||
@ -191,5 +367,9 @@ main(int argc, char **argv)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
test_convert_case();
|
test_convert_case();
|
||||||
|
|
||||||
|
#ifdef USE_ICU
|
||||||
|
ucasemap_close(casemap);
|
||||||
|
#endif
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
# Generate Unicode character case mappings. Does not include tailoring
|
# Generate Unicode character case mappings. Does not include tailoring
|
||||||
# or locale-specific mappings.
|
# or locale-specific mappings.
|
||||||
#
|
#
|
||||||
# Input: UnicodeData.txt
|
# Input: SpecialCasing.txt UnicodeData.txt
|
||||||
# Output: unicode_case_table.h
|
# Output: unicode_case_table.h
|
||||||
#
|
#
|
||||||
# Copyright (c) 2000-2025, PostgreSQL Global Development Group
|
# Copyright (c) 2000-2025, PostgreSQL Global Development Group
|
||||||
@ -21,6 +21,10 @@ GetOptions('outdir:s' => \$output_path);
|
|||||||
|
|
||||||
my $output_table_file = "$output_path/unicode_case_table.h";
|
my $output_table_file = "$output_path/unicode_case_table.h";
|
||||||
|
|
||||||
|
# The maximum number of codepoints that can result from case mapping
|
||||||
|
# of a single character. See Unicode section 5.18 "Case Mappings".
|
||||||
|
my $MAX_CASE_EXPANSION = 3;
|
||||||
|
|
||||||
my $FH;
|
my $FH;
|
||||||
|
|
||||||
my %simple = ();
|
my %simple = ();
|
||||||
@ -51,6 +55,98 @@ while (my $line = <$FH>)
|
|||||||
}
|
}
|
||||||
close $FH;
|
close $FH;
|
||||||
|
|
||||||
|
# Map for special casing rules that aren't represented in the simple
|
||||||
|
# mapping. Language-sensitive mappings are not supported.
|
||||||
|
#
|
||||||
|
# See https://www.unicode.org/reports/tr44/#SpecialCasing.txt, or the
|
||||||
|
# SpecialCasing.txt file itself for details.
|
||||||
|
|
||||||
|
# for now, only Final_Sigma is supported
|
||||||
|
my %condition_map = (Final_Sigma => 'PG_U_FINAL_SIGMA');
|
||||||
|
|
||||||
|
my %special = ();
|
||||||
|
open($FH, '<', "$output_path/SpecialCasing.txt")
|
||||||
|
or die "Could not open $output_path/SpecialCasing.txt: $!.";
|
||||||
|
while (my $line = <$FH>)
|
||||||
|
{
|
||||||
|
# language-sensitive mappings not supported
|
||||||
|
last if $line =~ /\# Language-Sensitive Mappings/;
|
||||||
|
|
||||||
|
# remove comments
|
||||||
|
$line =~ s/^(.*?)#.*$/$1/s;
|
||||||
|
|
||||||
|
# ignore empty lines
|
||||||
|
next unless $line =~ /;/;
|
||||||
|
|
||||||
|
my @elts = split /;/, $line;
|
||||||
|
my $code = hex($elts[0]);
|
||||||
|
|
||||||
|
# Codepoint may map to multiple characters when converting
|
||||||
|
# case. Split each mapping on whitespace and extract the
|
||||||
|
# hexadecimal into an array of codepoints.
|
||||||
|
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
|
||||||
|
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
|
||||||
|
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
|
||||||
|
my @conditions = map {
|
||||||
|
# supporting negated conditions may require storing a
|
||||||
|
# mask of relevant conditions for a given rule to differentiate
|
||||||
|
# between lack of a condition and a negated condition
|
||||||
|
die "negated conditions not supported" if /^Not_/;
|
||||||
|
$condition_map{$_} || die "unrecognized condition: $_"
|
||||||
|
} (grep /\w+/, (split /\s+/, $elts[4]));
|
||||||
|
|
||||||
|
my $cond_str = (join '|', @conditions) || '0';
|
||||||
|
|
||||||
|
# if empty, create a self-mapping
|
||||||
|
push @lower, $code if (scalar @lower == 0);
|
||||||
|
push @title, $code if (scalar @title == 0);
|
||||||
|
push @upper, $code if (scalar @upper == 0);
|
||||||
|
|
||||||
|
# none should map to more than 3 codepoints
|
||||||
|
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
|
||||||
|
if (scalar @lower) > $MAX_CASE_EXPANSION;
|
||||||
|
die "titlecase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'"
|
||||||
|
if (scalar @title) > $MAX_CASE_EXPANSION;
|
||||||
|
die "uppercase expansion for 0x$elts[0] exceeds maximum: '$elts[3]'"
|
||||||
|
if (scalar @upper) > $MAX_CASE_EXPANSION;
|
||||||
|
|
||||||
|
# pad arrays to a fixed length of 3
|
||||||
|
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
|
||||||
|
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
|
||||||
|
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
|
||||||
|
|
||||||
|
# Characters with special mappings may not have simple mappings;
|
||||||
|
# ensure that an entry exists.
|
||||||
|
$simple{$code} ||= {
|
||||||
|
Simple_Lowercase => $code,
|
||||||
|
Simple_Titlecase => $code,
|
||||||
|
Simple_Uppercase => $code
|
||||||
|
};
|
||||||
|
|
||||||
|
# Multiple special case rules for a single codepoint could be
|
||||||
|
# supported by making several entries for each codepoint, and have
|
||||||
|
# the simple mapping point to the first entry. The caller could
|
||||||
|
# scan forward looking for an entry that matches the conditions,
|
||||||
|
# or fall back to the normal behavior.
|
||||||
|
die "multiple special case mappings not supported"
|
||||||
|
if defined $special{$code};
|
||||||
|
|
||||||
|
$special{$code} = {
|
||||||
|
Lowercase => \@lower,
|
||||||
|
Titlecase => \@title,
|
||||||
|
Uppercase => \@upper,
|
||||||
|
Conditions => $cond_str
|
||||||
|
};
|
||||||
|
}
|
||||||
|
close $FH;
|
||||||
|
|
||||||
|
# assign sequential array indexes to the special mappings
|
||||||
|
my $special_idx = 0;
|
||||||
|
foreach my $code (sort { $a <=> $b } (keys %special))
|
||||||
|
{
|
||||||
|
$special{$code}{Index} = $special_idx++;
|
||||||
|
}
|
||||||
|
|
||||||
# Start writing out the output files
|
# Start writing out the output files
|
||||||
open my $OT, '>', $output_table_file
|
open my $OT, '>', $output_table_file
|
||||||
or die "Could not open output file $output_table_file: $!\n";
|
or die "Could not open output file $output_table_file: $!\n";
|
||||||
@ -63,6 +159,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
|
|||||||
$num_simple++ unless $code < 0x80;
|
$num_simple++ unless $code < 0x80;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
my $num_special = scalar(keys %special) + 1;
|
||||||
|
|
||||||
print $OT <<"EOS";
|
print $OT <<"EOS";
|
||||||
/*-------------------------------------------------------------------------
|
/*-------------------------------------------------------------------------
|
||||||
*
|
*
|
||||||
@ -86,6 +184,19 @@ print $OT <<"EOS";
|
|||||||
#include "common/unicode_case.h"
|
#include "common/unicode_case.h"
|
||||||
#include "mb/pg_wchar.h"
|
#include "mb/pg_wchar.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The maximum number of codepoints that can result from case mapping
|
||||||
|
* of a single character. See Unicode section 5.18 "Case Mappings".
|
||||||
|
*/
|
||||||
|
#define MAX_CASE_EXPANSION 3
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Case mapping condition flags. For now, only Final_Sigma is supported.
|
||||||
|
*
|
||||||
|
* See Unicode Context Specification for Casing.
|
||||||
|
*/
|
||||||
|
#define PG_U_FINAL_SIGMA (1 << 0)
|
||||||
|
|
||||||
typedef enum
|
typedef enum
|
||||||
{
|
{
|
||||||
CaseLower = 0,
|
CaseLower = 0,
|
||||||
@ -94,12 +205,47 @@ typedef enum
|
|||||||
NCaseKind
|
NCaseKind
|
||||||
} CaseKind;
|
} CaseKind;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
pg_wchar codepoint; /* Unicode codepoint */
|
||||||
|
int16 conditions;
|
||||||
|
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
|
||||||
|
} pg_special_case;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
pg_wchar codepoint; /* Unicode codepoint */
|
pg_wchar codepoint; /* Unicode codepoint */
|
||||||
pg_wchar simplemap[NCaseKind];
|
pg_wchar simplemap[NCaseKind];
|
||||||
|
const pg_special_case *special_case;
|
||||||
} pg_case_map;
|
} pg_case_map;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Special case mappings that aren't representable in the simple map.
|
||||||
|
* Entries are referenced from simple_case_map.
|
||||||
|
*/
|
||||||
|
static const pg_special_case special_case[$num_special] =
|
||||||
|
{
|
||||||
|
EOS
|
||||||
|
|
||||||
|
foreach my $code (sort { $a <=> $b } (keys %special))
|
||||||
|
{
|
||||||
|
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
|
||||||
|
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
|
||||||
|
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
|
||||||
|
my $lower = join ", ",
|
||||||
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
|
||||||
|
my $title = join ", ",
|
||||||
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
|
||||||
|
my $upper = join ", ",
|
||||||
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
|
||||||
|
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
|
||||||
|
printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
|
||||||
|
}
|
||||||
|
|
||||||
|
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
|
||||||
|
print $OT <<"EOS";
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
|
* Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
|
||||||
* sparse for higher codepoints (requiring scan or binary search).
|
* sparse for higher codepoints (requiring scan or binary search).
|
||||||
@ -114,8 +260,10 @@ for (my $code = 0; $code < 0x80; $code++)
|
|||||||
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
|
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
|
||||||
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
|
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
|
||||||
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
|
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
|
||||||
|
die "unexpected special case for code $code"
|
||||||
|
if defined $special{$code};
|
||||||
printf $OT
|
printf $OT
|
||||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
|
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
|
||||||
$code, $lc, $tc, $uc;
|
$code, $lc, $tc, $uc;
|
||||||
}
|
}
|
||||||
printf $OT "\n";
|
printf $OT "\n";
|
||||||
@ -126,9 +274,14 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
|
|||||||
next unless $code >= 0x80; # already output above
|
next unless $code >= 0x80; # already output above
|
||||||
|
|
||||||
my $map = $simple{$code};
|
my $map = $simple{$code};
|
||||||
|
my $special_case = "NULL";
|
||||||
|
if (exists $special{$code})
|
||||||
|
{
|
||||||
|
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
|
||||||
|
}
|
||||||
printf $OT
|
printf $OT
|
||||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
|
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
|
||||||
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
|
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
|
||||||
$map->{Simple_Uppercase};
|
$map->{Simple_Uppercase}, $special_case;
|
||||||
}
|
}
|
||||||
print $OT "};\n";
|
print $OT "};\n";
|
||||||
|
@ -11,7 +11,7 @@ endif
|
|||||||
|
|
||||||
# These files are part of the Unicode Character Database. Download them on
|
# These files are part of the Unicode Character Database. Download them on
|
||||||
# demand.
|
# demand.
|
||||||
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'UnicodeData.txt']
|
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
|
||||||
url = unicode_baseurl.format(UNICODE_VERSION, f)
|
url = unicode_baseurl.format(UNICODE_VERSION, f)
|
||||||
target = custom_target(f,
|
target = custom_target(f,
|
||||||
output: f,
|
output: f,
|
||||||
@ -26,7 +26,7 @@ update_unicode_targets = []
|
|||||||
|
|
||||||
update_unicode_targets += \
|
update_unicode_targets += \
|
||||||
custom_target('unicode_case_table.h',
|
custom_target('unicode_case_table.h',
|
||||||
input: [unicode_data['UnicodeData.txt']],
|
input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
|
||||||
output: ['unicode_case_table.h'],
|
output: ['unicode_case_table.h'],
|
||||||
command: [
|
command: [
|
||||||
perl, files('generate-unicode_case_table.pl'),
|
perl, files('generate-unicode_case_table.pl'),
|
||||||
|
@ -17,12 +17,15 @@
|
|||||||
|
|
||||||
#include "common/unicode_case.h"
|
#include "common/unicode_case.h"
|
||||||
#include "common/unicode_case_table.h"
|
#include "common/unicode_case_table.h"
|
||||||
|
#include "common/unicode_category.h"
|
||||||
#include "mb/pg_wchar.h"
|
#include "mb/pg_wchar.h"
|
||||||
|
|
||||||
static const pg_case_map *find_case_map(pg_wchar ucs);
|
static const pg_case_map *find_case_map(pg_wchar ucs);
|
||||||
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
CaseKind str_casekind, WordBoundaryNext wbnext,
|
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
|
||||||
void *wbstate);
|
void *wbstate);
|
||||||
|
static bool check_special_conditions(int conditions, const char *str,
|
||||||
|
size_t len, size_t offset);
|
||||||
|
|
||||||
pg_wchar
|
pg_wchar
|
||||||
unicode_lowercase_simple(pg_wchar code)
|
unicode_lowercase_simple(pg_wchar code)
|
||||||
@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
|
|||||||
*
|
*
|
||||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||||
* required buffer size before allocating.
|
* required buffer size before allocating.
|
||||||
|
*
|
||||||
|
* If full is true, use special case mappings if available and if the
|
||||||
|
* conditions are satisfied.
|
||||||
*/
|
*/
|
||||||
size_t
|
size_t
|
||||||
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
|
bool full)
|
||||||
{
|
{
|
||||||
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
|
return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
|
||||||
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
|||||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||||
* required buffer size before allocating.
|
* required buffer size before allocating.
|
||||||
*
|
*
|
||||||
|
* If full is true, use special case mappings if available and if the
|
||||||
|
* conditions are satisfied. Otherwise, use only simple mappings and use
|
||||||
|
* uppercase instead of titlecase.
|
||||||
|
*
|
||||||
* Titlecasing requires knowledge about word boundaries, which is provided by
|
* Titlecasing requires knowledge about word boundaries, which is provided by
|
||||||
* the callback wbnext. A word boundary is the offset of the start of a word
|
* the callback wbnext. A word boundary is the offset of the start of a word
|
||||||
* or the offset of the character immediately following a word.
|
* or the offset of the character immediately following a word.
|
||||||
@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
|||||||
*/
|
*/
|
||||||
size_t
|
size_t
|
||||||
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
WordBoundaryNext wbnext, void *wbstate)
|
bool full, WordBoundaryNext wbnext, void *wbstate)
|
||||||
{
|
{
|
||||||
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
|
return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
|
||||||
wbstate);
|
wbstate);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
|||||||
*
|
*
|
||||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||||
* required buffer size before allocating.
|
* required buffer size before allocating.
|
||||||
|
*
|
||||||
|
* If full is true, use special case mappings if available and if the
|
||||||
|
* conditions are satisfied.
|
||||||
*/
|
*/
|
||||||
size_t
|
size_t
|
||||||
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
|
bool full)
|
||||||
{
|
{
|
||||||
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
|
return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
|
||||||
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* Implement Unicode Default Case Conversion algorithm.
|
||||||
|
*
|
||||||
* If str_casekind is CaseLower or CaseUpper, map each character in the string
|
* If str_casekind is CaseLower or CaseUpper, map each character in the string
|
||||||
* for which a mapping is available.
|
* for which a mapping is available.
|
||||||
*
|
*
|
||||||
* If str_casekind is CaseTitle, maps characters found on a word boundary to
|
* If str_casekind is CaseTitle, maps characters found on a word boundary to
|
||||||
* uppercase and other characters to lowercase.
|
* titlecase (or uppercase if full is false) and other characters to
|
||||||
|
* lowercase. NB: does not currently implement the Unicode behavior in which
|
||||||
|
* the word boundary is adjusted to the next Cased character. That behavior
|
||||||
|
* could be implemented as an option, but it doesn't match the default
|
||||||
|
* behavior of ICU, nor does it match the documented behavior of INITCAP().
|
||||||
|
*
|
||||||
|
* If full is true, use special mappings for relevant characters, which can
|
||||||
|
* map a single codepoint to multiple codepoints, or depend on conditions.
|
||||||
*/
|
*/
|
||||||
static size_t
|
static size_t
|
||||||
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
|
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
|
||||||
|
void *wbstate)
|
||||||
{
|
{
|
||||||
/* character CaseKind varies while titlecasing */
|
/* character CaseKind varies while titlecasing */
|
||||||
CaseKind chr_casekind = str_casekind;
|
CaseKind chr_casekind = str_casekind;
|
||||||
@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
|||||||
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
|
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
|
||||||
int u1len = unicode_utf8len(u1);
|
int u1len = unicode_utf8len(u1);
|
||||||
const pg_case_map *casemap = find_case_map(u1);
|
const pg_case_map *casemap = find_case_map(u1);
|
||||||
|
const pg_special_case *special = NULL;
|
||||||
|
|
||||||
if (str_casekind == CaseTitle)
|
if (str_casekind == CaseTitle)
|
||||||
{
|
{
|
||||||
if (srcoff == boundary)
|
if (srcoff == boundary)
|
||||||
{
|
{
|
||||||
chr_casekind = CaseUpper;
|
chr_casekind = full ? CaseTitle : CaseUpper;
|
||||||
boundary = wbnext(wbstate);
|
boundary = wbnext(wbstate);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
chr_casekind = CaseLower;
|
chr_casekind = CaseLower;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find special case that matches the conditions, if any.
|
||||||
|
*
|
||||||
|
* Note: only a single special mapping per codepoint is currently
|
||||||
|
* supported, though Unicode allows for multiple special mappings for
|
||||||
|
* a single codepoint.
|
||||||
|
*/
|
||||||
|
if (full && casemap && casemap->special_case)
|
||||||
|
{
|
||||||
|
int16 conditions = casemap->special_case->conditions;
|
||||||
|
|
||||||
|
Assert(casemap->special_case->codepoint == u1);
|
||||||
|
if (check_special_conditions(conditions, src, srclen, srcoff))
|
||||||
|
special = casemap->special_case;
|
||||||
|
}
|
||||||
|
|
||||||
/* perform mapping, update result_len, and write to dst */
|
/* perform mapping, update result_len, and write to dst */
|
||||||
if (casemap)
|
if (special)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < MAX_CASE_EXPANSION; i++)
|
||||||
|
{
|
||||||
|
pg_wchar u2 = special->map[chr_casekind][i];
|
||||||
|
size_t u2len = unicode_utf8len(u2);
|
||||||
|
|
||||||
|
if (u2 == '\0')
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (result_len + u2len <= dstsize)
|
||||||
|
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
|
||||||
|
|
||||||
|
result_len += u2len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (casemap)
|
||||||
{
|
{
|
||||||
pg_wchar u2 = casemap->simplemap[chr_casekind];
|
pg_wchar u2 = casemap->simplemap[chr_casekind];
|
||||||
pg_wchar u2len = unicode_utf8len(u2);
|
pg_wchar u2len = unicode_utf8len(u2);
|
||||||
@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
|||||||
return result_len;
|
return result_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check that the condition matches Final_Sigma, described in Unicode Table
|
||||||
|
* 3-17. The character at the given offset must be directly preceded by a
|
||||||
|
* Cased character, and must not be directly followed by a Cased character.
|
||||||
|
*
|
||||||
|
* Case_Ignorable characters are ignored. NB: some characters may be both
|
||||||
|
* Cased and Case_Ignorable, in which case they are ignored.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
|
||||||
|
{
|
||||||
|
/* the start of the string is not preceded by a Cased character */
|
||||||
|
if (offset == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* iterate backwards, looking for Cased character */
|
||||||
|
for (int i = offset - 1; i >= 0; i--)
|
||||||
|
{
|
||||||
|
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
|
||||||
|
{
|
||||||
|
pg_wchar curr = utf8_to_unicode(str + i);
|
||||||
|
|
||||||
|
if (pg_u_prop_case_ignorable(curr))
|
||||||
|
continue;
|
||||||
|
else if (pg_u_prop_cased(curr))
|
||||||
|
break;
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else if ((str[i] & 0xC0) == 0x80)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
Assert(false); /* invalid UTF-8 */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* end of string is not followed by a Cased character */
|
||||||
|
if (offset == len)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
/* iterate forwards, looking for Cased character */
|
||||||
|
for (int i = offset + 1; i < len && str[i] != '\0'; i++)
|
||||||
|
{
|
||||||
|
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
|
||||||
|
{
|
||||||
|
pg_wchar curr = utf8_to_unicode(str + i);
|
||||||
|
|
||||||
|
if (pg_u_prop_case_ignorable(curr))
|
||||||
|
continue;
|
||||||
|
else if (pg_u_prop_cased(curr))
|
||||||
|
return false;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if ((str[i] & 0xC0) == 0x80)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
Assert(false); /* invalid UTF-8 */
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
check_special_conditions(int conditions, const char *str, size_t len,
|
||||||
|
size_t offset)
|
||||||
|
{
|
||||||
|
if (conditions == 0)
|
||||||
|
return true;
|
||||||
|
else if (conditions == PG_U_FINAL_SIGMA)
|
||||||
|
return check_final_sigma((unsigned char *) str, len, offset);
|
||||||
|
|
||||||
|
/* no other conditions supported */
|
||||||
|
Assert(false);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/* find entry in simple case map, if any */
|
/* find entry in simple case map, if any */
|
||||||
static const pg_case_map *
|
static const pg_case_map *
|
||||||
find_case_map(pg_wchar ucs)
|
find_case_map(pg_wchar ucs)
|
||||||
|
@ -22,11 +22,11 @@ pg_wchar unicode_lowercase_simple(pg_wchar code);
|
|||||||
pg_wchar unicode_titlecase_simple(pg_wchar code);
|
pg_wchar unicode_titlecase_simple(pg_wchar code);
|
||||||
pg_wchar unicode_uppercase_simple(pg_wchar code);
|
pg_wchar unicode_uppercase_simple(pg_wchar code);
|
||||||
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
|
||||||
ssize_t srclen);
|
ssize_t srclen, bool full);
|
||||||
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
|
||||||
ssize_t srclen, WordBoundaryNext wbnext,
|
ssize_t srclen, bool full,
|
||||||
void *wbstate);
|
WordBoundaryNext wbnext, void *wbstate);
|
||||||
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
|
||||||
ssize_t srclen);
|
ssize_t srclen, bool full);
|
||||||
|
|
||||||
#endif /* UNICODE_CASE_H */
|
#endif /* UNICODE_CASE_H */
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -3754,6 +3754,7 @@ pg_sha256_ctx
|
|||||||
pg_sha384_ctx
|
pg_sha384_ctx
|
||||||
pg_sha512_ctx
|
pg_sha512_ctx
|
||||||
pg_snapshot
|
pg_snapshot
|
||||||
|
pg_special_case
|
||||||
pg_stack_base_t
|
pg_stack_base_t
|
||||||
pg_time_t
|
pg_time_t
|
||||||
pg_time_usec_t
|
pg_time_usec_t
|
||||||
|
Reference in New Issue
Block a user