mirror of
https://github.com/postgres/postgres.git
synced 2025-10-24 01:29:19 +03:00
Add support for Unicode case folding.
Expand case mapping tables to include entries for case folding, which are parsed from CaseFolding.txt. Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
This commit is contained in:
@@ -30,13 +30,13 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
|
|||||||
# These files are part of the Unicode Character Database. Download
|
# These files are part of the Unicode Character Database. Download
|
||||||
# them on demand. The dependency on Makefile.global is for
|
# them on demand. The dependency on Makefile.global is for
|
||||||
# UNICODE_VERSION.
|
# UNICODE_VERSION.
|
||||||
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
|
CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
|
||||||
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
||||||
|
|
||||||
unicode_version.h: generate-unicode_version.pl
|
unicode_version.h: generate-unicode_version.pl
|
||||||
$(PERL) $< --version $(UNICODE_VERSION)
|
$(PERL) $< --version $(UNICODE_VERSION)
|
||||||
|
|
||||||
unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
|
unicode_case_table.h: generate-unicode_case_table.pl CaseFolding.txt UnicodeData.txt
|
||||||
$(PERL) $<
|
$(PERL) $<
|
||||||
|
|
||||||
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
|
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
|
||||||
@@ -91,4 +91,4 @@ clean:
|
|||||||
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
|
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
|
||||||
|
|
||||||
distclean: clean
|
distclean: clean
|
||||||
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
|
rm -f CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
|
||||||
|
@@ -81,17 +81,20 @@ icu_test_simple(pg_wchar code)
|
|||||||
pg_wchar lower = unicode_lowercase_simple(code);
|
pg_wchar lower = unicode_lowercase_simple(code);
|
||||||
pg_wchar title = unicode_titlecase_simple(code);
|
pg_wchar title = unicode_titlecase_simple(code);
|
||||||
pg_wchar upper = unicode_uppercase_simple(code);
|
pg_wchar upper = unicode_uppercase_simple(code);
|
||||||
|
pg_wchar fold = unicode_casefold_simple(code);
|
||||||
pg_wchar iculower = u_tolower(code);
|
pg_wchar iculower = u_tolower(code);
|
||||||
pg_wchar icutitle = u_totitle(code);
|
pg_wchar icutitle = u_totitle(code);
|
||||||
pg_wchar icuupper = u_toupper(code);
|
pg_wchar icuupper = u_toupper(code);
|
||||||
|
pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
|
||||||
|
|
||||||
if (lower != iculower || title != icutitle || upper != icuupper)
|
if (lower != iculower || title != icutitle || upper != icuupper ||
|
||||||
|
fold != icufold)
|
||||||
{
|
{
|
||||||
printf("case_test: FAILURE for codepoint 0x%06x\n", code);
|
printf("case_test: FAILURE for codepoint 0x%06x\n", code);
|
||||||
printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
|
printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
|
||||||
lower, title, upper);
|
lower, title, upper, fold);
|
||||||
printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
|
printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
|
||||||
iculower, icutitle, icuupper);
|
iculower, icutitle, icuupper, icufold);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@@ -103,9 +106,11 @@ icu_test_full(char *str)
|
|||||||
char lower[BUFSZ];
|
char lower[BUFSZ];
|
||||||
char title[BUFSZ];
|
char title[BUFSZ];
|
||||||
char upper[BUFSZ];
|
char upper[BUFSZ];
|
||||||
|
char fold[BUFSZ];
|
||||||
char icu_lower[BUFSZ];
|
char icu_lower[BUFSZ];
|
||||||
char icu_title[BUFSZ];
|
char icu_title[BUFSZ];
|
||||||
char icu_upper[BUFSZ];
|
char icu_upper[BUFSZ];
|
||||||
|
char icu_fold[BUFSZ];
|
||||||
UErrorCode status;
|
UErrorCode status;
|
||||||
struct WordBoundaryState wbstate = {
|
struct WordBoundaryState wbstate = {
|
||||||
.str = str,
|
.str = str,
|
||||||
@@ -118,12 +123,15 @@ icu_test_full(char *str)
|
|||||||
unicode_strlower(lower, BUFSZ, str, -1, true);
|
unicode_strlower(lower, BUFSZ, str, -1, true);
|
||||||
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
|
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
|
||||||
unicode_strupper(upper, BUFSZ, str, -1, true);
|
unicode_strupper(upper, BUFSZ, str, -1, true);
|
||||||
|
unicode_strfold(fold, BUFSZ, str, -1, true);
|
||||||
status = U_ZERO_ERROR;
|
status = U_ZERO_ERROR;
|
||||||
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
|
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
|
||||||
status = U_ZERO_ERROR;
|
status = U_ZERO_ERROR;
|
||||||
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
|
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
|
||||||
status = U_ZERO_ERROR;
|
status = U_ZERO_ERROR;
|
||||||
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
|
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
|
||||||
|
status = U_ZERO_ERROR;
|
||||||
|
ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
|
||||||
|
|
||||||
if (strcmp(lower, icu_lower) != 0)
|
if (strcmp(lower, icu_lower) != 0)
|
||||||
{
|
{
|
||||||
@@ -143,6 +151,12 @@ icu_test_full(char *str)
|
|||||||
icu_upper);
|
icu_upper);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
if (strcmp(fold, icu_fold) != 0)
|
||||||
|
{
|
||||||
|
printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
|
||||||
|
icu_fold);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -302,6 +316,12 @@ tfunc_upper(char *dst, size_t dstsize, const char *src,
|
|||||||
return unicode_strupper(dst, dstsize, src, srclen, true);
|
return unicode_strupper(dst, dstsize, src, srclen, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Adapter with the callback shape handed to test_convert(): case-folds src
 * into dst by calling unicode_strfold() with full = true, and returns
 * whatever unicode_strfold() returns.
 */
static size_t
|
||||||
|
tfunc_fold(char *dst, size_t dstsize, const char *src,
|
||||||
|
ssize_t srclen)
|
||||||
|
{
|
||||||
|
return unicode_strfold(dst, dstsize, src, srclen, true);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_convert_case()
|
test_convert_case()
|
||||||
@@ -318,10 +338,12 @@ test_convert_case()
|
|||||||
test_convert(tfunc_upper, "ß", "SS");
|
test_convert(tfunc_upper, "ß", "SS");
|
||||||
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
|
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
|
||||||
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
|
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
|
||||||
|
test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
|
||||||
/* test final sigma */
|
/* test final sigma */
|
||||||
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
|
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
|
||||||
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
|
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
|
||||||
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
|
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
|
||||||
|
test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
|
||||||
|
|
||||||
#ifdef USE_ICU
|
#ifdef USE_ICU
|
||||||
icu_test_full("");
|
icu_test_full("");
|
||||||
|
@@ -49,7 +49,8 @@ while (my $line = <$FH>)
|
|||||||
$simple{$code} = {
|
$simple{$code} = {
|
||||||
Simple_Lowercase => ($simple_lowercase || $code),
|
Simple_Lowercase => ($simple_lowercase || $code),
|
||||||
Simple_Titlecase => ($simple_titlecase || $code),
|
Simple_Titlecase => ($simple_titlecase || $code),
|
||||||
Simple_Uppercase => ($simple_uppercase || $code)
|
Simple_Uppercase => ($simple_uppercase || $code),
|
||||||
|
Simple_Foldcase => $code,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -87,6 +88,7 @@ while (my $line = <$FH>)
|
|||||||
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
|
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
|
||||||
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
|
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
|
||||||
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
|
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
|
||||||
|
my @fold = ();
|
||||||
my @conditions = map {
|
my @conditions = map {
|
||||||
# supporting negated conditions may require storing a
|
# supporting negated conditions may require storing a
|
||||||
# mask of relevant conditions for a given rule to differentiate
|
# mask of relevant conditions for a given rule to differentiate
|
||||||
@@ -101,6 +103,7 @@ while (my $line = <$FH>)
|
|||||||
push @lower, $code if (scalar @lower == 0);
|
push @lower, $code if (scalar @lower == 0);
|
||||||
push @title, $code if (scalar @title == 0);
|
push @title, $code if (scalar @title == 0);
|
||||||
push @upper, $code if (scalar @upper == 0);
|
push @upper, $code if (scalar @upper == 0);
|
||||||
|
push @fold, $code;
|
||||||
|
|
||||||
# none should map to more than 3 codepoints
|
# none should map to more than 3 codepoints
|
||||||
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
|
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
|
||||||
@@ -114,13 +117,15 @@ while (my $line = <$FH>)
|
|||||||
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
|
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
|
||||||
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
|
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
|
||||||
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
|
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
|
||||||
|
while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }
|
||||||
|
|
||||||
# Characters with special mappings may not have simple mappings;
|
# Characters with special mappings may not have simple mappings;
|
||||||
# ensure that an entry exists.
|
# ensure that an entry exists.
|
||||||
$simple{$code} ||= {
|
$simple{$code} ||= {
|
||||||
Simple_Lowercase => $code,
|
Simple_Lowercase => $code,
|
||||||
Simple_Titlecase => $code,
|
Simple_Titlecase => $code,
|
||||||
Simple_Uppercase => $code
|
Simple_Uppercase => $code,
|
||||||
|
Simple_Foldcase => $code
|
||||||
};
|
};
|
||||||
|
|
||||||
# Multiple special case rules for a single codepoint could be
|
# Multiple special case rules for a single codepoint could be
|
||||||
@@ -135,11 +140,96 @@ while (my $line = <$FH>)
|
|||||||
Lowercase => \@lower,
|
Lowercase => \@lower,
|
||||||
Titlecase => \@title,
|
Titlecase => \@title,
|
||||||
Uppercase => \@upper,
|
Uppercase => \@upper,
|
||||||
|
Foldcase => \@fold,
|
||||||
Conditions => $cond_str
|
Conditions => $cond_str
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
close $FH;
|
close $FH;
|
||||||
|
|
||||||
|
# Read CaseFolding.txt and merge its case-folding data into the tables built
# so far: %simple receives one folded codepoint per entry (Simple_Foldcase),
# %special receives a zero-padded array of $MAX_CASE_EXPANSION codepoints
# per entry (Foldcase).
open($FH, '<', "$output_path/CaseFolding.txt")
|
||||||
|
or die "Could not open $output_path/CaseFolding.txt: $!.";
|
||||||
|
while (my $line = <$FH>)
|
||||||
|
{
|
||||||
|
# remove comments
|
||||||
|
$line =~ s/^(.*?)#.*$/$1/s;
|
||||||
|
|
|
||||||
|
# ignore empty lines
|
||||||
|
next unless $line =~ /;/;
|
||||||
|
|
|
||||||
|
# Fields are ';'-separated: codepoint (hex), status letter, mapping.
# The status is trimmed of surrounding whitespace before comparison.
my @elts = split(';', $line);
|
||||||
|
my $code = hex($elts[0]);
|
||||||
|
my $status = $elts[1] =~ s/^\s+|\s+$//rg;
|
||||||
|
|
|
||||||
|
# Codepoint may map to multiple characters when folding. Split
|
||||||
|
# each mapping on whitespace and extract the hexadecimal into an
|
||||||
|
# array of codepoints.
|
||||||
|
# NOTE(review): this filter is unanchored, unlike the /^[0-9A-F]+$/
# filters applied to the lower/title/upper fields earlier in the
# script; a token that merely contains a hex digit would be accepted.
# Consider anchoring for consistency.
my @fold = map { hex $_ } (grep /[0-9A-F]+/, (split /\s+/, $elts[2]));
|
||||||
|
|
|
||||||
|
# NOTE(review): $code is numeric here, so it is interpolated in decimal,
# not hex, in this and the following die messages.
die "codepoint $code out of range" if $code > 0x10FFFF;
|
||||||
|
|
|
||||||
|
# status 'T' unsupported; skip
|
||||||
|
next if $status eq 'T';
|
||||||
|
|
|
||||||
|
# encountered unrecognized status type
|
||||||
|
# Only 'S', 'C' and 'F' are handled below. Presumably these are the
# simple / common / full statuses described in the CaseFolding.txt
# header -- confirm against the data file.
die "unsupported status type '$status'"
|
||||||
|
if $status ne 'S' && $status ne 'C' && $status ne 'F';
|
||||||
|
|
|
||||||
|
# initialize simple case mappings if they don't exist
|
||||||
|
$simple{$code} ||= {
|
||||||
|
Simple_Lowercase => $code,
|
||||||
|
Simple_Titlecase => $code,
|
||||||
|
Simple_Uppercase => $code,
|
||||||
|
Simple_Foldcase => $code
|
||||||
|
};
|
||||||
|
|
|
||||||
|
# 'S' and 'C' lines must carry exactly one target codepoint; it becomes
# the simple fold for this codepoint.
if ($status eq 'S' || $status eq 'C')
|
||||||
|
{
|
||||||
|
die
|
||||||
|
"Simple case folding for $code has multiple codepoints: '$line' '$elts[2]'"
|
||||||
|
if scalar @fold != 1;
|
||||||
|
my $simple_foldcase = $fold[0];
|
||||||
|
|
|
||||||
|
die "Simple_Foldcase $code out of range"
|
||||||
|
if $simple_foldcase > 0x10FFFF;
|
||||||
|
|
|
||||||
|
$simple{$code}{Simple_Foldcase} = $simple_foldcase;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
# 'F' lines always set the multi-codepoint fold. A 'C' line does so only
# when the codepoint already has a %special entry, so that entry's
# Foldcase is overwritten with this mapping.
if ($status eq 'F' || ($status eq 'C' && defined $special{$code}))
|
||||||
|
{
|
||||||
|
# Zero-pad to the fixed width. There is no upper-bound check here; the
# output stage dies later if Foldcase is not exactly
# $MAX_CASE_EXPANSION long.
while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }
|
||||||
|
|
|
||||||
|
#initialize special case mappings if they don't exist
|
||||||
|
# The new entry is seeded from the simple lower/title/upper mappings,
# zero-padded, with Conditions set to '0'.
if (!defined $special{$code})
|
||||||
|
{
|
||||||
|
my @lower = ($simple{$code}{Simple_Lowercase});
|
||||||
|
my @title = ($simple{$code}{Simple_Titlecase});
|
||||||
|
my @upper = ($simple{$code}{Simple_Uppercase});
|
||||||
|
while (scalar @lower < $MAX_CASE_EXPANSION)
|
||||||
|
{
|
||||||
|
push @lower, 0x000000;
|
||||||
|
}
|
||||||
|
while (scalar @title < $MAX_CASE_EXPANSION)
|
||||||
|
{
|
||||||
|
push @title, 0x000000;
|
||||||
|
}
|
||||||
|
while (scalar @upper < $MAX_CASE_EXPANSION)
|
||||||
|
{
|
||||||
|
push @upper, 0x000000;
|
||||||
|
}
|
||||||
|
$special{$code} = {
|
||||||
|
Lowercase => \@lower,
|
||||||
|
Titlecase => \@title,
|
||||||
|
Uppercase => \@upper,
|
||||||
|
Conditions => '0'
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
|
||||||
|
$special{$code}{Foldcase} = \@fold;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close $FH;
|
||||||
|
|
||||||
# assign sequential array indexes to the special mappings
|
# assign sequential array indexes to the special mappings
|
||||||
my $special_idx = 0;
|
my $special_idx = 0;
|
||||||
foreach my $code (sort { $a <=> $b } (keys %special))
|
foreach my $code (sort { $a <=> $b } (keys %special))
|
||||||
@@ -202,6 +292,7 @@ typedef enum
|
|||||||
CaseLower = 0,
|
CaseLower = 0,
|
||||||
CaseTitle = 1,
|
CaseTitle = 1,
|
||||||
CaseUpper = 2,
|
CaseUpper = 2,
|
||||||
|
CaseFold = 3,
|
||||||
NCaseKind
|
NCaseKind
|
||||||
} CaseKind;
|
} CaseKind;
|
||||||
|
|
||||||
@@ -232,14 +323,17 @@ foreach my $code (sort { $a <=> $b } (keys %special))
|
|||||||
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
|
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
|
||||||
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
|
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
|
||||||
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
|
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
|
||||||
|
die if scalar @{ $special{$code}{Foldcase} } != $MAX_CASE_EXPANSION;
|
||||||
my $lower = join ", ",
|
my $lower = join ", ",
|
||||||
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
|
||||||
my $title = join ", ",
|
my $title = join ", ",
|
||||||
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
|
||||||
my $upper = join ", ",
|
my $upper = join ", ",
|
||||||
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
|
||||||
|
my $fold = join ", ",
|
||||||
|
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} });
|
||||||
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
|
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
|
||||||
printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
|
printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold;
|
||||||
}
|
}
|
||||||
|
|
||||||
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
|
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
|
||||||
@@ -260,11 +354,13 @@ for (my $code = 0; $code < 0x80; $code++)
|
|||||||
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
|
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
|
||||||
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
|
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
|
||||||
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
|
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
|
||||||
|
my $fc = ($simple{$code}{Simple_Foldcase} || $code);
|
||||||
|
|
||||||
die "unexpected special case for code $code"
|
die "unexpected special case for code $code"
|
||||||
if defined $special{$code};
|
if defined $special{$code};
|
||||||
printf $OT
|
printf $OT
|
||||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
|
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
|
||||||
$code, $lc, $tc, $uc;
|
$code, $lc, $tc, $uc, $fc;
|
||||||
}
|
}
|
||||||
printf $OT "\n";
|
printf $OT "\n";
|
||||||
|
|
||||||
@@ -280,8 +376,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
|
|||||||
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
|
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
|
||||||
}
|
}
|
||||||
printf $OT
|
printf $OT
|
||||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
|
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n",
|
||||||
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
|
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
|
||||||
$map->{Simple_Uppercase}, $special_case;
|
$map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case;
|
||||||
}
|
}
|
||||||
print $OT "};\n";
|
print $OT "};\n";
|
||||||
|
@@ -11,7 +11,7 @@ endif
|
|||||||
|
|
||||||
# These files are part of the Unicode Character Database. Download them on
|
# These files are part of the Unicode Character Database. Download them on
|
||||||
# demand.
|
# demand.
|
||||||
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
|
foreach f : ['CompositionExclusions.txt', 'CaseFolding.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
|
||||||
url = unicode_baseurl.format(UNICODE_VERSION, f)
|
url = unicode_baseurl.format(UNICODE_VERSION, f)
|
||||||
target = custom_target(f,
|
target = custom_target(f,
|
||||||
output: f,
|
output: f,
|
||||||
@@ -26,7 +26,7 @@ update_unicode_targets = []
|
|||||||
|
|
||||||
update_unicode_targets += \
|
update_unicode_targets += \
|
||||||
custom_target('unicode_case_table.h',
|
custom_target('unicode_case_table.h',
|
||||||
input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
|
input: [unicode_data['CaseFolding.txt'], unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
|
||||||
output: ['unicode_case_table.h'],
|
output: ['unicode_case_table.h'],
|
||||||
command: [
|
command: [
|
||||||
perl, files('generate-unicode_case_table.pl'),
|
perl, files('generate-unicode_case_table.pl'),
|
||||||
|
@@ -51,6 +51,14 @@ unicode_uppercase_simple(pg_wchar code)
|
|||||||
return map ? map->simplemap[CaseUpper] : code;
|
return map ? map->simplemap[CaseUpper] : code;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * unicode_casefold_simple()
 *
 * Return the simple (single-codepoint) case-folded form of code, taken from
 * the CaseFold slot of the entry that find_case_map() returns.
 */
pg_wchar
|
||||||
|
unicode_casefold_simple(pg_wchar code)
|
||||||
|
{
|
||||||
|
const pg_case_map *map = find_case_map(code);
|
||||||
|
|
|
||||||
|
/* no map entry found: the codepoint is returned unchanged */
return map ? map->simplemap[CaseFold] : code;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* unicode_strlower()
|
* unicode_strlower()
|
||||||
*
|
*
|
||||||
@@ -142,6 +150,30 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
|||||||
NULL);
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* unicode_strfold()
|
||||||
|
*
|
||||||
|
* Case fold src, and return the result length (not including terminating
|
||||||
|
* NUL).
|
||||||
|
*
|
||||||
|
* String src must be encoded in UTF-8. If srclen < 0, src must be
|
||||||
|
* NUL-terminated.
|
||||||
|
*
|
||||||
|
* Result string is stored in dst, truncating if larger than dstsize. If
|
||||||
|
* dstsize is greater than the result length, dst will be NUL-terminated;
|
||||||
|
* otherwise not.
|
||||||
|
*
|
||||||
|
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||||
|
* required buffer size before allocating.
*
* The work is delegated to convert_case() with CaseFold; 'full' is passed
* through unchanged. NOTE(review): the two trailing NULL arguments are
* presumably the word-boundary callback and state that only
* unicode_strtitle() supplies -- confirm against convert_case().
|
||||||
|
*/
|
||||||
|
size_t
|
||||||
|
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
|
bool full)
|
||||||
|
{
|
||||||
|
return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
|
||||||
|
NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Implement Unicode Default Case Conversion algorithm.
|
* Implement Unicode Default Case Conversion algorithm.
|
||||||
*
|
*
|
||||||
|
@@ -21,6 +21,7 @@ typedef size_t (*WordBoundaryNext) (void *wbstate);
|
|||||||
pg_wchar unicode_lowercase_simple(pg_wchar code);
|
pg_wchar unicode_lowercase_simple(pg_wchar code);
|
||||||
pg_wchar unicode_titlecase_simple(pg_wchar code);
|
pg_wchar unicode_titlecase_simple(pg_wchar code);
|
||||||
pg_wchar unicode_uppercase_simple(pg_wchar code);
|
pg_wchar unicode_uppercase_simple(pg_wchar code);
|
||||||
|
pg_wchar unicode_casefold_simple(pg_wchar code);
|
||||||
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
|
||||||
ssize_t srclen, bool full);
|
ssize_t srclen, bool full);
|
||||||
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
|
||||||
@@ -28,5 +29,7 @@ size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
|
|||||||
WordBoundaryNext wbnext, void *wbstate);
|
WordBoundaryNext wbnext, void *wbstate);
|
||||||
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
|
||||||
ssize_t srclen, bool full);
|
ssize_t srclen, bool full);
|
||||||
|
size_t unicode_strfold(char *dst, size_t dstsize, const char *src,
|
||||||
|
ssize_t srclen, bool full);
|
||||||
|
|
||||||
#endif /* UNICODE_CASE_H */
|
#endif /* UNICODE_CASE_H */
|
||||||
|
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user