1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-29 10:41:53 +03:00

Support Unicode full case mapping and conversion.

Generate tables from Unicode SpecialCasing.txt to support more
sophisticated case mapping behavior:

 * support case mappings to multiple codepoints, such as "ß"
   uppercasing to "SS"
 * support conditional case mappings, such as the "final sigma"
 * support titlecase variants, such as "dž" uppercasing to "DŽ" but
   titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
This commit is contained in:
Jeff Davis
2025-01-17 15:56:20 -08:00
parent 6a9b2a631a
commit 286a365b9c
9 changed files with 3645 additions and 2993 deletions

View File

@ -78,7 +78,7 @@ size_t
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale) pg_locale_t locale)
{ {
return unicode_strlower(dest, destsize, src, srclen); return unicode_strlower(dest, destsize, src, srclen, false);
} }
size_t size_t
@ -93,7 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
.prev_alnum = false, .prev_alnum = false,
}; };
return unicode_strtitle(dest, destsize, src, srclen, return unicode_strtitle(dest, destsize, src, srclen, false,
initcap_wbnext, &wbstate); initcap_wbnext, &wbstate);
} }
@ -101,7 +101,7 @@ size_t
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale) pg_locale_t locale)
{ {
return unicode_strupper(dest, destsize, src, srclen); return unicode_strupper(dest, destsize, src, srclen, false);
} }
pg_locale_t pg_locale_t

View File

@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
# These files are part of the Unicode Character Database. Download # These files are part of the Unicode Character Database. Download
# them on demand. The dependency on Makefile.global is for # them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION. # UNICODE_VERSION.
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F) $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
unicode_version.h: generate-unicode_version.pl unicode_version.h: generate-unicode_version.pl
@ -91,4 +91,4 @@ clean:
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
distclean: clean distclean: clean
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

View File

@ -18,12 +18,61 @@
#include <wctype.h> #include <wctype.h>
#ifdef USE_ICU #ifdef USE_ICU
#include <unicode/ucasemap.h>
#include <unicode/uchar.h> #include <unicode/uchar.h>
#endif #endif
#include "common/unicode_case.h" #include "common/unicode_case.h"
#include "common/unicode_category.h" #include "common/unicode_category.h"
#include "common/unicode_version.h" #include "common/unicode_version.h"
/* enough to hold largest source or result string, including NUL */
#define BUFSZ 256
#ifdef USE_ICU
static UCaseMap * casemap = NULL;
#endif
/*
 * Common signature for the tfunc_* conversion wrappers, so that
 * test_convert() can drive any of them through a single function pointer.
 * The arguments are the ones forwarded to unicode_strlower() and friends;
 * the return value is the size_t those functions return.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
ssize_t srclen);
/* simple boundary iterator copied from pg_locale_builtin.c */
/*
 * Iteration state for initcap_wbnext(). The caller fills it in once (with
 * offset = 0, init = false, prev_alnum = false) and passes it as the opaque
 * state pointer on every call.
 */
struct WordBoundaryState
{
/* string being scanned (UTF-8) */
const char *str;
/* upper bound on offset; scanning also stops at a NUL byte in str */
size_t len;
/* byte offset of the next character to examine */
size_t offset;
/* false until the first boundary has been returned */
bool init;
/* alnum classification of the character at the last returned boundary */
bool prev_alnum;
};
/*
 * Word boundary callback for unicode_strtitle().
 *
 * Starting at the saved offset, return the byte offset of the next character
 * that begins a new run: either the very first character examined, or a
 * character whose alphanumeric classification differs from the previous
 * run's. The saved offset is always left pointing just past the character
 * that was returned. When the input is exhausted (offset reaches len, or a
 * NUL byte is found), return len.
 */
static size_t
initcap_wbnext(void *state)
{
	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;

	for (;;)
	{
		size_t		start = wb->offset;
		pg_wchar	cp;
		bool		is_alnum;
		bool		at_boundary;

		/* stop at the end of the buffer or at the terminating NUL */
		if (start >= wb->len || wb->str[start] == '\0')
			break;

		cp = utf8_to_unicode((unsigned char *) wb->str + start);
		is_alnum = pg_u_isalnum(cp, true);

		/* decide before touching the state that the decision reads */
		at_boundary = !wb->init || is_alnum != wb->prev_alnum;

		/* always step past the character just examined */
		wb->offset = start + unicode_utf8len(cp);

		if (at_boundary)
		{
			wb->init = true;
			wb->prev_alnum = is_alnum;
			return start;
		}
	}

	return wb->len;
}
#ifdef USE_ICU #ifdef USE_ICU
static void static void
@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
} }
} }
/*
 * Convert str to lower, title and upper case using the full (special-casing
 * aware) Postgres conversions and using ICU's UCaseMap, and compare the
 * results. On the first mismatch, print both results and exit(1).
 *
 * The word boundary state is consumed only by the titlecase conversion.
 */
static void
icu_test_full(char *str)
{
	char		pg_result[BUFSZ];
	char		icu_result[BUFSZ];
	UErrorCode	status;
	struct WordBoundaryState wbstate;

	wbstate.str = str;
	wbstate.len = strlen(str);
	wbstate.offset = 0;
	wbstate.init = false;
	wbstate.prev_alnum = false;

	/* lowercase */
	unicode_strlower(pg_result, BUFSZ, str, -1, true);
	status = U_ZERO_ERROR;
	ucasemap_utf8ToLower(casemap, icu_result, BUFSZ, str, -1, &status);
	if (strcmp(pg_result, icu_result) != 0)
	{
		printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str,
			   pg_result, icu_result);
		exit(1);
	}

	/* titlecase */
	unicode_strtitle(pg_result, BUFSZ, str, -1, true, initcap_wbnext,
					 &wbstate);
	status = U_ZERO_ERROR;
	ucasemap_utf8ToTitle(casemap, icu_result, BUFSZ, str, -1, &status);
	if (strcmp(pg_result, icu_result) != 0)
	{
		printf("case_test: str='%s' title='%s' icu_title='%s'\n", str,
			   pg_result, icu_result);
		exit(1);
	}

	/* uppercase */
	unicode_strupper(pg_result, BUFSZ, str, -1, true);
	status = U_ZERO_ERROR;
	ucasemap_utf8ToUpper(casemap, icu_result, BUFSZ, str, -1, &status);
	if (strcmp(pg_result, icu_result) != 0)
	{
		printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str,
			   pg_result, icu_result);
		exit(1);
	}
}
/* /*
* Exhaustively compare case mappings with the results from ICU. * Exhaustively compare case mappings with the results from ICU.
*/ */
@ -64,6 +161,7 @@ test_icu(void)
if (category != PG_U_UNASSIGNED) if (category != PG_U_UNASSIGNED)
{ {
uint8_t icu_category = u_charType(code); uint8_t icu_category = u_charType(code);
char code_str[5] = {0};
if (icu_category == PG_U_UNASSIGNED) if (icu_category == PG_U_UNASSIGNED)
{ {
@ -72,6 +170,9 @@ test_icu(void)
} }
icu_test_simple(code); icu_test_simple(code);
unicode_to_utf8(code, (unsigned char *) code_str);
icu_test_full(code_str);
successful++; successful++;
} }
} }
@ -86,7 +187,7 @@ test_icu(void)
#endif #endif
static void static void
test_strlower(const char *test_string, const char *expected) test_convert(TestFunc tfunc, const char *test_string, const char *expected)
{ {
size_t src1len = strlen(test_string); size_t src1len = strlen(test_string);
size_t src2len = -1; /* NUL-terminated */ size_t src2len = -1; /* NUL-terminated */
@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
/* neither source nor destination are NUL-terminated */ /* neither source nor destination are NUL-terminated */
memset(dst1, 0x7F, dst1len); memset(dst1, 0x7F, dst1len);
needed = unicode_strlower(dst1, dst1len, src1, src1len); needed = tfunc(dst1, dst1len, src1, src1len);
if (needed != strlen(expected)) if (needed != strlen(expected))
{ {
printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed); printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
test_string, needed, strlen(expected));
exit(1); exit(1);
} }
if (memcmp(dst1, expected, dst1len) != 0) if (memcmp(dst1, expected, dst1len) != 0)
@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
/* destination is NUL-terminated and source is not */ /* destination is NUL-terminated and source is not */
memset(dst2, 0x7F, dst2len); memset(dst2, 0x7F, dst2len);
needed = unicode_strlower(dst2, dst2len, src1, src1len); needed = tfunc(dst2, dst2len, src1, src1len);
if (needed != strlen(expected)) if (needed != strlen(expected))
{ {
printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed); printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
test_string, needed, strlen(expected));
exit(1); exit(1);
} }
if (strcmp(dst2, expected) != 0) if (strcmp(dst2, expected) != 0)
@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
/* source is NUL-terminated and destination is not */ /* source is NUL-terminated and destination is not */
memset(dst1, 0x7F, dst1len); memset(dst1, 0x7F, dst1len);
needed = unicode_strlower(dst1, dst1len, src2, src2len); needed = tfunc(dst1, dst1len, src2, src2len);
if (needed != strlen(expected)) if (needed != strlen(expected))
{ {
printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
test_string, needed, strlen(expected));
printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed); printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
exit(1); exit(1);
} }
@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
/* both source and destination are NUL-terminated */ /* both source and destination are NUL-terminated */
memset(dst2, 0x7F, dst2len); memset(dst2, 0x7F, dst2len);
needed = unicode_strlower(dst2, dst2len, src2, src2len); needed = tfunc(dst2, dst2len, src2, src2len);
if (needed != strlen(expected)) if (needed != strlen(expected))
{ {
printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed); printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
test_string, needed, strlen(expected));
exit(1); exit(1);
} }
if (strcmp(dst2, expected) != 0) if (strcmp(dst2, expected) != 0)
@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected)
free(dst2); free(dst2);
} }
/*
 * TestFunc wrapper: lowercase src into dst via unicode_strlower() with the
 * "full" flag set to true, returning whatever unicode_strlower() returns.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
ssize_t srclen)
{
return unicode_strlower(dst, dstsize, src, srclen, true);
}
/*
 * TestFunc wrapper: titlecase src into dst via unicode_strtitle() with the
 * "full" flag set to true, using initcap_wbnext() to find word boundaries.
 *
 * A fresh boundary state is built for every call. test_convert() also calls
 * this with srclen == -1 (NUL-terminated source); the conversion to the
 * size_t len field then yields a huge value, and initcap_wbnext() relies on
 * its NUL check to stop.
 */
static size_t
tfunc_title(char *dst, size_t dstsize, const char *src,
ssize_t srclen)
{
struct WordBoundaryState wbstate = {
.str = src,
.len = srclen,
.offset = 0,
.init = false,
.prev_alnum = false,
};
return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
&wbstate);
}
/*
 * TestFunc wrapper: uppercase src into dst via unicode_strupper() with the
 * "full" flag set to true, returning whatever unicode_strupper() returns.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
ssize_t srclen)
{
return unicode_strupper(dst, dstsize, src, srclen, true);
}
static void static void
test_convert_case() test_convert_case()
{ {
/* test string with no case changes */ /* test string with no case changes */
test_strlower("√∞", "√∞"); test_convert(tfunc_lower, "√∞", "√∞");
/* test adjust-to-cased behavior */
test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
/* test string with case changes */ /* test string with case changes */
test_strlower("ABC", "abc"); test_convert(tfunc_upper, "abc", "ABC");
/* test string with case changes and byte length changes */ /* test string with case changes and byte length changes */
test_strlower("ȺȺȺ", "ⱥⱥⱥ"); test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
/* test special case conversions */
test_convert(tfunc_upper, "ß", "SS");
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
/* test final sigma */
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
#ifdef USE_ICU
icu_test_full("");
icu_test_full("ȺȺȺ");
icu_test_full("ßßß");
icu_test_full("√∞");
icu_test_full("a b");
icu_test_full("abc 123xyz");
icu_test_full("σςΣ ΣΣΣ");
icu_test_full("ıiIİ");
/* test <alpha><iota_subscript><acute> */
icu_test_full("\u0391\u0345\u0301");
#endif
printf("case_test: convert_case: success\n"); printf("case_test: convert_case: success\n");
} }
@ -182,6 +342,22 @@ test_convert_case()
int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
#ifdef USE_ICU
UErrorCode status = U_ZERO_ERROR;
/*
* Disable ICU's word break adjustment for titlecase to match the expected
* behavior of unicode_strtitle().
*/
casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
if (U_FAILURE(status))
{
printf("case_test: failure opening UCaseMap: %s\n",
u_errorName(status));
exit(1);
}
#endif
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION); printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
#ifdef USE_ICU #ifdef USE_ICU
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION); printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@ -191,5 +367,9 @@ main(int argc, char **argv)
#endif #endif
test_convert_case(); test_convert_case();
#ifdef USE_ICU
ucasemap_close(casemap);
#endif
exit(0); exit(0);
} }

View File

@ -3,7 +3,7 @@
# Generate Unicode character case mappings. Does not include tailoring # Generate Unicode character case mappings. Does not include tailoring
# or locale-specific mappings. # or locale-specific mappings.
# #
# Input: UnicodeData.txt # Input: SpecialCasing.txt UnicodeData.txt
# Output: unicode_case_table.h # Output: unicode_case_table.h
# #
# Copyright (c) 2000-2025, PostgreSQL Global Development Group # Copyright (c) 2000-2025, PostgreSQL Global Development Group
@ -21,6 +21,10 @@ GetOptions('outdir:s' => \$output_path);
my $output_table_file = "$output_path/unicode_case_table.h"; my $output_table_file = "$output_path/unicode_case_table.h";
# The maximum number of codepoints that can result from case mapping
# of a single character. See Unicode section 5.18 "Case Mappings".
my $MAX_CASE_EXPANSION = 3;
my $FH; my $FH;
my %simple = (); my %simple = ();
@ -51,6 +55,98 @@ while (my $line = <$FH>)
} }
close $FH; close $FH;
# Map for special casing rules that aren't represented in the simple
# mapping. Language-sensitive mappings are not supported.
#
# See https://www.unicode.org/reports/tr44/#SpecialCasing.txt, or the
# SpecialCasing.txt file itself for details.
# for now, only Final_Sigma is supported
my %condition_map = (Final_Sigma => 'PG_U_FINAL_SIGMA');
my %special = ();
open($FH, '<', "$output_path/SpecialCasing.txt")
or die "Could not open $output_path/SpecialCasing.txt: $!.";
# Read SpecialCasing.txt one line at a time. After comment removal each
# data line is split on ';' and the fields are used as:
#   [0] codepoint, [1] lowercase, [2] titlecase, [3] uppercase,
#   [4] optional condition list
# Each accepted line adds one entry to %special and guarantees a matching
# entry in %simple.
while (my $line = <$FH>)
{
# language-sensitive mappings not supported; stop at that section header
last if $line =~ /\# Language-Sensitive Mappings/;
# remove comments (/s lets .* consume the trailing newline as well)
$line =~ s/^(.*?)#.*$/$1/s;
# skip lines that have no ';'-separated fields (blank or comment-only)
next unless $line =~ /;/;
my @elts = split /;/, $line;
my $code = hex($elts[0]);
# Codepoint may map to multiple characters when converting
# case. Split each mapping on whitespace and extract the
# hexadecimal into an array of codepoints.
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
my @conditions = map {
# supporting negated conditions may require storing a
# mask of relevant conditions for a given rule to differentiate
# between lack of a condition and a negated condition
die "negated conditions not supported" if /^Not_/;
$condition_map{$_} || die "unrecognized condition: $_"
} (grep /\w+/, (split /\s+/, $elts[4]));
# OR the C flag names together; '0' means "no condition"
my $cond_str = (join '|', @conditions) || '0';
# if empty, create a self-mapping
push @lower, $code if (scalar @lower == 0);
push @title, $code if (scalar @title == 0);
push @upper, $code if (scalar @upper == 0);
# none should map to more than $MAX_CASE_EXPANSION codepoints
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
if (scalar @lower) > $MAX_CASE_EXPANSION;
die "titlecase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'"
if (scalar @title) > $MAX_CASE_EXPANSION;
die "uppercase expansion for 0x$elts[0] exceeds maximum: '$elts[3]'"
if (scalar @upper) > $MAX_CASE_EXPANSION;
# pad arrays with zeros to a fixed length of $MAX_CASE_EXPANSION
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
# Characters with special mappings may not have simple mappings;
# ensure that an entry exists.
$simple{$code} ||= {
Simple_Lowercase => $code,
Simple_Titlecase => $code,
Simple_Uppercase => $code
};
# Multiple special case rules for a single codepoint could be
# supported by making several entries for each codepoint, and have
# the simple mapping point to the first entry. The caller could
# scan forward looking for an entry that matches the conditions,
# or fall back to the normal behavior.
die "multiple special case mappings not supported"
if defined $special{$code};
$special{$code} = {
Lowercase => \@lower,
Titlecase => \@title,
Uppercase => \@upper,
Conditions => $cond_str
};
}
close $FH;
# assign sequential array indexes to the special mappings
my $special_idx = 0;
foreach my $code (sort { $a <=> $b } (keys %special))
{
$special{$code}{Index} = $special_idx++;
}
# Start writing out the output files # Start writing out the output files
open my $OT, '>', $output_table_file open my $OT, '>', $output_table_file
or die "Could not open output file $output_table_file: $!\n"; or die "Could not open output file $output_table_file: $!\n";
@ -63,6 +159,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
$num_simple++ unless $code < 0x80; $num_simple++ unless $code < 0x80;
} }
my $num_special = scalar(keys %special) + 1;
print $OT <<"EOS"; print $OT <<"EOS";
/*------------------------------------------------------------------------- /*-------------------------------------------------------------------------
* *
@ -86,6 +184,19 @@ print $OT <<"EOS";
#include "common/unicode_case.h" #include "common/unicode_case.h"
#include "mb/pg_wchar.h" #include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
* of a single character. See Unicode section 5.18 "Case Mappings".
*/
#define MAX_CASE_EXPANSION 3
/*
* Case mapping condition flags. For now, only Final_Sigma is supported.
*
* See Unicode Context Specification for Casing.
*/
#define PG_U_FINAL_SIGMA (1 << 0)
typedef enum typedef enum
{ {
CaseLower = 0, CaseLower = 0,
@ -94,12 +205,47 @@ typedef enum
NCaseKind NCaseKind
} CaseKind; } CaseKind;
typedef struct
{
pg_wchar codepoint; /* Unicode codepoint */
int16 conditions;
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
typedef struct typedef struct
{ {
pg_wchar codepoint; /* Unicode codepoint */ pg_wchar codepoint; /* Unicode codepoint */
pg_wchar simplemap[NCaseKind]; pg_wchar simplemap[NCaseKind];
const pg_special_case *special_case;
} pg_case_map; } pg_case_map;
/*
* Special case mappings that aren't representable in the simple map.
* Entries are referenced from simple_case_map.
*/
static const pg_special_case special_case[$num_special] =
{
EOS
foreach my $code (sort { $a <=> $b } (keys %special))
{
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
my $lower = join ", ",
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
my $title = join ", ",
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
my $upper = join ", ",
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
}
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
print $OT <<"EOS";
};
/* /*
* Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup), * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
* sparse for higher codepoints (requiring scan or binary search). * sparse for higher codepoints (requiring scan or binary search).
@ -114,8 +260,10 @@ for (my $code = 0; $code < 0x80; $code++)
my $lc = ($simple{$code}{Simple_Lowercase} || $code); my $lc = ($simple{$code}{Simple_Lowercase} || $code);
my $tc = ($simple{$code}{Simple_Titlecase} || $code); my $tc = ($simple{$code}{Simple_Titlecase} || $code);
my $uc = ($simple{$code}{Simple_Uppercase} || $code); my $uc = ($simple{$code}{Simple_Uppercase} || $code);
die "unexpected special case for code $code"
if defined $special{$code};
printf $OT printf $OT
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n", "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
$code, $lc, $tc, $uc; $code, $lc, $tc, $uc;
} }
printf $OT "\n"; printf $OT "\n";
@ -126,9 +274,14 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
next unless $code >= 0x80; # already output above next unless $code >= 0x80; # already output above
my $map = $simple{$code}; my $map = $simple{$code};
my $special_case = "NULL";
if (exists $special{$code})
{
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
}
printf $OT printf $OT
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n", "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase}, $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
$map->{Simple_Uppercase}; $map->{Simple_Uppercase}, $special_case;
} }
print $OT "};\n"; print $OT "};\n";

View File

@ -11,7 +11,7 @@ endif
# These files are part of the Unicode Character Database. Download them on # These files are part of the Unicode Character Database. Download them on
# demand. # demand.
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'UnicodeData.txt'] foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
url = unicode_baseurl.format(UNICODE_VERSION, f) url = unicode_baseurl.format(UNICODE_VERSION, f)
target = custom_target(f, target = custom_target(f,
output: f, output: f,
@ -26,7 +26,7 @@ update_unicode_targets = []
update_unicode_targets += \ update_unicode_targets += \
custom_target('unicode_case_table.h', custom_target('unicode_case_table.h',
input: [unicode_data['UnicodeData.txt']], input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
output: ['unicode_case_table.h'], output: ['unicode_case_table.h'],
command: [ command: [
perl, files('generate-unicode_case_table.pl'), perl, files('generate-unicode_case_table.pl'),

View File

@ -17,12 +17,15 @@
#include "common/unicode_case.h" #include "common/unicode_case.h"
#include "common/unicode_case_table.h" #include "common/unicode_case_table.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h" #include "mb/pg_wchar.h"
static const pg_case_map *find_case_map(pg_wchar ucs); static const pg_case_map *find_case_map(pg_wchar ucs);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, WordBoundaryNext wbnext, CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate); void *wbstate);
static bool check_special_conditions(int conditions, const char *str,
size_t len, size_t offset);
pg_wchar pg_wchar
unicode_lowercase_simple(pg_wchar code) unicode_lowercase_simple(pg_wchar code)
@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
* *
* If dstsize is zero, dst may be NULL. This is useful for calculating the * If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating. * required buffer size before allocating.
*
* If full is true, use special case mappings if available and if the
* conditions are satisfied.
*/ */
size_t size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen) unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
bool full)
{ {
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL); return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
NULL);
} }
/* /*
@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
* If dstsize is zero, dst may be NULL. This is useful for calculating the * If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating. * required buffer size before allocating.
* *
* If full is true, use special case mappings if available and if the
* conditions are satisfied. Otherwise, use only simple mappings and use
* uppercase instead of titlecase.
*
* Titlecasing requires knowledge about word boundaries, which is provided by * Titlecasing requires knowledge about word boundaries, which is provided by
* the callback wbnext. A word boundary is the offset of the start of a word * the callback wbnext. A word boundary is the offset of the start of a word
* or the offset of the character immediately following a word. * or the offset of the character immediately following a word.
@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
*/ */
size_t size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
WordBoundaryNext wbnext, void *wbstate) bool full, WordBoundaryNext wbnext, void *wbstate)
{ {
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext, return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
wbstate); wbstate);
} }
@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
* *
* If dstsize is zero, dst may be NULL. This is useful for calculating the * If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating. * required buffer size before allocating.
*
* If full is true, use special case mappings if available and if the
* conditions are satisfied.
*/ */
size_t size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen) unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
bool full)
{ {
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL); return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
NULL);
} }
/* /*
* Implement Unicode Default Case Conversion algorithm.
*
* If str_casekind is CaseLower or CaseUpper, map each character in the string * If str_casekind is CaseLower or CaseUpper, map each character in the string
* for which a mapping is available. * for which a mapping is available.
* *
* If str_casekind is CaseTitle, maps characters found on a word boundary to * If str_casekind is CaseTitle, maps characters found on a word boundary to
* uppercase and other characters to lowercase. * titlecase (or uppercase if full is false) and other characters to
* lowercase. NB: does not currently implement the Unicode behavior in which
* the word boundary is adjusted to the next Cased character. That behavior
* could be implemented as an option, but it doesn't match the default
* behavior of ICU, nor does it match the documented behavior of INITCAP().
*
* If full is true, use special mappings for relevant characters, which can
* map a single codepoint to multiple codepoints, or depend on conditions.
*/ */
static size_t static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate) CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate)
{ {
/* character CaseKind varies while titlecasing */ /* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind; CaseKind chr_casekind = str_casekind;
@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff); pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1); int u1len = unicode_utf8len(u1);
const pg_case_map *casemap = find_case_map(u1); const pg_case_map *casemap = find_case_map(u1);
const pg_special_case *special = NULL;
if (str_casekind == CaseTitle) if (str_casekind == CaseTitle)
{ {
if (srcoff == boundary) if (srcoff == boundary)
{ {
chr_casekind = CaseUpper; chr_casekind = full ? CaseTitle : CaseUpper;
boundary = wbnext(wbstate); boundary = wbnext(wbstate);
} }
else else
chr_casekind = CaseLower; chr_casekind = CaseLower;
} }
/*
* Find special case that matches the conditions, if any.
*
* Note: only a single special mapping per codepoint is currently
* supported, though Unicode allows for multiple special mappings for
* a single codepoint.
*/
if (full && casemap && casemap->special_case)
{
int16 conditions = casemap->special_case->conditions;
Assert(casemap->special_case->codepoint == u1);
if (check_special_conditions(conditions, src, srclen, srcoff))
special = casemap->special_case;
}
/* perform mapping, update result_len, and write to dst */ /* perform mapping, update result_len, and write to dst */
if (casemap) if (special)
{
for (int i = 0; i < MAX_CASE_EXPANSION; i++)
{
pg_wchar u2 = special->map[chr_casekind][i];
size_t u2len = unicode_utf8len(u2);
if (u2 == '\0')
break;
if (result_len + u2len <= dstsize)
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
result_len += u2len;
}
}
else if (casemap)
{ {
pg_wchar u2 = casemap->simplemap[chr_casekind]; pg_wchar u2 = casemap->simplemap[chr_casekind];
pg_wchar u2len = unicode_utf8len(u2); pg_wchar u2len = unicode_utf8len(u2);
@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
return result_len; return result_len;
} }
/*
 * Check that the condition matches Final_Sigma, described in Unicode Table
 * 3-17. The character at the given offset must be preceded by a Cased
 * character, and must not be followed by a Cased character; in both
 * directions any intervening Case_Ignorable characters are skipped.
 *
 * NB: some characters may be both Cased and Case_Ignorable, in which case
 * they are ignored.
 *
 * str:    UTF-8 string being converted
 * len:    upper bound on the bytes of str to examine; the forward scan also
 *         stops at a NUL byte
 * offset: byte offset in str of the first byte of the character under test
 *
 * Returns true if the Final_Sigma condition holds. In particular, returns
 * false when nothing but Case_Ignorable characters (or nothing at all)
 * precede the character: reaching the start of the string without seeing a
 * Cased character does not satisfy the "preceded by Cased" requirement.
 */
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
	bool		preceded_by_cased = false;

	/*
	 * Iterate backwards, looking for a Cased character. Indexes are size_t
	 * to match offset; "i-- > 0" visits offset-1 down to 0 without
	 * underflow, and does nothing when offset is 0.
	 */
	for (size_t i = offset; i-- > 0;)
	{
		pg_wchar	curr;

		/* continuation byte: keep going back to the lead byte */
		if ((str[i] & 0xC0) == 0x80)
			continue;

		curr = utf8_to_unicode(str + i);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		/* first non-ignorable character decides the "before" condition */
		if (!pg_u_prop_cased(curr))
			return false;

		preceded_by_cased = true;
		break;
	}

	/* start of string reached without finding a Cased character */
	if (!preceded_by_cased)
		return false;

	/* end of string is not followed by a Cased character */
	if (offset == len)
		return true;

	/*
	 * Iterate forwards, looking for a Cased character. Starting at offset + 1
	 * is enough: any remaining bytes of the character under test are
	 * continuation bytes and are skipped below.
	 */
	for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
	{
		pg_wchar	curr;

		if ((str[i] & 0xC0) == 0x80)
			continue;

		curr = utf8_to_unicode(str + i);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		/* first non-ignorable character decides the "after" condition */
		return !pg_u_prop_cased(curr);
	}

	return true;
}
/*
 * Decide whether a special case mapping's conditions are satisfied for the
 * character at the given byte offset of str.
 *
 * A mapping with no conditions always applies. The only condition handled
 * is PG_U_FINAL_SIGMA, which is delegated to check_final_sigma(). Any other
 * value is unexpected: it trips an assertion and is reported as not
 * satisfied.
 */
static bool
check_special_conditions(int conditions, const char *str, size_t len,
						 size_t offset)
{
	switch (conditions)
	{
		case 0:
			return true;

		case PG_U_FINAL_SIGMA:
			return check_final_sigma((const unsigned char *) str, len,
									 offset);

		default:
			break;
	}

	/* no other conditions supported */
	Assert(false);
	return false;
}
/* find entry in simple case map, if any */ /* find entry in simple case map, if any */
static const pg_case_map * static const pg_case_map *
find_case_map(pg_wchar ucs) find_case_map(pg_wchar ucs)

View File

@ -22,11 +22,11 @@ pg_wchar unicode_lowercase_simple(pg_wchar code);
pg_wchar unicode_titlecase_simple(pg_wchar code); pg_wchar unicode_titlecase_simple(pg_wchar code);
pg_wchar unicode_uppercase_simple(pg_wchar code); pg_wchar unicode_uppercase_simple(pg_wchar code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen); ssize_t srclen, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
ssize_t srclen, WordBoundaryNext wbnext, ssize_t srclen, bool full,
void *wbstate); WordBoundaryNext wbnext, void *wbstate);
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
ssize_t srclen); ssize_t srclen, bool full);
#endif /* UNICODE_CASE_H */ #endif /* UNICODE_CASE_H */

File diff suppressed because it is too large Load Diff

View File

@ -3754,6 +3754,7 @@ pg_sha256_ctx
pg_sha384_ctx pg_sha384_ctx
pg_sha512_ctx pg_sha512_ctx
pg_snapshot pg_snapshot
pg_special_case
pg_stack_base_t pg_stack_base_t
pg_time_t pg_time_t
pg_time_usec_t pg_time_usec_t