mirror of
https://github.com/postgres/postgres.git
synced 2025-06-29 10:41:53 +03:00
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
This commit is contained in:
@ -78,7 +78,7 @@ size_t
|
||||
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
return unicode_strlower(dest, destsize, src, srclen);
|
||||
return unicode_strlower(dest, destsize, src, srclen, false);
|
||||
}
|
||||
|
||||
size_t
|
||||
@ -93,7 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
.prev_alnum = false,
|
||||
};
|
||||
|
||||
return unicode_strtitle(dest, destsize, src, srclen,
|
||||
return unicode_strtitle(dest, destsize, src, srclen, false,
|
||||
initcap_wbnext, &wbstate);
|
||||
}
|
||||
|
||||
@ -101,7 +101,7 @@ size_t
|
||||
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
return unicode_strupper(dest, destsize, src, srclen);
|
||||
return unicode_strupper(dest, destsize, src, srclen, false);
|
||||
}
|
||||
|
||||
pg_locale_t
|
||||
|
@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
|
||||
# These files are part of the Unicode Character Database. Download
|
||||
# them on demand. The dependency on Makefile.global is for
|
||||
# UNICODE_VERSION.
|
||||
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
|
||||
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
||||
|
||||
unicode_version.h: generate-unicode_version.pl
|
||||
@ -91,4 +91,4 @@ clean:
|
||||
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
|
||||
|
||||
distclean: clean
|
||||
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
|
||||
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
|
||||
|
@ -18,12 +18,61 @@
|
||||
#include <wctype.h>
|
||||
|
||||
#ifdef USE_ICU
|
||||
#include <unicode/ucasemap.h>
|
||||
#include <unicode/uchar.h>
|
||||
#endif
|
||||
#include "common/unicode_case.h"
|
||||
#include "common/unicode_category.h"
|
||||
#include "common/unicode_version.h"
|
||||
|
||||
/* enough to hold largest source or result string, including NUL */
|
||||
#define BUFSZ 256
|
||||
|
||||
#ifdef USE_ICU
|
||||
static UCaseMap * casemap = NULL;
|
||||
#endif
|
||||
|
||||
/*
 * Common signature of the case conversion wrappers (tfunc_lower,
 * tfunc_title, tfunc_upper) that test_convert() calls through a function
 * pointer.  The size_t result is compared by test_convert() against
 * strlen() of the expected string.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen);
|
||||
|
||||
/* simple boundary iterator copied from pg_locale_builtin.c */
|
||||
struct WordBoundaryState
|
||||
{
|
||||
const char *str;
|
||||
size_t len;
|
||||
size_t offset;
|
||||
bool init;
|
||||
bool prev_alnum;
|
||||
};
|
||||
|
||||
static size_t
|
||||
initcap_wbnext(void *state)
|
||||
{
|
||||
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
|
||||
|
||||
while (wbstate->offset < wbstate->len &&
|
||||
wbstate->str[wbstate->offset] != '\0')
|
||||
{
|
||||
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
|
||||
wbstate->offset);
|
||||
bool curr_alnum = pg_u_isalnum(u, true);
|
||||
|
||||
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
|
||||
{
|
||||
size_t prev_offset = wbstate->offset;
|
||||
|
||||
wbstate->init = true;
|
||||
wbstate->offset += unicode_utf8len(u);
|
||||
wbstate->prev_alnum = curr_alnum;
|
||||
return prev_offset;
|
||||
}
|
||||
|
||||
wbstate->offset += unicode_utf8len(u);
|
||||
}
|
||||
|
||||
return wbstate->len;
|
||||
}
|
||||
|
||||
#ifdef USE_ICU
|
||||
|
||||
static void
|
||||
@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
icu_test_full(char *str)
|
||||
{
|
||||
char lower[BUFSZ];
|
||||
char title[BUFSZ];
|
||||
char upper[BUFSZ];
|
||||
char icu_lower[BUFSZ];
|
||||
char icu_title[BUFSZ];
|
||||
char icu_upper[BUFSZ];
|
||||
UErrorCode status;
|
||||
struct WordBoundaryState wbstate = {
|
||||
.str = str,
|
||||
.len = strlen(str),
|
||||
.offset = 0,
|
||||
.init = false,
|
||||
.prev_alnum = false,
|
||||
};
|
||||
|
||||
unicode_strlower(lower, BUFSZ, str, -1, true);
|
||||
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
|
||||
unicode_strupper(upper, BUFSZ, str, -1, true);
|
||||
status = U_ZERO_ERROR;
|
||||
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
|
||||
status = U_ZERO_ERROR;
|
||||
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
|
||||
status = U_ZERO_ERROR;
|
||||
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
|
||||
|
||||
if (strcmp(lower, icu_lower) != 0)
|
||||
{
|
||||
printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
|
||||
icu_lower);
|
||||
exit(1);
|
||||
}
|
||||
if (strcmp(title, icu_title) != 0)
|
||||
{
|
||||
printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
|
||||
icu_title);
|
||||
exit(1);
|
||||
}
|
||||
if (strcmp(upper, icu_upper) != 0)
|
||||
{
|
||||
printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
|
||||
icu_upper);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Exhaustively compare case mappings with the results from ICU.
|
||||
*/
|
||||
@ -64,6 +161,7 @@ test_icu(void)
|
||||
if (category != PG_U_UNASSIGNED)
|
||||
{
|
||||
uint8_t icu_category = u_charType(code);
|
||||
char code_str[5] = {0};
|
||||
|
||||
if (icu_category == PG_U_UNASSIGNED)
|
||||
{
|
||||
@ -72,6 +170,9 @@ test_icu(void)
|
||||
}
|
||||
|
||||
icu_test_simple(code);
|
||||
unicode_to_utf8(code, (unsigned char *) code_str);
|
||||
icu_test_full(code_str);
|
||||
|
||||
successful++;
|
||||
}
|
||||
}
|
||||
@ -86,7 +187,7 @@ test_icu(void)
|
||||
#endif
|
||||
|
||||
static void
|
||||
test_strlower(const char *test_string, const char *expected)
|
||||
test_convert(TestFunc tfunc, const char *test_string, const char *expected)
|
||||
{
|
||||
size_t src1len = strlen(test_string);
|
||||
size_t src2len = -1; /* NUL-terminated */
|
||||
@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
|
||||
|
||||
/* neither source nor destination are NUL-terminated */
|
||||
memset(dst1, 0x7F, dst1len);
|
||||
needed = unicode_strlower(dst1, dst1len, src1, src1len);
|
||||
needed = tfunc(dst1, dst1len, src1, src1len);
|
||||
if (needed != strlen(expected))
|
||||
{
|
||||
printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
|
||||
printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
|
||||
test_string, needed, strlen(expected));
|
||||
exit(1);
|
||||
}
|
||||
if (memcmp(dst1, expected, dst1len) != 0)
|
||||
@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
|
||||
|
||||
/* destination is NUL-terminated and source is not */
|
||||
memset(dst2, 0x7F, dst2len);
|
||||
needed = unicode_strlower(dst2, dst2len, src1, src1len);
|
||||
needed = tfunc(dst2, dst2len, src1, src1len);
|
||||
if (needed != strlen(expected))
|
||||
{
|
||||
printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
|
||||
printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
|
||||
test_string, needed, strlen(expected));
|
||||
exit(1);
|
||||
}
|
||||
if (strcmp(dst2, expected) != 0)
|
||||
@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
|
||||
|
||||
/* source is NUL-terminated and destination is not */
|
||||
memset(dst1, 0x7F, dst1len);
|
||||
needed = unicode_strlower(dst1, dst1len, src2, src2len);
|
||||
needed = tfunc(dst1, dst1len, src2, src2len);
|
||||
if (needed != strlen(expected))
|
||||
{
|
||||
printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
|
||||
test_string, needed, strlen(expected));
|
||||
printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
|
||||
exit(1);
|
||||
}
|
||||
@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
|
||||
|
||||
/* both source and destination are NUL-terminated */
|
||||
memset(dst2, 0x7F, dst2len);
|
||||
needed = unicode_strlower(dst2, dst2len, src2, src2len);
|
||||
needed = tfunc(dst2, dst2len, src2, src2len);
|
||||
if (needed != strlen(expected))
|
||||
{
|
||||
printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
|
||||
printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
|
||||
test_string, needed, strlen(expected));
|
||||
exit(1);
|
||||
}
|
||||
if (strcmp(dst2, expected) != 0)
|
||||
@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected)
|
||||
free(dst2);
|
||||
}
|
||||
|
||||
/*
 * TestFunc wrapper: lowercase src with full (special casing) mappings
 * enabled and return whatever unicode_strlower() returns.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	use_full = true;

	return unicode_strlower(dst, dstsize, src, srclen, use_full);
}
|
||||
|
||||
static size_t
|
||||
tfunc_title(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen)
|
||||
{
|
||||
struct WordBoundaryState wbstate = {
|
||||
.str = src,
|
||||
.len = srclen,
|
||||
.offset = 0,
|
||||
.init = false,
|
||||
.prev_alnum = false,
|
||||
};
|
||||
|
||||
return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
|
||||
&wbstate);
|
||||
}
|
||||
|
||||
/*
 * TestFunc wrapper: uppercase src with full (special casing) mappings
 * enabled and return whatever unicode_strupper() returns.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	const bool	use_full = true;

	return unicode_strupper(dst, dstsize, src, srclen, use_full);
}
|
||||
|
||||
|
||||
static void
|
||||
test_convert_case()
|
||||
{
|
||||
/* test string with no case changes */
|
||||
test_strlower("√∞", "√∞");
|
||||
test_convert(tfunc_lower, "√∞", "√∞");
|
||||
/* test adjust-to-cased behavior */
|
||||
test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
|
||||
/* test string with case changes */
|
||||
test_strlower("ABC", "abc");
|
||||
test_convert(tfunc_upper, "abc", "ABC");
|
||||
/* test string with case changes and byte length changes */
|
||||
test_strlower("ȺȺȺ", "ⱥⱥⱥ");
|
||||
test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
|
||||
/* test special case conversions */
|
||||
test_convert(tfunc_upper, "ß", "SS");
|
||||
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
|
||||
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
|
||||
/* test final sigma */
|
||||
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
|
||||
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
|
||||
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
|
||||
|
||||
#ifdef USE_ICU
|
||||
icu_test_full("");
|
||||
icu_test_full("ȺȺȺ");
|
||||
icu_test_full("ßßß");
|
||||
icu_test_full("√∞");
|
||||
icu_test_full("a b");
|
||||
icu_test_full("abc 123xyz");
|
||||
icu_test_full("σςΣ ΣΣΣ");
|
||||
icu_test_full("ıiIİ");
|
||||
/* test <alpha><iota_subscript><acute> */
|
||||
icu_test_full("\u0391\u0345\u0301");
|
||||
#endif
|
||||
|
||||
printf("case_test: convert_case: success\n");
|
||||
}
|
||||
@ -182,6 +342,22 @@ test_convert_case()
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
#ifdef USE_ICU
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
/*
|
||||
* Disable ICU's word break adjustment for titlecase to match the expected
|
||||
* behavior of unicode_strtitle().
|
||||
*/
|
||||
casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
printf("case_test: failure opening UCaseMap: %s\n",
|
||||
u_errorName(status));
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
|
||||
#ifdef USE_ICU
|
||||
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
|
||||
@ -191,5 +367,9 @@ main(int argc, char **argv)
|
||||
#endif
|
||||
|
||||
test_convert_case();
|
||||
|
||||
#ifdef USE_ICU
|
||||
ucasemap_close(casemap);
|
||||
#endif
|
||||
exit(0);
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
# Generate Unicode character case mappings. Does not include tailoring
|
||||
# or locale-specific mappings.
|
||||
#
|
||||
# Input: UnicodeData.txt
|
||||
# Input: SpecialCasing.txt UnicodeData.txt
|
||||
# Output: unicode_case_table.h
|
||||
#
|
||||
# Copyright (c) 2000-2025, PostgreSQL Global Development Group
|
||||
@ -21,6 +21,10 @@ GetOptions('outdir:s' => \$output_path);
|
||||
|
||||
my $output_table_file = "$output_path/unicode_case_table.h";
|
||||
|
||||
# The maximum number of codepoints that can result from case mapping
|
||||
# of a single character. See Unicode section 5.18 "Case Mappings".
|
||||
my $MAX_CASE_EXPANSION = 3;
|
||||
|
||||
my $FH;
|
||||
|
||||
my %simple = ();
|
||||
@ -51,6 +55,98 @@ while (my $line = <$FH>)
|
||||
}
|
||||
close $FH;
|
||||
|
||||
# Map for special casing rules that aren't represented in the simple
|
||||
# mapping. Language-sensitive mappings are not supported.
|
||||
#
|
||||
# See https://www.unicode.org/reports/tr44/#SpecialCasing.txt, or the
|
||||
# SpecialCasing.txt file itself for details.
|
||||
|
||||
# for now, only Final_Sigma is supported
|
||||
my %condition_map = (Final_Sigma => 'PG_U_FINAL_SIGMA');
|
||||
|
||||
my %special = ();
|
||||
open($FH, '<', "$output_path/SpecialCasing.txt")
|
||||
or die "Could not open $output_path/SpecialCasing.txt: $!.";
|
||||
while (my $line = <$FH>)
|
||||
{
|
||||
# language-sensitive mappings not supported
|
||||
last if $line =~ /\# Language-Sensitive Mappings/;
|
||||
|
||||
# remove comments
|
||||
$line =~ s/^(.*?)#.*$/$1/s;
|
||||
|
||||
# ignore empty lines
|
||||
next unless $line =~ /;/;
|
||||
|
||||
my @elts = split /;/, $line;
|
||||
my $code = hex($elts[0]);
|
||||
|
||||
# Codepoint may map to multiple characters when converting
|
||||
# case. Split each mapping on whitespace and extract the
|
||||
# hexadecimal into an array of codepoints.
|
||||
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
|
||||
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
|
||||
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
|
||||
my @conditions = map {
|
||||
# supporting negated conditions may require storing a
|
||||
# mask of relevant conditions for a given rule to differentiate
|
||||
# between lack of a condition and a negated condition
|
||||
die "negated conditions not supported" if /^Not_/;
|
||||
$condition_map{$_} || die "unrecognized condition: $_"
|
||||
} (grep /\w+/, (split /\s+/, $elts[4]));
|
||||
|
||||
my $cond_str = (join '|', @conditions) || '0';
|
||||
|
||||
# if empty, create a self-mapping
|
||||
push @lower, $code if (scalar @lower == 0);
|
||||
push @title, $code if (scalar @title == 0);
|
||||
push @upper, $code if (scalar @upper == 0);
|
||||
|
||||
# none should map to more than 3 codepoints
|
||||
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
|
||||
if (scalar @lower) > $MAX_CASE_EXPANSION;
|
||||
die "titlecase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'"
|
||||
if (scalar @title) > $MAX_CASE_EXPANSION;
|
||||
die "uppercase expansion for 0x$elts[0] exceeds maximum: '$elts[3]'"
|
||||
if (scalar @upper) > $MAX_CASE_EXPANSION;
|
||||
|
||||
# pad arrays to a fixed length of 3
|
||||
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
|
||||
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
|
||||
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
|
||||
|
||||
# Characters with special mappings may not have simple mappings;
|
||||
# ensure that an entry exists.
|
||||
$simple{$code} ||= {
|
||||
Simple_Lowercase => $code,
|
||||
Simple_Titlecase => $code,
|
||||
Simple_Uppercase => $code
|
||||
};
|
||||
|
||||
# Multiple special case rules for a single codepoint could be
|
||||
# supported by making several entries for each codepoint, and have
|
||||
# the simple mapping point to the first entry. The caller could
|
||||
# scan forward looking for an entry that matches the conditions,
|
||||
# or fall back to the normal behavior.
|
||||
die "multiple special case mappings not supported"
|
||||
if defined $special{$code};
|
||||
|
||||
$special{$code} = {
|
||||
Lowercase => \@lower,
|
||||
Titlecase => \@title,
|
||||
Uppercase => \@upper,
|
||||
Conditions => $cond_str
|
||||
};
|
||||
}
|
||||
close $FH;
|
||||
|
||||
# assign sequential array indexes to the special mappings
|
||||
my $special_idx = 0;
|
||||
foreach my $code (sort { $a <=> $b } (keys %special))
|
||||
{
|
||||
$special{$code}{Index} = $special_idx++;
|
||||
}
|
||||
|
||||
# Start writing out the output files
|
||||
open my $OT, '>', $output_table_file
|
||||
or die "Could not open output file $output_table_file: $!\n";
|
||||
@ -63,6 +159,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
|
||||
$num_simple++ unless $code < 0x80;
|
||||
}
|
||||
|
||||
my $num_special = scalar(keys %special) + 1;
|
||||
|
||||
print $OT <<"EOS";
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
@ -86,6 +184,19 @@ print $OT <<"EOS";
|
||||
#include "common/unicode_case.h"
|
||||
#include "mb/pg_wchar.h"
|
||||
|
||||
/*
|
||||
* The maximum number of codepoints that can result from case mapping
|
||||
* of a single character. See Unicode section 5.18 "Case Mappings".
|
||||
*/
|
||||
#define MAX_CASE_EXPANSION 3
|
||||
|
||||
/*
|
||||
* Case mapping condition flags. For now, only Final_Sigma is supported.
|
||||
*
|
||||
* See Unicode Context Specification for Casing.
|
||||
*/
|
||||
#define PG_U_FINAL_SIGMA (1 << 0)
|
||||
|
||||
typedef enum
|
||||
{
|
||||
CaseLower = 0,
|
||||
@ -94,12 +205,47 @@ typedef enum
|
||||
NCaseKind
|
||||
} CaseKind;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
pg_wchar codepoint; /* Unicode codepoint */
|
||||
int16 conditions;
|
||||
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
|
||||
} pg_special_case;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
pg_wchar codepoint; /* Unicode codepoint */
|
||||
pg_wchar simplemap[NCaseKind];
|
||||
const pg_special_case *special_case;
|
||||
} pg_case_map;
|
||||
|
||||
/*
|
||||
* Special case mappings that aren't representable in the simple map.
|
||||
* Entries are referenced from simple_case_map.
|
||||
*/
|
||||
static const pg_special_case special_case[$num_special] =
|
||||
{
|
||||
EOS
|
||||
|
||||
foreach my $code (sort { $a <=> $b } (keys %special))
|
||||
{
|
||||
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
|
||||
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
|
||||
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
|
||||
my $lower = join ", ",
|
||||
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
|
||||
my $title = join ", ",
|
||||
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
|
||||
my $upper = join ", ",
|
||||
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
|
||||
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
|
||||
printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
|
||||
}
|
||||
|
||||
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
|
||||
print $OT <<"EOS";
|
||||
};
|
||||
|
||||
/*
|
||||
* Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
|
||||
* sparse for higher codepoints (requiring scan or binary search).
|
||||
@ -114,8 +260,10 @@ for (my $code = 0; $code < 0x80; $code++)
|
||||
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
|
||||
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
|
||||
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
|
||||
die "unexpected special case for code $code"
|
||||
if defined $special{$code};
|
||||
printf $OT
|
||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
|
||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
|
||||
$code, $lc, $tc, $uc;
|
||||
}
|
||||
printf $OT "\n";
|
||||
@ -126,9 +274,14 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
|
||||
next unless $code >= 0x80; # already output above
|
||||
|
||||
my $map = $simple{$code};
|
||||
my $special_case = "NULL";
|
||||
if (exists $special{$code})
|
||||
{
|
||||
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
|
||||
}
|
||||
printf $OT
|
||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
|
||||
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
|
||||
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
|
||||
$map->{Simple_Uppercase};
|
||||
$map->{Simple_Uppercase}, $special_case;
|
||||
}
|
||||
print $OT "};\n";
|
||||
|
@ -11,7 +11,7 @@ endif
|
||||
|
||||
# These files are part of the Unicode Character Database. Download them on
|
||||
# demand.
|
||||
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'UnicodeData.txt']
|
||||
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
|
||||
url = unicode_baseurl.format(UNICODE_VERSION, f)
|
||||
target = custom_target(f,
|
||||
output: f,
|
||||
@ -26,7 +26,7 @@ update_unicode_targets = []
|
||||
|
||||
update_unicode_targets += \
|
||||
custom_target('unicode_case_table.h',
|
||||
input: [unicode_data['UnicodeData.txt']],
|
||||
input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
|
||||
output: ['unicode_case_table.h'],
|
||||
command: [
|
||||
perl, files('generate-unicode_case_table.pl'),
|
||||
|
@ -17,12 +17,15 @@
|
||||
|
||||
#include "common/unicode_case.h"
|
||||
#include "common/unicode_case_table.h"
|
||||
#include "common/unicode_category.h"
|
||||
#include "mb/pg_wchar.h"
|
||||
|
||||
static const pg_case_map *find_case_map(pg_wchar ucs);
|
||||
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
CaseKind str_casekind, WordBoundaryNext wbnext,
|
||||
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
|
||||
void *wbstate);
|
||||
static bool check_special_conditions(int conditions, const char *str,
|
||||
size_t len, size_t offset);
|
||||
|
||||
pg_wchar
|
||||
unicode_lowercase_simple(pg_wchar code)
|
||||
@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
|
||||
*
|
||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||
* required buffer size before allocating.
|
||||
*
|
||||
* If full is true, use special case mappings if available and if the
|
||||
* conditions are satisfied.
|
||||
*/
|
||||
size_t
|
||||
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
bool full)
|
||||
{
|
||||
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
|
||||
return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||
* required buffer size before allocating.
|
||||
*
|
||||
* If full is true, use special case mappings if available and if the
|
||||
* conditions are satisfied. Otherwise, use only simple mappings and use
|
||||
* uppercase instead of titlecase.
|
||||
*
|
||||
* Titlecasing requires knowledge about word boundaries, which is provided by
|
||||
* the callback wbnext. A word boundary is the offset of the start of a word
|
||||
* or the offset of the character immediately following a word.
|
||||
@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
*/
|
||||
size_t
|
||||
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
WordBoundaryNext wbnext, void *wbstate)
|
||||
bool full, WordBoundaryNext wbnext, void *wbstate)
|
||||
{
|
||||
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
|
||||
return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
|
||||
wbstate);
|
||||
}
|
||||
|
||||
@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
*
|
||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||
* required buffer size before allocating.
|
||||
*
|
||||
* If full is true, use special case mappings if available and if the
|
||||
* conditions are satisfied.
|
||||
*/
|
||||
size_t
|
||||
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
bool full)
|
||||
{
|
||||
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
|
||||
return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Implement Unicode Default Case Conversion algorithm.
|
||||
*
|
||||
* If str_casekind is CaseLower or CaseUpper, map each character in the string
|
||||
* for which a mapping is available.
|
||||
*
|
||||
* If str_casekind is CaseTitle, maps characters found on a word boundary to
|
||||
* uppercase and other characters to lowercase.
|
||||
* titlecase (or uppercase if full is false) and other characters to
|
||||
* lowercase. NB: does not currently implement the Unicode behavior in which
|
||||
* the word boundary is adjusted to the next Cased character. That behavior
|
||||
* could be implemented as an option, but it doesn't match the default
|
||||
* behavior of ICU, nor does it match the documented behavior of INITCAP().
|
||||
*
|
||||
* If full is true, use special mappings for relevant characters, which can
|
||||
* map a single codepoint to multiple codepoints, or depend on conditions.
|
||||
*/
|
||||
static size_t
|
||||
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
|
||||
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
|
||||
void *wbstate)
|
||||
{
|
||||
/* character CaseKind varies while titlecasing */
|
||||
CaseKind chr_casekind = str_casekind;
|
||||
@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
|
||||
int u1len = unicode_utf8len(u1);
|
||||
const pg_case_map *casemap = find_case_map(u1);
|
||||
const pg_special_case *special = NULL;
|
||||
|
||||
if (str_casekind == CaseTitle)
|
||||
{
|
||||
if (srcoff == boundary)
|
||||
{
|
||||
chr_casekind = CaseUpper;
|
||||
chr_casekind = full ? CaseTitle : CaseUpper;
|
||||
boundary = wbnext(wbstate);
|
||||
}
|
||||
else
|
||||
chr_casekind = CaseLower;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find special case that matches the conditions, if any.
|
||||
*
|
||||
* Note: only a single special mapping per codepoint is currently
|
||||
* supported, though Unicode allows for multiple special mappings for
|
||||
* a single codepoint.
|
||||
*/
|
||||
if (full && casemap && casemap->special_case)
|
||||
{
|
||||
int16 conditions = casemap->special_case->conditions;
|
||||
|
||||
Assert(casemap->special_case->codepoint == u1);
|
||||
if (check_special_conditions(conditions, src, srclen, srcoff))
|
||||
special = casemap->special_case;
|
||||
}
|
||||
|
||||
/* perform mapping, update result_len, and write to dst */
|
||||
if (casemap)
|
||||
if (special)
|
||||
{
|
||||
for (int i = 0; i < MAX_CASE_EXPANSION; i++)
|
||||
{
|
||||
pg_wchar u2 = special->map[chr_casekind][i];
|
||||
size_t u2len = unicode_utf8len(u2);
|
||||
|
||||
if (u2 == '\0')
|
||||
break;
|
||||
|
||||
if (result_len + u2len <= dstsize)
|
||||
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
|
||||
|
||||
result_len += u2len;
|
||||
}
|
||||
}
|
||||
else if (casemap)
|
||||
{
|
||||
pg_wchar u2 = casemap->simplemap[chr_casekind];
|
||||
pg_wchar u2len = unicode_utf8len(u2);
|
||||
@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
return result_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that the condition matches Final_Sigma, described in Unicode Table
|
||||
* 3-17. The character at the given offset must be directly preceded by a
|
||||
* Cased character, and must not be directly followed by a Cased character.
|
||||
*
|
||||
* Case_Ignorable characters are ignored. NB: some characters may be both
|
||||
* Cased and Case_Ignorable, in which case they are ignored.
|
||||
*/
|
||||
static bool
|
||||
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
|
||||
{
|
||||
/* the start of the string is not preceded by a Cased character */
|
||||
if (offset == 0)
|
||||
return false;
|
||||
|
||||
/* iterate backwards, looking for Cased character */
|
||||
for (int i = offset - 1; i >= 0; i--)
|
||||
{
|
||||
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
|
||||
{
|
||||
pg_wchar curr = utf8_to_unicode(str + i);
|
||||
|
||||
if (pg_u_prop_case_ignorable(curr))
|
||||
continue;
|
||||
else if (pg_u_prop_cased(curr))
|
||||
break;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
else if ((str[i] & 0xC0) == 0x80)
|
||||
continue;
|
||||
|
||||
Assert(false); /* invalid UTF-8 */
|
||||
}
|
||||
|
||||
/* end of string is not followed by a Cased character */
|
||||
if (offset == len)
|
||||
return true;
|
||||
|
||||
/* iterate forwards, looking for Cased character */
|
||||
for (int i = offset + 1; i < len && str[i] != '\0'; i++)
|
||||
{
|
||||
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
|
||||
{
|
||||
pg_wchar curr = utf8_to_unicode(str + i);
|
||||
|
||||
if (pg_u_prop_case_ignorable(curr))
|
||||
continue;
|
||||
else if (pg_u_prop_cased(curr))
|
||||
return false;
|
||||
else
|
||||
break;
|
||||
}
|
||||
else if ((str[i] & 0xC0) == 0x80)
|
||||
continue;
|
||||
|
||||
Assert(false); /* invalid UTF-8 */
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
check_special_conditions(int conditions, const char *str, size_t len,
|
||||
size_t offset)
|
||||
{
|
||||
if (conditions == 0)
|
||||
return true;
|
||||
else if (conditions == PG_U_FINAL_SIGMA)
|
||||
return check_final_sigma((unsigned char *) str, len, offset);
|
||||
|
||||
/* no other conditions supported */
|
||||
Assert(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* find entry in simple case map, if any */
|
||||
static const pg_case_map *
|
||||
find_case_map(pg_wchar ucs)
|
||||
|
@ -22,11 +22,11 @@ pg_wchar unicode_lowercase_simple(pg_wchar code);
|
||||
pg_wchar unicode_titlecase_simple(pg_wchar code);
|
||||
pg_wchar unicode_uppercase_simple(pg_wchar code);
|
||||
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen);
|
||||
ssize_t srclen, bool full);
|
||||
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, WordBoundaryNext wbnext,
|
||||
void *wbstate);
|
||||
ssize_t srclen, bool full,
|
||||
WordBoundaryNext wbnext, void *wbstate);
|
||||
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen);
|
||||
ssize_t srclen, bool full);
|
||||
|
||||
#endif /* UNICODE_CASE_H */
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -3754,6 +3754,7 @@ pg_sha256_ctx
|
||||
pg_sha384_ctx
|
||||
pg_sha512_ctx
|
||||
pg_snapshot
|
||||
pg_special_case
|
||||
pg_stack_base_t
|
||||
pg_time_t
|
||||
pg_time_usec_t
|
||||
|
Reference in New Issue
Block a user