diff --git a/mysql-test/r/ctype_ldml.result b/mysql-test/r/ctype_ldml.result index acb92a8363b..d333c03143b 100644 --- a/mysql-test/r/ctype_ldml.result +++ b/mysql-test/r/ctype_ldml.result @@ -460,6 +460,7 @@ utf8mb4_test_400_ci utf8mb4 328 8 latin1_test2 latin1 332 1 utf8_bengali_standard_ci utf8 336 8 utf8_bengali_traditional_ci utf8 337 8 +utf8_implicit_weights_ci utf8 338 8 utf8_phone_ci utf8 352 8 utf8_test_ci utf8 353 8 utf8_5624_1 utf8 354 8 @@ -1156,3 +1157,25 @@ Warning 1273 Expansion too long: 'a\u002Daaaaaa10' # # Search for occurrences of [ERROR] Syntax error at '[strength tertiary]' Occurances : 2 +# +# MDEV-8686 A user defined collation utf8_confusables doesn't work +# +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8 COLLATE utf8_implicit_weights_ci); +INSERT INTO t1 VALUES ('a'),('b'),('c'); +INSERT INTO t1 VALUES (_ucs2 0x1500),(_ucs2 0x1501); +INSERT INTO t1 VALUES (_ucs2 0x3400),(_ucs2 0x3560),(_ucs2 0x3561),(_ucs2 0x3600); +INSERT INTO t1 VALUES (_ucs2 0x3700),(_ucs2 0x3701); +SELECT HEX(CONVERT(a USING ucs2)) AS ch, HEX(WEIGHT_STRING(a)) AS w, HEX(WEIGHT_STRING(a COLLATE utf8_unicode_ci)) AS ducet FROM t1 ORDER BY a,ch; +ch w ducet +0061 0E33 0E33 +3561 0E33 FB80B561 +0063 0E60 0E60 +1500 0E60 1BAD +0062 FB80B400 0E4A +3400 FB80B400 FB80B400 +3560 FB80B560 FB80B560 +1501 FB80B600 1BAE +3600 FB80B600 FB80B600 +3700 FB80B700 FB80B700 +3701 FB80B700 FB80B701 +DROP TABLE t1; diff --git a/mysql-test/std_data/Index.xml b/mysql-test/std_data/Index.xml index 5139db0554a..b66fdfee55c 100644 --- a/mysql-test/std_data/Index.xml +++ b/mysql-test/std_data/Index.xml @@ -1117,6 +1117,16 @@ + + + \u3400b + a\u3561 + c\u1500 + \u3600\u1501 + \u3700\u3701 + + + diff --git a/mysql-test/t/ctype_ldml.test b/mysql-test/t/ctype_ldml.test index 1b9d7c9d4ad..1ea8002a2eb 100644 --- a/mysql-test/t/ctype_ldml.test +++ b/mysql-test/t/ctype_ldml.test @@ -397,3 +397,15 @@ perl; print "Occurances : $count_error\n"; close(FILE); EOF + + +--echo # +--echo # MDEV-8686 A user 
defined collation utf8_confusables doesn't work +--echo # +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8 COLLATE utf8_implicit_weights_ci); +INSERT INTO t1 VALUES ('a'),('b'),('c'); +INSERT INTO t1 VALUES (_ucs2 0x1500),(_ucs2 0x1501); +INSERT INTO t1 VALUES (_ucs2 0x3400),(_ucs2 0x3560),(_ucs2 0x3561),(_ucs2 0x3600); +INSERT INTO t1 VALUES (_ucs2 0x3700),(_ucs2 0x3701); +SELECT HEX(CONVERT(a USING ucs2)) AS ch, HEX(WEIGHT_STRING(a)) AS w, HEX(WEIGHT_STRING(a COLLATE utf8_unicode_ci)) AS ducet FROM t1 ORDER BY a,ch; +DROP TABLE t1; diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index cebb723cebb..eef5c950684 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -31574,6 +31574,26 @@ my_uca_implicit_weight_base(my_wc_t code) } +static inline void +my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level) +{ /* Store the 0-terminated implicit weight sequence of "code" for weight level "level" into "to". The primary level stores three elements (two weights plus the trailing 0); the other levels store two (one weight plus the trailing 0). */ + switch (level) { + case 1: to[0]= 0x0020; to[1]= 0; return; /* Secondary level */ + case 2: to[0]= 0x0002; to[1]= 0; return; /* Tertiary level */ + case 3: to[0]= 0x0001; to[1]= 0; return; /* Quaternary level */ + default: + DBUG_ASSERT(0); /* Unknown level: fall through and use the primary weights */ + case 0: + break; + } + /* Primary level: reached only for level 0 (or an unknown level when DBUG_ASSERT does not abort); the non-primary levels returned above so that their weights are not overwritten here */ + to[0]= (uint16) ((code >> 15) + my_uca_implicit_weight_base(code)); + to[1]= (uint16) ((code & 0x7FFF) | 0x8000); + to[2]= 0; +} + +/****************************************************************/ + /** Return an implicit UCA weight for the primary level. Used for characters that do not have assigned UCA weights.
@@ -33583,6 +33603,7 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst, { size_t chlen; const uint16 *from= NULL; + uint16 implicit_weights[3]; /* Scratch buffer: two implicit weights plus trailing 0 */ for (chlen= len; chlen > 1; chlen--) { @@ -33597,6 +33618,11 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst, if (!from) { from= my_char_weight_addr(dst, *str); + if (!from) /* my_char_weight_addr() returned NULL: fall back to implicit weights generated into the local buffer */ + { + from= implicit_weights; + my_uca_implicit_weight_put(implicit_weights, *str, dst->levelno); + } str++; len--; } @@ -33649,6 +33675,25 @@ my_uca_copy_page(MY_CHARSET_LOADER *loader, } +static my_bool +my_uca_generate_implicit_page(MY_CHARSET_LOADER *loader, + MY_UCA_WEIGHT_LEVEL *dst, + uint page) +{ /* Allocate dst->weights[page] with loader->once_alloc and fill the slots of all 256 characters of the page via my_uca_implicit_weight_put(). Returns TRUE if the allocation fails, FALSE otherwise. */ + uint chc, size= 256 * dst->lengths[page] * sizeof(uint16); /* 256 characters, dst->lengths[page] uint16 elements each */ + if (!(dst->weights[page]= (uint16 *) (loader->once_alloc)(size))) + return TRUE; + + memset(dst->weights[page], 0, size); /* Start from an all-zero page */ + for (chc= 0 ; chc < 256; chc++) + { + uint16 *w= dst->weights[page] + chc * dst->lengths[page]; /* First slot of character number chc on this page */ + my_uca_implicit_weight_put(w, (page << 8) + chc, dst->levelno); /* Character code is page * 256 + chc. NOTE(review): this can store up to 3 elements at "w"; presumably dst->lengths[page] >= 3 here - confirm against the caller */ + } + return FALSE; +} + + static my_bool apply_shift(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, MY_COLL_RULE *r, int level, @@ -33766,7 +33811,7 @@ my_uca_init_one_contraction(MY_CONTRACTIONS *contractions, static my_bool apply_one_rule(MY_CHARSET_LOADER *loader, - MY_COLL_RULES *rules, MY_COLL_RULE *r, int level, + MY_COLL_RULES *rules, MY_COLL_RULE *r, MY_UCA_WEIGHT_LEVEL *dst) { size_t nweights; @@ -33842,7 +33887,7 @@ apply_one_rule(MY_CHARSET_LOADER *loader, } /* Apply level difference. */ - return apply_shift(loader, rules, r, level, to, nweights); + return apply_shift(loader, rules, r, dst->levelno, to, nweights); } @@ -33875,8 +33920,92 @@ check_rules(MY_CHARSET_LOADER *loader, } +/** + Calculates how many weights are needed on the given page. + + In case of implicit weights, the function returns 3: + two implicit weights plus trailing 0. + + Implicit weights can appear if we do something like this (a reset to \u3400 followed by a rule for "a"): + \u3400 + a + I.e.
we reset to a character that does not have an explicit weight (U+3400), + and then reorder another character relative to it. +*/ +static uint my_weight_size_on_page(const MY_UCA_WEIGHT_LEVEL *src, uint page) +{ + return src->lengths[page] ? src->lengths[page] : 3; /* lengths[page]==0 means implicit weights: two weights plus trailing 0 */ +} + + +/** + Generate default weights for a page: + - copy default weights from "src", or + - generate implicit weights algorithmically. + Note, some of these default weights will change later, + during an apply_one_rule() call. +*/ static my_bool -init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, +my_uca_generate_page(MY_CHARSET_LOADER *loader, + MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src, + uint pageno) +{ + DBUG_ASSERT(dst->levelno == src->levelno); /* "dst" and "src" must describe the same weight level */ + return src->lengths[pageno] ? + /* + A page with explicit weights and some special rules. + Copy all weights from the page in "src". + */ + my_uca_copy_page(loader, src, dst, pageno) : + /* + A page with implicit weights and some special rules. + Generate default weights for all characters on this page + algorithmically now, at initialization time. + */ + my_uca_generate_implicit_page(loader, dst, pageno); +} + + +/** + Find all pages that we have special rules on and + populate default (explicit or implicit) weights for these pages. +*/ +static my_bool +my_uca_generate_pages(MY_CHARSET_LOADER *loader, + MY_UCA_WEIGHT_LEVEL *dst, + const MY_UCA_WEIGHT_LEVEL *src, + uint npages) +{ + uint page; + for (page= 0; page < npages; page++) + { + if (dst->weights[page]) + { + /* A page with explicit weights with no special rules */ + continue; + } + + if (!dst->lengths[page]) + { + /* + A page with implicit weights with no special rules. + Keep dst->weights[page]==NULL and dst->lengths[page]==0. + Weights for this page will be generated at run time algorithmically, + using my_uca_scanner_next_implicit(). + */ + continue; + } + + /* Found a page with some special rules.
*/ + if (my_uca_generate_page(loader, dst, src, page)) + return TRUE; /* Stop at the first page that could not be generated */ + } + return FALSE; +} + + +static my_bool +init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src) { MY_COLL_RULE *r, *rlast; @@ -33916,9 +34045,15 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, } else { - uint pageb= (r->base[0] >> 8); - if (dst->lengths[pagec] < src->lengths[pageb]) - dst->lengths[pagec]= src->lengths[pageb]; + /* + Not an expansion and not a contraction. + The page corresponding to r->curr[0] in "dst" + will need at least as many weights + as r->base[0] has in "src". + */ + uint wsize= my_weight_size_on_page(src, r->base[0] >> 8); /* my_weight_size_on_page() yields 3 when the base page has implicit weights */ + if (dst->lengths[pagec] < wsize) + dst->lengths[pagec]= wsize; } dst->weights[pagec]= NULL; /* Mark that we'll overwrite this page */ } @@ -33928,18 +34063,8 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, ncontractions += src->contractions.nitems; - /* Allocate pages that we'll overwrite and copy default weights */ - for (i= 0; i < npages; i++) - { - my_bool rc; - /* - Don't touch pages with lengths[i]==0, they have implicit weights - calculated algorithmically.
- */ - if (!dst->weights[i] && dst->lengths[i] && - (rc= my_uca_copy_page(loader, src, dst, i))) - return rc; - } + if ((my_uca_generate_pages(loader, dst, src, npages))) /* Allocate the pages that the rules will overwrite and populate their default (copied or implicit) weights */ + return TRUE; if (ncontractions) { @@ -33957,7 +34082,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, */ for (r= rules->rule; r < rlast; r++) { - if (apply_one_rule(loader, rules, r, level, dst)) + if (apply_one_rule(loader, rules, r, dst)) return TRUE; } @@ -34040,7 +34165,7 @@ create_tailoring(struct charset_info_st *cs, cs->caseinfo= &my_unicase_default; } - if ((rc= init_weight_level(loader, &rules, 0, + if ((rc= init_weight_level(loader, &rules, &new_uca.level[0], &src_uca->level[0]))) goto ex; @@ -34103,7 +34228,7 @@ create_tailoring_multilevel(struct charset_info_st *cs, for (i= 0; i != num_level; i++) { - if ((rc= init_weight_level(loader, &rules, i, + if ((rc= init_weight_level(loader, &rules, &new_uca.level[i], &src_uca->level[i]))) goto ex; }