From 6bc10f8026e691444f0d2e857a5d0de4b88fa11f Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Sun, 28 Nov 2021 16:48:13 +0400 Subject: [PATCH] MDEV-27009 Add UCA-14.0.0 collations - adding version-aware implicit weight handling Implicit weights are now handled according to the Unicode version (14.0.0 vs earlier versions). - Adding a new member MY_UCA_INFO::version - Copy logical positions and the version from "src_uca" to "new_uca" in create_tailoring(). - Adding a "const MY_UCA_INFO *" parameter to a few functions to know the Unicode version and generate implicit weights accordingly: - during the collation initialization time, to pages which are a mixture of explicit and implicit weights - during comparison time, for fully implicit pages --- include/m_ctype.h | 3 +++ strings/ctype-uca.c | 50 +++++++++++++++++++++++++++++++++------------ 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/include/m_ctype.h b/include/m_ctype.h index 83b12c3c4f6..1d0053815ea 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -168,6 +168,9 @@ typedef struct uca_info_st my_wc_t first_variable; my_wc_t last_variable; + /* Unicode version */ + uint version; + } MY_UCA_INFO; diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 06dfe73ab39..55a76be84bc 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -6580,6 +6580,9 @@ MY_UCA_INFO my_uca_v400= 0x0009, /* first_variable */ 0x2183, /* last_variable */ + + /* Misc */ + 400 /* Version */ }; /******************************************************/ @@ -30136,6 +30139,9 @@ MY_UCA_INFO my_uca_v520_th= 0x0009, /* first_variable if alt=non-ignorable: p != ignore */ 0x1D371, /* last_variable if alt=shifter: p,s,t == ignore */ + + /* Misc */ + 520 /* Version */ }; MY_UCA_INFO my_uca_v520= @@ -30183,6 +30189,9 @@ MY_UCA_INFO my_uca_v520= 0x0009, /* first_variable if alt=non-ignorable: p != ignore */ 0x1D371, /* last_variable if alt=shifter: p,s,t == ignore */ + + /* Misc */ + 520 /* Version */ }; @@ -31691,10 
+31700,11 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc, /****************************************************************/ static inline void -my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level) +my_uca_implicit_weight_put(uint16 *to, const MY_UCA_INFO *src_uca, + my_wc_t code, uint level) { MY_UCA_IMPLICIT_WEIGHT weight; - weight= my_uca_implicit_weight_on_level(520, code, level); + weight= my_uca_implicit_weight_on_level(src_uca->version, code, level); to[0]= weight.weight[0]; to[1]= weight.weight[1]; to[2]= 0; @@ -31718,7 +31728,8 @@ static inline int my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner) { my_wc_t wc= (scanner->page << 8) + scanner->code; - MY_UCA_IMPLICIT_WEIGHT weight= my_uca_520_implicit_weight_primary(wc); + uint version= scanner->cs->uca->version; + MY_UCA_IMPLICIT_WEIGHT weight= my_uca_implicit_weight_primary(version, wc); scanner->implicit[0]= weight.weight[1]; /* The second weight */ scanner->implicit[1]= 0; /* 0 terminator */ scanner->wbeg= scanner->implicit; @@ -32509,7 +32520,7 @@ typedef struct my_coll_rules_st { uint version; /* Unicode version, e.g. 400 or 520 */ uint strength; /* Number of levels */ - MY_UCA_INFO *uca; /* Unicode weight data */ + const MY_UCA_INFO *uca; /* Unicode weight data */ size_t nrules; /* Number of rules in the rule array */ size_t mrules; /* Number of allocated rules */ MY_COLL_RULE *rule; /* Rule array */ @@ -33199,8 +33210,8 @@ my_coll_rule_parse(MY_COLL_RULES *rules, Copies UCA weights for a given "uint" string to the given location. 
+ @dst destination UCA weight level data @src_uca source UCA weight data - @dst_uca destination UCA weight data @to destination address @to_length size of destination @nweights OUT number of weights put to "to" @@ -33212,6 +33223,7 @@ my_coll_rule_parse(MY_COLL_RULES *rules, static my_bool my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst, + const MY_UCA_INFO *src_uca, uint16 *to, size_t to_length, size_t *nweights, my_wc_t *str, size_t len) { @@ -33249,7 +33261,7 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst, if (!from) { from= implicit_weights; - my_uca_implicit_weight_put(implicit_weights, *str, dst->levelno); + my_uca_implicit_weight_put(implicit_weights, src_uca, *str, dst->levelno); } str++; len--; @@ -33306,6 +33318,7 @@ my_uca_copy_page(MY_CHARSET_LOADER *loader, static my_bool my_uca_generate_implicit_page(MY_CHARSET_LOADER *loader, MY_UCA_WEIGHT_LEVEL *dst, + const MY_UCA_INFO *src_uca, uint page) { uint chc, size= 256 * dst->lengths[page] * sizeof(uint16); @@ -33316,7 +33329,7 @@ my_uca_generate_implicit_page(MY_CHARSET_LOADER *loader, for (chc= 0 ; chc < 256; chc++) { uint16 *w= dst->weights[page] + chc * dst->lengths[page]; - my_uca_implicit_weight_put(w, (page << 8) + chc, dst->levelno); + my_uca_implicit_weight_put(w, src_uca, (page << 8) + chc, dst->levelno); } return FALSE; } @@ -33495,7 +33508,7 @@ apply_one_rule(MY_CHARSET_LOADER *loader, r->curr, (uint)nshift, r->with_context); /* Store weights of the "reset to" character */ dst->contractions.nitems--; /* Temporarily hide - it's incomplete */ - rc= my_char_weight_put(dst, + rc= my_char_weight_put(dst, rules->uca, to, MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE, &nweights, r->base, nreset); dst->contractions.nitems++; /* Activate, now it's complete */ @@ -33506,7 +33519,8 @@ apply_one_rule(MY_CHARSET_LOADER *loader, DBUG_ASSERT(dst->weights[pagec]); to= my_char_weight_addr(dst, r->curr[0]); /* Store weights of the "reset to" character */ - rc= my_char_weight_put(dst, to, dst->lengths[pagec], &nweights, 
r->base, nreset); + rc= my_char_weight_put(dst, rules->uca, + to, dst->lengths[pagec], &nweights, r->base, nreset); } if (rc) { @@ -33575,7 +33589,9 @@ static uint my_weight_size_on_page(const MY_UCA_WEIGHT_LEVEL *src, uint page) */ static my_bool my_uca_generate_page(MY_CHARSET_LOADER *loader, - MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src, + MY_UCA_WEIGHT_LEVEL *dst, + const MY_UCA_INFO *src_uca, + const MY_UCA_WEIGHT_LEVEL *src, uint pageno) { DBUG_ASSERT(dst->levelno == src->levelno); @@ -33590,7 +33606,7 @@ my_uca_generate_page(MY_CHARSET_LOADER *loader, Generate default weights for all characters on this page algorithmically now, at initialization time. */ - my_uca_generate_implicit_page(loader, dst, pageno); + my_uca_generate_implicit_page(loader, dst, src_uca, pageno); } @@ -33601,6 +33617,7 @@ my_uca_generate_page(MY_CHARSET_LOADER *loader, static my_bool my_uca_generate_pages(MY_CHARSET_LOADER *loader, MY_UCA_WEIGHT_LEVEL *dst, + const MY_UCA_INFO *src_uca, const MY_UCA_WEIGHT_LEVEL *src, uint npages) { @@ -33625,7 +33642,7 @@ my_uca_generate_pages(MY_CHARSET_LOADER *loader, } /* Found a page with some special rules. */ - if (my_uca_generate_page(loader, dst, src, page)) + if (my_uca_generate_page(loader, dst, src_uca, src, page)) return TRUE; } return FALSE; @@ -33691,7 +33708,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, ncontractions += (int)src->contractions.nitems; - if ((my_uca_generate_pages(loader, dst, src, (uint)npages))) + if ((my_uca_generate_pages(loader, dst, rules->uca, src, (uint)npages))) return TRUE; if (ncontractions) @@ -33860,6 +33877,13 @@ create_tailoring(struct charset_info_st *cs, } cs->levels_for_order= rules.strength ? rules.strength : 1; + /* + Copy logical positions, version, but don't copy levels - + they will be initialized below. 
+ */ + new_uca= *src_uca; + bzero(&new_uca.level, sizeof(new_uca.level)); + for (i= 0; i != cs->levels_for_order; i++) { if ((rc= (src_uca->level[i].maxchar == 0)))