mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
MDEV-8686 A user defined collation utf8_confusables doesn't work
The collation customization code for the UCA (Unicode Collation Algorithm) based collations now allows resetting to, and shifting of, characters with implicit weights. Previously reset/shift worked only for characters with explicit DUCET weights. An attempt to use reset/shift with a character with implicit weights made the server crash.
This commit is contained in:
@@ -460,6 +460,7 @@ utf8mb4_test_400_ci utf8mb4 328 8
|
|||||||
latin1_test2 latin1 332 1
|
latin1_test2 latin1 332 1
|
||||||
utf8_bengali_standard_ci utf8 336 8
|
utf8_bengali_standard_ci utf8 336 8
|
||||||
utf8_bengali_traditional_ci utf8 337 8
|
utf8_bengali_traditional_ci utf8 337 8
|
||||||
|
utf8_implicit_weights_ci utf8 338 8
|
||||||
utf8_phone_ci utf8 352 8
|
utf8_phone_ci utf8 352 8
|
||||||
utf8_test_ci utf8 353 8
|
utf8_test_ci utf8 353 8
|
||||||
utf8_5624_1 utf8 354 8
|
utf8_5624_1 utf8 354 8
|
||||||
@@ -1156,3 +1157,25 @@ Warning 1273 Expansion too long: 'a\u002Daaaaaa10'
|
|||||||
#
|
#
|
||||||
# Search for occurrences of [ERROR] Syntax error at '[strength tertiary]'
|
# Search for occurrences of [ERROR] Syntax error at '[strength tertiary]'
|
||||||
Occurances : 2
|
Occurances : 2
|
||||||
|
#
|
||||||
|
# MDEV-8686 A user defined collation utf8_confusables doesn't work
|
||||||
|
#
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8 COLLATE utf8_implicit_weights_ci);
|
||||||
|
INSERT INTO t1 VALUES ('a'),('b'),('c');
|
||||||
|
INSERT INTO t1 VALUES (_ucs2 0x1500),(_ucs2 0x1501);
|
||||||
|
INSERT INTO t1 VALUES (_ucs2 0x3400),(_ucs2 0x3560),(_ucs2 0x3561),(_ucs2 0x3600);
|
||||||
|
INSERT INTO t1 VALUES (_ucs2 0x3700),(_ucs2 0x3701);
|
||||||
|
SELECT HEX(CONVERT(a USING ucs2)) AS ch, HEX(WEIGHT_STRING(a)) AS w, HEX(WEIGHT_STRING(a COLLATE utf8_unicode_ci)) AS ducet FROM t1 ORDER BY a,ch;
|
||||||
|
ch w ducet
|
||||||
|
0061 0E33 0E33
|
||||||
|
3561 0E33 FB80B561
|
||||||
|
0063 0E60 0E60
|
||||||
|
1500 0E60 1BAD
|
||||||
|
0062 FB80B400 0E4A
|
||||||
|
3400 FB80B400 FB80B400
|
||||||
|
3560 FB80B560 FB80B560
|
||||||
|
1501 FB80B600 1BAE
|
||||||
|
3600 FB80B600 FB80B600
|
||||||
|
3700 FB80B700 FB80B700
|
||||||
|
3701 FB80B700 FB80B701
|
||||||
|
DROP TABLE t1;
|
||||||
|
@@ -1117,6 +1117,16 @@
|
|||||||
</rules>
|
</rules>
|
||||||
</collation>
|
</collation>
|
||||||
|
|
||||||
|
<collation name="utf8_implicit_weights_ci" id="338">
|
||||||
|
<rules>
|
||||||
|
<reset>\u3400</reset><i>b</i>
|
||||||
|
<reset>a</reset><i>\u3561</i>
|
||||||
|
<reset>c</reset><i>\u1500</i>
|
||||||
|
<reset>\u3600</reset><i>\u1501</i>
|
||||||
|
<reset>\u3700</reset><i>\u3701</i>
|
||||||
|
</rules>
|
||||||
|
</collation>
|
||||||
|
|
||||||
</charset>
|
</charset>
|
||||||
|
|
||||||
</charsets>
|
</charsets>
|
||||||
|
@@ -397,3 +397,15 @@ perl;
|
|||||||
print "Occurances : $count_error\n";
|
print "Occurances : $count_error\n";
|
||||||
close(FILE);
|
close(FILE);
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # MDEV-8686 A user defined collation utf8_confusables doesn't work
|
||||||
|
--echo #
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8 COLLATE utf8_implicit_weights_ci);
|
||||||
|
INSERT INTO t1 VALUES ('a'),('b'),('c');
|
||||||
|
INSERT INTO t1 VALUES (_ucs2 0x1500),(_ucs2 0x1501);
|
||||||
|
INSERT INTO t1 VALUES (_ucs2 0x3400),(_ucs2 0x3560),(_ucs2 0x3561),(_ucs2 0x3600);
|
||||||
|
INSERT INTO t1 VALUES (_ucs2 0x3700),(_ucs2 0x3701);
|
||||||
|
SELECT HEX(CONVERT(a USING ucs2)) AS ch, HEX(WEIGHT_STRING(a)) AS w, HEX(WEIGHT_STRING(a COLLATE utf8_unicode_ci)) AS ducet FROM t1 ORDER BY a,ch;
|
||||||
|
DROP TABLE t1;
|
||||||
|
@@ -31574,6 +31574,26 @@ my_uca_implicit_weight_base(my_wc_t code)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
|
||||||
|
{
|
||||||
|
switch (level) {
|
||||||
|
case 1: to[0]= 0x0020; to[1]= 0; break; /* Secondary level */
|
||||||
|
case 2: to[0]= 0x0002; to[1]= 0; break; /* Tertiary level */
|
||||||
|
case 3: to[0]= 0x0001; to[1]= 0; break; /* Quaternary level */
|
||||||
|
default:
|
||||||
|
DBUG_ASSERT(0);
|
||||||
|
case 0:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* Primary level */
|
||||||
|
to[0]= (code >> 15) + my_uca_implicit_weight_base(code);
|
||||||
|
to[1]= (code & 0x7FFF) | 0x8000;
|
||||||
|
to[2]= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Return an implicit UCA weight for the primary level.
|
Return an implicit UCA weight for the primary level.
|
||||||
Used for characters that do not have assigned UCA weights.
|
Used for characters that do not have assigned UCA weights.
|
||||||
@@ -33583,6 +33603,7 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
|
|||||||
{
|
{
|
||||||
size_t chlen;
|
size_t chlen;
|
||||||
const uint16 *from= NULL;
|
const uint16 *from= NULL;
|
||||||
|
uint16 implicit_weights[3];
|
||||||
|
|
||||||
for (chlen= len; chlen > 1; chlen--)
|
for (chlen= len; chlen > 1; chlen--)
|
||||||
{
|
{
|
||||||
@@ -33597,6 +33618,11 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
|
|||||||
if (!from)
|
if (!from)
|
||||||
{
|
{
|
||||||
from= my_char_weight_addr(dst, *str);
|
from= my_char_weight_addr(dst, *str);
|
||||||
|
if (!from)
|
||||||
|
{
|
||||||
|
from= implicit_weights;
|
||||||
|
my_uca_implicit_weight_put(implicit_weights, *str, dst->levelno);
|
||||||
|
}
|
||||||
str++;
|
str++;
|
||||||
len--;
|
len--;
|
||||||
}
|
}
|
||||||
@@ -33649,6 +33675,25 @@ my_uca_copy_page(MY_CHARSET_LOADER *loader,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static my_bool
|
||||||
|
my_uca_generate_implicit_page(MY_CHARSET_LOADER *loader,
|
||||||
|
MY_UCA_WEIGHT_LEVEL *dst,
|
||||||
|
uint page)
|
||||||
|
{
|
||||||
|
uint chc, size= 256 * dst->lengths[page] * sizeof(uint16);
|
||||||
|
if (!(dst->weights[page]= (uint16 *) (loader->once_alloc)(size)))
|
||||||
|
return TRUE;
|
||||||
|
|
||||||
|
memset(dst->weights[page], 0, size);
|
||||||
|
for (chc= 0 ; chc < 256; chc++)
|
||||||
|
{
|
||||||
|
uint16 *w= dst->weights[page] + chc * dst->lengths[page];
|
||||||
|
my_uca_implicit_weight_put(w, (page << 8) + chc, dst->levelno);
|
||||||
|
}
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static my_bool
|
static my_bool
|
||||||
apply_shift(MY_CHARSET_LOADER *loader,
|
apply_shift(MY_CHARSET_LOADER *loader,
|
||||||
MY_COLL_RULES *rules, MY_COLL_RULE *r, int level,
|
MY_COLL_RULES *rules, MY_COLL_RULE *r, int level,
|
||||||
@@ -33766,7 +33811,7 @@ my_uca_init_one_contraction(MY_CONTRACTIONS *contractions,
|
|||||||
|
|
||||||
static my_bool
|
static my_bool
|
||||||
apply_one_rule(MY_CHARSET_LOADER *loader,
|
apply_one_rule(MY_CHARSET_LOADER *loader,
|
||||||
MY_COLL_RULES *rules, MY_COLL_RULE *r, int level,
|
MY_COLL_RULES *rules, MY_COLL_RULE *r,
|
||||||
MY_UCA_WEIGHT_LEVEL *dst)
|
MY_UCA_WEIGHT_LEVEL *dst)
|
||||||
{
|
{
|
||||||
size_t nweights;
|
size_t nweights;
|
||||||
@@ -33842,7 +33887,7 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Apply level difference. */
|
/* Apply level difference. */
|
||||||
return apply_shift(loader, rules, r, level, to, nweights);
|
return apply_shift(loader, rules, r, dst->levelno, to, nweights);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -33875,8 +33920,92 @@ check_rules(MY_CHARSET_LOADER *loader,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
Calculates how many weights are needed on the given page.
|
||||||
|
|
||||||
|
In case of implicit weights, the functions returns 3:
|
||||||
|
two implicit weights plus trailing 0.
|
||||||
|
|
||||||
|
Implicit weights can appear if we do something like this:
|
||||||
|
<reset>\u3400</>
|
||||||
|
<i>a</i>
|
||||||
|
I.e. we reset to a character that does not have an explicit weight (U+3400),
|
||||||
|
and then reorder another character relatively to it.
|
||||||
|
*/
|
||||||
|
static uint my_weight_size_on_page(const MY_UCA_WEIGHT_LEVEL *src, uint page)
|
||||||
|
{
|
||||||
|
return src->lengths[page] ? src->lengths[page] : 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
Generate default weights for a page:
|
||||||
|
- copy default weights from "src", or
|
||||||
|
- generate implicit weights algorithmically.
|
||||||
|
Note, some of these default weights will change later,
|
||||||
|
during an apply_one_rule() call.
|
||||||
|
*/
|
||||||
static my_bool
|
static my_bool
|
||||||
init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
|
my_uca_generate_page(MY_CHARSET_LOADER *loader,
|
||||||
|
MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src,
|
||||||
|
uint pageno)
|
||||||
|
{
|
||||||
|
DBUG_ASSERT(dst->levelno == src->levelno);
|
||||||
|
return src->lengths[pageno] ?
|
||||||
|
/*
|
||||||
|
A page with explicit weights and some special rules.
|
||||||
|
Copy all weights from the page in "src".
|
||||||
|
*/
|
||||||
|
my_uca_copy_page(loader, src, dst, pageno) :
|
||||||
|
/*
|
||||||
|
A page with implicit weights and some special rules.
|
||||||
|
Generate default weights for all characters on this page
|
||||||
|
algorithmically now, at initialization time.
|
||||||
|
*/
|
||||||
|
my_uca_generate_implicit_page(loader, dst, pageno);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
Find all pages that we have special rules on and
|
||||||
|
populate default (explicit or implicit) weights for these pages.
|
||||||
|
*/
|
||||||
|
static my_bool
|
||||||
|
my_uca_generate_pages(MY_CHARSET_LOADER *loader,
|
||||||
|
MY_UCA_WEIGHT_LEVEL *dst,
|
||||||
|
const MY_UCA_WEIGHT_LEVEL *src,
|
||||||
|
uint npages)
|
||||||
|
{
|
||||||
|
uint page;
|
||||||
|
for (page= 0; page < npages; page++)
|
||||||
|
{
|
||||||
|
if (dst->weights[page])
|
||||||
|
{
|
||||||
|
/* A page with explicit weights with no special rules */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!dst->lengths[page])
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
A page with implicit weights with no special rules.
|
||||||
|
Keep dst->weights[page]==NULL and dst->lengths[page]==0.
|
||||||
|
Weights for this page will be generated at run time algorithmically,
|
||||||
|
using my_uca_scanner_next_implicit().
|
||||||
|
*/
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Found a page with some special rules. */
|
||||||
|
if (my_uca_generate_page(loader, dst, src, page))
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static my_bool
|
||||||
|
init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
|
||||||
MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src)
|
MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src)
|
||||||
{
|
{
|
||||||
MY_COLL_RULE *r, *rlast;
|
MY_COLL_RULE *r, *rlast;
|
||||||
@@ -33916,9 +34045,15 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
uint pageb= (r->base[0] >> 8);
|
/*
|
||||||
if (dst->lengths[pagec] < src->lengths[pageb])
|
Not an expansion and not a contraction.
|
||||||
dst->lengths[pagec]= src->lengths[pageb];
|
The page corresponding to r->curr[0] in "dst"
|
||||||
|
will need at least the same amount of weights
|
||||||
|
that r->base[0] has in "src".
|
||||||
|
*/
|
||||||
|
uint wsize= my_weight_size_on_page(src, r->base[0] >> 8);
|
||||||
|
if (dst->lengths[pagec] < wsize)
|
||||||
|
dst->lengths[pagec]= wsize;
|
||||||
}
|
}
|
||||||
dst->weights[pagec]= NULL; /* Mark that we'll overwrite this page */
|
dst->weights[pagec]= NULL; /* Mark that we'll overwrite this page */
|
||||||
}
|
}
|
||||||
@@ -33928,18 +34063,8 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
|
|||||||
|
|
||||||
ncontractions += src->contractions.nitems;
|
ncontractions += src->contractions.nitems;
|
||||||
|
|
||||||
/* Allocate pages that we'll overwrite and copy default weights */
|
if ((my_uca_generate_pages(loader, dst, src, npages)))
|
||||||
for (i= 0; i < npages; i++)
|
return TRUE;
|
||||||
{
|
|
||||||
my_bool rc;
|
|
||||||
/*
|
|
||||||
Don't touch pages with lengths[i]==0, they have implicit weights
|
|
||||||
calculated algorithmically.
|
|
||||||
*/
|
|
||||||
if (!dst->weights[i] && dst->lengths[i] &&
|
|
||||||
(rc= my_uca_copy_page(loader, src, dst, i)))
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ncontractions)
|
if (ncontractions)
|
||||||
{
|
{
|
||||||
@@ -33957,7 +34082,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
|
|||||||
*/
|
*/
|
||||||
for (r= rules->rule; r < rlast; r++)
|
for (r= rules->rule; r < rlast; r++)
|
||||||
{
|
{
|
||||||
if (apply_one_rule(loader, rules, r, level, dst))
|
if (apply_one_rule(loader, rules, r, dst))
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -34040,7 +34165,7 @@ create_tailoring(struct charset_info_st *cs,
|
|||||||
cs->caseinfo= &my_unicase_default;
|
cs->caseinfo= &my_unicase_default;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((rc= init_weight_level(loader, &rules, 0,
|
if ((rc= init_weight_level(loader, &rules,
|
||||||
&new_uca.level[0], &src_uca->level[0])))
|
&new_uca.level[0], &src_uca->level[0])))
|
||||||
goto ex;
|
goto ex;
|
||||||
|
|
||||||
@@ -34103,7 +34228,7 @@ create_tailoring_multilevel(struct charset_info_st *cs,
|
|||||||
|
|
||||||
for (i= 0; i != num_level; i++)
|
for (i= 0; i != num_level; i++)
|
||||||
{
|
{
|
||||||
if ((rc= init_weight_level(loader, &rules, i,
|
if ((rc= init_weight_level(loader, &rules,
|
||||||
&new_uca.level[i], &src_uca->level[i])))
|
&new_uca.level[i], &src_uca->level[i])))
|
||||||
goto ex;
|
goto ex;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user