1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-08 11:22:35 +03:00

MDEV-8686 A user defined collation utf8_confusables doesn't work

The collation customization code for the UCA (Unicode Collation Algorithm)
based collations now allows resetting to, and shifting of, characters with
implicit weights. Previously reset/shift worked only for characters
with explicit DUCET weights. An attempt to use reset/shift with
a character that has implicit weights made the server crash.
This commit is contained in:
Alexander Barkov
2016-06-23 14:25:48 +04:00
parent 3e03b89b0a
commit 25e68c5e46
4 changed files with 191 additions and 21 deletions

View File

@@ -460,6 +460,7 @@ utf8mb4_test_400_ci utf8mb4 328 8
latin1_test2 latin1 332 1 latin1_test2 latin1 332 1
utf8_bengali_standard_ci utf8 336 8 utf8_bengali_standard_ci utf8 336 8
utf8_bengali_traditional_ci utf8 337 8 utf8_bengali_traditional_ci utf8 337 8
utf8_implicit_weights_ci utf8 338 8
utf8_phone_ci utf8 352 8 utf8_phone_ci utf8 352 8
utf8_test_ci utf8 353 8 utf8_test_ci utf8 353 8
utf8_5624_1 utf8 354 8 utf8_5624_1 utf8 354 8
@@ -1156,3 +1157,25 @@ Warning 1273 Expansion too long: 'a\u002Daaaaaa10'
# #
# Search for occurrences of [ERROR] Syntax error at '[strength tertiary]' # Search for occurrences of [ERROR] Syntax error at '[strength tertiary]'
Occurances : 2 Occurances : 2
#
# MDEV-8686 A user defined collation utf8_confusables doesn't work
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8 COLLATE utf8_implicit_weights_ci);
INSERT INTO t1 VALUES ('a'),('b'),('c');
INSERT INTO t1 VALUES (_ucs2 0x1500),(_ucs2 0x1501);
INSERT INTO t1 VALUES (_ucs2 0x3400),(_ucs2 0x3560),(_ucs2 0x3561),(_ucs2 0x3600);
INSERT INTO t1 VALUES (_ucs2 0x3700),(_ucs2 0x3701);
SELECT HEX(CONVERT(a USING ucs2)) AS ch, HEX(WEIGHT_STRING(a)) AS w, HEX(WEIGHT_STRING(a COLLATE utf8_unicode_ci)) AS ducet FROM t1 ORDER BY a,ch;
ch w ducet
0061 0E33 0E33
3561 0E33 FB80B561
0063 0E60 0E60
1500 0E60 1BAD
0062 FB80B400 0E4A
3400 FB80B400 FB80B400
3560 FB80B560 FB80B560
1501 FB80B600 1BAE
3600 FB80B600 FB80B600
3700 FB80B700 FB80B700
3701 FB80B700 FB80B701
DROP TABLE t1;

View File

@@ -1117,6 +1117,16 @@
</rules> </rules>
</collation> </collation>
<collation name="utf8_implicit_weights_ci" id="338">
<rules>
<reset>\u3400</reset><i>b</i>
<reset>a</reset><i>\u3561</i>
<reset>c</reset><i>\u1500</i>
<reset>\u3600</reset><i>\u1501</i>
<reset>\u3700</reset><i>\u3701</i>
</rules>
</collation>
</charset> </charset>
</charsets> </charsets>

View File

@@ -397,3 +397,15 @@ perl;
print "Occurances : $count_error\n"; print "Occurances : $count_error\n";
close(FILE); close(FILE);
EOF EOF
--echo #
--echo # MDEV-8686 A user defined collation utf8_confusables doesn't work
--echo #
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8 COLLATE utf8_implicit_weights_ci);
INSERT INTO t1 VALUES ('a'),('b'),('c');
INSERT INTO t1 VALUES (_ucs2 0x1500),(_ucs2 0x1501);
INSERT INTO t1 VALUES (_ucs2 0x3400),(_ucs2 0x3560),(_ucs2 0x3561),(_ucs2 0x3600);
INSERT INTO t1 VALUES (_ucs2 0x3700),(_ucs2 0x3701);
SELECT HEX(CONVERT(a USING ucs2)) AS ch, HEX(WEIGHT_STRING(a)) AS w, HEX(WEIGHT_STRING(a COLLATE utf8_unicode_ci)) AS ducet FROM t1 ORDER BY a,ch;
DROP TABLE t1;

View File

@@ -31574,6 +31574,26 @@ my_uca_implicit_weight_base(my_wc_t code)
} }
/**
  Store the implicit UCA weights of a character for one collation level.

  @param to     Output buffer. Receives two weights plus a terminating 0
                (three uint16 values) on the primary level, and one weight
                plus a terminating 0 (two uint16 values) on the other levels.
  @param code   Character code to generate the weights for.
  @param level  Collation level: 0 - primary, 1 - secondary,
                2 - tertiary, 3 - quaternary.
*/
static inline void
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
{
  switch (level) {
  /*
    Non-primary levels use a fixed weight. Return right away: leaving
    the switch with "break" would continue to the primary level code below
    and overwrite the weights that have just been stored.
  */
  case 1: to[0]= 0x0020; to[1]= 0; return;  /* Secondary level  */
  case 2: to[0]= 0x0002; to[1]= 0; return;  /* Tertiary level   */
  case 3: to[0]= 0x0001; to[1]= 0; return;  /* Quaternary level */
  default:
    DBUG_ASSERT(0);
    /* fall through: treat an unexpected level as the primary level */
  case 0:
    break;
  }
  /* Primary level: a pair of weights derived from the character code */
  to[0]= (uint16) ((code >> 15) + my_uca_implicit_weight_base(code));
  to[1]= (uint16) ((code & 0x7FFF) | 0x8000);
  to[2]= 0;
}
/****************************************************************/
/** /**
Return an implicit UCA weight for the primary level. Return an implicit UCA weight for the primary level.
Used for characters that do not have assigned UCA weights. Used for characters that do not have assigned UCA weights.
@@ -33583,6 +33603,7 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
{ {
size_t chlen; size_t chlen;
const uint16 *from= NULL; const uint16 *from= NULL;
uint16 implicit_weights[3];
for (chlen= len; chlen > 1; chlen--) for (chlen= len; chlen > 1; chlen--)
{ {
@@ -33597,6 +33618,11 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
if (!from) if (!from)
{ {
from= my_char_weight_addr(dst, *str); from= my_char_weight_addr(dst, *str);
if (!from)
{
from= implicit_weights;
my_uca_implicit_weight_put(implicit_weights, *str, dst->levelno);
}
str++; str++;
len--; len--;
} }
@@ -33649,6 +33675,25 @@ my_uca_copy_page(MY_CHARSET_LOADER *loader,
} }
/**
  Allocate one weight page in "dst" and fill it with implicit weights.

  The page holds 256 characters, each one occupying dst->lengths[page]
  uint16 slots. The page is zero-filled first, then the weights of every
  character are generated with my_uca_implicit_weight_put() for the
  level dst->levelno.

  @param loader  Provides once_alloc(), used to allocate the page.
  @param dst     Weight level whose weights[page] is to be populated.
  @param page    Page number (the character code shifted right by 8 bits).

  @return FALSE on success, TRUE if the allocation failed.
*/
static my_bool
my_uca_generate_implicit_page(MY_CHARSET_LOADER *loader,
                              MY_UCA_WEIGHT_LEVEL *dst,
                              uint page)
{
  uint nbytes= 256 * dst->lengths[page] * sizeof(uint16);
  uint16 *cell;
  uint lo;

  dst->weights[page]= (uint16 *) (loader->once_alloc)(nbytes);
  if (dst->weights[page] == NULL)
    return TRUE;

  memset(dst->weights[page], 0, nbytes);

  /* Walk the page cell by cell; "lo" is the low byte of the character code */
  cell= dst->weights[page];
  for (lo= 0; lo < 256; lo++, cell+= dst->lengths[page])
    my_uca_implicit_weight_put(cell, (page << 8) + lo, dst->levelno);

  return FALSE;
}
static my_bool static my_bool
apply_shift(MY_CHARSET_LOADER *loader, apply_shift(MY_CHARSET_LOADER *loader,
MY_COLL_RULES *rules, MY_COLL_RULE *r, int level, MY_COLL_RULES *rules, MY_COLL_RULE *r, int level,
@@ -33766,7 +33811,7 @@ my_uca_init_one_contraction(MY_CONTRACTIONS *contractions,
static my_bool static my_bool
apply_one_rule(MY_CHARSET_LOADER *loader, apply_one_rule(MY_CHARSET_LOADER *loader,
MY_COLL_RULES *rules, MY_COLL_RULE *r, int level, MY_COLL_RULES *rules, MY_COLL_RULE *r,
MY_UCA_WEIGHT_LEVEL *dst) MY_UCA_WEIGHT_LEVEL *dst)
{ {
size_t nweights; size_t nweights;
@@ -33842,7 +33887,7 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
} }
/* Apply level difference. */ /* Apply level difference. */
return apply_shift(loader, rules, r, level, to, nweights); return apply_shift(loader, rules, r, dst->levelno, to, nweights);
} }
@@ -33875,8 +33920,92 @@ check_rules(MY_CHARSET_LOADER *loader,
} }
/**
  Tell how many weight slots a character needs on the given page.

  If src->lengths[page] is non-zero, that value is returned as is.
  Otherwise the page has implicit weights and the function returns 3:
  two implicit weights plus the trailing 0.

  Implicit weights can appear with a tailoring like this:
    <reset>\u3400</reset>
    <i>a</i>
  i.e. when we reset to a character that does not have an explicit
  weight (U+3400) and then reorder another character relative to it.
*/
static uint my_weight_size_on_page(const MY_UCA_WEIGHT_LEVEL *src, uint page)
{
  uint explicit_size= src->lengths[page];
  if (explicit_size)
    return explicit_size;
  return 3; /* Two implicit weights + trailing 0 */
}
/**
Generate default weights for one page of "dst":
- if src->lengths[pageno] is not zero, copy default weights from "src"
  using my_uca_copy_page(), or
- otherwise generate implicit weights algorithmically
  using my_uca_generate_implicit_page().
Note, some of these default weights will change later,
during an apply_one_rule() call.
"dst" and "src" must describe the same level (checked with DBUG_ASSERT).
Returns the result of the helper that was called.
*/
static my_bool static my_bool
init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, my_uca_generate_page(MY_CHARSET_LOADER *loader,
MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src,
uint pageno)
{
DBUG_ASSERT(dst->levelno == src->levelno);
return src->lengths[pageno] ?
/*
A page with explicit weights and some special rules.
Copy all weights from the page in "src".
*/
my_uca_copy_page(loader, src, dst, pageno) :
/*
A page with implicit weights and some special rules.
Generate default weights for all characters on this page
algorithmically now, at initialization time.
*/
my_uca_generate_implicit_page(loader, dst, pageno);
}
/**
  Walk over the first "npages" pages of "dst" and populate default
  (explicit or implicit) weights for every page that has special rules.

  A page is left untouched in two cases:
  - dst->weights[page] is not NULL:
    a page with explicit weights and no special rules.
  - dst->weights[page] is NULL and dst->lengths[page] is 0:
    a page with implicit weights and no special rules. Both fields stay
    as they are; weights for such a page are generated at run time
    algorithmically, using my_uca_scanner_next_implicit().

  Any other page (weights[page] == NULL, lengths[page] != 0) has some
  special rules and is filled by my_uca_generate_page().

  @return FALSE on success, TRUE as soon as my_uca_generate_page()
          reports a failure.
*/
static my_bool
my_uca_generate_pages(MY_CHARSET_LOADER *loader,
                      MY_UCA_WEIGHT_LEVEL *dst,
                      const MY_UCA_WEIGHT_LEVEL *src,
                      uint npages)
{
  uint pageno= 0;
  while (pageno < npages)
  {
    /* Only pages marked for overwriting that need weight slots qualify */
    if (dst->weights[pageno] == NULL && dst->lengths[pageno] != 0)
    {
      if (my_uca_generate_page(loader, dst, src, pageno))
        return TRUE;
    }
    pageno++;
  }
  return FALSE;
}
static my_bool
init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src) MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src)
{ {
MY_COLL_RULE *r, *rlast; MY_COLL_RULE *r, *rlast;
@@ -33916,9 +34045,15 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
} }
else else
{ {
uint pageb= (r->base[0] >> 8); /*
if (dst->lengths[pagec] < src->lengths[pageb]) Not an expansion and not a contraction.
dst->lengths[pagec]= src->lengths[pageb]; The page correspoding to r->curr[0] in "dst"
will need at least the same amount of weights
that r->base[0] has in "src".
*/
uint wsize= my_weight_size_on_page(src, r->base[0] >> 8);
if (dst->lengths[pagec] < wsize)
dst->lengths[pagec]= wsize;
} }
dst->weights[pagec]= NULL; /* Mark that we'll overwrite this page */ dst->weights[pagec]= NULL; /* Mark that we'll overwrite this page */
} }
@@ -33928,18 +34063,8 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
ncontractions += src->contractions.nitems; ncontractions += src->contractions.nitems;
/* Allocate pages that we'll overwrite and copy default weights */ if ((my_uca_generate_pages(loader, dst, src, npages)))
for (i= 0; i < npages; i++) return TRUE;
{
my_bool rc;
/*
Don't touch pages with lengths[i]==0, they have implicit weights
calculated algorithmically.
*/
if (!dst->weights[i] && dst->lengths[i] &&
(rc= my_uca_copy_page(loader, src, dst, i)))
return rc;
}
if (ncontractions) if (ncontractions)
{ {
@@ -33957,7 +34082,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
*/ */
for (r= rules->rule; r < rlast; r++) for (r= rules->rule; r < rlast; r++)
{ {
if (apply_one_rule(loader, rules, r, level, dst)) if (apply_one_rule(loader, rules, r, dst))
return TRUE; return TRUE;
} }
@@ -34040,7 +34165,7 @@ create_tailoring(struct charset_info_st *cs,
cs->caseinfo= &my_unicase_default; cs->caseinfo= &my_unicase_default;
} }
if ((rc= init_weight_level(loader, &rules, 0, if ((rc= init_weight_level(loader, &rules,
&new_uca.level[0], &src_uca->level[0]))) &new_uca.level[0], &src_uca->level[0])))
goto ex; goto ex;
@@ -34103,7 +34228,7 @@ create_tailoring_multilevel(struct charset_info_st *cs,
for (i= 0; i != num_level; i++) for (i= 0; i != num_level; i++)
{ {
if ((rc= init_weight_level(loader, &rules, i, if ((rc= init_weight_level(loader, &rules,
&new_uca.level[i], &src_uca->level[i]))) &new_uca.level[i], &src_uca->level[i])))
goto ex; goto ex;
} }