mirror of
https://github.com/MariaDB/server.git
synced 2025-07-29 05:21:33 +03:00
MDEV-27042 UCA: Resetting contractions to ignorable does not work well
The weight scanner routine scanner_next() did not properly handle the cases when a contraction produces no weights (is ignorable).

Adding a helper routine my_uca_scanner_set_weight() and using it in all cases:
- A single ASCII character
- A contraction starting with an ASCII character
- A multi-byte character
- A contraction starting with a multi-byte character

Also adding two other helper routines:
- my_uca_scanner_next_expansion_weight()
- my_uca_scanner_set_weight_outside_maxchar()

to avoid using scanner->wbeg directly inside scanner_next(). This reduces the probability of similar future bugs.
This commit is contained in:
@ -34,6 +34,9 @@ Bar +7-912-800-80-01
|
|||||||
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
|
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
|
||||||
name phone
|
name phone
|
||||||
Bar +7-912-800-80-01
|
Bar +7-912-800-80-01
|
||||||
|
SELECT * FROM t1 WHERE phone='tel.79128008001';
|
||||||
|
name phone
|
||||||
|
Bar +7-912-800-80-01
|
||||||
DROP TABLE t1;
|
DROP TABLE t1;
|
||||||
show collation like 'utf8mb3_test_ci';
|
show collation like 'utf8mb3_test_ci';
|
||||||
Collation Charset Id Default Compiled Sortlen
|
Collation Charset Id Default Compiled Sortlen
|
||||||
@ -3042,3 +3045,37 @@ SHOW COLLATION LIKE 'latin1_test_replace';
|
|||||||
Collation Charset Id Default Compiled Sortlen
|
Collation Charset Id Default Compiled Sortlen
|
||||||
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
|
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
|
||||||
ERROR HY000: Unknown collation: 'latin1_test_replace'
|
ERROR HY000: Unknown collation: 'latin1_test_replace'
|
||||||
|
#
|
||||||
|
# MDEV-27042 UCA: Resetting contractions to ignorable does not work well
|
||||||
|
#
|
||||||
|
CREATE TABLE t1 (
|
||||||
|
phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
|
||||||
|
);
|
||||||
|
INSERT INTO t1 VALUES ('123');
|
||||||
|
INSERT INTO t1 VALUES ('tel.123');
|
||||||
|
INSERT INTO t1 VALUES ('tél.123');
|
||||||
|
INSERT INTO t1 VALUES ('tèl.123');
|
||||||
|
INSERT INTO t1 VALUES ('ťel.123');
|
||||||
|
INSERT INTO t1 VALUES ('ťèl.123');
|
||||||
|
INSERT INTO t1 VALUES ('tex.123');
|
||||||
|
SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone;
|
||||||
|
phone
|
||||||
|
123
|
||||||
|
tel.123
|
||||||
|
tél.123
|
||||||
|
ťel.123
|
||||||
|
SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone;
|
||||||
|
phone
|
||||||
|
tex.123
|
||||||
|
tèl.123
|
||||||
|
ťèl.123
|
||||||
|
SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone;
|
||||||
|
phone HEX(WEIGHT_STRING(phone))
|
||||||
|
123 0E2A0E2B0E2C
|
||||||
|
tel.123 0E2A0E2B0E2C
|
||||||
|
tél.123 0E2A0E2B0E2C
|
||||||
|
ťel.123 0E2A0E2B0E2C
|
||||||
|
tèl.123 10020E8B0F2E025D0E2A0E2B0E2C
|
||||||
|
ťèl.123 10020E8B0F2E025D0E2A0E2B0E2C
|
||||||
|
tex.123 10020E8B105A025D0E2A0E2B0E2C
|
||||||
|
DROP TABLE t1;
|
||||||
|
@ -33,6 +33,7 @@ SELECT * FROM t1 ORDER BY phone;
|
|||||||
SELECT * FROM t1 WHERE phone='+7(912)800-80-01';
|
SELECT * FROM t1 WHERE phone='+7(912)800-80-01';
|
||||||
SELECT * FROM t1 WHERE phone='79128008001';
|
SELECT * FROM t1 WHERE phone='79128008001';
|
||||||
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
|
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
|
||||||
|
SELECT * FROM t1 WHERE phone='tel.79128008001';
|
||||||
DROP TABLE t1;
|
DROP TABLE t1;
|
||||||
|
|
||||||
show collation like 'utf8mb3_test_ci';
|
show collation like 'utf8mb3_test_ci';
|
||||||
@ -615,3 +616,23 @@ SELECT 'a' COLLATE utf8_czech_test_bad_w2;
|
|||||||
SHOW COLLATION LIKE 'latin1_test_replace';
|
SHOW COLLATION LIKE 'latin1_test_replace';
|
||||||
--error ER_UNKNOWN_COLLATION
|
--error ER_UNKNOWN_COLLATION
|
||||||
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
|
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
|
||||||
|
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # MDEV-27042 UCA: Resetting contractions to ignorable does not work well
|
||||||
|
--echo #
|
||||||
|
|
||||||
|
CREATE TABLE t1 (
|
||||||
|
phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
|
||||||
|
);
|
||||||
|
INSERT INTO t1 VALUES ('123');
|
||||||
|
INSERT INTO t1 VALUES ('tel.123');
|
||||||
|
INSERT INTO t1 VALUES ('tél.123');
|
||||||
|
INSERT INTO t1 VALUES ('tèl.123');
|
||||||
|
INSERT INTO t1 VALUES ('ťel.123');
|
||||||
|
INSERT INTO t1 VALUES ('ťèl.123');
|
||||||
|
INSERT INTO t1 VALUES ('tex.123');
|
||||||
|
SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone;
|
||||||
|
SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone;
|
||||||
|
SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone;
|
||||||
|
DROP TABLE t1;
|
||||||
|
@ -9,6 +9,9 @@
|
|||||||
<i>\u0029</i> <!-- right parenthesis -->
|
<i>\u0029</i> <!-- right parenthesis -->
|
||||||
<i>\u002B</i> <!-- plus -->
|
<i>\u002B</i> <!-- plus -->
|
||||||
<i>\u002D</i> <!-- hyphen -->
|
<i>\u002D</i> <!-- hyphen -->
|
||||||
|
<i>tel.</i>
|
||||||
|
<i>tél.</i>
|
||||||
|
<i>ťel.</i>
|
||||||
</rules>
|
</rules>
|
||||||
</collation>
|
</collation>
|
||||||
<collation name="utf8mb3_test_ci" id="353">
|
<collation name="utf8mb3_test_ci" id="353">
|
||||||
|
@ -31175,6 +31175,33 @@ static const uint16 nochar[]= {0,0};
|
|||||||
#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
|
#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
|
||||||
#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
|
#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
|
||||||
|
|
||||||
|
|
||||||
|
/*
  Fetch the next weight that is still pending at scanner->wbeg.

  @param scanner  The scanner whose wbeg pointer is examined.
  @return         The weight at scanner->wbeg (wbeg is then advanced by one)
                  if that weight is non-zero; 0 otherwise, in which case
                  wbeg is left unchanged.
*/
static inline uint16
|
||||||
|
my_uca_scanner_next_expansion_weight(my_uca_scanner *scanner)
|
||||||
|
{
|
||||||
|
if (scanner->wbeg[0]) /* A non-zero weight is still pending */
|
||||||
|
return *scanner->wbeg++; /* Return it and step to the following one */
|
||||||
|
return 0; /* Nothing pending: the caller has to read the next character */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
  Make "weight" the current weight string of the scanner.

  @param scanner  The scanner to update.
  @param weight   Pointer to the first weight of a weight string.
  @return         The first weight, *weight. scanner_next() treats a zero
                  return value as "ignorable" and continues with the next
                  character.

  scanner->wbeg is set to the element following the first weight, so the
  remaining weights are picked up through scanner->wbeg on later calls.

  NOTE(review): wbeg is moved to weight + 1 even when *weight is 0, so a
  later read of wbeg[0] looks at weight[1]. Presumably every weight string
  has at least one more readable element there - confirm against the
  weight table layout.
*/
static inline uint16
|
||||||
|
my_uca_scanner_set_weight(my_uca_scanner *scanner, const uint16 *weight)
|
||||||
|
{
|
||||||
|
scanner->wbeg= weight + 1; /* Remaining weights start after the first one */
|
||||||
|
return *weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
  Produce the weight for a character that has no entry in the weight table.

  scanner_next() calls this when currwc > scanner->level->maxchar.

  @param scanner  The scanner to update.
  @return         Always 0xFFFD.

  scanner->wbeg is pointed at nochar (declared as {0,0}), so no further
  weights are pending for this character.
*/
static inline uint16
|
||||||
|
my_uca_scanner_set_weight_outside_maxchar(my_uca_scanner *scanner)
|
||||||
|
{
|
||||||
|
/* Return 0xFFFD as weight for all characters outside BMP */
|
||||||
|
scanner->wbeg= nochar; /* nochar[0] is 0: nothing left to return later */
|
||||||
|
return 0xFFFD;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/********** Helper functions to handle contraction ************/
|
/********** Helper functions to handle contraction ************/
|
||||||
|
|
||||||
|
|
||||||
|
@ -40,20 +40,16 @@
|
|||||||
static inline int
|
static inline int
|
||||||
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
|
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
|
||||||
{
|
{
|
||||||
/*
|
uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
|
||||||
Check if the weights for the previous character have been
|
if (weight)
|
||||||
already fully scanned. If yes, then get the next character and
|
return weight; /* Next expansion weight found */
|
||||||
initialize wbeg and wlength to its weight string.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (scanner->wbeg[0]) /* More weights left from the previous step: */
|
for ( ; ; )
|
||||||
return *scanner->wbeg++; /* return the next weight from expansion */
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
{
|
||||||
const uint16 *wpage;
|
const uint16 *wpage;
|
||||||
int mblen;
|
int mblen;
|
||||||
my_wc_t currwc;
|
my_wc_t currwc;
|
||||||
|
const uint16 *cweight;
|
||||||
|
|
||||||
/* Get next character */
|
/* Get next character */
|
||||||
#if MY_UCA_ASCII_OPTIMIZE
|
#if MY_UCA_ASCII_OPTIMIZE
|
||||||
@ -64,23 +60,21 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
|
|||||||
scanner->sbeg+= 1;
|
scanner->sbeg+= 1;
|
||||||
|
|
||||||
#if MY_UCA_COMPILE_CONTRACTIONS
|
#if MY_UCA_COMPILE_CONTRACTIONS
|
||||||
if (my_uca_needs_context_handling(scanner->level, currwc))
|
if (my_uca_needs_context_handling(scanner->level, currwc) &&
|
||||||
|
(cweight= my_uca_context_weight_find(scanner, currwc)))
|
||||||
{
|
{
|
||||||
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
|
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
|
||||||
if (cweight)
|
return weight;
|
||||||
{
|
continue; /* Ignorable contraction */
|
||||||
scanner->wbeg= cweight + 1;
|
|
||||||
return *cweight;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
scanner->page= 0;
|
scanner->page= 0;
|
||||||
scanner->code= (int) currwc;
|
scanner->code= (int) currwc;
|
||||||
scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
|
cweight= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
|
||||||
if (scanner->wbeg[0])
|
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
|
||||||
return *scanner->wbeg++;
|
return weight;
|
||||||
continue;
|
continue; /* Ignorable character */
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
@ -109,21 +103,15 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
|
|||||||
|
|
||||||
scanner->sbeg+= mblen;
|
scanner->sbeg+= mblen;
|
||||||
if (currwc > scanner->level->maxchar)
|
if (currwc > scanner->level->maxchar)
|
||||||
{
|
return my_uca_scanner_set_weight_outside_maxchar(scanner);
|
||||||
/* Return 0xFFFD as weight for all characters outside BMP */
|
|
||||||
scanner->wbeg= nochar;
|
|
||||||
return 0xFFFD;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if MY_UCA_COMPILE_CONTRACTIONS
|
#if MY_UCA_COMPILE_CONTRACTIONS
|
||||||
if (my_uca_needs_context_handling(scanner->level, currwc))
|
if (my_uca_needs_context_handling(scanner->level, currwc) &&
|
||||||
|
(cweight= my_uca_context_weight_find(scanner, currwc)))
|
||||||
{
|
{
|
||||||
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
|
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
|
||||||
if (cweight)
|
return weight;
|
||||||
{
|
continue; /* Ignorable contraction */
|
||||||
scanner->wbeg= cweight + 1;
|
|
||||||
return *cweight;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -136,11 +124,13 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
|
|||||||
return my_uca_scanner_next_implicit(scanner);
|
return my_uca_scanner_next_implicit(scanner);
|
||||||
|
|
||||||
/* Calculate pointer to w[0]'s weight, using page and offset */
|
/* Calculate pointer to w[0]'s weight, using page and offset */
|
||||||
scanner->wbeg= wpage +
|
cweight= wpage + scanner->code * scanner->level->lengths[scanner->page];
|
||||||
scanner->code * scanner->level->lengths[scanner->page];
|
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
|
||||||
} while (!scanner->wbeg[0]); /* Skip ignorable characters */
|
return weight;
|
||||||
|
continue; /* Ignorable character */
|
||||||
|
}
|
||||||
|
|
||||||
return *scanner->wbeg++;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user