1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-29 05:21:33 +03:00

MDEV-27042 UCA: Resetting contractions to ignorable does not work well

The weight scanner routine scanner_next() did not properly handle the cases
when a contraction produces no weights (is ignorable).

Adding a helper routine my_uca_scanner_set_weight() and using
it in all cases:

- A single ASCII character
- A contraction starting with an ASCII character
- A multi-byte character
- A contraction starting with a multi-byte character

Also adding two other helper routines:

- my_uca_scanner_next_expansion_weight()
- my_uca_scanner_set_weight_outside_maxchar()

to avoid using scanner->wbeg directly inside scanner_next().
This reduces the probability of similar future bugs.
This commit is contained in:
Alexander Barkov
2021-11-14 07:09:08 +04:00
parent 0a3d1d106a
commit f9ad8072cd
5 changed files with 114 additions and 36 deletions

View File

@ -34,6 +34,9 @@ Bar +7-912-800-80-01
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1'; SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
name phone name phone
Bar +7-912-800-80-01 Bar +7-912-800-80-01
SELECT * FROM t1 WHERE phone='tel.79128008001';
name phone
Bar +7-912-800-80-01
DROP TABLE t1; DROP TABLE t1;
show collation like 'utf8mb3_test_ci'; show collation like 'utf8mb3_test_ci';
Collation Charset Id Default Compiled Sortlen Collation Charset Id Default Compiled Sortlen
@ -3042,3 +3045,37 @@ SHOW COLLATION LIKE 'latin1_test_replace';
Collation Charset Id Default Compiled Sortlen Collation Charset Id Default Compiled Sortlen
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace; SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
ERROR HY000: Unknown collation: 'latin1_test_replace' ERROR HY000: Unknown collation: 'latin1_test_replace'
#
# MDEV-27042 UCA: Resetting contractions to ignorable does not work well
#
CREATE TABLE t1 (
phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
);
INSERT INTO t1 VALUES ('123');
INSERT INTO t1 VALUES ('tel.123');
INSERT INTO t1 VALUES ('tél.123');
INSERT INTO t1 VALUES ('tèl.123');
INSERT INTO t1 VALUES ('ťel.123');
INSERT INTO t1 VALUES ('ťèl.123');
INSERT INTO t1 VALUES ('tex.123');
SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone;
phone
123
tel.123
tél.123
ťel.123
SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone;
phone
tex.123
tèl.123
ťèl.123
SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone;
phone HEX(WEIGHT_STRING(phone))
123 0E2A0E2B0E2C
tel.123 0E2A0E2B0E2C
tél.123 0E2A0E2B0E2C
ťel.123 0E2A0E2B0E2C
tèl.123 10020E8B0F2E025D0E2A0E2B0E2C
ťèl.123 10020E8B0F2E025D0E2A0E2B0E2C
tex.123 10020E8B105A025D0E2A0E2B0E2C
DROP TABLE t1;

View File

@ -33,6 +33,7 @@ SELECT * FROM t1 ORDER BY phone;
SELECT * FROM t1 WHERE phone='+7(912)800-80-01'; SELECT * FROM t1 WHERE phone='+7(912)800-80-01';
SELECT * FROM t1 WHERE phone='79128008001'; SELECT * FROM t1 WHERE phone='79128008001';
SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1'; SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
SELECT * FROM t1 WHERE phone='tel.79128008001';
DROP TABLE t1; DROP TABLE t1;
show collation like 'utf8mb3_test_ci'; show collation like 'utf8mb3_test_ci';
@ -615,3 +616,23 @@ SELECT 'a' COLLATE utf8_czech_test_bad_w2;
SHOW COLLATION LIKE 'latin1_test_replace'; SHOW COLLATION LIKE 'latin1_test_replace';
--error ER_UNKNOWN_COLLATION --error ER_UNKNOWN_COLLATION
SELECT 'foo' = 'foo ' COLLATE latin1_test_replace; SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
--echo #
--echo # MDEV-27042 UCA: Resetting contractions to ignorable does not work well
--echo #
CREATE TABLE t1 (
phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
);
INSERT INTO t1 VALUES ('123');
INSERT INTO t1 VALUES ('tel.123');
INSERT INTO t1 VALUES ('tél.123');
INSERT INTO t1 VALUES ('tèl.123');
INSERT INTO t1 VALUES ('ťel.123');
INSERT INTO t1 VALUES ('ťèl.123');
INSERT INTO t1 VALUES ('tex.123');
SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone;
SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone;
SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone;
DROP TABLE t1;

View File

@ -9,6 +9,9 @@
<i>\u0029</i> <!-- right parenthesis --> <i>\u0029</i> <!-- right parenthesis -->
<i>\u002B</i> <!-- plus --> <i>\u002B</i> <!-- plus -->
<i>\u002D</i> <!-- hyphen --> <i>\u002D</i> <!-- hyphen -->
<i>tel.</i>
<i>tél.</i>
<i>ťel.</i>
</rules> </rules>
</collation> </collation>
<collation name="utf8mb3_test_ci" id="353"> <collation name="utf8mb3_test_ci" id="353">

View File

@ -31175,6 +31175,33 @@ static const uint16 nochar[]= {0,0};
#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64 #define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128 #define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
/*
  Fetch the next pending weight from the scanner, if there is one.

  Looks at the element scanner->wbeg currently points at:
  - non-zero: the weight is consumed (wbeg moves forward by one element)
    and returned;
  - zero: nothing is pending, wbeg is left where it is and 0 is returned.

  @param scanner  Scanner whose wbeg pointer is inspected and possibly advanced
  @return         The consumed weight, or 0 when no weight is pending
*/
static inline uint16
my_uca_scanner_next_expansion_weight(my_uca_scanner *scanner)
{
  uint16 pending= scanner->wbeg[0];
  if (pending != 0)
    scanner->wbeg++;
  return pending;
}
/*
  Start scanning a new weight string.

  Returns the first element of "weight" and leaves scanner->wbeg pointing
  at the second element, so that later calls can continue from there.

  Note that wbeg is advanced unconditionally, i.e. also when the first
  element is 0. A return value of 0 is how callers detect that the
  weight string is empty (an ignorable character or contraction).

  @param scanner  Scanner whose wbeg pointer is reset
  @param weight   Weight string to start scanning; must not be NULL
  @return         The first weight of the string (may be 0)
*/
static inline uint16
my_uca_scanner_set_weight(my_uca_scanner *scanner, const uint16 *weight)
{
  const uint16 head= weight[0];
  scanner->wbeg= &weight[1];
  return head;
}
/*
  Produce the weight for a character that has no entry in the weight
  tables of the current level (the caller uses this when the character
  code is greater than level->maxchar).

  Every such character gets the same single weight 0xFFFD.
  scanner->wbeg is pointed at the file-level "nochar" array, so no
  further weights are pending after the returned one.

  @param scanner  Scanner whose wbeg pointer is reset to "nochar"
  @return         Always 0xFFFD
*/
static inline uint16
my_uca_scanner_set_weight_outside_maxchar(my_uca_scanner *scanner)
{
  const uint16 replacement_weight= 0xFFFD;
  scanner->wbeg= nochar;
  return replacement_weight;
}
/********** Helper functions to handle contraction ************/ /********** Helper functions to handle contraction ************/

View File

@ -40,20 +40,16 @@
static inline int static inline int
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
{ {
/* uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
Check if the weights for the previous character have been if (weight)
already fully scanned. If yes, then get the next character and return weight; /* Next expansion weight found */
initialize wbeg and wlength to its weight string.
*/
if (scanner->wbeg[0]) /* More weights left from the previous step: */ for ( ; ; )
return *scanner->wbeg++; /* return the next weight from expansion */
do
{ {
const uint16 *wpage; const uint16 *wpage;
int mblen; int mblen;
my_wc_t currwc; my_wc_t currwc;
const uint16 *cweight;
/* Get next character */ /* Get next character */
#if MY_UCA_ASCII_OPTIMIZE #if MY_UCA_ASCII_OPTIMIZE
@ -64,23 +60,21 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
scanner->sbeg+= 1; scanner->sbeg+= 1;
#if MY_UCA_COMPILE_CONTRACTIONS #if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, currwc)) if (my_uca_needs_context_handling(scanner->level, currwc) &&
(cweight= my_uca_context_weight_find(scanner, currwc)))
{ {
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc); if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
if (cweight) return weight;
{ continue; /* Ignorable contraction */
scanner->wbeg= cweight + 1;
return *cweight;
}
} }
#endif #endif
scanner->page= 0; scanner->page= 0;
scanner->code= (int) currwc; scanner->code= (int) currwc;
scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0]; cweight= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
if (scanner->wbeg[0]) if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
return *scanner->wbeg++; return weight;
continue; continue; /* Ignorable character */
} }
else else
#endif #endif
@ -109,21 +103,15 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
scanner->sbeg+= mblen; scanner->sbeg+= mblen;
if (currwc > scanner->level->maxchar) if (currwc > scanner->level->maxchar)
{ return my_uca_scanner_set_weight_outside_maxchar(scanner);
/* Return 0xFFFD as weight for all characters outside BMP */
scanner->wbeg= nochar;
return 0xFFFD;
}
#if MY_UCA_COMPILE_CONTRACTIONS #if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, currwc)) if (my_uca_needs_context_handling(scanner->level, currwc) &&
(cweight= my_uca_context_weight_find(scanner, currwc)))
{ {
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc); if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
if (cweight) return weight;
{ continue; /* Ignorable contraction */
scanner->wbeg= cweight + 1;
return *cweight;
}
} }
#endif #endif
@ -136,11 +124,13 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
return my_uca_scanner_next_implicit(scanner); return my_uca_scanner_next_implicit(scanner);
/* Calculate pointer to w[0]'s weight, using page and offset */ /* Calculate pointer to w[0]'s weight, using page and offset */
scanner->wbeg= wpage + cweight= wpage + scanner->code * scanner->level->lengths[scanner->page];
scanner->code * scanner->level->lengths[scanner->page]; if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
} while (!scanner->wbeg[0]); /* Skip ignorable characters */ return weight;
continue; /* Ignorable character */
}
return *scanner->wbeg++; return 0;
} }