mirror of
https://github.com/MariaDB/server.git
synced 2025-08-01 03:47:19 +03:00
MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
This commit is contained in:
@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
|
|||||||
/* A helper macro for "need at least n bytes" */
|
/* A helper macro for "need at least n bytes" */
|
||||||
#define MY_CS_TOOSMALLN(n) (-100-(n))
|
#define MY_CS_TOOSMALLN(n) (-100-(n))
|
||||||
|
|
||||||
|
#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */
|
||||||
|
#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
|
||||||
|
|
||||||
|
|
||||||
#define MY_SEQ_INTTAIL 1
|
#define MY_SEQ_INTTAIL 1
|
||||||
#define MY_SEQ_SPACES 2
|
#define MY_SEQ_SPACES 2
|
||||||
|
|
||||||
|
@ -5926,3 +5926,24 @@ Warning 1300 Invalid gb2312 character string: '\xA3A'
|
|||||||
#
|
#
|
||||||
# End of 10.1 tests
|
# End of 10.1 tests
|
||||||
#
|
#
|
||||||
|
#
|
||||||
|
# Start of 10.2 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
|
||||||
|
#
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
|
||||||
|
SELECT HEX(a) FROM t1;
|
||||||
|
HEX(a)
|
||||||
|
B04061B041
|
||||||
|
B042
|
||||||
|
DELETE FROM t1;
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
|
||||||
|
SELECT HEX(a) FROM t1;
|
||||||
|
HEX(a)
|
||||||
|
B042
|
||||||
|
DROP TABLE t1;
|
||||||
|
#
|
||||||
|
# End of 10.2 tests
|
||||||
|
#
|
||||||
|
@ -10401,3 +10401,30 @@ SET @@SQL_MODE=default;
|
|||||||
#
|
#
|
||||||
# End of 10.1 tests
|
# End of 10.1 tests
|
||||||
#
|
#
|
||||||
|
#
|
||||||
|
# Start of 10.2 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
|
||||||
|
#
|
||||||
|
CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё';
|
||||||
|
Warnings:
|
||||||
|
Warning 1638 Non-ASCII separator arguments are not fully supported
|
||||||
|
SELECT c1 FROM t1;
|
||||||
|
c1
|
||||||
|
a
|
||||||
|
b
|
||||||
|
c
|
||||||
|
DELETE FROM t1;
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES;
|
||||||
|
Warnings:
|
||||||
|
Warning 1638 Non-ASCII separator arguments are not fully supported
|
||||||
|
SELECT c1 FROM t1;
|
||||||
|
c1
|
||||||
|
b
|
||||||
|
c
|
||||||
|
DROP TABLE t1;
|
||||||
|
#
|
||||||
|
# End of 10.2 tests
|
||||||
|
#
|
||||||
|
1
mysql-test/std_data/loaddata/mdev8711.txt
Normal file
1
mysql-test/std_data/loaddata/mdev8711.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
<EFBFBD>@a<>A@<40>B@
|
1
mysql-test/std_data/loaddata/mdev9824.txt
Normal file
1
mysql-test/std_data/loaddata/mdev9824.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
aёёbёёcёё
|
@ -435,3 +435,22 @@ SELECT HEX(CONVERT(CAST(0xA341 AS CHAR CHARACTER SET gb2312) USING utf8));
|
|||||||
--echo #
|
--echo #
|
||||||
--echo # End of 10.1 tests
|
--echo # End of 10.1 tests
|
||||||
--echo #
|
--echo #
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # Start of 10.2 tests
|
||||||
|
--echo #
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
|
||||||
|
--echo #
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
|
||||||
|
SELECT HEX(a) FROM t1;
|
||||||
|
DELETE FROM t1;
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
|
||||||
|
SELECT HEX(a) FROM t1;
|
||||||
|
DROP TABLE t1;
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # End of 10.2 tests
|
||||||
|
--echo #
|
||||||
|
@ -1950,3 +1950,22 @@ SET @@SQL_MODE=default;
|
|||||||
--echo #
|
--echo #
|
||||||
--echo # End of 10.1 tests
|
--echo # End of 10.1 tests
|
||||||
--echo #
|
--echo #
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # Start of 10.2 tests
|
||||||
|
--echo #
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
|
||||||
|
--echo #
|
||||||
|
CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё';
|
||||||
|
SELECT c1 FROM t1;
|
||||||
|
DELETE FROM t1;
|
||||||
|
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES;
|
||||||
|
SELECT c1 FROM t1;
|
||||||
|
DROP TABLE t1;
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # End of 10.2 tests
|
||||||
|
--echo #
|
||||||
|
@ -545,6 +545,7 @@ static void init_available_charsets(void)
|
|||||||
{
|
{
|
||||||
if (*cs)
|
if (*cs)
|
||||||
{
|
{
|
||||||
|
DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN);
|
||||||
if (cs[0]->ctype)
|
if (cs[0]->ctype)
|
||||||
if (init_state_maps(*cs))
|
if (init_state_maps(*cs))
|
||||||
*cs= NULL;
|
*cs= NULL;
|
||||||
|
@ -79,6 +79,14 @@ class READ_INFO {
|
|||||||
NET *io_net;
|
NET *io_net;
|
||||||
int level; /* for load xml */
|
int level; /* for load xml */
|
||||||
|
|
||||||
|
/*
  Fetch one value via GET and hand it back to the caller as a byte.

  @param to  Receives the fetched value, narrowed to char, when the
             fetch did not hit end of input. Left unmodified on EOF.

  @return    true  - GET yielded my_b_EOF; the eof member is set to true.
             false - a byte was stored in *to.
*/
bool getbyte(char *to)
|
||||||
|
{
|
||||||
|
int chr= GET;
|
||||||
|
if (chr == my_b_EOF)
|
||||||
|
return (eof= true);
|
||||||
|
*to= chr;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
public:
|
public:
|
||||||
bool error,line_cuted,found_null,enclosed;
|
bool error,line_cuted,found_null,enclosed;
|
||||||
uchar *row_start, /* Found row starts here */
|
uchar *row_start, /* Found row starts here */
|
||||||
@ -1706,33 +1714,76 @@ int READ_INFO::next_line()
|
|||||||
return 0; // No lines
|
return 0; // No lines
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
int chr = GET;
|
int chlen;
|
||||||
#ifdef USE_MB
|
char buf[MY_CS_MBMAXLEN];
|
||||||
if (my_mbcharlen(read_charset, chr) > 1)
|
|
||||||
{
|
if (getbyte(&buf[0]))
|
||||||
for (uint i=1;
|
return 1; // EOF
|
||||||
chr != my_b_EOF && i<my_mbcharlen(read_charset, chr);
|
|
||||||
i++)
|
if (use_mb(read_charset) &&
|
||||||
chr = GET;
|
(chlen= my_charlen(read_charset, buf, buf + 1)) != 1)
|
||||||
if (chr == escape_char)
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (chr == my_b_EOF)
|
|
||||||
{
|
|
||||||
eof=1;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (chr == escape_char)
|
|
||||||
{
|
{
|
||||||
line_cuted=1;
|
uint i;
|
||||||
if (GET == my_b_EOF)
|
for (i= 1; MY_CS_IS_TOOSMALL(chlen); )
|
||||||
return 1;
|
{
|
||||||
|
DBUG_ASSERT(i < sizeof(buf));
|
||||||
|
DBUG_ASSERT(chlen != 1);
|
||||||
|
if (getbyte(&buf[i++]))
|
||||||
|
return 1; // EOF
|
||||||
|
chlen= my_charlen(read_charset, buf, buf + i);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Either a complete multi-byte sequence,
|
||||||
|
or a broken byte sequence was found.
|
||||||
|
Check if the sequence is a prefix of the "LINES TERMINATED BY" string.
|
||||||
|
*/
|
||||||
|
if ((uchar) buf[0] == line_term_char && i <= line_term_length &&
|
||||||
|
!memcmp(buf, line_term_ptr, i))
|
||||||
|
{
|
||||||
|
if (line_term_length == i)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
We found a "LINES TERMINATED BY" string that consists
|
||||||
|
of a single multi-byte character.
|
||||||
|
*/
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
buf[] is a prefix of "LINES TERMINATED BY".
|
||||||
|
Now check the suffix. Length of the suffix of line_term_ptr
|
||||||
|
that still needs to be checked is (line_term_length - i).
|
||||||
|
Note, READ_INFO::terminator() assumes that the leftmost byte of the
|
||||||
|
argument is already scanned from the file and is checked to
|
||||||
|
be a known prefix (e.g. against line_term_char).
|
||||||
|
So we need to pass one extra byte.
|
||||||
|
*/
|
||||||
|
if (terminator(line_term_ptr + i - 1, line_term_length - i + 1))
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
Here we have a good multi-byte sequence or a broken byte sequence,
|
||||||
|
and the sequence is not equal to "LINES TERMINATED BY".
|
||||||
|
No need to check for escape_char, because:
|
||||||
|
- multi-byte escape characters in "FIELDS ESCAPED BY" are not
|
||||||
|
supported and are rejected at parse time.
|
||||||
|
- broken single-byte sequences are not recognized as escapes,
|
||||||
|
they are considered to be a part of the data and are converted to
|
||||||
|
question marks.
|
||||||
|
*/
|
||||||
|
line_cuted= true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (chr == line_term_char && terminator(line_term_ptr,line_term_length))
|
if (buf[0] == escape_char)
|
||||||
|
{
|
||||||
|
line_cuted= true;
|
||||||
|
if (GET == my_b_EOF)
|
||||||
|
return 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (buf[0] == line_term_char && terminator(line_term_ptr,line_term_length))
|
||||||
return 0;
|
return 0;
|
||||||
line_cuted=1;
|
line_cuted= true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user