From 3fc6a8b832fd152f1fbabff08273e0223c0ff0ab Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Thu, 31 Mar 2016 14:22:25 +0400 Subject: [PATCH] MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified --- include/m_ctype.h | 4 + mysql-test/r/ctype_gbk.result | 21 +++++ mysql-test/r/ctype_utf8.result | 27 +++++++ mysql-test/std_data/loaddata/mdev8711.txt | 1 + mysql-test/std_data/loaddata/mdev9824.txt | 1 + mysql-test/t/ctype_gbk.test | 19 +++++ mysql-test/t/ctype_utf8.test | 19 +++++ mysys/charset.c | 1 + sql/sql_load.cc | 97 +++++++++++++++++------ 9 files changed, 167 insertions(+), 23 deletions(-) create mode 100644 mysql-test/std_data/loaddata/mdev8711.txt create mode 100644 mysql-test/std_data/loaddata/mdev9824.txt diff --git a/include/m_ctype.h b/include/m_ctype.h index d22a0b1f987..c892d576102 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256]; /* A helper macros for "need at least n bytes" */ #define MY_CS_TOOSMALLN(n) (-100-(n)) +#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */ +#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL) + + #define MY_SEQ_INTTAIL 1 #define MY_SEQ_SPACES 2 diff --git a/mysql-test/r/ctype_gbk.result b/mysql-test/r/ctype_gbk.result index b5774548d85..e454347592c 100644 --- a/mysql-test/r/ctype_gbk.result +++ b/mysql-test/r/ctype_gbk.result @@ -5926,3 +5926,24 @@ Warning 1300 Invalid gb2312 character string: '\xA3A' # # End of 10.1 tests # +# +# Start of 10.2 tests +# +# +# MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases +# +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk); +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@'; +SELECT HEX(a) FROM t1; +HEX(a) +B04061B041 +B042 +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES; +SELECT HEX(a) FROM t1; +HEX(a) +B042 +DROP TABLE t1; +# +# End of 10.2 tests +# diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 816fe654028..f52e08a676f 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -10401,3 +10401,30 @@ SET @@SQL_MODE=default; # # End of 10.1 tests # +# +# Start of 10.2 tests +# +# +# MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified +# +CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8); +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё'; +Warnings: +Warning 1638 Non-ASCII separator arguments are not fully supported +SELECT c1 FROM t1; +c1 +a +b +c +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES; +Warnings: +Warning 1638 Non-ASCII separator arguments are not fully supported +SELECT c1 FROM t1; +c1 +b +c +DROP TABLE t1; +# +# End of 10.2 tests +# diff --git a/mysql-test/std_data/loaddata/mdev8711.txt b/mysql-test/std_data/loaddata/mdev8711.txt new file mode 100644 index 00000000000..49296a7134d --- /dev/null +++ b/mysql-test/std_data/loaddata/mdev8711.txt @@ -0,0 +1 @@ +°@a°A@°B@ \ No newline at end of file diff --git a/mysql-test/std_data/loaddata/mdev9824.txt b/mysql-test/std_data/loaddata/mdev9824.txt new file mode 100644 index 00000000000..7050e081844 --- /dev/null +++ b/mysql-test/std_data/loaddata/mdev9824.txt @@ -0,0 +1 @@ +aёёbёёcёё \ No newline at end of file diff --git a/mysql-test/t/ctype_gbk.test b/mysql-test/t/ctype_gbk.test index 07e73cdf745..ae66dbba4b5 100644 --- a/mysql-test/t/ctype_gbk.test +++ b/mysql-test/t/ctype_gbk.test @@ -435,3 +435,22 @@ SELECT HEX(CONVERT(CAST(0xA341 AS CHAR CHARACTER SET gb2312) USING utf8)); --echo # --echo # End of 10.1 tests --echo # + +--echo # +--echo # Start of 10.2 tests +--echo # + +--echo # +--echo # MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases +--echo # +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk); +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@'; +SELECT HEX(a) FROM t1; +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES; +SELECT HEX(a) FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.2 tests +--echo # diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index 85ffed943cf..f3a9e63b57d 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -1950,3 +1950,22 @@ SET @@SQL_MODE=default; --echo # --echo # End of 10.1 tests --echo # + +--echo # +--echo # Start of 10.2 tests +--echo # + +--echo # +--echo # MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified +--echo # +CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8); +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё'; +SELECT c1 FROM t1; +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES; +SELECT c1 FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.2 tests +--echo # diff --git a/mysys/charset.c b/mysys/charset.c index ad3eb78ae0e..3c134dc388e 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -545,6 +545,7 @@ static void init_available_charsets(void) { if (*cs) { + DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN); if (cs[0]->ctype) if (init_state_maps(*cs)) *cs= NULL; diff --git a/sql/sql_load.cc b/sql/sql_load.cc index d43eb884abd..f1c29203f3e 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -79,6 +79,14 @@ class READ_INFO { NET *io_net; int level; /* for load xml */ + bool getbyte(char *to) + { + int chr= GET; + if (chr == my_b_EOF) + return (eof= true); + *to= chr; + return false; + } public: bool error,line_cuted,found_null,enclosed; uchar *row_start, /* Found row starts here */ @@ -1706,33 +1714,76 @@ int READ_INFO::next_line() return 0; // No lines for (;;) { - int chr = GET; -#ifdef USE_MB - if (my_mbcharlen(read_charset, chr) > 1) - { - for (uint i=1; - chr != my_b_EOF && i