diff --git a/libmariadb/my_charset.c b/libmariadb/my_charset.c index 5d37b518..f6390bfe 100644 --- a/libmariadb/my_charset.c +++ b/libmariadb/my_charset.c @@ -51,6 +51,7 @@ #endif #include #include +#include #include @@ -1121,13 +1122,55 @@ int madb_get_windows_cp(const char *charset) #endif /* }}} */ + +/* {{{ map_charset_name + Changes the charset name into something iconv understands, if necessary. + Another purpose is to avoid BOMs in the result string by adding BE if no endianness is given, + e.g. UTF16 does not work for iconv, while UTF-16 does. + */ +static void map_charset_name(const char *cs_name, my_bool target_cs, char *buffer, size_t buff_len) +{ + char *ptr= buffer, digits[3], endianness[3]= "BE"; + + if (sscanf(cs_name, "UTF%2[0-9]%2[LBE]", digits, endianness) > 0) /* sscanf returns EOF (-1) if input ends before the first conversion; that is not a match */ + { + /* We have at least the digits. Endianness is either the default (BE) or what was found in the string */ + ptr= strnmov(ptr, "UTF-", buff_len); + ptr= strnmov(ptr, digits, buff_len - (ptr - buffer)); + ptr= strnmov(ptr, endianness, buff_len - (ptr - buffer)); + } + else + { + /* Not a UTF<digits> name - copy as is */ + ptr= strnmov(ptr, cs_name, buff_len); + } + + if (target_cs) + { + strnmov(ptr, "//TRANSLIT", buff_len - (ptr - buffer)); + } +} +/* }}} */ + +/* {{{ mariadb_convert_string + Converts string from one charset to another, and writes converted string to given buffer + @param[in] from + @param[in/out] from_len + @param[in] from_cs + @param[out] to + @param[in/out] to_len + @param[in] to_cs + @param[out] errorcode + + @return -1 in case of error, bytes used in the "to" buffer, otherwise + */ size_t STDCALL mariadb_convert_string(const char *from, size_t *from_len, CHARSET_INFO *from_cs, char *to, size_t *to_len, CHARSET_INFO *to_cs, int *errorcode) { iconv_t conv= 0; size_t rc= -1; size_t save_len= *to_len; - char to_encoding[128]; + char to_encoding[128], from_encoding[128]; *errorcode= 0; @@ -1138,14 +1181,11 @@ size_t STDCALL mariadb_convert_string(const char *from, size_t *from_len, CHARSE *errorcode= 
EINVAL; return rc; } - /* UTF16 does not work form iconv, while UTF-16 does. - Besides we don't want iconv to generate BOM, thus we used either UTF-16LE or BE by default - TODO: Need to do the same for UTF-32(at leased re BOM) */ - snprintf(to_encoding, 128, "%s//TRANSLIT", strncmp(to_cs->encoding, "UTF16", 5) == 0 - ? (strcmp(to_cs->encoding + 5, "LE") == 0 ? "UTF-16LE" : "UTF-16BE") - : to_cs->encoding); - if ((conv= iconv_open(to_encoding, from_cs->encoding)) == (iconv_t)-1) + map_charset_name(to_cs->encoding, 1, to_encoding, sizeof(to_encoding)); + map_charset_name(from_cs->encoding, 0, from_encoding, sizeof(from_encoding)); + + if ((conv= iconv_open(to_encoding, from_encoding)) == (iconv_t)-1) { *errorcode= errno; goto error; @@ -1161,4 +1201,5 @@ error: iconv_close(conv); return rc; } +/* }}} */ diff --git a/unittest/libmariadb/charset.c b/unittest/libmariadb/charset.c index 1c6dda8c..bec5539e 100644 --- a/unittest/libmariadb/charset.c +++ b/unittest/libmariadb/charset.c @@ -656,6 +656,75 @@ static int test_bug_54100(MYSQL *mysql) } +/* We need this internal function for the test */ +CHARSET_INFO * mysql_find_charset_name(const char *name); + +static int test_utf16_utf32_noboms(MYSQL *mysql) +{ + char *csname[]= {"utf16", "utf16le", "utf32", "utf8"}; + CHARSET_INFO *csinfo[sizeof(csname)/sizeof(char*)]; + + const int UTF8= sizeof(csname)/sizeof(char*) - 1; + + unsigned char in_string[][8]= {"\xd8\x02\xdc\x60\0", /* utf16(be) */ + "\x02\xd8\x60\xdc\0", /* utf16le */ + "\x00\x01\x08\x60\0\0\0", /* utf32(be) */ + "\xF0\x90\xA1\xA0" }; /* utf8 */ + size_t in_oct_len[]= {6, 6, 8, 5}; + + char buffer[8], as_hex[2 * 8 + 1]; /* mysql_hex_string needs 2 chars per input byte plus the terminating NUL */ + int i, error; + size_t rc, in_len, out_len; + + for (i= 0; i < sizeof(csname)/sizeof(char*); ++i) + { + csinfo[i]= mysql_find_charset_name(csname[i]); + + if (csinfo[i] == NULL) + { + diag("Could not get cs info for %s", csname[i]); + return FAIL; + } + } + + for (i= 0; i < UTF8; ++i) + { + in_len= in_oct_len[i]; + out_len= sizeof(buffer); + + 
diag("Converting %s->%s", csname[i], csname[UTF8]); + rc= mariadb_convert_string(in_string[i], &in_len, csinfo[i], buffer, &out_len, csinfo[UTF8], &error); + + FAIL_IF(rc == -1, "Conversion failed"); + FAIL_IF(rc != in_oct_len[UTF8], "Incorrect number of written bytes"); + + if (memcmp(buffer, in_string[UTF8], rc) != 0) + { + mysql_hex_string(as_hex, buffer, rc); + diag("Converted string(%s) does not match the expected one", as_hex); + return FAIL; + } + + in_len= in_oct_len[UTF8]; + out_len= sizeof(buffer); + + diag("Converting %s->%s", csname[UTF8], csname[i]); + rc= mariadb_convert_string(in_string[UTF8], &in_len, csinfo[UTF8], buffer, &out_len, csinfo[i], &error); + + FAIL_IF(rc==-1, "Conversion failed"); + FAIL_IF(rc != in_oct_len[i], "Incorrect number of written bytes"); + + if (memcmp(buffer, in_string[i], rc) != 0) + { + mysql_hex_string(as_hex, buffer, rc); + diag("Converted string(%s) does not match the expected one", as_hex); + return FAIL; + } + } + + return OK; +} + struct my_tests_st my_tests[] = { {"bug_8378: mysql_real_escape with gbk", bug_8378, TEST_CONNECTION_NEW, 0, opt_bug8378, NULL}, {"test_client_character_set", test_client_character_set, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, @@ -667,6 +736,7 @@ struct my_tests_st my_tests[] = { {"test_bug30472", test_bug30472, TEST_CONNECTION_NEW, 0, NULL, NULL}, {"test_ps_i18n", test_ps_i18n, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {"test_bug_54100", test_bug_54100, TEST_CONNECTION_NEW, 0, NULL, NULL}, + {"test_utf16_utf32_noboms", test_utf16_utf32_noboms, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {NULL, NULL, 0, 0, NULL, 0} };