1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-01 06:46:55 +03:00

MCOL-444 Truncate UTF8 correctly

cpimport would truncate UTF-8 data partway through a multi-byte character,
which caused problems for functions using that data. This patch
calculates the correct truncation point when inserting the data.
This commit is contained in:
Andrew Hutchings
2017-11-29 10:43:57 +00:00
parent 9b65a86ce2
commit 3d5bd3809c
3 changed files with 30 additions and 2 deletions

View File

@ -195,6 +195,29 @@ std::string wstring_to_utf8 (const std::wstring& str)
return std::string(outbuf, strmblen);
}
// Returns the number of trailing bytes (0-3) that should be dropped from the
// buffer [input, input + length) so that it does not end part-way through a
// multi-byte UTF-8 character.
//
// Only the last three bytes (or fewer, when the buffer is shorter) are
// inspected; the rest of the buffer is not validated.
//
// input  - pointer to the first byte of the (already cut) buffer
// length - number of bytes in the buffer
//
// A return value of 0 means the buffer can be kept as-is.
inline
uint8_t utf8_truncate_point(const char* input, size_t length)
{
    if (!input || length == 0)
    {
        return 0;
    }

    // One-past-the-end pointer: tail[-1] is the last byte, tail[-2] the one
    // before it, and so on. Every look-back below is bounded by length so
    // buffers shorter than three bytes are handled without reading before
    // the start of the buffer.
    const unsigned char* tail =
        reinterpret_cast<const unsigned char*>(input) + length;

    // Last byte is plain ASCII (0xxxxxxx): nothing is cut in half.
    if ((tail[-1] & 0x80) == 0)
    {
        return 0;
    }

    // Last byte is the lead byte (11xxxxxx) of a new multi-byte sequence
    // whose continuation bytes are missing.
    if (tail[-1] & 0x40)
    {
        return 1;
    }

    // Last byte is a continuation byte (10xxxxxx).

    // Preceded by the lead byte of a 3- or 4-byte sequence (111xxxxx): only
    // two bytes of that sequence are present.
    if (length >= 2 && (tail[-2] & 0xe0) == 0xe0)
    {
        return 2;
    }

    // Lead byte of a 4-byte sequence (1111xxxx) two positions back: only
    // three bytes of that sequence are present.
    if (length >= 3 && (tail[-3] & 0xf0) == 0xf0)
    {
        return 3;
    }

    return 0;
}
} //namespace utf8
} //namespace funcexp