1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-444 Truncate UTF8 correctly

cpimport would truncate UTF-8 data halfway through a multi-byte character,
which would cause problems for functions using that data. This patch
calculates the correct truncation point when inserting the data.
This commit is contained in:
Andrew Hutchings
2017-11-29 10:43:57 +00:00
parent 9b65a86ce2
commit 3d5bd3809c
3 changed files with 30 additions and 2 deletions

View File

@ -195,6 +195,29 @@ std::string wstring_to_utf8 (const std::wstring& str)
return std::string(outbuf, strmblen); return std::string(outbuf, strmblen);
} }
/** @brief Compute how many trailing bytes to drop so a UTF-8 buffer cut at
 *         `length` bytes does not end in the middle of a multi-byte character.
 *
 * Only the last (at most three) bytes before `length` are inspected: a UTF-8
 * sequence is at most four bytes long, so an incomplete one occupies at most
 * three. The scan walks backwards over continuation bytes (10xxxxxx) to the
 * nearest lead byte (11xxxxxx) and compares the sequence length announced by
 * that lead byte with the number of bytes actually available.
 *
 * @param input  Start of the byte buffer; at least `length` bytes must be
 *               readable. Need not be NUL-terminated.
 * @param length Intended truncation width in bytes.
 * @return Number of bytes (0..3) to subtract from `length` so that the
 *         buffer ends on a character boundary. Returns 0 when the last
 *         character is complete, is ASCII, or when no lead byte is found
 *         within the last three bytes (malformed input is left untouched).
 */
inline
uint8_t utf8_truncate_point(const char* input, size_t length)
{
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(input);

    // Never look further back than the buffer itself; this also makes widths
    // of 1 and 2 bytes work instead of being skipped.
    const size_t maxBack = (length < 3) ? length : 3;

    for (size_t back = 1; back <= maxBack; ++back)
    {
        const unsigned char c = bytes[length - back];

        if ((c & 0x80) == 0)
        {
            // ASCII byte: whatever follows it is not part of a sequence that
            // started before the cut, so there is nothing to trim.
            return 0;
        }

        if ((c & 0xC0) == 0xC0)
        {
            // Lead byte: derive the sequence length it announces.
            size_t expected = 2;

            if ((c & 0xF0) == 0xF0)
                expected = 4;
            else if ((c & 0xE0) == 0xE0)
                expected = 3;

            // `back` bytes of this sequence fit before the cut; drop them all
            // if the sequence needs more than that.
            return (expected > back) ? static_cast<uint8_t>(back) : 0;
        }

        // Continuation byte: keep walking back towards the lead byte.
    }

    return 0;
}
} //namespace utf8 } //namespace utf8
} //namespace funcexp } //namespace funcexp

View File

@ -41,6 +41,8 @@
#include "joblisttypes.h" #include "joblisttypes.h"
#include "utils_utf8.h"
using namespace std; using namespace std;
using namespace boost; using namespace boost;
using namespace execplan; using namespace execplan;
@ -513,7 +515,8 @@ void BulkLoadBuffer::convert(char *field, int fieldLength,
// on disk (e.g. 5 for a varchar(5) instead of 8). // on disk (e.g. 5 for a varchar(5) instead of 8).
if (fieldLength > column.definedWidth) if (fieldLength > column.definedWidth)
{ {
memcpy( charTmpBuf, field, column.definedWidth ); uint8_t truncate_point = funcexp::utf8::utf8_truncate_point(field, column.definedWidth);
memcpy( charTmpBuf, field, column.definedWidth - truncate_point );
bufStats.satCount++; bufStats.satCount++;
} }
else else

View File

@ -47,6 +47,7 @@ using namespace BRM;
#include "IDBPolicy.h" #include "IDBPolicy.h"
#include "cacheutils.h" #include "cacheutils.h"
using namespace idbdatafile; using namespace idbdatafile;
#include "utils_utf8.h"
namespace namespace
{ {
@ -731,7 +732,8 @@ int Dctnry::insertDctnry(const char* buf,
// @Bug 2565: Truncate any strings longer than schema's column width // @Bug 2565: Truncate any strings longer than schema's column width
if (curSig.size > m_colWidth) if (curSig.size > m_colWidth)
{ {
curSig.size = m_colWidth; uint8_t truncate_point = funcexp::utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
curSig.size = m_colWidth - truncate_point;
++truncCount; ++truncCount;
} }