You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
MCOL-444 Truncate UTF8 correctly
cpimport would truncate UTF-8 data halfway through a multibyte character, which would cause problems for functions using that data. This patch calculates the correct truncation point when inserting the data.
This commit is contained in:
@ -195,6 +195,29 @@ std::string wstring_to_utf8 (const std::wstring& str)
|
||||
return std::string(outbuf, strmblen);
|
||||
}
|
||||
|
||||
// Determine how many trailing bytes must be dropped from a byte-truncated
// UTF-8 buffer so that it does not end in the middle of a multibyte character.
//
// input  - start of the (already length-limited) buffer; may be null.
// length - number of bytes of input that are to be kept before adjustment.
//
// Returns the number of bytes (0..3) to remove from the end of the buffer.
// The caller is expected to subtract this value from length.  Returns 0 when
// the buffer is empty/null or already ends on a character boundary.
//
// Only the last three bytes are inspected; the function does not validate
// that the rest of the buffer is well-formed UTF-8.
inline
uint8_t utf8_truncate_point(const char* input, size_t length)
{
    if (!input || length == 0)
    {
        return 0;
    }

    const unsigned char* end = reinterpret_cast<const unsigned char*>(input) + length;
    const unsigned char last = end[-1];

    // Plain ASCII byte: the buffer ends on a character boundary.
    if (!(last & 0x80))
    {
        return 0;
    }

    // 11xxxxxx: the last byte starts a new multibyte sequence whose
    // continuation bytes have been cut off, so drop that lead byte.
    if (last & 0x40)
    {
        return 1;
    }

    // The last byte is a continuation byte (10xxxxxx).  Look back for a lead
    // byte that announces more continuation bytes than are present.  Each
    // look-behind is bounds-checked so buffers shorter than three bytes are
    // handled instead of being skipped.

    // Lead of a 3- or 4-byte sequence followed by only one continuation byte.
    if (length >= 2 && (end[-2] & 0xe0) == 0xe0)
    {
        return 2;
    }

    // Lead of a 4-byte sequence followed by only two continuation bytes.
    if (length >= 3 && (end[-3] & 0xf0) == 0xf0)
    {
        return 3;
    }

    return 0;
}
|
||||
|
||||
} //namespace utf8
|
||||
} //namespace funcexp
|
||||
|
||||
|
@ -41,6 +41,8 @@
|
||||
|
||||
#include "joblisttypes.h"
|
||||
|
||||
#include "utils_utf8.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace boost;
|
||||
using namespace execplan;
|
||||
@ -513,7 +515,8 @@ void BulkLoadBuffer::convert(char *field, int fieldLength,
|
||||
// on disk (e.g. 5 for a varchar(5) instead of 8).
|
||||
if (fieldLength > column.definedWidth)
|
||||
{
|
||||
memcpy( charTmpBuf, field, column.definedWidth );
|
||||
uint8_t truncate_point = funcexp::utf8::utf8_truncate_point(field, column.definedWidth);
|
||||
memcpy( charTmpBuf, field, column.definedWidth - truncate_point );
|
||||
bufStats.satCount++;
|
||||
}
|
||||
else
|
||||
|
@ -47,6 +47,7 @@ using namespace BRM;
|
||||
#include "IDBPolicy.h"
|
||||
#include "cacheutils.h"
|
||||
using namespace idbdatafile;
|
||||
#include "utils_utf8.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -731,7 +732,8 @@ int Dctnry::insertDctnry(const char* buf,
|
||||
// @Bug 2565: Truncate any strings longer than schema's column width
|
||||
if (curSig.size > m_colWidth)
|
||||
{
|
||||
curSig.size = m_colWidth;
|
||||
uint8_t truncate_point = funcexp::utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
|
||||
curSig.size = m_colWidth - truncate_point;
|
||||
++truncCount;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user