You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
MCOL-444 Truncate UTF8 correctly
cpimport would truncate UTF-8 data partway through a multi-byte character, which would cause problems for functions using that data. This patch calculates the correct truncation point when inserting the data.
This commit is contained in:
@ -195,6 +195,29 @@ std::string wstring_to_utf8 (const std::wstring& str)
|
|||||||
return std::string(outbuf, strmblen);
|
return std::string(outbuf, strmblen);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Determine how many trailing bytes must be dropped from input[0, length)
// so that the buffer does not end in the middle of a UTF-8 multi-byte
// character.
//
// input  - pointer to the start of the byte buffer being truncated.
// length - number of bytes that would be kept (the truncation width).
//
// Returns 0 when the last byte is ASCII or no incomplete sequence is
// detected at the tail; otherwise returns 1, 2 or 3: the number of bytes
// belonging to the incomplete trailing sequence, which the caller should
// subtract from length.
//
// Only the last three bytes are inspected, since the longest UTF-8 sequence
// is four bytes and therefore at most three bytes of it can be dangling.
// Buffers shorter than three bytes are handled by bounds-checking each
// look-back rather than by giving up.
inline
uint8_t utf8_truncate_point(const char* input, size_t length)
{
    if (input == nullptr || length == 0)
    {
        return 0;
    }

    const unsigned char* end =
        reinterpret_cast<const unsigned char*>(input) + length;
    const unsigned char last = end[-1];

    // Plain ASCII in the final position: nothing is split.
    if (!(last & 0x80))
    {
        return 0;
    }

    // 11xxxxxx: the final byte starts a new multi-byte sequence whose
    // continuation bytes lie beyond the cut.
    if (last & 0x40)
    {
        return 1;
    }

    // The final byte is a continuation byte (10xxxxxx). If the byte before
    // it is a lead byte of a 3- or 4-byte sequence (111xxxxx), only two
    // bytes of that sequence fit.
    if (length >= 2 && (end[-2] & 0xe0) == 0xe0)
    {
        return 2;
    }

    // A 4-byte lead (1111xxxx) three bytes from the end: only three of its
    // four bytes fit.
    if (length >= 3 && (end[-3] & 0xf0) == 0xf0)
    {
        return 3;
    }

    return 0;
}
|
||||||
|
|
||||||
} //namespace utf8
|
} //namespace utf8
|
||||||
} //namespace funcexp
|
} //namespace funcexp
|
||||||
|
|
||||||
|
@ -41,6 +41,8 @@
|
|||||||
|
|
||||||
#include "joblisttypes.h"
|
#include "joblisttypes.h"
|
||||||
|
|
||||||
|
#include "utils_utf8.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace boost;
|
using namespace boost;
|
||||||
using namespace execplan;
|
using namespace execplan;
|
||||||
@ -513,7 +515,8 @@ void BulkLoadBuffer::convert(char *field, int fieldLength,
|
|||||||
// on disk (e.g. 5 for a varchar(5) instead of 8).
|
// on disk (e.g. 5 for a varchar(5) instead of 8).
|
||||||
if (fieldLength > column.definedWidth)
|
if (fieldLength > column.definedWidth)
|
||||||
{
|
{
|
||||||
memcpy( charTmpBuf, field, column.definedWidth );
|
uint8_t truncate_point = funcexp::utf8::utf8_truncate_point(field, column.definedWidth);
|
||||||
|
memcpy( charTmpBuf, field, column.definedWidth - truncate_point );
|
||||||
bufStats.satCount++;
|
bufStats.satCount++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -47,6 +47,7 @@ using namespace BRM;
|
|||||||
#include "IDBPolicy.h"
|
#include "IDBPolicy.h"
|
||||||
#include "cacheutils.h"
|
#include "cacheutils.h"
|
||||||
using namespace idbdatafile;
|
using namespace idbdatafile;
|
||||||
|
#include "utils_utf8.h"
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
@ -731,7 +732,8 @@ int Dctnry::insertDctnry(const char* buf,
|
|||||||
// @Bug 2565: Truncate any strings longer than schema's column width
|
// @Bug 2565: Truncate any strings longer than schema's column width
|
||||||
if (curSig.size > m_colWidth)
|
if (curSig.size > m_colWidth)
|
||||||
{
|
{
|
||||||
curSig.size = m_colWidth;
|
uint8_t truncate_point = funcexp::utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
|
||||||
|
curSig.size = m_colWidth - truncate_point;
|
||||||
++truncCount;
|
++truncCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user