/** @brief Find how many trailing bytes must be dropped so that a UTF-8 buffer
 *  cut at `length` bytes does not end part-way through a multi-byte character.
 *
 *  Only the last one to three bytes of input[0, length) are examined; the
 *  data is not otherwise validated. The decision is made purely from the
 *  lead-byte bit patterns, so the byte at input[length] is never read.
 *
 *  Unlike a fixed "look at the last three bytes" approach, every look-back is
 *  bounds-checked, so cuts at 1 or 2 bytes (narrow columns) are handled too
 *  instead of being reported as "nothing to trim".
 *
 *  @param input   Start of the UTF-8 data. May be null, in which case 0 is
 *                 returned.
 *  @param length  Intended size of the data after truncation, in bytes.
 *  @return        Number of bytes (0..3) the caller should subtract from
 *                 `length` to land on a character boundary.
 */
inline
uint8_t utf8_truncate_point(const char* input, size_t length)
{
    // Nothing to inspect: an empty result cannot split a character.
    if (!input || length == 0)
    {
        return 0;
    }

    // Work on unsigned bytes so the bit tests are not affected by the
    // signedness of plain char.
    const unsigned char* end =
        reinterpret_cast<const unsigned char*>(input) + length;
    const unsigned char last = end[-1];

    // 0xxxxxxx: the final byte is plain ASCII, the cut is already clean.
    if (!(last & 0x80))
    {
        return 0;
    }

    // 11xxxxxx: the final byte is the first byte of a new multi-byte
    // sequence whose continuation bytes were cut off.
    if (last & 0x40)
    {
        return 1;
    }

    // From here on the final byte is a continuation byte (10xxxxxx).

    // Previous byte is 111xxxxx, i.e. the lead of a 3 or 4 byte sequence of
    // which only two bytes survived the cut.
    if (length >= 2 && (end[-2] & 0xe0) == 0xe0)
    {
        return 2;
    }

    // Byte before that is 1111xxxx, i.e. the lead of a 4 byte sequence of
    // which only three bytes survived the cut.
    if (length >= 3 && (end[-3] & 0xf0) == 0xf0)
    {
        return 3;
    }

    // The trailing sequence is complete.
    return 0;
}
if (fieldLength > column.definedWidth) { - memcpy( charTmpBuf, field, column.definedWidth ); + uint8_t truncate_point = funcexp::utf8::utf8_truncate_point(field, column.definedWidth); + memcpy( charTmpBuf, field, column.definedWidth - truncate_point ); bufStats.satCount++; } else diff --git a/writeengine/dictionary/we_dctnry.cpp b/writeengine/dictionary/we_dctnry.cpp index 626ec9ea4..04f4a7ba1 100644 --- a/writeengine/dictionary/we_dctnry.cpp +++ b/writeengine/dictionary/we_dctnry.cpp @@ -47,6 +47,7 @@ using namespace BRM; #include "IDBPolicy.h" #include "cacheutils.h" using namespace idbdatafile; +#include "utils_utf8.h" namespace { @@ -731,7 +732,8 @@ int Dctnry::insertDctnry(const char* buf, // @Bug 2565: Truncate any strings longer than schema's column width if (curSig.size > m_colWidth) { - curSig.size = m_colWidth; + uint8_t truncate_point = funcexp::utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth); + curSig.size = m_colWidth - truncate_point; ++truncCount; }