// Determine how many trailing bytes must be dropped from a UTF-8 buffer so
// that cutting it at `length` bytes does not leave a partial multi-byte
// character at the end.
//
// The caller is expected to use this when the data continues past `length`
// (i.e. it is about to truncate), so a lead byte found in the final position
// is always treated as the start of a character that does not fit.
//
// input  - pointer to the start of the buffer; may be nullptr, in which case
//          0 is returned.
// length - the byte offset at which the caller intends to cut the buffer.
//
// Returns the number of bytes (0..3) to subtract from `length` to land on a
// character boundary. 0 means the cut at `length` needs no adjustment
// according to the checks below.
//
// Only the last three bytes before the cut are inspected; the input is not
// otherwise validated as UTF-8.
inline uint8_t utf8_truncate_point(const char* input, size_t length)
{
  if (input == nullptr || length == 0)
  {
    return 0;
  }

  // Work on unsigned bytes so that the bit tests below behave the same
  // whether plain char is signed or unsigned.
  const unsigned char* end = reinterpret_cast<const unsigned char*>(input) + length;
  const unsigned char last = end[-1];

  // High bit clear: the final byte is plain ASCII, nothing to trim.
  if ((last & 0x80) == 0)
  {
    return 0;
  }

  // 11xxxxxx: the final byte starts a new multi-byte sequence whose
  // continuation bytes lie beyond the cut, so drop that lead byte.
  if (last & 0x40)
  {
    return 1;
  }

  // From here on the final byte is a continuation byte (10xxxxxx).

  // 111xxxxx in the second-to-last position: lead byte of a 3- or 4-byte
  // sequence with only one continuation byte present. Checked only when the
  // buffer actually holds two bytes, so short buffers are handled too.
  if (length >= 2 && (end[-2] & 0xe0) == 0xe0)
  {
    return 2;
  }

  // 1111xxxx in the third-to-last position: lead byte of a 4-byte sequence
  // with only two continuation bytes present.
  if (length >= 3 && (end[-3] & 0xf0) == 0xf0)
  {
    return 3;
  }

  return 0;
}
int Dctnry::insertDctnry2(Signature& sig) * failure - it did not write the header to block ******************************************************************************/ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, char* tokenBuf, - long long& truncCount, const CHARSET_INFO* cs) + long long& truncCount, const CHARSET_INFO* cs, + const WriteEngine::ColType& weType) { #ifdef PROFILE Stats::startParseEvent(WE_STATS_PARSE_DCT); @@ -838,17 +840,32 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, if (cs->mbmaxlen > 1) { - const char* start = (const char*) curSig.signature; - const char* end = (const char*)(curSig.signature + curSig.size); - size_t numChars = cs->numchars(start, end); - size_t maxCharLength = m_colWidth / cs->mbmaxlen; - - if (numChars > maxCharLength) + // For TEXT columns, we truncate based on the number of bytes, + // and not based on the number of characters, as for CHAR/VARCHAR + // columns in the else block. 
+ if (weType == WriteEngine::WR_TEXT) { - MY_STRCOPY_STATUS status; - cs->well_formed_char_length(start, end, maxCharLength, &status); - curSig.size = status.m_source_end_pos - start; - truncCount++; + if (curSig.size > m_colWidth) + { + uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth); + curSig.size = m_colWidth - truncate_point; + truncCount++; + } + } + else + { + const char* start = (const char*) curSig.signature; + const char* end = (const char*)(curSig.signature + curSig.size); + size_t numChars = cs->numchars(start, end); + size_t maxCharLength = m_colWidth / cs->mbmaxlen; + + if (numChars > maxCharLength) + { + MY_STRCOPY_STATUS status; + cs->well_formed_char_length(start, end, maxCharLength, &status); + curSig.size = status.m_source_end_pos - start; + truncCount++; + } } } else // cs->mbmaxlen == 1 diff --git a/writeengine/dictionary/we_dctnry.h b/writeengine/dictionary/we_dctnry.h index 52aa14d37..6c5f69eaa 100644 --- a/writeengine/dictionary/we_dctnry.h +++ b/writeengine/dictionary/we_dctnry.h @@ -168,7 +168,8 @@ class Dctnry : public DbFileOp * @param tokenBuf - (output) list of tokens for the parsed strings */ EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, - char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs); + char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs, + const WriteEngine::ColType& weType); /** * @brief Update dictionary store with tokenized strings (for DDL/DML use)