MCOL-444 Truncate UTF8 correctly

cpimport would truncate UTF-8 data partway through a multi-byte character, which caused problems for functions using that data. This patch calculates the correct truncation point when inserting the data.
2025-11-02 06:13:16 +03:00 · 2017-11-29 10:43:57 +00:00
parent 9b65a86ce2
commit 3d5bd3809c
3 changed files with 30 additions and 2 deletions
--- a/utils/funcexp/utils_utf8.h
+++ b/utils/funcexp/utils_utf8.h
@@ -195,6 +195,29 @@ std::string wstring_to_utf8 (const std::wstring& str)
    return std::string(outbuf, strmblen);
 }
 // Given the first `length` bytes of a UTF-8 buffer, report how many trailing
 // bytes must be dropped so that the data does not end in the middle of a
 // multi-byte character.
 //
 // input  - pointer to the start of the UTF-8 data; at least `length` bytes
 //          must be readable.
 // length - number of bytes the caller intends to keep.
 //
 // Returns 0 when the data already ends on a character boundary (or when
 // length is 0), otherwise 1, 2 or 3: the size of the incomplete trailing
 // sequence. Only the last three bytes are inspected, and look-back is bounded
 // by `length`, so short buffers (1 or 2 bytes) are handled as well.
 inline
 uint8_t utf8_truncate_point(const char* input, size_t length)
 {
    if (length == 0)
    {
        return 0;
    }
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(input);
    const unsigned char last = bytes[length - 1];
    // Plain ASCII byte: always a complete character.
    if (!(last & 0x80))
    {
        return 0;
    }
    // 11xxxxxx: lead byte of a new multi-byte sequence with no payload after it.
    if (last & 0x40)
    {
        return 1;
    }
    // From here on `last` is a continuation byte (10xxxxxx).
    if (length < 2)
    {
        return 0;
    }
    const unsigned char prev = bytes[length - 2];
    // Lead byte of a 3- or 4-byte sequence followed by a single continuation
    // byte: the sequence is incomplete.
    if ((prev & 0xe0) == 0xe0)
    {
        return 2;
    }
    if (length < 3)
    {
        return 0;
    }
    // Lead byte of a 4-byte sequence followed by exactly two continuation
    // bytes. Requiring `prev` to be a continuation byte avoids discarding
    // valid characters when the input is malformed.
    if ((prev & 0xc0) == 0x80 && (bytes[length - 3] & 0xf0) == 0xf0)
    {
        return 3;
    }
    return 0;
 }
 } //namespace utf8
 } //namespace funcexp
--- a/writeengine/bulk/we_bulkloadbuffer.cpp
+++ b/writeengine/bulk/we_bulkloadbuffer.cpp
@@ -41,6 +41,8 @@
 #include "joblisttypes.h"
 #include "utils_utf8.h"
 using namespace std;
 using namespace boost;
 using namespace execplan;
@@ -513,7 +515,8 @@ void BulkLoadBuffer::convert(char *field, int fieldLength,
                // on disk (e.g. 5 for a varchar(5) instead of 8).
                if (fieldLength > column.definedWidth)
                {
-                    memcpy( charTmpBuf, field, column.definedWidth );
+                    uint8_t truncate_point = funcexp::utf8::utf8_truncate_point(field, column.definedWidth);
                    memcpy( charTmpBuf, field, column.definedWidth - truncate_point );
                    bufStats.satCount++;
                }
                else
--- a/writeengine/dictionary/we_dctnry.cpp
+++ b/writeengine/dictionary/we_dctnry.cpp
@@ -47,6 +47,7 @@ using namespace BRM;
 #include "IDBPolicy.h"
 #include "cacheutils.h"
 using namespace idbdatafile;
 #include "utils_utf8.h"
 namespace
 {
@@ -731,7 +732,8 @@ int Dctnry::insertDctnry(const char* buf,
        // @Bug 2565: Truncate any strings longer than schema's column width
        if (curSig.size > m_colWidth)
        {
-            curSig.size = m_colWidth;
+            uint8_t truncate_point = funcexp::utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
            curSig.size = m_colWidth - truncate_point;
            ++truncCount;
        }