From 3d5bd3809ca472ba9b5e751b7109df405f94c73b Mon Sep 17 00:00:00 2001
From: Andrew Hutchings <andrew@linuxjedi.co.uk>
Date: Wed, 29 Nov 2017 10:43:57 +0000
Subject: [PATCH] MCOL-444 Truncate UTF8 correctly

cpimport would truncate UTF-8 data halfway through a multi-byte character,
which would cause problems for functions using that data. This patch
calculates the correct truncation point when inserting the data.
---
 utils/funcexp/utils_utf8.h             | 23 +++++++++++++++++++++++
 writeengine/bulk/we_bulkloadbuffer.cpp |  5 ++++-
 writeengine/dictionary/we_dctnry.cpp   |  4 +++-
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/utils/funcexp/utils_utf8.h b/utils/funcexp/utils_utf8.h
index d7a86415d..8055a2580 100644
--- a/utils/funcexp/utils_utf8.h
+++ b/utils/funcexp/utils_utf8.h
@@ -195,6 +195,29 @@ std::string wstring_to_utf8 (const std::wstring& str)
     return std::string(outbuf, strmblen);
 }
 
+// Returns how many trailing bytes (0-3) to drop so input[0..length) does not end
+// inside a UTF-8 multi-byte character. Buffers shorter than 3 bytes work too.
+inline uint8_t utf8_truncate_point(const char* input, size_t length)
+{
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(input);
+
+    // A character is at most 4 bytes, so the lead byte of an incomplete one is
+    // among the last 3 bytes. Never read before the start of the buffer.
+    for (size_t back = 1; back <= 3 && back <= length; ++back)
+    {
+        const unsigned char c = bytes[length - back];
+
+        if ((c & 0xc0) == 0x80) continue;  // continuation byte: keep walking back
+        if (c < 0x80) return 0;            // ASCII byte: no character is cut
+
+        // Lead byte: 11110xxx starts 4 bytes, 1110xxxx 3 bytes, 110xxxxx 2 bytes.
+        const size_t need = (c >= 0xf0) ? 4 : (c >= 0xe0) ? 3 : 2;
+        // Drop the character only when fewer bytes than it needs are present.
+        return static_cast<uint8_t>((need > back) ? back : 0);
+    }
+    return 0;  // no lead byte within reach: no incomplete character detected
+}
+
 } //namespace utf8
 } //namespace funcexp
 
diff --git a/writeengine/bulk/we_bulkloadbuffer.cpp b/writeengine/bulk/we_bulkloadbuffer.cpp
index 9f75c6873..b1902e98f 100644
--- a/writeengine/bulk/we_bulkloadbuffer.cpp
+++ b/writeengine/bulk/we_bulkloadbuffer.cpp
@@ -41,6 +41,8 @@
 
 #include "joblisttypes.h"
 
+#include "utils_utf8.h"
+
 using namespace std;
 using namespace boost;
 using namespace execplan;
@@ -513,7 +515,8 @@ void BulkLoadBuffer::convert(char *field, int fieldLength,
                 // on disk (e.g. 5 for a varchar(5) instead of 8).
                 if (fieldLength > column.definedWidth)
                 {
-                    memcpy( charTmpBuf, field, column.definedWidth );
+                    uint8_t truncate_point = funcexp::utf8::utf8_truncate_point(field, column.definedWidth);
+                    memcpy( charTmpBuf, field, column.definedWidth - truncate_point );
                     bufStats.satCount++;
                 }
                 else
diff --git a/writeengine/dictionary/we_dctnry.cpp b/writeengine/dictionary/we_dctnry.cpp
index 626ec9ea4..04f4a7ba1 100644
--- a/writeengine/dictionary/we_dctnry.cpp
+++ b/writeengine/dictionary/we_dctnry.cpp
@@ -47,6 +47,7 @@ using namespace BRM;
 #include "IDBPolicy.h"
 #include "cacheutils.h"
 using namespace idbdatafile;
+#include "utils_utf8.h"
 
 namespace
 {
@@ -731,7 +732,8 @@ int Dctnry::insertDctnry(const char* buf,
         // @Bug 2565: Truncate any strings longer than schema's column width
         if (curSig.size > m_colWidth)
         {
-            curSig.size = m_colWidth;
+            uint8_t truncate_point = funcexp::utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
+            curSig.size = m_colWidth - truncate_point;
             ++truncCount;
         }