From 7f9c624626767445271b5bf6bde2081c7533a852 Mon Sep 17 00:00:00 2001
From: Gagan Goel <gagan.nith@gmail.com>
Date: Mon, 18 Sep 2023 13:55:24 -0400
Subject: [PATCH] MCOL-5573 Fix cpimport truncation of TEXT columns.

1. Restore the utf8_truncate_point() function in utils/common/utils_utf8.h
that I removed as part of the patch for MCOL-4931.

2. As per the definition of TEXT columns, the default column width represents
the maximum number of bytes that can be stored in the TEXT column, so the
effective maximum length in characters is smaller if the value contains
multi-byte characters. However, if the user explicitly specifies the length of
the TEXT column in a table DDL, such as TEXT(65535), then the DDL logic ensures
that enough bytes are allocated (up to a system maximum) to hold up to that many
characters (multi-byte characters if the charset for the column is multi-byte,
such as utf8mb3).
---
 utils/common/utils_utf8.h            | 27 +++++++++++++++++++
 writeengine/bulk/we_columninfo.cpp   |  2 +-
 writeengine/dictionary/we_dctnry.cpp | 39 ++++++++++++++++++++--------
 writeengine/dictionary/we_dctnry.h   |  3 ++-
 4 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/utils/common/utils_utf8.h b/utils/common/utils_utf8.h
index 7be105c5f..352bbb4f7 100644
--- a/utils/common/utils_utf8.h
+++ b/utils/common/utils_utf8.h
@@ -91,6 +91,33 @@ inline std::string wstring_to_utf8(const std::wstring& str)
   return ret;
 }
 
+inline uint8_t utf8_truncate_point(const char* input, size_t length)
+{
+  // Returns how many trailing bytes (0-3) to drop from input[0, length) so that a cut at
+  // `length` does not split a multi-byte UTF-8 character; lengths under 3 always yield 0.
+  if (length < 3)
+  {
+    return 0;
+  }
+
+  const unsigned char* b = (const unsigned char*)(input) + length - 3;
+
+  if (b[2] & 0x80)
+  {
+    // Last byte has bits 11xxxxxx, i.e. it starts a new multi-byte sequence: drop it
+    if (b[2] & 0x40)
+      return 1;
+    // Second-to-last byte has bits 111xxxxx (lead of a 3- or 4-byte sequence): drop 2
+    else if ((b[1] & 0xe0) == 0xe0)
+      return 2;
+    // Third-to-last byte has bits 1111xxxx (lead of a 4-byte sequence): drop 3
+    else if ((b[0] & 0xf0) == 0xf0)
+      return 3;
+  }
+
+  return 0;
+}
+
 int mcs_strcoll(const char* str1, const char* str2, const uint32_t charsetNumber);
 int mcs_strcoll(const char* str1, const uint32_t l1, const char* str2, const uint32_t l2,
                 const uint32_t charsetNumber);
diff --git a/writeengine/bulk/we_columninfo.cpp b/writeengine/bulk/we_columninfo.cpp
index fb763d1ea..7d59d6da9 100644
--- a/writeengine/bulk/we_columninfo.cpp
+++ b/writeengine/bulk/we_columninfo.cpp
@@ -1697,7 +1697,7 @@ int ColumnInfo::updateDctnryStore(char* buf, ColPosPair** pos, const int totalRo
   Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
 #endif
 
-  int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount, column.cs);
+  int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount, column.cs, column.weType);
 
   if (rc != NO_ERROR)
   {
diff --git a/writeengine/dictionary/we_dctnry.cpp b/writeengine/dictionary/we_dctnry.cpp
index ec3daf3f8..a128d21f7 100644
--- a/writeengine/dictionary/we_dctnry.cpp
+++ b/writeengine/dictionary/we_dctnry.cpp
@@ -49,6 +49,7 @@ using namespace BRM;
 #include "cacheutils.h"
 using namespace idbdatafile;
 #include "checks.h"
+#include "utils_utf8.h" // for utf8_truncate_point()
 
 namespace
 {
@@ -763,7 +764,8 @@ int Dctnry::insertDctnry2(Signature& sig)
  *    failure    - it did not  write the header to block
  ******************************************************************************/
 int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, char* tokenBuf,
-                         long long& truncCount, const CHARSET_INFO* cs)
+                         long long& truncCount, const CHARSET_INFO* cs,
+                         const WriteEngine::ColType& weType)
 {
 #ifdef PROFILE
   Stats::startParseEvent(WE_STATS_PARSE_DCT);
@@ -838,17 +840,32 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow,
 
     if (cs->mbmaxlen > 1)
     {
-      const char* start = (const char*) curSig.signature;
-      const char* end = (const char*)(curSig.signature + curSig.size);
-      size_t numChars = cs->numchars(start, end);
-      size_t maxCharLength = m_colWidth / cs->mbmaxlen;
-
-      if (numChars > maxCharLength)
+      // For TEXT columns, we truncate based on the number of bytes,
+      // and not based on the number of characters, as for CHAR/VARCHAR
+      // columns in the else block.
+      if (weType == WriteEngine::WR_TEXT)
       {
-        MY_STRCOPY_STATUS status;
-        cs->well_formed_char_length(start, end, maxCharLength, &status);
-        curSig.size = status.m_source_end_pos - start;
-        truncCount++;
+        if (curSig.size > m_colWidth)
+        {
+          uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
+          curSig.size = m_colWidth - truncate_point;
+          truncCount++;
+        }
+      }
+      else
+      {
+        const char* start = (const char*) curSig.signature;
+        const char* end = (const char*)(curSig.signature + curSig.size);
+        size_t numChars = cs->numchars(start, end);
+        size_t maxCharLength = m_colWidth / cs->mbmaxlen;
+
+        if (numChars > maxCharLength)
+        {
+          MY_STRCOPY_STATUS status;
+          cs->well_formed_char_length(start, end, maxCharLength, &status);
+          curSig.size = status.m_source_end_pos - start;
+          truncCount++;
+        }
       }
     }
     else // cs->mbmaxlen == 1
diff --git a/writeengine/dictionary/we_dctnry.h b/writeengine/dictionary/we_dctnry.h
index 52aa14d37..6c5f69eaa 100644
--- a/writeengine/dictionary/we_dctnry.h
+++ b/writeengine/dictionary/we_dctnry.h
@@ -168,7 +168,8 @@ class Dctnry : public DbFileOp
    * @param tokenBuf  - (output) list of tokens for the parsed strings
    */
   EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col,
-                          char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs);
+                          char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs,
+                          const WriteEngine::ColType& weType);
 
   /**
    * @brief Update dictionary store with tokenized strings (for DDL/DML use)