
MCOL-4931 Make cpimport charset-aware. (#2938)

1. Extend the following CalpontSystemCatalog member functions to
   set CalpontSystemCatalog::ColType::charsetNumber, following the
   MCOL-5005 system catalog update that added the charset number to
   calpontsys.syscolumn:
     CalpontSystemCatalog::lookupOID
     CalpontSystemCatalog::colType
     CalpontSystemCatalog::columnRIDs
     CalpontSystemCatalog::getSchemaInfo

2. Update cpimport to use the CHARSET_INFO object associated with the
   charset number retrieved from the system catalog for dictionary and
   non-dictionary CHAR/VARCHAR/TEXT columns, and truncate long strings
   that exceed the target column's character length (see the truncation
   sketch below).

3. Add MTR test cases.
Author: Gagan Goel
Date: 2023-09-05 10:17:20 -04:00
Committed by: GitHub
Parent: 5b4f06bf0d
Commit: 931f2b36a1
12 changed files with 211 additions and 72 deletions
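
For reference, a minimal sketch of the truncation rule that both the bulk
buffer path and the dictionary path now share, assuming a CHARSET_INFO*
resolved from the catalog charset number and a column width given in bytes
(definedWidth / m_colWidth in the patch); the helper name
truncateToColumnWidth and its parameters are illustrative, not part of the
patch:

    #include <cstddef>
    #include "collation.h"  // CHARSET_INFO, MY_STRCOPY_STATUS (as used by the patch)

    // Returns the byte length to keep from `field` so that it fits within
    // `definedWidth` bytes without ever splitting a multi-byte character.
    // `satCount` is incremented only when truncation actually happens.
    size_t truncateToColumnWidth(const CHARSET_INFO* cs, const char* field,
                                 size_t fieldLength, size_t definedWidth,
                                 long long& satCount)
    {
      if (cs->mbmaxlen > 1)
      {
        const char* start = field;
        const char* end = field + fieldLength;
        size_t numChars = cs->numchars(start, end);          // characters in the value
        size_t maxCharLength = definedWidth / cs->mbmaxlen;  // characters that fit

        if (numChars > maxCharLength)
        {
          MY_STRCOPY_STATUS status;
          cs->well_formed_char_length(start, end, maxCharLength, &status);
          satCount++;
          return status.m_source_end_pos - start;  // bytes of the kept prefix
        }
      }
      else if (fieldLength > definedWidth)  // single-byte charset: plain byte cap
      {
        satCount++;
        return definedWidth;
      }

      return fieldLength;
    }

well_formed_char_length() walks at most maxCharLength characters and records
where it stopped in status.m_source_end_pos, so the kept prefix always ends on
a character boundary; this is what replaces the UTF-8-only
utf8::utf8_truncate_point() heuristic removed by the patch.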


@@ -48,6 +48,7 @@
#include "MonitorProcMem.h"
#include "dataconvert.h"
#include "mcsconfig.h"
#include "mariadb_my_sys.h"
using namespace std;
using namespace WriteEngine;
@@ -1002,6 +1003,9 @@ int main(int argc, char** argv)
{
setupSignalHandlers();
// Initialize the charset library
MY_INIT(argv[0]);
// Set locale language
const char* pLoc = setlocale(LC_ALL, "");
if (pLoc)
@@ -1316,6 +1320,9 @@ int main(int argc, char** argv)
rc = ERR_UNKNOWN;
}
// Free up resources allocated by MY_INIT() above.
my_end(0);
//--------------------------------------------------------------------------
// Log end of job to INFO log
//--------------------------------------------------------------------------
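
The MY_INIT()/my_end() pair added above is what makes the charset library
usable from a standalone binary like cpimport; a minimal sketch of that
lifecycle, assuming only the mariadb_my_sys.h header the patch already
includes (the work in between is illustrative):

    #include "mariadb_my_sys.h"  // MY_INIT(), my_end()

    int main(int argc, char** argv)
    {
      // Initialize mysys and the charset library before any CHARSET_INFO lookups.
      MY_INIT(argv[0]);

      // ... charset-aware work (e.g. parsing and truncating input rows) ...

      // Release everything MY_INIT() allocated.
      my_end(0);
      return 0;
    }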


@@ -43,8 +43,6 @@
#include "joblisttypes.h"
#include "utils_utf8.h" // utf8_truncate_point()
using namespace std;
using namespace boost;
using namespace execplan;
@@ -515,14 +513,32 @@ void BulkLoadBuffer::convert(char* field, int fieldLength, bool nullFlag, unsign
// from storing characters beyond the column's defined width.
// It contains the column definition width rather than the bytes
// on disk (e.g. 5 for a varchar(5) instead of 8).
if (fieldLength > column.definedWidth)
if (column.cs->mbmaxlen > 1)
{
uint8_t truncate_point = utf8::utf8_truncate_point(field, column.definedWidth);
memcpy(charTmpBuf, field, column.definedWidth - truncate_point);
bufStats.satCount++;
const CHARSET_INFO* cs = column.cs;
const char* start = (const char*) field;
const char* end = (const char*)(field + fieldLength);
size_t numChars = cs->numchars(start, end);
size_t maxCharLength = column.definedWidth / cs->mbmaxlen;
if (numChars > maxCharLength)
{
MY_STRCOPY_STATUS status;
cs->well_formed_char_length(start, end, maxCharLength, &status);
fieldLength = status.m_source_end_pos - start;
bufStats.satCount++;
}
}
else
memcpy(charTmpBuf, field, fieldLength);
else // cs->mbmaxlen == 1
{
if (fieldLength > column.definedWidth)
{
fieldLength = column.definedWidth;
bufStats.satCount++;
}
}
memcpy(charTmpBuf, field, fieldLength);
}
// Swap byte order before comparing character string


@@ -1697,7 +1697,7 @@ int ColumnInfo::updateDctnryStore(char* buf, ColPosPair** pos, const int totalRo
Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif
int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount);
int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount, column.cs);
if (rc != NO_ERROR)
{


@@ -48,7 +48,6 @@ using namespace BRM;
#include "IDBPolicy.h"
#include "cacheutils.h"
using namespace idbdatafile;
#include "utils_utf8.h" // utf8_truncate_point()
#include "checks.h"
namespace
@@ -764,7 +763,7 @@ int Dctnry::insertDctnry2(Signature& sig)
* failure - it did not write the header to block
******************************************************************************/
int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, char* tokenBuf,
long long& truncCount)
long long& truncCount, const CHARSET_INFO* cs)
{
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT);
@@ -837,12 +836,28 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow,
curSig.signature = (unsigned char*)pIn;
}
// @Bug 2565: Truncate any strings longer than schema's column width
if (curSig.size > m_colWidth)
if (cs->mbmaxlen > 1)
{
uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
curSig.size = m_colWidth - truncate_point;
++truncCount;
const char* start = (const char*) curSig.signature;
const char* end = (const char*)(curSig.signature + curSig.size);
size_t numChars = cs->numchars(start, end);
size_t maxCharLength = m_colWidth / cs->mbmaxlen;
if (numChars > maxCharLength)
{
MY_STRCOPY_STATUS status;
cs->well_formed_char_length(start, end, maxCharLength, &status);
curSig.size = status.m_source_end_pos - start;
truncCount++;
}
}
else // cs->mbmaxlen == 1
{
if (curSig.size > m_colWidth)
{
curSig.size = m_colWidth;
truncCount++;
}
}
//...Search for the string in our string cache


@@ -168,7 +168,7 @@ class Dctnry : public DbFileOp
* @param tokenBuf - (output) list of tokens for the parsed strings
*/
EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col,
char* tokenBuf, long long& truncCount);
char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs);
/**
* @brief Update dictionary store with tokenized strings (for DDL/DML use)


@@ -135,22 +135,6 @@ class DctnryStore : public DbFileOp
*/
EXPORT const int updateDctnryStore(unsigned char* sigValue, int& sigSize, Token& token);
/**
* @brief Update dictionary store with tokenized strings (for Bulk use)
*
* @param buf - bulk buffer containing strings to be parsed
* @param pos - list of offsets into buf
* @param totalRow - total number of rows in buf
* @param col - the column to be parsed from buf
* @param colWidth - width of the dictionary column being parsed
* @param tokenBuf - (output) list of tokens for the parsed strings
*/
const int updateDctnryStore(const char* buf, ColPosPair** pos, const int totalRow, const int col,
const int colWidth, char* tokenBuf)
{
return (m_dctnry.insertDctnry(buf, pos, totalRow, col, colWidth, tokenBuf));
}
/**
* @brief TransId related function
*


@@ -40,6 +40,7 @@
#include "IDBDataFile.h"
#include "IDBPolicy.h"
#include "nullstring.h"
#include "collation.h" // For CHARSET_INFO struct
#undef EXPORT
#undef DELETE
@@ -410,6 +411,7 @@ struct JobColumn /** @brief Job Column Structure */
double fDefaultDbl; /** @brief Dbl/Flt column default */
int128_t fDefaultWideDecimal; /** @brief Wide decimal column default */
utils::NullString fDefaultChr; /** @brief Char column default */
const CHARSET_INFO* cs; /** @brief character set info for the column */
JobColumn()
: mapOid(0)
, dataType(execplan::CalpontSystemCatalog::INT)
@@ -435,6 +437,7 @@ struct JobColumn /** @brief Job Column Structure */
, fDefaultUInt(0)
, fDefaultDbl(0.0)
, fDefaultWideDecimal(0)
, cs(nullptr)
{
}
JobColumn(const std::string& colName_, OID mapOid_, const std::string& typeName_,
@@ -466,6 +469,7 @@ struct JobColumn /** @brief Job Column Structure */
, fDefaultUInt(defaultUInt_)
, fDefaultDbl(0.0)
, fDefaultWideDecimal(0)
, cs(nullptr)
{
dctnry.fCompressionType = dctnryCompressionType_;
}


@@ -871,6 +871,15 @@ void XMLJob::fillInXMLDataAsLoaded(execplan::CalpontSystemCatalog::RIDList& colR
col.compressionType = colType.compressionType;
col.dctnry.fCompressionType = colType.compressionType;
if (colType.charsetNumber != 0)
{
col.cs = &datatypes::Charset(colType.charsetNumber).getCharset();
}
else
{
col.cs = &my_charset_latin1;
}
if (colType.autoincrement)
col.autoIncFlag = true;
else
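
A sketch of the charset-resolution step from the last hunk on its own,
assuming ColumnStore's datatypes::Charset wrapper exactly as the hunk uses it
(the helper name resolveColumnCharset is illustrative):

    #include <cstdint>
    #include "collation.h"  // datatypes::Charset, CHARSET_INFO, my_charset_latin1

    // Map a catalog charset number to a CHARSET_INFO, falling back to latin1
    // when the catalog stores no charset number (0).
    const CHARSET_INFO* resolveColumnCharset(uint32_t charsetNumber)
    {
      if (charsetNumber != 0)
        return &datatypes::Charset(charsetNumber).getCharset();

      return &my_charset_latin1;
    }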