From 931f2b36a104b002161feaa263effe3c4bc2e619 Mon Sep 17 00:00:00 2001 From: Gagan Goel Date: Tue, 5 Sep 2023 10:17:20 -0400 Subject: [PATCH] MCOL-4931 Make cpimport charset-aware. (#2938) 1. Extend the following CalpontSystemCatalog member functions to set CalpontSystemCatalog::ColType::charsetNumber, after the system catalog update to add charset number to calpontsys.syscolumn in MCOL-5005: CalpontSystemCatalog::lookupOID CalpontSystemCatalog::colType CalpontSystemCatalog::columnRIDs CalpontSystemCatalog::getSchemaInfo 2. Update cpimport to use the CHARSET_INFO object associated with the charset number retrieved from the system catalog, for a dictionary/non-dictionary CHAR/VARCHAR/TEXT column, to truncate long strings that exceed the target column character length. 3. Add MTR test cases. --- dbcon/execplan/calpontsystemcatalog.cpp | 54 ++++++++++++++---- .../columnstore/bugfixes/mcol-4931.result | 44 ++++++++++++++ .../columnstore/bugfixes/mcol-4931.test | 57 +++++++++++++++++++ utils/common/utils_utf8.h | 27 --------- writeengine/bulk/cpimport.cpp | 7 +++ writeengine/bulk/we_bulkloadbuffer.cpp | 32 ++++++++--- writeengine/bulk/we_columninfo.cpp | 2 +- writeengine/dictionary/we_dctnry.cpp | 29 +++++++--- writeengine/dictionary/we_dctnry.h | 2 +- writeengine/dictionary/we_dctnrystore.h | 16 ------ writeengine/shared/we_type.h | 4 ++ writeengine/xml/we_xmljob.cpp | 9 +++ 12 files changed, 211 insertions(+), 72 deletions(-) create mode 100644 mysql-test/columnstore/bugfixes/mcol-4931.result create mode 100644 mysql-test/columnstore/bugfixes/mcol-4931.test diff --git a/dbcon/execplan/calpontsystemcatalog.cpp b/dbcon/execplan/calpontsystemcatalog.cpp index 1b9bad261..c577c2631 100644 --- a/dbcon/execplan/calpontsystemcatalog.cpp +++ b/dbcon/execplan/calpontsystemcatalog.cpp @@ -553,8 +553,9 @@ CalpontSystemCatalog::OID CalpontSystemCatalog::lookupOID(const TableColName& ta string autoincrement = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + AUTOINC_COL; string nextVal = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + NEXTVALUE_COL; string nullable = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + NULLABLE_COL; + string charsetnum = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + CHARSETNUM_COL; - SimpleColumn* col[17]; + SimpleColumn* col[18]; col[0] = new SimpleColumn(columnlength, fSessionID); col[1] = new SimpleColumn(objectid, fSessionID); col[2] = new SimpleColumn(datatype, fSessionID); @@ -572,6 +573,7 @@ CalpontSystemCatalog::OID CalpontSystemCatalog::lookupOID(const TableColName& ta col[14] = new SimpleColumn(autoincrement, fSessionID); col[15] = new SimpleColumn(nextVal, fSessionID); col[16] = new SimpleColumn(nullable, fSessionID); + col[17] = new SimpleColumn(charsetnum, fSessionID); SRCP srcp; srcp.reset(col[0]); @@ -609,13 +611,15 @@ CalpontSystemCatalog::OID CalpontSystemCatalog::lookupOID(const TableColName& ta colMap.insert(CMVT_(nextVal, srcp)); srcp.reset(col[16]); colMap.insert(CMVT_(nullable, srcp)); + srcp.reset(col[17]); + colMap.insert(CMVT_(charsetnum, srcp)); csep.columnMapNonStatic(colMap); // ignore returnedcolumn, because it's not read by Joblist for now csep.returnedCols(returnedColumnList); - OID oid[17]; + OID oid[18]; - for (int i = 0; i < 17; i++) + for (int i = 0; i < 18; i++) oid[i] = col[i]->oid(); // Filters @@ -709,6 +713,8 @@ CalpontSystemCatalog::OID CalpontSystemCatalog::lookupOID(const TableColName& ta ct.constraintType = NOTNULL_CONSTRAINT; } } + else if ((*it)->ColumnOID() == oid[17]) + ct.charsetNumber = ((*it)->GetData(0)); else if ((*it)->ColumnOID() == DICTOID_SYSCOLUMN_DEFAULTVAL) { ct.defaultValue = ((*it)->GetStringData(0)); @@ -1077,8 +1083,9 @@ const CalpontSystemCatalog::ColType CalpontSystemCatalog::colType(const OID& Oid string compressionType = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + COMPRESSIONTYPE_COL; string autoincrement = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + AUTOINC_COL; string nextvalue = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + NEXTVALUE_COL; + string charsetnum = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + CHARSETNUM_COL; - SimpleColumn* col[17]; + SimpleColumn* col[18]; col[0] = new SimpleColumn(columnlength, fSessionID); col[1] = new SimpleColumn(objectid, fSessionID); col[2] = new SimpleColumn(datatype, fSessionID); @@ -1096,6 +1103,7 @@ const CalpontSystemCatalog::ColType CalpontSystemCatalog::colType(const OID& Oid col[14] = new SimpleColumn(compressionType, fSessionID); col[15] = new SimpleColumn(autoincrement, fSessionID); col[16] = new SimpleColumn(nextvalue, fSessionID); + col[17] = new SimpleColumn(charsetnum, fSessionID); SRCP srcp; srcp.reset(col[0]); @@ -1134,14 +1142,16 @@ const CalpontSystemCatalog::ColType CalpontSystemCatalog::colType(const OID& Oid colMap.insert(CMVT_(autoincrement, srcp)); srcp.reset(col[16]); colMap.insert(CMVT_(nextvalue, srcp)); + srcp.reset(col[17]); + colMap.insert(CMVT_(charsetnum, srcp)); csep.columnMapNonStatic(colMap); // ignore returnedcolumn, because it's not read by Joblist for now csep.returnedCols(returnedColumnList); - OID oid[17]; + OID oid[18]; - for (int i = 0; i < 17; i++) + for (int i = 0; i < 18; i++) oid[i] = col[i]->oid(); // Filters @@ -1231,6 +1241,8 @@ const CalpontSystemCatalog::ColType CalpontSystemCatalog::colType(const OID& Oid } else if ((*it)->ColumnOID() == oid[16]) ct.nextvalue = ((*it)->GetData(0)); + else if ((*it)->ColumnOID() == oid[17]) + ct.charsetNumber = ((*it)->GetData(0)); ct.columnOID = Oid; } @@ -3055,8 +3067,9 @@ const CalpontSystemCatalog::RIDList CalpontSystemCatalog::columnRIDs(const Table string compressiontype = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + COMPRESSIONTYPE_COL; string autoIncrement = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + AUTOINC_COL; string nextVal = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + NEXTVALUE_COL; + string charsetnum = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + CHARSETNUM_COL; - SimpleColumn* col[17]; + SimpleColumn* col[18]; col[0] = new SimpleColumn(columnlength, fSessionID); col[1] = new SimpleColumn(objectid, fSessionID); col[2] = new SimpleColumn(datatype, fSessionID); @@ -3074,6 +3087,7 @@ const CalpontSystemCatalog::RIDList CalpontSystemCatalog::columnRIDs(const Table col[14] = new SimpleColumn(compressiontype, fSessionID); col[15] = new SimpleColumn(autoIncrement, fSessionID); col[16] = new SimpleColumn(nextVal, fSessionID); + col[17] = new SimpleColumn(charsetnum, fSessionID); SRCP srcp; srcp.reset(col[0]); @@ -3111,15 +3125,17 @@ const CalpontSystemCatalog::RIDList CalpontSystemCatalog::columnRIDs(const Table colMap.insert(CMVT_(autoIncrement, srcp)); srcp.reset(col[16]); colMap.insert(CMVT_(nextVal, srcp)); + srcp.reset(col[17]); + colMap.insert(CMVT_(charsetnum, srcp)); csep.columnMapNonStatic(colMap); srcp.reset(col[1]->clone()); returnedColumnList.push_back(srcp); csep.returnedCols(returnedColumnList); - OID oid[17]; + OID oid[18]; - for (int i = 0; i < 17; i++) + for (int i = 0; i < 18; i++) oid[i] = col[i]->oid(); oid[12] = DICTOID_SYSCOLUMN_COLNAME; @@ -3303,6 +3319,11 @@ const CalpontSystemCatalog::RIDList CalpontSystemCatalog::columnRIDs(const Table for (int i = 0; i < (*it)->dataCount(); i++) ctList[i].nextvalue = ((*it)->GetData(i)); } + else if ((*it)->ColumnOID() == oid[17]) + { + for (int i = 0; i < (*it)->dataCount(); i++) + ctList[i].charsetNumber = ((*it)->GetData(i)); + } } // MCOL-895 sort ctList, we can't specify an ORDER BY to do this yet @@ -5546,8 +5567,9 @@ void CalpontSystemCatalog::getSchemaInfo(const string& in_schema, int lower_case string compressiontype = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + COMPRESSIONTYPE_COL; string autoinc = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + AUTOINC_COL; string nextval = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + NEXTVALUE_COL; + string charsetnum = CALPONT_SCHEMA + "." + SYSCOLUMN_TABLE + "." + CHARSETNUM_COL; - SimpleColumn* col[17]; + SimpleColumn* col[18]; col[0] = new SimpleColumn(columnlength, fSessionID); col[1] = new SimpleColumn(objectid, fSessionID); col[2] = new SimpleColumn(datatype, fSessionID); @@ -5565,6 +5587,7 @@ void CalpontSystemCatalog::getSchemaInfo(const string& in_schema, int lower_case col[14] = new SimpleColumn(compressiontype, fSessionID); col[15] = new SimpleColumn(autoinc, fSessionID); col[16] = new SimpleColumn(nextval, fSessionID); + col[17] = new SimpleColumn(charsetnum, fSessionID); SRCP srcp; srcp.reset(col[0]); @@ -5603,15 +5626,17 @@ void CalpontSystemCatalog::getSchemaInfo(const string& in_schema, int lower_case colMap.insert(CMVT_(autoinc, srcp)); srcp.reset(col[16]); colMap.insert(CMVT_(nextval, srcp)); + srcp.reset(col[17]); + colMap.insert(CMVT_(charsetnum, srcp)); csep.columnMapNonStatic(colMap); srcp.reset(col[1]->clone()); returnedColumnList.push_back(srcp); csep.returnedCols(returnedColumnList); - OID oid[17]; + OID oid[18]; - for (int i = 0; i < 17; i++) + for (int i = 0; i < 18; i++) oid[i] = col[i]->oid(); oid[12] = DICTOID_SYSCOLUMN_COLNAME; @@ -5805,6 +5830,11 @@ void CalpontSystemCatalog::getSchemaInfo(const string& in_schema, int lower_case for (int i = 0; i < (*it)->dataCount(); i++) ctList[i].nextvalue = ((*it)->GetData(i)); } + else if ((*it)->ColumnOID() == oid[17]) + { + for (int i = 0; i < (*it)->dataCount(); i++) + ctList[i].charsetNumber = ((*it)->GetData(i)); + } } // populate colinfo cache diff --git a/mysql-test/columnstore/bugfixes/mcol-4931.result b/mysql-test/columnstore/bugfixes/mcol-4931.result new file mode 100644 index 000000000..dc8b47483 --- /dev/null +++ b/mysql-test/columnstore/bugfixes/mcol-4931.result @@ -0,0 +1,44 @@ +DROP DATABASE IF EXISTS mcol_4931; +CREATE DATABASE mcol_4931; +USE mcol_4931; +CREATE TABLE t1 ( +a VARCHAR(15) collate 'utf8mb4_croatian_ci', +b CHAR(15) collate 'utf8mb4_croatian_ci', +c VARCHAR(2), +d CHAR(2) +)ENGINE=columnstore default charset=utf8mb4; +CREATE TABLE t2 ( +a VARCHAR(15) collate 'latin2_hungarian_ci', +b CHAR(15) collate 'latin2_hungarian_ci', +c VARCHAR(2), +d CHAR(2) +)ENGINE=columnstore default charset=latin2; +LOAD DATA INFILE "DATADIR/mcol4931_1.txt" IGNORE INTO TABLE t1 charset utf8mb4 FIELDS TERMINATED BY "," ENCLOSED BY '"';; +Warnings: +Warning 1265 Data truncated for column 'a' at row 1 +Warning 1265 Data truncated for column 'b' at row 1 +Warning 1265 Data truncated for column 'c' at row 1 +Warning 1265 Data truncated for column 'd' at row 1 +LOAD DATA INFILE "DATADIR/mcol4931_2.txt" IGNORE INTO TABLE t2 charset latin2 FIELDS TERMINATED BY "," ENCLOSED BY '"';; +Warnings: +Warning 1265 Data truncated for column 'a' at row 1 +Warning 1265 Data truncated for column 'b' at row 1 +Warning 1265 Data truncated for column 'c' at row 1 +Warning 1265 Data truncated for column 'd' at row 1 +SELECT * FROM t1; +a b c d +König-abcd-Stra König-abcd-Stra Kö Kö +König-abcd-Stra König-abcd-Stra Kö Kö +SELECT CHAR_LENGTH(a), LENGTHB(a), CHAR_LENGTH(b), LENGTHB(b), CHAR_LENGTH(c), LENGTHB(c), CHAR_LENGTH(d), LENGTHB(d) FROM t1; +CHAR_LENGTH(a) LENGTHB(a) CHAR_LENGTH(b) LENGTHB(b) CHAR_LENGTH(c) LENGTHB(c) CHAR_LENGTH(d) LENGTHB(d) +15 16 15 16 2 3 2 3 +15 16 15 16 2 3 2 3 +SELECT * FROM t2; +a b c d +abcdefghijklmno abcdefghijklmno ab ab +abcdefghijklmno abcdefghijklmno ab ab +SELECT CHAR_LENGTH(a), LENGTHB(a), CHAR_LENGTH(b), LENGTHB(b), CHAR_LENGTH(c), LENGTHB(c), CHAR_LENGTH(d), LENGTHB(d) FROM t2; +CHAR_LENGTH(a) LENGTHB(a) CHAR_LENGTH(b) LENGTHB(b) CHAR_LENGTH(c) LENGTHB(c) CHAR_LENGTH(d) LENGTHB(d) +15 15 15 15 2 2 2 2 +15 15 15 15 2 2 2 2 +DROP DATABASE mcol_4931; diff --git a/mysql-test/columnstore/bugfixes/mcol-4931.test b/mysql-test/columnstore/bugfixes/mcol-4931.test new file mode 100644 index 000000000..eafcc3ceb --- /dev/null +++ b/mysql-test/columnstore/bugfixes/mcol-4931.test @@ -0,0 +1,57 @@ +# +# MCOL-4931 Make cpimport charset aware +# + +if (!$MYSQL_TEST_ROOT){ + skip Should be run by root to execute cpimport; +} + +--source ../include/have_columnstore.inc + +let $DATADIR=`SELECT @@datadir`; + +--disable_warnings +DROP DATABASE IF EXISTS mcol_4931; +--enable_warnings + +CREATE DATABASE mcol_4931; +USE mcol_4931; + +CREATE TABLE t1 ( + a VARCHAR(15) collate 'utf8mb4_croatian_ci', + b CHAR(15) collate 'utf8mb4_croatian_ci', + c VARCHAR(2), + d CHAR(2) +)ENGINE=columnstore default charset=utf8mb4; + +CREATE TABLE t2 ( + a VARCHAR(15) collate 'latin2_hungarian_ci', + b CHAR(15) collate 'latin2_hungarian_ci', + c VARCHAR(2), + d CHAR(2) +)ENGINE=columnstore default charset=latin2; + +--exec rm -f $DATADIR/mcol4931_1.txt +--exec rm -f $DATADIR/mcol4931_2.txt +--exec echo "\"König-abcd-Straße\",\"König-abcd-Straße\",\"König-abcd-Straße\",\"König-abcd-Straße\"" > $DATADIR/mcol4931_1.txt +--exec echo "\"abcdefghijklmnopq\",\"abcdefghijklmnopq\",\"abcdefghijklmnopq\",\"abcdefghijklmnopq\"" > $DATADIR/mcol4931_2.txt +--replace_result $DATADIR DATADIR +--eval LOAD DATA INFILE "$DATADIR/mcol4931_1.txt" IGNORE INTO TABLE t1 charset utf8mb4 FIELDS TERMINATED BY "," ENCLOSED BY '"'; +--replace_result $DATADIR DATADIR +--eval LOAD DATA INFILE "$DATADIR/mcol4931_2.txt" IGNORE INTO TABLE t2 charset latin2 FIELDS TERMINATED BY "," ENCLOSED BY '"'; + +--disable_result_log +--exec $MCS_CPIMPORT -s',' -E'"' mcol_4931 t1 '$DATADIR/mcol4931_1.txt'; +--exec $MCS_CPIMPORT -s',' -E'"' mcol_4931 t2 '$DATADIR/mcol4931_2.txt'; +--enable_result_log + +SELECT * FROM t1; +SELECT CHAR_LENGTH(a), LENGTHB(a), CHAR_LENGTH(b), LENGTHB(b), CHAR_LENGTH(c), LENGTHB(c), CHAR_LENGTH(d), LENGTHB(d) FROM t1; + +SELECT * FROM t2; +SELECT CHAR_LENGTH(a), LENGTHB(a), CHAR_LENGTH(b), LENGTHB(b), CHAR_LENGTH(c), LENGTHB(c), CHAR_LENGTH(d), LENGTHB(d) FROM t2; + +# Clean UP +--exec rm -f $DATADIR/mcol4931_1.txt +--exec rm -f $DATADIR/mcol4931_2.txt +DROP DATABASE mcol_4931; diff --git a/utils/common/utils_utf8.h b/utils/common/utils_utf8.h index 352bbb4f7..7be105c5f 100644 --- a/utils/common/utils_utf8.h +++ b/utils/common/utils_utf8.h @@ -91,33 +91,6 @@ inline std::string wstring_to_utf8(const std::wstring& str) return ret; } -inline uint8_t utf8_truncate_point(const char* input, size_t length) -{ - // Find the beginning of a multibyte char to truncate at and return the - // number of bytes to truncate1` - if (length < 3) - { - return 0; - } - - const unsigned char* b = (const unsigned char*)(input) + length - 3; - - if (b[2] & 0x80) - { - // First byte in a new multi-byte sequence - if (b[2] & 0x40) - return 1; - // 3 byte sequence - else if ((b[1] & 0xe0) == 0xe0) - return 2; - // 4 byte sequence - else if ((b[0] & 0xf0) == 0xf0) - return 3; - } - - return 0; -} - int mcs_strcoll(const char* str1, const char* str2, const uint32_t charsetNumber); int mcs_strcoll(const char* str1, const uint32_t l1, const char* str2, const uint32_t l2, const uint32_t charsetNumber); diff --git a/writeengine/bulk/cpimport.cpp b/writeengine/bulk/cpimport.cpp index 0e0b6dbd6..a1a0aebcb 100644 --- a/writeengine/bulk/cpimport.cpp +++ b/writeengine/bulk/cpimport.cpp @@ -48,6 +48,7 @@ #include "MonitorProcMem.h" #include "dataconvert.h" #include "mcsconfig.h" +#include "mariadb_my_sys.h" using namespace std; using namespace WriteEngine; @@ -1002,6 +1003,9 @@ int main(int argc, char** argv) { setupSignalHandlers(); + // Initialize the charset library + MY_INIT(argv[0]); + // Set locale language const char* pLoc = setlocale(LC_ALL, ""); if (pLoc) @@ -1316,6 +1320,9 @@ int main(int argc, char** argv) rc = ERR_UNKNOWN; } + // Free up resources allocated by MY_INIT() above. + my_end(0); + //-------------------------------------------------------------------------- // Log end of job to INFO log //-------------------------------------------------------------------------- diff --git a/writeengine/bulk/we_bulkloadbuffer.cpp b/writeengine/bulk/we_bulkloadbuffer.cpp index 2b32deab3..8b74b8c15 100644 --- a/writeengine/bulk/we_bulkloadbuffer.cpp +++ b/writeengine/bulk/we_bulkloadbuffer.cpp @@ -43,8 +43,6 @@ #include "joblisttypes.h" -#include "utils_utf8.h" // utf8_truncate_point() - using namespace std; using namespace boost; using namespace execplan; @@ -515,14 +513,32 @@ void BulkLoadBuffer::convert(char* field, int fieldLength, bool nullFlag, unsign // from storing characters beyond the column's defined width. // It contains the column definition width rather than the bytes // on disk (e.g. 5 for a varchar(5) instead of 8). - if (fieldLength > column.definedWidth) + if (column.cs->mbmaxlen > 1) { - uint8_t truncate_point = utf8::utf8_truncate_point(field, column.definedWidth); - memcpy(charTmpBuf, field, column.definedWidth - truncate_point); - bufStats.satCount++; + const CHARSET_INFO* cs = column.cs; + const char* start = (const char*) field; + const char* end = (const char*)(field + fieldLength); + size_t numChars = cs->numchars(start, end); + size_t maxCharLength = column.definedWidth / cs->mbmaxlen; + + if (numChars > maxCharLength) + { + MY_STRCOPY_STATUS status; + cs->well_formed_char_length(start, end, maxCharLength, &status); + fieldLength = status.m_source_end_pos - start; + bufStats.satCount++; + } } - else - memcpy(charTmpBuf, field, fieldLength); + else // cs->mbmaxlen == 1 + { + if (fieldLength > column.definedWidth) + { + fieldLength = column.definedWidth; + bufStats.satCount++; + } + } + + memcpy(charTmpBuf, field, fieldLength); } // Swap byte order before comparing character string diff --git a/writeengine/bulk/we_columninfo.cpp b/writeengine/bulk/we_columninfo.cpp index 3f89c28a0..fb763d1ea 100644 --- a/writeengine/bulk/we_columninfo.cpp +++ b/writeengine/bulk/we_columninfo.cpp @@ -1697,7 +1697,7 @@ int ColumnInfo::updateDctnryStore(char* buf, ColPosPair** pos, const int totalRo Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT); #endif - int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount); + int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount, column.cs); if (rc != NO_ERROR) { diff --git a/writeengine/dictionary/we_dctnry.cpp b/writeengine/dictionary/we_dctnry.cpp index 34605e421..ec3daf3f8 100644 --- a/writeengine/dictionary/we_dctnry.cpp +++ b/writeengine/dictionary/we_dctnry.cpp @@ -48,7 +48,6 @@ using namespace BRM; #include "IDBPolicy.h" #include "cacheutils.h" using namespace idbdatafile; -#include "utils_utf8.h" // utf8_truncate_point() #include "checks.h" namespace @@ -764,7 +763,7 @@ int Dctnry::insertDctnry2(Signature& sig) * failure - it did not write the header to block ******************************************************************************/ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, char* tokenBuf, - long long& truncCount) + long long& truncCount, const CHARSET_INFO* cs) { #ifdef PROFILE Stats::startParseEvent(WE_STATS_PARSE_DCT); @@ -837,12 +836,28 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, curSig.signature = (unsigned char*)pIn; } - // @Bug 2565: Truncate any strings longer than schema's column width - if (curSig.size > m_colWidth) + if (cs->mbmaxlen > 1) { - uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth); - curSig.size = m_colWidth - truncate_point; - ++truncCount; + const char* start = (const char*) curSig.signature; + const char* end = (const char*)(curSig.signature + curSig.size); + size_t numChars = cs->numchars(start, end); + size_t maxCharLength = m_colWidth / cs->mbmaxlen; + + if (numChars > maxCharLength) + { + MY_STRCOPY_STATUS status; + cs->well_formed_char_length(start, end, maxCharLength, &status); + curSig.size = status.m_source_end_pos - start; + truncCount++; + } + } + else // cs->mbmaxlen == 1 + { + if (curSig.size > m_colWidth) + { + curSig.size = m_colWidth; + truncCount++; + } } //...Search for the string in our string cache diff --git a/writeengine/dictionary/we_dctnry.h b/writeengine/dictionary/we_dctnry.h index d20791316..52aa14d37 100644 --- a/writeengine/dictionary/we_dctnry.h +++ b/writeengine/dictionary/we_dctnry.h @@ -168,7 +168,7 @@ class Dctnry : public DbFileOp * @param tokenBuf - (output) list of tokens for the parsed strings */ EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, - char* tokenBuf, long long& truncCount); + char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs); /** * @brief Update dictionary store with tokenized strings (for DDL/DML use) diff --git a/writeengine/dictionary/we_dctnrystore.h b/writeengine/dictionary/we_dctnrystore.h index 693be2177..c8db79e3b 100644 --- a/writeengine/dictionary/we_dctnrystore.h +++ b/writeengine/dictionary/we_dctnrystore.h @@ -135,22 +135,6 @@ class DctnryStore : public DbFileOp */ EXPORT const int updateDctnryStore(unsigned char* sigValue, int& sigSize, Token& token); - /** - * @brief Update dictionary store with tokenized strings (for Bulk use) - * - * @param buf - bulk buffer containing strings to be parsed - * @param pos - list of offsets into buf - * @param totalRow - total number of rows in buf - * @param col - the column to be parsed from buf - * @param colWidth - width of the dictionary column being parsed - * @param tokenBuf - (output) list of tokens for the parsed strings - */ - const int updateDctnryStore(const char* buf, ColPosPair** pos, const int totalRow, const int col, - const int colWidth, char* tokenBuf) - { - return (m_dctnry.insertDctnry(buf, pos, totalRow, col, colWidth, tokenBuf)); - } - /** * @brief TransId related function * diff --git a/writeengine/shared/we_type.h b/writeengine/shared/we_type.h index 0997e746a..4b74c0efd 100644 --- a/writeengine/shared/we_type.h +++ b/writeengine/shared/we_type.h @@ -40,6 +40,7 @@ #include "IDBDataFile.h" #include "IDBPolicy.h" #include "nullstring.h" +#include "collation.h" // For CHARSET_INFO struct #undef EXPORT #undef DELETE @@ -410,6 +411,7 @@ struct JobColumn /** @brief Job Column Structure */ double fDefaultDbl; /** @brief Dbl/Flt column default */ int128_t fDefaultWideDecimal; /** @brief Wide decimal column default */ utils::NullString fDefaultChr; /** @brief Char column default */ + const CHARSET_INFO* cs; /** @brief character set info for the column */ JobColumn() : mapOid(0) , dataType(execplan::CalpontSystemCatalog::INT) @@ -435,6 +437,7 @@ struct JobColumn /** @brief Job Column Structure */ , fDefaultUInt(0) , fDefaultDbl(0.0) , fDefaultWideDecimal(0) + , cs(nullptr) { } JobColumn(const std::string& colName_, OID mapOid_, const std::string& typeName_, @@ -466,6 +469,7 @@ struct JobColumn /** @brief Job Column Structure */ , fDefaultUInt(defaultUInt_) , fDefaultDbl(0.0) , fDefaultWideDecimal(0) + , cs(nullptr) { dctnry.fCompressionType = dctnryCompressionType_; } diff --git a/writeengine/xml/we_xmljob.cpp b/writeengine/xml/we_xmljob.cpp index 25471e609..96cd61d01 100644 --- a/writeengine/xml/we_xmljob.cpp +++ b/writeengine/xml/we_xmljob.cpp @@ -871,6 +871,15 @@ void XMLJob::fillInXMLDataAsLoaded(execplan::CalpontSystemCatalog::RIDList& colR col.compressionType = colType.compressionType; col.dctnry.fCompressionType = colType.compressionType; + if (colType.charsetNumber != 0) + { + col.cs = &datatypes::Charset(colType.charsetNumber).getCharset(); + } + else + { + col.cs = &my_charset_latin1; + } + if (colType.autoincrement) col.autoIncFlag = true; else