
MCOL-4931 Make cpimport charset-aware. (#2938)

1. Extend the following CalpontSystemCatalog member functions to
   set CalpontSystemCatalog::ColType::charsetNumber, following the
   MCOL-5005 system catalog update that added the charset number to
   calpontsys.syscolumn:
     CalpontSystemCatalog::lookupOID
     CalpontSystemCatalog::colType
     CalpontSystemCatalog::columnRIDs
     CalpontSystemCatalog::getSchemaInfo

2. Update cpimport to use the CHARSET_INFO object associated with the
   charset number retrieved from the system catalog for dictionary and
   non-dictionary CHAR/VARCHAR/TEXT columns, and truncate long strings
   that exceed the target column's character length (see the truncation
   sketch below).

3. Add MTR test cases.
Author: Gagan Goel
Date: 2023-09-05 10:17:20 -04:00
Committed by: GitHub
Parent: 5b4f06bf0d
Commit: 931f2b36a1
12 changed files with 211 additions and 72 deletions
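
For reference, a minimal sketch of the truncation rule that both the bulk
buffer path and the dictionary path now share, assuming a CHARSET_INFO*
resolved from the catalog charset number and a column width given in bytes
(definedWidth / m_colWidth in the patch); the helper name
truncateToColumnWidth and its parameters are illustrative, not part of the
patch:

    #include <cstddef>
    #include "collation.h"  // CHARSET_INFO, MY_STRCOPY_STATUS (as used by the patch)

    // Returns the byte length to keep from `field` so that it fits within
    // `definedWidth` bytes without ever splitting a multi-byte character.
    // `satCount` is incremented only when truncation actually happens.
    size_t truncateToColumnWidth(const CHARSET_INFO* cs, const char* field,
                                 size_t fieldLength, size_t definedWidth,
                                 long long& satCount)
    {
      if (cs->mbmaxlen > 1)
      {
        const char* start = field;
        const char* end = field + fieldLength;
        size_t numChars = cs->numchars(start, end);          // characters in the value
        size_t maxCharLength = definedWidth / cs->mbmaxlen;  // characters that fit

        if (numChars > maxCharLength)
        {
          MY_STRCOPY_STATUS status;
          cs->well_formed_char_length(start, end, maxCharLength, &status);
          satCount++;
          return status.m_source_end_pos - start;  // bytes of the kept prefix
        }
      }
      else if (fieldLength > definedWidth)  // single-byte charset: plain byte cap
      {
        satCount++;
        return definedWidth;
      }

      return fieldLength;
    }

well_formed_char_length() walks at most maxCharLength characters and records
where it stopped in status.m_source_end_pos, so the kept prefix always ends on
a character boundary; this is what replaces the UTF-8-only
utf8::utf8_truncate_point() heuristic removed by the patch.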


@@ -48,6 +48,7 @@
#include "MonitorProcMem.h"
#include "dataconvert.h"
#include "mcsconfig.h"
#include "mariadb_my_sys.h"
using namespace std;
using namespace WriteEngine;
@@ -1002,6 +1003,9 @@ int main(int argc, char** argv)
{
setupSignalHandlers();
// Initialize the charset library
MY_INIT(argv[0]);
// Set locale language
const char* pLoc = setlocale(LC_ALL, "");
if (pLoc)
@@ -1316,6 +1320,9 @@ int main(int argc, char** argv)
rc = ERR_UNKNOWN;
}
// Free up resources allocated by MY_INIT() above.
my_end(0);
//--------------------------------------------------------------------------
// Log end of job to INFO log
//--------------------------------------------------------------------------
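
The MY_INIT()/my_end() pair added above is what makes the charset library
usable from a standalone binary like cpimport; a minimal sketch of that
lifecycle, assuming only the mariadb_my_sys.h header the patch already
includes (the work in between is illustrative):

    #include "mariadb_my_sys.h"  // MY_INIT(), my_end()

    int main(int argc, char** argv)
    {
      // Initialize mysys and the charset library before any CHARSET_INFO lookups.
      MY_INIT(argv[0]);

      // ... charset-aware work (e.g. parsing and truncating input rows) ...

      // Release everything MY_INIT() allocated.
      my_end(0);
      return 0;
    }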


@@ -43,8 +43,6 @@
#include "joblisttypes.h"
#include "utils_utf8.h" // utf8_truncate_point()
using namespace std;
using namespace boost;
using namespace execplan;
@@ -515,14 +513,32 @@ void BulkLoadBuffer::convert(char* field, int fieldLength, bool nullFlag, unsign
// from storing characters beyond the column's defined width.
// It contains the column definition width rather than the bytes
// on disk (e.g. 5 for a varchar(5) instead of 8).
if (fieldLength > column.definedWidth)
if (column.cs->mbmaxlen > 1)
{
uint8_t truncate_point = utf8::utf8_truncate_point(field, column.definedWidth);
memcpy(charTmpBuf, field, column.definedWidth - truncate_point);
bufStats.satCount++;
const CHARSET_INFO* cs = column.cs;
const char* start = (const char*) field;
const char* end = (const char*)(field + fieldLength);
size_t numChars = cs->numchars(start, end);
size_t maxCharLength = column.definedWidth / cs->mbmaxlen;
if (numChars > maxCharLength)
{
MY_STRCOPY_STATUS status;
cs->well_formed_char_length(start, end, maxCharLength, &status);
fieldLength = status.m_source_end_pos - start;
bufStats.satCount++;
}
}
else
memcpy(charTmpBuf, field, fieldLength);
else // cs->mbmaxlen == 1
{
if (fieldLength > column.definedWidth)
{
fieldLength = column.definedWidth;
bufStats.satCount++;
}
}
memcpy(charTmpBuf, field, fieldLength);
}
// Swap byte order before comparing character string


@@ -1697,7 +1697,7 @@ int ColumnInfo::updateDctnryStore(char* buf, ColPosPair** pos, const int totalRo
Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif
int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount);
int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount, column.cs);
if (rc != NO_ERROR)
{


@@ -48,7 +48,6 @@ using namespace BRM;
#include "IDBPolicy.h"
#include "cacheutils.h"
using namespace idbdatafile;
#include "utils_utf8.h" // utf8_truncate_point()
#include "checks.h"
namespace
@@ -764,7 +763,7 @@ int Dctnry::insertDctnry2(Signature& sig)
* failure - it did not write the header to block
******************************************************************************/
int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, char* tokenBuf,
long long& truncCount)
long long& truncCount, const CHARSET_INFO* cs)
{
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT);
@@ -837,12 +836,28 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow,
curSig.signature = (unsigned char*)pIn;
}
// @Bug 2565: Truncate any strings longer than schema's column width
if (curSig.size > m_colWidth)
if (cs->mbmaxlen > 1)
{
uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
curSig.size = m_colWidth - truncate_point;
++truncCount;
const char* start = (const char*) curSig.signature;
const char* end = (const char*)(curSig.signature + curSig.size);
size_t numChars = cs->numchars(start, end);
size_t maxCharLength = m_colWidth / cs->mbmaxlen;
if (numChars > maxCharLength)
{
MY_STRCOPY_STATUS status;
cs->well_formed_char_length(start, end, maxCharLength, &status);
curSig.size = status.m_source_end_pos - start;
truncCount++;
}
}
else // cs->mbmaxlen == 1
{
if (curSig.size > m_colWidth)
{
curSig.size = m_colWidth;
truncCount++;
}
}
//...Search for the string in our string cache


@@ -168,7 +168,7 @@ class Dctnry : public DbFileOp
* @param tokenBuf - (output) list of tokens for the parsed strings
*/
EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col,
char* tokenBuf, long long& truncCount);
char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs);
/**
* @brief Update dictionary store with tokenized strings (for DDL/DML use)


@@ -135,22 +135,6 @@ class DctnryStore : public DbFileOp
*/
EXPORT const int updateDctnryStore(unsigned char* sigValue, int& sigSize, Token& token);
/**
* @brief Update dictionary store with tokenized strings (for Bulk use)
*
* @param buf - bulk buffer containing strings to be parsed
* @param pos - list of offsets into buf
* @param totalRow - total number of rows in buf
* @param col - the column to be parsed from buf
* @param colWidth - width of the dictionary column being parsed
* @param tokenBuf - (output) list of tokens for the parsed strings
*/
const int updateDctnryStore(const char* buf, ColPosPair** pos, const int totalRow, const int col,
const int colWidth, char* tokenBuf)
{
return (m_dctnry.insertDctnry(buf, pos, totalRow, col, colWidth, tokenBuf));
}
/**
* @brief TransId related function
*


@@ -40,6 +40,7 @@
#include "IDBDataFile.h"
#include "IDBPolicy.h"
#include "nullstring.h"
#include "collation.h" // For CHARSET_INFO struct
#undef EXPORT
#undef DELETE
@@ -410,6 +411,7 @@ struct JobColumn /** @brief Job Column Structure */
double fDefaultDbl; /** @brief Dbl/Flt column default */
int128_t fDefaultWideDecimal; /** @brief Wide decimal column default */
utils::NullString fDefaultChr; /** @brief Char column default */
const CHARSET_INFO* cs; /** @brief character set info for the column */
JobColumn()
: mapOid(0)
, dataType(execplan::CalpontSystemCatalog::INT)
@@ -435,6 +437,7 @@ struct JobColumn /** @brief Job Column Structure */
, fDefaultUInt(0)
, fDefaultDbl(0.0)
, fDefaultWideDecimal(0)
, cs(nullptr)
{
}
JobColumn(const std::string& colName_, OID mapOid_, const std::string& typeName_,
@@ -466,6 +469,7 @@ struct JobColumn /** @brief Job Column Structure */
, fDefaultUInt(defaultUInt_)
, fDefaultDbl(0.0)
, fDefaultWideDecimal(0)
, cs(nullptr)
{
dctnry.fCompressionType = dctnryCompressionType_;
}


@@ -871,6 +871,15 @@ void XMLJob::fillInXMLDataAsLoaded(execplan::CalpontSystemCatalog::RIDList& colR
col.compressionType = colType.compressionType;
col.dctnry.fCompressionType = colType.compressionType;
if (colType.charsetNumber != 0)
{
col.cs = &datatypes::Charset(colType.charsetNumber).getCharset();
}
else
{
col.cs = &my_charset_latin1;
}
if (colType.autoincrement)
col.autoIncFlag = true;
else
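
A sketch of the charset-resolution step from the last hunk on its own,
assuming ColumnStore's datatypes::Charset wrapper exactly as the hunk uses it
(the helper name resolveColumnCharset is illustrative):

    #include <cstdint>
    #include "collation.h"  // datatypes::Charset, CHARSET_INFO, my_charset_latin1

    // Map a catalog charset number to a CHARSET_INFO, falling back to latin1
    // when the catalog stores no charset number (0).
    const CHARSET_INFO* resolveColumnCharset(uint32_t charsetNumber)
    {
      if (charsetNumber != 0)
        return &datatypes::Charset(charsetNumber).getCharset();

      return &my_charset_latin1;
    }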