You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-29 08:21:15 +03:00
MCOL-4931 Make cpimport charset-aware. (#2938)
1. Extend the following CalpontSystemCatalog member functions to set CalpontSystemCatalog::ColType::charsetNumber, following the system catalog update in MCOL-5005 that added the charset number to calpontsys.syscolumn:
   - CalpontSystemCatalog::lookupOID
   - CalpontSystemCatalog::colType
   - CalpontSystemCatalog::columnRIDs
   - CalpontSystemCatalog::getSchemaInfo
2. Update cpimport to use the CHARSET_INFO object associated with the charset number retrieved from the system catalog, for dictionary and non-dictionary CHAR/VARCHAR/TEXT columns, to truncate strings that exceed the target column's character length.
3. Add MTR test cases.
This commit is contained in:
@ -48,6 +48,7 @@
|
||||
#include "MonitorProcMem.h"
|
||||
#include "dataconvert.h"
|
||||
#include "mcsconfig.h"
|
||||
#include "mariadb_my_sys.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace WriteEngine;
|
||||
@ -1002,6 +1003,9 @@ int main(int argc, char** argv)
|
||||
{
|
||||
setupSignalHandlers();
|
||||
|
||||
// Initialize the charset library
|
||||
MY_INIT(argv[0]);
|
||||
|
||||
// Set locale language
|
||||
const char* pLoc = setlocale(LC_ALL, "");
|
||||
if (pLoc)
|
||||
@ -1316,6 +1320,9 @@ int main(int argc, char** argv)
|
||||
rc = ERR_UNKNOWN;
|
||||
}
|
||||
|
||||
// Free up resources allocated by MY_INIT() above.
|
||||
my_end(0);
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
// Log end of job to INFO log
|
||||
//--------------------------------------------------------------------------
|
||||
|
@ -43,8 +43,6 @@
|
||||
|
||||
#include "joblisttypes.h"
|
||||
|
||||
#include "utils_utf8.h" // utf8_truncate_point()
|
||||
|
||||
using namespace std;
|
||||
using namespace boost;
|
||||
using namespace execplan;
|
||||
@ -515,14 +513,32 @@ void BulkLoadBuffer::convert(char* field, int fieldLength, bool nullFlag, unsign
|
||||
// from storing characters beyond the column's defined width.
|
||||
// It contains the column definition width rather than the bytes
|
||||
// on disk (e.g. 5 for a varchar(5) instead of 8).
|
||||
if (fieldLength > column.definedWidth)
|
||||
if (column.cs->mbmaxlen > 1)
|
||||
{
|
||||
uint8_t truncate_point = utf8::utf8_truncate_point(field, column.definedWidth);
|
||||
memcpy(charTmpBuf, field, column.definedWidth - truncate_point);
|
||||
bufStats.satCount++;
|
||||
const CHARSET_INFO* cs = column.cs;
|
||||
const char* start = (const char*) field;
|
||||
const char* end = (const char*)(field + fieldLength);
|
||||
size_t numChars = cs->numchars(start, end);
|
||||
size_t maxCharLength = column.definedWidth / cs->mbmaxlen;
|
||||
|
||||
if (numChars > maxCharLength)
|
||||
{
|
||||
MY_STRCOPY_STATUS status;
|
||||
cs->well_formed_char_length(start, end, maxCharLength, &status);
|
||||
fieldLength = status.m_source_end_pos - start;
|
||||
bufStats.satCount++;
|
||||
}
|
||||
}
|
||||
else
|
||||
memcpy(charTmpBuf, field, fieldLength);
|
||||
else // cs->mbmaxlen == 1
|
||||
{
|
||||
if (fieldLength > column.definedWidth)
|
||||
{
|
||||
fieldLength = column.definedWidth;
|
||||
bufStats.satCount++;
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(charTmpBuf, field, fieldLength);
|
||||
}
|
||||
|
||||
// Swap byte order before comparing character string
|
||||
|
@ -1697,7 +1697,7 @@ int ColumnInfo::updateDctnryStore(char* buf, ColPosPair** pos, const int totalRo
|
||||
Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
|
||||
#endif
|
||||
|
||||
int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount);
|
||||
int rc = fStore->insertDctnry(buf, pos, totalRow, id, tokenBuf, truncCount, column.cs);
|
||||
|
||||
if (rc != NO_ERROR)
|
||||
{
|
||||
|
@ -48,7 +48,6 @@ using namespace BRM;
|
||||
#include "IDBPolicy.h"
|
||||
#include "cacheutils.h"
|
||||
using namespace idbdatafile;
|
||||
#include "utils_utf8.h" // utf8_truncate_point()
|
||||
#include "checks.h"
|
||||
|
||||
namespace
|
||||
@ -764,7 +763,7 @@ int Dctnry::insertDctnry2(Signature& sig)
|
||||
* failure - it did not write the header to block
|
||||
******************************************************************************/
|
||||
int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col, char* tokenBuf,
|
||||
long long& truncCount)
|
||||
long long& truncCount, const CHARSET_INFO* cs)
|
||||
{
|
||||
#ifdef PROFILE
|
||||
Stats::startParseEvent(WE_STATS_PARSE_DCT);
|
||||
@ -837,12 +836,28 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow,
|
||||
curSig.signature = (unsigned char*)pIn;
|
||||
}
|
||||
|
||||
// @Bug 2565: Truncate any strings longer than schema's column width
|
||||
if (curSig.size > m_colWidth)
|
||||
if (cs->mbmaxlen > 1)
|
||||
{
|
||||
uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
|
||||
curSig.size = m_colWidth - truncate_point;
|
||||
++truncCount;
|
||||
const char* start = (const char*) curSig.signature;
|
||||
const char* end = (const char*)(curSig.signature + curSig.size);
|
||||
size_t numChars = cs->numchars(start, end);
|
||||
size_t maxCharLength = m_colWidth / cs->mbmaxlen;
|
||||
|
||||
if (numChars > maxCharLength)
|
||||
{
|
||||
MY_STRCOPY_STATUS status;
|
||||
cs->well_formed_char_length(start, end, maxCharLength, &status);
|
||||
curSig.size = status.m_source_end_pos - start;
|
||||
truncCount++;
|
||||
}
|
||||
}
|
||||
else // cs->mbmaxlen == 1
|
||||
{
|
||||
if (curSig.size > m_colWidth)
|
||||
{
|
||||
curSig.size = m_colWidth;
|
||||
truncCount++;
|
||||
}
|
||||
}
|
||||
|
||||
//...Search for the string in our string cache
|
||||
|
@ -168,7 +168,7 @@ class Dctnry : public DbFileOp
|
||||
* @param tokenBuf - (output) list of tokens for the parsed strings
|
||||
*/
|
||||
EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col,
|
||||
char* tokenBuf, long long& truncCount);
|
||||
char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs);
|
||||
|
||||
/**
|
||||
* @brief Update dictionary store with tokenized strings (for DDL/DML use)
|
||||
|
@ -135,22 +135,6 @@ class DctnryStore : public DbFileOp
|
||||
*/
|
||||
EXPORT const int updateDctnryStore(unsigned char* sigValue, int& sigSize, Token& token);
|
||||
|
||||
/**
|
||||
* @brief Update dictionary store with tokenized strings (for Bulk use)
|
||||
*
|
||||
* @param buf - bulk buffer containing strings to be parsed
|
||||
* @param pos - list of offsets into buf
|
||||
* @param totalRow - total number of rows in buf
|
||||
* @param col - the column to be parsed from buf
|
||||
* @param colWidth - width of the dictionary column being parsed
|
||||
* @param tokenBuf - (output) list of tokens for the parsed strings
|
||||
*/
|
||||
const int updateDctnryStore(const char* buf, ColPosPair** pos, const int totalRow, const int col,
|
||||
const int colWidth, char* tokenBuf)
|
||||
{
|
||||
// Thin forwarding wrapper: passes all six arguments unchanged to
// m_dctnry.insertDctnry() and returns its result code as-is.
// NOTE(review): the insertDctnry() declarations visible in this diff take
// (buf, pos, totalRow, col, tokenBuf, truncCount[, cs]) and have no colWidth
// parameter, so this call does not appear to match them - confirm. The hunk
// header (-135,22 +135,6) suggests this wrapper is deleted by the commit.
return (m_dctnry.insertDctnry(buf, pos, totalRow, col, colWidth, tokenBuf));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief TransId related function
|
||||
*
|
||||
|
@ -40,6 +40,7 @@
|
||||
#include "IDBDataFile.h"
|
||||
#include "IDBPolicy.h"
|
||||
#include "nullstring.h"
|
||||
#include "collation.h" // For CHARSET_INFO struct
|
||||
|
||||
#undef EXPORT
|
||||
#undef DELETE
|
||||
@ -410,6 +411,7 @@ struct JobColumn /** @brief Job Column Structure */
|
||||
double fDefaultDbl; /** @brief Dbl/Flt column default */
|
||||
int128_t fDefaultWideDecimal; /** @brief Wide decimal column default */
|
||||
utils::NullString fDefaultChr; /** @brief Char column default */
|
||||
const CHARSET_INFO* cs; /** @brief character set info for the column */
|
||||
JobColumn()
|
||||
: mapOid(0)
|
||||
, dataType(execplan::CalpontSystemCatalog::INT)
|
||||
@ -435,6 +437,7 @@ struct JobColumn /** @brief Job Column Structure */
|
||||
, fDefaultUInt(0)
|
||||
, fDefaultDbl(0.0)
|
||||
, fDefaultWideDecimal(0)
|
||||
, cs(nullptr)
|
||||
{
|
||||
}
|
||||
JobColumn(const std::string& colName_, OID mapOid_, const std::string& typeName_,
|
||||
@ -466,6 +469,7 @@ struct JobColumn /** @brief Job Column Structure */
|
||||
, fDefaultUInt(defaultUInt_)
|
||||
, fDefaultDbl(0.0)
|
||||
, fDefaultWideDecimal(0)
|
||||
, cs(nullptr)
|
||||
{
|
||||
dctnry.fCompressionType = dctnryCompressionType_;
|
||||
}
|
||||
|
@ -871,6 +871,15 @@ void XMLJob::fillInXMLDataAsLoaded(execplan::CalpontSystemCatalog::RIDList& colR
|
||||
col.compressionType = colType.compressionType;
|
||||
col.dctnry.fCompressionType = colType.compressionType;
|
||||
|
||||
if (colType.charsetNumber != 0)
|
||||
{
|
||||
col.cs = &datatypes::Charset(colType.charsetNumber).getCharset();
|
||||
}
|
||||
else
|
||||
{
|
||||
col.cs = &my_charset_latin1;
|
||||
}
|
||||
|
||||
if (colType.autoincrement)
|
||||
col.autoIncFlag = true;
|
||||
else
|
||||
|
Reference in New Issue
Block a user