1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-01 06:46:55 +03:00

MCOL-3536 collation

This commit is contained in:
David Hall
2020-05-19 16:22:44 -05:00
parent 8479a87e46
commit 11ba12f6ea
10 changed files with 53 additions and 126 deletions

View File

@ -59,6 +59,7 @@ DictStepJL::DictStepJL(const pDictionaryStep& dict)
filterString = dict.fFilterString; filterString = dict.fFilterString;
filterCount = dict.fFilterCount; filterCount = dict.fFilterCount;
charsetNumber = dict.fColType.charsetNumber;
} }
DictStepJL::~DictStepJL() DictStepJL::~DictStepJL()
@ -88,7 +89,7 @@ void DictStepJL::createCommand(ByteStream& bs) const
} }
else else
bs << filterString; bs << filterString;
bs << charsetNumber;
CommandJL::createCommand(bs); CommandJL::createCommand(bs);
} }

View File

@ -76,6 +76,7 @@ private:
std::vector<std::string> eqFilter; std::vector<std::string> eqFilter;
bool hasEqFilter; bool hasEqFilter;
uint8_t eqOp; // COMPARE_EQ or COMPARE_NE uint8_t eqOp; // COMPARE_EQ or COMPARE_NE
uint32_t charsetNumber;
}; };
}; // namespace }; // namespace

View File

@ -103,7 +103,7 @@ Notes:
*/ */
void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h,
TokenByScanResultHeader* ret, unsigned outSize, bool utf8, TokenByScanResultHeader* ret, unsigned outSize,
boost::shared_ptr<DictEqualityFilter> eqFilter) boost::shared_ptr<DictEqualityFilter> eqFilter)
{ {
const DataValue* args; const DataValue* args;
@ -113,7 +113,6 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h,
int offsetIndex, argIndex, argsOffset; int offsetIndex, argIndex, argsOffset;
bool cmpResult = false; bool cmpResult = false;
int tmp, i, err; int tmp, i, err;
const char* sig; const char* sig;
uint16_t siglen; uint16_t siglen;
@ -191,6 +190,8 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h,
if (eqFilter) if (eqFilter)
{ {
// MCOL-1246 Trim whitespace before match // MCOL-1246 Trim whitespace before match
// TODO MCOL-3536 use CHARSET_INFO* cs for collation
// cs->hash_sort(const uchar *key, size_t len, ulong *nr1, ulong *nr2)
string strData(sig, siglen); string strData(sig, siglen);
boost::trim_right_if(strData, boost::is_any_of(" ")); boost::trim_right_if(strData, boost::is_any_of(" "));
bool gotIt = eqFilter->find(strData) != eqFilter->end(); bool gotIt = eqFilter->find(strData) != eqFilter->end();
@ -214,15 +215,8 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h,
cmpResult = !cmpResult; cmpResult = !cmpResult;
} }
else else
{
if (utf8)
{ {
tmp = cs->strnncoll(sig, siglen, args->data, args->len); tmp = cs->strnncoll(sig, siglen, args->data, args->len);
}
else
{
tmp = strncmp(sig, args->data, std::min(siglen, args->len));
}
cmpResult = compare(tmp, h->COP1, siglen, args->len); cmpResult = compare(tmp, h->COP1, siglen, args->len);
} }
@ -262,15 +256,8 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h,
} }
else else
{
if (utf8)
{ {
tmp = cs->strnncoll(sig, siglen, args->data, args->len); tmp = cs->strnncoll(sig, siglen, args->data, args->len);
}
else
{
tmp = strncmp(sig, args->data, std::min(siglen, args->len));
}
cmpResult = compare(tmp, h->COP2, siglen, args->len); cmpResult = compare(tmp, h->COP2, siglen, args->len);
} }
@ -297,15 +284,8 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h,
} }
else else
{
if (utf8)
{ {
tmp = cs->strnncoll(sig, siglen, args->data, args->len); tmp = cs->strnncoll(sig, siglen, args->data, args->len);
}
else
{
tmp = strncmp(sig, args->data, std::min(siglen, args->len));
}
cmpResult = compare(tmp, h->COP2, siglen, args->len); cmpResult = compare(tmp, h->COP2, siglen, args->len);
} }
@ -667,8 +647,12 @@ PrimitiveProcessor::makeLikeFilter (const DictFilterElement* filterString, uint3
return ret; return ret;
} }
void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector<uint8_t>* out, bool utf8, void PrimitiveProcessor::p_Dictionary(const DictInput* in,
bool skipNulls, boost::shared_ptr<DictEqualityFilter> eqFilter, uint8_t eqOp) vector<uint8_t>* out,
bool skipNulls,
uint32_t charsetNumber,
boost::shared_ptr<DictEqualityFilter> eqFilter,
uint8_t eqOp)
{ {
PrimToken* outToken; PrimToken* outToken;
const DictFilterElement* filter = 0; const DictFilterElement* filter = 0;
@ -679,6 +663,7 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector<uint8_t>* out,
uint16_t aggCount; uint16_t aggCount;
bool cmpResult; bool cmpResult;
DictOutput header; DictOutput header;
const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME));
// default size of the output to something sufficiently large to prevent // default size of the output to something sufficiently large to prevent
// excessive reallocation and copy when resizing // excessive reallocation and copy when resizing
@ -714,30 +699,13 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector<uint8_t>* out,
nextSig(in->NVALS, in->tokens, &sigptr, in->OutputType, nextSig(in->NVALS, in->tokens, &sigptr, in->OutputType,
(in->InputFlags ? true : false), skipNulls)) (in->InputFlags ? true : false), skipNulls))
{ {
string sig_utf8;
if (utf8)
{
string tmpString((char*)sigptr.data, sigptr.len);
sig_utf8 = tmpString;
}
// do aggregate processing // do aggregate processing
if (in->OutputType & OT_AGGREGATE) if (in->OutputType & OT_AGGREGATE)
{ {
// len == 0 indicates this is the first pass // len == 0 indicates this is the first pass
if (max.len != 0) if (max.len != 0)
{ {
if (utf8 ) tmp = cs->strnncoll(sigptr.data, sigptr.len, max.data, max.len);
{
string max_utf8((char*)max.data, max.len);
tmp = utf8::idb_strcoll(sig_utf8.c_str(), max_utf8.c_str());
}
else
{
tmp = strncmp((char*)sigptr.data, (char*)max.data, std::min(sigptr.len, max.len));
}
if (tmp > 0) if (tmp > 0)
max = sigptr; max = sigptr;
@ -747,15 +715,7 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector<uint8_t>* out,
if (min.len != 0) if (min.len != 0)
{ {
if (utf8) tmp = cs->strnncoll(sigptr.data, sigptr.len, min.data, min.len);
{
string min_utf8((char*)min.data, min.len);
tmp = utf8::idb_strcoll(sig_utf8.c_str(), min_utf8.c_str());
}
else
{
tmp = strncmp((char*)sigptr.data, (char*)min.data, std::min(sigptr.len, min.len));
}
if (tmp < 0) if (tmp < 0)
min = sigptr; min = sigptr;
@ -788,15 +748,6 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector<uint8_t>* out,
for (filterIndex = 0; filterIndex < in->NOPS; filterIndex++) for (filterIndex = 0; filterIndex < in->NOPS; filterIndex++)
{ {
filter = reinterpret_cast<const DictFilterElement*>(&in8[filterOffset]); filter = reinterpret_cast<const DictFilterElement*>(&in8[filterOffset]);
string filt_utf8;
size_t filt_utf8_len = 0;
if (utf8)
{
string tmpString((const char*)filter->data, filter->len);
filt_utf8 = tmpString;
filt_utf8_len = filt_utf8.length();
}
if (filter->COP & COMPARE_LIKE) if (filter->COP & COMPARE_LIKE)
{ {
@ -807,18 +758,7 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector<uint8_t>* out,
} }
else else
{ {
if (utf8) tmp = cs->strnncoll(sigptr.data, sigptr.len, filter->data, filter->len);
{
size_t sig_utf8_len = sig_utf8.length();
tmp = utf8::idb_strcoll(sig_utf8.c_str(), filt_utf8.c_str());
cmpResult = compare(tmp, filter->COP, sig_utf8_len, filt_utf8_len);
}
else
{
tmp = strncmp((const char*) sigptr.data, (const char*)filter->data,
std::min(sigptr.len, static_cast<int>(filter->len)));
}
cmpResult = compare(tmp, filter->COP, sigptr.len, filter->len); cmpResult = compare(tmp, filter->COP, sigptr.len, filter->len);
} }

View File

@ -185,7 +185,7 @@ public:
* @note Throws logic_error if the output buffer is too small for the result. * @note Throws logic_error if the output buffer is too small for the result.
*/ */
void p_TokenByScan(const TokenByScanRequestHeader* t, void p_TokenByScan(const TokenByScanRequestHeader* t,
TokenByScanResultHeader* out, unsigned outSize, bool utf8, TokenByScanResultHeader* out, unsigned outSize,
boost::shared_ptr<DictEqualityFilter> eqFilter); boost::shared_ptr<DictEqualityFilter> eqFilter);
/** @brief The p_IdxWalk primitive processor /** @brief The p_IdxWalk primitive processor
@ -264,8 +264,9 @@ public:
*/ */
// void p_ColAggregate(const NewColAggRequestHeader *in, NewColAggResultHeader *out); // void p_ColAggregate(const NewColAggRequestHeader *in, NewColAggResultHeader *out);
void p_Dictionary(const DictInput* in, std::vector<uint8_t>* out, bool utf8, void p_Dictionary(const DictInput* in, std::vector<uint8_t>* out,
bool skipNulls, boost::shared_ptr<DictEqualityFilter> eqFilter, bool skipNulls, uint32_t charsetNumber,
boost::shared_ptr<DictEqualityFilter> eqFilter,
uint8_t eqOp); uint8_t eqOp);
inline void setLogicalBlockMode(bool b) inline void setLogicalBlockMode(bool b)

View File

@ -99,6 +99,7 @@ void DictStep::createCommand(ByteStream& bs)
else else
bs >> filterString; bs >> filterString;
bs >> charsetNumber;
#if 0 #if 0
cout << "see " << filterCount << " filters\n"; cout << "see " << filterCount << " filters\n";
DictFilterElement* filters = (DictFilterElement*) filterString.buf(); DictFilterElement* filters = (DictFilterElement*) filterString.buf();
@ -173,8 +174,7 @@ void DictStep::issuePrimitive(bool isFilter)
} }
bpp->pp.setLikeFilter(likeFilter); bpp->pp.setLikeFilter(likeFilter);
// MCOL-3536 We shouldn't need to pass in utf8 -- maybe?? bpp->pp.p_Dictionary(primMsg, &result, isFilter, charsetNumber, eqFilter, eqOp);
bpp->pp.p_Dictionary(primMsg, &result, true, isFilter, eqFilter, eqOp);
} }
void DictStep::copyResultToTmpSpace(OrderedToken* ot) void DictStep::copyResultToTmpSpace(OrderedToken* ot)

View File

@ -141,6 +141,7 @@ private:
messageqcpp::ByteStream filterString; messageqcpp::ByteStream filterString;
uint32_t filterCount; uint32_t filterCount;
uint32_t bufferSize; uint32_t bufferSize;
uint32_t charsetNumber;
uint16_t inputRidCount; uint16_t inputRidCount;
bool hasEqFilter; bool hasEqFilter;

View File

@ -1154,7 +1154,6 @@ int DictScanJob::operator()()
PrimitiveProcessor pproc(gDebugLevel); PrimitiveProcessor pproc(gDebugLevel);
TokenByScanResultHeader* output; TokenByScanResultHeader* output;
QueryContext verInfo; QueryContext verInfo;
bool bUtf8;
try try
{ {
@ -1166,25 +1165,6 @@ int DictScanJob::operator()()
*fByteStream >> verInfo; *fByteStream >> verInfo;
cmd = (TokenByScanRequestHeader*) fByteStream->buf(); cmd = (TokenByScanRequestHeader*) fByteStream->buf();
// If charset is one of those that can be represented by standard ascii,
// we can get a performance improvement by using strcmp rather than
// the full charset compare system.
switch (cmd->charsetNumber)
{
case 8: // latin1_swedish_ci
case 9: // latin2_general_ci
case 11: // ascii_general_ci
case 47: // latin1_bin
case 48: // latin1_general_ci
case 49: // latin1_general_cs
case 65: // ascii_bin
case 77: // latin2_bin
bUtf8 = false;
break;
default:
bUtf8 = true;
}
session = cmd->Hdr.SessionID; session = cmd->Hdr.SessionID;
uniqueId = cmd->Hdr.UniqueID; uniqueId = cmd->Hdr.UniqueID;
runCount = cmd->Count; runCount = cmd->Count;
@ -1229,8 +1209,7 @@ int DictScanJob::operator()()
fLBIDTraceOn, fLBIDTraceOn,
session); session);
pproc.setBlockPtr((int*) data); pproc.setBlockPtr((int*) data);
// MCOL-3536 We shouldn't need to pass in utf8 -- maybe?? pproc.p_TokenByScan(cmd, output, output_buf_size, eqFilter);
pproc.p_TokenByScan(cmd, output, output_buf_size, bUtf8, eqFilter);
if (wasBlockInCache) if (wasBlockInCache)
output->CacheIO++; output->CacheIO++;

View File

@ -22,6 +22,10 @@
* *
****************************************************************************/ ****************************************************************************/
#include <mariadb.h>
#undef set_bits // mariadb.h defines set_bits, which is incompatible with boost
#include <my_sys.h>
#include <cstdlib> #include <cstdlib>
#include <string> #include <string>
using namespace std; using namespace std;
@ -53,16 +57,16 @@ inline bool numericLE(result_t op1, result_t op2)
return op1 <= op2; return op1 <= op2;
} }
inline bool strGE(const string& op1, const string& op2) inline bool strGE(uint32_t charsetNumber, const string& op1, const string& op2)
{ {
//return strcoll(op1.c_str(), op2.c_str()) >= 0; const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME));
return utf8::idb_strcoll(op1.c_str(), op2.c_str()) >= 0; return cs->strnncoll(op1.c_str(), op1.length(), op2.c_str(), op2.length()) >= 0;
} }
inline bool strLE(const string& op1, const string& op2) inline bool strLE(uint32_t charsetNumber, const string& op1, const string& op2)
{ {
//return strcoll(op1.c_str(), op2.c_str()) <= 0; const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME));
return utf8::idb_strcoll(op1.c_str(), op2.c_str()) <= 0; return cs->strnncoll(op1.c_str(), op1.length(), op2.c_str(), op2.length()) <= 0;
} }
inline bool getBool(rowgroup::Row& row, inline bool getBool(rowgroup::Row& row,
@ -256,16 +260,16 @@ inline bool getBool(rowgroup::Row& row,
if (notBetween) if (notBetween)
{ {
if (!strGE(val, pm[1]->data()->getStrVal(row, isNull)) && !isNull) if (!strGE(ct.charsetNumber, val, pm[1]->data()->getStrVal(row, isNull)) && !isNull)
return true; return true;
isNull = false; isNull = false;
return (!strLE(val, pm[2]->data()->getStrVal(row, isNull)) && !isNull); return (!strLE(ct.charsetNumber, val, pm[2]->data()->getStrVal(row, isNull)) && !isNull);
} }
return !isNull && return !isNull &&
strGE(val, pm[1]->data()->getStrVal(row, isNull)) && strGE(ct.charsetNumber, val, pm[1]->data()->getStrVal(row, isNull)) &&
strLE(val, pm[2]->data()->getStrVal(row, isNull)); strLE(ct.charsetNumber, val, pm[2]->data()->getStrVal(row, isNull));
} }
default: default:

View File

@ -21,6 +21,10 @@
* *
****************************************************************************/ ****************************************************************************/
#include <mariadb.h>
#undef set_bits // mariadb.h defines set_bits, which is incompatible with boost
#include <my_sys.h>
#include <cstdlib> #include <cstdlib>
#include <string> #include <string>
#include <sstream> #include <sstream>
@ -78,15 +82,12 @@ int64_t Func_char_length::getIntVal(rowgroup::Row& row,
case execplan::CalpontSystemCatalog::UDECIMAL: case execplan::CalpontSystemCatalog::UDECIMAL:
{ {
const string& tstr = parm[0]->data()->getStrVal(row, isNull); const string& tstr = parm[0]->data()->getStrVal(row, isNull);
if (isNull) if (isNull)
return 0; return 0;
const char* b = tstr.c_str();
size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; const char* e = tstr.c_str() + tstr.length();
wchar_t* wcbuf = new wchar_t[strwclen]; const CHARSET_INFO* cs = get_charset(parm[0]->data()->resultType().charsetNumber, MYF(MY_WME));
strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen); return (int64_t)cs->numchars(b, e);
delete [] wcbuf;
return (int64_t)strwclen;
} }
case execplan::CalpontSystemCatalog::DATE: case execplan::CalpontSystemCatalog::DATE:

View File

@ -54,7 +54,6 @@ string Func_concat_ws::getStrVal(Row& row,
if (isNull) if (isNull)
return ""; return "";
#ifdef STRCOLL_ENH__
wstring wstr; wstring wstr;
size_t strwclen = utf8::idb_mbstowcs(0, delim.c_str(), 0) + 1; size_t strwclen = utf8::idb_mbstowcs(0, delim.c_str(), 0) + 1;
wchar_t* wcbuf = new wchar_t[strwclen]; wchar_t* wcbuf = new wchar_t[strwclen];
@ -96,7 +95,7 @@ string Func_concat_ws::getStrVal(Row& row,
delete [] wcbuf; delete [] wcbuf;
return ret; return ret;
#else #if 0
string str; string str;
string tmp; string tmp;
for ( uint32_t i = 1 ; i < parm.size() ; i++) for ( uint32_t i = 1 ; i < parm.size() ; i++)