You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-29 08:21:15 +03:00
MCOL-3536 Collation
This commit is contained in:
@ -20,6 +20,9 @@
|
||||
*
|
||||
*
|
||||
****************************************************************************/
|
||||
#include <mariadb.h>
|
||||
#undef set_bits // mariadb.h defines set_bits, which is incompatible with boost
|
||||
#include <my_sys.h>
|
||||
|
||||
#include <string>
|
||||
using namespace std;
|
||||
@ -47,106 +50,106 @@ CalpontSystemCatalog::ColType Func_trim::operationType(FunctionParm& fp, Calpont
|
||||
std::string Func_trim::getStrVal(rowgroup::Row& row,
|
||||
FunctionParm& fp,
|
||||
bool& isNull,
|
||||
execplan::CalpontSystemCatalog::ColType&)
|
||||
execplan::CalpontSystemCatalog::ColType& type)
|
||||
{
|
||||
// The number of characters (not bytes) in our input tstr.
|
||||
// Not all of these are necessarily significant. We need to search for the
|
||||
// NULL terminator to be sure.
|
||||
size_t strwclen;
|
||||
// this holds the number of characters (not bytes) in ourtrim tstr.
|
||||
size_t trimwclen;
|
||||
|
||||
CHARSET_INFO* cs = type.getCharset();
|
||||
// The original string
|
||||
const string& tstr = fp[0]->data()->getStrVal(row, isNull);
|
||||
const string& src = fp[0]->data()->getStrVal(row, isNull);
|
||||
if (isNull)
|
||||
return "";
|
||||
if (src.empty() || src.length() == 0)
|
||||
return src;
|
||||
// binLen represents the number of bytes in src
|
||||
size_t binLen = src.length();
|
||||
const char* pos = src.c_str();
|
||||
const char* end = pos + binLen;
|
||||
// strLen = the number of characters in src
|
||||
size_t strLen = cs->numchars(pos, end);
|
||||
|
||||
// The trim characters.
|
||||
const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " ");
|
||||
// binTLen represents the number of bytes in trim
|
||||
size_t binTLen = trim.length();
|
||||
const char* posT = trim.c_str();
|
||||
// strTLen = the number of characters in trim
|
||||
size_t strTLen = cs->numchars(posT, posT+binTLen);
|
||||
if (strTLen == 0 || strTLen > strLen)
|
||||
return src;
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
|
||||
if (tstr.empty() || tstr.length() == 0)
|
||||
return tstr;
|
||||
|
||||
// Rather than calling the wideconvert functions with a null buffer to
|
||||
// determine the size of buffer to allocate, we can be sure the wide
|
||||
// char string won't be longer than:
|
||||
strwclen = tstr.length(); // a guess to start with. This will be >= to the real count.
|
||||
int bufsize = strwclen + 1;
|
||||
|
||||
// Convert the string to wide characters. Do all further work in wide characters
|
||||
wchar_t* wcbuf = new wchar_t[bufsize];
|
||||
strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1);
|
||||
|
||||
// Bad char in mbc can return -1
|
||||
if (strwclen == static_cast<size_t>(-1))
|
||||
strwclen = 0;
|
||||
|
||||
// Convert the trim string to wide
|
||||
trimwclen = trim.length(); // A guess to start.
|
||||
int trimbufsize = trimwclen + 1;
|
||||
wchar_t* wctrim = new wchar_t[trimbufsize];
|
||||
size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1);
|
||||
|
||||
// Bad char in mbc can return -1
|
||||
if (trimlen == static_cast<size_t>(-1))
|
||||
trimlen = 0;
|
||||
|
||||
size_t trimCmpLen = trimlen * sizeof(wchar_t);
|
||||
|
||||
const wchar_t* oPtr = wcbuf; // To remember the start of the string
|
||||
const wchar_t* aPtr = oPtr;
|
||||
const wchar_t* aEnd = wcbuf + strwclen - 1;
|
||||
size_t trimCnt = 0;
|
||||
|
||||
if (trimlen > 0)
|
||||
if (binTLen == 1)
|
||||
{
|
||||
if (trimlen == 1)
|
||||
// If the trim string is 1 byte, don't waste cpu for memcmp
|
||||
// Trim leading
|
||||
while (pos < end && *pos == *posT)
|
||||
{
|
||||
// If trim is a single char, then don't spend the overhead for memcmp.
|
||||
wchar_t chr = wctrim[0];
|
||||
|
||||
// remove leading
|
||||
while (aPtr != aEnd && *aPtr == chr)
|
||||
{
|
||||
aPtr++;
|
||||
++trimCnt;
|
||||
}
|
||||
|
||||
// remove trailing
|
||||
while (aEnd != aPtr && *aEnd == chr)
|
||||
{
|
||||
aEnd--;
|
||||
++trimCnt;
|
||||
}
|
||||
++pos;
|
||||
--binLen;
|
||||
}
|
||||
else
|
||||
// Trim trailing
|
||||
while (end > pos && *end == *posT)
|
||||
{
|
||||
aEnd -= (trimlen - 1); // So we don't compare past the end of the string.
|
||||
|
||||
// remove leading
|
||||
while (aPtr <= aEnd && !memcmp(aPtr, wctrim, trimCmpLen))
|
||||
--end;
|
||||
--binLen;
|
||||
}
|
||||
}
|
||||
else if (!cs->use_mb())
|
||||
{
|
||||
// This is a one byte per char charset with multiple char trim.
|
||||
// Trim leading
|
||||
while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0)
|
||||
{
|
||||
pos += binTLen;
|
||||
binLen -= binTLen;
|
||||
}
|
||||
// Trim trailing
|
||||
while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0)
|
||||
{
|
||||
end -= binTLen;
|
||||
binLen -= binTLen;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// We're using a multi-byte charset
|
||||
// Trim leading is easy
|
||||
while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0)
|
||||
{
|
||||
pos += binTLen;
|
||||
binLen -= binTLen;
|
||||
}
|
||||
|
||||
// Trim trailing
|
||||
// The problem is that the byte pattern at the end could
|
||||
// match memcmp, but not be correct since the first byte compared
|
||||
// may actually be a second or later byte from a previous char.
|
||||
|
||||
// We start at the beginning of the string and move forward
|
||||
// one character at a time until we reach the end. Then we can
|
||||
// safely compare.
|
||||
while (end - binTLen >= pos)
|
||||
{
|
||||
const char* p = pos;
|
||||
uint32 l;
|
||||
while (p + binTLen < end)
|
||||
{
|
||||
aPtr += trimlen;
|
||||
trimCnt += trimlen;
|
||||
if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte
|
||||
p += l;
|
||||
else
|
||||
++p;
|
||||
}
|
||||
|
||||
// remove trailing
|
||||
while (aPtr <= aEnd && !memcmp(aEnd, wctrim, trimCmpLen))
|
||||
if (p + binTLen == end && memcmp(p,posT,binTLen) == 0)
|
||||
{
|
||||
aEnd -= trimlen; //BUG 5241
|
||||
trimCnt += trimlen;
|
||||
end -= binTLen;
|
||||
binLen -= binTLen;
|
||||
}
|
||||
else
|
||||
{
|
||||
break; // We've run out of places to look
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bug 5110 - error in allocating enough memory for utf8 chars
|
||||
size_t aLen = strwclen - trimCnt;
|
||||
wstring trimmed = wstring(aPtr, aLen);
|
||||
// Turn back to a string
|
||||
std::string ret(utf8::wstring_to_utf8(trimmed.c_str()));
|
||||
delete [] wctrim;
|
||||
delete [] wcbuf;
|
||||
std::string ret(pos, binLen);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user