You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-29 08:21:15 +03:00
MCOL-3536 Collation
This commit is contained in:
@ -49,36 +49,128 @@ CalpontSystemCatalog::ColType Func_replace::operationType(FunctionParm& fp, Calp
|
||||
std::string Func_replace::getStrVal(rowgroup::Row& row,
|
||||
FunctionParm& fp,
|
||||
bool& isNull,
|
||||
execplan::CalpontSystemCatalog::ColType&)
|
||||
execplan::CalpontSystemCatalog::ColType& ct)
|
||||
{
|
||||
CHARSET_INFO* cs = ct.getCharset();
|
||||
|
||||
const string& str = fp[0]->data()->getStrVal(row, isNull);
|
||||
if (isNull)
|
||||
return "";
|
||||
size_t strLen = str.length();
|
||||
|
||||
const string& fromstr = fp[1]->data()->getStrVal(row, isNull);
|
||||
if (isNull)
|
||||
return "";
|
||||
if (fromstr.length() == 0)
|
||||
return str;
|
||||
size_t fromLen = fromstr.length();
|
||||
|
||||
const string& tostr = fp[2]->data()->getStrVal(row, isNull);
|
||||
if (isNull)
|
||||
return "";
|
||||
size_t toLen = tostr.length();
|
||||
|
||||
bool binaryCmp = (cs->state & MY_CS_BINSORT) || !cs->use_mb();
|
||||
string newstr;
|
||||
unsigned int i = 0;
|
||||
|
||||
for (;;)
|
||||
size_t pos = 0;
|
||||
if (binaryCmp)
|
||||
{
|
||||
size_t pos = str.find(fromstr, i);
|
||||
uint32_t i = 0;
|
||||
pos = str.find(fromstr);
|
||||
if (pos == string::npos)
|
||||
return str;
|
||||
|
||||
if ( pos != string::npos )
|
||||
// Count the number of occurrences of fromstr in str
|
||||
int count = 0;
|
||||
do
|
||||
{
|
||||
++count;
|
||||
pos = str.find(fromstr, pos + fromLen);
|
||||
}
|
||||
while (pos != string::npos);
|
||||
|
||||
newstr.reserve(strLen + (count * ((int)toLen - (int)fromLen)) + 1);
|
||||
|
||||
// Now move the stuff into newstr
|
||||
do
|
||||
{
|
||||
//match
|
||||
if (pos > i)
|
||||
newstr = newstr + str.substr(i, pos - i);
|
||||
|
||||
newstr = newstr + tostr;
|
||||
i = pos + fromstr.size();
|
||||
i = pos + fromLen;
|
||||
pos = str.find(fromstr, i);
|
||||
}
|
||||
while (pos != string::npos);
|
||||
|
||||
newstr = newstr + str.substr(i, string::npos);
|
||||
}
|
||||
else
|
||||
{
|
||||
newstr = newstr + str.substr(i, 1000);
|
||||
// UTF
|
||||
const char* src = str.c_str();
|
||||
const char* srcEnd = src + strLen;
|
||||
const char* srchEnd = srcEnd - fromLen + 1;
|
||||
const char* from = fromstr.c_str();
|
||||
const char* fromEnd = from + fromLen;
|
||||
const char* to = tostr.c_str();
|
||||
char* ptr = const_cast<char*>(src);
|
||||
char *i,*j;
|
||||
size_t count = 10; // Some arbitrary number to reserve some space to start.
|
||||
size_t growlen = count * ((int)toLen - (int)fromLen);
|
||||
newstr.reserve(strLen + (count * growlen) + 1);
|
||||
size_t maxsize = newstr.max_size();
|
||||
uint32_t l;
|
||||
|
||||
// We don't know where byte patterns might match so
|
||||
// we start at the beginning of the string and move forward
|
||||
// one character at a time until we find a match. Then we can
|
||||
// move the src bytes and add in the to bytes, then try again.
|
||||
while (ptr < srchEnd)
|
||||
{
|
||||
bool found = false;
|
||||
if (*ptr == *from) // If the first byte matches, maybe we have a match
|
||||
{
|
||||
// Do a byte by byte compare of src at that spot against from
|
||||
i = ptr + 1;
|
||||
j = const_cast<char*>(from) + 1;
|
||||
found = true;
|
||||
while (j != fromEnd)
|
||||
{
|
||||
if (*i++ != *j++)
|
||||
{
|
||||
found = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (found)
|
||||
{
|
||||
if (ptr < i)
|
||||
{
|
||||
int mvsize = i - ptr;
|
||||
if (newstr.length() + mvsize + toLen < maxsize)
|
||||
{
|
||||
// We need a re-alloc
|
||||
newstr.reserve(maxsize + growlen);
|
||||
growlen *= 2;
|
||||
}
|
||||
newstr.append(ptr, mvsize);
|
||||
ptr += mvsize;
|
||||
}
|
||||
newstr.append(to, toLen);
|
||||
ptr += toLen;
|
||||
}
|
||||
else
|
||||
{
|
||||
// move to the next character
|
||||
if ((l = my_ismbchar(cs, ptr, srcEnd))) // returns the number of bytes in the leading char or zero if one byte
|
||||
ptr += l;
|
||||
else
|
||||
++ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return newstr;
|
||||
}
|
||||
|
@ -37,8 +37,6 @@ using namespace joblist;
|
||||
|
||||
#include "collation.h"
|
||||
|
||||
#define STRCOLL_ENH__
|
||||
|
||||
namespace funcexp
|
||||
{
|
||||
|
||||
@ -52,101 +50,57 @@ CalpontSystemCatalog::ColType Func_substr::operationType(FunctionParm& fp, Calpo
|
||||
std::string Func_substr::getStrVal(rowgroup::Row& row,
|
||||
FunctionParm& fp,
|
||||
bool& isNull,
|
||||
execplan::CalpontSystemCatalog::ColType&)
|
||||
execplan::CalpontSystemCatalog::ColType& ct)
|
||||
{
|
||||
#ifdef STRCOLL_ENH__
|
||||
const string& tstr = fp[0]->data()->getStrVal(row, isNull);
|
||||
CHARSET_INFO* cs = ct.getCharset();
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
|
||||
size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1;
|
||||
wchar_t* wcbuf = new wchar_t[strwclen];
|
||||
strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen);
|
||||
wstring str(wcbuf, strwclen);
|
||||
|
||||
int64_t start = fp[1]->data()->getIntVal(row, isNull) - 1;
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
|
||||
if (start == -1) // pos == 0
|
||||
return "";
|
||||
|
||||
wstring::size_type n = wstring::npos;
|
||||
|
||||
if (fp.size() == 3)
|
||||
{
|
||||
int64_t len = fp[2]->data()->getIntVal(row, isNull);
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
|
||||
if (len < 1)
|
||||
return "";
|
||||
|
||||
n = len;
|
||||
}
|
||||
|
||||
int64_t strLen = static_cast<int64_t>(str.length());
|
||||
|
||||
if (start < -1) // negative pos, beginning from end
|
||||
start += strLen + 1;
|
||||
|
||||
if (start < 0 || strLen <= start)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
||||
wstring out = str.substr(start, n);
|
||||
size_t strmblen = utf8::idb_wcstombs(0, out.c_str(), 0) + 1;
|
||||
char* outbuf = new char[strmblen];
|
||||
strmblen = utf8::idb_wcstombs(outbuf, out.c_str(), strmblen);
|
||||
std::string ret(outbuf, strmblen);
|
||||
delete [] outbuf;
|
||||
delete [] wcbuf;
|
||||
return ret;
|
||||
#else
|
||||
const string& str = fp[0]->data()->getStrVal(row, isNull);
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
int64_t strLen = str.length();
|
||||
const char* strptr = str.c_str();
|
||||
const char* strend = strptr + strLen;
|
||||
uint32_t strChars = cs->numchars(strptr, strend);
|
||||
|
||||
int64_t start = fp[1]->data()->getIntVal(row, isNull) - 1;
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
|
||||
if (start == -1) // pos == 0
|
||||
if (start < -1) // negative pos, beginning from end
|
||||
start += strChars + 1;
|
||||
if (start < 0 || strChars <= start)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
||||
size_t n = string::npos;
|
||||
|
||||
int64_t length;
|
||||
if (fp.size() == 3)
|
||||
{
|
||||
int64_t len = fp[2]->data()->getIntVal(row, isNull);
|
||||
|
||||
int64_t length = fp[2]->data()->getIntVal(row, isNull);
|
||||
if (isNull)
|
||||
return "";
|
||||
|
||||
if (len < 1)
|
||||
if (length < 1)
|
||||
return "";
|
||||
|
||||
n = len;
|
||||
}
|
||||
|
||||
size_t strLen = strlen(str.c_str());
|
||||
|
||||
if (start < -1) // negative pos, beginning from end
|
||||
start += strLen + 1;
|
||||
|
||||
if (start < 0 || (int64_t)strLen <= start)
|
||||
else
|
||||
{
|
||||
return "";
|
||||
length = strChars - start;
|
||||
}
|
||||
|
||||
return str.substr(start, n);
|
||||
#endif
|
||||
// start is now number of chars into str to start the substring
|
||||
// We convert it to number of bytes:
|
||||
start = cs->charpos(strptr, strend, start);
|
||||
// Convert length to bytes as well
|
||||
length= cs->charpos(strptr + start, strend, length);
|
||||
if ((start < 0) || (start + 1 > strLen))
|
||||
return "";
|
||||
|
||||
if (start == 0 && strLen == length)
|
||||
return str;
|
||||
|
||||
length= MY_MIN(length, strLen - start);
|
||||
|
||||
std::string ret(strptr + start, length);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
@ -45,48 +45,130 @@ CalpontSystemCatalog::ColType Func_substring_index::operationType(FunctionParm&
|
||||
return fp[0]->data()->resultType();
|
||||
}
|
||||
|
||||
|
||||
std::string Func_substring_index::getStrVal(rowgroup::Row& row,
|
||||
FunctionParm& fp,
|
||||
bool& isNull,
|
||||
execplan::CalpontSystemCatalog::ColType&)
|
||||
execplan::CalpontSystemCatalog::ColType& ct)
|
||||
{
|
||||
CHARSET_INFO* cs = ct.getCharset();
|
||||
|
||||
const string& str = fp[0]->data()->getStrVal(row, isNull);
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
int64_t strLen = str.length();
|
||||
|
||||
const string& delim = fp[1]->data()->getStrVal(row, isNull);
|
||||
|
||||
const string& delimstr = fp[1]->data()->getStrVal(row, isNull);
|
||||
if (isNull)
|
||||
return "";
|
||||
int64_t delimLen = delimstr.length();
|
||||
|
||||
int64_t count = fp[2]->data()->getIntVal(row, isNull);
|
||||
|
||||
if (isNull)
|
||||
return "";
|
||||
|
||||
if ( count == 0 )
|
||||
if (strLen == 0 || delimLen == 0 || !count == 0)
|
||||
return "";
|
||||
|
||||
// To avoid comparison b/w int64_t and size_t
|
||||
int64_t end = strlen(str.c_str()) & 0x7fffffffffffffff;
|
||||
|
||||
if ( count > end )
|
||||
if (count > strLen)
|
||||
return str;
|
||||
|
||||
if (( count < 0 ) && ((count * -1) > (int64_t) end))
|
||||
if ((count < 0) && ((count * -1) > strLen))
|
||||
return str;
|
||||
|
||||
string value = str;
|
||||
std::string value; // Only used if !use_mb()
|
||||
|
||||
if (cs->use_mb()) // Charset supports multibyte characters
|
||||
{
|
||||
const char* src = str.c_str();
|
||||
const char* srcEnd = src + strLen;
|
||||
const char* srchEnd = srcEnd - delimLen + 1;
|
||||
const char* delim = delimstr.c_str();
|
||||
const char* delimEnd = delim + delimLen;
|
||||
char* ptr = const_cast<char*>(src);
|
||||
char *i,*j;
|
||||
uint32_t l;
|
||||
int32 n = 0, c = count, pass;
|
||||
// For count > 0, this loop goes once.
|
||||
// For count < 0, it goes twice
|
||||
for (pass = (count > 0 ? 1 : 0); pass<2; ++pass)
|
||||
{
|
||||
while (ptr < srchEnd)
|
||||
{
|
||||
bool found = false;
|
||||
if (*ptr == *delim) // If the first byte matches, maybe we have a match
|
||||
{
|
||||
// Do a byte by byte compare of src at that spot against delim
|
||||
i = ptr + 1;
|
||||
j = const_cast<char*>(delim) + 1;
|
||||
found = true;
|
||||
while (j != delimEnd)
|
||||
{
|
||||
if (*i++ != *j++)
|
||||
{
|
||||
found = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (found)
|
||||
{
|
||||
if (pass==0)
|
||||
++n;
|
||||
else if (!--c)
|
||||
break;
|
||||
|
||||
ptr += delimLen;
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
// move to the next character
|
||||
if ((l = my_ismbchar(cs, ptr, srcEnd))) // returns the number of bytes in the leading char or zero if one byte
|
||||
ptr += l;
|
||||
else
|
||||
++ptr;
|
||||
}
|
||||
}
|
||||
if (pass == 0) /* count<0 */
|
||||
{
|
||||
c += n + 1;
|
||||
if (c <= 0)
|
||||
{
|
||||
return str; // not found, return the original string
|
||||
}
|
||||
// Go back and do a second pass
|
||||
ptr = const_cast<char*>(src);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (c)
|
||||
{
|
||||
return str; // not found, return the original string
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( count > 0) /* return left part */
|
||||
{
|
||||
std::string ret(src, ptr - src);
|
||||
return ret;
|
||||
}
|
||||
else /* return right part */
|
||||
{
|
||||
ptr+= delimLen;
|
||||
std::string ret(ptr, srcEnd - ptr);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (count > 0)
|
||||
{
|
||||
int pointer = 0;
|
||||
|
||||
int64_t end = strLen;
|
||||
for ( int64_t i = 0 ; i < count ; i ++ )
|
||||
{
|
||||
string::size_type pos = str.find(delim, pointer);
|
||||
string::size_type pos = str.find(delimstr, pointer);
|
||||
|
||||
if (pos != string::npos)
|
||||
pointer = pos + 1;
|
||||
@ -99,16 +181,16 @@ std::string Func_substring_index::getStrVal(rowgroup::Row& row,
|
||||
else
|
||||
{
|
||||
count = -count;
|
||||
int pointer = end;
|
||||
int pointer = strLen;
|
||||
int start = 0;
|
||||
|
||||
for ( int64_t i = 0 ; i < count ; i ++ )
|
||||
{
|
||||
string::size_type pos = str.rfind(delim, pointer);
|
||||
string::size_type pos = str.rfind(delimstr, pointer);
|
||||
|
||||
if (pos != string::npos)
|
||||
{
|
||||
if ( count > end )
|
||||
if ( count > strLen )
|
||||
return "";
|
||||
|
||||
pointer = pos - 1;
|
||||
@ -118,9 +200,9 @@ std::string Func_substring_index::getStrVal(rowgroup::Row& row,
|
||||
start = 0;
|
||||
}
|
||||
|
||||
value = str.substr(start, end);
|
||||
value = str.substr(start, strLen);
|
||||
}
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
|
@ -132,12 +132,12 @@ std::string Func_trim::getStrVal(rowgroup::Row& row,
|
||||
|
||||
// We start at the beginning of the string and move forward
|
||||
// one character at a time until we reach the end. Then we can
|
||||
// safely compare and remove on character. Then back to the beginning
|
||||
// safely compare and remove one character. Then back to the beginning
|
||||
// and try again.
|
||||
while (end - binTLen >= pos)
|
||||
{
|
||||
const char* p = pos;
|
||||
uint32 l;
|
||||
uint32_t l;
|
||||
while (p + binTLen < end)
|
||||
{
|
||||
if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte
|
||||
|
Reference in New Issue
Block a user