funcexp: charset-aware REPLACE, SUBSTR and SUBSTRING_INDEX

Func_replace, Func_substr and Func_substring_index read the CHARSET_INFO
from the column type passed to getStrVal() and step through multi-byte
strings with my_ismbchar() / numchars() / charpos() instead of assuming one
byte per character. Func_trim only gets a comment typo fix and
uint32 -> uint32_t.

Corrections folded into this revision of the patch:
  * func_replace.cpp, binary path: the occurrence-counting loop left pos at
    npos, so the copy loop started from npos instead of the first match.
    pos is rewound to the first match, the offset is a size_t, and the
    copies use append() so the reservation is actually used.
  * func_replace.cpp, multi-byte path: unmatched bytes and the tail of the
    input were never copied, the cursor advanced by toLen instead of
    fromLen, and the "re-alloc" branch reserved more than max_size(). The
    loop now flushes [src, ptr) before each replacement and appends the tail.
  * func_substr.cpp: the three-argument branch declared a second `length`
    that shadowed the outer one, leaving the outer variable uninitialised.
  * func_substring_index.cpp: `!count == 0` parsed as `(!count) == 0` and
    returned "" for every non-zero count; it is now `count == 0`. The
    single-byte branch skips the whole delimiter (delimLen), not one byte.

diff --git a/utils/funcexp/func_replace.cpp b/utils/funcexp/func_replace.cpp
index 0c16c9174..f147f3a03 100644
--- a/utils/funcexp/func_replace.cpp
+++ b/utils/funcexp/func_replace.cpp
@@ -49,34 +49,105 @@ CalpontSystemCatalog::ColType Func_replace::operationType(FunctionParm& fp, Calp
 std::string Func_replace::getStrVal(rowgroup::Row& row,
                                     FunctionParm& fp,
                                     bool& isNull,
-                                    execplan::CalpontSystemCatalog::ColType&)
+                                    execplan::CalpontSystemCatalog::ColType& ct)
 {
+    CHARSET_INFO* cs = ct.getCharset();
+
     const string& str = fp[0]->data()->getStrVal(row, isNull);
-
+    if (isNull)
+        return "";
+    size_t strLen = str.length();
+
     const string& fromstr = fp[1]->data()->getStrVal(row, isNull);
-
+    if (isNull)
+        return "";
+    if (fromstr.length() == 0)
+        return str;
+    size_t fromLen = fromstr.length();
+
     const string& tostr = fp[2]->data()->getStrVal(row, isNull);
+    if (isNull)
+        return "";
+    size_t toLen = tostr.length();
 
+    bool binaryCmp = (cs->state & MY_CS_BINSORT) || !cs->use_mb();
     string newstr;
-    unsigned int i = 0;
-
-    for (;;)
+    size_t pos = 0;
+    if (binaryCmp)
     {
-        size_t pos = str.find(fromstr, i);
-
-        if ( pos != string::npos )
+        size_t i = 0;
+        pos = str.find(fromstr);
+        if (pos == string::npos)
+            return str;
+
+        // Count the occurrences of fromstr in str so that newstr can be
+        // sized with a single reservation.
+        int count = 0;
+        do
         {
-            //match
-            if ( pos > i )
+            ++count;
+            pos = str.find(fromstr, pos + fromLen);
+        }
+        while (pos != string::npos);
+
+        newstr.reserve(strLen + (count * ((int)toLen - (int)fromLen)) + 1);
+
+        // The counting loop leaves pos at npos, so rewind to the first
+        // match before copying.
+        pos = str.find(fromstr);
+
+        // Now move the stuff into newstr
+        do
+        {
+            if (pos > i)
-                newstr = newstr + str.substr(i, pos - i);
+                newstr.append(str, i, pos - i);
 
-            newstr = newstr + tostr;
-            i = pos + fromstr.size();
+            newstr.append(tostr);
+            i = pos + fromLen;
+            pos = str.find(fromstr, i);
         }
-        else
+        while (pos != string::npos);
+
+        newstr.append(str, i, string::npos);
+    }
+    else
+    {
+        // UTF
+        const char* src = str.c_str(); // first source byte not yet copied to newstr
+        const char* srcEnd = src + strLen;
+        const char* ptr = src; // scan position
+        uint32_t l;
+
+        newstr.reserve(strLen + 1);
+
+        // We don't know where byte patterns might match so
+        // we start at the beginning of the string and move forward
+        // one character at a time until we find a match. Then we can
+        // move the src bytes and add in the to bytes, then try again.
+        // A match needs fromLen bytes, so stop once fewer than that remain.
+        while (srcEnd - ptr >= static_cast<int64_t>(fromLen))
         {
-            newstr = newstr + str.substr(i, 1000);
-            break;
+            if (str.compare(ptr - str.c_str(), fromLen, fromstr) == 0)
+            {
+                // Flush the unmatched bytes [src, ptr), then the replacement,
+                // and resume scanning right after the matched bytes.
+                newstr.append(src, ptr - src);
+                newstr.append(tostr);
+                ptr += fromLen;
+                src = ptr;
+            }
+            else
+            {
+                // move to the next character
+                if ((l = my_ismbchar(cs, ptr, srcEnd))) // returns the number of bytes in the leading char or zero if one byte
+                    ptr += l;
+                else
+                    ++ptr;
+            }
         }
+
+        // Copy the bytes that follow the last match (the whole string when
+        // nothing matched).
+        newstr.append(src, srcEnd - src);
     }
 
diff --git a/utils/funcexp/func_substr.cpp b/utils/funcexp/func_substr.cpp
index 3f8be7373..6fdb5c16a 100644
--- a/utils/funcexp/func_substr.cpp
+++ b/utils/funcexp/func_substr.cpp
@@ -37,8 +37,6 @@ using namespace joblist;
 
 #include "collation.h"
 
-#define STRCOLL_ENH__
-
 namespace funcexp
 {
 
@@ -52,101 +50,57 @@ CalpontSystemCatalog::ColType Func_substr::operationType(FunctionParm& fp, Calpo
 std::string Func_substr::getStrVal(rowgroup::Row& row,
                                    FunctionParm& fp,
                                    bool& isNull,
-                                   execplan::CalpontSystemCatalog::ColType&)
+                                   execplan::CalpontSystemCatalog::ColType& ct)
 {
-#ifdef STRCOLL_ENH__
-    const string& tstr = fp[0]->data()->getStrVal(row, isNull);
+    CHARSET_INFO* cs = ct.getCharset();
 
-    if (isNull)
-        return "";
-
-    size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1;
-    wchar_t* wcbuf = new wchar_t[strwclen];
-    strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen);
-    wstring str(wcbuf, strwclen);
-
-    int64_t start = fp[1]->data()->getIntVal(row, isNull) - 1;
-
-    if (isNull)
-        return "";
-
-    if (start == -1)  // pos == 0
-        return "";
-
-    wstring::size_type n = wstring::npos;
-
-    if (fp.size() == 3)
-    {
-        int64_t len = fp[2]->data()->getIntVal(row, isNull);
-
-        if (isNull)
-            return "";
-
-        if (len < 1)
-            return "";
-
-        n = len;
-    }
-
-    int64_t strLen = static_cast<int64_t>(str.length());
-
-    if (start < -1)  // negative pos, beginning from end
-        start += strLen + 1;
-
-    if (start < 0 || strLen <= start)
-    {
-        return "";
-    }
-
-    wstring out = str.substr(start, n);
-    size_t strmblen = utf8::idb_wcstombs(0, out.c_str(), 0) + 1;
-    char* outbuf = new char[strmblen];
-    strmblen = utf8::idb_wcstombs(outbuf, out.c_str(), strmblen);
-    std::string ret(outbuf, strmblen);
-    delete [] outbuf;
-    delete [] wcbuf;
-    return ret;
-#else
     const string& str = fp[0]->data()->getStrVal(row, isNull);
-
     if (isNull)
         return "";
-
+    int64_t strLen = str.length();
+    const char* strptr = str.c_str();
+    const char* strend = strptr + strLen;
+    uint32_t strChars = cs->numchars(strptr, strend);
+
     int64_t start = fp[1]->data()->getIntVal(row, isNull) - 1;
-
     if (isNull)
         return "";
-
-    if (start == -1)  // pos == 0
+    if (start < -1)  // negative pos, beginning from end
+        start += strChars + 1;
+    if (start < 0 || strChars <= start)
+    {
         return "";
+    }
 
-    size_t n = string::npos;
-
+    int64_t length; // character count here; converted to a byte count below
     if (fp.size() == 3)
     {
-        int64_t len = fp[2]->data()->getIntVal(row, isNull);
-
+        length = fp[2]->data()->getIntVal(row, isNull);
         if (isNull)
             return "";
-
-        if (len < 1)
+        if (length < 1)
             return "";
-
-        n = len;
     }
-
-    size_t strLen = strlen(str.c_str());
-
-    if (start < -1)  // negative pos, beginning from end
-        start += strLen + 1;
-
-    if (start < 0 || (int64_t)strLen <= start)
+    else
     {
-        return "";
+        length = strChars - start;
     }
 
-    return str.substr(start, n);
-#endif
+    // start is now number of chars into str to start the substring
+    // We convert it to number of bytes:
+    start = cs->charpos(strptr, strend, start);
+    // Convert length to bytes as well
+    length = cs->charpos(strptr + start, strend, length);
+    if ((start < 0) || (start + 1 > strLen))
+        return "";
+
+    if (start == 0 && strLen == length)
+        return str;
+
+    length = MY_MIN(length, strLen - start);
+
+    std::string ret(strptr + start, length);
+    return ret;
 }
 
 
diff --git a/utils/funcexp/func_substring_index.cpp b/utils/funcexp/func_substring_index.cpp
index 970b3d28c..0fb9efd38 100644
--- a/utils/funcexp/func_substring_index.cpp
+++ b/utils/funcexp/func_substring_index.cpp
@@ -45,82 +45,164 @@ CalpontSystemCatalog::ColType Func_substring_index::operationType(FunctionParm&
     return fp[0]->data()->resultType();
 }
 
-
 std::string Func_substring_index::getStrVal(rowgroup::Row& row,
         FunctionParm& fp,
         bool& isNull,
-        execplan::CalpontSystemCatalog::ColType&)
+        execplan::CalpontSystemCatalog::ColType& ct)
 {
+    CHARSET_INFO* cs = ct.getCharset();
+
     const string& str = fp[0]->data()->getStrVal(row, isNull);
-
     if (isNull)
         return "";
-
-    const string& delim = fp[1]->data()->getStrVal(row, isNull);
-
+    int64_t strLen = str.length();
+
+    const string& delimstr = fp[1]->data()->getStrVal(row, isNull);
     if (isNull)
         return "";
-
+    int64_t delimLen = delimstr.length();
+
     int64_t count = fp[2]->data()->getIntVal(row, isNull);
-
     if (isNull)
         return "";
 
-    if ( count == 0 )
+    if (strLen == 0 || delimLen == 0 || count == 0)
         return "";
 
-    // To avoid comparison b/w int64_t and size_t
-    int64_t end = strlen(str.c_str()) & 0x7fffffffffffffff;
-
-    if ( count > end )
+    if (count > strLen)
         return str;
 
-    if (( count < 0 ) && ((count * -1) > (int64_t) end))
+    if ((count < 0) && ((count * -1) > strLen))
         return str;
 
-    string value = str;
-
-    if ( count > 0 )
+    std::string value; // Only used if !use_mb()
+
+    if (cs->use_mb()) // Charset supports multibyte characters
     {
-        int pointer = 0;
-
-        for ( int64_t i = 0 ; i < count ; i ++ )
+        const char* src = str.c_str();
+        const char* srcEnd = src + strLen;
+        const char* srchEnd = srcEnd - delimLen + 1;
+        const char* delim = delimstr.c_str();
+        const char* delimEnd = delim + delimLen;
+        char* ptr = const_cast<char*>(src);
+        char *i,*j;
+        uint32_t l;
+        int32_t n = 0, c = count, pass;
+        // For count > 0, this loop goes once.
+        // For count < 0, it goes twice
+        for (pass = (count > 0 ? 1 : 0); pass<2; ++pass)
         {
-            string::size_type pos = str.find(delim, pointer);
-
-            if (pos != string::npos)
-                pointer = pos + 1;
-
-            end = pos;
+            while (ptr < srchEnd)
+            {
+                bool found = false;
+                if (*ptr == *delim) // If the first byte matches, maybe we have a match
+                {
+                    // Do a byte by byte compare of src at that spot against delim
+                    i = ptr + 1;
+                    j = const_cast<char*>(delim) + 1;
+                    found = true;
+                    while (j != delimEnd)
+                    {
+                        if (*i++ != *j++)
+                        {
+                            found = false;
+                            break;
+                        }
+                    }
+                }
+                if (found)
+                {
+                    if (pass==0)
+                        ++n;
+                    else if (!--c)
+                        break;
+
+                    ptr += delimLen;
+                    continue;
+                }
+                else
+                {
+                    // move to the next character
+                    if ((l = my_ismbchar(cs, ptr, srcEnd))) // returns the number of bytes in the leading char or zero if one byte
+                        ptr += l;
+                    else
+                        ++ptr;
+                }
+            }
+            if (pass == 0) /* count<0 */
+            {
+                c += n + 1;
+                if (c <= 0)
+                {
+                    return str; // not found, return the original string
+                }
+                // Go back and do a second pass
+                ptr = const_cast<char*>(src);
+            }
+            else
+            {
+                if (c)
+                {
+                    return str; // not found, return the original string
+                }
+            }
+        }
+
+        if ( count > 0) /* return left part */
+        {
+            std::string ret(src, ptr - src);
+            return ret;
+        }
+        else /* return right part */
+        {
+            ptr+= delimLen;
+            std::string ret(ptr, srcEnd - ptr);
+            return ret;
         }
-
-        value = str.substr(0, end);
     }
     else
     {
-        count = -count;
-        int pointer = end;
-        int start = 0;
-
-        for ( int64_t i = 0 ; i < count ; i ++ )
+        if (count > 0)
         {
-            string::size_type pos = str.rfind(delim, pointer);
-
-            if (pos != string::npos)
+            int pointer = 0;
+            int64_t end = strLen;
+            for ( int64_t i = 0 ; i < count ; i ++ )
             {
-                if ( count > end )
-                    return "";
+                string::size_type pos = str.find(delimstr, pointer);
 
-                pointer = pos - 1;
-                start = pos + 1;
+                if (pos != string::npos)
+                    pointer = pos + delimLen;
+
+                end = pos;
             }
-            else
-                start = 0;
+
+            value = str.substr(0, end);
         }
+        else
+        {
+            count = -count;
+            int pointer = strLen;
+            int start = 0;
 
-        value = str.substr(start, end);
+            for ( int64_t i = 0 ; i < count ; i ++ )
+            {
+                string::size_type pos = str.rfind(delimstr, pointer);
+
+                if (pos != string::npos)
+                {
+                    if ( count > strLen )
+                        return "";
+
+                    pointer = pos - 1;
+                    start = pos + delimLen;
+                }
+                else
+                    start = 0;
+            }
+
+            value = str.substr(start, strLen);
+        }
     }
-
     return value;
 }
 
diff --git a/utils/funcexp/func_trim.cpp b/utils/funcexp/func_trim.cpp
index 2a522d3fb..005183fcd 100644
--- a/utils/funcexp/func_trim.cpp
+++ b/utils/funcexp/func_trim.cpp
@@ -132,12 +132,12 @@ std::string Func_trim::getStrVal(rowgroup::Row& row,
 
         // We start at the beginning of the string and move forward
         // one character at a time until we reach the end. Then we can
-        // safely compare and remove on character. Then back to the beginning
+        // safely compare and remove one character. Then back to the beginning
         // and try again.
         while (end - binTLen >= pos)
         {
             const char* p = pos;
-            uint32 l;
+            uint32_t l;
             while (p + binTLen < end)
             {
                 if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte