MCOL-3536 Collation

2025-07-29 08:21:15 +03:00 · 2020-06-12 10:19:17 -05:00
parent de125bac2b
commit 165ae4a6f3
4 changed files with 267 additions and 139 deletions
--- a/utils/funcexp/func_replace.cpp
+++ b/utils/funcexp/func_replace.cpp
@ -49,36 +49,128 @@ CalpontSystemCatalog::ColType Func_replace::operationType(FunctionParm& fp, Calp
 std::string Func_replace::getStrVal(rowgroup::Row& row,
                                     FunctionParm& fp,
                                     bool& isNull,
-                                    execplan::CalpontSystemCatalog::ColType&)
+                                    execplan::CalpontSystemCatalog::ColType& ct)
 {
+    // REPLACE(str, from, to): returns str with every non-overlapping
+    // occurrence of from replaced by to.
+    //   fp[0] - subject string, fp[1] - search string, fp[2] - replacement.
+    //   isNull - set by the argument evaluation; when any argument is NULL
+    //            an empty string is returned and isNull stays set.
+    //   ct     - result column type; only its charset is consulted.
+    // An empty search string returns str unchanged.
+    CHARSET_INFO* cs = ct.getCharset();
+
     const string& str = fp[0]->data()->getStrVal(row, isNull);
+    if (isNull)
+        return "";
+    size_t strLen = str.length();
     
     const string& fromstr = fp[1]->data()->getStrVal(row, isNull);
+    if (isNull)
+        return "";
+    if (fromstr.length() == 0)
+        return str;
+    size_t fromLen = fromstr.length();
     
     const string& tostr = fp[2]->data()->getStrVal(row, isNull);
+    if (isNull)
+        return "";
+    size_t toLen = tostr.length();

+    // Plain byte search is used for binary collations and for charsets
+    // without multi-byte characters; otherwise we step per character.
+    bool binaryCmp = (cs->state & MY_CS_BINSORT) || !cs->use_mb();
     string newstr;
-    unsigned int i = 0;
-
-    for (;;)
+    if (binaryCmp)
     {
-        size_t pos = str.find(fromstr, i);
+        size_t i = 0; // first byte of str not yet copied into newstr
+        const size_t firstPos = str.find(fromstr);
+        if (firstPos == string::npos)
+            return str;
         
-        if ( pos != string::npos )
+        // Count the occurrences of fromstr in str so the result can be sized.
+        size_t count = 0;
+        size_t pos = firstPos;
+        do
+        {
+            ++count;
+            pos = str.find(fromstr, pos + fromLen);
+        }
+        while (pos != string::npos);
+        
+        // Matches do not overlap, so count * fromLen <= strLen and the
+        // subtraction cannot wrap.
+        newstr.reserve(strLen - count * fromLen + count * toLen + 1);
+        
+        // The counting loop left pos at npos; rewind to the first match
+        // before moving the pieces into newstr.
+        pos = firstPos;
+        do
         {
-            //match
             if (pos > i)
                 newstr = newstr + str.substr(i, pos - i);

             newstr = newstr + tostr;
-            i = pos + fromstr.size();
+            i = pos + fromLen;
+            pos = str.find(fromstr, i);
+        }
+        while (pos != string::npos);
+        
+        // Trailing bytes after the last match
+        newstr = newstr + str.substr(i, string::npos);
     }
     else
     {
-            newstr = newstr + str.substr(i, 1000);
+        // Multi-byte charset.
+        // src marks the start of the bytes not yet copied to newstr,
+        // ptr is the scan position; both always sit on a character boundary.
+        const char* src = str.c_str();
+        const char* srcEnd = src + strLen;
+        const char* from = fromstr.c_str();
+        const char* fromEnd = from + fromLen;
+        const char* ptr = src;
+        uint32_t l;
+
+        newstr.reserve(strLen + 1);
+
+        // We don't know where byte patterns might match so
+        // we start at the beginning of the string and move forward
+        // one character at a time until we find a match. Then we copy
+        // the unmatched src bytes, add the to bytes, and try again.
+        // Comparing the remaining length against fromLen (fromLen >= 1 here)
+        // avoids forming a pointer before the start of the buffer.
+        while (static_cast<size_t>(srcEnd - ptr) >= fromLen)
+        {
+            bool found = false;
+            if (*ptr == *from)  // If the first byte matches, maybe we have a match
+            {
+                // Do a byte by byte compare of src at that spot against from
+                const char* i = ptr + 1;
+                const char* j = from + 1;
+                found = true;
+                while (j != fromEnd)
+                {
+                  if (*i++ != *j++)
+                  {
+                      found = false;
                       break;
                   }
                 }
+            }
+            if (found)
+            {
+                // Copy everything between the previous match and this one,
+                // then the replacement, and skip over the matched bytes.
+                newstr.append(src, ptr - src);
+                newstr.append(tostr);
+                ptr += fromLen;
+                src = ptr;
+            }
+            else
+            {
+                // move to the next character
+                if ((l = my_ismbchar(cs, ptr, srcEnd))) // returns the number of bytes in the leading char or zero if one byte
+                    ptr += l;
+                else
+                    ++ptr;
+            }
+        }
+        // Copy the trailing src bytes that follow the last match.
+        newstr.append(src, srcEnd - src);
+    }

     return newstr;
 }
--- a/utils/funcexp/func_substr.cpp
+++ b/utils/funcexp/func_substr.cpp
@ -37,8 +37,6 @@ using namespace joblist;

 #include "collation.h"

-#define STRCOLL_ENH__
-
 namespace funcexp
 {

@ -52,101 +50,57 @@ CalpontSystemCatalog::ColType Func_substr::operationType(FunctionParm& fp, Calpo
 std::string Func_substr::getStrVal(rowgroup::Row& row,
                                   FunctionParm& fp,
                                   bool& isNull,
-                                   execplan::CalpontSystemCatalog::ColType&)
+                                   execplan::CalpontSystemCatalog::ColType& ct)
 {
-#ifdef STRCOLL_ENH__
-    const string& tstr = fp[0]->data()->getStrVal(row, isNull);
+    // SUBSTR(str, pos [, len]): positions and lengths are counted in
+    // characters of the column charset, not bytes.
+    //   fp[0] - subject string, fp[1] - 1-based position (negative counts
+    //   from the end), fp[2] - optional character count.
+    // Returns "" when an argument is NULL, pos is 0 or out of range,
+    // or len is less than 1.
+    CHARSET_INFO* cs = ct.getCharset();

-    if (isNull)
-        return "";
-
-    size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1;
-    wchar_t* wcbuf = new wchar_t[strwclen];
-    strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen);
-    wstring str(wcbuf, strwclen);
-
-    int64_t start = fp[1]->data()->getIntVal(row, isNull) - 1;
-
-    if (isNull)
-        return "";
-
-    if (start == -1)  // pos == 0
-        return "";
-
-    wstring::size_type n = wstring::npos;
-
-    if (fp.size() == 3)
-    {
-        int64_t len = fp[2]->data()->getIntVal(row, isNull);
-
-        if (isNull)
-            return "";
-
-        if (len < 1)
-            return "";
-
-        n = len;
-    }
-
-    int64_t strLen = static_cast<int64_t>(str.length());
-
-    if (start < -1)  // negative pos, beginning from end
-        start += strLen + 1;
-
-    if (start < 0 || strLen <= start)
-    {
-        return "";
-    }
-
-    wstring out = str.substr(start, n);
-    size_t strmblen = utf8::idb_wcstombs(0, out.c_str(), 0) + 1;
-    char* outbuf = new char[strmblen];
-    strmblen = utf8::idb_wcstombs(outbuf, out.c_str(), strmblen);
-    std::string ret(outbuf, strmblen);
-    delete [] outbuf;
-    delete [] wcbuf;
-    return ret;
-#else
     const string& str = fp[0]->data()->getStrVal(row, isNull);
-
     if (isNull)
         return "";
+    int64_t strLen = str.length();
+    const char* strptr = str.c_str();
+    const char* strend = strptr + strLen;
+    // Signed on purpose: it is compared with and added to the signed start.
+    int64_t strChars = cs->numchars(strptr, strend);
     
     int64_t start = fp[1]->data()->getIntVal(row, isNull) - 1;
-
     if (isNull)
         return "";
-
-    if (start == -1)  // pos == 0
+    if (start < -1)  // negative pos, beginning from end
+        start += strChars + 1;
+    if (start < 0 || strChars <= start)
+    {
         return "";
+    }

-    size_t n = string::npos;
-
+    // Number of characters to return. Assigned in both branches below;
+    // it must not be re-declared inside the if, or the value is lost.
+    int64_t length = 0;
     if (fp.size() == 3)
     {
-        int64_t len = fp[2]->data()->getIntVal(row, isNull);
-
+        length = fp[2]->data()->getIntVal(row, isNull);
         if (isNull)
             return "";
-
-        if (len < 1)
+        if (length < 1)
             return "";
-
-        n = len;
     }
-
-    size_t strLen = strlen(str.c_str());
-
-    if (start < -1)  // negative pos, beginning from end
-        start += strLen + 1;
-
-    if (start < 0 || (int64_t)strLen <= start)
+    else
     {
-        return "";
+        length = strChars - start;
     }

-    return str.substr(start, n);
-#endif
+    // Never ask for more characters than remain after start.
+    if (length > strChars - start)
+        length = strChars - start;
+
+    // start is now number of chars into str to start the substring
+    // We convert it to number of bytes and validate it before it is
+    // used for pointer arithmetic.
+    start = cs->charpos(strptr, strend, start);
+    if ((start < 0) || (start >= strLen))
+        return "";
+
+    // Convert length to bytes as well, measured from the start offset,
+    // and keep it inside the buffer.
+    length = cs->charpos(strptr + start, strend, length);
+    length = MY_MIN(length, strLen - start);
+
+    if (start == 0 && length == strLen)
+        return str;
+
+    return std::string(strptr + start, length);
 }


--- a/utils/funcexp/func_substring_index.cpp
+++ b/utils/funcexp/func_substring_index.cpp
@ -45,48 +45,130 @@ CalpontSystemCatalog::ColType Func_substring_index::operationType(FunctionParm&
    return fp[0]->data()->resultType();
 }

-
 std::string Func_substring_index::getStrVal(rowgroup::Row& row,
        FunctionParm& fp,
        bool& isNull,
-        execplan::CalpontSystemCatalog::ColType&)
+        execplan::CalpontSystemCatalog::ColType& ct)
 {
+    // SUBSTRING_INDEX(str, delim, count):
+    //   count > 0 - everything left of the count-th delimiter from the left;
+    //   count < 0 - everything right of the |count|-th delimiter from the right.
+    // Returns "" for a NULL argument, an empty str or delim, or count == 0,
+    // and str itself when it holds fewer than |count| delimiters.
+    CHARSET_INFO* cs = ct.getCharset();
+
     const string& str = fp[0]->data()->getStrVal(row, isNull);
-
     if (isNull)
         return "";
+    int64_t strLen = str.length();
     
-    const string& delim = fp[1]->data()->getStrVal(row, isNull);
-
+    const string& delimstr = fp[1]->data()->getStrVal(row, isNull);
     if (isNull)
         return "";
+    int64_t delimLen = delimstr.length();
     
     int64_t count = fp[2]->data()->getIntVal(row, isNull);
-
     if (isNull)
         return "";

-    if ( count == 0 )
+    // Note: "!count == 0" would parse as "(!count) == 0", i.e. count != 0.
+    if (strLen == 0 || delimLen == 0 || count == 0)
         return "";

-    // To avoid comparison b/w int64_t and size_t
-    int64_t end = strlen(str.c_str()) & 0x7fffffffffffffff;
-
-    if ( count >  end )
+    // A string of strLen bytes cannot hold more than strLen delimiters.
+    if (count > strLen)
         return str;

-    if (( count < 0 ) && ((count * -1) > (int64_t) end))
+    // Same bound for negative counts, written without negating count so
+    // the most negative int64_t cannot overflow.
+    if (count < -strLen)
         return str;

-    string value = str;
+    std::string value; // Only used if !use_mb()
      
+    if (cs->use_mb()) // Charset supports multibyte characters
+    {
+        const char* src = str.c_str();
+        const char* srcEnd = src + strLen;
+        const char* delim = delimstr.c_str();
+        const char* delimEnd = delim + delimLen;
+        const char* ptr = src;
+        uint32_t l;
+        int64_t n = 0;     // delimiters counted during pass 0
+        int64_t c = count; // delimiters still to skip during pass 1
+        int pass;
+        // For count > 0, this loop goes once (pass 1 only).
+        // For count < 0, it goes twice: pass 0 counts the delimiters so the
+        // negative count can be turned into a position from the left.
+        for (pass = (count > 0 ? 1 : 0); pass<2; ++pass)
+        {
+            // Stop once fewer than delimLen bytes remain; this also avoids
+            // forming a pointer before the start of the buffer.
+            while (srcEnd - ptr >= delimLen)
+            {
+                bool found = false;
+                if (*ptr == *delim)  // If the first byte matches, maybe we have a match
+                {
+                    // Do a byte by byte compare of src at that spot against delim
+                    const char* i = ptr + 1;
+                    const char* j = delim + 1;
+                    found = true;
+                    while (j != delimEnd)
+                    {
+                        if (*i++ != *j++)
+                        {
+                            found = false;
+                            break;
+                        }
+                    }
+                }
+                if (found)
+                {
+                    if (pass==0) 
+                        ++n;
+                    else if (!--c) 
+                        break; // ptr is left on the delimiter we were looking for
+                    
+                    ptr += delimLen;
+                    continue;
+                }
+                else
+                {
+                    // move to the next character
+                    if ((l = my_ismbchar(cs, ptr, srcEnd))) // returns the number of bytes in the leading char or zero if one byte
+                        ptr += l;
+                    else
+                        ++ptr;
+                }
+            }
+            if (pass == 0) /* count<0 */
+            {
+                // Convert "count-th from the right" to "c-th from the left".
+                c += n + 1;
+                if (c <= 0)
+                {
+                    return str; // not found, return the original string
+                }
+                // Go back and do a second pass
+                ptr = src;
+            }
+            else
+            {
+                if (c)
+                {
+                    return str; // not found, return the original string
+                }
+            }
+        }
+        
+        if ( count > 0) /* return left part */
+        {
+            std::string ret(src, ptr - src);
+            return ret;
+        }
+        else /* return right part */
+        {
+            ptr+= delimLen;
+            std::string ret(ptr, srcEnd - ptr);
+            return ret;
+        }
+    }
+    else
+    {
         if (count > 0)
         {
             int pointer = 0;
-
+            int64_t end = strLen;
             for ( int64_t i = 0 ; i < count ; i ++ )
             {
-            string::size_type pos = str.find(delim, pointer);
+                string::size_type pos = str.find(delimstr, pointer);

                 if (pos != string::npos)
                     pointer = pos + 1;
@ -99,16 +181,16 @@ std::string Func_substring_index::getStrVal(rowgroup::Row& row,
        else
        {
            count = -count;
-        int pointer = end;
+            int pointer = strLen;
            int start = 0;

            for ( int64_t i = 0 ; i < count ; i ++ )
            {
-            string::size_type pos = str.rfind(delim, pointer);
+                string::size_type pos = str.rfind(delimstr, pointer);

                if (pos != string::npos)
                {
-                if ( count > end )
+                    if ( count > strLen )
                        return "";

                    pointer = pos - 1;
@ -118,9 +200,9 @@ std::string Func_substring_index::getStrVal(rowgroup::Row& row,
                    start = 0;
            }

-        value = str.substr(start, end);
+            value = str.substr(start, strLen);
+        }
    }
-
    return value;
 }

--- a/utils/funcexp/func_trim.cpp
+++ b/utils/funcexp/func_trim.cpp
@ -132,12 +132,12 @@ std::string Func_trim::getStrVal(rowgroup::Row& row,
            
            // We start at the beginning of the string and move forward
            // one character at a time until we reach the end. Then we can
-            // safely compare and remove on character. Then back to the beginning 
+            // safely compare and remove one character. Then back to the beginning 
            // and try again.
            while (end - binTLen >= pos)
            {
                const char* p = pos;
-                uint32 l;
+                uint32_t l;
                while (p + binTLen < end)
                {
                    if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte