MCOL-3536 Collation

2025-11-19 22:02:09 +03:00 · 2020-06-03 19:43:53 -05:00
parent 39a93ef753
commit 889094a23d
16 changed files with 211 additions and 151 deletions
--- a/dbcon/mysql/ha_mcs_execplan.cpp
+++ b/dbcon/mysql/ha_mcs_execplan.cpp
@@ -6672,7 +6672,8 @@ int processLimitAndOffset(
    }
    // We don't currently support limit with correlated subquery
-    if (gwi.subQuery && !gwi.correlatedTbNameVec.empty() && csep->hasOrderBy())
+    if (csep->limitNum() != (uint64_t) - 1 &&
            gwi.subQuery && !gwi.correlatedTbNameVec.empty())
    {
        gwi.fatalParseError = true;
        gwi.parseErrorText = IDBErrorInfo::instance()->errorMsg(ERR_NON_SUPPORT_LIMIT_SUB);
--- a/utils/funcexp/func_find_in_set.cpp
+++ b/utils/funcexp/func_find_in_set.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <cstdlib>
 #include <string>
@@ -56,37 +59,58 @@ int64_t Func_find_in_set::getIntVal(rowgroup::Row& row,
                                    CalpontSystemCatalog::ColType& op_ct)
 {
+    // FIND_IN_SET(needle, list): returns the 1-based index of the
+    // comma-separated item of parm[1] that compares equal to parm[0] using
+    // the collation taken from op_ct. Returns 0 when either argument is
+    // NULL, when the needle itself contains a comma, or when nothing matches.
+    // NOTE(review): the unmarked `sep` / `tokens` lines further down still
+    // use `tokenizer` and `newSetString`, whose definitions this patch
+    // removes; they look like pre-patch leftovers -- confirm they are gone
+    // from the final file.
    const string& searchStr = parm[0]->data()->getStrVal(row, isNull);
    if (isNull)
        return 0;
    const string& setString = parm[1]->data()->getStrVal(row, isNull);
    if (isNull)
        return 0;
    if (searchStr.find(",") != string::npos)
        return 0;
-    string newSearchStr(searchStr.substr(0, strlen(searchStr.c_str())));
+    // A list that is shorter than the needle cannot contain it. (The test
+    // was inverted before, which rejected every list longer than the needle.)
+    if (setString.length() < searchStr.length())
-    string newSetString(setString.substr(0, strlen(setString.c_str())));
+        return 0;
-    //tokenize the setStr with comma as seprator.
+        
-    typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
+    CHARSET_INFO *cs= op_ct.getCharset();
    boost::char_separator<char> sep( ",");
    tokenizer tokens(newSetString, sep);
-    unsigned i = 0;
+    my_wc_t wc= 0;
-    size_t pos = 0;
+    const char *str_begin= setString.c_str();
-
+    const char *str_end= setString.c_str();
-    for (tokenizer::iterator tok_iter = tokens.begin(); tok_iter != tokens.end(); ++tok_iter)
+    const char *real_end= str_end + setString.length();
    const char *find_str= searchStr.c_str();
    uint find_str_len= searchStr.length();
    int position= 0;
    static const char separator=',';
+    // [str_begin, str_end) is the item currently being scanned; real_end is
+    // the end of the whole list. The list is walked one decoded character at
+    // a time so the separator test is done on characters, not raw bytes.
    while (1)
    {
-        pos = (*tok_iter).find(newSearchStr);
+        int symbol_len;
-        i++;
+        if ((symbol_len= cs->mb_wc(&wc, (uchar*) str_end,
-
+                                 (uchar*) real_end)) > 0)
-        if (( pos != string::npos) && (newSearchStr.length() == (*tok_iter).length()))
+        {
-            return i;
+            const char *substr_end= str_end + symbol_len;
            bool is_last_item= (substr_end == real_end);
            bool is_separator= (wc == (my_wc_t) separator);
            if (is_separator || is_last_item)
            {
                position++;
+                // The final item has no trailing separator, so its last
+                // character belongs to the item being compared.
                if (is_last_item && !is_separator)
                    str_end= substr_end;
                if (!cs->strnncoll(str_begin, (uint) (str_end - str_begin),
                                 find_str, find_str_len))
                    return (longlong) position;
                else
                    str_begin= substr_end;
            }
            str_end= substr_end;
        }
+        // Nothing more could be decoded: an empty needle still matches an
+        // empty item that follows a trailing separator.
        else if (str_end - str_begin == 0 &&
               find_str_len == 0 &&
               wc == (my_wc_t) separator)
            return (longlong) ++position;
        else
            return 0;
    }
    return 0;
 }
--- a/utils/funcexp/func_if.cpp
+++ b/utils/funcexp/func_if.cpp
@@ -53,22 +53,22 @@ bool boolVal(SPTP& parm, Row& row, const string& timeZone)
            case CalpontSystemCatalog::TEXT:
            case CalpontSystemCatalog::VARCHAR:
                ret = (atoi((char*)(parm->data()->getStrVal(timeZone).c_str())) != 0);
-
+                break;
            case CalpontSystemCatalog::FLOAT:
            case CalpontSystemCatalog::UFLOAT:
                ret = (parm->data()->getFloatVal(row, isNull) != 0);
-
+                break;
            case CalpontSystemCatalog::DOUBLE:
            case CalpontSystemCatalog::UDOUBLE:
                ret = (parm->data()->getDoubleVal(row, isNull) != 0);
-
+                break;
            case CalpontSystemCatalog::LONGDOUBLE:
                ret = (parm->data()->getLongDoubleVal(row, isNull) != 0);
-
+                break;
            case CalpontSystemCatalog::DECIMAL:
            case CalpontSystemCatalog::UDECIMAL:
                ret = (parm->data()->getDecimalVal(row, isNull).value != 0);
-
+                break;
            case CalpontSystemCatalog::BIGINT:
            case CalpontSystemCatalog::SMALLINT:
            case CalpontSystemCatalog::MEDINT:
@@ -83,6 +83,7 @@ bool boolVal(SPTP& parm, Row& row, const string& timeZone)
            case CalpontSystemCatalog::TIME:
            default:
                ret = (parm->data()->getIntVal(row, isNull) != 0);
                break;
        }
    }
--- a/utils/funcexp/func_insert.cpp
+++ b/utils/funcexp/func_insert.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
@@ -48,69 +51,61 @@ CalpontSystemCatalog::ColType Func_insert::operationType(FunctionParm& fp, Calpo
    return fp[0]->data()->resultType();
 }
 string insertStr(const string& src, int pos, int len, const string& targ)
 {
    int64_t strLen = static_cast<int64_t>(src.length());
    if ((pos <= 0) || ((pos - 1) >= strLen))
        return src;
    if ((len < 0) || (len > strLen))
        len = strLen;
    const char* srcptr = src.c_str();
    advance(srcptr, pos - 1, srcptr + strLen);
    // srcptr now pointing to where we need to insert targ string
    uint32_t srcPos = srcptr - src.c_str();
    uint32_t finPos = strLen;
    const char* finptr = src.c_str();
    if ((strLen - (pos - 1 + len)) >= 0)
    {
        advance(finptr, (pos - 1 + len), finptr + strLen);
        // finptr now pointing to the end of the string to replace
        finPos = finptr - src.c_str();
    }
    string out;
    out.reserve(srcPos + targ.length() + strLen - finPos + 1);
    out.append( src.c_str(), srcPos );
    out.append( targ.c_str(), targ.length() );
    out.append( src.c_str() + finPos, strLen - finPos );
    return out;
 }
 std::string Func_insert::getStrVal(rowgroup::Row& row,
                                   FunctionParm& fp,
                                   bool& isNull,
                                   execplan::CalpontSystemCatalog::ColType&)
 {
+    // SQL INSERT(str, pos, len, newstr): replaces `len` characters of fp[0],
+    // starting at the 1-based character position fp[1], with fp[3].
+    // Returns "" (with isNull set by the argument fetch) when any argument
+    // is NULL, and the source string unchanged when the position falls
+    // outside of it. Positions and lengths are counted in characters of
+    // fp[0]'s charset and converted to byte offsets before splicing.
-	string tstr;
+	string src;
 	string tnewstr;
-    stringValue(fp[0], row, isNull, tstr);
+    int64_t start, length;
    stringValue(fp[0], row, isNull, src);
    if (isNull)
    {
        return "";
    }
    stringValue(fp[3], row, isNull, tnewstr);
    if (isNull)
        return "";
-    int64_t pos = fp[1]->data()->getIntVal(row, isNull);
+    start = fp[1]->data()->getIntVal(row, isNull);
    if (isNull)
        return "";
-    int64_t len = fp[2]->data()->getIntVal(row, isNull);
+    length = fp[2]->data()->getIntVal(row, isNull);
    if (isNull)
        return "";
-    return insertStr( tstr, pos, len, tnewstr );
+    start--; // Because SQL syntax is 1 based and we want 0 based.
    CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset();
    // binLen represents the number of bytes
    int64_t binLen = static_cast<int64_t>(src.length());
    const char* pos = src.c_str();
    const char* end = pos + binLen;
    // strLen is number of characters
    int64_t strLen = cs->numchars(pos, end);
    // Return the original string if start isn't within the string.
+    // start is already 0-based at this point, so the valid range is
+    // [0, strLen); testing `<= 1` here would reject SQL positions 1 and 2.
    if ((start < 0) || start >= strLen)
        return src;
    if ((length < 0) || (length > strLen))
        length = strLen;
    // Convert start and length from characters to bytes.
    start = cs->charpos(pos, end, start);
    length = cs->charpos(pos+start, end, length);
+    // Fewer than `length` characters may remain after `start`, in which case
+    // the converted byte length can exceed what is left. Clamp it so the
+    // tail append below never receives a negative (wrapped) byte count.
+    if (length > binLen - start)
+        length = binLen - start;
    string out;
    out.reserve(binLen - length + tnewstr.length() + 1);
    out.append(src.c_str(), start);
    out.append(tnewstr.c_str(), tnewstr.length());
    out.append(src.c_str() + start + length, binLen - start - length);
    return out;
 }
--- a/utils/funcexp/func_left.cpp
+++ b/utils/funcexp/func_left.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_length.cpp
+++ b/utils/funcexp/func_length.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <cstdlib>
 #include <string>
--- a/utils/funcexp/func_lpad.cpp
+++ b/utils/funcexp/func_lpad.cpp
@@ -20,6 +20,10 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include "errorids.h"
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_ltrim.cpp
+++ b/utils/funcexp/func_ltrim.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_replace.cpp
+++ b/utils/funcexp/func_replace.cpp
@@ -21,6 +21,10 @@
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_right.cpp
+++ b/utils/funcexp/func_right.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_rpad.cpp
+++ b/utils/funcexp/func_rpad.cpp
@@ -20,6 +20,10 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include "errorids.h"
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_rtrim.cpp
+++ b/utils/funcexp/func_rtrim.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_strcmp.cpp
+++ b/utils/funcexp/func_strcmp.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
--- a/utils/funcexp/func_substr.cpp
+++ b/utils/funcexp/func_substr.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_substring_index.cpp
+++ b/utils/funcexp/func_substring_index.cpp
@@ -21,6 +21,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
--- a/utils/funcexp/func_trim.cpp
+++ b/utils/funcexp/func_trim.cpp
@@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
 #include <mariadb.h>
 #undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
 #include <my_sys.h>
 #include <string>
 using namespace std;
@@ -47,106 +50,106 @@ CalpontSystemCatalog::ColType Func_trim::operationType(FunctionParm& fp, Calpont
 std::string Func_trim::getStrVal(rowgroup::Row& row,
                                 FunctionParm& fp,
                                 bool& isNull,
-                                 execplan::CalpontSystemCatalog::ColType&)
+                                 execplan::CalpontSystemCatalog::ColType& type)
 {
+    // TRIM(): strips every leading and trailing occurrence of the trim
+    // string (fp[1], or a single space when only one argument is passed)
+    // from fp[0] and returns the remainder. Returns "" when fp[0] is NULL
+    // and the source unchanged when it is empty or the trim string is empty
+    // or longer (in characters) than the source. The patched code works on
+    // raw bytes through pos/end/binLen, with three paths: 1-byte trim
+    // string, single-byte charset, and multi-byte charset.
+    // NOTE(review): unmarked lines below that still use tstr, wcbuf, wctrim,
+    // aPtr, aEnd or trimCnt look like pre-patch leftovers (the patch removes
+    // the tstr declaration) -- confirm they are gone from the final file.
-    // The number of characters (not bytes) in our input tstr.
+    CHARSET_INFO* cs = type.getCharset();
    // Not all of these are necessarily significant. We need to search for the
    // NULL terminator to be sure.
    size_t strwclen;
    // this holds the number of characters (not bytes) in ourtrim tstr.
    size_t trimwclen;
    // The original string
-    const string& tstr = fp[0]->data()->getStrVal(row, isNull);
+    const string& src = fp[0]->data()->getStrVal(row, isNull);
    if (isNull)
        return "";
    if (src.empty() || src.length() == 0)
        return src;
    // binLen represents the number of bytes in src
    size_t binLen = src.length();
    const char* pos = src.c_str();
    const char* end = pos + binLen;
    // strLen = the number of characters in src
    size_t strLen = cs->numchars(pos, end);
    // The trim characters.
    const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " ");
    // binTLen represents the number of bytes in trim
    size_t binTLen = trim.length();
    const char* posT = trim.c_str();
    // strTLen = the number of characters in trim
    size_t strTLen = cs->numchars(posT, posT+binTLen);
    if (strTLen == 0 || strTLen > strLen)
        return src;
-    if (isNull)
+    if (binTLen == 1)
        return "";
    if (tstr.empty() || tstr.length() == 0)
        return tstr;
    // Rather than calling the wideconvert functions with a null buffer to
    // determine the size of buffer to allocate, we can be sure the wide
    // char string won't be longer than:
    strwclen = tstr.length(); // a guess to start with. This will be >= to the real count.
    int bufsize = strwclen + 1;
    // Convert the string to wide characters. Do all further work in wide characters
    wchar_t* wcbuf = new wchar_t[bufsize];
    strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1);
    // Bad char in mbc can return -1
    if (strwclen == static_cast<size_t>(-1))
        strwclen = 0;
    // Convert the trim string to wide
    trimwclen = trim.length();  // A guess to start.
    int trimbufsize = trimwclen + 1;
    wchar_t* wctrim = new wchar_t[trimbufsize];
    size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1);
    // Bad char in mbc can return -1
    if (trimlen == static_cast<size_t>(-1))
        trimlen = 0;
    size_t trimCmpLen = trimlen * sizeof(wchar_t);
    const wchar_t* oPtr = wcbuf;      // To remember the start of the string
    const wchar_t* aPtr = oPtr;
    const wchar_t* aEnd = wcbuf + strwclen - 1;
    size_t trimCnt = 0;
    if (trimlen > 0)
    {
-        if (trimlen == 1)
+        // If the trim string is 1 byte, don't waste cpu for memcmp
        // Trim leading
        while (pos < end && *pos == *posT)
        {
-            // If trim is a single char, then don't spend the overhead for memcmp.
+            ++pos;
-            wchar_t chr = wctrim[0];
+            --binLen;
            // remove leading
            while (aPtr != aEnd && *aPtr == chr)
            {
                aPtr++;
                ++trimCnt;
            }
            // remove trailing
            while (aEnd != aPtr && *aEnd == chr)
            {
                aEnd--;
                ++trimCnt;
            }
        }
-        else
+        // Trim trailing
+        // `end` points one past the last byte of the data (pos + binLen), so
+        // the last real byte is end[-1]. Testing *end would look at the
+        // terminator and never trim anything.
        while (end > pos && end[-1] == *posT)
        {
-            aEnd -= (trimlen - 1);	// So we don't compare past the end of the string.
+            --end;
-
+            --binLen;
-            // remove leading
+        }
-            while (aPtr <= aEnd && !memcmp(aPtr, wctrim, trimCmpLen))
+    }
    else if (!cs->use_mb())
    {
        // This is a one byte per char charset with multiple char trim.
        // Trim leading
        while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0)
        {
            pos += binTLen;
            binLen -= binTLen;
        }
        // Trim trailing
        while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0)
        {
            end -= binTLen;
            binLen -= binTLen;
        }
    }    
    else
    {
        // We're using a multi-byte charset
        // Trim leading is easy
        while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0)
        {
            pos += binTLen;
            binLen -= binTLen;
        }
        // Trim trailing
        // The problem is that the byte pattern at the end could
        // match memcmp, but not be correct since the first byte compared
        // may actually be a second or later byte from a previous char.
        // We start at the beginning of the string and move forward
        // one character at a time until we reach the end. Then we can
        // safely compare.
        while (end - binTLen >= pos)
        {
            const char* p = pos;
            uint32 l;
            while (p + binTLen < end)
            {
-                aPtr += trimlen;
+                if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte
-                trimCnt += trimlen;
+                    p += l;
                else
                    ++p;
            }
-
+            if (p + binTLen == end && memcmp(p,posT,binTLen) == 0)
            // remove trailing
            while (aPtr <= aEnd && !memcmp(aEnd, wctrim, trimCmpLen))
            {
-                aEnd -= trimlen;	//BUG 5241
+                end -= binTLen;
-                trimCnt += trimlen;
+                binLen -= binTLen;
            }
            else
            {
                break;  // We've run out of places to look
            }
        }
    }
    // Bug 5110 - error in allocating enough memory for utf8 chars
    size_t aLen = strwclen - trimCnt;
    wstring trimmed = wstring(aPtr, aLen);
    // Turn back to a string
-    std::string ret(utf8::wstring_to_utf8(trimmed.c_str()));
+    std::string ret(pos, binLen);
    delete [] wctrim;
    delete [] wcbuf;
    return ret;
 }