MCOL-3536 Collation

2025-07-29 08:21:15 +03:00 · 2020-06-03 19:43:53 -05:00
parent 39a93ef753
commit 889094a23d
16 changed files with 211 additions and 151 deletions
--- a/utils/funcexp/func_trim.cpp
+++ b/utils/funcexp/func_trim.cpp
@ -20,6 +20,9 @@
 *
 *
 ****************************************************************************/
+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>

 #include <string>
 using namespace std;
@ -47,106 +50,106 @@ CalpontSystemCatalog::ColType Func_trim::operationType(FunctionParm& fp, Calpont
 std::string Func_trim::getStrVal(rowgroup::Row& row,
                                 FunctionParm& fp,
                                 bool& isNull,
-                                 execplan::CalpontSystemCatalog::ColType&)
+                                 execplan::CalpontSystemCatalog::ColType& type)
 {
-    // The number of characters (not bytes) in our input tstr.
-    // Not all of these are necessarily significant. We need to search for the
-    // NULL terminator to be sure.
-    size_t strwclen;
-    // this holds the number of characters (not bytes) in ourtrim tstr.
-    size_t trimwclen;
-
+    CHARSET_INFO* cs = type.getCharset();
    // The original string
-    const string& tstr = fp[0]->data()->getStrVal(row, isNull);
+    const string& src = fp[0]->data()->getStrVal(row, isNull);
+    if (isNull)
+        return "";
+    if (src.empty() || src.length() == 0)
+        return src;
+    // binLen represents the number of bytes in src
+    size_t binLen = src.length();
+    const char* pos = src.c_str();
+    const char* end = pos + binLen;
+    // strLen = the number of characters in src
+    size_t strLen = cs->numchars(pos, end);

    // The trim characters.
    const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " ");
+    // binTLen represents the number of bytes in trim
+    size_t binTLen = trim.length();
+    const char* posT = trim.c_str();
+    // strTLen = the number of characters in trim
+    size_t strTLen = cs->numchars(posT, posT+binTLen);
+    if (strTLen == 0 || strTLen > strLen)
+        return src;

-    if (isNull)
-        return "";
-
-    if (tstr.empty() || tstr.length() == 0)
-        return tstr;
-
-    // Rather than calling the wideconvert functions with a null buffer to
-    // determine the size of buffer to allocate, we can be sure the wide
-    // char string won't be longer than:
-    strwclen = tstr.length(); // a guess to start with. This will be >= to the real count.
-    int bufsize = strwclen + 1;
-
-    // Convert the string to wide characters. Do all further work in wide characters
-    wchar_t* wcbuf = new wchar_t[bufsize];
-    strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1);
-
-    // Bad char in mbc can return -1
-    if (strwclen == static_cast<size_t>(-1))
-        strwclen = 0;
-
-    // Convert the trim string to wide
-    trimwclen = trim.length();  // A guess to start.
-    int trimbufsize = trimwclen + 1;
-    wchar_t* wctrim = new wchar_t[trimbufsize];
-    size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1);
-
-    // Bad char in mbc can return -1
-    if (trimlen == static_cast<size_t>(-1))
-        trimlen = 0;
-
-    size_t trimCmpLen = trimlen * sizeof(wchar_t);
-
-    const wchar_t* oPtr = wcbuf;      // To remember the start of the string
-    const wchar_t* aPtr = oPtr;
-    const wchar_t* aEnd = wcbuf + strwclen - 1;
-    size_t trimCnt = 0;
-
-    if (trimlen > 0)
+    if (binTLen == 1)
    {
-        if (trimlen == 1)
+        // If the trim string is 1 byte, don't waste cpu for memcmp
+        // Trim leading
+        while (pos < end && *pos == *posT)
        {
-            // If trim is a single char, then don't spend the overhead for memcmp.
-            wchar_t chr = wctrim[0];
-
-            // remove leading
-            while (aPtr != aEnd && *aPtr == chr)
-            {
-                aPtr++;
-                ++trimCnt;
-            }
-
-            // remove trailing
-            while (aEnd != aPtr && *aEnd == chr)
-            {
-                aEnd--;
-                ++trimCnt;
-            }
+            ++pos;
+            --binLen;
        }
-        else
+        // Trim trailing
+        while (end > pos && *end == *posT)
        {
-            aEnd -= (trimlen - 1);	// So we don't compare past the end of the string.
-
-            // remove leading
-            while (aPtr <= aEnd && !memcmp(aPtr, wctrim, trimCmpLen))
+            --end;
+            --binLen;
+        }
+    }
+    else if (!cs->use_mb())
+    {
+        // This is a one byte per char charset with multiple char trim.
+        // Trim leading
+        while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0)
+        {
+            pos += binTLen;
+            binLen -= binTLen;
+        }
+        // Trim trailing
+        while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0)
+        {
+            end -= binTLen;
+            binLen -= binTLen;
+        }
+    }    
+    else
+    {
+        // We're using a multi-byte charset
+        // Trim leading is easy
+        while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0)
+        {
+            pos += binTLen;
+            binLen -= binTLen;
+        }
+        
+        // Trim trailing
+        // The problem is that the byte pattern at the end could
+        // match memcmp, but not be correct since the first byte compared
+        // may actually be a second or later byte from a previous char.
+        
+        // We start at the beginning of the string and move forward
+        // one character at a time until we reach the end. Then we can
+        // safely compare.
+        while (end - binTLen >= pos)
+        {
+            const char* p = pos;
+            uint32 l;
+            while (p + binTLen < end)
            {
-                aPtr += trimlen;
-                trimCnt += trimlen;
+                if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte
+                    p += l;
+                else
+                    ++p;
            }
-
-            // remove trailing
-            while (aPtr <= aEnd && !memcmp(aEnd, wctrim, trimCmpLen))
+            if (p + binTLen == end && memcmp(p,posT,binTLen) == 0)
            {
-                aEnd -= trimlen;	//BUG 5241
-                trimCnt += trimlen;
+                end -= binTLen;
+                binLen -= binTLen;
+            }
+            else
+            {
+                break;  // We've run out of places to look
            }
        }
    }
-
-    // Bug 5110 - error in allocating enough memory for utf8 chars
-    size_t aLen = strwclen - trimCnt;
-    wstring trimmed = wstring(aPtr, aLen);
    // Turn back to a string
-    std::string ret(utf8::wstring_to_utf8(trimmed.c_str()));
-    delete [] wctrim;
-    delete [] wcbuf;
+    std::string ret(pos, binLen);
    return ret;
 }