MCOL-3536 Collation phase 2

2025-07-30 19:23:07 +03:00 · 2020-06-15 11:08:59 -05:00
parent 165ae4a6f3
commit d0818f2b4e
6 changed files with 40 additions and 25 deletions
--- a/utils/funcexp/func_replace.cpp
+++ b/utils/funcexp/func_replace.cpp
@ -75,12 +75,7 @@ std::string Func_replace::getStrVal(rowgroup::Row& row,
    size_t pos = 0;
    if (binaryCmp)
    {
-        uint32_t i = 0;
-        pos = str.find(fromstr);
-        if (pos == string::npos)
-            return str;
-        
-        // Count the number of fromstr in strend
+        // Count the number of fromstr in strend so we can reserve buffer space.
        int count = 0;
        do
        {
@ -91,7 +86,11 @@ std::string Func_replace::getStrVal(rowgroup::Row& row,
        
        newstr.reserve(strLen + (count * ((int)toLen - (int)fromLen)) + 1);
        
-        // Now move the stuff into newstr
+        uint32_t i = 0;
+        pos = str.find(fromstr);
+        if (pos == string::npos)
+            return str;
+        // Move the stuff into newstr
        do
        {
            if (pos > i)
@ -114,12 +113,14 @@ std::string Func_replace::getStrVal(rowgroup::Row& row,
        const char* from = fromstr.c_str();
        const char* fromEnd = from + fromLen;
        const char* to = tostr.c_str();
-        char* ptr = const_cast<char*>(src);
+        const char* ptr = src;
        char *i,*j;
        size_t count = 10; // Some arbitray number to reserve some space to start.
-        size_t growlen = count * ((int)toLen - (int)fromLen);
+        int growlen = (int)toLen - (int)fromLen;
+        growlen = growlen < 1 ? 1 : growlen;
+        growlen *= count;
        newstr.reserve(strLen + (count * growlen) + 1); 
-        size_t maxsize = newstr.max_size();
+        size_t maxsize = newstr.capacity();
        uint32_t l;

        // We don't know where byte patterns might match so
@ -132,7 +133,7 @@ std::string Func_replace::getStrVal(rowgroup::Row& row,
            if (*ptr == *from)  // If the first byte matches, maybe we have a match
            {
                // Do a byte by byte compare of src at that spot against from
-                i = ptr + 1; 
+                i = const_cast<char*>(ptr) + 1; 
                j = const_cast<char*>(from) + 1;
                found = true;
                while (j != fromEnd)
@ -148,18 +149,19 @@ std::string Func_replace::getStrVal(rowgroup::Row& row,
            {
                if (ptr < i)
                {
-                    int mvsize = i - ptr;
-                    if (newstr.length() + mvsize + toLen < maxsize)
+                    int mvsize = ptr - src;
+                    if (newstr.length() + mvsize + toLen > maxsize)
                    {
                        // We need a re-alloc
                        newstr.reserve(maxsize + growlen);
+                        maxsize = newstr.capacity();
                        growlen *= 2;
                    }
-                    newstr.append(ptr, mvsize);
-                    ptr += mvsize;
+                    newstr.append(src, ptr - src);
+                    src += mvsize + fromLen;
+                    ptr = src;
                }
                newstr.append(to, toLen);
-                ptr += toLen;
            }
            else
            {
@ -170,8 +172,9 @@ std::string Func_replace::getStrVal(rowgroup::Row& row,
                    ++ptr;
            }
        }
+        // Copy in the trailing src chars.
+        newstr.append(src, ptr - src);
    }
-
    return newstr;
 }

--- a/utils/funcexp/func_rtrim.cpp
+++ b/utils/funcexp/func_rtrim.cpp
@ -76,10 +76,12 @@ std::string Func_rtrim::getStrVal(rowgroup::Row& row,
    if (strTLen == 0 || strTLen > strLen)
        return src;

+    bool binaryCmp = (cs->state & MY_CS_BINSORT) || !cs->use_mb();
+
    if (binTLen == 1)
    {
        const char* ptr = pos;
-        if (cs->use_mb())   // This is a multi-byte charset
+        if (!binaryCmp)   // This is a multi-byte charset
        {
            const char* p = pos;
            uint32 l;
@ -110,7 +112,7 @@ std::string Func_rtrim::getStrVal(rowgroup::Row& row,
    else
    {
        // An uncommon case where the space character is > 1 byte
-        if (cs->use_mb())   // This is a multi-byte charset
+        if (binaryCmp)   // This is a multi-byte charset
        {
            // The problem is that the byte pattern at the end could
            // match memcmp, but not be correct since the first byte compared
--- a/utils/funcexp/func_substr.cpp
+++ b/utils/funcexp/func_substr.cpp
@ -75,7 +75,7 @@ std::string Func_substr::getStrVal(rowgroup::Row& row,
    int64_t length;
    if (fp.size() == 3)
    {
-        int64_t length = fp[2]->data()->getIntVal(row, isNull);
+        length = fp[2]->data()->getIntVal(row, isNull);
        if (isNull)
            return "";
        if (length < 1)
--- a/utils/funcexp/func_substring_index.cpp
+++ b/utils/funcexp/func_substring_index.cpp
@ -66,7 +66,7 @@ std::string Func_substring_index::getStrVal(rowgroup::Row& row,
    if (isNull)
        return "";

-    if (strLen == 0 || delimLen == 0 || !count == 0)
+    if (strLen == 0 || delimLen == 0 || count == 0)
        return "";

    if (count > strLen)
@ -75,9 +75,10 @@ std::string Func_substring_index::getStrVal(rowgroup::Row& row,
    if ((count < 0) && ((count * -1) > strLen))
        return str;

+    bool binaryCmp = (cs->state & MY_CS_BINSORT) || !cs->use_mb();
    std::string value; // Only used if !use_mb()
     
-    if (cs->use_mb()) // Charset supports multibyte characters
+    if (!binaryCmp) // Charset supports multibyte characters
    {
        const char* src = str.c_str();
        const char* srcEnd = src + strLen;
--- a/utils/funcexp/func_trim.cpp
+++ b/utils/funcexp/func_trim.cpp
@ -75,6 +75,8 @@ std::string Func_trim::getStrVal(rowgroup::Row& row,
    if (strTLen == 0 || strTLen > strLen)
        return src;

+    bool binaryCmp = (cs->state & MY_CS_BINSORT) || !cs->use_mb();
+
    if (binTLen == 1)
    {
        // If the trim string is 1 byte, don't waste cpu for memcmp
@ -86,7 +88,7 @@ std::string Func_trim::getStrVal(rowgroup::Row& row,
        }
        // Trim trailing
        const char* ptr = pos;
-        if (cs->use_mb())   // This is a multi-byte charset
+        if (!binaryCmp)   // This is a multi-byte charset
        {
            const char* p = pos;
            uint32 l;
@ -124,7 +126,7 @@ std::string Func_trim::getStrVal(rowgroup::Row& row,
        }
        
        // Trim trailing
-        if (cs->use_mb())   // This is a multi-byte charset
+        if (!binaryCmp)   // This is a multi-byte charset
        {
            // The problem is that the byte pattern at the end could
            // match memcmp, but not be correct since the first byte compared
--- a/utils/windowfunction/idborderby.cpp
+++ b/utils/windowfunction/idborderby.cpp
@ -304,7 +304,14 @@ int StringCompare::operator()(IdbCompare* l, Row::Pointer r1, Row::Pointer r2)
        if (!cs)
            cs = l->rowGroup()->getCharset(fSpec.fIndex);
        
-        ret = fSpec.fAsc * cs->strnncollsp(s1, len1, s2, len2);
+        if (cs->state & MY_CS_BINSORT)
+        {
+            ret = fSpec.fAsc * strncmp(s1, s2, max(len1,len2));
+        }
+        else
+        {
+            ret = fSpec.fAsc * cs->strnncoll(s1, len1, s2, len2);
+        }
    }

    return ret;