MCOL-3536 collation

2025-08-07 03:22:57 +03:00 · 2020-05-26 12:42:11 -05:00
parent 11ba12f6ea
commit 06e50e0926
47 changed files with 516 additions and 535 deletions
--- a/utils/funcexp/func_case.cpp
+++ b/utils/funcexp/func_case.cpp
@@ -22,6 +22,10 @@
 *
 ****************************************************************************/

+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+
 #include <string>
 //#define NDEBUG
 #include <cassert>
@@ -180,20 +184,21 @@ inline uint64_t simple_case_cmp(Row& row,
        case execplan::CalpontSystemCatalog::VARCHAR:
        {
            const string& ev = parm[n]->data()->getStrVal(row, isNull);
-
            if (isNull)
                break;
+            CHARSET_INFO* cs = parm[n]->data()->resultType().getCharset();

            for (i = 1; i <= whereCount; i++)
            {
                //BUG 5362
-                if (utf8::idb_strcoll(ev.c_str(), parm[i]->data()->getStrVal(row, isNull).c_str()) == 0 && !isNull)
+                const string& p1 = parm[i]->data()->getStrVal(row, isNull);
+                if (isNull)
+                    break;
+                if (cs->strnncoll(ev.c_str(), ev.length(), p1.c_str(), p1.length()) == 0)
                {
                    foundIt = true;
                    break;
                }
-                else
-                    isNull = false;
            }

            break;
--- a/utils/funcexp/func_char_length.cpp
+++ b/utils/funcexp/func_char_length.cpp
@@ -49,7 +49,7 @@ namespace funcexp

 CalpontSystemCatalog::ColType Func_char_length::operationType( FunctionParm& fp, CalpontSystemCatalog::ColType& resultType )
 {
-    return resultType;
+    return fp[0]->data()->resultType();  // operate on the first argument's own type rather than the function's result type
 }

 int64_t Func_char_length::getIntVal(rowgroup::Row& row,
@@ -86,8 +86,7 @@ int64_t Func_char_length::getIntVal(rowgroup::Row& row,
                return 0;
            const char* b = tstr.c_str();
            const char* e = tstr.c_str() + tstr.length();
-            const CHARSET_INFO* cs = get_charset(parm[0]->data()->resultType().charsetNumber, MYF(MY_WME));
-            return (int64_t)cs->numchars(b, e);
+            return (int64_t)parm[0]->data()->resultType().getCharset()->numchars(b, e);
        }

        case execplan::CalpontSystemCatalog::DATE:
--- a/utils/funcexp/func_greatest.cpp
+++ b/utils/funcexp/func_greatest.cpp
@@ -22,6 +22,10 @@
 *
 ****************************************************************************/

+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+
 #include <cstdlib>
 #include <string>
 #include <sstream>
@@ -148,6 +152,7 @@ std::string Func_greatest::getStrVal(rowgroup::Row& row,
                                     execplan::CalpontSystemCatalog::ColType& op_ct)
 {
    const string& str = fp[0]->data()->getStrVal(row, isNull);
+    CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset();

    string greatestStr = str;

@@ -155,12 +160,10 @@ std::string Func_greatest::getStrVal(rowgroup::Row& row,
    {
        const string& str1 = fp[i]->data()->getStrVal(row, isNull);

-        int tmp = utf8::idb_strcoll(greatestStr.c_str(), str1.c_str());
-
-        if ( tmp < 0 )
-
-//		if ( greatestStr < str1 )
+        if (cs->strnncoll(greatestStr.c_str(), greatestStr.length(), str1.c_str(), str1.length()) < 0)
+        {
            greatestStr = str1;
+        }
    }

    return greatestStr;
--- a/utils/funcexp/func_in.cpp
+++ b/utils/funcexp/func_in.cpp
@@ -22,6 +22,10 @@
 *
 ****************************************************************************/

+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+
 #include <cstdlib>
 #include <string>
 using namespace std;
@@ -52,11 +56,6 @@ inline bool numericEQ(result_t op1, result_t op2)
    return op1 == op2;
 }

-inline bool strEQ(string op1, string op2)
-{
-    return utf8::idb_strcoll(op1.c_str(), op2.c_str()) == 0;
-}
-
 inline bool getBoolForIn(rowgroup::Row& row,
                         funcexp::FunctionParm& pm,
                         bool& isNull,
@@ -273,15 +272,16 @@ inline bool getBoolForIn(rowgroup::Row& row,
        case execplan::CalpontSystemCatalog::TEXT:
        {
            const string& val = pm[0]->data()->getStrVal(row, isNull);
-
            if (isNull)
                return false;

+            CHARSET_INFO* cs = pm[0]->data()->resultType().getCharset();
+
            for (uint32_t i = 1; i < pm.size(); i++)
            {
                isNull = false;
-
-                if ( utf8::idb_strcoll(val.c_str(), pm[i]->data()->getStrVal(row, isNull).c_str()) == 0 && !isNull)
+                const string& str1 = pm[i]->data()->getStrVal(row, isNull);
+                if (cs->strnncoll(val.c_str(), val.length(), str1.c_str(), str1.length()) == 0 && !isNull)
                    return true;

                if (isNull && isNotIn)
--- a/utils/funcexp/func_instr.cpp
+++ b/utils/funcexp/func_instr.cpp
@@ -20,6 +20,10 @@
 *
 *
 ****************************************************************************/
+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+#include <m_ctype.h>

 #include <cstdlib>
 #include <string>
@@ -42,37 +46,50 @@ CalpontSystemCatalog::ColType Func_instr::operationType( FunctionParm& fp, Calpo
    return ct;
 }

-size_t Func_instr::in_str(const string& str, const string& substr, size_t start)
-{
-    // convert both inputs to wide character strings
-    std::wstring wcstr = utf8::utf8_to_wstring(str);
-    std::wstring wcsubstr = utf8::utf8_to_wstring(substr);
-
-    if ((str.length() && !wcstr.length()) ||
-            (substr.length() && !wcsubstr.length()))
-        // this means one or both of the strings had conversion errors to wide character
-        return 0;
-
-    size_t pos = wcstr.find(wcsubstr, start - 1);
-    return (pos != string::npos ? pos + 1 : 0);
-}
-
 int64_t Func_instr::getIntVal(rowgroup::Row& row,
                               FunctionParm& parm,
                               bool& isNull,
-                              CalpontSystemCatalog::ColType&)
+                              CalpontSystemCatalog::ColType& colType)  // colType: supplies the charset used for the search
 {
-    uint64_t start = 1;
-
-    if (parm.size() == 3)
-        start = parm[2]->data()->getIntVal(row, isNull);
-
-    if (isNull || start == 0)
+    int64_t start = 0;  // offset into str at which the search begins
+    int64_t start0= 0;  // zero-based position requested by the caller (third argument minus 1)
+    my_match_t match;  // filled in by cs->instr() below
+    
+    const std::string& str = parm[0]->data()->getStrVal(row, isNull);
+    if (isNull)
+        return 0;
+    const char* s1 = str.c_str();
+    uint32_t l1 = (uint32_t)str.length();  // length of str in bytes
+    
+    const std::string& substr =parm[1]->data()->getStrVal(row, isNull);
+    if (isNull)
         return 0;
 
-    //Bug 5110 : to support utf8 char type, we have to convert and search
-    return in_str(parm[0]->data()->getStrVal(row, isNull), parm[1]->data()->getStrVal(row, isNull), start);
+    const char* s2 = substr.c_str();
+    uint32_t l2 = (uint32_t)substr.length();  // length of substr in bytes
+    if (l2 < 1)  // empty search string
+        return start + 1;  // start is still 0 here, so this is always 1. NOTE(review): the optional third argument is not consulted for an empty substring - confirm intended
 
+    CHARSET_INFO* cs = colType.getCharset();
+    
+    if (parm.size() == 3)
+    {
+        start0 = start = parm[2]->data()->getIntVal(row, isNull) - 1;  // 1-based argument -> 0-based. NOTE(review): isNull is not re-checked here, the removed code returned 0 when it was set - confirm
+        
+        if ((start < 0) || (start > l1))  // rejects positions below 1; NOTE(review): the upper bound compares a character position with a byte length
+          return 0;
+        
+        start = (int64_t)cs->charpos(s1, s1+l1, start); // adjust start for multi-byte (presumably character offset -> byte offset; confirm against charpos)
+
+        if (start + l2 > l1) // Substring is longer than str at pos.
+            return 0;
+    }
+    
+    if (!cs->instr(s1+start, l1-start,  // search only the part of str from offset start onward
+                   s2, l2,
+                   &match, 1))
+        return 0;
+    return (int64_t)match.mb_len + start0 + 1;  // presumably mb_len counts the characters preceding the match in the searched range; add the skipped characters and make it 1-based
 }


--- a/utils/funcexp/func_lcase.cpp
+++ b/utils/funcexp/func_lcase.cpp
@@ -20,6 +20,10 @@
 *
 *
 ****************************************************************************/
+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+#include <m_ctype.h>

 #include <string>
 using namespace std;
@@ -56,31 +60,22 @@ CalpontSystemCatalog::ColType Func_lcase::operationType(FunctionParm& fp, Calpon
 std::string Func_lcase::getStrVal(rowgroup::Row& row,
                                   FunctionParm& fp,
                                   bool& isNull,
-                                  execplan::CalpontSystemCatalog::ColType&)
+                                  execplan::CalpontSystemCatalog::ColType& colType)  // colType: supplies the charset whose lower-casing rules are applied
 {
-//	string str = fp[0]->data()->getStrVal(row, isNull);
-
-//	transform (str.begin(), str.end(), str.begin(), to_lower());
-
     const string& tstr = fp[0]->data()->getStrVal(row, isNull);
 
     if (isNull)
         return "";
 
-    size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1;
-    wchar_t* wcbuf = new wchar_t[strwclen];
-    strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen);
-    wstring wstr(wcbuf, strwclen);
+    CHARSET_INFO* cs = colType.getCharset();  // NOTE(review): read from the colType argument, not from fp[0]->data()->resultType() - confirm both carry the same charset
+    uint64_t inLen = tstr.length();  // input length in bytes
+    uint64_t bufLen= inLen * cs->casedn_multiply;  // output buffer sized with the charset's casedn_multiply factor
+    char* outBuf = new char[bufLen];  // released by delete [] below; NOTE(review): would leak if constructing ret throws
+    
+    uint64_t outLen = cs->casedn(tstr.c_str(), inLen, outBuf, bufLen);  // lower-case into outBuf; the return value is used as the output length
 
-    for (uint32_t i = 0; i < strwclen; i++)
-        wstr[i] = std::towlower(wstr[i]);
-
-    size_t strmblen = utf8::idb_wcstombs(0, wstr.c_str(), 0) + 1;
-    char* outbuf = new char[strmblen];
-    strmblen = utf8::idb_wcstombs(outbuf, wstr.c_str(), strmblen);
-    std::string ret(outbuf, strmblen);
-    delete [] outbuf;
-    delete [] wcbuf;
+    string ret = string(outBuf, outLen);  // copy exactly outLen bytes out of the scratch buffer
+    delete [] outBuf;
     return ret;
 }

--- a/utils/funcexp/func_least.cpp
+++ b/utils/funcexp/func_least.cpp
@@ -22,6 +22,10 @@
 *
 ****************************************************************************/

+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+
 #include <cstdlib>
 #include <string>
 #include <sstream>
@@ -127,17 +131,16 @@ std::string Func_least::getStrVal(rowgroup::Row& row,
                                  execplan::CalpontSystemCatalog::ColType& op_ct)
 {
    string leastStr = fp[0]->data()->getStrVal(row, isNull);
+    CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset();

    for (uint32_t i = 1; i < fp.size(); i++)
    {
        const string& str1 = fp[i]->data()->getStrVal(row, isNull);

-        int tmp = utf8::idb_strcoll(leastStr.c_str(), str1.c_str());
-
-        if ( tmp > 0 )
-
-//		if ( leastStr > str1 )
+        if (cs->strnncoll(leastStr.c_str(), leastStr.length(), str1.c_str(), str1.length()) > 0)
+        {
            leastStr = str1;
+        }
    }

    return leastStr;
--- a/utils/funcexp/func_nullif.cpp
+++ b/utils/funcexp/func_nullif.cpp
@@ -22,6 +22,11 @@
 *
 ****************************************************************************/

+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#undef LONGLONG_MIN
+#include <my_sys.h>
+
 #include <cstdlib>
 #include <string>
 #include <sstream>
@@ -363,6 +368,7 @@ string Func_nullif::getStrVal(rowgroup::Row& row,
                              CalpontSystemCatalog::ColType& op_ct)
 {
    string exp1 = parm[0]->data()->getStrVal(row, isNull);
+    CHARSET_INFO* cs = parm[0]->data()->resultType().getCharset();

    if (isNull)
    {
@@ -395,7 +401,7 @@ string Func_nullif::getStrVal(rowgroup::Row& row,
        exp2 = exp2 + " 00:00:00";
    }

-    if ( utf8::idb_strcoll(exp1.c_str(), exp2.c_str()) == 0 )
+    if (cs->strnncoll(exp1.c_str(), exp1.length(), exp2.c_str(), exp2.length()) == 0)
    {
        isNull = true;
        return "";
--- a/utils/funcexp/func_strcmp.cpp
+++ b/utils/funcexp/func_strcmp.cpp
@@ -21,6 +21,10 @@
 *
 ****************************************************************************/

+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+
 #include <cstdlib>
 #include <string>
 #include <sstream>
@@ -39,6 +43,10 @@ using namespace joblist;
 #include "utils_utf8.h"
 using namespace funcexp;

+// Because including my_sys.h in a Columnstore header causes too many conflicts
+struct charset_info_st;
+typedef const struct charset_info_st CHARSET_INFO;
+
 class to_lower
 {
 public:
@@ -64,10 +72,11 @@ int64_t Func_strcmp::getIntVal(rowgroup::Row& row,
                               bool& isNull,
                               execplan::CalpontSystemCatalog::ColType& op_ct)
 {
+    CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset();
    const string& str = fp[0]->data()->getStrVal(row, isNull);
-
    const string& str1 = fp[1]->data()->getStrVal(row, isNull);
-    int ret = utf8::idb_strcoll(str.c_str(), str1.c_str());
+
+    int ret = cs->strnncoll(str.c_str(), str.length(), str1.c_str(), str1.length());
    // mysql's strcmp returns only -1, 0, and 1
    return (ret < 0 ? -1 : (ret > 0 ? 1 : 0));
 }
--- a/utils/funcexp/func_ucase.cpp
+++ b/utils/funcexp/func_ucase.cpp
@@ -20,6 +20,10 @@
 *
 *
 ****************************************************************************/
+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+#include <m_ctype.h>

 #include <string>
 using namespace std;
@@ -55,31 +59,22 @@ CalpontSystemCatalog::ColType Func_ucase::operationType(FunctionParm& fp, Calpon
 std::string Func_ucase::getStrVal(rowgroup::Row& row,
                                   FunctionParm& fp,
                                   bool& isNull,
-                                  execplan::CalpontSystemCatalog::ColType&)
+                                  execplan::CalpontSystemCatalog::ColType& colType)  // colType: supplies the charset whose upper-casing rules are applied
 {
-//	string str = fp[0]->data()->getStrVal(row, isNull);
-
-//	transform (str.begin(), str.end(), str.begin(), to_lower());
-
     const string& tstr = fp[0]->data()->getStrVal(row, isNull);
 
     if (isNull)
         return "";
 
-    size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1;
-    wchar_t* wcbuf = new wchar_t[strwclen];
-    strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen);
-    wstring wstr(wcbuf, strwclen);
+    CHARSET_INFO* cs = colType.getCharset();  // NOTE(review): read from the colType argument, not from fp[0]->data()->resultType() - confirm both carry the same charset
+    uint64_t inLen = tstr.length();  // input length in bytes
+    uint64_t bufLen= inLen * cs->caseup_multiply;  // output buffer sized with the charset's caseup_multiply factor
+    char* outBuf = new char[bufLen];  // released by delete [] below; NOTE(review): would leak if constructing ret throws
+    
+    uint64_t outLen = cs->caseup(tstr.c_str(), inLen, outBuf, bufLen);  // upper-case into outBuf; the return value is used as the output length
 
-    for (uint32_t i = 0; i < strwclen; i++)
-        wstr[i] = std::towupper(wstr[i]);
-
-    size_t strmblen = utf8::idb_wcstombs(0, wstr.c_str(), 0) + 1;
-    char* outbuf = new char[strmblen];
-    strmblen = utf8::idb_wcstombs(outbuf, wstr.c_str(), strmblen);
-    std::string ret(outbuf, strmblen);
-    delete [] outbuf;
-    delete [] wcbuf;
+    string ret = string(outBuf, outLen);  // copy exactly outLen bytes out of the scratch buffer
+    delete [] outBuf;
     return ret;
 }

--- a/utils/funcexp/functor_int.h
+++ b/utils/funcexp/functor_int.h
@@ -84,8 +84,6 @@ public:

    execplan::CalpontSystemCatalog::ColType operationType(FunctionParm& fp, execplan::CalpontSystemCatalog::ColType& resultType);

-    size_t in_str(const std::string& str, const std::string& substr, size_t start);
-
    int64_t getIntVal(rowgroup::Row& row,
                      FunctionParm& fp,
                      bool& isNull,
--- a/utils/funcexp/utils_utf8.h
+++ b/utils/funcexp/utils_utf8.h
@@ -1,303 +0,0 @@
-/* Copyright (C) 2014 InfiniDB, Inc.
- * Copyright (C) 2016 MariaDB Corporation.
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; version 2 of
-   the License.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
-   MA 02110-1301, USA. */
-
-//  $Id$
-
-
-#ifndef _UTILS_UTF8_H_
-#define _UTILS_UTF8_H_
-
-
-
-#include <string>
-#if defined(_MSC_VER)
-#include <malloc.h>
-#include <windows.h>
-#elif defined(__FreeBSD__)
-//#include <cstdlib>
-#else
-#include <alloca.h>
-#endif
-#include <cstdlib>
-
-#include <clocale>
-#include "liboamcpp.h"
-
-/** @file */
-
-namespace funcexp
-{
-namespace utf8
-{
-extern bool JPcodePoint;		// code point ordering (Japanese UTF) flag, used in idb_strcoll
-
-const int MAX_UTF8_BYTES_PER_CHAR = 4;
-
-// A global loc object so we don't construct one at every compare
-extern std::locale loc;
-// Is there a way to construct a global reference to a facet?
-// const std::collate<char>& coll = std::use_facet<std::collate<char> >(loc);
-
-//Infinidb version of strlocale  BUG 5362
-//set System Locale "C" by default
-//return the system Locale currently set in from Columnstore.xml
-inline
-std::string idb_setlocale()
-{
-    // get and set locale language
-    std::string systemLang("C");
-    oam::Oam oam;
-    static bool loggedMsg = false;
-
-    try
-    {
-        oam.getSystemConfig("SystemLang", systemLang);
-    }
-    catch (...)
-    {
-        systemLang = "C";
-    }
-
-    char* pLoc = setlocale(LC_ALL, systemLang.c_str());
-
-    if (pLoc == NULL)
-    {
-        try
-        {
-            if (!loggedMsg)
-            {
-                //send alarm
-                alarmmanager::ALARMManager alarmMgr;
-                std::string alarmItem = "system";
-                alarmMgr.sendAlarmReport(alarmItem.c_str(), oam::INVALID_LOCALE, alarmmanager::SET);
-                
-                // Log one line
-                logging::LoggingID lid(17);  // ProcessManager -- probably the only one to find this for now
-                logging::MessageLog ml(lid);
-                logging::Message msg(1);
-                logging::Message::Args args;
-                args.add("Failed to set locale ");
-                args.add(systemLang.c_str());
-                args.add(": Setting to 'C'. Critical alarm generated");
-                msg.format( args );
-                ml.logErrorMessage(msg);
-                
-                loggedMsg = true;
-            }
-            systemLang = "C";
-        }
-        catch (...)
-        {
-            // Ignoring for time being.
-        }
-    }
-    else
-    {
-        try
-        {
-            //send alarm
-            alarmmanager::ALARMManager alarmMgr;
-            std::string alarmItem = "system";
-            alarmMgr.sendAlarmReport(alarmItem.c_str(), oam::INVALID_LOCALE, alarmmanager::CLEAR);
-        }
-        catch (...)
-        {
-            // Ignoring for time being.
-        }
-
-    }
-
-    printf ("Locale is : %s\n", systemLang.c_str() );
-
-    //BUG 2991
-    setlocale(LC_NUMERIC, "C");
-
-    if (systemLang.find("ja_JP") != std::string::npos)
-        JPcodePoint = true;
-
-    // MCOL-1559 Save off the locale to save runtime cpus
-    std::locale localloc(systemLang.c_str());
-    loc = localloc;
-
-    return systemLang;
-}
-
-// Infinidb version of strcoll.  BUG 5362
-// strcoll() comparison while ja_JP.utf8 does not give correct results.
-// For correct results strcmp() can be used.
-inline
-int idb_strcoll(const char* str1, const char* str2)
-{
-    if (JPcodePoint)
-        return strcmp(str1, str2);
-    else
-        return strcoll(str1, str2);
-}
-
-// MCOL-1559 Add a trimmed version of strcoll
-// The intent here is to make no copy of the original strings and
-// not modify them, so we can't use trim to deal with the spaces.
-inline
-int idb_strtrimcoll(const std::string& str1, const std::string& str2)
-{
-    static const std::string whitespaces (" ");
-    const char* s1 = str1.c_str();
-    const char* s2 = str2.c_str();
-
-    // Set found1 to the last non-whitespace char in str1
-    std::size_t found1 = str1.find_last_not_of(whitespaces);
-    // Set found2 to the first whitespace char in str2
-    std::size_t found2 = str2.find_last_not_of(whitespaces);
-
-     // Are both strings empty or all whitespace?
-    if (found1 == std::string::npos && found2 == std::string::npos)
-    {
-        return 0; // they match
-    }
-    // If str1 is empty or all spaces
-    if (found1 == std::string::npos)
-    {
-        return -1;
-    }
-    // If str2 is empty or all spaces
-    if (found2 == std::string::npos)
-    {
-        return 1;
-    }
-
-    // found1 and found2 point to the character that is not a space. 
-    // compare wants it to point to one past.
-    found1 += 1;
-    found2 += 1;
-    // If no trimming needs doing, then strcoll is faster
-    if (found1 == str1.size() && found2 == str2.size())
-    {
-        return idb_strcoll(s1, s2);
-    }
-    // Compare the (trimmed) strings
-    const std::collate<char>& coll = std::use_facet<std::collate<char> >(loc);
-    int rtn = coll.compare(s1, s1+found1, s2, s2+found2);
-    return rtn;
-}
-
-// BUG 5241
-// Infinidb specific mbstowcs(). This will handle both windows and unix platforms
-// Params dest and max should have enough length to accomodate NULL
-inline
-size_t idb_mbstowcs(wchar_t* dest, const char* src, size_t max)
-{
-#ifdef _MSC_VER
-    // 4th param (-1) denotes to convert till hit NULL char
-    // if 6th param max = 0, will return the required buffer size
-    size_t strwclen = MultiByteToWideChar(CP_UTF8, 0, src, -1, dest, (int)max);
-    // decrement the count of NULL; will become -1 on failure
-    return --strwclen;
-
-#else
-    return mbstowcs(dest, src, max);
-#endif
-}
-
-// BUG 5241
-// Infinidb specific wcstombs(). This will handle both windows and unix platforms
-// Params dest and max should have enough length to accomodate NULL
-inline
-size_t idb_wcstombs(char* dest, const wchar_t* src, size_t max)
-{
-#ifdef _MSC_VER
-    // 4th param (-1) denotes to convert till hit NULL char
-    //if 6th param max = 0, will return the required buffer size
-    size_t strmblen = WideCharToMultiByte( CP_UTF8, 0, src, -1, dest, (int)max, NULL, NULL);
-    // decrement the count of NULL; will become -1 on failure
-    return --strmblen;
-#else
-    return wcstombs(dest, src, max);
-#endif
-}
-
-// convert UTF-8 string to wstring
-inline
-std::wstring utf8_to_wstring (const std::string& str)
-{
-    size_t bufsize = str.length() + 1;
-
-    // Convert to wide characters. Do all further work in wide characters
-    wchar_t* wcbuf = new wchar_t[bufsize];
-    // Passing +1 so that windows is happy to see extra position to place NULL
-    size_t strwclen = idb_mbstowcs(wcbuf, str.c_str(), str.length() + 1);
-
-    // if result is -1 it means bad characters which may happen if locale is wrong.
-    // return an empty string
-    if ( strwclen == static_cast<size_t>(-1) )
-        strwclen = 0;
-
-    std::wstring ret(wcbuf, strwclen);
-
-    delete [] wcbuf;
-    return ret;
-}
-
-
-// convert wstring to UTF-8 string
-inline
-std::string wstring_to_utf8 (const std::wstring& str)
-{
-    char* outbuf = new char[(str.length() * MAX_UTF8_BYTES_PER_CHAR) + 1];
-    // Passing +1 so that windows is happy to see extra position to place NULL
-    size_t strmblen = idb_wcstombs(outbuf, str.c_str(), str.length() * MAX_UTF8_BYTES_PER_CHAR + 1);
-
-    // if result is -1 it means bad characters which may happen if locale is wrong.
-    // return an empty string
-    if ( strmblen == static_cast<size_t>(-1) )
-        strmblen = 0;
-
-    std::string ret(outbuf, strmblen);
-
-    delete [] outbuf;
-    return ret;
-}
-
-inline
-uint8_t utf8_truncate_point(const char* input, size_t length)
-{
-    // Find the beginning of a multibyte char to truncate at and return the
-    // number of bytes to truncate
-    if (length < 3)
-    {
-        return 0;
-    }
-
-    const unsigned char* b = (const unsigned char*)(input) + length - 3;
-
-    if (b[2] & 0x80)
-    {
-        // First byte in a new multi-byte sequence
-        if (b[2] & 0x40) return 1;
-        // 3 byte sequence
-        else if ((b[1] & 0xe0) == 0xe0) return 2;
-        // 4 byte sequence
-        else if ((b[0] & 0xf0) == 0xf0) return 3;
-    }
-
-    return 0;
-}
-
-} //namespace utf8
-} //namespace funcexp
-
-#endif
--- a/utils/rowgroup/rowaggregation.cpp
+++ b/utils/rowgroup/rowaggregation.cpp
@@ -24,6 +24,10 @@
 * is the primary class.
 */

+#include <mariadb.h>
+#undef set_bits  // mariadb.h defines set_bits, which is incompatible with boost
+#include <my_sys.h>
+
 #include <unistd.h>
 #include <sstream>
 #include <stdexcept>
@@ -384,36 +388,16 @@ inline void RowAggregation::updateFloatMinMax(float val1, float val2, int64_t co
        fRow.setFloatField(val1, col);
 }

-
-
-#define STRCOLL_ENH__
-
 void RowAggregation::updateStringMinMax(string val1, string val2, int64_t col, int func)
 {
-    if (isNull(fRowGroupOut, fRow, col))
+    CHARSET_INFO* cs = fRowGroupIn.getCharset(col);  // NOTE(review): col also indexes fRow below, and the removed code paired fRow with fRowGroupOut - confirm fRowGroupIn holds the right charset at this index
+    int tmp = cs->strnncoll(val1.c_str(), val1.length(), val2.c_str(), val2.length());  // collation-aware compare. NOTE(review): the removed isNull(fRowGroupOut, fRow, col) guard stored val1 unconditionally when the field was NULL; that case now reaches this comparison - confirm intended
+
+    if ((tmp < 0 && func == rowgroup::ROWAGG_MIN) ||  // MIN: val1 sorts before val2
+            (tmp > 0 && func == rowgroup::ROWAGG_MAX))  // MAX: val1 sorts after val2
     {
         fRow.setStringField(val1, col);
     }
-
-#ifdef STRCOLL_ENH__
-    else
-    {
-        int tmp = utf8::idb_strcoll(val1.c_str(), val2.c_str());
-
-        if ((tmp < 0 && func == rowgroup::ROWAGG_MIN) ||
-                (tmp > 0 && func == rowgroup::ROWAGG_MAX))
-        {
-            fRow.setStringField(val1, col);
-        }
-    }
-
-#else
-    else if (minMax(val1, val2, func))
-    {
-        fRow.setStringField(val1, col);
-    }
-
-#endif
 }

 //------------------------------------------------------------------------------
--- a/utils/rowgroup/rowaggregation.h
+++ b/utils/rowgroup/rowaggregation.h
@@ -53,6 +53,9 @@
 #include "mcsv1_udaf.h"
 #include "constantcolumn.h"

+// Forward-declare CHARSET_INFO here because including my_sys.h in a ColumnStore header causes too many conflicts.
+struct charset_info_st;
+typedef const struct charset_info_st CHARSET_INFO;
 // To do: move code that depends on joblist to a proper subsystem.
 namespace joblist
 {
@@ -706,7 +709,7 @@ protected:

    // We need a separate copy for each thread.
    mcsv1sdk::mcsv1Context fRGContext;
-
+    
    // These are handy for testing the actual type of static_any for UDAF
    static const static_any::any& charTypeId;
    static const static_any::any& scharTypeId;
--- a/utils/rowgroup/rowgroup.cpp
+++ b/utils/rowgroup/rowgroup.cpp
@@ -505,8 +505,8 @@ Row::Row() : data(NULL), strings(NULL), userDataStore(NULL) { }

 Row::Row(const Row& r) : columnCount(r.columnCount), baseRid(r.baseRid),
     oldOffsets(r.oldOffsets), stOffsets(r.stOffsets),
-    offsets(r.offsets), colWidths(r.colWidths), types(r.types), data(r.data),
-    scale(r.scale), precision(r.precision), strings(r.strings),
+    offsets(r.offsets), colWidths(r.colWidths), types(r.types), charsetNumbers(r.charsetNumbers),  // NOTE(review): charsetNumbers is a raw pointer member, so only the pointer is copied - presumably the array it points at outlives this Row; confirm
+    data(r.data), scale(r.scale), precision(r.precision), strings(r.strings),
     useStringTable(r.useStringTable), hasLongStringField(r.hasLongStringField),
     sTableThreshold(r.sTableThreshold), forceInline(r.forceInline), userDataStore(NULL)
 { }
@@ -522,6 +522,7 @@ Row& Row::operator=(const Row& r)
    offsets = r.offsets;
    colWidths = r.colWidths;
    types = r.types;
+    charsetNumbers = r.charsetNumbers;
    data = r.data;
    scale = r.scale;
    precision = r.precision;
@@ -1006,6 +1007,7 @@ RowGroup::RowGroup(uint32_t colCount,
                   const vector<uint32_t>& roids,
                   const vector<uint32_t>& tkeys,
                   const vector<CalpontSystemCatalog::ColDataType>& colTypes,
+                   const vector<uint32_t>& csNumbers,
                   const vector<uint32_t>& cscale,
                   const vector<uint32_t>& cprecision,
                   uint32_t stringTableThreshold,
@@ -1013,7 +1015,7 @@ RowGroup::RowGroup(uint32_t colCount,
                   const vector<bool>& forceInlineData
                  ) :
    columnCount(colCount), data(NULL), oldOffsets(positions), oids(roids), keys(tkeys),
-    types(colTypes), scale(cscale), precision(cprecision), rgData(NULL), strings(NULL),
+    types(colTypes), charsetNumbers(csNumbers), scale(cscale), precision(cprecision), rgData(NULL), strings(NULL),
    sTableThreshold(stringTableThreshold)
 {
    uint32_t i;
@@ -1047,12 +1049,16 @@ RowGroup::RowGroup(uint32_t colCount,

    useStringTable = (stringTable && hasLongStringField);
    offsets = (useStringTable ? &stOffsets[0] : &oldOffsets[0]);
+    
+    // Start every cached charset pointer as NULL; getCharset() resolves each one lazily on first use.
+    charsets.insert(charsets.begin(), charsetNumbers.size(), NULL);
 }

 RowGroup::RowGroup(const RowGroup& r) :
    columnCount(r.columnCount), data(r.data), oldOffsets(r.oldOffsets),
    stOffsets(r.stOffsets), colWidths(r.colWidths),
-    oids(r.oids), keys(r.keys), types(r.types), scale(r.scale), precision(r.precision),
+    oids(r.oids), keys(r.keys), types(r.types), charsetNumbers(r.charsetNumbers), 
+    charsets(r.charsets), scale(r.scale), precision(r.precision),
    rgData(r.rgData), strings(r.strings), useStringTable(r.useStringTable),
    hasLongStringField(r.hasLongStringField), sTableThreshold(r.sTableThreshold),
    forceInline(r.forceInline)
@@ -1076,6 +1082,8 @@ RowGroup& RowGroup::operator=(const RowGroup& r)
    oids = r.oids;
    keys = r.keys;
    types = r.types;
+    charsetNumbers = r.charsetNumbers;
+    charsets = r.charsets;
    data = r.data;
    scale = r.scale;
    precision = r.precision;
@@ -1120,6 +1128,7 @@ void RowGroup::serialize(ByteStream& bs) const
    serializeInlineVector<uint32_t>(bs, oids);
    serializeInlineVector<uint32_t>(bs, keys);
    serializeInlineVector<CalpontSystemCatalog::ColDataType>(bs, types);
+    serializeInlineVector<uint32_t>(bs, charsetNumbers);
    serializeInlineVector<uint32_t>(bs, scale);
    serializeInlineVector<uint32_t>(bs, precision);
    bs << (uint8_t) useStringTable;
@@ -1139,6 +1148,7 @@ void RowGroup::deserialize(ByteStream& bs)
    deserializeInlineVector<uint32_t>(bs, oids);
    deserializeInlineVector<uint32_t>(bs, keys);
    deserializeInlineVector<CalpontSystemCatalog::ColDataType>(bs, types);
+    deserializeInlineVector<uint32_t>(bs, charsetNumbers);
    deserializeInlineVector<uint32_t>(bs, scale);
    deserializeInlineVector<uint32_t>(bs, precision);
    bs >> tmp8;
@@ -1156,6 +1166,10 @@ void RowGroup::deserialize(ByteStream& bs)
        offsets = &stOffsets[0];
    else if (!useStringTable && !oldOffsets.empty())
        offsets = &oldOffsets[0];
+
+    // Prepend one NULL slot per column so getCharset() resolves each charset lazily. NOTE(review): existing entries are not cleared first - confirm.
+    charsets.insert(charsets.begin(), charsetNumbers.size(), NULL);
+    
 }

 void RowGroup::serializeRGData(ByteStream& bs) const
@@ -1467,6 +1481,15 @@ void RowGroup::addToSysDataList(execplan::CalpontSystemCatalog::NJLSysDataList&
    }
 }

+CHARSET_INFO* RowGroup::getCharset(uint32_t col)  // returns the charset object for column col, looking it up and caching it on first use
+{
+    if (charsets[col] == NULL)  // not resolved yet; col is used as an index without a range check
+    {
+        charsets[col] = get_charset(charsetNumbers[col], MYF(MY_WME));  // NOTE(review): the result is cached unchecked; presumably non-NULL for every stored charset number - confirm
+    }
+    return charsets[col];
+}
+
 void RowGroup::setDBRoot(uint32_t dbroot)
 {
    *((uint32_t*) &data[dbRootOffset]) = dbroot;
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@@ -58,6 +58,11 @@

 #include "../winport/winport.h"

+// Forward-declare CHARSET_INFO here because including my_sys.h in a ColumnStore header causes too many conflicts.
+struct charset_info_st;
+typedef const struct charset_info_st CHARSET_INFO;
+
+
 // Workaround for my_global.h #define of isnan(X) causing a std::std namespace

 namespace rowgroup
@@ -319,6 +324,7 @@ public:
    inline execplan::CalpontSystemCatalog::ColDataType getColType(uint32_t colIndex) const;
    inline execplan::CalpontSystemCatalog::ColDataType* getColTypes();
    inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const;
+    inline uint32_t getCharsetNumber(uint32_t colIndex) const;

    // this returns true if the type is not CHAR or VARCHAR
    inline bool isCharType(uint32_t colIndex) const;
@@ -461,6 +467,7 @@ private:
    uint32_t* offsets;
    uint32_t* colWidths;
    execplan::CalpontSystemCatalog::ColDataType* types;
+    uint32_t* charsetNumbers;
    uint8_t* data;
    uint32_t* scale;
    uint32_t* precision;
@@ -569,6 +576,11 @@ inline const execplan::CalpontSystemCatalog::ColDataType* Row::getColTypes() con
    return types;
 }

+inline uint32_t Row::getCharsetNumber(uint32_t col) const
+{
+    return charsetNumbers[col];
+}
+
 inline bool Row::isCharType(uint32_t colIndex) const
 {
    return execplan::isCharType(types[colIndex]);
@@ -1268,6 +1280,7 @@ public:
    @param coids An array of oids for each column.
    @param tkeys An array of unique id for each column.
    @param colTypes An array of COLTYPEs for each column.
+    @param charsetNumbers An array of charset/collation lookup numbers for each column.
    @param scale An array specifying the scale of DECIMAL types (0 for non-decimal)
    @param precision An array specifying the precision of DECIMAL types (0 for non-decimal)
    */
@@ -1277,6 +1290,7 @@ public:
             const std::vector<uint32_t>& cOids,
             const std::vector<uint32_t>& tkeys,
             const std::vector<execplan::CalpontSystemCatalog::ColDataType>& colTypes,
+             const std::vector<uint32_t>& charsetNumbers,
             const std::vector<uint32_t>& scale,
             const std::vector<uint32_t>& precision,
             uint32_t stringTableThreshold,
@@ -1284,7 +1298,7 @@ public:
             const std::vector<bool>& forceInlineData = std::vector<bool>()
            );

-    /** @brief The copiers.  It copies metadata, not the row data */
+    /** @brief The copiers.  It copies metadata, not the row data */
    RowGroup(const RowGroup&);

    /** @brief Assignment operator.  It copies metadata, not the row data */
@@ -1338,6 +1352,8 @@ public:
    inline execplan::CalpontSystemCatalog::ColDataType getColType(uint32_t colIndex) const;
    inline const std::vector<execplan::CalpontSystemCatalog::ColDataType>& getColTypes() const;
    inline std::vector<execplan::CalpontSystemCatalog::ColDataType>& getColTypes();
+    inline const std::vector<uint32_t>& getCharsetNumbers() const;
+    inline uint32_t getCharsetNumber(uint32_t colIndex) const;
    inline boost::shared_array<bool>& getForceInline();
    static inline uint32_t getHeaderSize()
    {
@@ -1397,6 +1413,8 @@ public:
                            uint16_t* blockNum);

    inline void setStringStore(boost::shared_ptr<StringStore>);
+    
+    CHARSET_INFO* getCharset(uint32_t col);

 private:
    uint32_t columnCount;
@@ -1413,8 +1431,11 @@ private:
    // Used to map the projected column and rowgroup index
    std::vector<uint32_t> keys;
    std::vector<execplan::CalpontSystemCatalog::ColDataType> types;
-
-    // DECIMAL support.  For non-decimal fields, the values are 0.
+    // For string collation
+    std::vector<uint32_t> charsetNumbers;
+    std::vector<CHARSET_INFO*> charsets;
+    
+    // DECIMAL support.  For non-decimal fields, the values are 0.
    std::vector<uint32_t> scale;
    std::vector<uint32_t> precision;

@@ -1547,6 +1568,7 @@ void RowGroup::initRow(Row* r, bool forceInlineData) const
    {
        r->colWidths = (uint32_t*) &colWidths[0];
        r->types = (execplan::CalpontSystemCatalog::ColDataType*) & (types[0]);
+        r->charsetNumbers = (uint32_t*) & (charsetNumbers[0]);
        r->scale = (uint32_t*) & (scale[0]);
        r->precision = (uint32_t*) & (precision[0]);
    }
@@ -1649,6 +1671,16 @@ inline std::vector<execplan::CalpontSystemCatalog::ColDataType>& RowGroup::getCo
    return types;
 }

+inline const std::vector<uint32_t>& RowGroup::getCharsetNumbers() const
+{
+    return charsetNumbers;
+}
+
+inline uint32_t RowGroup::getCharsetNumber(uint32_t colIndex) const
+{
+    return charsetNumbers[colIndex];
+}
+
 inline const std::vector<uint32_t>& RowGroup::getScale() const
 {
    return scale;
--- a/utils/udfsdk/mcsv1_udaf.h
+++ b/utils/udfsdk/mcsv1_udaf.h
@@ -369,6 +369,11 @@ public:

    EXPORT mcsv1Context& operator=(const mcsv1Context& rhs);
    EXPORT mcsv1Context& copy(const mcsv1Context& rhs);
+    
+    // Character collation support
+    EXPORT void setCharsetNumber(uint32_t csNum);
+    EXPORT uint32_t getCharsetNumber(); // Returns the unique ID for the language/collation
+    EXPORT CHARSET_INFO* getCharset();

 private:

@@ -392,6 +397,7 @@ private:
    int32_t  fParamCount;
    std::vector<uint32_t> paramKeys;
    enum_mariadb_return_type mariadbReturnType;
+    uint32_t fCharsetNumber;

 public:
    // For use by the framework
@@ -416,6 +422,7 @@ public:
    EXPORT void setParamCount(int32_t paramCount);
    std::vector<uint32_t>* getParamKeys();
    EXPORT void setMariaDBReturnType(enum_mariadb_return_type rt);
+
 };

 // Since aggregate functions can operate on any data type, we use the following structure
@@ -438,7 +445,9 @@ struct ColumnDatum
    uint32_t    scale;     // If dataType is a DECIMAL type
    uint32_t    precision; // If dataType is a DECIMAL type
    std::string alias;     // Only filled in for init()
-    ColumnDatum() : dataType(execplan::CalpontSystemCatalog::UNDEFINED), scale(0), precision(-1) {};
+    uint32_t    charsetNumber; // For string collations
+    ColumnDatum() : dataType(execplan::CalpontSystemCatalog::UNDEFINED), 
+                    scale(0), precision(-1), charsetNumber(8) {};
 };

 // Override mcsv1_UDAF to build your User Defined Aggregate (UDAF) and/or
@@ -658,7 +667,8 @@ inline mcsv1Context::mcsv1Context() :
    fStartConstant(0),
    fEndConstant(0),
    func(NULL),
-    fParamCount(0)
+    fParamCount(0),
+    fCharsetNumber(8)  // Latin1
 {
 }

@@ -683,6 +693,7 @@ inline mcsv1Context& mcsv1Context::copy(const mcsv1Context& rhs)
    bInterrupted     = rhs.bInterrupted;  // Multiple threads will use the same reference
    func             = rhs.func;
    fParamCount      = rhs.fParamCount;
+    fCharsetNumber   = rhs.fCharsetNumber;
    return *this;
 }

@@ -979,6 +990,16 @@ inline void mcsv1Context::setMariaDBReturnType(enum_mariadb_return_type rt)
    mariadbReturnType = rt;
 }

+inline void mcsv1Context::setCharsetNumber(uint32_t csNum)
+{
+    fCharsetNumber=csNum;
+}
+
+inline uint32_t mcsv1Context::getCharsetNumber()
+{
+    return fCharsetNumber;
+}
+
 inline mcsv1_UDAF::ReturnCode mcsv1_UDAF::dropValue(mcsv1Context* context, ColumnDatum* valsDropped)
 {
    return NOT_IMPLEMENTED;