diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b2173ad8..0f31249bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -247,6 +247,12 @@ ELSE() SET (MARIADB_CLIENT_LIBS -L${SERVER_BUILD_INCLUDE_DIR}/../libmariadb/libmariadb/ libmariadb.so) ENDIF() +IF (INSTALL_LAYOUT) + SET (MARIADB_STRING_LIBS dbug strings mysys) +ELSE() + SET (MARIADB_STRING_LIBS -L${SERVER_BUILD_INCLUDE_DIR}/../strings/ libstrings.a -L${SERVER_BUILD_INCLUDE_DIR}/../mysys/ libmysys.a -L${SERVER_BUILD_INCLUDE_DIR}/../dbug/ libdbug.a) +ENDIF() + #SET (ENGINE_UTILS_BOOSTIDB_INCLUDE "{CMAKE_CURRENT_SOURCE_DIR}/utils/boost_idb") SET (ENGINE_UTILS_MESSAGEQCPP_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/utils/messageqcpp") SET (ENGINE_WE_SHARED_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/writeengine/shared") diff --git a/dbcon/execplan/CMakeLists.txt b/dbcon/execplan/CMakeLists.txt index a2d85c340..fa8f52b12 100755 --- a/dbcon/execplan/CMakeLists.txt +++ b/dbcon/execplan/CMakeLists.txt @@ -46,7 +46,7 @@ set(execplan_LIB_SRCS add_library(execplan SHARED ${execplan_LIB_SRCS}) -target_link_libraries(execplan ${NETSNMP_LIBRARIES}) +target_link_libraries(execplan ${NETSNMP_LIBRARIES} ${MARIADB_STRING_LIBS}) install(TARGETS execplan DESTINATION ${ENGINE_LIBDIR} COMPONENT columnstore-engine) diff --git a/dbcon/execplan/calpontsystemcatalog.cpp b/dbcon/execplan/calpontsystemcatalog.cpp index aa369e005..610b0c221 100644 --- a/dbcon/execplan/calpontsystemcatalog.cpp +++ b/dbcon/execplan/calpontsystemcatalog.cpp @@ -21,6 +21,7 @@ * * ***********************************************************************/ + #include #include #include @@ -76,6 +77,8 @@ using namespace rowgroup; #include "idbregistry.h" #endif +#include "collation.h" + #undef BAIL_IF_0 #if 1 //We are unlikely to ever get anything more out of this connection, so bail out @@ -6084,6 +6087,48 @@ void CalpontSystemCatalog::checkSysCatVer() } } +CalpontSystemCatalog::ColType::ColType() : + colWidth(0), + constraintType(NO_CONSTRAINT), + colDataType(MEDINT), + 
defaultValue(""), + colPosition(-1), + scale(0), + precision(-1), + compressionType(NO_COMPRESSION), + columnOID(0), + autoincrement(0), + nextvalue(0), + cs(NULL) +{ + charsetNumber = default_charset_info->number; +} + +CalpontSystemCatalog::ColType::ColType(const ColType& rhs) +{ + colWidth = rhs.colWidth; + constraintType = rhs.constraintType; + colDataType = rhs.colDataType; + ddn = rhs.ddn; + defaultValue = rhs.defaultValue; + colPosition = rhs.colPosition; + scale = rhs.scale; + precision = rhs.precision; + compressionType = rhs.compressionType; + columnOID = rhs.columnOID; + autoincrement = rhs.autoincrement; + nextvalue = rhs.nextvalue; + charsetNumber = rhs.charsetNumber; + cs = rhs.cs; +} + +CHARSET_INFO* CalpontSystemCatalog::ColType::getCharset() +{ + if (!cs) + cs= get_charset(charsetNumber, MYF(MY_WME)); + return cs; +} + const string CalpontSystemCatalog::ColType::toString() const { ostringstream output; diff --git a/dbcon/execplan/calpontsystemcatalog.h b/dbcon/execplan/calpontsystemcatalog.h index 81b6c2218..9f6892a5f 100644 --- a/dbcon/execplan/calpontsystemcatalog.h +++ b/dbcon/execplan/calpontsystemcatalog.h @@ -48,6 +48,11 @@ #undef min #undef max +// Because including my_sys.h in a Columnstore header causes too many conflicts +struct charset_info_st; +typedef const struct charset_info_st CHARSET_INFO; + + #ifdef _MSC_VER #define __attribute__(x) #endif @@ -279,8 +284,7 @@ public: */ struct ColType { - ColType() : colWidth(0), constraintType(NO_CONSTRAINT), colDataType(MEDINT), defaultValue(""), colPosition(-1), scale(0), precision(-1), compressionType(NO_COMPRESSION), columnOID(0), - autoincrement(0), nextvalue(0) { } + ColType(); int32_t colWidth; ConstraintType constraintType; ColDataType colDataType; @@ -293,23 +297,12 @@ public: OID columnOID; bool autoincrement; //set to true if SYSCOLUMN autoincrement is �y� uint64_t nextvalue; //next autoincrement value + uint32_t charsetNumber; + const CHARSET_INFO* cs; - ColType(const ColType& rhs) - 
{ - colWidth = rhs.colWidth; - constraintType = rhs.constraintType; - colDataType = rhs.colDataType; - ddn = rhs.ddn; - defaultValue = rhs.defaultValue; - colPosition = rhs.colPosition; - scale = rhs.scale; - precision = rhs.precision; - compressionType = rhs.compressionType; - columnOID = rhs.columnOID; - autoincrement = rhs.autoincrement; - nextvalue = rhs.nextvalue; + ColType(const ColType& rhs); - } + CHARSET_INFO* getCharset(); // for F&E use. only serialize necessary info for now void serialize (messageqcpp::ByteStream& b) const { @@ -318,6 +311,7 @@ public: b << (uint32_t)scale; b << (uint32_t)precision; b << (uint32_t)compressionType; + b << charsetNumber; } void unserialize (messageqcpp::ByteStream& b) @@ -329,6 +323,7 @@ public: b >> (uint32_t&)scale; b >> (uint32_t&)precision; b >> (uint32_t&)compressionType; + b >> charsetNumber; } const std::string toString() const; diff --git a/dbcon/execplan/predicateoperator.cpp b/dbcon/execplan/predicateoperator.cpp index efd748c99..4772a8b8c 100644 --- a/dbcon/execplan/predicateoperator.cpp +++ b/dbcon/execplan/predicateoperator.cpp @@ -21,6 +21,7 @@ * * ***********************************************************************/ + #include #include "bytestream.h" @@ -28,12 +29,13 @@ #include "objectreader.h" #include "liboamcpp.h" + +#include "collation.h" + using namespace oam; using namespace std; -bool futf8 = true; - namespace { @@ -54,62 +56,21 @@ namespace execplan /** * Constructors/Destructors */ -PredicateOperator::PredicateOperator() +PredicateOperator::PredicateOperator() : + cs(NULL) { - Oam oam; - // get and set locale language - string systemLang = "C"; - - try - { - oam.getSystemConfig("SystemLang", systemLang); - } - catch (...) 
- {} - - if ( systemLang != "en_US.UTF-8" && - systemLang.find("UTF") != string::npos ) - futf8 = true; } -PredicateOperator::PredicateOperator(const string& operatorName) +PredicateOperator::PredicateOperator(const string& operatorName) : + cs(NULL) { - Oam oam; - // get and set locale language - string systemLang = "C"; - - try - { - oam.getSystemConfig("SystemLang", systemLang); - } - catch (...) - {} - - if ( systemLang != "en_US.UTF-8" && - systemLang.find("UTF") != string::npos ) - futf8 = true; - data(operatorName); } PredicateOperator::PredicateOperator(const PredicateOperator& rhs) : Operator(rhs) { - Oam oam; - // get and set locale language - string systemLang = "C"; - - try - { - oam.getSystemConfig("SystemLang", systemLang); - } - catch (...) - {} - - if ( systemLang != "en_US.UTF-8" && - systemLang.find("UTF") != string::npos ) - futf8 = true; - data(rhs.data()); + cs = rhs.getCharset(); } PredicateOperator:: ~PredicateOperator() @@ -145,6 +106,7 @@ void PredicateOperator::unserialize(messageqcpp::ByteStream& b) ObjectReader::checkType(b, ObjectReader::PREDICATEOPERATOR); //b >> fData; Operator::unserialize(b); + cs = get_charset(fOperationType.charsetNumber, MYF(MY_WME)); } bool PredicateOperator::operator==(const PredicateOperator& t) const @@ -180,6 +142,7 @@ bool PredicateOperator::operator!=(const TreeNode* t) const //FIXME: VARBINARY??? void PredicateOperator::setOpType(Type& l, Type& r) { + fOperationType = l; // Default to left side. Modify as needed. 
if ( l.colDataType == execplan::CalpontSystemCatalog::DATETIME || l.colDataType == execplan::CalpontSystemCatalog::TIME || l.colDataType == execplan::CalpontSystemCatalog::TIMESTAMP || @@ -189,7 +152,7 @@ void PredicateOperator::setOpType(Type& l, Type& r) { case execplan::CalpontSystemCatalog::CHAR: case execplan::CalpontSystemCatalog::VARCHAR: - fOperationType = l; + fOperationType.charsetNumber = r.charsetNumber; break; case execplan::CalpontSystemCatalog::DATETIME: @@ -344,28 +307,37 @@ void PredicateOperator::setOpType(Type& l, Type& r) r.colDataType == execplan::CalpontSystemCatalog::VARCHAR || r.colDataType == execplan::CalpontSystemCatalog::TEXT)) { +#if 0 + // Currently, STRINT isn't properly implemented everywhere + // For short strings, we can get a faster execution for charset that fit in one byte. if ( ( (l.colDataType == execplan::CalpontSystemCatalog::CHAR && l.colWidth <= 8) || (l.colDataType == execplan::CalpontSystemCatalog::VARCHAR && l.colWidth < 8) ) && ( (r.colDataType == execplan::CalpontSystemCatalog::CHAR && r.colWidth <= 8) || (r.colDataType == execplan::CalpontSystemCatalog::VARCHAR && r.colWidth < 8) ) ) { - if ( futf8 ) + switch (fOperationType.charsetNumber) { - fOperationType.colDataType = execplan::CalpontSystemCatalog::VARCHAR; - fOperationType.colWidth = 255; - } - else - { - fOperationType.colDataType = execplan::CalpontSystemCatalog::BIGINT; - fOperationType.scale = 0; - fOperationType.colWidth = 8; - - // @bug3532, char[] as network order int for fast comparison. - l.colDataType = execplan::CalpontSystemCatalog::STRINT; - r.colDataType = execplan::CalpontSystemCatalog::STRINT; + case 8: // latin1_swedish_ci + case 9: // latin2_general_ci + case 11: // ascii_general_ci + case 47: // latin1_bin + case 48: // latin1_general_ci + case 49: // latin1_general_cs + case 65: // ascii_bin + case 77: // latin2_bin + // char[] as network order int for fast comparison. 
+ fOperationType.colDataType = execplan::CalpontSystemCatalog::BIGINT; + fOperationType.scale = 0; + fOperationType.colWidth = 8; + l.colDataType = execplan::CalpontSystemCatalog::STRINT; + r.colDataType = execplan::CalpontSystemCatalog::STRINT; + default: + fOperationType.colDataType = execplan::CalpontSystemCatalog::VARCHAR; + fOperationType.colWidth = 255; } } else +#endif { fOperationType.colDataType = execplan::CalpontSystemCatalog::VARCHAR; fOperationType.colWidth = 255; @@ -382,6 +354,437 @@ void PredicateOperator::setOpType(Type& l, Type& r) fOperationType.colDataType = execplan::CalpontSystemCatalog::DOUBLE; fOperationType.colWidth = 8; } + + cs = get_charset(fOperationType.charsetNumber, MYF(MY_WME)); +} + +inline bool PredicateOperator::strTrimCompare(const std::string& op1, const std::string& op2) +{ + int r1 = cs->strnncollsp(op1.c_str(), op1.length(), op2.c_str(), op2.length()); + switch (fOp) + { + case OP_EQ: + return r1 == 0; + + case OP_NE: + return r1 != 0; + + case OP_GT: + return r1 > 0; + + case OP_GE: + return r1 >= 0; + + case OP_LT: + return r1 < 0; + + case OP_LE: + return r1 <= 0; + + default: + { + std::ostringstream oss; + oss << "Unsupported predicate operation: " << fOp; + throw logging::InvalidOperationExcept(oss.str()); + } + } +} + +bool PredicateOperator::getBoolVal(rowgroup::Row& row, bool& isNull, ReturnedColumn* lop, ReturnedColumn* rop) +{ + // like operator. both sides are string. + if (fOp == OP_LIKE || fOp == OP_NOTLIKE) + { + SP_CNX_Regex regex = rop->regex(); + + // Ugh. The strings returned by getStrVal have null padding out to the col width. boost::regex + // considers these nulls significant, but they're not in the pattern, so we need to strip + // them off... 
+ const std::string& v = lop->getStrVal(row, isNull); +// char* c = (char*)alloca(v.length() + 1); +// memcpy(c, v.c_str(), v.length()); +// c[v.length()] = 0; +// std::string vv(c); + + if (regex) + { +#ifdef POSIX_REGEX + bool ret = regexec(regex.get(), v.c_str(), 0, NULL, 0) == 0; +#else + bool ret = boost::regex_match(v.c_str(), *regex); +#endif + return (((fOp == OP_LIKE) ? ret : !ret) && !isNull); + } + else + { +#ifdef POSIX_REGEX + regex_t regex; + std::string str = dataconvert::DataConvert::constructRegexp(rop->getStrVal(row, isNull)); + regcomp(&regex, str.c_str(), REG_NOSUB | REG_EXTENDED); + bool ret = regexec(&regex, v.c_str(), 0, NULL, 0) == 0; + regfree(&regex); +#else + boost::regex regex(dataconvert::DataConvert::constructRegexp(rop->getStrVal(row, isNull))); + bool ret = boost::regex_match(v.c_str(), regex); +#endif + return (((fOp == OP_LIKE) ? ret : !ret) && !isNull); + } + } + + // fOpType should have already been set on the connector during parsing + switch (fOperationType.colDataType) + { + case execplan::CalpontSystemCatalog::BIGINT: + case execplan::CalpontSystemCatalog::INT: + case execplan::CalpontSystemCatalog::MEDINT: + case execplan::CalpontSystemCatalog::TINYINT: + case execplan::CalpontSystemCatalog::SMALLINT: + { + if (fOp == OP_ISNULL) + { + lop->getIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + int64_t val1 = lop->getIntVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, rop->getIntVal(row, isNull)) && !isNull; + } + + case execplan::CalpontSystemCatalog::UBIGINT: + case execplan::CalpontSystemCatalog::UINT: + case execplan::CalpontSystemCatalog::UMEDINT: + case execplan::CalpontSystemCatalog::UTINYINT: + case execplan::CalpontSystemCatalog::USMALLINT: + { + if (fOp == OP_ISNULL) + { + lop->getUintVal(row, isNull); + bool 
ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getUintVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + uint64_t val1 = lop->getUintVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, rop->getUintVal(row, isNull)) && !isNull; + } + + case execplan::CalpontSystemCatalog::FLOAT: + case execplan::CalpontSystemCatalog::UFLOAT: + case execplan::CalpontSystemCatalog::DOUBLE: + case execplan::CalpontSystemCatalog::UDOUBLE: + { + if (fOp == OP_ISNULL) + { + lop->getDoubleVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getDoubleVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + double val1 = lop->getDoubleVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, rop->getDoubleVal(row, isNull)) && !isNull; + } + + case execplan::CalpontSystemCatalog::LONGDOUBLE: + { + if (fOp == OP_ISNULL) + { + lop->getLongDoubleVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getLongDoubleVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + long double val1 = lop->getLongDoubleVal(row, isNull); + if (isNull) + return false; + + long double val2 = rop->getLongDoubleVal(row, isNull); + if (isNull) + return false; + + // In many case, rounding error will prevent an eq compare to work + // In these cases, use the largest scale of the two items. + if (fOp == execplan::OP_EQ) + { + // In case a val is a representation of a very large integer, + // we won't want to just multiply by scale, as it may move + // significant digits out of scope. 
So we break them apart + // and compare each separately + int64_t scale = std::max(lop->resultType().scale, rop->resultType().scale); + if (scale) + { + long double intpart1; + long double fract1 = modfl(val1, &intpart1); + long double intpart2; + long double fract2 = modfl(val2, &intpart2); + if (numericCompare(intpart1, intpart2)) + { + double factor = pow(10.0, (double)scale); + fract1 = roundl(fract1 * factor); + fract2 = roundl(fract2 * factor); + return numericCompare(fract1, fract2); + } + else + { + return false; + } + } + } + return numericCompare(val1, val2); + } + + case execplan::CalpontSystemCatalog::DECIMAL: + case execplan::CalpontSystemCatalog::UDECIMAL: + { + if (fOp == OP_ISNULL) + { + lop->getDecimalVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getDecimalVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + IDB_Decimal val1 = lop->getDecimalVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, rop->getDecimalVal(row, isNull)) && !isNull; + } + + case execplan::CalpontSystemCatalog::DATE: + { + if (fOp == OP_ISNULL) + { + lop->getDateIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getDateIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + int64_t val1 = lop->getDateIntVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, (int64_t)rop->getDateIntVal(row, isNull)) && !isNull; + } + + case execplan::CalpontSystemCatalog::DATETIME: + { + if (fOp == OP_ISNULL) + { + lop->getDatetimeIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getDatetimeIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + int64_t val1 = 
lop->getDatetimeIntVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, rop->getDatetimeIntVal(row, isNull)) && !isNull; + } + + case execplan::CalpontSystemCatalog::TIMESTAMP: + { + if (fOp == OP_ISNULL) + { + lop->getTimestampIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getTimestampIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + int64_t val1 = lop->getTimestampIntVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, rop->getTimestampIntVal(row, isNull)) && !isNull; + } + + case execplan::CalpontSystemCatalog::TIME: + { + if (fOp == OP_ISNULL) + { + lop->getTimeIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getTimeIntVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + int64_t val1 = lop->getTimeIntVal(row, isNull); + + if (isNull) + return false; + + return numericCompare(val1, rop->getTimeIntVal(row, isNull)) && !isNull; + } + + + + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::TEXT: + { + if (fOp == OP_ISNULL) + { + lop->getStrVal(row, isNull); + bool ret = isNull; + isNull = false; + return ret; + } + + if (fOp == OP_ISNOTNULL) + { + lop->getStrVal(row, isNull); + bool ret = isNull; + isNull = false; + return !ret; + } + + if (isNull) + return false; + + const std::string& val1 = lop->getStrVal(row, isNull); + if (isNull) + return false; + + return strTrimCompare(val1, rop->getStrVal(row, isNull)) && !isNull; + } + + //FIXME: ??? 
+ case execplan::CalpontSystemCatalog::VARBINARY: + case execplan::CalpontSystemCatalog::BLOB: + return false; + break; + + default: + { + std::ostringstream oss; + oss << "invalid predicate operation type: " << fOperationType.colDataType; + throw logging::InvalidOperationExcept(oss.str()); + } + } + + return false; } } // namespace diff --git a/dbcon/execplan/predicateoperator.h b/dbcon/execplan/predicateoperator.h index bf51cbeed..cda9ce9e2 100644 --- a/dbcon/execplan/predicateoperator.h +++ b/dbcon/execplan/predicateoperator.h @@ -25,6 +25,7 @@ #ifndef PREDICATEOPERATOR_H #define PREDICATEOPERATOR_H + #include #include #if defined(_MSC_VER) @@ -38,10 +39,13 @@ #include #include +#include "utils_utf8.h" #include "expressionparser.h" #include "returnedcolumn.h" #include "dataconvert.h" -#include "utils_utf8.h" + +struct charset_info_st; +typedef const struct charset_info_st CHARSET_INFO; namespace messageqcpp { @@ -104,419 +108,24 @@ public: */ bool operator!=(const PredicateOperator& t) const; + const CHARSET_INFO* getCharset() const + { + return cs; + } /*********************************************************** * F&E framework * ***********************************************************/ - inline virtual bool getBoolVal(rowgroup::Row& row, bool& isNull, ReturnedColumn* lop, ReturnedColumn* rop); + virtual bool getBoolVal(rowgroup::Row& row, bool& isNull, ReturnedColumn* lop, ReturnedColumn* rop); void setOpType(Type& l, Type& r); private: template inline bool numericCompare(result_t op1, result_t op2); - inline bool strCompare(const std::string& op1, const std::string& op2); - // MCOL-1559 inline bool strTrimCompare(const std::string& op1, const std::string& op2); + + const CHARSET_INFO* cs; }; -inline bool PredicateOperator::getBoolVal(rowgroup::Row& row, bool& isNull, ReturnedColumn* lop, ReturnedColumn* rop) -{ - // like operator. both sides are string. - if (fOp == OP_LIKE || fOp == OP_NOTLIKE) - { - SP_CNX_Regex regex = rop->regex(); - - // Ugh. 
The strings returned by getStrVal have null padding out to the col width. boost::regex - // considers these nulls significant, but they're not in the pattern, so we need to strip - // them off... - const std::string& v = lop->getStrVal(row, isNull); -// char* c = (char*)alloca(v.length() + 1); -// memcpy(c, v.c_str(), v.length()); -// c[v.length()] = 0; -// std::string vv(c); - - if (regex) - { -#ifdef POSIX_REGEX - bool ret = regexec(regex.get(), v.c_str(), 0, NULL, 0) == 0; -#else - bool ret = boost::regex_match(v.c_str(), *regex); -#endif - return (((fOp == OP_LIKE) ? ret : !ret) && !isNull); - } - else - { -#ifdef POSIX_REGEX - regex_t regex; - std::string str = dataconvert::DataConvert::constructRegexp(rop->getStrVal(row, isNull)); - regcomp(&regex, str.c_str(), REG_NOSUB | REG_EXTENDED); - bool ret = regexec(&regex, v.c_str(), 0, NULL, 0) == 0; - regfree(&regex); -#else - boost::regex regex(dataconvert::DataConvert::constructRegexp(rop->getStrVal(row, isNull))); - bool ret = boost::regex_match(v.c_str(), regex); -#endif - return (((fOp == OP_LIKE) ? 
ret : !ret) && !isNull); - } - } - - // fOpType should have already been set on the connector during parsing - switch (fOperationType.colDataType) - { - case execplan::CalpontSystemCatalog::BIGINT: - case execplan::CalpontSystemCatalog::INT: - case execplan::CalpontSystemCatalog::MEDINT: - case execplan::CalpontSystemCatalog::TINYINT: - case execplan::CalpontSystemCatalog::SMALLINT: - { - if (fOp == OP_ISNULL) - { - lop->getIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - int64_t val1 = lop->getIntVal(row, isNull); - - if (isNull) - return false; - - return numericCompare(val1, rop->getIntVal(row, isNull)) && !isNull; - } - - case execplan::CalpontSystemCatalog::UBIGINT: - case execplan::CalpontSystemCatalog::UINT: - case execplan::CalpontSystemCatalog::UMEDINT: - case execplan::CalpontSystemCatalog::UTINYINT: - case execplan::CalpontSystemCatalog::USMALLINT: - { - if (fOp == OP_ISNULL) - { - lop->getUintVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getUintVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - uint64_t val1 = lop->getUintVal(row, isNull); - - if (isNull) - return false; - - return numericCompare(val1, rop->getUintVal(row, isNull)) && !isNull; - } - - case execplan::CalpontSystemCatalog::FLOAT: - case execplan::CalpontSystemCatalog::UFLOAT: - case execplan::CalpontSystemCatalog::DOUBLE: - case execplan::CalpontSystemCatalog::UDOUBLE: - { - if (fOp == OP_ISNULL) - { - lop->getDoubleVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getDoubleVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - double val1 = 
lop->getDoubleVal(row, isNull); - - if (isNull) - return false; - - return numericCompare(val1, rop->getDoubleVal(row, isNull)) && !isNull; - } - - case execplan::CalpontSystemCatalog::LONGDOUBLE: - { - if (fOp == OP_ISNULL) - { - lop->getLongDoubleVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getLongDoubleVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - long double val1 = lop->getLongDoubleVal(row, isNull); - if (isNull) - return false; - - long double val2 = rop->getLongDoubleVal(row, isNull); - if (isNull) - return false; - - // In many case, rounding error will prevent an eq compare to work - // In these cases, use the largest scale of the two items. - if (fOp == execplan::OP_EQ) - { - // In case a val is a representation of a very large integer, - // we won't want to just multiply by scale, as it may move - // significant digits out of scope. So we break them apart - // and compare each separately - int64_t scale = std::max(lop->resultType().scale, rop->resultType().scale); - if (scale) - { - long double intpart1; - long double fract1 = modfl(val1, &intpart1); - long double intpart2; - long double fract2 = modfl(val2, &intpart2); - if (numericCompare(intpart1, intpart2)) - { - double factor = pow(10.0, (double)scale); - fract1 = roundl(fract1 * factor); - fract2 = roundl(fract2 * factor); - return numericCompare(fract1, fract2); - } - else - { - return false; - } - } - } - return numericCompare(val1, val2); - } - - case execplan::CalpontSystemCatalog::DECIMAL: - case execplan::CalpontSystemCatalog::UDECIMAL: - { - if (fOp == OP_ISNULL) - { - lop->getDecimalVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getDecimalVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - IDB_Decimal val1 = lop->getDecimalVal(row, 
isNull); - - if (isNull) - return false; - - return numericCompare(val1, rop->getDecimalVal(row, isNull)) && !isNull; - } - - case execplan::CalpontSystemCatalog::DATE: - { - if (fOp == OP_ISNULL) - { - lop->getDateIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getDateIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - int64_t val1 = lop->getDateIntVal(row, isNull); - - if (isNull) - return false; - - return numericCompare(val1, (int64_t)rop->getDateIntVal(row, isNull)) && !isNull; - } - - case execplan::CalpontSystemCatalog::DATETIME: - { - if (fOp == OP_ISNULL) - { - lop->getDatetimeIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getDatetimeIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - int64_t val1 = lop->getDatetimeIntVal(row, isNull); - - if (isNull) - return false; - - return numericCompare(val1, rop->getDatetimeIntVal(row, isNull)) && !isNull; - } - - case execplan::CalpontSystemCatalog::TIMESTAMP: - { - if (fOp == OP_ISNULL) - { - lop->getTimestampIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getTimestampIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - int64_t val1 = lop->getTimestampIntVal(row, isNull); - - if (isNull) - return false; - - return numericCompare(val1, rop->getTimestampIntVal(row, isNull)) && !isNull; - } - - case execplan::CalpontSystemCatalog::TIME: - { - if (fOp == OP_ISNULL) - { - lop->getTimeIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getTimeIntVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - int64_t 
val1 = lop->getTimeIntVal(row, isNull); - - if (isNull) - return false; - - return numericCompare(val1, rop->getTimeIntVal(row, isNull)) && !isNull; - } - - - - case execplan::CalpontSystemCatalog::VARCHAR: - case execplan::CalpontSystemCatalog::CHAR: - case execplan::CalpontSystemCatalog::TEXT: - { - if (fOp == OP_ISNULL) - { - lop->getStrVal(row, isNull); - bool ret = isNull; - isNull = false; - return ret; - } - - if (fOp == OP_ISNOTNULL) - { - lop->getStrVal(row, isNull); - bool ret = isNull; - isNull = false; - return !ret; - } - - if (isNull) - return false; - - const std::string& val1 = lop->getStrVal(row, isNull); - if (isNull) - return false; - - return strTrimCompare(val1, rop->getStrVal(row, isNull)) && !isNull; -// return strCompare(val1, rop->getStrVal(row, isNull)) && !isNull; - - } - - //FIXME: ??? - case execplan::CalpontSystemCatalog::VARBINARY: - case execplan::CalpontSystemCatalog::BLOB: - return false; - break; - - default: - { - std::ostringstream oss; - oss << "invalid predicate operation type: " << fOperationType.colDataType; - throw logging::InvalidOperationExcept(oss.str()); - } - } - - return false; -} - template inline bool PredicateOperator::numericCompare(result_t op1, result_t op2) @@ -550,68 +159,6 @@ inline bool PredicateOperator::numericCompare(result_t op1, result_t op2) } } -inline bool PredicateOperator::strCompare(const std::string& op1, const std::string& op2) -{ - switch (fOp) - { - case OP_EQ: - return funcexp::utf8::idb_strcoll(op1.c_str(), op2.c_str()) == 0; - - case OP_NE: - return funcexp::utf8::idb_strcoll(op1.c_str(), op2.c_str()) != 0; - - case OP_GT: - return funcexp::utf8::idb_strcoll(op1.c_str(), op2.c_str()) > 0; - - case OP_GE: - return funcexp::utf8::idb_strcoll(op1.c_str(), op2.c_str()) >= 0; - - case OP_LT: - return funcexp::utf8::idb_strcoll(op1.c_str(), op2.c_str()) < 0; - - case OP_LE: - return funcexp::utf8::idb_strcoll(op1.c_str(), op2.c_str()) <= 0; - - default: - { - std::ostringstream oss; - oss << "Non 
support predicate operation: " << fOp; - throw logging::InvalidOperationExcept(oss.str()); - } - } -} - -inline bool PredicateOperator::strTrimCompare(const std::string& op1, const std::string& op2) -{ - switch (fOp) - { - case OP_EQ: - return funcexp::utf8::idb_strtrimcoll(op1, op2) == 0; - - case OP_NE: - return funcexp::utf8::idb_strtrimcoll(op1, op2) != 0; - - case OP_GT: - return funcexp::utf8::idb_strtrimcoll(op1, op2) > 0; - - case OP_GE: - return funcexp::utf8::idb_strtrimcoll(op1, op2) >= 0; - - case OP_LT: - return funcexp::utf8::idb_strtrimcoll(op1, op2) < 0; - - case OP_LE: - return funcexp::utf8::idb_strtrimcoll(op1, op2) <= 0; - - default: - { - std::ostringstream oss; - oss << "Non support predicate operation: " << fOp; - throw logging::InvalidOperationExcept(oss.str()); - } - } -} - std::ostream& operator<<(std::ostream& os, const PredicateOperator& rhs); } diff --git a/dbcon/execplan/treenode.h b/dbcon/execplan/treenode.h index ca1196b27..5fab31a02 100644 --- a/dbcon/execplan/treenode.h +++ b/dbcon/execplan/treenode.h @@ -336,7 +336,7 @@ public: fRefCount = refCount; } - // the inc and dec functions are used by connector single thread. + // the inc and dec functions are used by connector single thread. 
virtual void decRefCount() { fRefCount--; @@ -451,6 +451,16 @@ public: return fRegex; } + uint32_t charsetNumber() const + { + return fResultType.charsetNumber; + } + void charsetNumber(uint32_t cnum) + { + fResultType.charsetNumber = cnum; + fOperationType.charsetNumber = cnum; + } + protected: Result fResult; execplan::CalpontSystemCatalog::ColType fResultType; // mapped from mysql data type diff --git a/dbcon/joblist/batchprimitiveprocessor-jl.cpp b/dbcon/joblist/batchprimitiveprocessor-jl.cpp index ec8306646..0f04c8a8e 100644 --- a/dbcon/joblist/batchprimitiveprocessor-jl.cpp +++ b/dbcon/joblist/batchprimitiveprocessor-jl.cpp @@ -413,8 +413,8 @@ void BatchPrimitiveProcessorJL::addElementType(const StringElementType& et, uint void BatchPrimitiveProcessorJL::getElementTypes(ByteStream& in, vector* out, bool* validCPData, uint64_t* lbid, int64_t* min, - int64_t* max, uint32_t* cachedIO, uint32_t* physIO, uint32_t* touchedBlocks, - uint16_t* preJoinRidCount) const + int64_t* max, uint32_t* cachedIO, uint32_t* physIO, + uint32_t* touchedBlocks) const { uint32_t i; uint16_t l_count; @@ -425,11 +425,6 @@ void BatchPrimitiveProcessorJL::getElementTypes(ByteStream& in, uint64_t tmp64; uint8_t tmp8; - /* PM join support */ - uint32_t jCount; - ElementType* jet; - -// cout << "get Element Types uniqueID=" << uniqueID << endl; /* skip the header */ idbassert(in.length() > sizeof(ISMPacketHeader) + sizeof(PrimitiveHeader)); in.advance(sizeof(ISMPacketHeader) + sizeof(PrimitiveHeader)); @@ -472,21 +467,6 @@ void BatchPrimitiveProcessorJL::getElementTypes(ByteStream& in, (*out)[i].second = vals[i]; } - if (joiner.get() != NULL) - { - in >> *preJoinRidCount; - in >> jCount; - idbassert(in.length() > (jCount << 4)); - jet = (ElementType*) in.buf(); - - for (i = 0; i < jCount; ++i) - out->push_back(jet[i]); - - in.advance(jCount << 4); - } - else - *preJoinRidCount = l_count; - in >> *cachedIO; in >> *physIO; in >> *touchedBlocks; @@ -987,7 +967,7 @@ void 
BatchPrimitiveProcessorJL::createBPP(ByteStream& bs) const if (needRidsAtDelivery) flags |= SEND_RIDS_AT_DELIVERY; - if (joiner.get() != NULL || tJoiners.size() > 0) + if (tJoiners.size() > 0) flags |= HAS_JOINER; if (sendRowGroups) @@ -1090,11 +1070,6 @@ void BatchPrimitiveProcessorJL::createBPP(ByteStream& bs) const // cout << "joined RG: " << joinedRG.toString() << endl; } } - else - { - bs << (uint8_t) joiner->includeAll(); - bs << (uint32_t) joiner->size(); - } } bs << filterCount; @@ -1581,51 +1556,6 @@ bool BatchPrimitiveProcessorJL::nextTupleJoinerMsg(ByteStream& bs) return true; } -void BatchPrimitiveProcessorJL::useJoiner(boost::shared_ptr j) -{ - pos = 0; - joiner = j; -} - -bool BatchPrimitiveProcessorJL::nextJoinerMsg(ByteStream& bs) -{ - uint32_t size, toSend; - ISMPacketHeader ism; - - memset((void*)&ism, 0, sizeof(ism)); - - if (smallSide.get() == NULL) - smallSide = joiner->getSmallSide(); - - size = smallSide->size(); - - if (pos == size) - { - /* last message */ - ism.Command = BATCH_PRIMITIVE_END_JOINER; - bs.load((uint8_t*) &ism, sizeof(ism)); - bs << (messageqcpp::ByteStream::quadbyte)sessionID; - bs << (messageqcpp::ByteStream::quadbyte)stepID; - bs << uniqueID; - pos = 0; - return false; - } - - ism.Command = BATCH_PRIMITIVE_ADD_JOINER; - bs.load((uint8_t*) &ism, sizeof(ism)); - bs << (messageqcpp::ByteStream::quadbyte)sessionID; - bs << (messageqcpp::ByteStream::quadbyte)stepID; - bs << uniqueID; - - toSend = (size - pos > 1000000 ? 
1000000 : size - pos); - bs << toSend; - bs << pos; - bs.append((uint8_t*) (&(*smallSide)[pos]), sizeof(ElementType) * toSend); - pos += toSend; - - return true; -} - void BatchPrimitiveProcessorJL::setProjectionRowGroup(const rowgroup::RowGroup& rg) { ot = ROW_GROUP; @@ -1761,8 +1691,6 @@ void BatchPrimitiveProcessorJL::deliverStringTableRowGroup(bool b) aggregateRGPM.setUseStringTable(b); else if (fe2) fe2Output.setUseStringTable(b); -// else if ((joiner.get() != NULL || tJoiners.size() > 0) && sendTupleJoinRowGroupData) -// joinedRG.setUseStringTable(b); else projectionRG.setUseStringTable(b); } diff --git a/dbcon/joblist/batchprimitiveprocessor-jl.h b/dbcon/joblist/batchprimitiveprocessor-jl.h index 8deba48d3..a5f29a02b 100644 --- a/dbcon/joblist/batchprimitiveprocessor-jl.h +++ b/dbcon/joblist/batchprimitiveprocessor-jl.h @@ -126,9 +126,6 @@ public: void createBPP(messageqcpp::ByteStream&) const; void destroyBPP(messageqcpp::ByteStream&) const; - void useJoiner(boost::shared_ptr); - bool nextJoinerMsg(messageqcpp::ByteStream&); - /* Call this one last */ // void addDeliveryStep(const DeliveryStep &); @@ -154,8 +151,7 @@ public: /* Turn a ByteStream into ElementTypes or StringElementTypes */ void getElementTypes(messageqcpp::ByteStream& in, std::vector* out, bool* validCPData, uint64_t* lbid, int64_t* min, int64_t* max, uint32_t* cachedIO, uint32_t* physIO, - uint32_t* touchedBlocks, - uint16_t* preJoinRidCount) const; + uint32_t* touchedBlocks) const; void getStringElementTypes(messageqcpp::ByteStream& in, std::vector* out, bool* validCPData, uint64_t* lbid, int64_t* min, int64_t* max, uint32_t* cachedIO, uint32_t* physIO, @@ -314,7 +310,6 @@ private: /* for Joiner serialization */ bool pickNextJoinerNum(); uint32_t pos, joinerNum; - boost::shared_ptr joiner; boost::shared_ptr > smallSide; boost::scoped_array posByJoinerNum; diff --git a/dbcon/joblist/crossenginestep.h b/dbcon/joblist/crossenginestep.h index 2e9cc79f0..15b2f6df2 100644 --- 
a/dbcon/joblist/crossenginestep.h +++ b/dbcon/joblist/crossenginestep.h @@ -134,7 +134,6 @@ public: { return fAlias; } - void useJoiner(boost::shared_ptr) {} void setJobInfo(const JobInfo* jobInfo) {} void setOutputRowGroup(const rowgroup::RowGroup&); const rowgroup::RowGroup& getOutputRowGroup() const; diff --git a/dbcon/joblist/dictstep-jl.cpp b/dbcon/joblist/dictstep-jl.cpp index b6d7fdb87..cc34f45ac 100644 --- a/dbcon/joblist/dictstep-jl.cpp +++ b/dbcon/joblist/dictstep-jl.cpp @@ -59,6 +59,7 @@ DictStepJL::DictStepJL(const pDictionaryStep& dict) filterString = dict.fFilterString; filterCount = dict.fFilterCount; + charsetNumber = dict.fColType.charsetNumber; } DictStepJL::~DictStepJL() @@ -88,7 +89,7 @@ void DictStepJL::createCommand(ByteStream& bs) const } else bs << filterString; - + bs << charsetNumber; CommandJL::createCommand(bs); } diff --git a/dbcon/joblist/dictstep-jl.h b/dbcon/joblist/dictstep-jl.h index ff5fd8eaa..a9782acf4 100644 --- a/dbcon/joblist/dictstep-jl.h +++ b/dbcon/joblist/dictstep-jl.h @@ -76,6 +76,7 @@ private: std::vector eqFilter; bool hasEqFilter; uint8_t eqOp; // COMPARE_EQ or COMPARE_NE + uint32_t charsetNumber; }; }; // namespace diff --git a/dbcon/joblist/expressionstep.cpp b/dbcon/joblist/expressionstep.cpp index e5d656936..9e949196d 100644 --- a/dbcon/joblist/expressionstep.cpp +++ b/dbcon/joblist/expressionstep.cpp @@ -398,8 +398,10 @@ void ExpressionStep::populateColumnInfo(SimpleColumn* sc, JobInfo& jobInfo) //XXX use this before connector sets colType in sc correctly. // type of pseudo column is set by connector if (dynamic_cast(sc) == NULL) + { ct = jobInfo.csc->colType(sc->oid()); - + ct.charsetNumber =sc->colType().charsetNumber; + } //X if (ct.scale == 0) // keep passed original ct for decimal type sc->resultType(ct); // update from mysql type to calpont type @@ -526,7 +528,10 @@ void ExpressionStep::updateInputIndex(map& indexMap, const J //XXX use this before connector sets colType in sc correctly. 
// type of pseudo column is set by connector if (dynamic_cast(sc) == NULL) + { ct = jobInfo.csc->colType(oid); + ct.charsetNumber =sc->colType().charsetNumber; + } //X dictOid = joblist::isDictCol(ct); diff --git a/dbcon/joblist/groupconcat.cpp b/dbcon/joblist/groupconcat.cpp index d48c5e289..0c8c355a9 100644 --- a/dbcon/joblist/groupconcat.cpp +++ b/dbcon/joblist/groupconcat.cpp @@ -211,6 +211,7 @@ void GroupConcatInfo::mapColumns(const RowGroup& projRG) vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); vector >::iterator i1 = (*k)->fGroupCols.begin(); @@ -229,6 +230,7 @@ void GroupConcatInfo::mapColumns(const RowGroup& projRG) oids.push_back(projRG.getOIDs()[j->second]); keys.push_back(projRG.getKeys()[j->second]); types.push_back(projRG.getColTypes()[j->second]); + csNums.push_back(projRG.getCharsetNumber(j->second)); scale.push_back(projRG.getScale()[j->second]); precision.push_back(projRG.getPrecision()[j->second]); @@ -258,6 +260,7 @@ void GroupConcatInfo::mapColumns(const RowGroup& projRG) oids.push_back(projRG.getOIDs()[j->second]); keys.push_back(projRG.getKeys()[j->second]); types.push_back(projRG.getColTypes()[j->second]); + csNums.push_back(projRG.getCharsetNumber(j->second)); scale.push_back(projRG.getScale()[j->second]); precision.push_back(projRG.getPrecision()[j->second]); } @@ -271,7 +274,7 @@ void GroupConcatInfo::mapColumns(const RowGroup& projRG) i2++; } - (*k)->fRowGroup = RowGroup(oids.size(), pos, oids, keys, types, scale, precision, projRG.getStringTableThreshold(), false); + (*k)->fRowGroup = RowGroup(oids.size(), pos, oids, keys, types, csNums, scale, precision, projRG.getStringTableThreshold(), false); (*k)->fMapping = makeMapping(projRG, (*k)->fRowGroup); } } diff --git a/dbcon/joblist/jlf_common.cpp b/dbcon/joblist/jlf_common.cpp index 576358ea2..fe3195653 100644 --- a/dbcon/joblist/jlf_common.cpp +++ b/dbcon/joblist/jlf_common.cpp @@ -196,7 +196,7 @@ TupleInfo setTupleInfo_(const 
CalpontSystemCatalog::ColType& ct, { //Haven't even seen the table yet, much less this col ti = TupleInfo(fudgeWidth(ct, col_oid), col_oid, col_key, tbl_key, - ct.scale, ct.precision, ct.colDataType); + ct.scale, ct.precision, ct.colDataType, ct.charsetNumber); jobInfo.keyInfo->tupleInfoMap[col_key] = ti; jobInfo.keyInfo->colKeyToTblKey[col_key] = tbl_key; jobInfo.keyInfo->colKeyToTblKey[tbl_key] = tbl_key; diff --git a/dbcon/joblist/jlf_common.h b/dbcon/joblist/jlf_common.h index 5c5a14b8e..fe6c477d9 100644 --- a/dbcon/joblist/jlf_common.h +++ b/dbcon/joblist/jlf_common.h @@ -74,8 +74,8 @@ const int32_t CNX_EXP_TABLE_ID = 999; struct TupleInfo { TupleInfo(uint32_t w = 0, uint32_t o = 0, uint32_t k = -1, uint32_t t = -1, uint32_t s = 0, uint32_t p = 0, - execplan::CalpontSystemCatalog::ColDataType dt = execplan::CalpontSystemCatalog::BIT) : - width(w), oid(o), key(k), tkey(t), scale(s), precision(p), dtype(dt) { } + execplan::CalpontSystemCatalog::ColDataType dt = execplan::CalpontSystemCatalog::BIT, uint32_t csn = 8) : + width(w), oid(o), key(k), tkey(t), scale(s), precision(p), dtype(dt), csNum(csn) { } ~TupleInfo() { } uint32_t width; @@ -85,6 +85,7 @@ struct TupleInfo uint32_t scale; uint32_t precision; execplan::CalpontSystemCatalog::ColDataType dtype; + uint32_t csNum; // For collations }; // for compound join diff --git a/dbcon/joblist/jlf_execplantojoblist.cpp b/dbcon/joblist/jlf_execplantojoblist.cpp index 8ce36a478..35f4110fa 100644 --- a/dbcon/joblist/jlf_execplantojoblist.cpp +++ b/dbcon/joblist/jlf_execplantojoblist.cpp @@ -650,11 +650,15 @@ const JobStepVector doColFilter(const SimpleColumn* sc1, const SimpleColumn* sc2 //XXX use this before connector sets colType in sc correctly. 
// type of pseudo column is set by connector if (!sc1->schemaName().empty() && sc1->isColumnStore() && !pc1) + { ct1 = jobInfo.csc->colType(sc1->oid()); - + ct1.charsetNumber =sc1->colType().charsetNumber; + } if (!sc2->schemaName().empty() && sc2->isColumnStore() && !pc2) + { ct2 = jobInfo.csc->colType(sc2->oid()); - + ct2.charsetNumber =sc2->colType().charsetNumber; + } //X int8_t op = op2num(sop); @@ -1075,11 +1079,15 @@ const JobStepVector doJoin( //XXX use this before connector sets colType in sc correctly. // type of pseudo column is set by connector if (!sc1->schemaName().empty() && sc1->isColumnStore() && !pc1) + { ct1 = jobInfo.csc->colType(sc1->oid()); - + ct1.charsetNumber =sc1->colType().charsetNumber; + } if (!sc2->schemaName().empty() && sc2->isColumnStore() && !pc2) + { ct2 = jobInfo.csc->colType(sc2->oid()); - + ct2.charsetNumber =sc2->colType().charsetNumber; + } //X uint64_t joinInfo = sc1->joinInfo() | sc2->joinInfo(); @@ -1342,8 +1350,10 @@ const JobStepVector doSemiJoin(const SimpleColumn* sc, const ReturnedColumn* rc, //XXX use this before connector sets colType in sc correctly. // type of pseudo column is set by connector if (!sc->schemaName().empty() && sc->isColumnStore() && !pc1) + { ct1 = jobInfo.csc->colType(sc->oid()); - + ct1.charsetNumber =sc->colType().charsetNumber; + } //X JobStepVector jsv; SJSTEP step; @@ -1651,7 +1661,10 @@ const JobStepVector doSimpleFilter(SimpleFilter* sf, JobInfo& jobInfo) //XXX use this before connector sets colType in sc correctly. // type of pseudo column is set by connector if (!sc->schemaName().empty() && sc->isColumnStore() && !pc) + { ct = jobInfo.csc->colType(sc->oid()); + ct.charsetNumber =sc->colType().charsetNumber; + } //X // Because, on a filter, we want to compare ignoring trailing spaces in many cases @@ -2730,7 +2743,10 @@ const JobStepVector doConstantFilter(const ConstantFilter* cf, JobInfo& jobInfo) //XXX use this before connector sets colType in sc correctly. 
// type of pseudo column is set by connector if (!sc->schemaName().empty() && sc->isColumnStore() && !pc) + { ct = jobInfo.csc->colType(sc->oid()); + ct.charsetNumber =sc->colType().charsetNumber; + } //X CalpontSystemCatalog::OID tbOID = tableOid(sc.get(), jobInfo.csc); @@ -3008,8 +3024,10 @@ const JobStepVector doConstantFilter(const ConstantFilter* cf, JobInfo& jobInfo) CalpontSystemCatalog::ColType ct = sc->colType(); if (!sc->schemaName().empty() && sc->isColumnStore() && !pc) + { ct = jobInfo.csc->colType(sc->oid()); - + ct.charsetNumber =sc->colType().charsetNumber; + } TupleInfo ti(setTupleInfo(ct, sc->oid(), jobInfo, tblOid, sc.get(), alias)); //X TupleInfo ti(setTupleInfo(sc->colType(), sc->oid(), jobInfo, tblOid, sc.get(), alias)); pcs->tupleId(ti.key); diff --git a/dbcon/joblist/jlf_subquery.cpp b/dbcon/joblist/jlf_subquery.cpp index 11179eee2..008bdc758 100644 --- a/dbcon/joblist/jlf_subquery.cpp +++ b/dbcon/joblist/jlf_subquery.cpp @@ -800,8 +800,10 @@ void addOrderByAndLimit(CalpontSelectExecutionPlan* csep, JobInfo& jobInfo) //XXX use this before connector sets colType in sc correctly. 
// type of pseudo column is set by connector if (sc->isColumnStore() && !(dynamic_cast(sc))) + { ct = jobInfo.csc->colType(sc->oid()); - + ct.charsetNumber =sc->colType().charsetNumber; + } //X dictOid = isDictCol(ct); } diff --git a/dbcon/joblist/jlf_tuplejoblist.cpp b/dbcon/joblist/jlf_tuplejoblist.cpp index dbcba47ea..8f24193ce 100644 --- a/dbcon/joblist/jlf_tuplejoblist.cpp +++ b/dbcon/joblist/jlf_tuplejoblist.cpp @@ -159,13 +159,14 @@ void tupleKeyToProjectStep(uint32_t key, JobStepVector& jsv, JobInfo& jobInfo) inline void addColumnToRG(uint32_t cid, vector& pos, vector& oids, vector& keys, vector& scale, vector& precision, - vector& types, JobInfo& jobInfo) + vector& types, vector& csNums, JobInfo& jobInfo) { TupleInfo ti(getTupleInfo(cid, jobInfo)); pos.push_back(pos.back() + ti.width); oids.push_back(ti.oid); keys.push_back(ti.key); types.push_back(ti.dtype); + csNums.push_back(ti.csNum); scale.push_back(ti.scale); precision.push_back(ti.precision); } @@ -173,19 +174,20 @@ inline void addColumnToRG(uint32_t cid, vector& pos, vector& inline void addColumnInExpToRG(uint32_t cid, vector& pos, vector& oids, vector& keys, vector& scale, vector& precision, - vector& types, JobInfo& jobInfo) + vector& types, vector& csNums, JobInfo& jobInfo) { if (jobInfo.keyInfo->dictKeyMap.find(cid) != jobInfo.keyInfo->dictKeyMap.end()) cid = jobInfo.keyInfo->dictKeyMap[cid]; if (find(keys.begin(), keys.end(), cid) == keys.end()) - addColumnToRG(cid, pos, oids, keys, scale, precision, types, jobInfo); + addColumnToRG(cid, pos, oids, keys, scale, precision, types, csNums, jobInfo); } inline void addColumnsToRG(uint32_t tid, vector& pos, vector& oids, vector& keys, vector& scale, vector& precision, vector& types, + vector& csNums, TableInfoMap& tableInfoMap, JobInfo& jobInfo) { // -- the selected columns @@ -193,7 +195,7 @@ inline void addColumnsToRG(uint32_t tid, vector& pos, vector for (unsigned i = 0; i < pjCol.size(); i++) { - addColumnToRG(pjCol[i], pos, oids, keys, scale, 
precision, types, jobInfo); + addColumnToRG(pjCol[i], pos, oids, keys, scale, precision, types, csNums, jobInfo); } // -- any columns will be used in cross-table exps @@ -201,7 +203,7 @@ inline void addColumnsToRG(uint32_t tid, vector& pos, vector for (unsigned i = 0; i < exp2.size(); i++) { - addColumnInExpToRG(exp2[i], pos, oids, keys, scale, precision, types, jobInfo); + addColumnInExpToRG(exp2[i], pos, oids, keys, scale, precision, types, csNums, jobInfo); } // -- any columns will be used in returned exps @@ -209,7 +211,7 @@ inline void addColumnsToRG(uint32_t tid, vector& pos, vector for (unsigned i = 0; i < expr.size(); i++) { - addColumnInExpToRG(expr[i], pos, oids, keys, scale, precision, types, jobInfo); + addColumnInExpToRG(expr[i], pos, oids, keys, scale, precision, types, csNums, jobInfo); } // -- any columns will be used in final outer join expression @@ -217,7 +219,7 @@ inline void addColumnsToRG(uint32_t tid, vector& pos, vector for (unsigned i = 0; i < expo.size(); i++) { - addColumnInExpToRG(expo[i], pos, oids, keys, scale, precision, types, jobInfo); + addColumnInExpToRG(expo[i], pos, oids, keys, scale, precision, types, csNums, jobInfo); } } @@ -232,6 +234,7 @@ void constructJoinedRowGroup(RowGroup& rg, uint32_t large, uint32_t prev, bool r vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); // -- start with the join keys @@ -242,14 +245,14 @@ void constructJoinedRowGroup(RowGroup& rg, uint32_t large, uint32_t prev, bool r vector& joinKeys = jobInfo.tableJoinMap[make_pair(large, prev)].fLeftKeys; for (vector::iterator i = joinKeys.begin(); i != joinKeys.end(); i++) - addColumnToRG(*i, pos, oids, keys, scale, precision, types, jobInfo); + addColumnToRG(*i, pos, oids, keys, scale, precision, types, csNums, jobInfo); } // -- followed by the columns in select or expression for (set::iterator i = tableSet.begin(); i != tableSet.end(); i++) - addColumnsToRG(*i, pos, oids, keys, scale, precision, types, tableInfoMap, 
jobInfo); + addColumnsToRG(*i, pos, oids, keys, scale, precision, types, csNums, tableInfoMap, jobInfo); - RowGroup tmpRg(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + RowGroup tmpRg(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); rg = tmpRg; } @@ -264,12 +267,13 @@ void constructJoinedRowGroup(RowGroup& rg, set& tableSet, TableInfoMap vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); for (set::iterator i = tableSet.begin(); i != tableSet.end(); i++) { // columns in select or expression - addColumnsToRG(*i, pos, oids, keys, scale, precision, types, tableInfoMap, jobInfo); + addColumnsToRG(*i, pos, oids, keys, scale, precision, types, csNums, tableInfoMap, jobInfo); // keys to be joined if not already in the rowgroup vector& adjList = tableInfoMap[*i].fAdjacentList; @@ -284,13 +288,13 @@ void constructJoinedRowGroup(RowGroup& rg, set& tableSet, TableInfoMap for (vector::iterator k = joinKeys.begin(); k != joinKeys.end(); k++) { if (find(keys.begin(), keys.end(), *k) == keys.end()) - addColumnToRG(*k, pos, oids, keys, scale, precision, types, jobInfo); + addColumnToRG(*k, pos, oids, keys, scale, precision, types, csNums, jobInfo); } } } } - RowGroup tmpRg(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + RowGroup tmpRg(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); rg = tmpRg; } @@ -339,6 +343,7 @@ void adjustLastStep(JobStepVector& querySteps, DeliveredTableMap& deliverySteps, vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); for (unsigned i = 0; i < v.size(); i++) @@ -347,11 +352,12 @@ void adjustLastStep(JobStepVector& querySteps, DeliveredTableMap& deliverySteps, oids.push_back(v[i].oid); keys.push_back(v[i].key); types.push_back(v[i].dtype); + csNums.push_back(v[i].csNum); scale.push_back(v[i].scale); 
precision.push_back(v[i].precision); } - RowGroup rg1(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + RowGroup rg1(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); // evaluate the returned/groupby expressions if any JobStepVector& expSteps = jobInfo.returnedExpressions; @@ -365,6 +371,7 @@ void adjustLastStep(JobStepVector& querySteps, DeliveredTableMap& deliverySteps, scale.clear(); precision.clear(); types.clear(); + csNums.clear(); pos.push_back(2); const vector& keys0 = rg0->getKeys(); @@ -377,6 +384,7 @@ void adjustLastStep(JobStepVector& querySteps, DeliveredTableMap& deliverySteps, oids.push_back(v[i].oid); keys.push_back(v[i].key); types.push_back(v[i].dtype); + csNums.push_back(v[i].csNum); scale.push_back(v[i].scale); precision.push_back(v[i].precision); } @@ -384,7 +392,7 @@ void adjustLastStep(JobStepVector& querySteps, DeliveredTableMap& deliverySteps, // for v0.9.3.0, the output and input to the expression are in the same row // add the returned column into the rg0 as rg01 - RowGroup rg01 = *rg0 + RowGroup(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + RowGroup rg01 = *rg0 + RowGroup(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); if (jobInfo.trace) cout << "Output RowGroup 01: " << rg01.toString() << endl; @@ -640,6 +648,7 @@ void addProjectStepsToBps(TableInfoMap::iterator& mit, BatchPrimitive* bps, JobI vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); // this psv is a copy of the project steps, the original vector in mit is not changed @@ -730,6 +739,7 @@ void addProjectStepsToBps(TableInfoMap::iterator& mit, BatchPrimitive* bps, JobI oids.push_back(ti.oid); keys.push_back(ti.key); types.push_back(ti.dtype); + csNums.push_back(ti.csNum); scale.push_back(ti.scale); precision.push_back(ti.precision); } @@ -742,12 +752,13 @@ void 
addProjectStepsToBps(TableInfoMap::iterator& mit, BatchPrimitive* bps, JobI oids.push_back(ti.oid); keys.push_back(ti.key); types.push_back(ti.dtype); + csNums.push_back(ti.csNum); scale.push_back(ti.scale); precision.push_back(ti.precision); } // construct RowGroup - RowGroup rg(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + RowGroup rg(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); // fix the output association AnyDataListSPtr spdl(new AnyDataList()); @@ -818,6 +829,7 @@ void addExpresssionStepsToBps(TableInfoMap::iterator& mit, SJSTEP& sjsp, JobInfo vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); vector cols; @@ -854,12 +866,13 @@ void addExpresssionStepsToBps(TableInfoMap::iterator& mit, SJSTEP& sjsp, JobInfo oids.push_back(ti.oid); keys.push_back(ti.key); types.push_back(ti.dtype); + csNums.push_back(ti.csNum); scale.push_back(ti.scale); precision.push_back(ti.precision); } // construct RowGroup and add to TBPS - RowGroup rg(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + RowGroup rg(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); bps->setFE1Input(rg); if (jobInfo.trace) cout << "FE1 input RowGroup: " << rg.toString() << endl << endl; @@ -1025,6 +1038,7 @@ bool combineJobStepsByTable(TableInfoMap::iterator& mit, JobInfo& jobInfo) vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); pos.push_back(2 + 8); @@ -1033,10 +1047,11 @@ bool combineJobStepsByTable(TableInfoMap::iterator& mit, JobInfo& jobInfo) uint32_t keyId = pds->tupleId(); keys.push_back(keyId); types.push_back(CalpontSystemCatalog::BIGINT); + csNums.push_back(pds->colType().charsetNumber); scale.push_back(0); precision.push_back(0); - RowGroup rg(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + RowGroup rg(oids.size(), pos, oids, 
keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); if (jobInfo.trace) cout << "RowGroup pds(and): " << rg.toString() << endl; @@ -1341,6 +1356,7 @@ bool combineJobStepsByTable(TableInfoMap::iterator& mit, JobInfo& jobInfo) vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); for (unsigned i = 0; i < tis.size(); i++) @@ -1349,11 +1365,12 @@ bool combineJobStepsByTable(TableInfoMap::iterator& mit, JobInfo& jobInfo) oids.push_back(tis[i].oid); keys.push_back(tis[i].key); types.push_back(tis[i].dtype); + csNums.push_back(tis[i].csNum); scale.push_back(tis[i].scale); precision.push_back(tis[i].precision); } - RowGroup addRg(oids.size(), pos, oids, keys, types, scale, precision, + RowGroup addRg(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); RowGroup feRg1 = feRg; @@ -3985,6 +4002,7 @@ SJSTEP unionQueries(JobStepVector& queries, uint64_t distinctUnionNum, JobInfo& vector precision; vector width; vector types; + vector csNums; JobStepAssociation jsaToUnion; // bug4388, share code with connector for column type coversion @@ -4009,10 +4027,12 @@ SJSTEP unionQueries(JobStepVector& queries, uint64_t distinctUnionNum, JobInfo& const vector& scaleIn = rg.getScale(); const vector& precisionIn = rg.getPrecision(); const vector& typesIn = rg.getColTypes(); - + const vector& csNumsIn = rg.getCharsetNumbers(); + for (uint64_t j = 0; j < colCount; ++j) { queryColTypes[j][i].colDataType = typesIn[j]; + queryColTypes[j][i].charsetNumber = csNumsIn[j]; queryColTypes[j][i].scale = scaleIn[j]; queryColTypes[j][i].precision = precisionIn[j]; queryColTypes[j][i].colWidth = rg.getColumnWidth(j); @@ -4054,6 +4074,7 @@ SJSTEP unionQueries(JobStepVector& queries, uint64_t distinctUnionNum, JobInfo& { CalpontSystemCatalog::ColType colType = DataConvert::convertUnionColType(queryColTypes[j]); types.push_back(colType.colDataType); + csNums.push_back(colType.charsetNumber); scale.push_back(colType.scale); 
precision.push_back(colType.precision); width.push_back(colType.colWidth); @@ -4067,7 +4088,7 @@ SJSTEP unionQueries(JobStepVector& queries, uint64_t distinctUnionNum, JobInfo& unionStep->setInputRowGroups(inputRGs); unionStep->setDistinctFlags(distinct); - unionStep->setOutputRowGroup(RowGroup(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold)); + unionStep->setOutputRowGroup(RowGroup(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold)); // Fix for bug 4388 adjusts the result type at connector side, this workaround is obsolete. // bug 3067, update the returned column types. diff --git a/dbcon/joblist/joblistfactory.cpp b/dbcon/joblist/joblistfactory.cpp index 392387d4b..d762d912c 100644 --- a/dbcon/joblist/joblistfactory.cpp +++ b/dbcon/joblist/joblistfactory.cpp @@ -128,8 +128,10 @@ void projectSimpleColumn(const SimpleColumn* sc, JobStepVector& jsv, JobInfo& jo //XXX use this before connector sets colType in sc correctly. // type of pseudo column is set by connector if (sc->isColumnStore() && !pc) + { ct = jobInfo.csc->colType(sc->oid()); - + ct.charsetNumber =sc->colType().charsetNumber; + } //X if (pc == NULL) pcs = new pColStep(oid, tbl_oid, ct, jobInfo); @@ -717,8 +719,10 @@ const JobStepVector doAggProject(const CalpontSelectExecutionPlan* csep, JobInfo //XXX use this before connector sets colType in sc correctly. if (sc->isColumnStore() && dynamic_cast(sc) == NULL) + { ct = jobInfo.csc->colType(sc->oid()); - + ct.charsetNumber =sc->colType().charsetNumber; + } //X dictOid = isDictCol(ct); } @@ -1007,7 +1011,10 @@ const JobStepVector doAggProject(const CalpontSelectExecutionPlan* csep, JobInfo //XXX use this before connector sets colType in sc correctly. 
if (sc->isColumnStore() && dynamic_cast(sc) == NULL) + { ct = jobInfo.csc->colType(sc->oid()); + ct.charsetNumber =sc->colType().charsetNumber; + } //X dictOid = isDictCol(ct); @@ -1160,7 +1167,10 @@ const JobStepVector doAggProject(const CalpontSelectExecutionPlan* csep, JobInfo //XXX use this before connector sets colType in sc correctly. if (sc->isColumnStore() && dynamic_cast(sc) == NULL) + { ct = jobInfo.csc->colType(sc->oid()); + ct.charsetNumber =sc->colType().charsetNumber; + } //X dictOid = isDictCol(ct); @@ -1646,7 +1656,10 @@ void parseExecutionPlan(CalpontSelectExecutionPlan* csep, JobInfo& jobInfo, //XXX use this before connector sets colType in sc correctly. if (sc->isColumnStore() && dynamic_cast(sc) == NULL) + { ct = jobInfo.csc->colType(sc->oid()); + ct.charsetNumber =sc->colType().charsetNumber; + } //X diff --git a/dbcon/joblist/pdictionaryscan.cpp b/dbcon/joblist/pdictionaryscan.cpp index 39bb561d0..cf1c9af50 100644 --- a/dbcon/joblist/pdictionaryscan.cpp +++ b/dbcon/joblist/pdictionaryscan.cpp @@ -138,7 +138,7 @@ pDictionaryScan::pDictionaryScan( sendWaiting(false), ridCount(0), ridList(0), - colType(ct), + fColType(ct), pThread(0), cThread(0), fScanLbidReqLimit(jobInfo.rm->getJlScanLbidReqLimit()), @@ -505,7 +505,8 @@ void pDictionaryScan::sendAPrimitiveMessage( hdr.COP2 = fCOP2; hdr.NVALS = fFilterCount; hdr.Count = msgLbidCount; - hdr.CompType = colType.ddn.compressionType; + hdr.CompType = fColType.ddn.compressionType; + hdr.charsetNumber = fColType.charsetNumber; idbassert(hdr.Count > 0); if (isEquality) @@ -628,7 +629,8 @@ void pDictionaryScan::receivePrimitiveMessages() if (fOid >= 3000 && traceOn() && dlTimes.FirstReadTime().tv_sec == 0) dlTimes.setFirstReadTime(); - if (fOid >= 3000 && traceOn()) dlTimes.setLastReadTime(); + if (fOid >= 3000 && traceOn()) + dlTimes.setLastReadTime(); if (bs->length() == 0) { diff --git a/dbcon/joblist/primitivemsg.h b/dbcon/joblist/primitivemsg.h index 6b959d014..f2cad3e76 100644 --- 
a/dbcon/joblist/primitivemsg.h +++ b/dbcon/joblist/primitivemsg.h @@ -560,6 +560,7 @@ struct TokenByScanRequestHeader uint16_t flags; uint32_t Pad2; uint16_t Count; + uint32_t charsetNumber; }; // what follows is NVALS DataValues. // compatibility with Ron's stuff. diff --git a/dbcon/joblist/primitivestep.h b/dbcon/joblist/primitivestep.h index c6e9134ff..72f9c7dbf 100644 --- a/dbcon/joblist/primitivestep.h +++ b/dbcon/joblist/primitivestep.h @@ -925,7 +925,7 @@ public: { return fOutType; } - void getOutputType(BPSOutputType ot) + void setOutputType(BPSOutputType ot) { fOutType = ot; } @@ -960,6 +960,11 @@ public: void appendFilter(const messageqcpp::ByteStream& filter, unsigned count); virtual void abort(); + + const execplan::CalpontSystemCatalog::ColType& colType() const + { + return fColType; + } protected: void sendError(uint16_t error); @@ -992,7 +997,7 @@ private: uint32_t fLogicalBlocksPerScan; DataList* ridList; messageqcpp::ByteStream fFilterString; - execplan::CalpontSystemCatalog::ColType colType; + execplan::CalpontSystemCatalog::ColType fColType; uint64_t pThread; //producer thread. thread pool handle uint64_t cThread; //consumer thread. 
thread pool handle DataList_t* requestList; @@ -1057,7 +1062,6 @@ public: virtual bool wasStepRun() const = 0; virtual BPSOutputType getOutputType() const = 0; virtual uint64_t getRows() const = 0; - virtual void useJoiner(boost::shared_ptr) = 0; virtual void setJobInfo(const JobInfo* jobInfo) = 0; virtual void setOutputRowGroup(const rowgroup::RowGroup& rg) = 0; virtual const rowgroup::RowGroup& getOutputRowGroup() const = 0; @@ -1239,7 +1243,6 @@ public: { return uniqueID; } - void useJoiner(boost::shared_ptr); void useJoiner(boost::shared_ptr); void useJoiners(const std::vector >&); bool wasStepRun() const diff --git a/dbcon/joblist/subquerytransformer.cpp b/dbcon/joblist/subquerytransformer.cpp index b66bcb310..cd5583c26 100644 --- a/dbcon/joblist/subquerytransformer.cpp +++ b/dbcon/joblist/subquerytransformer.cpp @@ -198,6 +198,7 @@ SJSTEP& SubQueryTransformer::makeSubQueryStep(execplan::CalpontSelectExecutionPl vector scale; vector precision; vector types; + vector csNums; pos.push_back(2); CalpontSystemCatalog::OID tblOid = fVtable.tableOid(); @@ -229,6 +230,7 @@ SJSTEP& SubQueryTransformer::makeSubQueryStep(execplan::CalpontSelectExecutionPl { ct.colWidth = row.getColumnWidth(i); ct.colDataType = row.getColTypes()[i]; + ct.charsetNumber = row.getCharsetNumber(i); ct.scale = row.getScale(i); if (colDataTypeInRg != CalpontSystemCatalog::FLOAT && @@ -268,6 +270,7 @@ SJSTEP& SubQueryTransformer::makeSubQueryStep(execplan::CalpontSelectExecutionPl oids.push_back(ti.oid); keys.push_back(ti.key); types.push_back(ti.dtype); + csNums.push_back(ti.csNum); scale.push_back(ti.scale); precision.push_back(ti.precision); } @@ -276,7 +279,7 @@ SJSTEP& SubQueryTransformer::makeSubQueryStep(execplan::CalpontSelectExecutionPl fVtable.columnType(i); } - RowGroup rg1(oids.size(), pos, oids, keys, types, scale, precision, csep->stringTableThreshold()); + RowGroup rg1(oids.size(), pos, oids, keys, types, csNums, scale, precision, csep->stringTableThreshold()); 
rg1.setUseStringTable(rg.usesStringTable()); dynamic_cast(fSubQueryStep.get())->setOutputRowGroup(rg1); diff --git a/dbcon/joblist/tuple-bps.cpp b/dbcon/joblist/tuple-bps.cpp index f70bb7b8e..f8da404ff 100644 --- a/dbcon/joblist/tuple-bps.cpp +++ b/dbcon/joblist/tuple-bps.cpp @@ -2774,10 +2774,6 @@ void TupleBPS::useJoiners(const vector >& fBPP->useJoiners(tjoiners); } -void TupleBPS::useJoiner(boost::shared_ptr j) -{ -} - void TupleBPS::newPMOnline(uint32_t connectionNumber) { ByteStream bs; diff --git a/dbcon/joblist/tupleaggregatestep.cpp b/dbcon/joblist/tupleaggregatestep.cpp index eb4cc52d9..8ba3473c5 100644 --- a/dbcon/joblist/tupleaggregatestep.cpp +++ b/dbcon/joblist/tupleaggregatestep.cpp @@ -758,13 +758,14 @@ void TupleAggregateStep::configDeliveredRowGroup(const JobInfo& jobInfo) vector::const_iterator offsets0 = fRowGroupOut.getOffsets().begin(); vector::const_iterator types0 = fRowGroupOut.getColTypes().begin(); - + vector csNums = fRowGroupOut.getCharsetNumbers(); vector::const_iterator precision0 = fRowGroupOut.getPrecision().begin(); fRowGroupDelivered = RowGroup(retColCount, vector(offsets0, offsets0 + retColCount + 1), vector(oids.begin(), oids.begin() + retColCount), vector(keys.begin(), keys.begin() + retColCount), vector(types0, types0 + retColCount), + vector(csNums.begin(), csNums.begin() + retColCount), vector(scale.begin(), scale.begin() + retColCount), vector(precision0, precision0 + retColCount), jobInfo.stringTableThreshold); @@ -1037,6 +1038,7 @@ void TupleAggregateStep::prep1PhaseAggregate( const vector& scaleProj = projRG.getScale(); const vector& precisionProj = projRG.getPrecision(); const vector& typeProj = projRG.getColTypes(); + const vector& csNumProj = projRG.getCharsetNumbers(); vector posAgg; vector oidsAgg; @@ -1044,6 +1046,7 @@ void TupleAggregateStep::prep1PhaseAggregate( vector scaleAgg; vector precisionAgg; vector typeAgg; + vector csNumAgg; vector widthAgg; vector groupBy; vector functionVec; @@ -1108,6 +1111,7 @@ void 
TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(ti.scale); precisionAgg.push_back(ti.precision); typeAgg.push_back(ti.dtype); + csNumAgg.push_back(ti.csNum); widthAgg.push_back(ti.width); SP_ROWAGG_FUNC_t funct(new RowAggFunctionCol( aggOp, stats, 0, outIdx, jobInfo.cntStarPos)); @@ -1126,6 +1130,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(ti.scale); precisionAgg.push_back(ti.precision); typeAgg.push_back(ti.dtype); + csNumAgg.push_back(ti.csNum); widthAgg.push_back(width); SP_ROWAGG_FUNC_t funct(new RowAggFunctionCol( aggOp, stats, 0, outIdx, -1)); @@ -1174,6 +1179,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(width[colProj]); if (groupBy[it->second]->fOutputColumnIndex == (uint32_t) - 1) @@ -1199,6 +1205,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(ti.scale); precisionAgg.push_back(ti.precision); typeAgg.push_back(ti.dtype); + csNumAgg.push_back(ti.csNum); widthAgg.push_back(ti.width); ++outIdx; continue; @@ -1212,6 +1219,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(width[colProj]); ++outIdx; continue; @@ -1224,6 +1232,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(width[colProj]); ++outIdx; continue; @@ -1291,7 +1300,8 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); - widthAgg.push_back(width[colProj]); + 
csNumAgg.push_back(csNumProj[colProj]); + widthAgg.push_back(width[colProj]); } break; @@ -1321,6 +1331,7 @@ void TupleAggregateStep::prep1PhaseAggregate( oidsAgg.push_back(oidsProj[colProj]); keysAgg.push_back(key); typeAgg.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAgg.push_back(csNumProj[colProj]); precisionAgg.push_back(-1); widthAgg.push_back(sizeof(long double)); scaleAgg.push_back(0); @@ -1336,6 +1347,7 @@ void TupleAggregateStep::prep1PhaseAggregate( // work around count() in select subquery precisionAgg.push_back(9999); typeAgg.push_back(CalpontSystemCatalog::UBIGINT); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(bigIntWidth); } break; @@ -1365,6 +1377,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(0); typeAgg.push_back(CalpontSystemCatalog::DOUBLE); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(sizeof(double)); } break; @@ -1387,6 +1400,7 @@ void TupleAggregateStep::prep1PhaseAggregate( typeAgg.push_back(CalpontSystemCatalog::BIGINT); } + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(bigIntWidth); } break; @@ -1406,6 +1420,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(udafFuncCol->fUDAFContext.getScale()); precisionAgg.push_back(udafFuncCol->fUDAFContext.getPrecision()); typeAgg.push_back(udafFuncCol->fUDAFContext.getResultType()); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(udafFuncCol->fUDAFContext.getColWidth()); break; } @@ -1507,6 +1522,7 @@ void TupleAggregateStep::prep1PhaseAggregate( precisionAgg.push_back(0); precisionAgg.push_back(0); typeAgg.push_back(CalpontSystemCatalog::UBIGINT); + csNumAgg.push_back(8); widthAgg.push_back(bigUintWidth); continue; } @@ -1522,6 +1538,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(0); precisionAgg.push_back(-1); typeAgg.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAgg.push_back(8); widthAgg.push_back(sizeof(long double)); 
++lastCol; @@ -1531,6 +1548,7 @@ void TupleAggregateStep::prep1PhaseAggregate( scaleAgg.push_back(0); precisionAgg.push_back(-1); typeAgg.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAgg.push_back(8); widthAgg.push_back(sizeof(long double)); ++lastCol; } @@ -1541,7 +1559,7 @@ void TupleAggregateStep::prep1PhaseAggregate( for (uint64_t i = 0; i < oidsAgg.size(); i++) posAgg.push_back(posAgg[i] + widthAgg[i]); - RowGroup aggRG(oidsAgg.size(), posAgg, oidsAgg, keysAgg, typeAgg, scaleAgg, precisionAgg, + RowGroup aggRG(oidsAgg.size(), posAgg, oidsAgg, keysAgg, typeAgg, csNumAgg, scaleAgg, precisionAgg, jobInfo.stringTableThreshold); SP_ROWAGG_UM_t rowAgg(new RowAggregationUM(groupBy, functionVec, jobInfo.rm, jobInfo.umMemLimit)); rowAgg->timeZone(jobInfo.timeZone); @@ -1588,6 +1606,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( const vector& scaleProj = projRG.getScale(); const vector& precisionProj = projRG.getPrecision(); const vector& typeProj = projRG.getColTypes(); + const vector& csNumProj = projRG.getCharsetNumbers(); vector posAgg, posAggDist; vector oidsAgg, oidsAggDist; @@ -1595,6 +1614,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( vector scaleAgg, scaleAggDist; vector precisionAgg, precisionAggDist; vector typeAgg, typeAggDist; + vector csNumAgg, csNumAggDist; vector widthProj, widthAgg, widthAggDist; vector groupBy, groupByNoDist; vector functionVec1, functionVec2, functionNoDistVec; @@ -1662,6 +1682,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(widthProj[colProj]); aggFuncMap.insert(make_pair(boost::make_tuple(keysAgg[colAgg], 0, pUDAFFunc, udafc ? 
udafc->getContext().getParamKeys() : NULL), colAgg)); @@ -1703,6 +1724,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(widthProj[colProj]); aggFuncMap.insert(make_pair(boost::make_tuple(keysAgg[colAgg], 0, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL), colAgg)); @@ -1731,6 +1753,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(ti.scale); precisionAgg.push_back(ti.precision); typeAgg.push_back(ti.dtype); + csNumAgg.push_back(ti.csNum); widthAgg.push_back(width); SP_ROWAGG_FUNC_t funct(new RowAggFunctionCol( aggOp, stats, colAgg, colAgg, -1)); @@ -1824,6 +1847,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(widthProj[colProj]); colAgg++; } @@ -1853,6 +1877,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( oidsAgg.push_back(oidsProj[colProj]); keysAgg.push_back(aggKey); typeAgg.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAgg.push_back(8); precisionAgg.push_back(-1); widthAgg.push_back(sizeof(long double)); scaleAgg.push_back(0); @@ -1884,6 +1909,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( typeAgg.push_back(CalpontSystemCatalog::BIGINT); } + csNumAgg.push_back(8); widthAgg.push_back(bigIntWidth); colAgg++; } @@ -1924,6 +1950,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(0); precisionAgg.push_back(-1); typeAgg.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAgg.push_back(8); widthAgg.push_back(sizeof(long double)); ++colAgg; @@ -1933,6 +1960,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(0); precisionAgg.push_back(-1); 
typeAgg.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAgg.push_back(8); widthAgg.push_back(sizeof(long double)); ++colAgg; } @@ -1956,6 +1984,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( typeAgg.push_back(CalpontSystemCatalog::BIGINT); } + csNumAgg.push_back(8); widthAgg.push_back(bigIntWidth); colAgg++; } @@ -1976,6 +2005,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(udafFuncCol->fUDAFContext.getScale()); precisionAgg.push_back(udafFuncCol->fUDAFContext.getPrecision()); typeAgg.push_back(udafFuncCol->fUDAFContext.getResultType()); + csNumAgg.push_back(udafFuncCol->fUDAFContext.getCharsetNumber()); widthAgg.push_back(udafFuncCol->fUDAFContext.getColWidth()); ++colAgg; // Column for index of UDAF UserData struct @@ -1984,6 +2014,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(0); precisionAgg.push_back(0); typeAgg.push_back(CalpontSystemCatalog::UBIGINT); + csNumAgg.push_back(8); widthAgg.push_back(sizeof(uint64_t)); funct->fAuxColumnIndex = colAgg++; // If the first param is const @@ -2004,6 +2035,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAgg.push_back(scaleProj[colProj]); precisionAgg.push_back(precisionProj[colProj]); typeAgg.push_back(typeProj[colProj]); + csNumAgg.push_back(csNumProj[colProj]); widthAgg.push_back(widthProj[colProj]); multiParmIndexes.push_back(colAgg); ++colAgg; @@ -2174,6 +2206,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( oidsAggDist.push_back(oidsAgg[colAgg]); keysAggDist.push_back(retKey); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); precisionAggDist.push_back(-1); widthAggDist.push_back(sizeof(long double)); scaleAggDist.push_back(0); @@ -2188,6 +2221,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( // work around count() in select subquery precisionAggDist.push_back(9999); typeAggDist.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggDist.push_back(8); 
widthAggDist.push_back(bigIntWidth); } break; @@ -2214,6 +2248,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(scaleAgg[colAgg]); precisionAggDist.push_back(precisionAgg[colAgg]); typeAggDist.push_back(typeAgg[colAgg]); + csNumAggDist.push_back(csNumAgg[colAgg]); uint32_t width = widthAgg[colAgg]; if (aggOp == ROWAGG_GROUP_CONCAT) @@ -2250,6 +2285,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( keysAggDist.push_back(retKey); scaleAggDist.push_back(0); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); precisionAggDist.push_back(-1); widthAggDist.push_back(sizeof(long double)); } @@ -2272,7 +2308,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( typeAggDist.push_back(CalpontSystemCatalog::BIGINT); precisionAggDist.push_back(19); } - + csNumAggDist.push_back(8); widthAggDist.push_back(bigIntWidth); } } @@ -2287,6 +2323,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(ti.scale); precisionAggDist.push_back(ti.precision); typeAggDist.push_back(ti.dtype); + csNumAggDist.push_back(ti.csNum); widthAggDist.push_back(ti.width); returnColMissing = false; @@ -2299,6 +2336,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(ti.scale); precisionAggDist.push_back(ti.precision); typeAggDist.push_back(ti.dtype); + csNumAggDist.push_back(ti.csNum); widthAggDist.push_back(ti.width); returnColMissing = false; @@ -2313,6 +2351,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(ti.scale); precisionAggDist.push_back(ti.precision); typeAggDist.push_back(ti.dtype); + csNumAggDist.push_back(ti.csNum); widthAggDist.push_back(ti.width); returnColMissing = false; @@ -2332,6 +2371,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(scaleProj[k] >> 8); precisionAggDist.push_back(precisionProj[k]); typeAggDist.push_back(typeProj[k]); + csNumAggDist.push_back(csNumProj[k]); 
widthAggDist.push_back(widthProj[k]); returnColMissing = false; @@ -2352,6 +2392,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(scaleProj[k] >> 8); precisionAggDist.push_back(precisionProj[k]); typeAggDist.push_back(typeProj[k]); + csNumAggDist.push_back(csNumProj[k]); widthAggDist.push_back(widthProj[k]); returnColMissing = false; @@ -2476,6 +2517,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(19); typeAggDist.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggDist.push_back(8); widthAggDist.push_back(bigIntWidth); } } @@ -2508,6 +2550,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(19); typeAggDist.push_back(CalpontSystemCatalog::BIGINT); + csNumAggDist.push_back(8); widthAggDist.push_back(bigIntWidth); } } @@ -2533,6 +2576,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(0); typeAggDist.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggDist.push_back(8); widthAggDist.push_back(sizeof(uint64_t)); continue; } @@ -2548,6 +2592,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(0); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); widthAggDist.push_back(sizeof(long double)); ++lastCol; @@ -2557,6 +2602,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(-1); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); widthAggDist.push_back(sizeof(long double)); ++lastCol; } @@ -2568,7 +2614,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( for (uint64_t i = 0; i < oidsAgg.size(); i++) posAgg.push_back(posAgg[i] + widthAgg[i]); - RowGroup aggRG(oidsAgg.size(), posAgg, oidsAgg, keysAgg, typeAgg, scaleAgg, precisionAgg, + RowGroup 
aggRG(oidsAgg.size(), posAgg, oidsAgg, keysAgg, typeAgg, csNumAgg, scaleAgg, precisionAgg, jobInfo.stringTableThreshold); SP_ROWAGG_UM_t rowAgg(new RowAggregationUM(groupBy, functionVec1, jobInfo.rm, jobInfo.umMemLimit)); rowAgg->timeZone(jobInfo.timeZone); @@ -2579,7 +2625,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( posAggDist.push_back(posAggDist[i] + widthAggDist[i]); RowGroup aggRgDist(oidsAggDist.size(), posAggDist, oidsAggDist, keysAggDist, typeAggDist, - scaleAggDist, precisionAggDist, jobInfo.stringTableThreshold); + csNumAggDist, scaleAggDist, precisionAggDist, jobInfo.stringTableThreshold); SP_ROWAGG_DIST rowAggDist(new RowAggregationDistinct(groupByNoDist, functionVec2, jobInfo.rm, jobInfo.umMemLimit)); rowAggDist->timeZone(jobInfo.timeZone); @@ -2609,6 +2655,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( vector scaleAggGb, scaleAggSub; vector precisionAggGb, precisionAggSub; vector typeAggGb, typeAggSub; + vector csNumAggGb, csNumAggSub; vector widthAggGb, widthAggSub; // populate groupby column info @@ -2619,6 +2666,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggGb.push_back(scaleProj[i]); precisionAggGb.push_back(precisionProj[i]); typeAggGb.push_back(typeProj[i]); + csNumAggGb.push_back(csNumProj[i]); widthAggGb.push_back(widthProj[i]); } @@ -2647,6 +2695,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggSub = scaleAggGb; precisionAggSub = precisionAggGb; typeAggSub = typeAggGb; + csNumAggSub = csNumAggGb; widthAggSub = widthAggGb; oidsAggSub.push_back(oidsProj[j]); @@ -2654,6 +2703,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( scaleAggSub.push_back(scaleProj[j]); precisionAggSub.push_back(precisionProj[j]); typeAggSub.push_back(typeProj[j]); + csNumAggSub.push_back(csNumProj[j]); widthAggSub.push_back(widthProj[j]); // construct sub-rowgroup @@ -2664,7 +2714,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate( posAggSub.push_back(posAggSub[k] + widthAggSub[k]); 
RowGroup subRg(oidsAggSub.size(), posAggSub, oidsAggSub, keysAggSub, typeAggSub, - scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold); + csNumAggSub, scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold); subRgVec.push_back(subRg); // construct groupby vector @@ -2873,6 +2923,7 @@ void TupleAggregateStep::prep2PhasesAggregate( const vector& scaleProj = projRG.getScale(); const vector& precisionProj = projRG.getPrecision(); const vector& typeProj = projRG.getColTypes(); + const vector& csNumProj = projRG.getCharsetNumbers(); vector posAggPm, posAggUm; vector oidsAggPm, oidsAggUm; @@ -2880,6 +2931,7 @@ void TupleAggregateStep::prep2PhasesAggregate( vector scaleAggPm, scaleAggUm; vector precisionAggPm, precisionAggUm; vector typeAggPm, typeAggUm; + vector csNumAggPm, csNumAggUm; vector widthAggPm, widthAggUm; vector groupByPm, groupByUm; vector functionVecPm, functionVecUm; @@ -2937,6 +2989,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(precisionProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); aggFuncMap.insert(make_pair(boost::make_tuple(keysAggPm[colAggPm], 0, pUDAFFunc, udafc ? 
udafc->getContext().getParamKeys() : NULL), colAggPm)); @@ -2977,6 +3030,7 @@ void TupleAggregateStep::prep2PhasesAggregate( keysAggPm.push_back(key); scaleAggPm.push_back(scaleProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); precisionAggPm.push_back(precisionProj[colProj]); @@ -3071,6 +3125,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(precisionProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); colAggPm++; } @@ -3100,6 +3155,7 @@ void TupleAggregateStep::prep2PhasesAggregate( oidsAggPm.push_back(oidsProj[colProj]); keysAggPm.push_back(aggKey); typeAggPm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggPm.push_back(8); scaleAggPm.push_back(0); precisionAggPm.push_back(-1); widthAggPm.push_back(sizeof(long double)); @@ -3120,6 +3176,7 @@ void TupleAggregateStep::prep2PhasesAggregate( // work around count() in select subquery precisionAggPm.push_back(9999); typeAggPm.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggPm.push_back(8); widthAggPm.push_back(bigIntWidth); colAggPm++; } @@ -3151,6 +3208,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(0); typeAggPm.push_back(CalpontSystemCatalog::DOUBLE); + csNumAggPm.push_back(8); widthAggPm.push_back(sizeof(double)); funct->fAuxColumnIndex = ++colAggPm; @@ -3160,6 +3218,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(0); precisionAggPm.push_back(-1); typeAggPm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggPm.push_back(8); widthAggPm.push_back(sizeof(long double)); ++colAggPm; @@ -3169,6 +3228,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(0); precisionAggPm.push_back(-1); typeAggPm.push_back(CalpontSystemCatalog::LONGDOUBLE); + 
csNumAggPm.push_back(8); widthAggPm.push_back(sizeof(long double)); ++colAggPm; } @@ -3192,6 +3252,7 @@ void TupleAggregateStep::prep2PhasesAggregate( typeAggPm.push_back(CalpontSystemCatalog::BIGINT); } + csNumAggPm.push_back(8); widthAggPm.push_back(bigIntWidth); colAggPm++; } @@ -3212,6 +3273,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(udafFuncCol->fUDAFContext.getScale()); precisionAggPm.push_back(udafFuncCol->fUDAFContext.getPrecision()); typeAggPm.push_back(udafFuncCol->fUDAFContext.getResultType()); + csNumAggPm.push_back(udafFuncCol->fUDAFContext.getCharsetNumber()); widthAggPm.push_back(udafFuncCol->fUDAFContext.getColWidth()); ++colAggPm; // Column for index of UDAF UserData struct @@ -3220,6 +3282,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(0); precisionAggPm.push_back(0); typeAggPm.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggPm.push_back(8); widthAggPm.push_back(bigUintWidth); funct->fAuxColumnIndex = colAggPm++; // If the first param is const @@ -3240,6 +3303,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(precisionProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); colAggPm++; // If the param is const @@ -3353,6 +3417,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(scaleAggPm[colPm]); precisionAggUm.push_back(precisionAggPm[colPm]); typeAggUm.push_back(typeAggPm[colPm]); + csNumAggUm.push_back(csNumAggPm[colPm]); widthAggUm.push_back(widthAggPm[colPm]); } @@ -3379,6 +3444,7 @@ void TupleAggregateStep::prep2PhasesAggregate( keysAggUm.push_back(retKey); scaleAggUm.push_back(0); typeAggUm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggUm.push_back(8); precisionAggUm.push_back(-1); widthAggUm.push_back(sizeof(long double)); } @@ -3393,6 +3459,7 @@ void TupleAggregateStep::prep2PhasesAggregate( 
scaleAggUm.push_back(0); precisionAggUm.push_back(19); typeAggUm.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggUm.push_back(8); widthAggUm.push_back(bigIntWidth); } } @@ -3407,6 +3474,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(ti.scale); precisionAggUm.push_back(ti.precision); typeAggUm.push_back(ti.dtype); + csNumAggUm.push_back(ti.csNum); widthAggUm.push_back(ti.width); returnColMissing = false; @@ -3420,6 +3488,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(ti.scale); precisionAggUm.push_back(ti.precision); typeAggUm.push_back(ti.dtype); + csNumAggUm.push_back(ti.csNum); widthAggUm.push_back(ti.width); returnColMissing = false; @@ -3432,6 +3501,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(ti.scale); precisionAggUm.push_back(ti.precision); typeAggUm.push_back(ti.dtype); + csNumAggUm.push_back(ti.csNum); widthAggUm.push_back(ti.width); returnColMissing = false; @@ -3560,6 +3630,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(0); precisionAggUm.push_back(19); typeAggUm.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggUm.push_back(8); widthAggUm.push_back(bigIntWidth); } } @@ -3585,6 +3656,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(0); precisionAggUm.push_back(0); typeAggUm.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggUm.push_back(8); widthAggUm.push_back(bigUintWidth); continue; } @@ -3600,6 +3672,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(0); precisionAggUm.push_back(-1); typeAggUm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggUm.push_back(8); widthAggUm.push_back(sizeof(long double)); ++lastCol; @@ -3609,6 +3682,7 @@ void TupleAggregateStep::prep2PhasesAggregate( scaleAggUm.push_back(0); precisionAggUm.push_back(-1); typeAggUm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggUm.push_back(8); widthAggUm.push_back(sizeof(long double)); ++lastCol; } @@ -3620,8 +3694,8 
@@ void TupleAggregateStep::prep2PhasesAggregate( for (uint64_t i = 0; i < oidsAggUm.size(); i++) posAggUm.push_back(posAggUm[i] + widthAggUm[i]); - RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm, scaleAggUm, - precisionAggUm, jobInfo.stringTableThreshold); + RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm, + csNumAggUm, scaleAggUm, precisionAggUm, jobInfo.stringTableThreshold); SP_ROWAGG_UM_t rowAggUm(new RowAggregationUMP2(groupByUm, functionVecUm, jobInfo.rm, jobInfo.umMemLimit)); rowAggUm->timeZone(jobInfo.timeZone); rowgroups.push_back(aggRgUm); @@ -3632,8 +3706,8 @@ void TupleAggregateStep::prep2PhasesAggregate( for (uint64_t i = 0; i < oidsAggPm.size(); i++) posAggPm.push_back(posAggPm[i] + widthAggPm[i]); - RowGroup aggRgPm(oidsAggPm.size(), posAggPm, oidsAggPm, keysAggPm, typeAggPm, scaleAggPm, - precisionAggPm, jobInfo.stringTableThreshold); + RowGroup aggRgPm(oidsAggPm.size(), posAggPm, oidsAggPm, keysAggPm, typeAggPm, + csNumAggPm, scaleAggPm, precisionAggPm, jobInfo.stringTableThreshold); SP_ROWAGG_PM_t rowAggPm(new RowAggregation(groupByPm, functionVecPm)); rowAggPm->timeZone(jobInfo.timeZone); rowgroups.push_back(aggRgPm); @@ -3717,6 +3791,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( const vector& scaleProj = projRG.getScale(); const vector& precisionProj = projRG.getPrecision(); const vector& typeProj = projRG.getColTypes(); + const vector& csNumProj = projRG.getCharsetNumbers(); vector posAggPm, posAggUm, posAggDist; vector oidsAggPm, oidsAggUm, oidsAggDist; @@ -3724,6 +3799,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( vector scaleAggPm, scaleAggUm, scaleAggDist; vector precisionAggPm, precisionAggUm, precisionAggDist; vector typeAggPm, typeAggUm, typeAggDist; + vector csNumAggPm, csNumAggUm, csNumAggDist; vector widthAggPm, widthAggUm, widthAggDist; vector groupByPm, groupByUm, groupByNoDist; @@ -3785,6 +3861,7 @@ void 
TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(precisionProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); aggFuncMap.insert(make_pair(boost::make_tuple(keysAggPm[colAggPm], 0, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL), colAggPm)); @@ -3826,6 +3903,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(precisionProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); precisionAggPm.push_back(precisionProj[colProj]); @@ -3927,6 +4005,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(precisionProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); colAggPm++; } @@ -3956,6 +4035,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( oidsAggPm.push_back(oidsProj[colProj]); keysAggPm.push_back(aggKey); typeAggPm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggPm.push_back(8); precisionAggPm.push_back(-1); widthAggPm.push_back(sizeof(long double)); scaleAggPm.push_back(0); @@ -3987,6 +4067,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( typeAggPm.push_back(CalpontSystemCatalog::BIGINT); } + csNumAggPm.push_back(8); widthAggPm.push_back(bigIntWidth); colAggPm++; } @@ -4018,6 +4099,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(0); typeAggPm.push_back(CalpontSystemCatalog::DOUBLE); + csNumAggPm.push_back(8); widthAggPm.push_back(sizeof(double)); funct->fAuxColumnIndex = ++colAggPm; @@ -4027,6 +4109,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(0); 
precisionAggPm.push_back(-1); typeAggPm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggPm.push_back(8); widthAggPm.push_back(sizeof(long double)); ++colAggPm; @@ -4036,6 +4119,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(0); precisionAggPm.push_back(-1); typeAggPm.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggPm.push_back(8); widthAggPm.push_back(sizeof(long double)); ++colAggPm; } @@ -4059,6 +4143,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( typeAggPm.push_back(CalpontSystemCatalog::BIGINT); } + csNumAggPm.push_back(8); widthAggPm.push_back(bigIntWidth); ++colAggPm; } @@ -4079,6 +4164,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(udafFuncCol->fUDAFContext.getScale()); precisionAggPm.push_back(udafFuncCol->fUDAFContext.getPrecision()); typeAggPm.push_back(udafFuncCol->fUDAFContext.getResultType()); + csNumAggPm.push_back(udafFuncCol->fUDAFContext.getCharsetNumber()); widthAggPm.push_back(udafFuncCol->fUDAFContext.getColWidth()); ++colAggPm; // Column for index of UDAF UserData struct @@ -4087,6 +4173,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(0); precisionAggPm.push_back(0); typeAggPm.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggPm.push_back(8); widthAggPm.push_back(sizeof(uint64_t)); funct->fAuxColumnIndex = colAggPm++; // If the first param is const @@ -4107,6 +4194,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggPm.push_back(scaleProj[colProj]); precisionAggPm.push_back(precisionProj[colProj]); typeAggPm.push_back(typeProj[colProj]); + csNumAggPm.push_back(csNumProj[colProj]); widthAggPm.push_back(width[colProj]); multiParmIndexes.push_back(colAggPm); ++colAggPm; @@ -4208,6 +4296,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( precisionAggUm.push_back(precisionAggPm[idx]); widthAggUm.push_back(widthAggPm[idx]); typeAggUm.push_back(typeAggPm[idx]); + 
csNumAggUm.push_back(csNumAggPm[idx]); } } @@ -4325,6 +4414,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( oidsAggDist.push_back(oidsAggUm[colUm]); keysAggDist.push_back(retKey); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); precisionAggDist.push_back(-1); widthAggDist.push_back(sizeof(long double)); scaleAggDist.push_back(0); @@ -4342,6 +4432,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( // work around count() in select subquery precisionAggDist.push_back(9999); typeAggDist.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggDist.push_back(8); widthAggDist.push_back(bigIntWidth); } break; @@ -4365,6 +4456,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(scaleAggUm[colUm]); precisionAggDist.push_back(precisionAggUm[colUm]); typeAggDist.push_back(typeAggUm[colUm]); + csNumAggDist.push_back(csNumAggUm[colUm]); widthAggDist.push_back(widthAggUm[colUm]); } @@ -4391,6 +4483,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( keysAggDist.push_back(retKey); scaleAggDist.push_back(0); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); precisionAggDist.push_back(-1); widthAggDist.push_back(sizeof(long double)); } @@ -4412,6 +4505,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( precisionAggDist.push_back(19); typeAggDist.push_back(CalpontSystemCatalog::BIGINT); } + csNumAggDist.push_back(8); widthAggDist.push_back(bigIntWidth); } } @@ -4426,6 +4520,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(ti.scale); precisionAggDist.push_back(ti.precision); typeAggDist.push_back(ti.dtype); + csNumAggDist.push_back(ti.csNum); widthAggDist.push_back(ti.width); returnColMissing = false; @@ -4439,6 +4534,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(ti.scale); precisionAggDist.push_back(ti.precision); typeAggDist.push_back(ti.dtype); + 
csNumAggDist.push_back(ti.csNum); widthAggDist.push_back(ti.width); returnColMissing = false; @@ -4451,6 +4547,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(ti.scale); precisionAggDist.push_back(ti.precision); typeAggDist.push_back(ti.dtype); + csNumAggDist.push_back(ti.csNum); widthAggDist.push_back(ti.width); returnColMissing = false; @@ -4570,6 +4667,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(19); typeAggDist.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggDist.push_back(8); widthAggDist.push_back(bigIntWidth); } } @@ -4602,6 +4700,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(19); typeAggDist.push_back(CalpontSystemCatalog::BIGINT); + csNumAggDist.push_back(8); widthAggDist.push_back(bigIntWidth); } } @@ -4627,6 +4726,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(0); typeAggDist.push_back(CalpontSystemCatalog::UBIGINT); + csNumAggDist.push_back(8); widthAggDist.push_back(sizeof(uint64_t)); continue; } @@ -4642,6 +4742,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(-1); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); widthAggDist.push_back(sizeof(long double)); ++lastCol; @@ -4651,6 +4752,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggDist.push_back(0); precisionAggDist.push_back(-1); typeAggDist.push_back(CalpontSystemCatalog::LONGDOUBLE); + csNumAggDist.push_back(8); widthAggDist.push_back(sizeof(long double)); ++lastCol; } @@ -4663,8 +4765,8 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( for (uint64_t i = 0; i < oidsAggUm.size(); i++) posAggUm.push_back(posAggUm[i] + widthAggUm[i]); - RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm, scaleAggUm, - 
precisionAggUm, jobInfo.stringTableThreshold); + RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm, + csNumAggUm, scaleAggUm, precisionAggUm, jobInfo.stringTableThreshold); SP_ROWAGG_UM_t rowAggUm(new RowAggregationUMP2(groupByUm, functionNoDistVec, jobInfo.rm, jobInfo.umMemLimit)); rowAggUm->timeZone(jobInfo.timeZone); @@ -4673,8 +4775,9 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( for (uint64_t i = 0; i < oidsAggDist.size(); i++) posAggDist.push_back(posAggDist[i] + widthAggDist[i]); - RowGroup aggRgDist(oidsAggDist.size(), posAggDist, oidsAggDist, keysAggDist, typeAggDist, - scaleAggDist, precisionAggDist, jobInfo.stringTableThreshold); + RowGroup aggRgDist(oidsAggDist.size(), posAggDist, oidsAggDist, keysAggDist, + typeAggDist, csNumAggDist, scaleAggDist, + precisionAggDist, jobInfo.stringTableThreshold); SP_ROWAGG_DIST rowAggDist(new RowAggregationDistinct(groupByNoDist, functionVecUm, jobInfo.rm, jobInfo.umMemLimit)); rowAggDist->timeZone(jobInfo.timeZone); @@ -4695,6 +4798,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( vector scaleAggGb, scaleAggSub; vector precisionAggGb, precisionAggSub; vector typeAggGb, typeAggSub; + vector csNumAggGb, csNumAggSub; vector widthAggGb, widthAggSub; // populate groupby column info @@ -4705,6 +4809,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggGb.push_back(scaleAggUm[i]); precisionAggGb.push_back(precisionAggUm[i]); typeAggGb.push_back(typeAggUm[i]); + csNumAggGb.push_back(csNumAggUm[i]); widthAggGb.push_back(widthAggUm[i]); } @@ -4733,6 +4838,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggSub = scaleAggGb; precisionAggSub = precisionAggGb; typeAggSub = typeAggGb; + csNumAggSub = csNumAggGb; widthAggSub = widthAggGb; oidsAggSub.push_back(oidsAggUm[j]); @@ -4740,6 +4846,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( scaleAggSub.push_back(scaleAggUm[j]); precisionAggSub.push_back(precisionAggUm[j]); 
typeAggSub.push_back(typeAggUm[j]); + csNumAggSub.push_back(csNumAggUm[j]); widthAggSub.push_back(widthAggUm[j]); // construct sub-rowgroup @@ -4750,7 +4857,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( posAggSub.push_back(posAggSub[k] + widthAggSub[k]); RowGroup subRg(oidsAggSub.size(), posAggSub, oidsAggSub, keysAggSub, typeAggSub, - scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold); + csNumAggSub, scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold); subRgVec.push_back(subRg); // construct groupby vector @@ -4908,8 +5015,8 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate( for (uint64_t i = 0; i < oidsAggPm.size(); i++) posAggPm.push_back(posAggPm[i] + widthAggPm[i]); - RowGroup aggRgPm(oidsAggPm.size(), posAggPm, oidsAggPm, keysAggPm, typeAggPm, scaleAggPm, - precisionAggPm, jobInfo.stringTableThreshold); + RowGroup aggRgPm(oidsAggPm.size(), posAggPm, oidsAggPm, keysAggPm, typeAggPm, + csNumAggPm, scaleAggPm, precisionAggPm, jobInfo.stringTableThreshold); SP_ROWAGG_PM_t rowAggPm(new RowAggregation(groupByPm, functionVecPm)); rowAggPm->timeZone(jobInfo.timeZone); rowgroups.push_back(aggRgPm); diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp index 3f9e8a7b4..841817fc5 100644 --- a/dbcon/joblist/tupleannexstep.cpp +++ b/dbcon/joblist/tupleannexstep.cpp @@ -201,6 +201,7 @@ void TupleAnnexStep::initialize(const RowGroup& rgIn, const JobInfo& jobInfo) vector scale, scaleIn = rgIn.getScale(); vector precision, precisionIn = rgIn.getPrecision(); vector types, typesIn = rgIn.getColTypes(); + vector csNums, csNumsIn = rgIn.getCharsetNumbers(); vector pos, posIn = rgIn.getOffsets(); size_t n = jobInfo.nonConstDelCols.size(); @@ -210,9 +211,10 @@ void TupleAnnexStep::initialize(const RowGroup& rgIn, const JobInfo& jobInfo) scale.insert(scale.end(), scaleIn.begin(), scaleIn.begin() + n); precision.insert(precision.end(), precisionIn.begin(), precisionIn.begin() + n); types.insert(types.end(),
typesIn.begin(), typesIn.begin() + n); + csNums.insert(csNums.end(), csNumsIn.begin(), csNumsIn.begin() + n); pos.insert(pos.end(), posIn.begin(), posIn.begin() + n + 1); - fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); } else { diff --git a/dbcon/joblist/tupleconstantstep.cpp b/dbcon/joblist/tupleconstantstep.cpp index 3e2b61e39..6ec49884e 100644 --- a/dbcon/joblist/tupleconstantstep.cpp +++ b/dbcon/joblist/tupleconstantstep.cpp @@ -108,6 +108,7 @@ void TupleConstantStep::initialize(const JobInfo& jobInfo, const RowGroup* rgIn) vector scale, scaleIn = fRowGroupIn.getScale(); vector precision, precisionIn = fRowGroupIn.getPrecision(); vector types, typesIn = fRowGroupIn.getColTypes(); + vector csNums, csNumsIn = fRowGroupIn.getCharsetNumbers(); vector pos; pos.push_back(2); @@ -120,6 +121,7 @@ void TupleConstantStep::initialize(const JobInfo& jobInfo, const RowGroup* rgIn) scaleIn = fRowGroupIn.getScale(); precisionIn = fRowGroupIn.getPrecision(); typesIn = fRowGroupIn.getColTypes(); + csNumsIn = fRowGroupIn.getCharsetNumbers(); } for (uint64_t i = 0, j = 0; i < jobInfo.deliveredCols.size(); i++) @@ -145,6 +147,7 @@ void TupleConstantStep::initialize(const JobInfo& jobInfo, const RowGroup* rgIn) scale.push_back(ct.scale); precision.push_back(ct.precision); types.push_back(ct.colDataType); + csNums.push_back(ct.charsetNumber); pos.push_back(pos.back() + ct.colWidth); fIndexConst.push_back(i); @@ -164,6 +167,7 @@ void TupleConstantStep::initialize(const JobInfo& jobInfo, const RowGroup* rgIn) scale.push_back(scaleIn[j]); precision.push_back(precisionIn[j]); types.push_back(typesIn[j]); + csNums.push_back(csNumsIn[j]); pos.push_back(pos.back() + fRowGroupIn.getColumnWidth(j)); j++; @@ -171,7 +175,7 @@ void TupleConstantStep::initialize(const JobInfo& jobInfo, const RowGroup* rgIn) } } - 
fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, scale, precision, + fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); fRowGroupOut.initRow(&fRowOut); fRowGroupOut.initRow(&fRowConst, true); @@ -644,6 +648,7 @@ void TupleConstantOnlyStep::initialize(const JobInfo& jobInfo, const rowgroup::R vector scale; vector precision; vector types; + vector csNums; vector pos; pos.push_back(2); @@ -673,12 +678,13 @@ void TupleConstantOnlyStep::initialize(const JobInfo& jobInfo, const rowgroup::R scale.push_back(ct.scale); precision.push_back(ct.precision); types.push_back(ct.colDataType); + csNums.push_back(ct.charsetNumber); pos.push_back(pos.back() + ct.colWidth); fIndexConst.push_back(i); } - fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold, false); + fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold, false); fRowGroupOut.initRow(&fRowOut); fRowGroupOut.initRow(&fRowConst, true); diff --git a/dbcon/joblist/tuplehavingstep.cpp b/dbcon/joblist/tuplehavingstep.cpp index 6477e9293..65c62d0a3 100644 --- a/dbcon/joblist/tuplehavingstep.cpp +++ b/dbcon/joblist/tuplehavingstep.cpp @@ -102,6 +102,7 @@ void TupleHavingStep::initialize(const RowGroup& rgIn, const JobInfo& jobInfo) vector scale, scaleIn = fRowGroupIn.getScale(); vector precision, precisionIn = fRowGroupIn.getPrecision(); vector types, typesIn = fRowGroupIn.getColTypes(); + vector csNums, csNumsIn = fRowGroupIn.getCharsetNumbers(); vector pos, posIn = fRowGroupIn.getOffsets(); size_t n = 0; @@ -116,9 +117,10 @@ void TupleHavingStep::initialize(const RowGroup& rgIn, const JobInfo& jobInfo) scale.insert(scale.end(), scaleIn.begin(), scaleIn.begin() + n); precision.insert(precision.end(), precisionIn.begin(), precisionIn.begin() + n); types.insert(types.end(), typesIn.begin(), typesIn.begin() + n); + 
csNums.insert(csNums.end(), csNumsIn.begin(), csNumsIn.begin() + n); pos.insert(pos.end(), posIn.begin(), posIn.begin() + n + 1); - fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, scale, precision, jobInfo.stringTableThreshold); + fRowGroupOut = RowGroup(oids.size(), pos, oids, keys, types, csNums, scale, precision, jobInfo.stringTableThreshold); fRowGroupOut.initRow(&fRowOut); } diff --git a/dbcon/joblist/windowfunctionstep.cpp b/dbcon/joblist/windowfunctionstep.cpp index ab1acd1fe..1da9ef60b 100755 --- a/dbcon/joblist/windowfunctionstep.cpp +++ b/dbcon/joblist/windowfunctionstep.cpp @@ -95,7 +95,10 @@ uint64_t getColumnIndex(const SRCP& c, const map& m, JobInfo //XXX use this before connector sets colType in sc correctly. // type of pseudo column is set by connector if (!(dynamic_cast(sc))) + { ct = jobInfo.csc->colType(sc->oid()); + ct.charsetNumber =sc->colType().charsetNumber; + } //X CalpontSystemCatalog::OID dictOid = isDictCol(ct); @@ -631,6 +634,7 @@ void WindowFunctionStep::initialize(const RowGroup& rg, JobInfo& jobInfo) const vector& oids = rg.getOIDs(); const vector& keys = rg.getKeys(); const vector& types = rg.getColTypes(); + const vector& csNums = rg.getCharsetNumbers(); const vector& scales = rg.getScale(); const vector& precisions = rg.getPrecision(); @@ -869,6 +873,7 @@ void WindowFunctionStep::initialize(const RowGroup& rg, JobInfo& jobInfo) vector scales1; vector precisions1; vector types1; + vector csNums1; pos1.push_back(2); for (size_t i = 0; i < retColCount; i++) @@ -880,10 +885,11 @@ void WindowFunctionStep::initialize(const RowGroup& rg, JobInfo& jobInfo) scales1.push_back(scales[j]); precisions1.push_back(precisions[j]); types1.push_back(types[j]); + csNums1.push_back(csNums[j]); } fRowGroupDelivered = RowGroup( - retColCount, pos1, oids1, keys1, types1, scales1, precisions1, jobInfo.stringTableThreshold); + retColCount, pos1, oids1, keys1, types1, csNums1, scales1, precisions1, jobInfo.stringTableThreshold); if 
(jobInfo.trace) cout << "delivered RG: " << fRowGroupDelivered.toString() << endl << endl; diff --git a/dbcon/mysql/ha_mcs_execplan.cpp b/dbcon/mysql/ha_mcs_execplan.cpp index 991cb556f..8007485d2 100755 --- a/dbcon/mysql/ha_mcs_execplan.cpp +++ b/dbcon/mysql/ha_mcs_execplan.cpp @@ -17,7 +17,8 @@ MA 02110-1301, USA. */ //#define DEBUG_WALK_COND -#include +#include + #include #include #include @@ -85,6 +86,8 @@ using namespace execplan; #include "functor.h" using namespace funcexp; +#include "collation.h" + const uint64_t AGG_BIT = 0x01; const uint64_t SUB_BIT = 0x02; const uint64_t AF_BIT = 0x04; @@ -2950,6 +2953,7 @@ SimpleColumn* getSmallestColumn(boost::shared_ptr csc, sc->viewName(lower(tan.view)); sc->timeZone(gwi.thd->variables.time_zone->get_name()->ptr()); sc->resultType(csc->colType(oidlist[minWidthColOffset].objnum)); + sc->charsetNumber(3000); return sc; } @@ -3113,7 +3117,7 @@ CalpontSystemCatalog::ColType colType_MysqlToIDB (const Item* item) << item->result_type() << endl ); break; } - + ct.charsetNumber = item->collation.collation->number; return ct; } @@ -3406,6 +3410,9 @@ ReturnedColumn* buildReturnedColumn( if (rc && item->name.length) rc->alias(item->name.str); + if (rc) + rc->charsetNumber(item->collation.collation->number); + return rc; } @@ -4080,6 +4087,7 @@ ReturnedColumn* buildFunctionColumn( fc->operationType(functor->operationType(funcParms, fc->resultType())); fc->expressionId(ci->expressionId++); + fc->charsetNumber(ifp->collation.collation->number); } else if (ifp->type() == Item::COND_ITEM || ifp->functype() == Item_func::EQ_FUNC || @@ -4348,6 +4356,7 @@ ConstantColumn* buildDecimalColumn(Item* item, gp_walk_info& gwi) columnstore_decimal.precision = idp->max_length - idp->decimals; ConstantColumn* cc = new ConstantColumn(valStr, columnstore_decimal); cc->timeZone(gwi.thd->variables.time_zone->get_name()->ptr()); + cc->charsetNumber(idp->collation.collation->number); return cc; } @@ -4467,8 +4476,8 @@ SimpleColumn* 
buildSimpleColumn(Item_field* ifp, gp_walk_info& gwi) default: sc = new SimpleColumn(ifp->db_name.str, bestTableName(ifp), ifp->field_name.str, columnStore, gwi.sessionid); } - sc->resultType(ct); + sc->charsetNumber(ifp->collation.collation->number); string tbname(ifp->table_name.str); if (isInformationSchema) @@ -5004,6 +5013,7 @@ ReturnedColumn* buildAggregateColumn(Item* item, gp_walk_info& gwi) colType.dataType = resultType.colDataType; colType.precision = resultType.precision; colType.scale = resultType.scale; + colType.charsetNumber = resultType.charsetNumber; colTypes[i] = colType; } @@ -5079,6 +5089,7 @@ ReturnedColumn* buildAggregateColumn(Item* item, gp_walk_info& gwi) return NULL; } + ac->charsetNumber(item->collation.collation->number); return ac; } @@ -6706,7 +6717,8 @@ int processLimitAndOffset( } // We don't currently support limit with correlated subquery - if (gwi.subQuery && !gwi.correlatedTbNameVec.empty() && csep->hasOrderBy()) + if (csep->limitNum() != (uint64_t) - 1 && + gwi.subQuery && !gwi.correlatedTbNameVec.empty()) { gwi.fatalParseError = true; gwi.parseErrorText = IDBErrorInfo::instance()->errorMsg(ERR_NON_SUPPORT_LIMIT_SUB); diff --git a/dbcon/mysql/ha_window_function.cpp b/dbcon/mysql/ha_window_function.cpp index 7c039cc75..e76a161fa 100644 --- a/dbcon/mysql/ha_window_function.cpp +++ b/dbcon/mysql/ha_window_function.cpp @@ -405,6 +405,7 @@ ReturnedColumn* buildWindowFunctionColumn(Item* item, gp_walk_info& gwi, bool& n colType.dataType = resultType.colDataType; colType.precision = resultType.precision; colType.scale = resultType.scale; + colType.charsetNumber = resultType.charsetNumber; colTypes[i] = colType; } @@ -938,6 +939,8 @@ ReturnedColumn* buildWindowFunctionColumn(Item* item, gp_walk_info& gwi, bool& n if (item->full_name()) ac->alias(item->full_name()); + ac->charsetNumber(item->collation.collation->number); + // put ac on windowFuncList gwi.windowFuncList.push_back(ac); return ac; diff --git a/dbcon/mysql/sm.cpp 
b/dbcon/mysql/sm.cpp index b1a1dad63..5acef3805 100644 --- a/dbcon/mysql/sm.cpp +++ b/dbcon/mysql/sm.cpp @@ -20,7 +20,8 @@ * ***********************************************************************/ -#include + +#include #include #include #include diff --git a/ddlproc/ddlproc.cpp b/ddlproc/ddlproc.cpp index 5cc663fc2..b8b231cb2 100644 --- a/ddlproc/ddlproc.cpp +++ b/ddlproc/ddlproc.cpp @@ -39,8 +39,8 @@ using namespace oam; #include "distributedenginecomm.h" using namespace joblist; -#include "boost/filesystem/operations.hpp" -#include "boost/filesystem/path.hpp" +//#include "boost/filesystem/operations.hpp" +//#include "boost/filesystem/path.hpp" #include "boost/progress.hpp" #include #include @@ -64,8 +64,7 @@ using namespace execplan; #include "crashtrace.h" #include "installdir.h" - -namespace fs = boost::filesystem; +#include "collation.h" namespace { @@ -97,10 +96,11 @@ void added_a_pm(int) int main(int argc, char* argv[]) { - // get and set locale language - string systemLang = "C"; - systemLang = funcexp::utf8::idb_setlocale(); - + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); + // Initialize the charset library + my_init(); // This is unset due to the way we start it program_invocation_short_name = const_cast("DDLProc"); diff --git a/dmlproc/dmlproc.cpp b/dmlproc/dmlproc.cpp index 09e5b9128..617023973 100644 --- a/dmlproc/dmlproc.cpp +++ b/dmlproc/dmlproc.cpp @@ -25,8 +25,8 @@ #include #include #include -#include "boost/filesystem/operations.hpp" -#include "boost/filesystem/path.hpp" +//#include "boost/filesystem/operations.hpp" +//#include "boost/filesystem/path.hpp" #include "boost/progress.hpp" using namespace std; @@ -84,7 +84,7 @@ using namespace joblist; #include "crashtrace.h" #include "installdir.h" -namespace fs = boost::filesystem; +#include "collation.h" threadpool::ThreadPool DMLServer::fDmlPackagepool(10, 0); @@ -511,13 +511,13 @@ int8_t setupCwd() int main(int argc, char* argv[]) { - // get and set locale 
language - string systemLang = "C"; - BRM::DBRM dbrm; Oam oam; - //BUG 5362 - systemLang = funcexp::utf8::idb_setlocale(); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); + // Initialize the charset library + my_init(); // This is unset due to the way we start it program_invocation_short_name = const_cast("DMLProc"); diff --git a/exemgr/main.cpp b/exemgr/main.cpp index 146709e1d..036fbbb75 100644 --- a/exemgr/main.cpp +++ b/exemgr/main.cpp @@ -39,13 +39,12 @@ * on the Front-End Processor where it is returned to the DBMS * front-end. */ - #include #include #include - #include +#undef root_name #include #include "calpontselectexecutionplan.h" @@ -70,7 +69,6 @@ #include "liboamcpp.h" #include "crashtrace.h" #include "utils_utf8.h" -#include "mcsconfig.h" #include #include @@ -78,6 +76,8 @@ #include "dbrm.h" +#include "collation.h" + namespace { @@ -1437,9 +1437,11 @@ void cleanTempDir() int main(int argc, char* argv[]) { - // get and set locale language - std::string systemLang = "C"; - systemLang = funcexp::utf8::idb_setlocale(); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); + // Initialize the charset library + my_init(); // This is unset due to the way we start it program_invocation_short_name = const_cast("ExeMgr"); diff --git a/oam/etc/Columnstore.xml b/oam/etc/Columnstore.xml index 5a2230c62..879dc3237 100644 --- a/oam/etc/Columnstore.xml +++ b/oam/etc/Columnstore.xml @@ -233,7 +233,6 @@ 8620 - C columnstore-1 pm1 unassigned diff --git a/oamapps/mcsadmin/mcsadmin.cpp b/oamapps/mcsadmin/mcsadmin.cpp index fa2a3ff22..e1732eab2 100644 --- a/oamapps/mcsadmin/mcsadmin.cpp +++ b/oamapps/mcsadmin/mcsadmin.cpp @@ -183,6 +183,7 @@ int main(int argc, char* argv[]) setuid(0); // set effective ID to root; ignore return status #endif setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); Oam oam; char* pcommand = 0; diff --git a/primitives/linux-port/dictionary.cpp b/primitives/linux-port/dictionary.cpp index 
003fd3816..b197a6948 100644 --- a/primitives/linux-port/dictionary.cpp +++ b/primitives/linux-port/dictionary.cpp @@ -34,7 +34,8 @@ using namespace std; #include "dataconvert.h" #include -using namespace funcexp; +#include "collation.h" + using namespace logging; const char* nullString = " "; // this is not NULL to preempt segfaults. @@ -99,7 +100,7 @@ Notes: */ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, - TokenByScanResultHeader* ret, unsigned outSize, bool utf8, + TokenByScanResultHeader* ret, unsigned outSize, boost::shared_ptr eqFilter) { const DataValue* args; @@ -109,7 +110,6 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, int offsetIndex, argIndex, argsOffset; bool cmpResult = false; int tmp, i, err; - const char* sig; uint16_t siglen; @@ -144,6 +144,8 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, niceBlock = reinterpret_cast(block); offsets = reinterpret_cast(&niceBlock[10]); niceInput = reinterpret_cast(h); + + const CHARSET_INFO* cs = get_charset(h->charsetNumber, MYF(MY_WME)); // if LIKE is an operator, compile regexp's in advance. 
if ((h->NVALS > 0 && h->COP1 & COMPARE_LIKE) || @@ -182,12 +184,11 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, argIndex = 0; args = reinterpret_cast(&niceInput[argsOffset]); - string sig_utf8; - string arg_utf8; - if (eqFilter) { // MCOL-1246 Trim whitespace before match + // TODO MCOL-3536 use CHARSET_INFO* cs for collation + // cs->hash_sort(hash_sort(const uchar *key, size_t len, ulong *nr1, ulong *nr2)) string strData(sig, siglen); boost::trim_right_if(strData, boost::is_any_of(" ")); bool gotIt = eqFilter->find(strData) != eqFilter->end(); @@ -199,41 +200,27 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, goto no_store; } - // BUG 5110: If it is utf, we need to create utf strings to compare - if (utf8) + if (h->COP1 & COMPARE_LIKE) { - sig_utf8 = string(sig, siglen); - arg_utf8 = string(args->data, args->len); + p_DataValue dv; + + dv.len = siglen; + dv.data = (uint8_t*) sig; + cmpResult = isLike(&dv, &regex[argIndex]); + + if (h->COP1 & COMPARE_NOT) + cmpResult = !cmpResult; + } + else + { + tmp = cs->strnncoll(sig, siglen, args->data, args->len); + cmpResult = compare(tmp, h->COP1, siglen, args->len); } switch (h->NVALS) { case 1: { - if (h->COP1 & COMPARE_LIKE) - { - p_DataValue dv; - - dv.len = siglen; - dv.data = (uint8_t*) sig; - cmpResult = isLike(&dv, &regex[argIndex]); - - if (h->COP1 & COMPARE_NOT) - cmpResult = !cmpResult; - } - else - { - if (utf8) - { - tmp = utf8::idb_strcoll(sig_utf8.c_str(), arg_utf8.c_str()); - cmpResult = compare(tmp, h->COP1, siglen, args->len); - } - else - { - tmp = strncmp(sig, args->data, std::min(siglen, args->len)); - cmpResult = compare(tmp, h->COP1, siglen, args->len); - } - } if (cmpResult) goto store; @@ -243,32 +230,6 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, case 2: { - if (h->COP1 & COMPARE_LIKE) - { - p_DataValue dv; - - dv.len = siglen; - dv.data = (uint8_t*) sig; - cmpResult = isLike(&dv, &regex[argIndex]); - - if (h->COP1
& COMPARE_NOT) - cmpResult = !cmpResult; - } - - else - { - if (utf8) - { - tmp = utf8::idb_strcoll(sig_utf8.c_str(), arg_utf8.c_str()); - cmpResult = compare(tmp, h->COP1, siglen, args->len); - } - else - { - tmp = strncmp(sig, args->data, std::min(siglen, args->len)); - cmpResult = compare(tmp, h->COP1, siglen, args->len); - } - } - if (!cmpResult && h->BOP == BOP_AND) goto no_store; @@ -293,17 +254,8 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, else { - if (utf8) - { - arg_utf8 = string(args->data, args->len); - tmp = utf8::idb_strcoll(sig_utf8.c_str(), arg_utf8.c_str()); - cmpResult = compare(tmp, h->COP2, siglen, args->len); - } - else - { - tmp = strncmp(sig, args->data, std::min(siglen, args->len)); - cmpResult = compare(tmp, h->COP2, siglen, args->len); - } + tmp = cs->strnncoll(sig, siglen, args->data, args->len); + cmpResult = compare(tmp, h->COP2, siglen, args->len); } if (cmpResult) @@ -330,16 +282,8 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, else { - if (utf8) - { - tmp = utf8::idb_strcoll(sig_utf8.c_str(), arg_utf8.c_str()); - cmpResult = compare(tmp, h->COP2, siglen, args->len); - } - else - { - tmp = strncmp(sig, args->data, std::min(siglen, args->len)); - cmpResult = compare(tmp, h->COP1, siglen, args->len); - } + tmp = cs->strnncoll(sig, siglen, args->data, args->len); + cmpResult = compare(tmp, h->COP2, siglen, args->len); } if (!cmpResult && h->BOP == BOP_AND) @@ -351,11 +295,6 @@ void PrimitiveProcessor::p_TokenByScan(const TokenByScanRequestHeader* h, argsOffset += sizeof(uint16_t) + args->len; argIndex++; args = (DataValue*) &niceInput[argsOffset]; - - if ( utf8) - { - arg_utf8 = string(args->data, args->len); - } } if (i == h->NVALS && cmpResult) @@ -566,112 +505,6 @@ again: dict_OffsetIndex++; } - -void PrimitiveProcessor::p_AggregateSignature(const AggregateSignatureRequestHeader* in, - AggregateSignatureResultHeader* out, unsigned outSize, unsigned* written, bool utf8) -{ - 
- uint8_t* niceOutput; // h cast to a byte-indexed type - int cmp; - char cMin[BLOCK_SIZE], cMax[BLOCK_SIZE]; - int cMinLen, cMaxLen; - p_DataValue sigptr; - - DataValue* min; - DataValue* max; - - { - void *outp = static_cast(out); - memcpy(outp, in, sizeof(ISMPacketHeader) + sizeof(PrimitiveHeader)); - } - out->ism.Command = DICT_AGGREGATE_RESULTS; - niceOutput = reinterpret_cast(out); - - // The first sig is the min and the max. - out->Count = 0; - dict_OffsetIndex = 0; - nextSig(in->NVALS, in->tokens, &sigptr); - - if (sigptr.len == -1) - return; - - out->Count++; - memcpy(cMin, sigptr.data, sigptr.len); - memcpy(cMax, sigptr.data, sigptr.len); - cMinLen = cMaxLen = sigptr.len; - - for (nextSig(in->NVALS, in->tokens, &sigptr); sigptr.len != -1; - nextSig(in->NVALS, in->tokens, &sigptr), out->Count++) - { - string sig_utf8; - - if (utf8) - { - string cMin_utf8(cMin, cMinLen); - string tmpString((char*)sigptr.data, sigptr.len); - sig_utf8 = tmpString; - cmp = utf8::idb_strcoll(cMin_utf8.c_str(), sig_utf8.c_str()); - } - else - { - cmp = strncmp(cMin, (char*)sigptr.data, std::min(cMinLen, sigptr.len)); - } - - if (cmp > 0) - { - memcpy(cMin, sigptr.data, sigptr.len); - cMinLen = sigptr.len; - } - - if (utf8) - { - string cMax_utf8(cMax, cMaxLen); - cmp = utf8::idb_strcoll(cMax_utf8.c_str(), sig_utf8.c_str()); - } - else - { - cmp = strncmp(cMax, (char*)sigptr.data, std::min(cMaxLen, sigptr.len)); - } - - if (cmp < 0) - { - memcpy(cMax, sigptr.data, sigptr.len); - cMaxLen = sigptr.len; - } - } - - //we now have the results, stuff them into the output buffer -#ifdef PRIM_DEBUG - unsigned size = sizeof(AggregateSignatureResultHeader) + cMaxLen + cMinLen - + sizeof(uint16_t) * 2; - - if (outSize < size) - { - MessageLog logger(LoggingID(28)); - logging::Message::Args marker; - Message msg(35); - - marker.add(11); - msg.format(marker); - logger.logErrorMessage(msg); - - throw length_error("PrimitiveProcessor::p_AggregateSignature(): output buffer is too small"); - } - 
-#endif - - min = reinterpret_cast - (&niceOutput[sizeof(AggregateSignatureResultHeader)]); - max = reinterpret_cast - (&niceOutput[sizeof(AggregateSignatureResultHeader) + cMinLen + sizeof(uint16_t)]); - min->len = cMinLen; - max->len = cMaxLen; - memcpy(min->data, cMin, cMinLen); - memcpy(max->data, cMax, cMaxLen); - *written = sizeof(AggregateSignatureResultHeader) + cMaxLen + cMinLen - + sizeof(uint16_t) * 2; -} - const char backslash = '\\'; inline bool PrimitiveProcessor::isEscapedChar(char c) @@ -811,8 +644,12 @@ PrimitiveProcessor::makeLikeFilter (const DictFilterElement* filterString, uint3 return ret; } -void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector* out, bool utf8, - bool skipNulls, boost::shared_ptr eqFilter, uint8_t eqOp) +void PrimitiveProcessor::p_Dictionary(const DictInput* in, + vector* out, + bool skipNulls, + uint32_t charsetNumber, + boost::shared_ptr eqFilter, + uint8_t eqOp) { PrimToken* outToken; const DictFilterElement* filter = 0; @@ -823,6 +660,7 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector* out, uint16_t aggCount; bool cmpResult; DictOutput header; + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); // default size of the ouput to something sufficiently large to prevent // excessive reallocation and copy when resizing @@ -858,30 +696,13 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector* out, nextSig(in->NVALS, in->tokens, &sigptr, in->OutputType, (in->InputFlags ? 
true : false), skipNulls)) { - - string sig_utf8; - - if (utf8) - { - string tmpString((char*)sigptr.data, sigptr.len); - sig_utf8 = tmpString; - } - // do aggregate processing if (in->OutputType & OT_AGGREGATE) { // len == 0 indicates this is the first pass if (max.len != 0) { - if (utf8 ) - { - string max_utf8((char*)max.data, max.len); - tmp = utf8::idb_strcoll(sig_utf8.c_str(), max_utf8.c_str()); - } - else - { - tmp = strncmp((char*)sigptr.data, (char*)max.data, std::min(sigptr.len, max.len)); - } + tmp = cs->strnncoll(sigptr.data, sigptr.len, max.data, max.len); if (tmp > 0) max = sigptr; @@ -891,15 +712,7 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector* out, if (min.len != 0) { - if (utf8) - { - string min_utf8((char*)min.data, min.len); - tmp = utf8::idb_strcoll(sig_utf8.c_str(), min_utf8.c_str()); - } - else - { - tmp = strncmp((char*)sigptr.data, (char*)min.data, std::min(sigptr.len, min.len)); - } + tmp = cs->strnncoll(sigptr.data, sigptr.len, min.data, min.len); if (tmp < 0) min = sigptr; @@ -932,15 +745,6 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector* out, for (filterIndex = 0; filterIndex < in->NOPS; filterIndex++) { filter = reinterpret_cast(&in8[filterOffset]); - string filt_utf8; - size_t filt_utf8_len = 0; - - if (utf8) - { - string tmpString((const char*)filter->data, filter->len); - filt_utf8 = tmpString; - filt_utf8_len = filt_utf8.length(); - } if (filter->COP & COMPARE_LIKE) { @@ -951,18 +755,7 @@ void PrimitiveProcessor::p_Dictionary(const DictInput* in, vector* out, } else { - if (utf8) - { - size_t sig_utf8_len = sig_utf8.length(); - tmp = utf8::idb_strcoll(sig_utf8.c_str(), filt_utf8.c_str()); - cmpResult = compare(tmp, filter->COP, sig_utf8_len, filt_utf8_len); - } - else - { - tmp = strncmp((const char*) sigptr.data, (const char*)filter->data, - std::min(sigptr.len, static_cast(filter->len))); - } - + tmp = cs->strnncoll(sigptr.data, sigptr.len, filter->data, filter->len); cmpResult = 
compare(tmp, filter->COP, sigptr.len, filter->len); } diff --git a/primitives/linux-port/primitiveprocessor.h b/primitives/linux-port/primitiveprocessor.h index 366e90daf..0e8a93396 100644 --- a/primitives/linux-port/primitiveprocessor.h +++ b/primitives/linux-port/primitiveprocessor.h @@ -185,7 +185,7 @@ public: * @note Throws logic_error if the output buffer is too small for the result. */ void p_TokenByScan(const TokenByScanRequestHeader* t, - TokenByScanResultHeader* out, unsigned outSize, bool utf8, + TokenByScanResultHeader* out, unsigned outSize, boost::shared_ptr eqFilter); /** @brief The p_IdxWalk primitive processor @@ -227,19 +227,6 @@ public: */ void p_IdxList(const IndexListHeader* rqst, IndexListHeader* rslt, int mode = 1); - /** @brief The p_AggregateSignature primitive processor. - * - * The p_AggregateSignature primitive processor. It operates on a dictionary - * block and assumes the continuation pointer is not used. - * @param in The input parameters - * @param out A pointer to a buffer where the result will be written. - * @param outSize The size of the output buffer in bytes. - * @param written (out parameter) A pointer to 1 int, which will contain the - * number of bytes written to out. - */ - void p_AggregateSignature(const AggregateSignatureRequestHeader* in, - AggregateSignatureResultHeader* out, unsigned outSize, unsigned* written, bool utf8); - /** @brief The p_Col primitive processor. * * The p_Col primitive processor. It operates on a column block specified using setBlockPtr(). 
@@ -277,8 +264,9 @@ public: */ // void p_ColAggregate(const NewColAggRequestHeader *in, NewColAggResultHeader *out); - void p_Dictionary(const DictInput* in, std::vector* out, bool utf8, - bool skipNulls, boost::shared_ptr eqFilter, + void p_Dictionary(const DictInput* in, std::vector* out, + bool skipNulls, uint32_t charsetNumber, + boost::shared_ptr eqFilter, uint8_t eqOp); inline void setLogicalBlockMode(bool b) diff --git a/primitives/primproc/batchprimitiveprocessor.cpp b/primitives/primproc/batchprimitiveprocessor.cpp index 64a9d2455..6cd1ccec4 100644 --- a/primitives/primproc/batchprimitiveprocessor.cpp +++ b/primitives/primproc/batchprimitiveprocessor.cpp @@ -393,15 +393,6 @@ void BatchPrimitiveProcessor::initBPP(ByteStream& bs) // cout << "got the joined Rowgroup: " << joinedRG.toString() << "\n"; } } - else - { - bs >> tmp8; - bs >> joinerSize; - joiner.reset(new Joiner((bool) tmp8)); - // going to use just one lock for this old style, probably not used, join - addToJoinerLocks.reset(new boost::scoped_array[1]); - addToJoinerLocks[0].reset(new boost::mutex[1]); - } #ifdef __FreeBSD__ pthread_mutex_unlock(&objLock); @@ -786,19 +777,6 @@ void BatchPrimitiveProcessor::addToJoiner(ByteStream& bs) */ } } - else - { - joblist::ElementType *et = (joblist::ElementType*) bs.buf(); - - boost::mutex::scoped_lock lk(addToJoinerLocks[0][0]); - for (i = 0; i < count; i++) - { -// cout << "BPP: adding <" << et[i].first << ", " << et[i].second << "> to Joiner\n"; - joiner->insert(et[i]); - } - - bs.advance(count << 4); - } idbassert(bs.length() == 0); } @@ -838,38 +816,35 @@ int BatchPrimitiveProcessor::endOfJoiner() return 0; } - if (ot == ROW_GROUP) - for (i = 0; i < joinerCount; i++) + for (i = 0; i < joinerCount; i++) + { + if (!typelessJoin[i]) { - if (!typelessJoin[i]) - { - currentSize = 0; - for (uint j = 0; j < processorThreads; ++j) - if (!tJoiners[i] || !tJoiners[i][j]) - return -1; - else - currentSize += tJoiners[i][j]->size(); - if (currentSize != 
tJoinerSizes[i]) + currentSize = 0; + for (uint j = 0; j < processorThreads; ++j) + if (!tJoiners[i] || !tJoiners[i][j]) return -1; - //if ((!tJoiners[i] || tJoiners[i]->size() != tJoinerSizes[i])) - // return -1; - } - else - { - currentSize = 0; - for (uint j = 0; j < processorThreads; ++j) - if (!tlJoiners[i] || !tlJoiners[i][j]) - return -1; - else - currentSize += tlJoiners[i][j]->size(); - if (currentSize != tJoinerSizes[i]) - return -1; - //if ((!tJoiners[i] || tlJoiners[i]->size() != tJoinerSizes[i])) - // return -1; - } + else + currentSize += tJoiners[i][j]->size(); + if (currentSize != tJoinerSizes[i]) + return -1; + //if ((!tJoiners[i] || tJoiners[i]->size() != tJoinerSizes[i])) + // return -1; } - else if (joiner.get() == NULL || joiner->size() != joinerSize) - return -1; + else + { + currentSize = 0; + for (uint j = 0; j < processorThreads; ++j) + if (!tlJoiners[i] || !tlJoiners[i][j]) + return -1; + else + currentSize += tlJoiners[i][j]->size(); + if (currentSize != tJoinerSizes[i]) + return -1; + //if ((!tJoiners[i] || tlJoiners[i]->size() != tJoinerSizes[i])) + // return -1; + } + } endOfJoinerRan = true; @@ -1115,26 +1090,6 @@ void BatchPrimitiveProcessor::initProcessor() asyncLoaded.reset(new bool[projectCount + 1]); } -void BatchPrimitiveProcessor::executeJoin() -{ - uint32_t newRowCount, i; - - preJoinRidCount = ridCount; - newRowCount = 0; - smallSideMatches.clear(); - - for (i = 0; i < ridCount; i++) - { - if (joiner->getNewMatches(values[i], &smallSideMatches)) - { - values[newRowCount] = values[i]; - relRids[newRowCount++] = relRids[i]; - } - } - - ridCount = newRowCount; -} - /* This version does a join on projected rows */ void BatchPrimitiveProcessor::executeTupleJoin() { @@ -1143,7 +1098,6 @@ void BatchPrimitiveProcessor::executeTupleJoin() uint64_t largeKey; TypelessData tlLargeKey; - preJoinRidCount = ridCount; outputRG.getRow(0, &oldRow); outputRG.getRow(0, &newRow); @@ -1513,17 +1467,6 @@ void BatchPrimitiveProcessor::execute() 
stopwatch->start("BatchPrimitiveProcessor::execute third part"); #endif - if (doJoin && ot != ROW_GROUP) - { -#ifdef PRIMPROC_STOPWATCH - stopwatch->start("- executeJoin"); - executeJoin(); - stopwatch->stop("- executeJoin"); -#else - executeJoin(); -#endif - } - if (projectCount > 0 || ot == ROW_GROUP) { #ifdef PRIMPROC_STOPWATCH @@ -2058,17 +2001,6 @@ void BatchPrimitiveProcessor::serializeElementTypes() *serialized << ridCount; serialized->append((uint8_t*) relRids, ridCount << 1); serialized->append((uint8_t*) values, ridCount << 3); - - /* Send the small side matches if there was a join */ - if (doJoin) - { - uint32_t ssize = smallSideMatches.size(); - *serialized << preJoinRidCount; - *serialized << (uint32_t) ssize; - - if (ssize > 0) - serialized->append((uint8_t*) &smallSideMatches[0], ssize << 4); - } } void BatchPrimitiveProcessor::serializeStrings() @@ -2403,51 +2335,44 @@ SBPP BatchPrimitiveProcessor::duplicate() if (doJoin) { pthread_mutex_lock(&bpp->objLock); - bpp->joinerSize = joinerSize; + /* There are add'l join vars, but only these are necessary for processing + a join */ + bpp->tJoinerSizes = tJoinerSizes; + bpp->joinerCount = joinerCount; + bpp->joinTypes = joinTypes; + bpp->largeSideKeyColumns = largeSideKeyColumns; + bpp->tJoiners = tJoiners; + //bpp->_pools = _pools; + bpp->typelessJoin = typelessJoin; + bpp->tlLargeSideKeyColumns = tlLargeSideKeyColumns; + bpp->tlJoiners = tlJoiners; + bpp->tlKeyLengths = tlKeyLengths; + bpp->storedKeyAllocators = storedKeyAllocators; + bpp->joinNullValues = joinNullValues; + bpp->doMatchNulls = doMatchNulls; + bpp->hasJoinFEFilters = hasJoinFEFilters; + bpp->hasSmallOuterJoin = hasSmallOuterJoin; - if (ot == ROW_GROUP) + if (hasJoinFEFilters) { - /* There are add'l join vars, but only these are necessary for processing - a join */ - bpp->tJoinerSizes = tJoinerSizes; - bpp->joinerCount = joinerCount; - bpp->joinTypes = joinTypes; - bpp->largeSideKeyColumns = largeSideKeyColumns; - bpp->tJoiners = tJoiners; 
- //bpp->_pools = _pools; - bpp->typelessJoin = typelessJoin; - bpp->tlLargeSideKeyColumns = tlLargeSideKeyColumns; - bpp->tlJoiners = tlJoiners; - bpp->tlKeyLengths = tlKeyLengths; - bpp->storedKeyAllocators = storedKeyAllocators; - bpp->joinNullValues = joinNullValues; - bpp->doMatchNulls = doMatchNulls; - bpp->hasJoinFEFilters = hasJoinFEFilters; - bpp->hasSmallOuterJoin = hasSmallOuterJoin; + bpp->joinFERG = joinFERG; + bpp->joinFEFilters.reset(new scoped_ptr[joinerCount]); - if (hasJoinFEFilters) - { - bpp->joinFERG = joinFERG; - bpp->joinFEFilters.reset(new scoped_ptr[joinerCount]); - - for (i = 0; i < joinerCount; i++) - if (joinFEFilters[i]) - bpp->joinFEFilters[i].reset(new FuncExpWrapper(*joinFEFilters[i])); - } - - if (getTupleJoinRowGroupData) - { - bpp->smallSideRGs = smallSideRGs; - bpp->largeSideRG = largeSideRG; - bpp->smallSideRowLengths = smallSideRowLengths; - bpp->smallSideRowData = smallSideRowData; - bpp->smallNullRowData = smallNullRowData; - bpp->smallNullPointers = smallNullPointers; - bpp->joinedRG = joinedRG; - } + for (i = 0; i < joinerCount; i++) + if (joinFEFilters[i]) + bpp->joinFEFilters[i].reset(new FuncExpWrapper(*joinFEFilters[i])); + } + + if (getTupleJoinRowGroupData) + { + bpp->smallSideRGs = smallSideRGs; + bpp->largeSideRG = largeSideRG; + bpp->smallSideRowLengths = smallSideRowLengths; + bpp->smallSideRowData = smallSideRowData; + bpp->smallNullRowData = smallNullRowData; + bpp->smallNullPointers = smallNullPointers; + bpp->joinedRG = joinedRG; } - else - bpp->joiner = joiner; #ifdef __FreeBSD__ pthread_mutex_unlock(&bpp->objLock); @@ -2549,10 +2474,6 @@ bool BatchPrimitiveProcessor::operator==(const BatchPrimitiveProcessor& bpp) con if (*filterSteps[i] != *bpp.filterSteps[i]) return false; - for (i = 0; i < projectCount; i++) - if (*projectSteps[i] != *bpp.projectSteps[i]) - return false; - return true; } #endif diff --git a/primitives/primproc/batchprimitiveprocessor.h b/primitives/primproc/batchprimitiveprocessor.h index 
e3863e429..d56481a5f 100644 --- a/primitives/primproc/batchprimitiveprocessor.h +++ b/primitives/primproc/batchprimitiveprocessor.h @@ -251,14 +251,9 @@ private: bool fBusy; /* Join support TODO: Make join ops a seperate Command class. */ - boost::shared_ptr joiner; - std::vector smallSideMatches; bool doJoin; - uint32_t joinerSize; - uint16_t preJoinRidCount; boost::scoped_array > addToJoinerLocks; boost::scoped_array smallSideDataLocks; - void executeJoin(); // uint32_t ridsIn, ridsOut; diff --git a/primitives/primproc/dictstep.cpp b/primitives/primproc/dictstep.cpp index abd99ada3..cc728e770 100644 --- a/primitives/primproc/dictstep.cpp +++ b/primitives/primproc/dictstep.cpp @@ -44,7 +44,6 @@ namespace primitiveprocessor { extern uint32_t dictBufferSize; -extern bool utf8; DictStep::DictStep() : Command(DICT_STEP), strValues(NULL), filterCount(0), bufferSize(0) @@ -67,6 +66,7 @@ DictStep& DictStep::operator=(const DictStep& d) eqFilter = d.eqFilter; eqOp = d.eqOp; filterCount = d.filterCount; + charsetNumber = d.charsetNumber; return *this; } @@ -99,7 +99,8 @@ void DictStep::createCommand(ByteStream& bs) } else bs >> filterString; - + + bs >> charsetNumber; #if 0 cout << "see " << filterCount << " filters\n"; DictFilterElement* filters = (DictFilterElement*) filterString.buf(); @@ -174,7 +175,7 @@ void DictStep::issuePrimitive(bool isFilter) } bpp->pp.setLikeFilter(likeFilter); - bpp->pp.p_Dictionary(primMsg, &result, utf8, isFilter, eqFilter, eqOp); + bpp->pp.p_Dictionary(primMsg, &result, isFilter, charsetNumber, eqFilter, eqOp); } void DictStep::copyResultToTmpSpace(OrderedToken* ot) @@ -698,6 +699,7 @@ SCommand DictStep::duplicate() ds->eqOp = eqOp; ds->filterString = filterString; ds->filterCount = filterCount; + ds->charsetNumber = charsetNumber; ds->Command::duplicate(this); return ret; } diff --git a/primitives/primproc/dictstep.h b/primitives/primproc/dictstep.h index 025658d5b..581a4cd14 100644 --- a/primitives/primproc/dictstep.h +++ 
b/primitives/primproc/dictstep.h @@ -141,8 +141,9 @@ private: messageqcpp::ByteStream filterString; uint32_t filterCount; uint32_t bufferSize; + uint32_t charsetNumber; uint16_t inputRidCount; - + bool hasEqFilter; boost::shared_ptr eqFilter; boost::shared_array likeFilter; diff --git a/primitives/primproc/primitiveserver.cpp b/primitives/primproc/primitiveserver.cpp index 13abe2a04..b41736566 100644 --- a/primitives/primproc/primitiveserver.cpp +++ b/primitives/primproc/primitiveserver.cpp @@ -160,8 +160,6 @@ std::map djLock; // djLock synchronizes destroy and j volatile int32_t asyncCounter; const int asyncMax = 20; // current number of asynchronous loads -extern bool utf8; - struct preFetchCond { //uint64_t lbid; @@ -1211,7 +1209,7 @@ int DictScanJob::operator()() fLBIDTraceOn, session); pproc.setBlockPtr((int*) data); - pproc.p_TokenByScan(cmd, output, output_buf_size, utf8, eqFilter); + pproc.p_TokenByScan(cmd, output, output_buf_size, eqFilter); if (wasBlockInCache) output->CacheIO++; diff --git a/primitives/primproc/primproc.cpp b/primitives/primproc/primproc.cpp index 201e76fa7..6a10c1c95 100644 --- a/primitives/primproc/primproc.cpp +++ b/primitives/primproc/primproc.cpp @@ -21,8 +21,6 @@ * * ***********************************************************************/ - - #include #include #include @@ -76,6 +74,8 @@ using namespace idbdatafile; #include "crashtrace.h" #include "installdir.h" +#include "collation.h" + namespace primitiveprocessor { @@ -89,18 +89,15 @@ extern uint32_t lowPriorityThreads; extern int directIOFlag; extern int noVB; - DebugLevel gDebugLevel; Logger* mlp; -string systemLang; -bool utf8 = false; bool isDebug( const DebugLevel level ) { return level <= gDebugLevel; } -} +} //namespace primitiveprocessor namespace { @@ -316,16 +313,15 @@ void* waitForSIGUSR1(void* p) int main(int argc, char* argv[]) { - // get and set locale language - systemLang = funcexp::utf8::idb_setlocale(); - - if ( systemLang != "en_US.UTF-8" && - 
systemLang.find("UTF") != string::npos ) - utf8 = true; - // This is unset due to the way we start it program_invocation_short_name = const_cast("PrimProc"); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); + // Initialize the charset library + my_init(); + int gDebug = 0; int c; @@ -698,7 +694,7 @@ int main(int argc, char* argv[]) } BPPCount = highPriorityThreads + medPriorityThreads + lowPriorityThreads; - + // let the user override if they want temp = toInt(cf->getConfig(primitiveServers, "BPPCount")); diff --git a/procmgr/main.cpp b/procmgr/main.cpp index f1f7bfb47..4e2995f32 100644 --- a/procmgr/main.cpp +++ b/procmgr/main.cpp @@ -101,10 +101,9 @@ int main(int argc, char** argv) #ifndef _MSC_VER setuid(0); // set effective ID to root; ignore return status #endif - // get and set locale language - string systemLang = "C"; - - setlocale(LC_ALL, systemLang.c_str()); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); idbdatafile::IDBPolicy::configIDBPolicy(); diff --git a/procmon/main.cpp b/procmon/main.cpp index 9112d3962..c278c3680 100644 --- a/procmon/main.cpp +++ b/procmon/main.cpp @@ -26,7 +26,7 @@ namespace bi = boost::interprocess; #include "installdir.h" #include "IDBPolicy.h" - +#include "utils_utf8.h" #include "crashtrace.h" using namespace std; @@ -177,10 +177,9 @@ int main(int argc, char** argv) if (p && *p) USER = p; - // get and set locale language - string systemLang = "C"; - - setlocale(LC_ALL, systemLang.c_str()); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); //get tmp log directory tmpLogDir = startup::StartUp::tmpDir(); diff --git a/tools/configMgt/autoConfigure.cpp b/tools/configMgt/autoConfigure.cpp index 2f30a14bd..d48399893 100644 --- a/tools/configMgt/autoConfigure.cpp +++ b/tools/configMgt/autoConfigure.cpp @@ -372,23 +372,6 @@ int main(int argc, char* argv[]) catch (...) 
{ } - //setup System Language - string systemLang = "C"; - - try - { - systemLang = sysConfigOld->getConfig(SystemSection, "SystemLang"); - } - catch (...) - { } - - try - { - sysConfigNew->setConfig(SystemSection, "SystemLang", systemLang); - } - catch (...) - {} - //setup HA IP Address string HA_IPadd; diff --git a/tools/dbloadxml/colxml.cpp b/tools/dbloadxml/colxml.cpp index 778d095e7..993e6d334 100644 --- a/tools/dbloadxml/colxml.cpp +++ b/tools/dbloadxml/colxml.cpp @@ -50,6 +50,7 @@ int main(int argc, char** argv) std::cerr << " colxml: couldn't set uid " << std::endl; } setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); WriteEngine::Config::initConfigCache(); // load Columnstore.xml config settings //Bug 6137 diff --git a/tools/pingproc/pingproc.cpp b/tools/pingproc/pingproc.cpp index f1dd7b8bc..e417a2baa 100644 --- a/tools/pingproc/pingproc.cpp +++ b/tools/pingproc/pingproc.cpp @@ -466,6 +466,7 @@ const ByteStream formatDictionaryScanMsg(const uint64_t lbid, hdr.COP2 = oidOp.COP2(); hdr.NVALS = oidOp.FilterCount(); hdr.Count = count; + hdr.charsetNumber = oidOp.ColumnType().charsetNumber; idbassert(hdr.Count > 0); primMsg.load((const uint8_t*) &hdr.ism, sizeof(ISMPacketHeader)); diff --git a/utils/common/CMakeLists.txt b/utils/common/CMakeLists.txt index 939aee0a2..c6a0fde01 100644 --- a/utils/common/CMakeLists.txt +++ b/utils/common/CMakeLists.txt @@ -9,7 +9,8 @@ set(common_LIB_SRCS cgroupconfigurator.cpp MonitorProcMem.cpp nullvaluemanip.cpp - threadnaming.cpp) + threadnaming.cpp + utils_utf8.cpp) add_library(common SHARED ${common_LIB_SRCS}) diff --git a/utils/common/collation.h b/utils/common/collation.h new file mode 100644 index 000000000..db7b7cf26 --- /dev/null +++ b/utils/common/collation.h @@ -0,0 +1,8 @@ +// These are the common headers needed to use the MariaDB collation library + +// This must be included after any boost headers, or anything that includes +// boost headers. and boost are not friends. 
+#include +#undef set_bits // mariadb.h defines set_bits, which is incompatible with boost +#include +#include diff --git a/utils/common/utils_utf8.cpp b/utils/common/utils_utf8.cpp new file mode 100644 index 000000000..d638e20d9 --- /dev/null +++ b/utils/common/utils_utf8.cpp @@ -0,0 +1,77 @@ +/* Copyright (C) 2020 MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "utils_utf8.h" +#include "collation.h" + +namespace utf8 +{ + +/* + * mcs_strcoll +*/ +int mcs_strcoll(const char* str1, const char* str2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncoll(str1, strlen(str1), str2, strlen(str2)); +} + +int mcs_strcoll(const char* str1, const uint32_t l1, const char* str2, const uint32_t l2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncoll(str1, l1, str2, l2); +} + +int mcs_strcoll(const std::string* str1, const std::string* str2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncoll(str1->c_str(), str1->length(), str2->c_str(), str2->length()); +} + +int mcs_strcoll(const std::string& str1, const std::string& str2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return 
cs->strnncoll(str1.c_str(), str1.length(), str2.c_str(), str2.length()); +} + +/* + * mcs_strcollsp +*/ +int mcs_strcollsp(const char* str1, const char* str2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncollsp(str1, strlen(str1), str2, strlen(str2)); +} + +int mcs_strcollsp(const char* str1, uint32_t l1, const char* str2, const uint32_t l2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncollsp(str1, l1, str2, l2); +} + +int mcs_strcollsp(const std::string* str1, const std::string* str2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncollsp(str1->c_str(), str1->length(), str2->c_str(), str2->length()); +} + +int mcs_strcollsp(const std::string& str1, const std::string& str2, const uint32_t charsetNumber) +{ + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncollsp(str1.c_str(), str1.length(), str2.c_str(), str2.length()); +} + +} //namespace utf8 + diff --git a/utils/common/utils_utf8.h b/utils/common/utils_utf8.h new file mode 100644 index 000000000..75123e460 --- /dev/null +++ b/utils/common/utils_utf8.h @@ -0,0 +1,161 @@ +/* Copyright (C) 2014 InfiniDB, Inc. + * Copyright (C) 2016 MariaDB Corporation. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; version 2 of + the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + MA 02110-1301, USA. */ + +// $Id$ + + +#ifndef _UTILS_UTF8_H_ +#define _UTILS_UTF8_H_ + + + +#include +#if defined(_MSC_VER) +#include +#include +#elif defined(__FreeBSD__) +//#include +#else +#include +#endif +#include + +#include +#include "liboamcpp.h" + +// Change the name from utf8. Even change the file name to something resembling char helper +namespace utf8 +{ + +const int MAX_UTF8_BYTES_PER_CHAR = 4; + +// BUG 5241 +// Infinidb specific mbstowcs(). This will handle both windows and unix platforms +// Params dest and max should have enough length to accomodate NULL +inline +size_t idb_mbstowcs(wchar_t* dest, const char* src, size_t max) +{ +#ifdef _MSC_VER + // 4th param (-1) denotes to convert till hit NULL char + // if 6th param max = 0, will return the required buffer size + size_t strwclen = MultiByteToWideChar(CP_UTF8, 0, src, -1, dest, (int)max); + // decrement the count of NULL; will become -1 on failure + return --strwclen; + +#else + return mbstowcs(dest, src, max); +#endif +} + +// BUG 5241 +// Infinidb specific wcstombs(). This will handle both windows and unix platforms +// Params dest and max should have enough length to accomodate NULL +inline +size_t idb_wcstombs(char* dest, const wchar_t* src, size_t max) +{ +#ifdef _MSC_VER + // 4th param (-1) denotes to convert till hit NULL char + //if 6th param max = 0, will return the required buffer size + size_t strmblen = WideCharToMultiByte( CP_UTF8, 0, src, -1, dest, (int)max, NULL, NULL); + // decrement the count of NULL; will become -1 on failure + return --strmblen; +#else + return wcstombs(dest, src, max); +#endif +} + + +// convert UTF-8 string to wstring +inline +std::wstring utf8_to_wstring (const std::string& str) +{ + size_t bufsize = str.length() + 1; + + // Convert to wide characters. 
Do all further work in wide characters + wchar_t* wcbuf = new wchar_t[bufsize]; + // Passing +1 so that windows is happy to see extra position to place NULL + size_t strwclen = idb_mbstowcs(wcbuf, str.c_str(), str.length() + 1); + + // if result is -1 it means bad characters which may happen if locale is wrong. + // return an empty string + if ( strwclen == static_cast(-1) ) + strwclen = 0; + + std::wstring ret(wcbuf, strwclen); + + delete [] wcbuf; + return ret; +} + + +// convert wstring to UTF-8 string +inline +std::string wstring_to_utf8 (const std::wstring& str) +{ + char* outbuf = new char[(str.length() * MAX_UTF8_BYTES_PER_CHAR) + 1]; + // Passing +1 so that windows is happy to see extra position to place NULL + size_t strmblen = idb_wcstombs(outbuf, str.c_str(), str.length() * MAX_UTF8_BYTES_PER_CHAR + 1); + + // if result is -1 it means bad characters which may happen if locale is wrong. + // return an empty string + if ( strmblen == static_cast(-1) ) + strmblen = 0; + + std::string ret(outbuf, strmblen); + + delete [] outbuf; + return ret; +} + +inline +uint8_t utf8_truncate_point(const char* input, size_t length) +{ + // Find the beginning of a multibyte char to truncate at and return the + // number of bytes to truncate1` + if (length < 3) + { + return 0; + } + + const unsigned char* b = (const unsigned char*)(input) + length - 3; + + if (b[2] & 0x80) + { + // First byte in a new multi-byte sequence + if (b[2] & 0x40) return 1; + // 3 byte sequence + else if ((b[1] & 0xe0) == 0xe0) return 2; + // 4 byte sequence + else if ((b[0] & 0xf0) == 0xf0) return 3; + } + + return 0; +} + +int mcs_strcoll(const char* str1, const char* str2, const uint32_t charsetNumber); +int mcs_strcoll(const char* str1, const uint32_t l1, const char* str2, const uint32_t l2, const uint32_t charsetNumber); +int mcs_strcoll(const std::string* str1, const std::string* str2, const uint32_t charsetNumber); +int mcs_strcoll(const std::string& str1, const std::string& str2, const 
uint32_t charsetNumber); + +int mcs_strcollsp(const char* str1, const char* str2, const uint32_t charsetNumber); +int mcs_strcollsp(const char* str1, uint32_t l1, const char* str2, const uint32_t l2, const uint32_t charsetNumber); +int mcs_strcollsp(const std::string* str1, const std::string* str2, const uint32_t charsetNumber); +int mcs_strcollsp(const std::string& str1, const std::string& str2, const uint32_t charsetNumber); +} //namespace utf8 + +#endif diff --git a/utils/funcexp/func_between.cpp b/utils/funcexp/func_between.cpp index d742c2d16..ab6aef028 100644 --- a/utils/funcexp/func_between.cpp +++ b/utils/funcexp/func_between.cpp @@ -37,12 +37,11 @@ using namespace execplan; #include "errorcodes.h" #include "idberrorinfo.h" #include "errorids.h" + +#include "collation.h" + using namespace logging; -#include "utils_utf8.h" -using namespace funcexp; - - namespace { template @@ -57,16 +56,16 @@ inline bool numericLE(result_t op1, result_t op2) return op1 <= op2; } -inline bool strGE(const string& op1, const string& op2) +inline bool strGE(uint32_t charsetNumber, const string& op1, const string& op2) { - //return strcoll(op1.c_str(), op2.c_str()) >= 0; - return utf8::idb_strcoll(op1.c_str(), op2.c_str()) >= 0; + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncoll(op1.c_str(), op1.length(), op2.c_str(), op2.length()) >= 0; } -inline bool strLE(const string& op1, const string& op2) +inline bool strLE(uint32_t charsetNumber, const string& op1, const string& op2) { - //return strcoll(op1.c_str(), op2.c_str()) <= 0; - return utf8::idb_strcoll(op1.c_str(), op2.c_str()) <= 0; + const CHARSET_INFO* cs = get_charset(charsetNumber, MYF(MY_WME)); + return cs->strnncoll(op1.c_str(), op1.length(), op2.c_str(), op2.length()) <= 0; } inline bool getBool(rowgroup::Row& row, @@ -260,16 +259,16 @@ inline bool getBool(rowgroup::Row& row, if (notBetween) { - if (!strGE(val, pm[1]->data()->getStrVal(row, isNull)) && !isNull) + if 
(!strGE(ct.charsetNumber, val, pm[1]->data()->getStrVal(row, isNull)) && !isNull) return true; isNull = false; - return (!strLE(val, pm[2]->data()->getStrVal(row, isNull)) && !isNull); + return (!strLE(ct.charsetNumber, val, pm[2]->data()->getStrVal(row, isNull)) && !isNull); } - + return !isNull && - strGE(val, pm[1]->data()->getStrVal(row, isNull)) && - strLE(val, pm[2]->data()->getStrVal(row, isNull)); + strGE(ct.charsetNumber, val, pm[1]->data()->getStrVal(row, isNull)) && + strLE(ct.charsetNumber, val, pm[2]->data()->getStrVal(row, isNull)); } default: diff --git a/utils/funcexp/func_case.cpp b/utils/funcexp/func_case.cpp index 1a5f511ab..327c02cc0 100644 --- a/utils/funcexp/func_case.cpp +++ b/utils/funcexp/func_case.cpp @@ -43,6 +43,8 @@ using namespace logging; #include "utils_utf8.h" using namespace funcexp; +#include "collation.h" + namespace { using namespace funcexp; @@ -180,20 +182,21 @@ inline uint64_t simple_case_cmp(Row& row, case execplan::CalpontSystemCatalog::VARCHAR: { const string& ev = parm[n]->data()->getStrVal(row, isNull); - if (isNull) break; + CHARSET_INFO* cs = parm[n]->data()->resultType().getCharset(); for (i = 1; i <= whereCount; i++) { //BUG 5362 - if (utf8::idb_strcoll(ev.c_str(), parm[i]->data()->getStrVal(row, isNull).c_str()) == 0 && !isNull) + const string& p1 = parm[i]->data()->getStrVal(row, isNull); + if (isNull) + break; + if (cs->strnncoll(ev.c_str(), ev.length(), p1.c_str(), p1.length()) == 0) { foundIt = true; break; } - else - isNull = false; } break; diff --git a/utils/funcexp/func_char_length.cpp b/utils/funcexp/func_char_length.cpp index 72bdfdfda..6812e8f0f 100644 --- a/utils/funcexp/func_char_length.cpp +++ b/utils/funcexp/func_char_length.cpp @@ -38,6 +38,9 @@ using namespace execplan; #include "errorcodes.h" #include "idberrorinfo.h" #include "errorids.h" + +#include "collation.h" + using namespace logging; namespace funcexp @@ -45,7 +48,7 @@ namespace funcexp CalpontSystemCatalog::ColType 
Func_char_length::operationType( FunctionParm& fp, CalpontSystemCatalog::ColType& resultType ) { - return resultType; + return fp[0]->data()->resultType(); } int64_t Func_char_length::getIntVal(rowgroup::Row& row, @@ -78,15 +81,11 @@ int64_t Func_char_length::getIntVal(rowgroup::Row& row, case execplan::CalpontSystemCatalog::UDECIMAL: { const string& tstr = parm[0]->data()->getStrVal(row, isNull); - if (isNull) return 0; - - size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; - wchar_t* wcbuf = new wchar_t[strwclen]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen); - delete [] wcbuf; - return (int64_t)strwclen; + const char* b = tstr.c_str(); + const char* e = tstr.c_str() + tstr.length(); + return (int64_t)parm[0]->data()->resultType().getCharset()->numchars(b, e); } case execplan::CalpontSystemCatalog::DATE: diff --git a/utils/funcexp/func_concat.cpp b/utils/funcexp/func_concat.cpp index 39a72b4fa..e2415f7d5 100644 --- a/utils/funcexp/func_concat.cpp +++ b/utils/funcexp/func_concat.cpp @@ -35,8 +35,6 @@ using namespace rowgroup; #include "dataconvert.h" using namespace dataconvert; -#define STRCOLL_ENH__ - namespace funcexp { @@ -58,7 +56,10 @@ string Func_concat::getStrVal(Row& row, string ret; string tmp; stringValue(parm[0], row, isNull, ret); - + + // TODO: do a better job of cutting down the number re-allocations. + // look at Item_func_concat::realloc_result for ideas and use + // std::string:resize() appropriatly. 
for ( unsigned int id = 1 ; id < parm.size() ; id++) { stringValue(parm[id], row, isNull, tmp); diff --git a/utils/funcexp/func_concat_ws.cpp b/utils/funcexp/func_concat_ws.cpp index af3da5245..21d1ae955 100644 --- a/utils/funcexp/func_concat_ws.cpp +++ b/utils/funcexp/func_concat_ws.cpp @@ -32,7 +32,7 @@ using namespace execplan; #include "rowgroup.h" using namespace rowgroup; -#define STRCOLL_ENH__ +#include "collation.h" namespace funcexp { @@ -47,14 +47,16 @@ CalpontSystemCatalog::ColType Func_concat_ws::operationType(FunctionParm& fp, Ca string Func_concat_ws::getStrVal(Row& row, FunctionParm& parm, bool& isNull, - CalpontSystemCatalog::ColType&) + CalpontSystemCatalog::ColType& type) { string delim; stringValue(parm[0], row, isNull, delim); if (isNull) return ""; -#ifdef STRCOLL_ENH__ + // TODO: I don't think we need wide chars here. + // Concatenation works without see Server implementation. +#if 0 wstring wstr; size_t strwclen = utf8::idb_mbstowcs(0, delim.c_str(), 0) + 1; wchar_t* wcbuf = new wchar_t[strwclen]; @@ -95,23 +97,24 @@ string Func_concat_ws::getStrVal(Row& row, delete [] outbuf; delete [] wcbuf; return ret; - -#else +#endif string str; string tmp; for ( uint32_t i = 1 ; i < parm.size() ; i++) { - stringValue(parm[i], row, isNull, tmp); - str += tmp; - + stringValue(parm[i], row, isNull, tmp); if (isNull) { isNull = false; continue; } - if (!str.empty() && !isNull) + if (!str.empty()) str += delim; + + // TODO: Work on string reallocation. Use std::string::resize() to + // grab larger chunks in some intellegent manner. 
+ str += tmp; } if (str.empty()) @@ -120,7 +123,6 @@ string Func_concat_ws::getStrVal(Row& row, isNull = false; return str; -#endif } diff --git a/utils/funcexp/func_find_in_set.cpp b/utils/funcexp/func_find_in_set.cpp index 9cc8a2485..21f45943d 100644 --- a/utils/funcexp/func_find_in_set.cpp +++ b/utils/funcexp/func_find_in_set.cpp @@ -42,6 +42,8 @@ using namespace execplan; #include "errorids.h" using namespace logging; +#include "collation.h" + namespace funcexp { @@ -56,37 +58,58 @@ int64_t Func_find_in_set::getIntVal(rowgroup::Row& row, CalpontSystemCatalog::ColType& op_ct) { const string& searchStr = parm[0]->data()->getStrVal(row, isNull); - if (isNull) return 0; const string& setString = parm[1]->data()->getStrVal(row, isNull); - if (isNull) return 0; if (searchStr.find(",") != string::npos) return 0; - string newSearchStr(searchStr.substr(0, strlen(searchStr.c_str()))); - string newSetString(setString.substr(0, strlen(setString.c_str()))); - //tokenize the setStr with comma as seprator. 
- typedef boost::tokenizer > tokenizer; - boost::char_separator sep( ","); - tokenizer tokens(newSetString, sep); + if (setString.length() < searchStr.length()) + return 0; + + CHARSET_INFO *cs= op_ct.getCharset(); - unsigned i = 0; - size_t pos = 0; - - for (tokenizer::iterator tok_iter = tokens.begin(); tok_iter != tokens.end(); ++tok_iter) + my_wc_t wc= 0; + const char *str_begin = setString.c_str(); + const char *str_end = setString.c_str(); + const char *real_end = str_end + setString.length(); + const char *find_str = searchStr.c_str(); + size_t find_str_len = searchStr.length(); + int position = 0; + static const char separator=','; + while (1) { - pos = (*tok_iter).find(newSearchStr); - i++; - - if (( pos != string::npos) && (newSearchStr.length() == (*tok_iter).length())) - return i; + int symbol_len; + if ((symbol_len= cs->mb_wc(&wc, (uchar*) str_end, + (uchar*) real_end)) > 0) + { + const char *substr_end = str_end + symbol_len; + bool is_last_item= (substr_end == real_end); + bool is_separator = (wc == (my_wc_t) separator); + if (is_separator || is_last_item) + { + position++; + if (is_last_item && !is_separator) + str_end = substr_end; + if (!cs->strnncoll(str_begin, (size_t) (str_end - str_begin), + find_str, find_str_len)) + return (int64_t) position; + else + str_begin = substr_end; + } + str_end = substr_end; + } + else if (str_end - str_begin == 0 && + find_str_len == 0 && + wc == (my_wc_t) separator) + return (longlong) ++position; + else + return 0; } - return 0; } diff --git a/utils/funcexp/func_greatest.cpp b/utils/funcexp/func_greatest.cpp index e064eacb3..2c58ec481 100644 --- a/utils/funcexp/func_greatest.cpp +++ b/utils/funcexp/func_greatest.cpp @@ -40,6 +40,7 @@ using namespace joblist; #include "utils_utf8.h" using namespace funcexp; +#include "collation.h" class to_lower { @@ -148,6 +149,7 @@ std::string Func_greatest::getStrVal(rowgroup::Row& row, execplan::CalpontSystemCatalog::ColType& op_ct) { const string& str = 
fp[0]->data()->getStrVal(row, isNull); + CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset(); string greatestStr = str; @@ -155,12 +157,10 @@ std::string Func_greatest::getStrVal(rowgroup::Row& row, { const string& str1 = fp[i]->data()->getStrVal(row, isNull); - int tmp = utf8::idb_strcoll(greatestStr.c_str(), str1.c_str()); - - if ( tmp < 0 ) - -// if ( greatestStr < str1 ) + if (cs->strnncoll(greatestStr.c_str(), greatestStr.length(), str1.c_str(), str1.length()) < 0) + { greatestStr = str1; + } } return greatestStr; diff --git a/utils/funcexp/func_if.cpp b/utils/funcexp/func_if.cpp index ef2827bbe..6d10d6829 100644 --- a/utils/funcexp/func_if.cpp +++ b/utils/funcexp/func_if.cpp @@ -53,22 +53,22 @@ bool boolVal(SPTP& parm, Row& row, const string& timeZone) case CalpontSystemCatalog::TEXT: case CalpontSystemCatalog::VARCHAR: ret = (atoi((char*)(parm->data()->getStrVal(timeZone).c_str())) != 0); - + break; case CalpontSystemCatalog::FLOAT: case CalpontSystemCatalog::UFLOAT: ret = (parm->data()->getFloatVal(row, isNull) != 0); - + break; case CalpontSystemCatalog::DOUBLE: case CalpontSystemCatalog::UDOUBLE: ret = (parm->data()->getDoubleVal(row, isNull) != 0); - + break; case CalpontSystemCatalog::LONGDOUBLE: ret = (parm->data()->getLongDoubleVal(row, isNull) != 0); - + break; case CalpontSystemCatalog::DECIMAL: case CalpontSystemCatalog::UDECIMAL: ret = (parm->data()->getDecimalVal(row, isNull).value != 0); - + break; case CalpontSystemCatalog::BIGINT: case CalpontSystemCatalog::SMALLINT: case CalpontSystemCatalog::MEDINT: @@ -83,6 +83,7 @@ bool boolVal(SPTP& parm, Row& row, const string& timeZone) case CalpontSystemCatalog::TIME: default: ret = (parm->data()->getIntVal(row, isNull) != 0); + break; } } diff --git a/utils/funcexp/func_in.cpp b/utils/funcexp/func_in.cpp index 2de6359e7..bd0ae3662 100644 --- a/utils/funcexp/func_in.cpp +++ b/utils/funcexp/func_in.cpp @@ -44,6 +44,8 @@ using namespace logging; #include "utils_utf8.h" using namespace funcexp; 
+#include "collation.h" + namespace { template @@ -52,11 +54,6 @@ inline bool numericEQ(result_t op1, result_t op2) return op1 == op2; } -inline bool strEQ(string op1, string op2) -{ - return utf8::idb_strcoll(op1.c_str(), op2.c_str()) == 0; -} - inline bool getBoolForIn(rowgroup::Row& row, funcexp::FunctionParm& pm, bool& isNull, @@ -273,15 +270,16 @@ inline bool getBoolForIn(rowgroup::Row& row, case execplan::CalpontSystemCatalog::TEXT: { const string& val = pm[0]->data()->getStrVal(row, isNull); - if (isNull) return false; + CHARSET_INFO* cs = pm[0]->data()->resultType().getCharset(); + for (uint32_t i = 1; i < pm.size(); i++) { isNull = false; - - if ( utf8::idb_strcoll(val.c_str(), pm[i]->data()->getStrVal(row, isNull).c_str()) == 0 && !isNull) + const string& str1 = pm[i]->data()->getStrVal(row, isNull); + if (cs->strnncoll(val.c_str(), val.length(), str1.c_str(), str1.length()) == 0 && !isNull) return true; if (isNull && isNotIn) diff --git a/utils/funcexp/func_insert.cpp b/utils/funcexp/func_insert.cpp index 20109c27b..a869ef4bf 100644 --- a/utils/funcexp/func_insert.cpp +++ b/utils/funcexp/func_insert.cpp @@ -37,7 +37,7 @@ using namespace joblist; #include "utf8.h" using namespace utf8; -#define STRCOLL_ENH__ +#include "collation.h" namespace funcexp { @@ -48,69 +48,62 @@ CalpontSystemCatalog::ColType Func_insert::operationType(FunctionParm& fp, Calpo return fp[0]->data()->resultType(); } -string insertStr(const string& src, int pos, int len, const string& targ) -{ - int64_t strLen = static_cast(src.length()); - - if ((pos <= 0) || ((pos - 1) >= strLen)) - return src; - - if ((len < 0) || (len > strLen)) - len = strLen; - - const char* srcptr = src.c_str(); - advance(srcptr, pos - 1, srcptr + strLen); - // srcptr now pointing to where we need to insert targ string - - uint32_t srcPos = srcptr - src.c_str(); - - uint32_t finPos = strLen; - const char* finptr = src.c_str(); - - if ((strLen - (pos - 1 + len)) >= 0) - { - advance(finptr, (pos - 1 + len), 
finptr + strLen); - // finptr now pointing to the end of the string to replace - finPos = finptr - src.c_str(); - } - - string out; - out.reserve(srcPos + targ.length() + strLen - finPos + 1); - out.append( src.c_str(), srcPos ); - out.append( targ.c_str(), targ.length() ); - out.append( src.c_str() + finPos, strLen - finPos ); - - return out; -} - std::string Func_insert::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, execplan::CalpontSystemCatalog::ColType&) { - string tstr; + string src; string tnewstr; - stringValue(fp[0], row, isNull, tstr); + int64_t start, length; + + stringValue(fp[0], row, isNull, src); if (isNull) - { return ""; - } stringValue(fp[3], row, isNull, tnewstr); if (isNull) return ""; - int64_t pos = fp[1]->data()->getIntVal(row, isNull); - + start = fp[1]->data()->getIntVal(row, isNull); if (isNull) return ""; - int64_t len = fp[2]->data()->getIntVal(row, isNull); - + length = fp[2]->data()->getIntVal(row, isNull); if (isNull) return ""; - return insertStr( tstr, pos, len, tnewstr ); + start--; // Because SQL syntax is 1 based and we want 0 based. + + CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset(); + + // binLen represents the number of bytes + int64_t binLen = static_cast<int64_t>(src.length()); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen is number of characters + int64_t strLen = cs->numchars(pos, end); + + // Return the original string if start (0 based by now, so 0 is the first character) isn't within the string. + if ((start < 0) || start >= strLen) + return src; + + if ((length < 0) || (length > strLen)) + length = strLen; + + // Convert start and length from characters to bytes. 
+ start = cs->charpos(pos, end, start); + length = cs->charpos(pos+start, end, length); + + string out; + out.reserve(binLen - length + tnewstr.length() + 1); + + out.append(src.c_str(), start); + out.append(tnewstr.c_str(), tnewstr.length()); + if (binLen - start - length > 0) + out.append(src.c_str() + start + length, binLen - start - length); + + return out; } diff --git a/utils/funcexp/func_instr.cpp b/utils/funcexp/func_instr.cpp index 47f25fd27..dae5fb473 100644 --- a/utils/funcexp/func_instr.cpp +++ b/utils/funcexp/func_instr.cpp @@ -32,6 +32,8 @@ using namespace std; #include "utils_utf8.h" using namespace execplan; +#include "collation.h" + namespace funcexp { CalpontSystemCatalog::ColType Func_instr::operationType( FunctionParm& fp, CalpontSystemCatalog::ColType& resultType ) @@ -42,37 +44,50 @@ CalpontSystemCatalog::ColType Func_instr::operationType( FunctionParm& fp, Calpo return ct; } -size_t Func_instr::in_str(const string& str, const string& substr, size_t start) -{ - // convert both inputs to wide character strings - std::wstring wcstr = utf8::utf8_to_wstring(str); - std::wstring wcsubstr = utf8::utf8_to_wstring(substr); - - if ((str.length() && !wcstr.length()) || - (substr.length() && !wcsubstr.length())) - // this means one or both of the strings had conversion errors to wide character - return 0; - - size_t pos = wcstr.find(wcsubstr, start - 1); - return (pos != string::npos ? 
pos + 1 : 0); -} - int64_t Func_instr::getIntVal(rowgroup::Row& row, FunctionParm& parm, bool& isNull, - CalpontSystemCatalog::ColType&) + CalpontSystemCatalog::ColType& colType) { - uint64_t start = 1; - - if (parm.size() == 3) - start = parm[2]->data()->getIntVal(row, isNull); - - if (isNull || start == 0) + int64_t start = 0; + int64_t start0= 0; + my_match_t match; + + const std::string& str = parm[0]->data()->getStrVal(row, isNull); + if (isNull) + return 0; + const char* s1 = str.c_str(); + uint32_t l1 = (uint32_t)str.length(); + + const std::string& substr =parm[1]->data()->getStrVal(row, isNull); + if (isNull) return 0; - //Bug 5110 : to support utf8 char type, we have to convert and search - return in_str(parm[0]->data()->getStrVal(row, isNull), parm[1]->data()->getStrVal(row, isNull), start); + const char* s2 = substr.c_str(); + uint32_t l2 = (uint32_t)substr.length(); + if (l2 < 1) + return start + 1; + CHARSET_INFO* cs = colType.getCharset(); + + if (parm.size() == 3) + { + start0 = start = parm[2]->data()->getIntVal(row, isNull) - 1; + + if ((start < 0) || (start > l1)) + return 0; + + start = (int64_t)cs->charpos(s1, s1+l1, start); // adjust start for multi-byte + + if (start + l2 > l1) // Substring is longer than str at pos. 
+ return 0; + } + + if (!cs->instr(s1+start, l1-start, + s2, l2, + &match, 1)) + return 0; + return (int64_t)match.mb_len + start0 + 1; } diff --git a/utils/funcexp/func_lcase.cpp b/utils/funcexp/func_lcase.cpp index ae399c986..073c92f40 100644 --- a/utils/funcexp/func_lcase.cpp +++ b/utils/funcexp/func_lcase.cpp @@ -35,6 +35,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + class to_lower { public: @@ -56,31 +58,22 @@ CalpontSystemCatalog::ColType Func_lcase::operationType(FunctionParm& fp, Calpon std::string Func_lcase::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& colType) { -// string str = fp[0]->data()->getStrVal(row, isNull); - -// transform (str.begin(), str.end(), str.begin(), to_lower()); - const string& tstr = fp[0]->data()->getStrVal(row, isNull); if (isNull) return ""; - size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; - wchar_t* wcbuf = new wchar_t[strwclen]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen); - wstring wstr(wcbuf, strwclen); + CHARSET_INFO* cs = colType.getCharset(); + uint64_t inLen = tstr.length(); + uint64_t bufLen= inLen * cs->casedn_multiply; + char* outBuf = new char[bufLen]; + + uint64_t outLen = cs->casedn(tstr.c_str(), inLen, outBuf, bufLen); - for (uint32_t i = 0; i < strwclen; i++) - wstr[i] = std::towlower(wstr[i]); - - size_t strmblen = utf8::idb_wcstombs(0, wstr.c_str(), 0) + 1; - char* outbuf = new char[strmblen]; - strmblen = utf8::idb_wcstombs(outbuf, wstr.c_str(), strmblen); - std::string ret(outbuf, strmblen); - delete [] outbuf; - delete [] wcbuf; + string ret = string(outBuf, outLen); + delete [] outBuf; return ret; } diff --git a/utils/funcexp/func_least.cpp b/utils/funcexp/func_least.cpp index 5f97ee892..f8256ac60 100644 --- a/utils/funcexp/func_least.cpp +++ b/utils/funcexp/func_least.cpp @@ -40,6 +40,8 @@ using namespace 
joblist; #include "utils_utf8.h" using namespace funcexp; +#include "collation.h" + class to_lower { public: @@ -127,17 +129,16 @@ std::string Func_least::getStrVal(rowgroup::Row& row, execplan::CalpontSystemCatalog::ColType& op_ct) { string leastStr = fp[0]->data()->getStrVal(row, isNull); + CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset(); for (uint32_t i = 1; i < fp.size(); i++) { const string& str1 = fp[i]->data()->getStrVal(row, isNull); - int tmp = utf8::idb_strcoll(leastStr.c_str(), str1.c_str()); - - if ( tmp > 0 ) - -// if ( leastStr > str1 ) + if (cs->strnncoll(leastStr.c_str(), leastStr.length(), str1.c_str(), str1.length()) > 0) + { leastStr = str1; + } } return leastStr; diff --git a/utils/funcexp/func_left.cpp b/utils/funcexp/func_left.cpp index 3fc0ea403..6c8bad1e2 100644 --- a/utils/funcexp/func_left.cpp +++ b/utils/funcexp/func_left.cpp @@ -20,7 +20,6 @@ * * ****************************************************************************/ - #include using namespace std; @@ -35,6 +34,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + namespace funcexp { @@ -48,36 +49,34 @@ CalpontSystemCatalog::ColType Func_left::operationType(FunctionParm& fp, Calpont std::string Func_left::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - const string& tstr = fp[0]->data()->getStrVal(row, isNull); - + CHARSET_INFO* cs = type.getCharset(); + // The original string + const string& src = fp[0]->data()->getStrVal(row, isNull); if (isNull) return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; - size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; - wchar_t* wcbuf = new wchar_t[strwclen]; - strwclen = utf8::idb_mbstowcs(wcbuf, 
tstr.c_str(), strwclen); - wstring str(wcbuf, strwclen); - - int64_t pos = fp[1]->data()->getIntVal(row, isNull) - 1; - - if (isNull) + size_t trimLength = fp[1]->data()->getUintVal(row, isNull); + if (isNull || trimLength <= 0) return ""; - if (pos == -1) // pos == 0 - return ""; + size_t charPos; - wstring out = str.substr(0, pos + 1); - size_t strmblen = utf8::idb_wcstombs(0, out.c_str(), 0) + 1; - char* outbuf = new char[strmblen]; - strmblen = utf8::idb_wcstombs(outbuf, out.c_str(), strmblen); - std::string ret(outbuf, strmblen); - delete [] outbuf; - delete [] wcbuf; + if ((binLen <= trimLength) || + (binLen <= (charPos= cs->charpos(pos, end, trimLength)))) + { + return src; + } + + std::string ret(pos, charPos); return ret; - -// return str.substr(0, pos+1); } diff --git a/utils/funcexp/func_length.cpp b/utils/funcexp/func_length.cpp index dbcf8eaac..98e941ab1 100644 --- a/utils/funcexp/func_length.cpp +++ b/utils/funcexp/func_length.cpp @@ -33,6 +33,8 @@ using namespace execplan; #include "rowgroup.h" +#include "collation.h" + namespace funcexp { CalpontSystemCatalog::ColType Func_length::operationType( FunctionParm& fp, CalpontSystemCatalog::ColType& resultType ) diff --git a/utils/funcexp/func_lpad.cpp b/utils/funcexp/func_lpad.cpp index 8a40f21d2..d66fe2541 100644 --- a/utils/funcexp/func_lpad.cpp +++ b/utils/funcexp/func_lpad.cpp @@ -20,6 +20,7 @@ * * ****************************************************************************/ + #include "errorids.h" #include using namespace std; @@ -35,7 +36,7 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; -#define STRCOLL_ENH__ +#include "collation.h" namespace funcexp { @@ -52,191 +53,80 @@ CalpontSystemCatalog::ColType Func_lpad::operationType(FunctionParm& fp, Calpont std::string Func_lpad::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - unsigned i; - // The number of 
characters (not bytes) in our input str. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in our pad str. - size_t padwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); - // The result length in number of characters - size_t len = 0; - - switch (fp[1]->data()->resultType().colDataType) + // In the case where someone entered pad length as a quoted string, + // it may be interpreted by columnstore to be an actual string + // and stored in fResult.int as a htonl of that string, + // However fResult.double is always correct, so we'll use that. 
+ size_t padLength = (size_t)fp[1]->data()->getDoubleVal(row, isNull); + if (isNull || padLength <= 0) + return ""; + if (padLength > (size_t)INT_MAX32) + padLength = (size_t)INT_MAX32; + + if (padLength < strLen) { - case execplan::CalpontSystemCatalog::BIGINT: - case execplan::CalpontSystemCatalog::INT: - case execplan::CalpontSystemCatalog::MEDINT: - case execplan::CalpontSystemCatalog::TINYINT: - case execplan::CalpontSystemCatalog::SMALLINT: - { - len = fp[1]->data()->getIntVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::UBIGINT: - case execplan::CalpontSystemCatalog::UINT: - case execplan::CalpontSystemCatalog::UMEDINT: - case execplan::CalpontSystemCatalog::UTINYINT: - case execplan::CalpontSystemCatalog::USMALLINT: - { - len = fp[1]->data()->getUintVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::FLOAT: - case execplan::CalpontSystemCatalog::UFLOAT: - case execplan::CalpontSystemCatalog::DOUBLE: - case execplan::CalpontSystemCatalog::UDOUBLE: - case execplan::CalpontSystemCatalog::DECIMAL: - case execplan::CalpontSystemCatalog::UDECIMAL: - { - double value = fp[1]->data()->getDoubleVal(row, isNull); - - if (value > 0) - value += 0.5; - else if (value < 0) - value -= 0.5; - - int64_t ret = (int64_t) value; - - if (value > (double) numeric_limits::max()) - ret = numeric_limits::max(); - else if (value < (double) (numeric_limits::min() + 2)) - ret = numeric_limits::min() + 2; // IDB min for bigint - - len = ret; - } - break; - - case execplan::CalpontSystemCatalog::CHAR: - case execplan::CalpontSystemCatalog::VARCHAR: - { - const string& strval = fp[1]->data()->getStrVal(row, isNull); - len = strtol(strval.c_str(), NULL, 10); - break; - } - - default: - { - std::ostringstream oss; - oss << "lpad parameter 2 must be numeric, not " << execplan::colDataTypeToString(fp[1]->data()->resultType().colDataType); - throw logging::IDBExcept(oss.str(), logging::ERR_DATATYPE_NOT_SUPPORT); - - } + binLen = cs->charpos(pos, end, 
padLength); + std::string ret(pos, binLen); + return ret; } - if (len < 1) - return ""; - - // MCOL-2182 As of MariaDB 10.3 the third parameter - pad characters - is optional // The pad characters. - const string* pad = &fPad; + const string* pad = &fPad; // Defaults to space if (fp.size() > 2) { pad = &fp[2]->data()->getStrVal(row, isNull); } + // binPLen represents the number of bytes in pad + size_t binPLen = pad->length(); + const char* posP = pad->c_str(); + // plen = the number of characters in pad + size_t plen = cs->numchars(posP, posP+binPLen); + if (plen == 0) + return src; - if (isNull) - return ""; + size_t byteCount = (padLength+1) * cs->mbmaxlen; // absolute maximun number of bytes + char* buf = new char[byteCount]; + char* pBuf = buf; - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - size_t alen = len; - - if (strwclen > len) - alen = strwclen; - - size_t bufsize = alen + 1; - - // Convert to wide characters. 
Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - size_t strSize = strwclen; // The number of significant characters - const wchar_t* pWChar = wcbuf; - - for (i = 0; *pWChar != '\0' && i < strwclen; ++pWChar, ++i) + padLength -= strLen; + byteCount = 0; + + while (padLength >= plen) { + memcpy(pBuf, posP, binPLen); + padLength -= plen; + byteCount += binPLen; + pBuf += binPLen; } - - strSize = i; - - // If the incoming str is exactly the len of the result str, - // return the original - if (strSize == len) + // Sometimes, in a case with multi-char pad, we need to add a partial pad + if (padLength > 0) { - return tstr; + size_t partialSize = cs->charpos(posP, posP+binPLen, padLength); + memcpy(pBuf, posP, partialSize); + byteCount += partialSize; + pBuf += partialSize; } - - // If the incoming str is too big for the result str - // truncate the widechar buffer and return as a string - if (strSize > len) - { - // Trim the excess length of the buffer - wstring trimmed = wstring(wcbuf, len); - return utf8::wstring_to_utf8(trimmed.c_str()); - } - - // This is the case where there's room to pad. - - // Convert the pad string to wide - padwclen = pad->length(); // A guess to start. - size_t padbufsize = padwclen + 1; - wchar_t* wcpad = new wchar_t[padbufsize]; - // padwclen+1 is for giving count for the terminating null - size_t padlen = utf8::idb_mbstowcs(wcpad, pad->c_str(), padwclen + 1); - - // How many chars do we need? - size_t padspace = len - strSize; - - // Shift the contents of wcbuf to the right. 
- wchar_t* startofstr = wcbuf + padspace; - - // Move the original string to the right to make room for the pad chars - // Testing has shown that this loop is faster than memmove - wchar_t* newchar = wcbuf + len; // Last spot to put a char in buf - wchar_t* pChar = wcbuf + strSize; // terminal NULL of our str - - while (pChar >= wcbuf) - { - *newchar-- = *pChar--; - } - - // Fill in the front of the buffer with the pad chars - wchar_t* firstpadchar = wcbuf; - - for (wchar_t* pch = wcbuf; pch < startofstr && padlen > 0;) - { - // Truncate the number of fill chars if running out of space - if (padlen > padspace) - { - padlen = padspace; - } - - // Move the fill chars to buffer - for (wchar_t* padchar = wcpad; padchar < wcpad + padlen; ++padchar) - { - *firstpadchar++ = *padchar; - } - - padspace -= padlen; - pch += padlen; - } - - wstring padded = wstring(wcbuf, len); - // Turn back to a string - std::string ret(utf8::wstring_to_utf8(padded.c_str())); - delete [] wcpad; - delete [] wcbuf; + memcpy(pBuf, pos, binLen); + byteCount += binLen; + + std::string ret(buf, byteCount); + delete [] buf; return ret; } diff --git a/utils/funcexp/func_ltrim.cpp b/utils/funcexp/func_ltrim.cpp index 77db579b8..5af7dd3b9 100644 --- a/utils/funcexp/func_ltrim.cpp +++ b/utils/funcexp/func_ltrim.cpp @@ -35,6 +35,7 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" namespace funcexp { @@ -47,89 +48,56 @@ CalpontSystemCatalog::ColType Func_ltrim::operationType(FunctionParm& fp, Calpon std::string Func_ltrim::getStrVal(rowgroup::Row& row, - FunctionParm& fp, - bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + FunctionParm& fp, + bool& isNull, + execplan::CalpontSystemCatalog::ColType& type) { - // The number of characters (not bytes) in our input tstr. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. 
- size_t strwclen; - // this holds the number of characters (not bytes) in ourtrim tstr. - size_t trimwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); // The trim characters. const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " "); + // binTLen represents the number of bytes in trim + size_t binTLen = trim.length(); + const char* posT = trim.c_str(); + // strTLen = the number of characters in trim + size_t strTLen = cs->numchars(posT, posT+binTLen); + if (strTLen == 0 || strTLen > strLen) + return src; - if (isNull) - return ""; - - if (tstr.empty() || tstr.length() == 0) - return tstr; - - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than: - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - int bufsize = strwclen + 1; - - // Convert the string to wide characters. Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - // idb_mbstowcs can return -1 if there is bad mbs char in tstr - if (strwclen == static_cast(-1)) - strwclen = 0; - - // Convert the trim string to wide - trimwclen = trim.length(); // A guess to start. 
- int trimbufsize = trimwclen + 1; - wchar_t* wctrim = new wchar_t[trimbufsize]; - size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1); - - // idb_mbstowcs can return -1 if there is bad mbs char in tstr - if (trimlen == static_cast(-1)) - trimlen = 0; - - size_t trimCmpLen = trimlen * sizeof(wchar_t); - - const wchar_t* oPtr = wcbuf; // To remember the start of the string - const wchar_t* aPtr = oPtr; - const wchar_t* aEnd = wcbuf + strwclen - 1; - - if (trimlen > 0) + if (binTLen == 1) { - if (trimlen == 1) + // If the trim string is 1 byte, don't waste cpu for memcmp + while (pos < end && *pos == *posT) { - // If trim is a single char, then don't spend the overhead for memcmp. - wchar_t chr = wctrim[0]; - - while (aPtr <= aEnd && *aPtr == chr) - aPtr++; - } - else - { - aEnd -= (trimlen - 1); // So we don't compare past the end of the string. - - while (aPtr <= aEnd && !memcmp(aPtr, wctrim, trimCmpLen)) - aPtr += trimlen; + ++pos; + --binLen; + } + } + else + { + while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0) + { + pos += binTLen; + binLen -= binTLen; } } - - // Bug 5110 - error in allocating enough memory for utf8 chars - size_t aLen = strwclen - (aPtr - oPtr); - wstring trimmed = wstring(aPtr, aLen); // Turn back to a string - std::string ret(utf8::wstring_to_utf8(trimmed.c_str())); - delete [] wctrim; - delete [] wcbuf; + std::string ret(pos, binLen); return ret; } - } // namespace funcexp // vim:ts=4 sw=4: diff --git a/utils/funcexp/func_nullif.cpp b/utils/funcexp/func_nullif.cpp index 3af8a7d4b..ca835e5bc 100644 --- a/utils/funcexp/func_nullif.cpp +++ b/utils/funcexp/func_nullif.cpp @@ -22,6 +22,11 @@ * ****************************************************************************/ +//#include +//#undef set_bits // mariadb.h defines set_bits, which is incompatible with boost +//#undef LONGLONG_MIN +//#include + #include #include #include @@ -45,6 +50,8 @@ using namespace dataconvert; #include "utils_utf8.h" using namespace 
funcexp; +#include "collation.h" + namespace funcexp { @@ -363,6 +370,7 @@ string Func_nullif::getStrVal(rowgroup::Row& row, CalpontSystemCatalog::ColType& op_ct) { string exp1 = parm[0]->data()->getStrVal(row, isNull); + CHARSET_INFO* cs = parm[0]->data()->resultType().getCharset(); if (isNull) { @@ -395,7 +403,7 @@ string Func_nullif::getStrVal(rowgroup::Row& row, exp2 = exp2 + " 00:00:00"; } - if ( utf8::idb_strcoll(exp1.c_str(), exp2.c_str()) == 0 ) + if (cs->strnncoll(exp1.c_str(), exp1.length(), exp2.c_str(), exp2.length()) == 0) { isNull = true; return ""; diff --git a/utils/funcexp/func_repeat.cpp b/utils/funcexp/func_repeat.cpp index 65a6b45e2..2a14bbdc8 100644 --- a/utils/funcexp/func_repeat.cpp +++ b/utils/funcexp/func_repeat.cpp @@ -59,7 +59,7 @@ CalpontSystemCatalog::ColType Func_repeat::operationType(FunctionParm& fp, Calpo std::string Func_repeat::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType& op_ct) + execplan::CalpontSystemCatalog::ColType& type) { string str; diff --git a/utils/funcexp/func_replace.cpp b/utils/funcexp/func_replace.cpp index 549e5f2a3..0c16c9174 100644 --- a/utils/funcexp/func_replace.cpp +++ b/utils/funcexp/func_replace.cpp @@ -34,6 +34,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + namespace funcexp { diff --git a/utils/funcexp/func_reverse.cpp b/utils/funcexp/func_reverse.cpp index 6ef4fcf1f..848da8041 100644 --- a/utils/funcexp/func_reverse.cpp +++ b/utils/funcexp/func_reverse.cpp @@ -34,8 +34,6 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; -#define STRCOLL_ENH__ - namespace { diff --git a/utils/funcexp/func_right.cpp b/utils/funcexp/func_right.cpp index f7a21faed..cfcb3c45f 100644 --- a/utils/funcexp/func_right.cpp +++ b/utils/funcexp/func_right.cpp @@ -35,6 +35,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + 
namespace funcexp { @@ -48,42 +50,33 @@ CalpontSystemCatalog::ColType Func_right::operationType(FunctionParm& fp, Calpon std::string Func_right::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - const string& tstr = fp[0]->data()->getStrVal(row, isNull); - + CHARSET_INFO* cs = type.getCharset(); + // The original string + const string& src = fp[0]->data()->getStrVal(row, isNull); if (isNull) return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; - int64_t pos = fp[1]->data()->getIntVal(row, isNull); - - if (isNull) + size_t trimLength = fp[1]->data()->getUintVal(row, isNull); + if (isNull || trimLength <= 0) return ""; - if (pos == -1) // pos == 0 - return ""; + size_t start = cs->numchars(pos, end); // Here, start is number of characters in src + if (start <= trimLength) + return src; + start = cs->charpos(pos, end, start - trimLength); // Here, start becomes number of bytes into src to start copying - size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; - //wchar_t wcbuf[strwclen]; - wchar_t* wcbuf = new wchar_t[strwclen]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen); - wstring str(wcbuf, strwclen); - - if ( (unsigned) pos >= strwclen ) - pos = strwclen; - - wstring out = str.substr(strwclen - pos, strwclen); - size_t strmblen = utf8::idb_wcstombs(0, out.c_str(), 0) + 1; - //char outbuf[strmblen]; - char* outbuf = new char[strmblen]; - strmblen = utf8::idb_wcstombs(outbuf, out.c_str(), strmblen); - std::string ret(outbuf, strmblen); - delete [] outbuf; - delete [] wcbuf; + std::string ret(pos+start, binLen-start); return ret; } - } // namespace funcexp // vim:ts=4 sw=4: diff --git a/utils/funcexp/func_rpad.cpp b/utils/funcexp/func_rpad.cpp index 37e1d8791..1a8841f34 100644 
--- a/utils/funcexp/func_rpad.cpp +++ b/utils/funcexp/func_rpad.cpp @@ -35,7 +35,7 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; -#define STRCOLL_ENH__ +#include "collation.h" namespace funcexp { @@ -51,179 +51,81 @@ CalpontSystemCatalog::ColType Func_rpad::operationType(FunctionParm& fp, Calpont std::string Func_rpad::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - unsigned i; - // The number of characters (not bytes) in our input str. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in our pad str. - size_t padwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); - // The result length in number of characters - size_t len = 0; - - switch (fp[1]->data()->resultType().colDataType) + // In the case where someone entered pad length as a quoted string, + // it may be interpreted by columnstore to be an actual string + // and stored in fResult.int as a htonl of that string, + // However fResult.double is always correct, so we'll use that. 
+ size_t padLength = (size_t)fp[1]->data()->getDoubleVal(row, isNull); + if (isNull || padLength <= 0) + return ""; + if (padLength > (size_t)INT_MAX32) + padLength = (size_t)INT_MAX32; + + if (padLength < strLen) { - case execplan::CalpontSystemCatalog::BIGINT: - case execplan::CalpontSystemCatalog::INT: - case execplan::CalpontSystemCatalog::MEDINT: - case execplan::CalpontSystemCatalog::TINYINT: - case execplan::CalpontSystemCatalog::SMALLINT: - { - len = fp[1]->data()->getIntVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::UBIGINT: - case execplan::CalpontSystemCatalog::UINT: - case execplan::CalpontSystemCatalog::UMEDINT: - case execplan::CalpontSystemCatalog::UTINYINT: - case execplan::CalpontSystemCatalog::USMALLINT: - { - len = fp[1]->data()->getUintVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::FLOAT: - case execplan::CalpontSystemCatalog::UFLOAT: - case execplan::CalpontSystemCatalog::DOUBLE: - case execplan::CalpontSystemCatalog::UDOUBLE: - case execplan::CalpontSystemCatalog::DECIMAL: - case execplan::CalpontSystemCatalog::UDECIMAL: - { - double value = fp[1]->data()->getDoubleVal(row, isNull); - - if (value > 0) - value += 0.5; - else if (value < 0) - value -= 0.5; - else if (value < 0) - value -= 0.5; - - int64_t ret = (int64_t) value; - - if (value > (double) numeric_limits::max()) - ret = numeric_limits::max(); - else if (value < (double) (numeric_limits::min() + 2)) - ret = numeric_limits::min() + 2; // IDB min for bigint - - len = ret; - } - break; - - case execplan::CalpontSystemCatalog::CHAR: - case execplan::CalpontSystemCatalog::VARCHAR: - { - const string& strval = fp[1]->data()->getStrVal(row, isNull); - len = strtol(strval.c_str(), NULL, 10); - break; - } - - default: - { - std::ostringstream oss; - oss << "lpad parameter 2 must be numeric, not " << execplan::colDataTypeToString(fp[1]->data()->resultType().colDataType); - throw logging::IDBExcept(oss.str(), logging::ERR_DATATYPE_NOT_SUPPORT); 
- } + binLen = cs->charpos(pos, end, padLength); + std::string ret(pos, binLen); + return ret; } - if (len < 1) - return ""; - // The pad characters. - // MCOL-2182 As of MariaDB 10.3 the third parameter - pad characters - is optional const string* pad = &fPad; if (fp.size() > 2) { pad = &fp[2]->data()->getStrVal(row, isNull); } + // binPLen represents the number of bytes in pad + size_t binPLen = pad->length(); + const char* posP = pad->c_str(); + // plen = the number of characters in pad + size_t plen = cs->numchars(posP, posP+binPLen); + if (plen == 0 || plen > strLen) + return src; - if (isNull) - return ""; + size_t byteCount = (padLength+1) * cs->mbmaxlen; // absolute maximum number of bytes + char* buf = new char[byteCount]; + char* pBuf = buf; - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than: - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - int alen = len; + byteCount = 0; + + memcpy(pBuf, pos, binLen); + byteCount += binLen; + padLength -= strLen; + pBuf += binLen; - if (strwclen > len) - alen = strwclen; - - int bufsize = alen + 1; - - // Convert to wide characters.
Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - unsigned int strSize = strwclen; // The number of significant characters - const wchar_t* pWChar = wcbuf; - - for (i = 0; *pWChar != '\0' && i < strwclen; ++pWChar, ++i) + while (padLength >= plen) { + memcpy(pBuf, posP, binPLen); + padLength -= plen; + byteCount += binPLen; + pBuf += binPLen; } - - strSize = i; - - // If the incoming str is exactly the len of the result str, - // return the original - if (strSize == len) + // Sometimes, in a case with multi-char pad, we need to add a partial pad + if (padLength > 0) { - return tstr; + size_t partialSize = cs->charpos(posP, posP+plen, padLength); + memcpy(pBuf, posP, partialSize); + byteCount += partialSize; } - - // If the incoming str is too big for the result str - // truncate the widechar buffer and return as a string - if (strSize > len) - { - // Trim the excess length of the buffer - wstring trimmed = wstring(wcbuf, len); - return utf8::wstring_to_utf8(trimmed.c_str()); - } - - // This is the case where there's room to pad. - - // Convert the pad string to wide - padwclen = pad->length(); // A guess to start. - int padbufsize = padwclen + 1; - wchar_t* wcpad = new wchar_t[padbufsize]; - size_t padlen = utf8::idb_mbstowcs(wcpad, pad->c_str(), padwclen + 1); - - // How many chars do we need? 
- unsigned int padspace = len - strSize; - - // Fill in the back of the buffer - wchar_t* firstpadchar = wcbuf + strSize; - - for (wchar_t* pch = wcbuf; pch < wcbuf + len && padlen > 0;) - { - // Truncate the number of fill chars if running out of space - if (padlen > padspace) - { - padlen = padspace; - } - - // Move the fill chars to buffer - for (wchar_t* padchar = wcpad; padchar < wcpad + padlen; ++padchar) - { - *firstpadchar++ = *padchar; - } - - padspace -= padlen; - pch += padlen; - } - - wstring padded = wstring(wcbuf, len); - - // Bug 5110 : strings were getting truncated since enough bytes not allocated. - std::string ret(utf8::wstring_to_utf8(padded.c_str())); - delete [] wcpad; - delete [] wcbuf; + + std::string ret(buf, byteCount); + delete [] buf; return ret; } diff --git a/utils/funcexp/func_rtrim.cpp b/utils/funcexp/func_rtrim.cpp index 513567d6d..77dc22424 100644 --- a/utils/funcexp/func_rtrim.cpp +++ b/utils/funcexp/func_rtrim.cpp @@ -35,6 +35,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + namespace funcexp { @@ -46,95 +48,118 @@ CalpontSystemCatalog::ColType Func_rtrim::operationType(FunctionParm& fp, Calpon std::string Func_rtrim::getStrVal(rowgroup::Row& row, - FunctionParm& fp, - bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + FunctionParm& fp, + bool& isNull, + execplan::CalpontSystemCatalog::ColType& type) { - // The number of characters (not bytes) in our input tstr. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in ourtrim tstr. 
- size_t trimwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); // The trim characters. const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " "); + // binTLen represents the number of bytes in trim + size_t binTLen = trim.length(); + const char* posT = trim.c_str(); + // strTLen = the number of characters in trim + size_t strTLen = cs->numchars(posT, posT+binTLen); + if (strTLen == 0 || strTLen > strLen) + return src; - if (isNull) - return ""; - - if (tstr.empty() || tstr.length() == 0) - return tstr; - - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than: - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - int bufsize = strwclen + 1; - - // Convert the string to wide characters. Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - // utf8::idb_mbstowcs could return -1 if there is bad chars - if (strwclen == static_cast(-1)) - strwclen = 0; - - // Convert the trim string to wide - trimwclen = trim.length(); // A guess to start. 
- int trimbufsize = trimwclen + 1; - wchar_t* wctrim = new wchar_t[trimbufsize]; - size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1); - - // idb_mbstowcs could return -1 if there is bad chars - if (trimlen == static_cast(-1)) - trimlen = 0; - - size_t trimCmpLen = trimlen * sizeof(wchar_t); - - const wchar_t* oPtr = wcbuf; // To remember the start of the string - const wchar_t* aPtr = oPtr; - const wchar_t* aEnd = wcbuf + strwclen - 1; - size_t trimCnt = 0; - - if (trimlen > 0) + if (binTLen == 1) { - if (trimlen == 1) + const char* ptr = pos; + if (cs->use_mb()) // This is a multi-byte charset { - // If trim is a single char, then don't spend the overhead for memcmp. - wchar_t chr = wctrim[0]; - - while (aEnd >= aPtr && *aEnd == chr) + const char* p = pos; + uint32 l; + // Multibyte characters in the string give us alignment problems + // What we do here is skip past any multibyte characters. When + // done with this loop, ptr is pointing to a singlebyte char that + // is after all multibyte chars in the string, or to end. + while (ptr < end) { - --aEnd; - ++trimCnt; + if ((l = my_ismbchar(cs, ptr, end))) // returns the number of bytes in the leading char or zero if one byte + { + ptr += l; + p = ptr; + } + else + { + ++ptr; + } + } + ptr = p; + } + while (ptr < end && end[-1] == *posT) + { + --end; + --binLen; + } + } + else + { + // An uncommon case where the space character is > 1 byte + if (cs->use_mb()) // This is a multi-byte charset + { + // The problem is that the byte pattern at the end could + // match memcmp, but not be correct since the first byte compared + // may actually be a second or later byte from a previous char. + + // We start at the beginning of the string and move forward + // one character at a time until we reach the end. Then we can + // safely compare and remove one character. Then back to the beginning + // and try again.
+ while (end - binTLen >= pos) + { + const char* p = pos; + uint32 l; + while (p + binTLen < end) + { + if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte + p += l; + else + ++p; + } + if (p + binTLen == end && memcmp(p,posT,binTLen) == 0) + { + end -= binTLen; + binLen -= binTLen; + } + else + { + break; // We've run out of places to look + } } } else { - aEnd -= (trimlen - 1); // So we don't compare past the end of the string. - - while (aPtr <= aEnd && !memcmp(aEnd, wctrim, trimCmpLen)) + // This implies we have a single byte charset and a multibyte + // space character. + // Should never get here, since rtrim only trims space characters + // Included for completeness. + while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0) { - aEnd -= trimCmpLen; - trimCnt += trimlen; + end -= binTLen; + binLen -= binTLen; } } } - - size_t aLen = strwclen - trimCnt; - wstring trimmed = wstring(aPtr, aLen); // Turn back to a string - std::string ret(utf8::wstring_to_utf8(trimmed.c_str())); - delete [] wctrim; - delete [] wcbuf; + std::string ret(pos, binLen); return ret; } - } // namespace funcexp // vim:ts=4 sw=4: diff --git a/utils/funcexp/func_strcmp.cpp b/utils/funcexp/func_strcmp.cpp index a0f7a8930..743903f42 100644 --- a/utils/funcexp/func_strcmp.cpp +++ b/utils/funcexp/func_strcmp.cpp @@ -39,6 +39,12 @@ using namespace joblist; #include "utils_utf8.h" using namespace funcexp; +#include "collation.h" + +// Because including my_sys.h in a Columnstore header causes too many conflicts +struct charset_info_st; +typedef const struct charset_info_st CHARSET_INFO; + class to_lower { public: @@ -62,12 +68,13 @@ CalpontSystemCatalog::ColType Func_strcmp::operationType(FunctionParm& fp, Calpo int64_t Func_strcmp::getIntVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType& op_ct) + execplan::CalpontSystemCatalog::ColType& type) { + CHARSET_INFO* cs = 
fp[0]->data()->resultType().getCharset(); const string& str = fp[0]->data()->getStrVal(row, isNull); - const string& str1 = fp[1]->data()->getStrVal(row, isNull); - int ret = utf8::idb_strcoll(str.c_str(), str1.c_str()); + + int ret = cs->strnncoll(str.c_str(), str.length(), str1.c_str(), str1.length()); // mysql's strcmp returns only -1, 0, and 1 return (ret < 0 ? -1 : (ret > 0 ? 1 : 0)); } @@ -76,9 +83,9 @@ int64_t Func_strcmp::getIntVal(rowgroup::Row& row, std::string Func_strcmp::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType& op_ct) + execplan::CalpontSystemCatalog::ColType& type) { - uint64_t val = getIntVal(row, fp, isNull, op_ct); + uint64_t val = getIntVal(row, fp, isNull, type); if (val > 0) return string("1"); diff --git a/utils/funcexp/func_substr.cpp b/utils/funcexp/func_substr.cpp index e99287af8..3f8be7373 100644 --- a/utils/funcexp/func_substr.cpp +++ b/utils/funcexp/func_substr.cpp @@ -35,6 +35,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + #define STRCOLL_ENH__ namespace funcexp diff --git a/utils/funcexp/func_substring_index.cpp b/utils/funcexp/func_substring_index.cpp index a3b5c342e..970b3d28c 100644 --- a/utils/funcexp/func_substring_index.cpp +++ b/utils/funcexp/func_substring_index.cpp @@ -21,7 +21,6 @@ * * ****************************************************************************/ - #include using namespace std; @@ -35,7 +34,7 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; -#define STRCOLL_ENH__ +#include "collation.h" namespace funcexp { diff --git a/utils/funcexp/func_trim.cpp b/utils/funcexp/func_trim.cpp index 83cbe6707..2a522d3fb 100644 --- a/utils/funcexp/func_trim.cpp +++ b/utils/funcexp/func_trim.cpp @@ -35,6 +35,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + namespace funcexp { CalpontSystemCatalog::ColType 
Func_trim::operationType(FunctionParm& fp, CalpontSystemCatalog::ColType& resultType) @@ -47,106 +49,124 @@ CalpontSystemCatalog::ColType Func_trim::operationType(FunctionParm& fp, Calpont std::string Func_trim::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - // The number of characters (not bytes) in our input tstr. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in ourtrim tstr. - size_t trimwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); // The trim characters. const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " "); + // binTLen represents the number of bytes in trim + size_t binTLen = trim.length(); + const char* posT = trim.c_str(); + // strTLen = the number of characters in trim + size_t strTLen = cs->numchars(posT, posT+binTLen); + if (strTLen == 0 || strTLen > strLen) + return src; - if (isNull) - return ""; - - if (tstr.empty() || tstr.length() == 0) - return tstr; - - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than: - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - int bufsize = strwclen + 1; - - // Convert the string to wide characters. 
Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - // Bad char in mbc can return -1 - if (strwclen == static_cast(-1)) - strwclen = 0; - - // Convert the trim string to wide - trimwclen = trim.length(); // A guess to start. - int trimbufsize = trimwclen + 1; - wchar_t* wctrim = new wchar_t[trimbufsize]; - size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1); - - // Bad char in mbc can return -1 - if (trimlen == static_cast(-1)) - trimlen = 0; - - size_t trimCmpLen = trimlen * sizeof(wchar_t); - - const wchar_t* oPtr = wcbuf; // To remember the start of the string - const wchar_t* aPtr = oPtr; - const wchar_t* aEnd = wcbuf + strwclen - 1; - size_t trimCnt = 0; - - if (trimlen > 0) + if (binTLen == 1) { - if (trimlen == 1) + // If the trim string is 1 byte, don't waste cpu for memcmp + // Trim leading + while (pos < end && *pos == *posT) { - // If trim is a single char, then don't spend the overhead for memcmp. - wchar_t chr = wctrim[0]; - - // remove leading - while (aPtr != aEnd && *aPtr == chr) + ++pos; + --binLen; + } + // Trim trailing + const char* ptr = pos; + if (cs->use_mb()) // This is a multi-byte charset + { + const char* p = pos; + uint32 l; + // Multibyte characters in the string give us alignment problems + // What we do here is skip past any multibyte characters. When + // done with this loop, ptr is pointing to a singlebyte char that + // is after all multibyte chars in the string, or to end.
+ while (ptr < end) { - aPtr++; - ++trimCnt; + if ((l = my_ismbchar(cs, ptr, end))) // returns the number of bytes in the leading char or zero if one byte + { + ptr += l; + p = ptr; + } + else + { + ++ptr; + } } - - // remove trailing - while (aEnd != aPtr && *aEnd == chr) + ptr = p; + } + while (ptr < end && end[-1] == *posT) + { + --end; + --binLen; + } + } + else + { + // Trim leading is easy + while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0) + { + pos += binTLen; + binLen -= binTLen; + } + + // Trim trailing + if (cs->use_mb()) // This is a multi-byte charset + { + // The problem is that the byte pattern at the end could + // match memcmp, but not be correct since the first byte compared + // may actually be a second or later byte from a previous char. + + // We start at the beginning of the string and move forward + // one character at a time until we reach the end. Then we can + // safely compare and remove one character. Then back to the beginning + // and try again. + while (end - binTLen >= pos) { - aEnd--; - ++trimCnt; + const char* p = pos; + uint32 l; + while (p + binTLen < end) + { + if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte + p += l; + else + ++p; + } + if (p + binTLen == end && memcmp(p,posT,binTLen) == 0) + { + end -= binTLen; + binLen -= binTLen; + } + else + { + break; // We've run out of places to look + } } } else { - aEnd -= (trimlen - 1); // So we don't compare past the end of the string.
- - // remove leading - while (aPtr <= aEnd && !memcmp(aPtr, wctrim, trimCmpLen)) + while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0) { - aPtr += trimlen; - trimCnt += trimlen; - } - - // remove trailing - while (aPtr <= aEnd && !memcmp(aEnd, wctrim, trimCmpLen)) - { - aEnd -= trimlen; //BUG 5241 - trimCnt += trimlen; + end -= binTLen; + binLen -= binTLen; } } } - - // Bug 5110 - error in allocating enough memory for utf8 chars - size_t aLen = strwclen - trimCnt; - wstring trimmed = wstring(aPtr, aLen); // Turn back to a string - std::string ret(utf8::wstring_to_utf8(trimmed.c_str())); - delete [] wctrim; - delete [] wcbuf; + std::string ret(pos, binLen); return ret; } diff --git a/utils/funcexp/func_ucase.cpp b/utils/funcexp/func_ucase.cpp index f032de880..f8d21b8a2 100644 --- a/utils/funcexp/func_ucase.cpp +++ b/utils/funcexp/func_ucase.cpp @@ -35,6 +35,8 @@ using namespace rowgroup; #include "joblisttypes.h" using namespace joblist; +#include "collation.h" + class to_upper { public: @@ -55,31 +57,22 @@ CalpontSystemCatalog::ColType Func_ucase::operationType(FunctionParm& fp, Calpon std::string Func_ucase::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& colType) { -// string str = fp[0]->data()->getStrVal(row, isNull); - -// transform (str.begin(), str.end(), str.begin(), to_lower()); - const string& tstr = fp[0]->data()->getStrVal(row, isNull); if (isNull) return ""; - size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; - wchar_t* wcbuf = new wchar_t[strwclen]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen); - wstring wstr(wcbuf, strwclen); + CHARSET_INFO* cs = colType.getCharset(); + uint64_t inLen = tstr.length(); + uint64_t bufLen= inLen * cs->caseup_multiply; + char* outBuf = new char[bufLen]; + + uint64_t outLen = cs->caseup(tstr.c_str(), inLen, outBuf, bufLen); - for (uint32_t i = 0; i < strwclen; i++) - wstr[i] = 
std::towupper(wstr[i]); - - size_t strmblen = utf8::idb_wcstombs(0, wstr.c_str(), 0) + 1; - char* outbuf = new char[strmblen]; - strmblen = utf8::idb_wcstombs(outbuf, wstr.c_str(), strmblen); - std::string ret(outbuf, strmblen); - delete [] outbuf; - delete [] wcbuf; + string ret = string(outBuf, outLen); + delete [] outBuf; return ret; } diff --git a/utils/funcexp/funcexp.cpp b/utils/funcexp/funcexp.cpp index 5f15ec991..79039c923 100644 --- a/utils/funcexp/funcexp.cpp +++ b/utils/funcexp/funcexp.cpp @@ -44,13 +44,6 @@ using namespace joblist; namespace funcexp { -namespace utf8 -{ -// A global loc object so we don't construct one at every compare -std::locale loc; - -bool JPcodePoint = false; // extern-ed in utils_utf8.h -} /* static */ FuncExp* FuncExp::fInstance = 0; diff --git a/utils/funcexp/funchelpers.h b/utils/funcexp/funchelpers.h index 013b81b0d..8f017d544 100644 --- a/utils/funcexp/funchelpers.h +++ b/utils/funcexp/funchelpers.h @@ -41,8 +41,9 @@ #include "intervalcolumn.h" #include "treenode.h" +#ifndef ULONGLONG_MAX #define ULONGLONG_MAX ulonglong_max - +#endif namespace funcexp { namespace helpers diff --git a/utils/funcexp/functor_int.h b/utils/funcexp/functor_int.h index 3e2f465cb..425e6b485 100644 --- a/utils/funcexp/functor_int.h +++ b/utils/funcexp/functor_int.h @@ -84,8 +84,6 @@ public: execplan::CalpontSystemCatalog::ColType operationType(FunctionParm& fp, execplan::CalpontSystemCatalog::ColType& resultType); - size_t in_str(const std::string& str, const std::string& substr, size_t start); - int64_t getIntVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, diff --git a/utils/funcexp/utils_utf8.h b/utils/funcexp/utils_utf8.h deleted file mode 100644 index 273a853c7..000000000 --- a/utils/funcexp/utils_utf8.h +++ /dev/null @@ -1,303 +0,0 @@ -/* Copyright (C) 2014 InfiniDB, Inc. - * Copyright (C) 2016 MariaDB Corporation. 
- - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - as published by the Free Software Foundation; version 2 of - the License. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - MA 02110-1301, USA. */ - -// $Id$ - - -#ifndef _UTILS_UTF8_H_ -#define _UTILS_UTF8_H_ - - - -#include -#if defined(_MSC_VER) -#include -#include -#elif defined(__FreeBSD__) -//#include -#else -#include -#endif -#include - -#include -#include "liboamcpp.h" - -/** @file */ - -namespace funcexp -{ -namespace utf8 -{ -extern bool JPcodePoint; // code point ordering (Japanese UTF) flag, used in idb_strcoll - -const int MAX_UTF8_BYTES_PER_CHAR = 4; - -// A global loc object so we don't construct one at every compare -extern std::locale loc; -// Is there a way to construct a global reference to a facet? -// const std::collate& coll = std::use_facet >(loc); - -//Infinidb version of strlocale BUG 5362 -//set System Locale "C" by default -//return the system Locale currently set in from Columnstore.xml -inline -std::string idb_setlocale() -{ - // get and set locale language - std::string systemLang("C"); - oam::Oam oam; - static bool loggedMsg = false; - - try - { - oam.getSystemConfig("SystemLang", systemLang); - } - catch (...) 
- { - systemLang = "C"; - } - - char* pLoc = setlocale(LC_ALL, systemLang.c_str()); - - if (pLoc == NULL) - { - try - { - if (!loggedMsg) - { - //send alarm - alarmmanager::ALARMManager alarmMgr; - std::string alarmItem = "system"; - alarmMgr.sendAlarmReport(alarmItem.c_str(), oam::INVALID_LOCALE, alarmmanager::SET); - - // Log one line - logging::LoggingID lid(17); // ProcessManager -- probably the only one to find this for now - logging::MessageLog ml(lid); - logging::Message msg(1); - logging::Message::Args args; - args.add("Failed to set locale "); - args.add(systemLang.c_str()); - args.add(": Setting to 'C'. Critical alarm generated"); - msg.format( args ); - ml.logErrorMessage(msg); - - loggedMsg = true; - } - systemLang = "C"; - } - catch (...) - { - // Ignoring for time being. - } - } - else - { - try - { - //send alarm - alarmmanager::ALARMManager alarmMgr; - std::string alarmItem = "system"; - alarmMgr.sendAlarmReport(alarmItem.c_str(), oam::INVALID_LOCALE, alarmmanager::CLEAR); - } - catch (...) - { - // Ignoring for time being. - } - - } - - printf ("Locale is : %s\n", systemLang.c_str() ); - - //BUG 2991 - setlocale(LC_NUMERIC, "C"); - - if (systemLang.find("ja_JP") != std::string::npos) - JPcodePoint = true; - - // MCOL-1559 Save off the locale to save runtime cpus - std::locale localloc(systemLang.c_str()); - loc = localloc; - - return systemLang; -} - -// Infinidb version of strcoll. BUG 5362 -// strcoll() comparison while ja_JP.utf8 does not give correct results. -// For correct results strcmp() can be used. -inline -int idb_strcoll(const char* str1, const char* str2) -{ - if (JPcodePoint) - return strcmp(str1, str2); - else - return strcoll(str1, str2); -} - -// MCOL-1559 Add a trimmed version of strcoll -// The intent here is to make no copy of the original strings and -// not modify them, so we can't use trim to deal with the spaces. 
-inline -int idb_strtrimcoll(const std::string& str1, const std::string& str2) -{ - static const std::string whitespaces (" "); - const char* s1 = str1.c_str(); - const char* s2 = str2.c_str(); - - // Set found1 to the last non-whitespace char in str1 - std::size_t found1 = str1.find_last_not_of(whitespaces); - // Set found2 to the first whitespace char in str2 - std::size_t found2 = str2.find_last_not_of(whitespaces); - - // Are both strings empty or all whitespace? - if (found1 == std::string::npos && found2 == std::string::npos) - { - return 0; // they match - } - // If str1 is empty or all spaces - if (found1 == std::string::npos) - { - return -1; - } - // If str2 is empty or all spaces - if (found2 == std::string::npos) - { - return 1; - } - - // found1 and found2 point to the character that is not a space. - // compare wants it to point to one past. - found1 += 1; - found2 += 1; - // If no trimming needs doing, then strcoll is faster - if (found1 == str1.size() && found2 == str2.size()) - { - return idb_strcoll(s1, s2); - } - // Compare the (trimmed) strings - const std::collate& coll = std::use_facet >(loc); - int rtn = coll.compare(s1, s1+found1, s2, s2+found2); - return rtn; -} - -// BUG 5241 -// Infinidb specific mbstowcs(). This will handle both windows and unix platforms -// Params dest and max should have enough length to accomodate NULL -inline -size_t idb_mbstowcs(wchar_t* dest, const char* src, size_t max) -{ -#ifdef _MSC_VER - // 4th param (-1) denotes to convert till hit NULL char - // if 6th param max = 0, will return the required buffer size - size_t strwclen = MultiByteToWideChar(CP_UTF8, 0, src, -1, dest, (int)max); - // decrement the count of NULL; will become -1 on failure - return --strwclen; - -#else - return mbstowcs(dest, src, max); -#endif -} - -// BUG 5241 -// Infinidb specific wcstombs(). 
This will handle both windows and unix platforms -// Params dest and max should have enough length to accomodate NULL -inline -size_t idb_wcstombs(char* dest, const wchar_t* src, size_t max) -{ -#ifdef _MSC_VER - // 4th param (-1) denotes to convert till hit NULL char - //if 6th param max = 0, will return the required buffer size - size_t strmblen = WideCharToMultiByte( CP_UTF8, 0, src, -1, dest, (int)max, NULL, NULL); - // decrement the count of NULL; will become -1 on failure - return --strmblen; -#else - return wcstombs(dest, src, max); -#endif -} - -// convert UTF-8 string to wstring -inline -std::wstring utf8_to_wstring (const std::string& str) -{ - size_t bufsize = str.length() + 1; - - // Convert to wide characters. Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - // Passing +1 so that windows is happy to see extra position to place NULL - size_t strwclen = idb_mbstowcs(wcbuf, str.c_str(), str.length() + 1); - - // if result is -1 it means bad characters which may happen if locale is wrong. - // return an empty string - if ( strwclen == static_cast(-1) ) - strwclen = 0; - - std::wstring ret(wcbuf, strwclen); - - delete [] wcbuf; - return ret; -} - - -// convert wstring to UTF-8 string -inline -std::string wstring_to_utf8 (const std::wstring& str) -{ - char* outbuf = new char[(str.length() * MAX_UTF8_BYTES_PER_CHAR) + 1]; - // Passing +1 so that windows is happy to see extra position to place NULL - size_t strmblen = idb_wcstombs(outbuf, str.c_str(), str.length() * MAX_UTF8_BYTES_PER_CHAR + 1); - - // if result is -1 it means bad characters which may happen if locale is wrong. 
- // return an empty string - if ( strmblen == static_cast(-1) ) - strmblen = 0; - - std::string ret(outbuf, strmblen); - - delete [] outbuf; - return ret; -} - -inline -uint8_t utf8_truncate_point(const char* input, size_t length) -{ - // Find the beginning of a multibyte char to truncate at and return the - // number of bytes to truncate - if (length < 3) - { - return 0; - } - - const unsigned char* b = (const unsigned char*)(input) + length - 3; - - if (b[2] & 0x80) - { - // First byte in a new multi-byte sequence - if (b[2] & 0x40) return 1; - // 3 byte sequence - else if ((b[1] & 0xe0) == 0xe0) return 2; - // 4 byte sequence - else if ((b[0] & 0xf0) == 0xf0) return 3; - } - - return 0; -} - -} //namespace utf8 -} //namespace funcexp - -#endif diff --git a/utils/idbdatafile/IDBPolicy.cpp b/utils/idbdatafile/IDBPolicy.cpp index b88c918a5..9d2766fcf 100644 --- a/utils/idbdatafile/IDBPolicy.cpp +++ b/utils/idbdatafile/IDBPolicy.cpp @@ -122,9 +122,9 @@ bool IDBPolicy::isLocalFile( const std::string& path ) { boost::filesystem::path filepath( path ); #ifdef _MSC_VER - size_t strmblen = funcexp::utf8::idb_wcstombs(0, filepath.extension().c_str(), 0) + 1; + size_t strmblen = utf8::idb_wcstombs(0, filepath.extension().c_str(), 0) + 1; char* outbuf = (char*)alloca(strmblen * sizeof(char)); - strmblen = funcexp::utf8::idb_wcstombs(outbuf, filepath.extension().c_str(), strmblen); + strmblen = utf8::idb_wcstombs(outbuf, filepath.extension().c_str(), strmblen); string fileExt(outbuf, strmblen); #else //string fileExt = filepath.extension().c_str(); diff --git a/utils/joiner/CMakeLists.txt b/utils/joiner/CMakeLists.txt index f1555e8bd..a881402bf 100644 --- a/utils/joiner/CMakeLists.txt +++ b/utils/joiner/CMakeLists.txt @@ -4,7 +4,7 @@ include_directories( ${ENGINE_COMMON_INCLUDES} ) ########### next target ############### -set(joiner_LIB_SRCS joiner.cpp tuplejoiner.cpp joinpartition.cpp) +set(joiner_LIB_SRCS tuplejoiner.cpp joinpartition.cpp) add_library(joiner SHARED 
${joiner_LIB_SRCS}) diff --git a/utils/rowgroup/rowaggregation.cpp b/utils/rowgroup/rowaggregation.cpp index 71f43652c..5c02d0c48 100755 --- a/utils/rowgroup/rowaggregation.cpp +++ b/utils/rowgroup/rowaggregation.cpp @@ -49,18 +49,17 @@ #include "funcexp.h" #include "rowaggregation.h" #include "calpontsystemcatalog.h" -//#include "utils_utf8.h" +#include "utils_utf8.h" + +#include "collation.h" //..comment out NDEBUG to enable assertions, uncomment NDEBUG to disable //#define NDEBUG -#include "funcexp/utils_utf8.h" - using namespace std; using namespace boost; using namespace dataconvert; - // inlines of RowAggregation that used only in this file namespace { @@ -387,36 +386,21 @@ inline void RowAggregation::updateFloatMinMax(float val1, float val2, int64_t co fRow.setFloatField(val1, col); } - - -#define STRCOLL_ENH__ - void RowAggregation::updateStringMinMax(string val1, string val2, int64_t col, int func) { if (isNull(fRowGroupOut, fRow, col)) { fRow.setStringField(val1, col); + return; } + CHARSET_INFO* cs = fRow.getCharset(col); + int tmp = cs->strnncoll(val1.c_str(), val1.length(), val2.c_str(), val2.length()); -#ifdef STRCOLL_ENH__ - else - { - int tmp = funcexp::utf8::idb_strcoll(val1.c_str(), val2.c_str()); - - if ((tmp < 0 && func == rowgroup::ROWAGG_MIN) || - (tmp > 0 && func == rowgroup::ROWAGG_MAX)) - { - fRow.setStringField(val1, col); - } - } - -#else - else if (minMax(val1, val2, func)) + if ((tmp < 0 && func == rowgroup::ROWAGG_MIN) || + (tmp > 0 && func == rowgroup::ROWAGG_MAX)) { fRow.setStringField(val1, col); } - -#endif } //------------------------------------------------------------------------------ @@ -1295,19 +1279,9 @@ void RowAggregation::doMinMax(const Row& rowIn, int64_t colIn, int64_t colOut, i case execplan::CalpontSystemCatalog::VARCHAR: case execplan::CalpontSystemCatalog::TEXT: { - int colWidth = fRowGroupIn.getColumnWidth(colIn); - if (colWidth <= 8) - { - uint64_t valIn = rowIn.getUintField(colIn); - uint64_t valOut = 
fRow.getUintField(colOut); - updateCharMinMax(valIn, valOut, colOut, funcType); - } - else - { - string valIn = rowIn.getStringField(colIn); - string valOut = fRow.getStringField(colOut); - updateStringMinMax(valIn, valOut, colOut, funcType); - } + string valIn = rowIn.getStringField(colIn); + string valOut = fRow.getStringField(colOut); + updateStringMinMax(valIn, valOut, colOut, funcType); break; } diff --git a/utils/rowgroup/rowaggregation.h b/utils/rowgroup/rowaggregation.h index aeb7dbf53..5a51f90af 100644 --- a/utils/rowgroup/rowaggregation.h +++ b/utils/rowgroup/rowaggregation.h @@ -53,6 +53,9 @@ #include "mcsv1_udaf.h" #include "constantcolumn.h" +// Because including my_sys.h in a Columnstore header causes too many conflicts +struct charset_info_st; +typedef const struct charset_info_st CHARSET_INFO; // To do: move code that depends on joblist to a proper subsystem. namespace joblist { @@ -706,7 +709,7 @@ protected: // We need a separate copy for each thread. mcsv1sdk::mcsv1Context fRGContext; - + // These are handy for testing the actual type of static_any for UDAF static const static_any::any& charTypeId; static const static_any::any& scharTypeId; diff --git a/utils/rowgroup/rowgroup.cpp b/utils/rowgroup/rowgroup.cpp index a8bae1086..2bfcca9fc 100644 --- a/utils/rowgroup/rowgroup.cpp +++ b/utils/rowgroup/rowgroup.cpp @@ -26,10 +26,7 @@ // Author: Patrick LeBlanc , (C) 2008 // -#include //#define NDEBUG -#include -#include #include #include using namespace std; @@ -43,10 +40,11 @@ using namespace messageqcpp; #include "calpontsystemcatalog.h" using namespace execplan; -#include "joblisttypes.h" #include "nullvaluemanip.h" #include "rowgroup.h" +#include "collation.h" + namespace rowgroup { @@ -344,7 +342,45 @@ void UserDataStore::deserialize(ByteStream& bs) return; } -//uint32_t rgDataCount = 0; +inline bool StringStore::equals(const std::string& str, uint64_t off, CHARSET_INFO* cs) const +{ + uint32_t length; + + if (off == std::numeric_limits::max()) + 
return str == joblist::CPNULLSTRMARK; + + MemChunk* mc; + + if (off & 0x8000000000000000) + { + if (longStrings.size() <= (off & ~0x8000000000000000)) + return false; + + mc = (MemChunk*) longStrings[off & ~0x8000000000000000].get(); + + memcpy(&length, mc->data, 4); + + // Not sure if this check is needed, but adds safety + if (length > mc->currentSize) + return false; + + return (cs->strnncoll(str.c_str(), str.length(), (const char*)mc->data+4, length) == 0); + } + + uint32_t chunk = off / CHUNK_SIZE; + uint32_t offset = off % CHUNK_SIZE; + + if (mem.size() <= chunk) + return false; + + mc = (MemChunk*) mem[chunk].get(); + memcpy(&length, &mc->data[offset], 4); + + if ((offset + length) > mc->currentSize) + return false; + + return (cs->strnncoll(str.c_str(), str.length(), (const char*)&mc->data[offset]+4, length) == 0); +} RGData::RGData() { @@ -505,9 +541,10 @@ Row::Row() : data(NULL), strings(NULL), userDataStore(NULL) { } Row::Row(const Row& r) : columnCount(r.columnCount), baseRid(r.baseRid), oldOffsets(r.oldOffsets), stOffsets(r.stOffsets), - offsets(r.offsets), colWidths(r.colWidths), types(r.types), data(r.data), - scale(r.scale), precision(r.precision), strings(r.strings), - useStringTable(r.useStringTable), hasLongStringField(r.hasLongStringField), + offsets(r.offsets), colWidths(r.colWidths), types(r.types), + charsetNumbers(r.charsetNumbers), charsets(r.charsets), + data(r.data), scale(r.scale), precision(r.precision), strings(r.strings), + useStringTable(r.useStringTable), hasCollation(r.hasCollation), hasLongStringField(r.hasLongStringField), sTableThreshold(r.sTableThreshold), forceInline(r.forceInline), userDataStore(NULL) { } @@ -522,11 +559,14 @@ Row& Row::operator=(const Row& r) offsets = r.offsets; colWidths = r.colWidths; types = r.types; + charsetNumbers = r.charsetNumbers; + charsets = r.charsets; data = r.data; scale = r.scale; precision = r.precision; strings = r.strings; useStringTable = r.useStringTable; + hasCollation = r.hasCollation;
hasLongStringField = r.hasLongStringField; sTableThreshold = r.sTableThreshold; forceInline = r.forceInline; @@ -989,6 +1029,124 @@ int64_t Row::getSignedNullValue(uint32_t colIndex) const return utils::getSignedNullValue(types[colIndex], getColumnWidth(colIndex)); } +bool Row::equals(const std::string& val, uint32_t col) const +{ + const CHARSET_INFO* cs = getCharset(col); + if (UNLIKELY(getColType(col) == execplan::CalpontSystemCatalog::BLOB)) + { + if (getStringLength(col) != val.length()) + return false; + + if (memcmp(getStringPointer(col), val.c_str(), val.length())) + return false; + } + else + { + return (cs->strnncollsp((char*)getStringPointer(col), getStringLength(col), + val.c_str(), val.length()) == 0); + } + return true; +} + +bool Row::equals(const Row& r2, const std::vector& keyCols) const +{ + for (uint32_t i = 0; i < keyCols.size(); i++) + { + const uint32_t& col = keyCols[i]; + + if (UNLIKELY(getColType(col) == execplan::CalpontSystemCatalog::VARCHAR || + (getColType(col) == execplan::CalpontSystemCatalog::CHAR && (colWidths[col] > 1)) || + getColType(col) == execplan::CalpontSystemCatalog::TEXT)) + { + CHARSET_INFO* cs = getCharset(col); + if (cs->strnncollsp(getStringPointer(col), getStringLength(col), + r2.getStringPointer(col), r2.getStringLength(col))) + { + return false; + } + } + else if (UNLIKELY(getColType(col) == execplan::CalpontSystemCatalog::BLOB)) + { + if (getStringLength(col) != r2.getStringLength(col)) + return false; + + if (memcmp(getStringPointer(col), r2.getStringPointer(col), getStringLength(col))) + return false; + } + else + { + if (getColType(col) == execplan::CalpontSystemCatalog::LONGDOUBLE) + { + if (getLongDoubleField(col) != r2.getLongDoubleField(col)) + return false; + } + else if (getUintField(col) != r2.getUintField(col)) + return false; + } + } + + return true; +} + +bool Row::equals(const Row& r2, uint32_t lastCol) const +{ + // This check fires with empty r2 only. 
+ if (lastCol >= columnCount) + return true; + + // If there are no strings in the row, then we can just memcmp the whole row. + // hasCollation is true if there is any column of type CHAR, VARCHAR or TEXT + // useStringTable is true if any field declared > max inline field size, including BLOB + // For memcmp to be correct, both must be false. + if (!hasCollation && !useStringTable && !r2.hasCollation && !r2.useStringTable) + return !(memcmp(&data[offsets[0]], &r2.data[offsets[0]], offsets[lastCol + 1] - offsets[0])); + + // There are strings involved, so we need to check each column + // because binary equality is not equality for many charsets/collations + for (uint32_t col = 0; col <= lastCol; col++) + { + if (UNLIKELY(getColType(col) == execplan::CalpontSystemCatalog::VARCHAR || + (getColType(col) == execplan::CalpontSystemCatalog::CHAR && (colWidths[col] > 1)) || + getColType(col) == execplan::CalpontSystemCatalog::TEXT)) + { + CHARSET_INFO* cs = getCharset(col); + if (cs->strnncollsp(getStringPointer(col), getStringLength(col), + r2.getStringPointer(col), r2.getStringLength(col))) + { + return false; + } + } + else if (UNLIKELY(getColType(col) == execplan::CalpontSystemCatalog::BLOB)) + { + if (getStringLength(col) != r2.getStringLength(col)) + return false; + + if (memcmp(getStringPointer(col), r2.getStringPointer(col), getStringLength(col))) + return false; + } + else + { + if (getColType(col) == execplan::CalpontSystemCatalog::LONGDOUBLE) + { + if (getLongDoubleField(col) != r2.getLongDoubleField(col)) + return false; + } + else if (getUintField(col) != r2.getUintField(col)) + return false; + } + } + return true; +} + +const CHARSET_INFO* Row::getCharset(uint32_t col) const +{ + if (charsets[col] == NULL) + { + const_cast(charsets)[col] = get_charset(charsetNumbers[col], MYF(MY_WME)); + } + return charsets[col]; +} + RowGroup::RowGroup() : columnCount(0), data(NULL), rgData(NULL), strings(NULL), useStringTable(true), hasLongStringField(false), 
sTableThreshold(20) { @@ -1006,6 +1164,7 @@ RowGroup::RowGroup(uint32_t colCount, const vector& roids, const vector& tkeys, const vector& colTypes, + const vector& csNumbers, const vector& cscale, const vector& cprecision, uint32_t stringTableThreshold, @@ -1013,7 +1172,7 @@ RowGroup::RowGroup(uint32_t colCount, const vector& forceInlineData ) : columnCount(colCount), data(NULL), oldOffsets(positions), oids(roids), keys(tkeys), - types(colTypes), scale(cscale), precision(cprecision), rgData(NULL), strings(NULL), + types(colTypes), charsetNumbers(csNumbers), scale(cscale), precision(cprecision), rgData(NULL), strings(NULL), sTableThreshold(stringTableThreshold) { uint32_t i; @@ -1043,16 +1202,28 @@ RowGroup::RowGroup(uint32_t colCount, } else stOffsets[i + 1] = stOffsets[i] + colWidths[i]; + + execplan::CalpontSystemCatalog::ColDataType type = types[i]; + if ((type == execplan::CalpontSystemCatalog::CHAR && (colWidths[i] > 1)) || + type == execplan::CalpontSystemCatalog::VARCHAR || + type == execplan::CalpontSystemCatalog::TEXT) + { + hasCollation = true; + } } useStringTable = (stringTable && hasLongStringField); offsets = (useStringTable ? &stOffsets[0] : &oldOffsets[0]); + + // Set all the charsets to NULL for jit initialization. 
+ charsets.insert(charsets.begin(), charsetNumbers.size(), NULL); } RowGroup::RowGroup(const RowGroup& r) : columnCount(r.columnCount), data(r.data), oldOffsets(r.oldOffsets), stOffsets(r.stOffsets), colWidths(r.colWidths), - oids(r.oids), keys(r.keys), types(r.types), scale(r.scale), precision(r.precision), + oids(r.oids), keys(r.keys), types(r.types), charsetNumbers(r.charsetNumbers), + charsets(r.charsets), scale(r.scale), precision(r.precision), rgData(r.rgData), strings(r.strings), useStringTable(r.useStringTable), hasLongStringField(r.hasLongStringField), sTableThreshold(r.sTableThreshold), forceInline(r.forceInline) @@ -1076,12 +1247,15 @@ RowGroup& RowGroup::operator=(const RowGroup& r) oids = r.oids; keys = r.keys; types = r.types; + charsetNumbers = r.charsetNumbers; + charsets = r.charsets; data = r.data; scale = r.scale; precision = r.precision; rgData = r.rgData; strings = r.strings; useStringTable = r.useStringTable; + hasCollation = r.hasCollation; hasLongStringField = r.hasLongStringField; sTableThreshold = r.sTableThreshold; forceInline = r.forceInline; @@ -1120,6 +1294,7 @@ void RowGroup::serialize(ByteStream& bs) const serializeInlineVector(bs, oids); serializeInlineVector(bs, keys); serializeInlineVector(bs, types); + serializeInlineVector(bs, charsetNumbers); serializeInlineVector(bs, scale); serializeInlineVector(bs, precision); bs << (uint8_t) useStringTable; @@ -1139,6 +1314,7 @@ void RowGroup::deserialize(ByteStream& bs) deserializeInlineVector(bs, oids); deserializeInlineVector(bs, keys); deserializeInlineVector(bs, types); + deserializeInlineVector(bs, charsetNumbers); deserializeInlineVector(bs, scale); deserializeInlineVector(bs, precision); bs >> tmp8; @@ -1156,6 +1332,10 @@ void RowGroup::deserialize(ByteStream& bs) offsets = &stOffsets[0]; else if (!useStringTable && !oldOffsets.empty()) offsets = &oldOffsets[0]; + + // Set all the charsets to NULL for jit initialization. 
+ charsets.insert(charsets.begin(), charsetNumbers.size(), NULL); + } void RowGroup::serializeRGData(ByteStream& bs) const @@ -1467,6 +1647,15 @@ void RowGroup::addToSysDataList(execplan::CalpontSystemCatalog::NJLSysDataList& } } +const CHARSET_INFO* RowGroup::getCharset(uint32_t col) +{ + if (charsets[col] == NULL) + { + charsets[col] = get_charset(charsetNumbers[col], MYF(MY_WME)); + } + return charsets[col]; +} + void RowGroup::setDBRoot(uint32_t dbroot) { *((uint32_t*) &data[dbRootOffset]) = dbroot; diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h index 504dda86e..e14604166 100644 --- a/utils/rowgroup/rowgroup.h +++ b/utils/rowgroup/rowgroup.h @@ -58,6 +58,11 @@ #include "../winport/winport.h" +// Because including my_sys.h in a Columnstore header causes too many conflicts +struct charset_info_st; +typedef const struct charset_info_st CHARSET_INFO; + + // Workaround for my_global.h #define of isnan(X) causing a std::std namespace namespace rowgroup @@ -103,7 +108,7 @@ public: inline bool isEmpty() const; inline uint64_t getSize() const; inline bool isNullValue(uint64_t offset) const; - inline bool equals(const std::string& str, uint64_t offset) const; + bool equals(const std::string& str, uint64_t offset, CHARSET_INFO* cs) const; void clear(); @@ -319,7 +324,8 @@ public: inline execplan::CalpontSystemCatalog::ColDataType getColType(uint32_t colIndex) const; inline execplan::CalpontSystemCatalog::ColDataType* getColTypes(); inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const; - + inline uint32_t getCharsetNumber(uint32_t colIndex) const; + // this returns true if the type is not CHAR or VARCHAR inline bool isCharType(uint32_t colIndex) const; inline bool isUnsigned(uint32_t colIndex) const; @@ -332,7 +338,7 @@ public: inline int64_t getIntField(uint32_t colIndex) const; template inline bool equals(uint64_t val, uint32_t colIndex) const; inline bool equals(long double val, uint32_t colIndex) const; - inline bool 
equals(const std::string& val, uint32_t colIndex) const; + bool equals(const std::string& val, uint32_t colIndex) const; inline double getDoubleField(uint32_t colIndex) const; inline float getFloatField(uint32_t colIndex) const; @@ -381,7 +387,7 @@ public: inline void setStringField(const uint8_t*, uint32_t len, uint32_t colIndex); // support VARBINARY - // Add 2-byte length at the beginning of the field. NULL and zero length field are + // Add 2-byte length at the beginning of the field. NULL and zero length field are // treated the same, could use one of the length bit to distinguish these two cases. inline std::string getVarBinaryStringField(uint32_t colIndex) const; inline void setVarBinaryField(const std::string& val, uint32_t colIndex); @@ -443,14 +449,17 @@ public: inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol] inline uint64_t hash() const; // generates a hash for all cols - inline bool equals(const Row&, const std::vector& keyColumns) const; - inline bool equals(const Row&, uint32_t lastCol) const; + bool equals(const Row&, const std::vector& keyColumns) const; + bool equals(const Row&, uint32_t lastCol) const; inline bool equals(const Row&) const; inline void setUserDataStore(UserDataStore* u) { userDataStore = u; } + + const CHARSET_INFO* getCharset(uint32_t col) const; + private: uint32_t columnCount; uint64_t baseRid; @@ -461,12 +470,15 @@ private: uint32_t* offsets; uint32_t* colWidths; execplan::CalpontSystemCatalog::ColDataType* types; + uint32_t* charsetNumbers; + CHARSET_INFO** charsets; uint8_t* data; uint32_t* scale; uint32_t* precision; StringStore* strings; bool useStringTable; + bool hasCollation; bool hasLongStringField; uint32_t sTableThreshold; boost::shared_array forceInline; @@ -569,6 +581,11 @@ inline const execplan::CalpontSystemCatalog::ColDataType* Row::getColTypes() con return types; } +inline uint32_t Row::getCharsetNumber(uint32_t col) const +{ + return charsetNumbers[col]; +} +
inline bool Row::isCharType(uint32_t colIndex) const { return execplan::isCharType(types[colIndex]); @@ -622,18 +639,6 @@ inline bool Row::equals(long double val, uint32_t colIndex) const { return *((long double*) &data[offsets[colIndex]]) == val; } - -inline bool Row::equals(const std::string& val, uint32_t colIndex) const -{ - if (inStringTable(colIndex)) - { - uint64_t offset = *((uint64_t*) &data[offsets[colIndex]]); - return strings->equals(val, offset); - } - else - return (strncmp(val.c_str(), (char*) &data[offsets[colIndex]], getColumnWidth(colIndex)) == 0); -} - template inline uint64_t Row::getUintField(uint32_t colIndex) const { @@ -1170,69 +1175,6 @@ inline uint64_t Row::hash(uint32_t lastCol) const return ret; } -inline bool Row::equals(const Row& r2, const std::vector& keyCols) const -{ - for (uint32_t i = 0; i < keyCols.size(); i++) - { - const uint32_t& col = keyCols[i]; - - if (!isLongString(col)) - { - if (getColType(i) == execplan::CalpontSystemCatalog::LONGDOUBLE) - { - if (getLongDoubleField(i) != r2.getLongDoubleField(i)) - return false; - } - else if (getUintField(col) != r2.getUintField(col)) - return false; - } - else - { - if (getStringLength(col) != r2.getStringLength(col)) - return false; - - if (memcmp(getStringPointer(col), r2.getStringPointer(col), getStringLength(col))) - return false; - } - } - - return true; -} - -inline bool Row::equals(const Row& r2, uint32_t lastCol) const -{ - // This check fires with empty r2 only. 
- if (lastCol >= columnCount) - return true; - - if (!useStringTable && !r2.useStringTable) - return !(memcmp(&data[offsets[0]], &r2.data[offsets[0]], offsets[lastCol + 1] - offsets[0])); - - for (uint32_t i = 0; i <= lastCol; i++) - if (!isLongString(i)) - { - if (getColType(i) == execplan::CalpontSystemCatalog::LONGDOUBLE) - { - if (getLongDoubleField(i) != r2.getLongDoubleField(i)) - return false; - } - else if (getUintField(i) != r2.getUintField(i)) - return false; - } - else - { - uint32_t len = getStringLength(i); - - if (len != r2.getStringLength(i)) - return false; - - if (memcmp(getStringPointer(i), r2.getStringPointer(i), len)) - return false; - } - - return true; -} - inline bool Row::equals(const Row& r2) const { return equals(r2, columnCount - 1); @@ -1268,6 +1210,7 @@ public: @param coids An array of oids for each column. @param tkeys An array of unique id for each column. @param colTypes An array of COLTYPEs for each column. + @param charsetNumbers An array of the lookup numbers for the charset/collation object. @param scale An array specifying the scale of DECIMAL types (0 for non-decimal) @param precision An array specifying the precision of DECIMAL types (0 for non-decimal) */ @@ -1277,6 +1220,7 @@ public: const std::vector& cOids, const std::vector& tkeys, const std::vector& colTypes, + const std::vector& charsetNumbers, const std::vector& scale, const std::vector& precision, uint32_t stringTableThreshold, @@ -1284,7 +1228,7 @@ public: const std::vector& forceInlineData = std::vector() ); - /** @brief The copiers. It copies metadata, not the row data */ + /** @brief The copiers. It copies metadata, not the row data */ RowGroup(const RowGroup&); /** @brief Assignment operator.
It copies metadata, not the row data */ @@ -1338,6 +1282,8 @@ public: inline execplan::CalpontSystemCatalog::ColDataType getColType(uint32_t colIndex) const; inline const std::vector& getColTypes() const; inline std::vector& getColTypes(); + inline const std::vector& getCharsetNumbers() const; + inline uint32_t getCharsetNumber(uint32_t colIndex) const; inline boost::shared_array& getForceInline(); static inline uint32_t getHeaderSize() { @@ -1397,6 +1343,8 @@ public: uint16_t* blockNum); inline void setStringStore(boost::shared_ptr); + + const CHARSET_INFO* getCharset(uint32_t col); private: uint32_t columnCount; @@ -1413,8 +1361,11 @@ private: // Used to map the projected column and rowgroup index std::vector keys; std::vector types; - - // DECIMAL support. For non-decimal fields, the values are 0. + // For string collation + std::vector charsetNumbers; + std::vector charsets; + + // DECIMAL support. For non-decimal fields, the values are 0. std::vector scale; std::vector precision; @@ -1422,6 +1373,7 @@ private: RGData* rgData; StringStore* strings; // note, strings and data belong to rgData bool useStringTable; + bool hasCollation; bool hasLongStringField; uint32_t sTableThreshold; boost::shared_array forceInline; @@ -1547,6 +1499,8 @@ void RowGroup::initRow(Row* r, bool forceInlineData) const { r->colWidths = (uint32_t*) &colWidths[0]; r->types = (execplan::CalpontSystemCatalog::ColDataType*) & (types[0]); + r->charsetNumbers = (uint32_t*) & (charsetNumbers[0]); + r->charsets = (CHARSET_INFO**) & (charsets[0]); r->scale = (uint32_t*) & (scale[0]); r->precision = (uint32_t*) & (precision[0]); } @@ -1569,6 +1523,7 @@ void RowGroup::initRow(Row* r, bool forceInlineData) const r->hasLongStringField = hasLongStringField; r->sTableThreshold = sTableThreshold; r->forceInline = forceInline; + r->hasCollation = hasCollation; } inline uint32_t RowGroup::getRowSize() const @@ -1649,6 +1604,16 @@ inline std::vector& RowGroup::getCo return types; } +inline const
std::vector& RowGroup::getCharsetNumbers() const +{ + return charsetNumbers; +} + +inline uint32_t RowGroup::getCharsetNumber(uint32_t colIndex) const +{ + return charsetNumbers[colIndex]; +} + inline const std::vector& RowGroup::getScale() const { return scale; @@ -1903,45 +1868,6 @@ inline bool StringStore::isNullValue(uint64_t off) const return (memcmp(&mc->data[offset+4], joblist::CPNULLSTRMARK.c_str(), 8) == 0); } -inline bool StringStore::equals(const std::string& str, uint64_t off) const -{ - uint32_t length; - - if (off == std::numeric_limits::max()) - return str == joblist::CPNULLSTRMARK; - - MemChunk* mc; - - if (off & 0x8000000000000000) - { - if (longStrings.size() <= (off & ~0x8000000000000000)) - return false; - - mc = (MemChunk*) longStrings[off & ~0x8000000000000000].get(); - - memcpy(&length, mc->data, 4); - - // Not sure if this check it needed, but adds safety - if (length > mc->currentSize) - return false; - - return (strncmp(str.c_str(), (const char*) mc->data + 4, length) == 0); - } - - uint32_t chunk = off / CHUNK_SIZE; - uint32_t offset = off % CHUNK_SIZE; - - if (mem.size() <= chunk) - return false; - - mc = (MemChunk*) mem[chunk].get(); - memcpy(&length, &mc->data[offset], 4); - - if ((offset + length) > mc->currentSize) - return false; - - return (strncmp(str.c_str(), (const char*) &mc->data[offset] + 4, length) == 0); -} inline uint32_t StringStore::getStringLength(uint64_t off) { uint32_t length; diff --git a/utils/udfsdk/mcsv1_udaf.h b/utils/udfsdk/mcsv1_udaf.h index 3057cc1aa..d7096c052 100755 --- a/utils/udfsdk/mcsv1_udaf.h +++ b/utils/udfsdk/mcsv1_udaf.h @@ -369,6 +369,11 @@ public: EXPORT mcsv1Context& operator=(const mcsv1Context& rhs); EXPORT mcsv1Context& copy(const mcsv1Context& rhs); + + // Character collation support + EXPORT void setCharsetNumber(uint32_t csNum); + EXPORT uint32_t getCharsetNumber(); // Returns the unique ID for the language/collation + EXPORT CHARSET_INFO* getCharset(); private: @@ -392,6 +397,7 @@ private: 
int32_t fParamCount; std::vector paramKeys; enum_mariadb_return_type mariadbReturnType; + uint32_t fCharsetNumber; public: // For use by the framework @@ -416,6 +422,7 @@ public: EXPORT void setParamCount(int32_t paramCount); std::vector* getParamKeys(); EXPORT void setMariaDBReturnType(enum_mariadb_return_type rt); + }; // Since aggregate functions can operate on any data type, we use the following structure @@ -438,7 +445,9 @@ struct ColumnDatum uint32_t scale; // If dataType is a DECIMAL type uint32_t precision; // If dataType is a DECIMAL type std::string alias; // Only filled in for init() - ColumnDatum() : dataType(execplan::CalpontSystemCatalog::UNDEFINED), scale(0), precision(-1) {}; + uint32_t charsetNumber; // For string collations + ColumnDatum() : dataType(execplan::CalpontSystemCatalog::UNDEFINED), + scale(0), precision(-1), charsetNumber(8) {}; }; // Override mcsv1_UDAF to build your User Defined Aggregate (UDAF) and/or @@ -658,7 +667,8 @@ inline mcsv1Context::mcsv1Context() : fStartConstant(0), fEndConstant(0), func(NULL), - fParamCount(0) + fParamCount(0), + fCharsetNumber(8) // Latin1 { } @@ -683,6 +693,7 @@ inline mcsv1Context& mcsv1Context::copy(const mcsv1Context& rhs) bInterrupted = rhs.bInterrupted; // Multiple threads will use the same reference func = rhs.func; fParamCount = rhs.fParamCount; + fCharsetNumber = rhs.fCharsetNumber; return *this; } @@ -979,6 +990,16 @@ inline void mcsv1Context::setMariaDBReturnType(enum_mariadb_return_type rt) mariadbReturnType = rt; } +inline void mcsv1Context::setCharsetNumber(uint32_t csNum) +{ + fCharsetNumber=csNum; +} + +inline uint32_t mcsv1Context::getCharsetNumber() +{ + return fCharsetNumber; +} + inline mcsv1_UDAF::ReturnCode mcsv1_UDAF::dropValue(mcsv1Context* context, ColumnDatum* valsDropped) { return NOT_IMPLEMENTED; diff --git a/utils/windowfunction/idborderby.cpp b/utils/windowfunction/idborderby.cpp index 8009492b3..58440de14 100644 --- a/utils/windowfunction/idborderby.cpp +++ 
b/utils/windowfunction/idborderby.cpp @@ -18,13 +18,16 @@ // $Id: idborderby.cpp 3932 2013-06-25 16:08:10Z xlou $ - #include #include #include #include using namespace std; +#include "objectreader.h" +#include "calpontselectexecutionplan.h" +#include "rowgroup.h" + #include using namespace boost; @@ -45,6 +48,10 @@ using namespace rowgroup; #include "joblisttypes.h" +#include "collation.h" + +// See agg_arg_charsets in sql_type.h to see conversion rules for +// items that have different char sets namespace ordering { int TinyIntCompare::operator()(IdbCompare* l, Row::Pointer r1, Row::Pointer r2) @@ -293,16 +300,11 @@ int StringCompare::operator()(IdbCompare* l, Row::Pointer r1, Row::Pointer r2) int len2 = l->row2().getStringLength(fSpec.fIndex); const char* s1 = (const char*)l->row1().getStringPointer(fSpec.fIndex); const char* s2 = (const char*)l->row2().getStringPointer(fSpec.fIndex); - // For Japanese, coll.compare() may not be as correct as strncmp - if (JPcodePoint) - { - ret = fSpec.fAsc * strncmp(s1, s2, max(len1,len2)); - } - else - { - const std::collate& coll = std::use_facet >(loc); - ret = fSpec.fAsc * coll.compare(s1, s1+len1, s2, s2+len2); - } + + if (!cs) + cs = l->rowGroup()->getCharset(fSpec.fIndex); + + ret = fSpec.fAsc * cs->strnncollsp(s1, len1, s2, len2); } return ret; diff --git a/utils/windowfunction/idborderby.h b/utils/windowfunction/idborderby.h index e26ba1aa3..dc7ccef28 100644 --- a/utils/windowfunction/idborderby.h +++ b/utils/windowfunction/idborderby.h @@ -80,7 +80,6 @@ struct IdbSortSpec // TODO There are three ordering specs since 10.2 int fAsc; // ::= ASC | DESC int fNf; // ::= NULLS FIRST | NULLS LAST - std::string fLocale; IdbSortSpec() : fIndex(-1), fAsc(1), fNf(1) {} IdbSortSpec(int i, bool b) : fIndex(i), fAsc(b ? 1 : -1), fNf(fAsc) {} @@ -93,39 +92,7 @@ struct IdbSortSpec class Compare { public: - Compare(const IdbSortSpec& spec) : fSpec(spec) - { - // Save off the current Locale in case something goes wrong. 
- std::string curLocale = setlocale(LC_COLLATE, NULL); - if (spec.fLocale.length() > 0) - { - fLocale = spec.fLocale; - } - else - { - fLocale = curLocale; - } - - try - { - std::locale localloc(fLocale.c_str()); - loc = localloc; - } - catch(...) - { - fLocale = curLocale; - std::locale localloc(fLocale.c_str()); - loc = localloc; - } - if (fLocale.find("ja_JP") != std::string::npos) - { - JPcodePoint = true; - } - else - { - JPcodePoint = false; - } - } + Compare(const IdbSortSpec& spec) : fSpec(spec) {} virtual ~Compare() {} virtual int operator()(IdbCompare*, rowgroup::Row::Pointer, rowgroup::Row::Pointer) = 0; @@ -137,9 +104,6 @@ public: protected: IdbSortSpec fSpec; - std::string fLocale; - std::locale loc; - bool JPcodePoint; // code point ordering (Japanese UTF) flag }; // Comparators for signed types @@ -283,9 +247,11 @@ public: class StringCompare : public Compare { public: - StringCompare(const IdbSortSpec& spec) : Compare(spec) {} + StringCompare(const IdbSortSpec& spec) : Compare(spec), cs(NULL) {} int operator()(IdbCompare*, rowgroup::Row::Pointer, rowgroup::Row::Pointer); + + CHARSET_INFO* cs; }; // End of comparators for variable sized types @@ -324,6 +290,10 @@ public: return fRow2; } + rowgroup::RowGroup* rowGroup() + { + return &fRowGroup; + } protected: rowgroup::RowGroup fRowGroup; rowgroup::Row fRow1; diff --git a/versioning/BRM/masternode.cpp b/versioning/BRM/masternode.cpp index a43c41479..a358a04ae 100644 --- a/versioning/BRM/masternode.cpp +++ b/versioning/BRM/masternode.cpp @@ -102,9 +102,10 @@ void reload(int num) int main(int argc, char** argv) { - // get and set locale language - BUG 5362 - string systemLang = "C"; - systemLang = funcexp::utf8::idb_setlocale(); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); + BRM::logInit ( BRM::SubSystemLogId_controllerNode ); diff --git a/versioning/BRM/slavenode.cpp b/versioning/BRM/slavenode.cpp index 7c0bf2638..d862a1b44 100644 --- a/versioning/BRM/slavenode.cpp 
+++ b/versioning/BRM/slavenode.cpp @@ -78,10 +78,9 @@ void reset(int sig) int main(int argc, char** argv) { - - // get and set locale language - BUG 5362 - string systemLang = "C"; - systemLang = funcexp::utf8::idb_setlocale(); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); BRM::logInit ( BRM::SubSystemLogId_workerNode ); diff --git a/writeengine/bulk/cpimport.cpp b/writeengine/bulk/cpimport.cpp index 4f596ba7a..4a878210a 100644 --- a/writeengine/bulk/cpimport.cpp +++ b/writeengine/bulk/cpimport.cpp @@ -1091,9 +1091,18 @@ int main(int argc, char** argv) #endif setupSignalHandlers(); - // Set up LOCALE - BUG 5362 - std::string systemLang("C"); - systemLang = funcexp::utf8::idb_setlocale(); + // Set locale language + const char* pLoc = setlocale(LC_ALL, ""); + if (pLoc) + { + // Log one line + cout << "Locale = " << pLoc; + } + else + { + cout << "Failed to set locale "; + } + setlocale(LC_NUMERIC, "C"); // Initialize singleton instance of syslogging if (argc > 0) @@ -1377,7 +1386,7 @@ int main(int argc, char** argv) //-------------------------------------------------------------------------- task = TASK_LOAD_JOBFILE; rc = curJob.loadJobInfo( sFileName.string(), bUseTempJobFile, - systemLang, argc, argv, bLogInfo2ToConsole, bValidateColumnList ); + argc, argv, bLogInfo2ToConsole, bValidateColumnList ); if ( rc != NO_ERROR ) { diff --git a/writeengine/bulk/we_bulkload.cpp b/writeengine/bulk/we_bulkload.cpp index 9642e19fc..9c97fc5c0 100644 --- a/writeengine/bulk/we_bulkload.cpp +++ b/writeengine/bulk/we_bulkload.cpp @@ -231,7 +231,6 @@ int BulkLoad::setAlternateImportDir(const std::string& loadDir, // PARAMETERS: // fullName - full filename for job description file // bUseTempJobFile - are we using a temporary job XML file -// systemLang-SystemLang setting used to set locale. 
// argc - command line arg count // argv - command line arguments // bLogInfo2ToConsole - Log info2 msgs to the console @@ -244,7 +243,6 @@ int BulkLoad::setAlternateImportDir(const std::string& loadDir, int BulkLoad::loadJobInfo( const string& fullName, bool bUseTempJobFile, - const string& systemLang, int argc, char** argv, bool bLogInfo2ToConsole, @@ -285,13 +283,8 @@ int BulkLoad::loadJobInfo( else fLog.setLogFileName(logFile.c_str(), errlogFile.c_str(), (int)bLogInfo2ToConsole); - std::ostringstream ossLocale; - ossLocale << "Locale is : " << systemLang; - if (!(disableConsoleOutput())) { - fLog.logMsg( ossLocale.str(), MSGLVL_INFO2 ); - if (!BulkLoad::disableConsoleOutput()) cout << "Log file for this job: " << logFile << std::endl; diff --git a/writeengine/bulk/we_bulkload.h b/writeengine/bulk/we_bulkload.h index 7ea707177..682567b1b 100644 --- a/writeengine/bulk/we_bulkload.h +++ b/writeengine/bulk/we_bulkload.h @@ -78,7 +78,6 @@ public: */ EXPORT int loadJobInfo( const std::string& fullFileName, bool bUseTempJobFile, - const std::string& systemLang, int argc, char** argv, bool bLogInfo2ToConsole, diff --git a/writeengine/bulk/we_bulkloadbuffer.cpp b/writeengine/bulk/we_bulkloadbuffer.cpp index 60a0a457e..069aa0c64 100644 --- a/writeengine/bulk/we_bulkloadbuffer.cpp +++ b/writeengine/bulk/we_bulkloadbuffer.cpp @@ -561,7 +561,7 @@ void BulkLoadBuffer::convert(char* field, int fieldLength, // on disk (e.g. 5 for a varchar(5) instead of 8). 
if (fieldLength > column.definedWidth) { - uint8_t truncate_point = funcexp::utf8::utf8_truncate_point(field, column.definedWidth); + uint8_t truncate_point = utf8::utf8_truncate_point(field, column.definedWidth); memcpy( charTmpBuf, field, column.definedWidth - truncate_point ); bufStats.satCount++; } diff --git a/writeengine/dictionary/we_dctnry.cpp b/writeengine/dictionary/we_dctnry.cpp index e8b5e03c8..df87a9789 100644 --- a/writeengine/dictionary/we_dctnry.cpp +++ b/writeengine/dictionary/we_dctnry.cpp @@ -862,7 +862,7 @@ int Dctnry::insertDctnry(const char* buf, // @Bug 2565: Truncate any strings longer than schema's column width if (curSig.size > m_colWidth) { - uint8_t truncate_point = funcexp::utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth); + uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth); curSig.size = m_colWidth - truncate_point; ++truncCount; } diff --git a/writeengine/server/we_server.cpp b/writeengine/server/we_server.cpp index 826b2dc73..1dda4bf0a 100644 --- a/writeengine/server/we_server.cpp +++ b/writeengine/server/we_server.cpp @@ -41,7 +41,6 @@ using namespace messageqcpp; using namespace threadpool; #include "we_readthread.h" -using namespace WriteEngine; #include "liboamcpp.h" using namespace oam; @@ -53,6 +52,10 @@ using namespace oam; #include "crashtrace.h" +#include "collation.h" + +using namespace WriteEngine; + namespace { void added_a_pm(int) @@ -101,15 +104,15 @@ int setupResources() int main(int argc, char** argv) { - // get and set locale language - string systemLang = "C"; - systemLang = funcexp::utf8::idb_setlocale(); + // Set locale language + setlocale(LC_ALL, ""); + setlocale(LC_NUMERIC, "C"); + // Initialize the charset library + my_init(); // This is unset due to the way we start it program_invocation_short_name = const_cast("WriteEngineServ"); - printf ("Locale is : %s\n", systemLang.c_str() ); - int gDebug = 0; int c; while ((c = getopt(argc, argv, "d")) != 
EOF) @@ -138,8 +141,7 @@ int main(int argc, char** argv) { } } - //BUG 2991 - setlocale(LC_NUMERIC, "C"); + #ifndef _MSC_VER struct sigaction sa; memset(&sa, 0, sizeof(sa));