diff --git a/utils/funcexp/func_concat.cpp b/utils/funcexp/func_concat.cpp index 39a72b4fa..de77d6007 100644 --- a/utils/funcexp/func_concat.cpp +++ b/utils/funcexp/func_concat.cpp @@ -58,7 +58,10 @@ string Func_concat::getStrVal(Row& row, string ret; string tmp; stringValue(parm[0], row, isNull, ret); - + + // TODO: do a better job of cutting down the number of re-allocations. + // look at Item_func_concat::realloc_result for ideas and use + // std::string::resize() appropriately. for ( unsigned int id = 1 ; id < parm.size() ; id++) { stringValue(parm[id], row, isNull, tmp); diff --git a/utils/funcexp/func_concat_ws.cpp b/utils/funcexp/func_concat_ws.cpp index 467fb6845..4b93814aa 100644 --- a/utils/funcexp/func_concat_ws.cpp +++ b/utils/funcexp/func_concat_ws.cpp @@ -20,6 +20,9 @@ * * ****************************************************************************/ +#include +#undef set_bits // mariadb.h defines set_bits, which is incompatible with boost +#include #include using namespace std; @@ -47,13 +50,16 @@ CalpontSystemCatalog::ColType Func_concat_ws::operationType(FunctionParm& fp, Ca string Func_concat_ws::getStrVal(Row& row, FunctionParm& parm, bool& isNull, - CalpontSystemCatalog::ColType&) + CalpontSystemCatalog::ColType& type) { string delim; stringValue(parm[0], row, isNull, delim); if (isNull) return ""; + // TODO: I don't think we need wide chars here. + // Concatenation works without them; see the Server implementation. +#if 0 wstring wstr; size_t strwclen = utf8::idb_mbstowcs(0, delim.c_str(), 0) + 1; wchar_t* wcbuf = new wchar_t[strwclen]; @@ -94,10 +100,11 @@ string Func_concat_ws::getStrVal(Row& row, delete [] outbuf; delete [] wcbuf; return ret; - -#if 0 +#endif string str; string tmp; + // Work on reallocation. Use std::string::resize() to + // grab larger chunks in some intelligent manner. 
for ( uint32_t i = 1 ; i < parm.size() ; i++) { stringValue(parm[i], row, isNull, tmp); @@ -119,7 +126,6 @@ string Func_concat_ws::getStrVal(Row& row, isNull = false; return str; -#endif } diff --git a/utils/funcexp/func_left.cpp b/utils/funcexp/func_left.cpp index 31588b9e9..99fd6b015 100644 --- a/utils/funcexp/func_left.cpp +++ b/utils/funcexp/func_left.cpp @@ -51,36 +51,34 @@ CalpontSystemCatalog::ColType Func_left::operationType(FunctionParm& fp, Calpont std::string Func_left::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - const string& tstr = fp[0]->data()->getStrVal(row, isNull); - + CHARSET_INFO* cs = type.getCharset(); + // The original string + const string& src = fp[0]->data()->getStrVal(row, isNull); if (isNull) return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; - size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; - wchar_t* wcbuf = new wchar_t[strwclen]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen); - wstring str(wcbuf, strwclen); - - int64_t pos = fp[1]->data()->getIntVal(row, isNull) - 1; - - if (isNull) + size_t trimLength = fp[1]->data()->getUintVal(row, isNull); + if (isNull || trimLength <= 0) return ""; - if (pos == -1) // pos == 0 - return ""; + size_t charPos; - wstring out = str.substr(0, pos + 1); - size_t strmblen = utf8::idb_wcstombs(0, out.c_str(), 0) + 1; - char* outbuf = new char[strmblen]; - strmblen = utf8::idb_wcstombs(outbuf, out.c_str(), strmblen); - std::string ret(outbuf, strmblen); - delete [] outbuf; - delete [] wcbuf; + if ((binLen <= trimLength) || + (binLen <= (charPos= cs->charpos(pos, end, trimLength)))) + { + return src; + } + + std::string ret(pos, charPos); return ret; - -// return str.substr(0, pos+1); } diff --git 
a/utils/funcexp/func_lpad.cpp b/utils/funcexp/func_lpad.cpp index 458ba747a..4d75a3c5a 100644 --- a/utils/funcexp/func_lpad.cpp +++ b/utils/funcexp/func_lpad.cpp @@ -56,191 +56,80 @@ CalpontSystemCatalog::ColType Func_lpad::operationType(FunctionParm& fp, Calpont std::string Func_lpad::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - unsigned i; - // The number of characters (not bytes) in our input str. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in our pad str. - size_t padwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); - // The result length in number of characters - size_t len = 0; - - switch (fp[1]->data()->resultType().colDataType) + // In the case where someone entered pad length as a quoted string, + // it may be interpreted by columnstore to be an actual string + // and stored in fResult.int as a htonl of that string, + // However fResult.double is always correct, so we'll use that. 
+ size_t padLength = (size_t)fp[1]->data()->getDoubleVal(row, isNull); + if (isNull || padLength <= 0) + return ""; + if (padLength > (size_t)INT_MAX32) + padLength = (size_t)INT_MAX32; + + if (padLength < strLen) { - case execplan::CalpontSystemCatalog::BIGINT: - case execplan::CalpontSystemCatalog::INT: - case execplan::CalpontSystemCatalog::MEDINT: - case execplan::CalpontSystemCatalog::TINYINT: - case execplan::CalpontSystemCatalog::SMALLINT: - { - len = fp[1]->data()->getIntVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::UBIGINT: - case execplan::CalpontSystemCatalog::UINT: - case execplan::CalpontSystemCatalog::UMEDINT: - case execplan::CalpontSystemCatalog::UTINYINT: - case execplan::CalpontSystemCatalog::USMALLINT: - { - len = fp[1]->data()->getUintVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::FLOAT: - case execplan::CalpontSystemCatalog::UFLOAT: - case execplan::CalpontSystemCatalog::DOUBLE: - case execplan::CalpontSystemCatalog::UDOUBLE: - case execplan::CalpontSystemCatalog::DECIMAL: - case execplan::CalpontSystemCatalog::UDECIMAL: - { - double value = fp[1]->data()->getDoubleVal(row, isNull); - - if (value > 0) - value += 0.5; - else if (value < 0) - value -= 0.5; - - int64_t ret = (int64_t) value; - - if (value > (double) numeric_limits::max()) - ret = numeric_limits::max(); - else if (value < (double) (numeric_limits::min() + 2)) - ret = numeric_limits::min() + 2; // IDB min for bigint - - len = ret; - } - break; - - case execplan::CalpontSystemCatalog::CHAR: - case execplan::CalpontSystemCatalog::VARCHAR: - { - const string& strval = fp[1]->data()->getStrVal(row, isNull); - len = strtol(strval.c_str(), NULL, 10); - break; - } - - default: - { - std::ostringstream oss; - oss << "lpad parameter 2 must be numeric, not " << execplan::colDataTypeToString(fp[1]->data()->resultType().colDataType); - throw logging::IDBExcept(oss.str(), logging::ERR_DATATYPE_NOT_SUPPORT); - - } + binLen = cs->charpos(pos, end, 
padLength); + std::string ret(pos, binLen); + return ret; } - if (len < 1) - return ""; - - // MCOL-2182 As of MariaDB 10.3 the third parameter - pad characters - is optional // The pad characters. const string* pad = &fPad; if (fp.size() > 2) { pad = &fp[2]->data()->getStrVal(row, isNull); } + // binPLen represents the number of bytes in pad + size_t binPLen = pad->length(); + const char* posP = pad->c_str(); + // plen = the number of characters in pad + size_t plen = cs->numchars(posP, posP+binPLen); + if (plen == 0) // an empty pad cannot fill anything; a pad longer than src is still usable + return src; - if (isNull) - return ""; + size_t byteCount = (padLength+1) * cs->mbmaxlen; // absolute maximum number of bytes + char* buf = new char[byteCount]; + char* pBuf = buf; - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - size_t alen = len; - - if (strwclen > len) - alen = strwclen; - - size_t bufsize = alen + 1; - - // Convert to wide characters. 
Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - size_t strSize = strwclen; // The number of significant characters - const wchar_t* pWChar = wcbuf; - - for (i = 0; *pWChar != '\0' && i < strwclen; ++pWChar, ++i) + padLength -= strLen; + byteCount = 0; + + while (padLength >= plen) { + memcpy(pBuf, posP, binPLen); // copy the whole pad by its byte length, not its character count + padLength -= plen; + byteCount += binPLen; + pBuf += binPLen; } - - strSize = i; - - // If the incoming str is exactly the len of the result str, - // return the original - if (strSize == len) + // Sometimes, in a case with multi-char pad, we need to add a partial pad + if (padLength > 0) { - return tstr; + size_t partialSize = cs->charpos(posP, posP+binPLen, padLength); // end pointer is the byte end of pad + memcpy(pBuf, posP, partialSize); + byteCount += partialSize; + pBuf += partialSize; } - - // If the incoming str is too big for the result str - // truncate the widechar buffer and return as a string - if (strSize > len) - { - // Trim the excess length of the buffer - wstring trimmed = wstring(wcbuf, len); - return utf8::wstring_to_utf8(trimmed.c_str()); - } - - // This is the case where there's room to pad. - - // Convert the pad string to wide - padwclen = pad->length(); // A guess to start. - size_t padbufsize = padwclen + 1; - wchar_t* wcpad = new wchar_t[padbufsize]; - // padwclen+1 is for giving count for the terminating null - size_t padlen = utf8::idb_mbstowcs(wcpad, pad->c_str(), padwclen + 1); - - // How many chars do we need? - size_t padspace = len - strSize; - - // Shift the contents of wcbuf to the right. 
- wchar_t* startofstr = wcbuf + padspace; - - // Move the original string to the right to make room for the pad chars - // Testing has shown that this loop is faster than memmove - wchar_t* newchar = wcbuf + len; // Last spot to put a char in buf - wchar_t* pChar = wcbuf + strSize; // terminal NULL of our str - - while (pChar >= wcbuf) - { - *newchar-- = *pChar--; - } - - // Fill in the front of the buffer with the pad chars - wchar_t* firstpadchar = wcbuf; - - for (wchar_t* pch = wcbuf; pch < startofstr && padlen > 0;) - { - // Truncate the number of fill chars if running out of space - if (padlen > padspace) - { - padlen = padspace; - } - - // Move the fill chars to buffer - for (wchar_t* padchar = wcpad; padchar < wcpad + padlen; ++padchar) - { - *firstpadchar++ = *padchar; - } - - padspace -= padlen; - pch += padlen; - } - - wstring padded = wstring(wcbuf, len); - // Turn back to a string - std::string ret(utf8::wstring_to_utf8(padded.c_str())); - delete [] wcpad; - delete [] wcbuf; + memcpy(pBuf, pos, binLen); + byteCount += binLen; + + std::string ret(buf, byteCount); + delete [] buf; return ret; } diff --git a/utils/funcexp/func_ltrim.cpp b/utils/funcexp/func_ltrim.cpp index 7e340914d..520300bc2 100644 --- a/utils/funcexp/func_ltrim.cpp +++ b/utils/funcexp/func_ltrim.cpp @@ -50,89 +50,56 @@ CalpontSystemCatalog::ColType Func_ltrim::operationType(FunctionParm& fp, Calpon std::string Func_ltrim::getStrVal(rowgroup::Row& row, - FunctionParm& fp, - bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + FunctionParm& fp, + bool& isNull, + execplan::CalpontSystemCatalog::ColType& type) { - // The number of characters (not bytes) in our input tstr. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in ourtrim tstr. 
- size_t trimwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); // The trim characters. const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " "); + // binTLen represents the number of bytes in trim + size_t binTLen = trim.length(); + const char* posT = trim.c_str(); + // strTLen = the number of characters in trim + size_t strTLen = cs->numchars(posT, posT+binTLen); + if (strTLen == 0 || strTLen > strLen) + return src; - if (isNull) - return ""; - - if (tstr.empty() || tstr.length() == 0) - return tstr; - - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than: - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - int bufsize = strwclen + 1; - - // Convert the string to wide characters. Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - // idb_mbstowcs can return -1 if there is bad mbs char in tstr - if (strwclen == static_cast(-1)) - strwclen = 0; - - // Convert the trim string to wide - trimwclen = trim.length(); // A guess to start. 
- int trimbufsize = trimwclen + 1; - wchar_t* wctrim = new wchar_t[trimbufsize]; - size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1); - - // idb_mbstowcs can return -1 if there is bad mbs char in tstr - if (trimlen == static_cast(-1)) - trimlen = 0; - - size_t trimCmpLen = trimlen * sizeof(wchar_t); - - const wchar_t* oPtr = wcbuf; // To remember the start of the string - const wchar_t* aPtr = oPtr; - const wchar_t* aEnd = wcbuf + strwclen - 1; - - if (trimlen > 0) + if (binTLen == 1) { - if (trimlen == 1) + // If the trim string is 1 byte, don't waste cpu for memcmp + while (pos < end && *pos == *posT) { - // If trim is a single char, then don't spend the overhead for memcmp. - wchar_t chr = wctrim[0]; - - while (aPtr <= aEnd && *aPtr == chr) - aPtr++; - } - else - { - aEnd -= (trimlen - 1); // So we don't compare past the end of the string. - - while (aPtr <= aEnd && !memcmp(aPtr, wctrim, trimCmpLen)) - aPtr += trimlen; + ++pos; + --binLen; + } + } + else + { + while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0) + { + pos += binTLen; + binLen -= binTLen; } } - - // Bug 5110 - error in allocating enough memory for utf8 chars - size_t aLen = strwclen - (aPtr - oPtr); - wstring trimmed = wstring(aPtr, aLen); // Turn back to a string - std::string ret(utf8::wstring_to_utf8(trimmed.c_str())); - delete [] wctrim; - delete [] wcbuf; + std::string ret(pos, binLen); return ret; } - } // namespace funcexp // vim:ts=4 sw=4: diff --git a/utils/funcexp/func_repeat.cpp b/utils/funcexp/func_repeat.cpp index 65a6b45e2..2a14bbdc8 100644 --- a/utils/funcexp/func_repeat.cpp +++ b/utils/funcexp/func_repeat.cpp @@ -59,7 +59,7 @@ CalpontSystemCatalog::ColType Func_repeat::operationType(FunctionParm& fp, Calpo std::string Func_repeat::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType& op_ct) + execplan::CalpontSystemCatalog::ColType& type) { string str; diff --git a/utils/funcexp/func_right.cpp 
b/utils/funcexp/func_right.cpp index 81d7d190a..b0dbfd289 100644 --- a/utils/funcexp/func_right.cpp +++ b/utils/funcexp/func_right.cpp @@ -51,42 +51,33 @@ CalpontSystemCatalog::ColType Func_right::operationType(FunctionParm& fp, Calpon std::string Func_right::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - const string& tstr = fp[0]->data()->getStrVal(row, isNull); - + CHARSET_INFO* cs = type.getCharset(); + // The original string + const string& src = fp[0]->data()->getStrVal(row, isNull); if (isNull) return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; - int64_t pos = fp[1]->data()->getIntVal(row, isNull); - - if (isNull) + size_t trimLength = fp[1]->data()->getUintVal(row, isNull); + if (isNull || trimLength <= 0) return ""; - if (pos == -1) // pos == 0 - return ""; + size_t start = cs->numchars(pos, end); // Here, start is number of characters in src + if (start <= trimLength) + return src; + start = cs->charpos(pos, end, start - trimLength); // Here, start becomes number of bytes into src to start copying - size_t strwclen = utf8::idb_mbstowcs(0, tstr.c_str(), 0) + 1; - //wchar_t wcbuf[strwclen]; - wchar_t* wcbuf = new wchar_t[strwclen]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen); - wstring str(wcbuf, strwclen); - - if ( (unsigned) pos >= strwclen ) - pos = strwclen; - - wstring out = str.substr(strwclen - pos, strwclen); - size_t strmblen = utf8::idb_wcstombs(0, out.c_str(), 0) + 1; - //char outbuf[strmblen]; - char* outbuf = new char[strmblen]; - strmblen = utf8::idb_wcstombs(outbuf, out.c_str(), strmblen); - std::string ret(outbuf, strmblen); - delete [] outbuf; - delete [] wcbuf; + std::string ret(pos+start, binLen-start); return ret; } - } // namespace funcexp // 
vim:ts=4 sw=4: diff --git a/utils/funcexp/func_rpad.cpp b/utils/funcexp/func_rpad.cpp index b92030faa..da5155b6a 100644 --- a/utils/funcexp/func_rpad.cpp +++ b/utils/funcexp/func_rpad.cpp @@ -55,179 +55,81 @@ CalpontSystemCatalog::ColType Func_rpad::operationType(FunctionParm& fp, Calpont std::string Func_rpad::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + execplan::CalpontSystemCatalog::ColType& type) { - unsigned i; - // The number of characters (not bytes) in our input str. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in our pad str. - size_t padwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); - // The result length in number of characters - size_t len = 0; - - switch (fp[1]->data()->resultType().colDataType) + // In the case where someone entered pad length as a quoted string, + // it may be interpreted by columnstore to be an actual string + // and stored in fResult.int as a htonl of that string, + // However fResult.double is always correct, so we'll use that. 
+ size_t padLength = (size_t)fp[1]->data()->getDoubleVal(row, isNull); + if (isNull || padLength <= 0) + return ""; + if (padLength > (size_t)INT_MAX32) + padLength = (size_t)INT_MAX32; + + if (padLength < strLen) { - case execplan::CalpontSystemCatalog::BIGINT: - case execplan::CalpontSystemCatalog::INT: - case execplan::CalpontSystemCatalog::MEDINT: - case execplan::CalpontSystemCatalog::TINYINT: - case execplan::CalpontSystemCatalog::SMALLINT: - { - len = fp[1]->data()->getIntVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::UBIGINT: - case execplan::CalpontSystemCatalog::UINT: - case execplan::CalpontSystemCatalog::UMEDINT: - case execplan::CalpontSystemCatalog::UTINYINT: - case execplan::CalpontSystemCatalog::USMALLINT: - { - len = fp[1]->data()->getUintVal(row, isNull); - } - break; - - case execplan::CalpontSystemCatalog::FLOAT: - case execplan::CalpontSystemCatalog::UFLOAT: - case execplan::CalpontSystemCatalog::DOUBLE: - case execplan::CalpontSystemCatalog::UDOUBLE: - case execplan::CalpontSystemCatalog::DECIMAL: - case execplan::CalpontSystemCatalog::UDECIMAL: - { - double value = fp[1]->data()->getDoubleVal(row, isNull); - - if (value > 0) - value += 0.5; - else if (value < 0) - value -= 0.5; - else if (value < 0) - value -= 0.5; - - int64_t ret = (int64_t) value; - - if (value > (double) numeric_limits::max()) - ret = numeric_limits::max(); - else if (value < (double) (numeric_limits::min() + 2)) - ret = numeric_limits::min() + 2; // IDB min for bigint - - len = ret; - } - break; - - case execplan::CalpontSystemCatalog::CHAR: - case execplan::CalpontSystemCatalog::VARCHAR: - { - const string& strval = fp[1]->data()->getStrVal(row, isNull); - len = strtol(strval.c_str(), NULL, 10); - break; - } - - default: - { - std::ostringstream oss; - oss << "lpad parameter 2 must be numeric, not " << execplan::colDataTypeToString(fp[1]->data()->resultType().colDataType); - throw logging::IDBExcept(oss.str(), logging::ERR_DATATYPE_NOT_SUPPORT); 
- } + binLen = cs->charpos(pos, end, padLength); + std::string ret(pos, binLen); + return ret; } - if (len < 1) - return ""; - // The pad characters. - // MCOL-2182 As of MariaDB 10.3 the third parameter - pad characters - is optional const string* pad = &fPad; if (fp.size() > 2) { pad = &fp[2]->data()->getStrVal(row, isNull); } + // binPLen represents the number of bytes in pad + size_t binPLen = pad->length(); + const char* posP = pad->c_str(); + // plen = the number of characters in pad + size_t plen = cs->numchars(posP, posP+binPLen); + if (plen == 0) // an empty pad cannot fill anything; a pad longer than src is still usable + return src; - if (isNull) - return ""; + size_t byteCount = (padLength+1) * cs->mbmaxlen; // absolute maximum number of bytes + char* buf = new char[byteCount]; + char* pBuf = buf; - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than: - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - int alen = len; + byteCount = 0; + + memcpy(pBuf, pos, binLen); + byteCount += binLen; + padLength -= strLen; + pBuf += binLen; - if (strwclen > len) - alen = strwclen; - - int bufsize = alen + 1; - - // Convert to wide characters. 
Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - unsigned int strSize = strwclen; // The number of significant characters - const wchar_t* pWChar = wcbuf; - - for (i = 0; *pWChar != '\0' && i < strwclen; ++pWChar, ++i) + while (padLength >= plen) { + memcpy(pBuf, posP, binPLen); // copy the whole pad by its byte length, not its character count + padLength -= plen; + byteCount += binPLen; + pBuf += binPLen; } - - strSize = i; - - // If the incoming str is exactly the len of the result str, - // return the original - if (strSize == len) + // Sometimes, in a case with multi-char pad, we need to add a partial pad + if (padLength > 0) { - return tstr; + size_t partialSize = cs->charpos(posP, posP+binPLen, padLength); // end pointer is the byte end of pad + memcpy(pBuf, posP, partialSize); + byteCount += partialSize; } - - // If the incoming str is too big for the result str - // truncate the widechar buffer and return as a string - if (strSize > len) - { - // Trim the excess length of the buffer - wstring trimmed = wstring(wcbuf, len); - return utf8::wstring_to_utf8(trimmed.c_str()); - } - - // This is the case where there's room to pad. - - // Convert the pad string to wide - padwclen = pad->length(); // A guess to start. - int padbufsize = padwclen + 1; - wchar_t* wcpad = new wchar_t[padbufsize]; - size_t padlen = utf8::idb_mbstowcs(wcpad, pad->c_str(), padwclen + 1); - - // How many chars do we need? 
- unsigned int padspace = len - strSize; - - // Fill in the back of the buffer - wchar_t* firstpadchar = wcbuf + strSize; - - for (wchar_t* pch = wcbuf; pch < wcbuf + len && padlen > 0;) - { - // Truncate the number of fill chars if running out of space - if (padlen > padspace) - { - padlen = padspace; - } - - // Move the fill chars to buffer - for (wchar_t* padchar = wcpad; padchar < wcpad + padlen; ++padchar) - { - *firstpadchar++ = *padchar; - } - - padspace -= padlen; - pch += padlen; - } - - wstring padded = wstring(wcbuf, len); - - // Bug 5110 : strings were getting truncated since enough bytes not allocated. - std::string ret(utf8::wstring_to_utf8(padded.c_str())); - delete [] wcpad; - delete [] wcbuf; + + std::string ret(buf, byteCount); + delete [] buf; return ret; } diff --git a/utils/funcexp/func_rtrim.cpp b/utils/funcexp/func_rtrim.cpp index 4bfb9ac40..9cfd8b71b 100644 --- a/utils/funcexp/func_rtrim.cpp +++ b/utils/funcexp/func_rtrim.cpp @@ -49,95 +49,118 @@ CalpontSystemCatalog::ColType Func_rtrim::operationType(FunctionParm& fp, Calpon std::string Func_rtrim::getStrVal(rowgroup::Row& row, - FunctionParm& fp, - bool& isNull, - execplan::CalpontSystemCatalog::ColType&) + FunctionParm& fp, + bool& isNull, + execplan::CalpontSystemCatalog::ColType& type) { - // The number of characters (not bytes) in our input tstr. - // Not all of these are necessarily significant. We need to search for the - // NULL terminator to be sure. - size_t strwclen; - // this holds the number of characters (not bytes) in ourtrim tstr. 
- size_t trimwclen; - + CHARSET_INFO* cs = type.getCharset(); // The original string - const string& tstr = fp[0]->data()->getStrVal(row, isNull); + const string& src = fp[0]->data()->getStrVal(row, isNull); + if (isNull) + return ""; + if (src.empty() || src.length() == 0) + return src; + // binLen represents the number of bytes in src + size_t binLen = src.length(); + const char* pos = src.c_str(); + const char* end = pos + binLen; + // strLen = the number of characters in src + size_t strLen = cs->numchars(pos, end); // The trim characters. const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull) : " "); + // binTLen represents the number of bytes in trim + size_t binTLen = trim.length(); + const char* posT = trim.c_str(); + // strTLen = the number of characters in trim + size_t strTLen = cs->numchars(posT, posT+binTLen); + if (strTLen == 0 || strTLen > strLen) + return src; - if (isNull) - return ""; - - if (tstr.empty() || tstr.length() == 0) - return tstr; - - // Rather than calling the wideconvert functions with a null buffer to - // determine the size of buffer to allocate, we can be sure the wide - // char string won't be longer than: - strwclen = tstr.length(); // a guess to start with. This will be >= to the real count. - int bufsize = strwclen + 1; - - // Convert the string to wide characters. Do all further work in wide characters - wchar_t* wcbuf = new wchar_t[bufsize]; - strwclen = utf8::idb_mbstowcs(wcbuf, tstr.c_str(), strwclen + 1); - - // utf8::idb_mbstowcs could return -1 if there is bad chars - if (strwclen == static_cast(-1)) - strwclen = 0; - - // Convert the trim string to wide - trimwclen = trim.length(); // A guess to start. 
- int trimbufsize = trimwclen + 1; - wchar_t* wctrim = new wchar_t[trimbufsize]; - size_t trimlen = utf8::idb_mbstowcs(wctrim, trim.c_str(), trimwclen + 1); - - // idb_mbstowcs could return -1 if there is bad chars - if (trimlen == static_cast(-1)) - trimlen = 0; - - size_t trimCmpLen = trimlen * sizeof(wchar_t); - - const wchar_t* oPtr = wcbuf; // To remember the start of the string - const wchar_t* aPtr = oPtr; - const wchar_t* aEnd = wcbuf + strwclen - 1; - size_t trimCnt = 0; - - if (trimlen > 0) + if (binTLen == 1) { - if (trimlen == 1) + const char* ptr = pos; + if (cs->use_mb()) // This is a multi-byte charset { - // If trim is a single char, then don't spend the overhead for memcmp. - wchar_t chr = wctrim[0]; - - while (aEnd >= aPtr && *aEnd == chr) + const char* p = pos; + uint32 l; + // Multibyte characters in the string give us alignment problems + // What we do here is skip past any multibyte characters. When + // done with this loop, ptr is pointing to a singlebyte char that + // is after all multibyte chars in the string, or to end. + while (ptr < end) { - --aEnd; - ++trimCnt; + if ((l = my_ismbchar(cs, ptr, end))) // returns the number of bytes in the leading char or zero if one byte + { + ptr += l; + p = ptr; + } + else + { + ++ptr; + } + } + ptr = p; + } + while (ptr < end && end[-1] == *posT) + { + --end; + --binLen; + } + } + else + { + // An uncommon case where the space character is > 1 byte + if (cs->use_mb()) // This is a multi-byte charset + { + // The problem is that the byte pattern at the end could + // match memcmp, but not be correct since the first byte compared + // may actually be a second or later byte from a previous char. + + // We start at the beginning of the string and move forward + // one character at a time until we reach the end. Then we can + // safely compare and remove one character. Then back to the beginning + // and try again. 
+ while (end - binTLen >= pos) + { + const char* p = pos; + uint32 l; + while (p + binTLen < end) + { + if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte + p += l; + else + ++p; + } + if (p + binTLen == end && memcmp(p,posT,binTLen) == 0) + { + end -= binTLen; + binLen -= binTLen; + } + else + { + break; // We've run out of places to look + } } } else { - aEnd -= (trimlen - 1); // So we don't compare past the end of the string. - - while (aPtr <= aEnd && !memcmp(aEnd, wctrim, trimCmpLen)) + // This implies we have a single byte charset and a multibyte + // space character. + // Should never get here, since rtrim only trims space characters + // Included for completeness. + while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0) { - aEnd -= trimCmpLen; - trimCnt += trimlen; + end -= binTLen; + binLen -= binTLen; } } } - - size_t aLen = strwclen - trimCnt; - wstring trimmed = wstring(aPtr, aLen); // Turn back to a string - std::string ret(utf8::wstring_to_utf8(trimmed.c_str())); - delete [] wctrim; - delete [] wcbuf; + std::string ret(pos, binLen); return ret; } - } // namespace funcexp // vim:ts=4 sw=4: diff --git a/utils/funcexp/func_strcmp.cpp b/utils/funcexp/func_strcmp.cpp index cec87f0ae..2b0c6a2f1 100644 --- a/utils/funcexp/func_strcmp.cpp +++ b/utils/funcexp/func_strcmp.cpp @@ -73,7 +73,7 @@ CalpontSystemCatalog::ColType Func_strcmp::operationType(FunctionParm& fp, Calpo int64_t Func_strcmp::getIntVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType& op_ct) + execplan::CalpontSystemCatalog::ColType& type) { CHARSET_INFO* cs = fp[0]->data()->resultType().getCharset(); const string& str = fp[0]->data()->getStrVal(row, isNull); @@ -88,9 +88,9 @@ int64_t Func_strcmp::getIntVal(rowgroup::Row& row, std::string Func_strcmp::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, - execplan::CalpontSystemCatalog::ColType& op_ct) + 
execplan::CalpontSystemCatalog::ColType& type) { - uint64_t val = getIntVal(row, fp, isNull, op_ct); + uint64_t val = getIntVal(row, fp, isNull, type); if (val > 0) return string("1"); diff --git a/utils/funcexp/func_trim.cpp b/utils/funcexp/func_trim.cpp index dedb891aa..7a3ca9e88 100644 --- a/utils/funcexp/func_trim.cpp +++ b/utils/funcexp/func_trim.cpp @@ -86,31 +86,37 @@ std::string Func_trim::getStrVal(rowgroup::Row& row, --binLen; } // Trim trailing - while (end > pos && *end == *posT) + const char* ptr = pos; + if (cs->use_mb()) // This is a multi-byte charset + { + const char* p = pos; + uint32 l; + // Multibyte characters in the string give us alignment problems + // What we do here is skip past any multibyte characters. When + // done with this loop, ptr is pointing to a singlebyte char that + // is after all multibyte chars in the string, or to end. + while (ptr < end) + { + if ((l = my_ismbchar(cs, ptr, end))) // returns the number of bytes in the leading char or zero if one byte + { + ptr += l; + p = ptr; + } + else + { + ++ptr; + } + } + ptr = p; + } + while (ptr < end && end[-1] == *posT) { --end; --binLen; } } - else if (!cs->use_mb()) - { - // This is a one byte per char charset with multiple char trim. - // Trim leading - while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0) - { - pos += binTLen; - binLen -= binTLen; - } - // Trim trailing - while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0) - { - end -= binTLen; - binLen -= binTLen; - } - } else { - // We're using a multi-byte charset // Trim leading is easy while (pos+binTLen <= end && memcmp(pos,posT,binTLen) == 0) { @@ -119,33 +125,45 @@ std::string Func_trim::getStrVal(rowgroup::Row& row, } // Trim trailing - // The problem is that the byte pattern at the end could - // match memcmp, but not be correct since the first byte compared - // may actually be a second or later byte from a previous char. 
- - // We start at the beginning of the string and move forward - // one character at a time until we reach the end. Then we can - // safely compare. - while (end - binTLen >= pos) + if (cs->use_mb()) // This is a multi-byte charset { - const char* p = pos; - uint32 l; - while (p + binTLen < end) + // The problem is that the byte pattern at the end could + // match memcmp, but not be correct since the first byte compared + // may actually be a second or later byte from a previous char. + + // We start at the beginning of the string and move forward + // one character at a time until we reach the end. Then we can + // safely compare and remove one character. Then back to the beginning + // and try again. + while (end - binTLen >= pos) { - if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte - p += l; + const char* p = pos; + uint32 l; + while (p + binTLen < end) + { + if ((l = my_ismbchar(cs, p, end))) // returns the number of bytes in the leading char or zero if one byte + p += l; + else + ++p; + } + if (p + binTLen == end && memcmp(p,posT,binTLen) == 0) + { + end -= binTLen; + binLen -= binTLen; + } else - ++p; + { + break; // We've run out of places to look + } } - if (p + binTLen == end && memcmp(p,posT,binTLen) == 0) + } + else + { + while (end-binTLen >= pos && memcmp(end-binTLen,posT,binTLen) == 0) { end -= binTLen; binLen -= binTLen; } - else - { - break; // We've run out of places to look - } } } // Turn back to a string