mariadb-columnstore-engine/utils/funcexp/func_trim.cpp

/* Copyright (C) 2014 InfiniDB, Inc.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; version 2 of
   the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   MA 02110-1301, USA. */

/****************************************************************************
 * $Id: func_trim.cpp 3923 2013-06-19 21:43:06Z bwilkinson $
 *
 *
 ****************************************************************************/

#include <string>
using namespace std;

#include "functor_str.h"
#include "functioncolumn.h"
using namespace execplan;

#include "rowgroup.h"
using namespace rowgroup;

#include "joblisttypes.h"
using namespace joblist;

namespace funcexp
{
/**
 * Reports the operation type for this functor.
 *
 * The functor itself does not use the operation type, so the result type of
 * the first argument is handed back as-is. resultType is not read or modified.
 *
 * @param fp          function arguments; fp[0] must be present
 * @param resultType  unused
 * @return the result type of fp[0]
 */
CalpontSystemCatalog::ColType Func_trim::operationType(FunctionParm& fp,
                                                       CalpontSystemCatalog::ColType& resultType)
{
  const auto& firstArg = fp[0];
  return firstArg->data()->resultType();
}

/**
 * Removes every leading and every trailing occurrence of a trim string from a
 * source string.
 *
 * @param row     row the argument expressions are evaluated against
 * @param fp      fp[0] yields the source string; fp[1], when present, yields the
 *                trim string (a single space is used when fp has one element)
 * @param isNull  handed to the argument evaluations, which may set it
 * @param type    supplies the character set used for character counting and for
 *                locating multi-byte character boundaries
 * @return the trimmed string; "" when the source is NULL or empty; the source
 *         unchanged when the trim string is empty or has more characters than
 *         the source
 *
 * All bounds checks are expressed as byte counts rather than as comparisons of
 * computed pointers, so no pointer outside [src.str(), src.str() + length] is
 * ever formed (the trim string may be longer in bytes than what is left of the
 * source even though it is not longer in characters).
 */
std::string Func_trim::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
                                 execplan::CalpontSystemCatalog::ColType& type)
{
  CHARSET_INFO* cs = type.getCharset();
  // The original string
  const auto& src = fp[0]->data()->getStrVal(row, isNull);
  if (src.isNull() || src.length() < 1)
    return "";
  // binLen is the number of bytes still kept from src.
  // Invariant maintained by every trimming step below: binLen == end - pos.
  size_t binLen = src.length();
  const char* pos = src.str();
  const char* end = pos + binLen;
  // strLen = the number of characters in src
  size_t strLen = cs->numchars(pos, end);

  // The trim characters.
  const string& trim = (fp.size() > 1 ? fp[1]->data()->getStrVal(row, isNull).safeString("") : " ");
  // binTLen represents the number of bytes in trim
  size_t binTLen = trim.length();
  const char* posT = trim.c_str();
  // strTLen = the number of characters in trim
  size_t strTLen = cs->numchars(posT, posT + binTLen);
  if (strTLen == 0 || strTLen > strLen)
    return src.safeString("");

  if (binTLen == 1)
  {
    // If the trim string is 1 byte, don't waste cpu for memcmp
    const char trimByte = *posT;

    // Trim leading
    while (pos < end && *pos == trimByte)
    {
      ++pos;
      --binLen;
    }

    // Trim trailing. ptr is the lowest position the trailing trim may reach.
    const char* ptr = pos;
    if (cs->use_mb())  // This is a multi-byte charset
    {
      // Multibyte characters in the string give us alignment problems: a
      // trailing byte of a multibyte character could compare equal to the
      // trim byte. Walk forward over the string and remember the position
      // just past the last multibyte character; the trailing trim must not
      // go below it. When no multibyte character is found this stays at pos.
      const char* afterLastMb = pos;
      while (ptr < end)
      {
        // my_ismbchar returns the number of bytes in the leading char, or
        // zero if it is a one-byte char
        const uint32_t l = my_ismbchar(cs, ptr, end);
        if (l)
        {
          ptr += l;
          afterLastMb = ptr;
        }
        else
        {
          ++ptr;
        }
      }
      ptr = afterLastMb;
    }
    while (ptr < end && end[-1] == trimByte)
    {
      --end;
      --binLen;
    }
  }
  else
  {
    // Trim leading is easy: pos is always on a character boundary.
    // (binLen >= binTLen is the in-bounds form of pos + binTLen <= end.)
    while (binLen >= binTLen && memcmp(pos, posT, binTLen) == 0)
    {
      pos += binTLen;
      binLen -= binTLen;
    }

    // Trim trailing
    if (cs->use_mb())  // This is a multi-byte charset
    {
      // The problem is that the byte pattern at the end could
      // match memcmp, but not be correct since the first byte compared
      // may actually be a second or later byte from a previous char.

      // We start at the beginning of the string and move forward
      // one character at a time until we reach the end. Then we can
      // safely compare and remove one occurrence of the trim string.
      // Then back to the beginning and try again.
      while (binLen >= binTLen)
      {
        const char* p = pos;
        size_t tail = binLen;  // number of bytes in [p, end)
        while (tail > binTLen)
        {
          // my_ismbchar returns the number of bytes in the leading char, or
          // zero if it is a one-byte char
          uint32_t l = my_ismbchar(cs, p, end);
          if (l == 0)
            l = 1;
          if (l > tail)
            break;  // defensive: never step past end
          p += l;
          tail -= l;
        }
        // Only a character boundary exactly binTLen bytes before end counts.
        if (tail == binTLen && memcmp(p, posT, binTLen) == 0)
        {
          end -= binTLen;
          binLen -= binTLen;
        }
        else
        {
          break;  // We've run out of places to look
        }
      }
    }
    else
    {
      // Single-byte charset: every byte is a character boundary.
      // binLen >= binTLen guarantees end - binTLen is not below pos.
      while (binLen >= binTLen && memcmp(end - binTLen, posT, binTLen) == 0)
      {
        end -= binTLen;
        binLen -= binTLen;
      }
    }
  }
  // Turn back to a string
  return std::string(pos, binLen);
}

}  // namespace funcexp