MCOL-4580 extent elimination for dictionary-based text/varchar types

The idea is relatively simple: encode prefixes of collated strings as integers and use them to compute extents' ranges. Then we can perform extent elimination on string columns. The patch contains all the code for this, but it misses one important step: we do not keep the collation index, we keep the charset index. Because of this, some of the tests in the bugfix suite fail, and thus the main functionality is turned off. The reason this patch is put into a PR at all is that it contains changes that make CHAR/VARCHAR columns unsigned. This change is needed for the vectorization work.
2025-10-22 22:32:34 +03:00 · 2022-02-04 11:55:09 +00:00
parent a66a8dfabf
commit 53b9a2a0f9
54 changed files with 698 additions and 227 deletions
--- a/writeengine/wrapper/writeengine.cpp
+++ b/writeengine/wrapper/writeengine.cpp
@@ -21,6 +21,10 @@
 /** @writeengine.cpp
 *   A wrapper class for the write engine to write information to files
 */
+
+// XXX: a definition to switch off computations for token columns.
+//#define	XXX_WRITEENGINE_TOKENS_RANGES_XXX
+
 #include <cmath>
 #include <cstdlib>
 #include <unistd.h>
@@ -59,6 +63,7 @@ using namespace execplan;
 #include "MonitorProcMem.h"
 using namespace idbdatafile;
 #include "dataconvert.h"
+#include "string_prefixes.h"

 #ifdef _MSC_VER
 #define isnan _isnan
@@ -362,6 +367,9 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
    case WR_UINT:
    case WR_ULONGLONG:
    case WR_CHAR:
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+    case WR_TOKEN:
+#endif
    {
      isUnsigned = true;
      break;
@@ -385,6 +393,13 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
      maxMin->fromToChars();
    }
  }
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+  if (colType == WR_TOKEN)
+  {
+    oldValArrayVoid = nullptr;  // no old values for tokens, sadly.
+    valArrayVoid = (void*)maxMin->stringsPrefixes();
+  }
+#endif
  size_t i;
  for (i = 0; i < totalOldRow; i++)
  {
@@ -435,6 +450,9 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
        fetchNewOldValues<int64_t, int64_t>(value, oldValue, valArrayVoid, oldValArrayVoid, i, totalNewRow);
        break;
      }
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+      case WR_TOKEN:
+#endif
      case WR_ULONGLONG:
      {
        fetchNewOldValues<uint64_t, uint64_t>(uvalue, oldUValue, valArrayVoid, oldValArrayVoid, i,
@@ -449,12 +467,11 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
      }
      case WR_CHAR:
      {
-        fetchNewOldValues<uint64_t, uint64_t>(uvalue, oldUValue, valArrayVoid, oldValArrayVoid, i,
-                                              totalNewRow);
+        fetchNewOldValues<int64_t, int64_t>(value, oldValue, valArrayVoid, oldValArrayVoid, i, totalNewRow);
        // for characters (strings, actually), we fetched then in LSB order, on x86, at the very least.
        // this means most significant byte of the string, which is first, is now in LSB of uvalue/oldValue.
        // we must perform a conversion.
-        uvalue = uint64ToStr(uvalue);
+        value = uint64ToStr(uvalue);
        oldValue = uint64ToStr(oldValue);
        break;
      }
@@ -576,6 +593,7 @@ void WriteEngineWrapper::convertValue(const execplan::CalpontSystemCatalog::ColT
        curStr = curStr.substr(0, MAX_COLUMN_BOUNDARY);

      memcpy(value, curStr.c_str(), curStr.length());
+
      break;

    case WriteEngine::WR_FLOAT:
@@ -1179,10 +1197,17 @@ static void log_this(const char *message,
 #endif

 /** @brief Determine whether we may update a column's ranges (by type) and return nullptr if we can't */
-static ExtCPInfo* getCPInfoToUpdateForUpdatableType(const ColStruct& colStruct, ExtCPInfo* currentCPInfo)
+static ExtCPInfo* getCPInfoToUpdateForUpdatableType(const ColStruct& colStruct, ExtCPInfo* currentCPInfo,
+                                                    OpType optype)
 {
  if (colStruct.tokenFlag)
  {
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+    if (currentCPInfo && currentCPInfo->hasStringsPrefixes() && optype == INSERT)
+    {
+      return currentCPInfo;
+    }
+#endif
    return nullptr;
  }
  switch (colStruct.colType)
@@ -1689,10 +1714,16 @@ int WriteEngineWrapper::insertColumnRecs(

      for (uint32_t rows = 0; rows < (totalRow - rowsLeft); rows++)
      {
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+        int64_t strPrefix;
+#endif
        if (dctStr_iter->length() == 0)
        {
          Token nullToken;
          col_iter->data = nullToken;
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+          strPrefix = (int64_t)joblist::UBIGINTNULL;  // the string prefixes are signed long ints.
+#endif
        }
        else
        {
@@ -1702,6 +1733,10 @@ int WriteEngineWrapper::insertColumnRecs(
          DctnryTuple dctTuple;
          dctTuple.sigValue = (unsigned char*)dctStr_iter->c_str();
          dctTuple.sigSize = dctStr_iter->length();
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+          strPrefix = encodeStringPrefix_check_null(dctTuple.sigValue, dctTuple.sigSize,
+                                                    dctnryStructList[i].fCharsetNumber);
+#endif
          dctTuple.isNull = false;
          rc = tokenize(txnid, dctTuple, dctnryStructList[i].fCompressionType);

@@ -1717,6 +1752,9 @@ int WriteEngineWrapper::insertColumnRecs(
          col_iter->data = dctTuple.token;
        }

+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+        maxMins[i].fSplitMaxMinInfo[0].addStringPrefix(strPrefix);
+#endif
        dctStr_iter++;
        col_iter++;
      }
@@ -1744,10 +1782,16 @@ int WriteEngineWrapper::insertColumnRecs(

        for (uint32_t rows = 0; rows < rowsLeft; rows++)
        {
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+          int64_t strPrefix;
+#endif
          if (dctStr_iter->length() == 0)
          {
            Token nullToken;
            col_iter->data = nullToken;
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+            strPrefix = joblist::UBIGINTNULL;  // string prefixes are signed long ints.
+#endif
          }
          else
          {
@@ -1757,6 +1801,10 @@ int WriteEngineWrapper::insertColumnRecs(
            DctnryTuple dctTuple;
            dctTuple.sigValue = (unsigned char*)dctStr_iter->c_str();
            dctTuple.sigSize = dctStr_iter->length();
+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+            strPrefix = encodeStringPrefix_check_null(dctTuple.sigValue, dctTuple.sigSize,
+                                                      dctnryStructList[i].fCharsetNumber);
+#endif
            dctTuple.isNull = false;
            rc = tokenize(txnid, dctTuple, newDctnryStructList[i].fCompressionType);

@@ -1772,6 +1820,9 @@ int WriteEngineWrapper::insertColumnRecs(
            col_iter->data = dctTuple.token;
          }

+#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
+          maxMins[i].fSplitMaxMinInfo[1].addStringPrefix(strPrefix);
+#endif
          dctStr_iter++;
          col_iter++;
        }
@@ -1938,7 +1989,7 @@ int WriteEngineWrapper::insertColumnRecs(

    if (isFirstBatchPm && (totalRow == rowsLeft))
    {
-      // in this particular case we already marked extents as invalid up there.
+      // in this particular case we already marked extents as invalid above.
    }
    else
    {
@@ -1950,7 +2001,7 @@ int WriteEngineWrapper::insertColumnRecs(
        if (firstHalfCount)
        {
          ExtCPInfo* cpInfoP =
-              getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[0]);
+              getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[0], m_opType);
          RID thisRid = rowsLeft ? lastRid : lastRidNew;
          successFlag = colOp->calculateRowId(thisRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);

@@ -1966,7 +2017,7 @@ int WriteEngineWrapper::insertColumnRecs(
        if (rowsLeft)
        {
          ExtCPInfo* cpInfoP =
-              getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[1]);
+              getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[1], m_opType);
          if (cpInfoP)
          {
            RETURN_ON_ERROR(GetLBIDRange(newExtentsStartingLbids[i], colStructList[i], *cpInfoP));
@@ -4446,11 +4497,6 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
  ColumnOp* colOp = NULL;
  ExtCPInfoList infosToUpdate;

-  if (m_opType != DELETE)
-  {
-    m_opType = UPDATE;
-  }
-
  for (unsigned extent = 0; extent < numExtents; extent++)
  {
    colStructList = colExtentsStruct[extent];
@@ -4524,15 +4570,19 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
    }
    std::vector<ExtCPInfo*> currentExtentRangesPtrs(colStructList.size(), NULL);  // pointers for each extent.

+    if (m_opType != DELETE)
+      m_opType = UPDATE;
+
    for (unsigned j = 0; j < colStructList.size(); j++)
    {
      colOp = m_colOp[op(colStructList[j].fCompressionType)];
      ExtCPInfo* cpInfoP = &(currentExtentRanges[j]);
-      cpInfoP = getCPInfoToUpdateForUpdatableType(colStructList[j], cpInfoP);
+      cpInfoP = getCPInfoToUpdateForUpdatableType(colStructList[j], cpInfoP, m_opType);
      currentExtentRangesPtrs[j] = cpInfoP;

-      if (colStructList[j].tokenFlag)
-        continue;
+      // XXX: highly dubious.
+      // if (!colStructList[j].tokenFlag)
+      //    continue;

      width = colOp->getCorrectRowWidth(colStructList[j].colDataType, colStructList[j].colWidth);
      successFlag = colOp->calculateRowId(aRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);
@@ -4550,9 +4600,6 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
    // timer.start("markExtentsInvalid");
    //#endif

-    if (m_opType != DELETE)
-      m_opType = UPDATE;
-
    rc = writeColumnRecUpdate(txnid, cscColTypeList, colStructList, colValueList, colOldValueList,
                              ridLists[extent], tableOid, true, ridLists[extent].size(),
                              &currentExtentRangesPtrs);
@@ -4578,6 +4625,7 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
    {
      cpInfo.fCPInfo.seqNum = SEQNUM_MARK_INVALID_SET_RANGE;
    }
+    // ZZZZ
    rc = BRMWrapper::getInstance()->setExtentsMaxMin(infosToDrop);
    setInvalidCPInfosSpecialMarks(infosToUpdate);
    rc = BRMWrapper::getInstance()->setExtentsMaxMin(infosToUpdate);
@@ -4611,12 +4659,9 @@ int WriteEngineWrapper::updateColumnRecs(const TxnID& txnid, const CSCTypesList&
    colOp = m_colOp[op(colExtentsStruct[j].fCompressionType)];

    ExtCPInfo* cpInfoP = &(infosToUpdate[j]);
-    cpInfoP = getCPInfoToUpdateForUpdatableType(colExtentsStruct[j], cpInfoP);
+    cpInfoP = getCPInfoToUpdateForUpdatableType(colExtentsStruct[j], cpInfoP, m_opType);
    pointersToInfos.push_back(cpInfoP);

-    if (colExtentsStruct[j].tokenFlag)
-      continue;
-
    width = colOp->getCorrectRowWidth(colExtentsStruct[j].colDataType, colExtentsStruct[j].colWidth);
    successFlag = colOp->calculateRowId(aRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);

@@ -4964,7 +5009,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
        allocateValArray(valArray, totalRow1, colStructList[i].colType, colStructList[i].colWidth);

        ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
-            colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL);
+            colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL, m_opType);

        if (m_opType != INSERT && cpInfo != NULL)  // we allocate space for old values only when we need them.
        {
@@ -5109,7 +5154,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
      }

      ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
-          newColStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[1] : NULL);
+          newColStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[1] : NULL, m_opType);
      allocateValArray(valArray, totalRow2, newColStructList[i].colType, newColStructList[i].colWidth);

      if (m_opType != INSERT && cpInfo != NULL)  // we allocate space for old values only when we need them.
@@ -5190,7 +5235,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
      ColumnOp* colOp = m_colOp[op(colStructList[i].fCompressionType)];

      ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
-          colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL);
+          colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL, m_opType);

      // set params
      colOp->initColumn(curCol);