1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-4580 extent elimination for dictionary-based text/varchar types

The idea is relatively simple - encode prefixes of collated strings as
integers and use them to compute extents' ranges. Then we can eliminate
extents when filtering on string columns.

The actual patch does have all the code there but misses one important
step: we do not keep the collation index, we keep the charset index. Because of
this, some of the tests in the bugfix suite fail and thus the main
functionality is turned off.

The reason this patch is put into a PR at all is that it contains
changes that make CHAR/VARCHAR columns unsigned. This change is needed for
the vectorization work.
This commit is contained in:
Serguey Zefirov
2022-02-04 11:55:09 +00:00
parent a66a8dfabf
commit 53b9a2a0f9
54 changed files with 698 additions and 227 deletions

View File

@ -21,6 +21,10 @@
/** @writeengine.cpp
* A wrapper class for the write engine to write information to files
*/
// XXX: a definition to switch off computations for token columns.
//#define XXX_WRITEENGINE_TOKENS_RANGES_XXX
#include <cmath>
#include <cstdlib>
#include <unistd.h>
@ -59,6 +63,7 @@ using namespace execplan;
#include "MonitorProcMem.h"
using namespace idbdatafile;
#include "dataconvert.h"
#include "string_prefixes.h"
#ifdef _MSC_VER
#define isnan _isnan
@ -362,6 +367,9 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
case WR_UINT:
case WR_ULONGLONG:
case WR_CHAR:
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
case WR_TOKEN:
#endif
{
isUnsigned = true;
break;
@ -385,6 +393,13 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
maxMin->fromToChars();
}
}
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
if (colType == WR_TOKEN)
{
oldValArrayVoid = nullptr; // no old values for tokens, sadly.
valArrayVoid = (void*)maxMin->stringsPrefixes();
}
#endif
size_t i;
for (i = 0; i < totalOldRow; i++)
{
@ -435,6 +450,9 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
fetchNewOldValues<int64_t, int64_t>(value, oldValue, valArrayVoid, oldValArrayVoid, i, totalNewRow);
break;
}
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
case WR_TOKEN:
#endif
case WR_ULONGLONG:
{
fetchNewOldValues<uint64_t, uint64_t>(uvalue, oldUValue, valArrayVoid, oldValArrayVoid, i,
@ -449,12 +467,11 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
}
case WR_CHAR:
{
fetchNewOldValues<uint64_t, uint64_t>(uvalue, oldUValue, valArrayVoid, oldValArrayVoid, i,
totalNewRow);
fetchNewOldValues<int64_t, int64_t>(value, oldValue, valArrayVoid, oldValArrayVoid, i, totalNewRow);
// for characters (strings, actually), we fetched them in LSB order, on x86, at the very least.
// this means the most significant byte of the string, which is first, is now in the LSB of uvalue/oldValue.
// we must perform a conversion.
uvalue = uint64ToStr(uvalue);
value = uint64ToStr(uvalue);
oldValue = uint64ToStr(oldValue);
break;
}
@ -576,6 +593,7 @@ void WriteEngineWrapper::convertValue(const execplan::CalpontSystemCatalog::ColT
curStr = curStr.substr(0, MAX_COLUMN_BOUNDARY);
memcpy(value, curStr.c_str(), curStr.length());
break;
case WriteEngine::WR_FLOAT:
@ -1179,10 +1197,17 @@ static void log_this(const char *message,
#endif
/** @brief Determine whether we may update a column's ranges (by type) and return nullptr if we can't */
static ExtCPInfo* getCPInfoToUpdateForUpdatableType(const ColStruct& colStruct, ExtCPInfo* currentCPInfo)
static ExtCPInfo* getCPInfoToUpdateForUpdatableType(const ColStruct& colStruct, ExtCPInfo* currentCPInfo,
OpType optype)
{
if (colStruct.tokenFlag)
{
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
if (currentCPInfo && currentCPInfo->hasStringsPrefixes() && optype == INSERT)
{
return currentCPInfo;
}
#endif
return nullptr;
}
switch (colStruct.colType)
@ -1689,10 +1714,16 @@ int WriteEngineWrapper::insertColumnRecs(
for (uint32_t rows = 0; rows < (totalRow - rowsLeft); rows++)
{
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
int64_t strPrefix;
#endif
if (dctStr_iter->length() == 0)
{
Token nullToken;
col_iter->data = nullToken;
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
strPrefix = (int64_t)joblist::UBIGINTNULL; // the string prefixes are signed long ints.
#endif
}
else
{
@ -1702,6 +1733,10 @@ int WriteEngineWrapper::insertColumnRecs(
DctnryTuple dctTuple;
dctTuple.sigValue = (unsigned char*)dctStr_iter->c_str();
dctTuple.sigSize = dctStr_iter->length();
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
strPrefix = encodeStringPrefix_check_null(dctTuple.sigValue, dctTuple.sigSize,
dctnryStructList[i].fCharsetNumber);
#endif
dctTuple.isNull = false;
rc = tokenize(txnid, dctTuple, dctnryStructList[i].fCompressionType);
@ -1717,6 +1752,9 @@ int WriteEngineWrapper::insertColumnRecs(
col_iter->data = dctTuple.token;
}
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
maxMins[i].fSplitMaxMinInfo[0].addStringPrefix(strPrefix);
#endif
dctStr_iter++;
col_iter++;
}
@ -1744,10 +1782,16 @@ int WriteEngineWrapper::insertColumnRecs(
for (uint32_t rows = 0; rows < rowsLeft; rows++)
{
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
int64_t strPrefix;
#endif
if (dctStr_iter->length() == 0)
{
Token nullToken;
col_iter->data = nullToken;
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
strPrefix = joblist::UBIGINTNULL; // string prefixes are signed long ints.
#endif
}
else
{
@ -1757,6 +1801,10 @@ int WriteEngineWrapper::insertColumnRecs(
DctnryTuple dctTuple;
dctTuple.sigValue = (unsigned char*)dctStr_iter->c_str();
dctTuple.sigSize = dctStr_iter->length();
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
strPrefix = encodeStringPrefix_check_null(dctTuple.sigValue, dctTuple.sigSize,
dctnryStructList[i].fCharsetNumber);
#endif
dctTuple.isNull = false;
rc = tokenize(txnid, dctTuple, newDctnryStructList[i].fCompressionType);
@ -1772,6 +1820,9 @@ int WriteEngineWrapper::insertColumnRecs(
col_iter->data = dctTuple.token;
}
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
maxMins[i].fSplitMaxMinInfo[1].addStringPrefix(strPrefix);
#endif
dctStr_iter++;
col_iter++;
}
@ -1938,7 +1989,7 @@ int WriteEngineWrapper::insertColumnRecs(
if (isFirstBatchPm && (totalRow == rowsLeft))
{
// in this particular case we already marked extents as invalid up there.
// in this particular case we already marked extents as invalid above.
}
else
{
@ -1950,7 +2001,7 @@ int WriteEngineWrapper::insertColumnRecs(
if (firstHalfCount)
{
ExtCPInfo* cpInfoP =
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[0]);
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[0], m_opType);
RID thisRid = rowsLeft ? lastRid : lastRidNew;
successFlag = colOp->calculateRowId(thisRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);
@ -1966,7 +2017,7 @@ int WriteEngineWrapper::insertColumnRecs(
if (rowsLeft)
{
ExtCPInfo* cpInfoP =
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[1]);
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[1], m_opType);
if (cpInfoP)
{
RETURN_ON_ERROR(GetLBIDRange(newExtentsStartingLbids[i], colStructList[i], *cpInfoP));
@ -4446,11 +4497,6 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
ColumnOp* colOp = NULL;
ExtCPInfoList infosToUpdate;
if (m_opType != DELETE)
{
m_opType = UPDATE;
}
for (unsigned extent = 0; extent < numExtents; extent++)
{
colStructList = colExtentsStruct[extent];
@ -4524,15 +4570,19 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
}
std::vector<ExtCPInfo*> currentExtentRangesPtrs(colStructList.size(), NULL); // pointers for each extent.
if (m_opType != DELETE)
m_opType = UPDATE;
for (unsigned j = 0; j < colStructList.size(); j++)
{
colOp = m_colOp[op(colStructList[j].fCompressionType)];
ExtCPInfo* cpInfoP = &(currentExtentRanges[j]);
cpInfoP = getCPInfoToUpdateForUpdatableType(colStructList[j], cpInfoP);
cpInfoP = getCPInfoToUpdateForUpdatableType(colStructList[j], cpInfoP, m_opType);
currentExtentRangesPtrs[j] = cpInfoP;
if (colStructList[j].tokenFlag)
continue;
// XXX: highly dubious.
// if (!colStructList[j].tokenFlag)
// continue;
width = colOp->getCorrectRowWidth(colStructList[j].colDataType, colStructList[j].colWidth);
successFlag = colOp->calculateRowId(aRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);
@ -4550,9 +4600,6 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
// timer.start("markExtentsInvalid");
//#endif
if (m_opType != DELETE)
m_opType = UPDATE;
rc = writeColumnRecUpdate(txnid, cscColTypeList, colStructList, colValueList, colOldValueList,
ridLists[extent], tableOid, true, ridLists[extent].size(),
&currentExtentRangesPtrs);
@ -4578,6 +4625,7 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
{
cpInfo.fCPInfo.seqNum = SEQNUM_MARK_INVALID_SET_RANGE;
}
// ZZZZ
rc = BRMWrapper::getInstance()->setExtentsMaxMin(infosToDrop);
setInvalidCPInfosSpecialMarks(infosToUpdate);
rc = BRMWrapper::getInstance()->setExtentsMaxMin(infosToUpdate);
@ -4611,12 +4659,9 @@ int WriteEngineWrapper::updateColumnRecs(const TxnID& txnid, const CSCTypesList&
colOp = m_colOp[op(colExtentsStruct[j].fCompressionType)];
ExtCPInfo* cpInfoP = &(infosToUpdate[j]);
cpInfoP = getCPInfoToUpdateForUpdatableType(colExtentsStruct[j], cpInfoP);
cpInfoP = getCPInfoToUpdateForUpdatableType(colExtentsStruct[j], cpInfoP, m_opType);
pointersToInfos.push_back(cpInfoP);
if (colExtentsStruct[j].tokenFlag)
continue;
width = colOp->getCorrectRowWidth(colExtentsStruct[j].colDataType, colExtentsStruct[j].colWidth);
successFlag = colOp->calculateRowId(aRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);
@ -4964,7 +5009,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
allocateValArray(valArray, totalRow1, colStructList[i].colType, colStructList[i].colWidth);
ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL);
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL, m_opType);
if (m_opType != INSERT && cpInfo != NULL) // we allocate space for old values only when we need them.
{
@ -5109,7 +5154,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
}
ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
newColStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[1] : NULL);
newColStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[1] : NULL, m_opType);
allocateValArray(valArray, totalRow2, newColStructList[i].colType, newColStructList[i].colWidth);
if (m_opType != INSERT && cpInfo != NULL) // we allocate space for old values only when we need them.
@ -5190,7 +5235,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
ColumnOp* colOp = m_colOp[op(colStructList[i].fCompressionType)];
ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL);
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL, m_opType);
// set params
colOp->initColumn(curCol);