You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
MCOL-4580 extent elimination for dictionary-based text/varchar types
The idea is relatively simple - encode prefixes of collated strings as integers and use them to compute extents' ranges. Then we can eliminate extents with strings. The actual patch does have all the code there but miss one important step: we do not keep collation index, we keep charset index. Because of this, some of the tests in the bugfix suite fail and thus main functionality is turned off. The reason of this patch to be put into PR at all is that it contains changes that made CHAR/VARCHAR columns unsigned. This change is needed in vectorization work.
This commit is contained in:
@ -21,6 +21,10 @@
|
||||
/** @writeengine.cpp
|
||||
* A wrapper class for the write engine to write information to files
|
||||
*/
|
||||
|
||||
// XXX: a definition to switch off computations for token columns.
|
||||
//#define XXX_WRITEENGINE_TOKENS_RANGES_XXX
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <unistd.h>
|
||||
@ -59,6 +63,7 @@ using namespace execplan;
|
||||
#include "MonitorProcMem.h"
|
||||
using namespace idbdatafile;
|
||||
#include "dataconvert.h"
|
||||
#include "string_prefixes.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define isnan _isnan
|
||||
@ -362,6 +367,9 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
|
||||
case WR_UINT:
|
||||
case WR_ULONGLONG:
|
||||
case WR_CHAR:
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
case WR_TOKEN:
|
||||
#endif
|
||||
{
|
||||
isUnsigned = true;
|
||||
break;
|
||||
@ -385,6 +393,13 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
|
||||
maxMin->fromToChars();
|
||||
}
|
||||
}
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
if (colType == WR_TOKEN)
|
||||
{
|
||||
oldValArrayVoid = nullptr; // no old values for tokens, sadly.
|
||||
valArrayVoid = (void*)maxMin->stringsPrefixes();
|
||||
}
|
||||
#endif
|
||||
size_t i;
|
||||
for (i = 0; i < totalOldRow; i++)
|
||||
{
|
||||
@ -435,6 +450,9 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
|
||||
fetchNewOldValues<int64_t, int64_t>(value, oldValue, valArrayVoid, oldValArrayVoid, i, totalNewRow);
|
||||
break;
|
||||
}
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
case WR_TOKEN:
|
||||
#endif
|
||||
case WR_ULONGLONG:
|
||||
{
|
||||
fetchNewOldValues<uint64_t, uint64_t>(uvalue, oldUValue, valArrayVoid, oldValArrayVoid, i,
|
||||
@ -449,12 +467,11 @@ void WriteEngineWrapper::updateMaxMinRange(const size_t totalNewRow, const size_
|
||||
}
|
||||
case WR_CHAR:
|
||||
{
|
||||
fetchNewOldValues<uint64_t, uint64_t>(uvalue, oldUValue, valArrayVoid, oldValArrayVoid, i,
|
||||
totalNewRow);
|
||||
fetchNewOldValues<int64_t, int64_t>(value, oldValue, valArrayVoid, oldValArrayVoid, i, totalNewRow);
|
||||
// for characters (strings, actually), we fetched then in LSB order, on x86, at the very least.
|
||||
// this means most significant byte of the string, which is first, is now in LSB of uvalue/oldValue.
|
||||
// we must perform a conversion.
|
||||
uvalue = uint64ToStr(uvalue);
|
||||
value = uint64ToStr(uvalue);
|
||||
oldValue = uint64ToStr(oldValue);
|
||||
break;
|
||||
}
|
||||
@ -576,6 +593,7 @@ void WriteEngineWrapper::convertValue(const execplan::CalpontSystemCatalog::ColT
|
||||
curStr = curStr.substr(0, MAX_COLUMN_BOUNDARY);
|
||||
|
||||
memcpy(value, curStr.c_str(), curStr.length());
|
||||
|
||||
break;
|
||||
|
||||
case WriteEngine::WR_FLOAT:
|
||||
@ -1179,10 +1197,17 @@ static void log_this(const char *message,
|
||||
#endif
|
||||
|
||||
/** @brief Determine whether we may update a column's ranges (by type) and return nullptr if we can't */
|
||||
static ExtCPInfo* getCPInfoToUpdateForUpdatableType(const ColStruct& colStruct, ExtCPInfo* currentCPInfo)
|
||||
static ExtCPInfo* getCPInfoToUpdateForUpdatableType(const ColStruct& colStruct, ExtCPInfo* currentCPInfo,
|
||||
OpType optype)
|
||||
{
|
||||
if (colStruct.tokenFlag)
|
||||
{
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
if (currentCPInfo && currentCPInfo->hasStringsPrefixes() && optype == INSERT)
|
||||
{
|
||||
return currentCPInfo;
|
||||
}
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
switch (colStruct.colType)
|
||||
@ -1689,10 +1714,16 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
|
||||
for (uint32_t rows = 0; rows < (totalRow - rowsLeft); rows++)
|
||||
{
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
int64_t strPrefix;
|
||||
#endif
|
||||
if (dctStr_iter->length() == 0)
|
||||
{
|
||||
Token nullToken;
|
||||
col_iter->data = nullToken;
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
strPrefix = (int64_t)joblist::UBIGINTNULL; // the string prefixes are signed long ints.
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1702,6 +1733,10 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
DctnryTuple dctTuple;
|
||||
dctTuple.sigValue = (unsigned char*)dctStr_iter->c_str();
|
||||
dctTuple.sigSize = dctStr_iter->length();
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
strPrefix = encodeStringPrefix_check_null(dctTuple.sigValue, dctTuple.sigSize,
|
||||
dctnryStructList[i].fCharsetNumber);
|
||||
#endif
|
||||
dctTuple.isNull = false;
|
||||
rc = tokenize(txnid, dctTuple, dctnryStructList[i].fCompressionType);
|
||||
|
||||
@ -1717,6 +1752,9 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
col_iter->data = dctTuple.token;
|
||||
}
|
||||
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
maxMins[i].fSplitMaxMinInfo[0].addStringPrefix(strPrefix);
|
||||
#endif
|
||||
dctStr_iter++;
|
||||
col_iter++;
|
||||
}
|
||||
@ -1744,10 +1782,16 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
|
||||
for (uint32_t rows = 0; rows < rowsLeft; rows++)
|
||||
{
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
int64_t strPrefix;
|
||||
#endif
|
||||
if (dctStr_iter->length() == 0)
|
||||
{
|
||||
Token nullToken;
|
||||
col_iter->data = nullToken;
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
strPrefix = joblist::UBIGINTNULL; // string prefixes are signed long ints.
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1757,6 +1801,10 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
DctnryTuple dctTuple;
|
||||
dctTuple.sigValue = (unsigned char*)dctStr_iter->c_str();
|
||||
dctTuple.sigSize = dctStr_iter->length();
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
strPrefix = encodeStringPrefix_check_null(dctTuple.sigValue, dctTuple.sigSize,
|
||||
dctnryStructList[i].fCharsetNumber);
|
||||
#endif
|
||||
dctTuple.isNull = false;
|
||||
rc = tokenize(txnid, dctTuple, newDctnryStructList[i].fCompressionType);
|
||||
|
||||
@ -1772,6 +1820,9 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
col_iter->data = dctTuple.token;
|
||||
}
|
||||
|
||||
#if defined(XXX_WRITEENGINE_TOKENS_RANGES_XXX)
|
||||
maxMins[i].fSplitMaxMinInfo[1].addStringPrefix(strPrefix);
|
||||
#endif
|
||||
dctStr_iter++;
|
||||
col_iter++;
|
||||
}
|
||||
@ -1938,7 +1989,7 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
|
||||
if (isFirstBatchPm && (totalRow == rowsLeft))
|
||||
{
|
||||
// in this particular case we already marked extents as invalid up there.
|
||||
// in this particular case we already marked extents as invalid above.
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1950,7 +2001,7 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
if (firstHalfCount)
|
||||
{
|
||||
ExtCPInfo* cpInfoP =
|
||||
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[0]);
|
||||
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[0], m_opType);
|
||||
RID thisRid = rowsLeft ? lastRid : lastRidNew;
|
||||
successFlag = colOp->calculateRowId(thisRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);
|
||||
|
||||
@ -1966,7 +2017,7 @@ int WriteEngineWrapper::insertColumnRecs(
|
||||
if (rowsLeft)
|
||||
{
|
||||
ExtCPInfo* cpInfoP =
|
||||
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[1]);
|
||||
getCPInfoToUpdateForUpdatableType(colStructList[i], &maxMins[i].fSplitMaxMinInfo[1], m_opType);
|
||||
if (cpInfoP)
|
||||
{
|
||||
RETURN_ON_ERROR(GetLBIDRange(newExtentsStartingLbids[i], colStructList[i], *cpInfoP));
|
||||
@ -4446,11 +4497,6 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
|
||||
ColumnOp* colOp = NULL;
|
||||
ExtCPInfoList infosToUpdate;
|
||||
|
||||
if (m_opType != DELETE)
|
||||
{
|
||||
m_opType = UPDATE;
|
||||
}
|
||||
|
||||
for (unsigned extent = 0; extent < numExtents; extent++)
|
||||
{
|
||||
colStructList = colExtentsStruct[extent];
|
||||
@ -4524,15 +4570,19 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
|
||||
}
|
||||
std::vector<ExtCPInfo*> currentExtentRangesPtrs(colStructList.size(), NULL); // pointers for each extent.
|
||||
|
||||
if (m_opType != DELETE)
|
||||
m_opType = UPDATE;
|
||||
|
||||
for (unsigned j = 0; j < colStructList.size(); j++)
|
||||
{
|
||||
colOp = m_colOp[op(colStructList[j].fCompressionType)];
|
||||
ExtCPInfo* cpInfoP = &(currentExtentRanges[j]);
|
||||
cpInfoP = getCPInfoToUpdateForUpdatableType(colStructList[j], cpInfoP);
|
||||
cpInfoP = getCPInfoToUpdateForUpdatableType(colStructList[j], cpInfoP, m_opType);
|
||||
currentExtentRangesPtrs[j] = cpInfoP;
|
||||
|
||||
if (colStructList[j].tokenFlag)
|
||||
continue;
|
||||
// XXX: highly dubious.
|
||||
// if (!colStructList[j].tokenFlag)
|
||||
// continue;
|
||||
|
||||
width = colOp->getCorrectRowWidth(colStructList[j].colDataType, colStructList[j].colWidth);
|
||||
successFlag = colOp->calculateRowId(aRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);
|
||||
@ -4550,9 +4600,6 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
|
||||
// timer.start("markExtentsInvalid");
|
||||
//#endif
|
||||
|
||||
if (m_opType != DELETE)
|
||||
m_opType = UPDATE;
|
||||
|
||||
rc = writeColumnRecUpdate(txnid, cscColTypeList, colStructList, colValueList, colOldValueList,
|
||||
ridLists[extent], tableOid, true, ridLists[extent].size(),
|
||||
¤tExtentRangesPtrs);
|
||||
@ -4578,6 +4625,7 @@ int WriteEngineWrapper::updateColumnRec(const TxnID& txnid, const vector<CSCType
|
||||
{
|
||||
cpInfo.fCPInfo.seqNum = SEQNUM_MARK_INVALID_SET_RANGE;
|
||||
}
|
||||
// ZZZZ
|
||||
rc = BRMWrapper::getInstance()->setExtentsMaxMin(infosToDrop);
|
||||
setInvalidCPInfosSpecialMarks(infosToUpdate);
|
||||
rc = BRMWrapper::getInstance()->setExtentsMaxMin(infosToUpdate);
|
||||
@ -4611,12 +4659,9 @@ int WriteEngineWrapper::updateColumnRecs(const TxnID& txnid, const CSCTypesList&
|
||||
colOp = m_colOp[op(colExtentsStruct[j].fCompressionType)];
|
||||
|
||||
ExtCPInfo* cpInfoP = &(infosToUpdate[j]);
|
||||
cpInfoP = getCPInfoToUpdateForUpdatableType(colExtentsStruct[j], cpInfoP);
|
||||
cpInfoP = getCPInfoToUpdateForUpdatableType(colExtentsStruct[j], cpInfoP, m_opType);
|
||||
pointersToInfos.push_back(cpInfoP);
|
||||
|
||||
if (colExtentsStruct[j].tokenFlag)
|
||||
continue;
|
||||
|
||||
width = colOp->getCorrectRowWidth(colExtentsStruct[j].colDataType, colExtentsStruct[j].colWidth);
|
||||
successFlag = colOp->calculateRowId(aRid, BYTE_PER_BLOCK / width, width, curFbo, curBio);
|
||||
|
||||
@ -4964,7 +5009,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
|
||||
allocateValArray(valArray, totalRow1, colStructList[i].colType, colStructList[i].colWidth);
|
||||
|
||||
ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
|
||||
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL);
|
||||
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL, m_opType);
|
||||
|
||||
if (m_opType != INSERT && cpInfo != NULL) // we allocate space for old values only when we need them.
|
||||
{
|
||||
@ -5109,7 +5154,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
|
||||
}
|
||||
|
||||
ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
|
||||
newColStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[1] : NULL);
|
||||
newColStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[1] : NULL, m_opType);
|
||||
allocateValArray(valArray, totalRow2, newColStructList[i].colType, newColStructList[i].colWidth);
|
||||
|
||||
if (m_opType != INSERT && cpInfo != NULL) // we allocate space for old values only when we need them.
|
||||
@ -5190,7 +5235,7 @@ int WriteEngineWrapper::writeColumnRec(const TxnID& txnid, const CSCTypesList& c
|
||||
ColumnOp* colOp = m_colOp[op(colStructList[i].fCompressionType)];
|
||||
|
||||
ExtCPInfo* cpInfo = getCPInfoToUpdateForUpdatableType(
|
||||
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL);
|
||||
colStructList[i], maxMins ? ((*maxMins)[i]).fSplitMaxMinInfoPtrs[0] : NULL, m_opType);
|
||||
|
||||
// set params
|
||||
colOp->initColumn(curCol);
|
||||
|
Reference in New Issue
Block a user