1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-3536 Collation

This commit is contained in:
David Hall
2020-05-28 14:19:17 -05:00
parent 06e50e0926
commit 2e66b1f1e8
11 changed files with 258 additions and 250 deletions

View File

@ -344,7 +344,45 @@ void UserDataStore::deserialize(ByteStream& bs)
return;
}
//uint32_t rgDataCount = 0;
inline bool StringStore::equals(const std::string& str, uint64_t off, CHARSET_INFO* cs) const
{
uint32_t length;
if (off == std::numeric_limits<uint64_t>::max())
return str == joblist::CPNULLSTRMARK;
MemChunk* mc;
if (off & 0x8000000000000000)
{
if (longStrings.size() <= (off & ~0x8000000000000000))
return false;
mc = (MemChunk*) longStrings[off & ~0x8000000000000000].get();
memcpy(&length, mc->data, 4);
// Not sure if this check it needed, but adds safety
if (length > mc->currentSize)
return false;
return (cs->strnncoll(str.c_str(), str.length(), (const char*)mc->data+4, length) == 0);
}
uint32_t chunk = off / CHUNK_SIZE;
uint32_t offset = off % CHUNK_SIZE;
if (mem.size() <= chunk)
return false;
mc = (MemChunk*) mem[chunk].get();
memcpy(&length, &mc->data[offset], 4);
if ((offset + length) > mc->currentSize)
return false;
return (cs->strnncoll(str.c_str(), str.length(), (const char*)&mc->data[offset]+4, length) == 0);
}
RGData::RGData()
{
@ -505,9 +543,10 @@ Row::Row() : data(NULL), strings(NULL), userDataStore(NULL) { }
Row::Row(const Row& r) : columnCount(r.columnCount), baseRid(r.baseRid),
oldOffsets(r.oldOffsets), stOffsets(r.stOffsets),
offsets(r.offsets), colWidths(r.colWidths), types(r.types), charsetNumbers(r.charsetNumbers),
offsets(r.offsets), colWidths(r.colWidths), types(r.types),
charsetNumbers(r.charsetNumbers), charsets(r.charsets),
data(r.data), scale(r.scale), precision(r.precision), strings(r.strings),
useStringTable(r.useStringTable), hasLongStringField(r.hasLongStringField),
useStringTable(r.useStringTable), hasStrings(r.hasStrings), hasLongStringField(r.hasLongStringField),
sTableThreshold(r.sTableThreshold), forceInline(r.forceInline), userDataStore(NULL)
{ }
@ -523,11 +562,13 @@ Row& Row::operator=(const Row& r)
colWidths = r.colWidths;
types = r.types;
charsetNumbers = r.charsetNumbers;
charsets = r.charsets;
data = r.data;
scale = r.scale;
precision = r.precision;
strings = r.strings;
useStringTable = r.useStringTable;
hasStrings = r.hasStrings;
hasLongStringField = r.hasLongStringField;
sTableThreshold = r.sTableThreshold;
forceInline = r.forceInline;
@ -990,6 +1031,128 @@ int64_t Row::getSignedNullValue(uint32_t colIndex) const
return utils::getSignedNullValue(types[colIndex], getColumnWidth(colIndex));
}
// Tests whether column `col` of this row is equal to `val`.
//   * BLOB column: byte-exact comparison (equal length and memcmp == 0).
//   * Column held in the string table: the 8-byte value stored in the row is
//     used as the offset and the comparison is delegated to strings->equals().
//   * Any other column: strnncollsp() of `val` against the column's bytes in
//     the row, over the full column width, using the column's charset.
bool Row::equals(const std::string& val, uint32_t col) const
{
    const CHARSET_INFO* cs = getCharset(col);

    if (UNLIKELY(getColType(col) == execplan::CalpontSystemCatalog::BLOB))
    {
        // Binary data: no collation involved.
        const bool sameLength = !(getStringLength(col) != val.length());
        return sameLength && memcmp(getStringPointer(col), val.c_str(), val.length()) == 0;
    }

    if (inStringTable(col))
    {
        const uint64_t storeOffset = *((uint64_t*) &data[offsets[col]]);
        return strings->equals(val, storeOffset, cs);
    }

    const char* inlineField = (char*) &data[offsets[col]];
    const int cmp = cs->strnncollsp(val.c_str(), val.length(), inlineField, getColumnWidth(col));
    return cmp == 0;
}
// Returns true iff this row and `r2` agree on every column listed in `keyCols`.
// The column type is taken from this row. Per type:
//   * VARCHAR / CHAR / TEXT: compared with the column charset's strnncollsp().
//   * BLOB: byte-exact (same length and memcmp == 0).
//   * LONGDOUBLE: compared as long double.
//   * everything else: compared through getUintField().
bool Row::equals(const Row& r2, const std::vector<uint32_t>& keyCols) const
{
    for (std::vector<uint32_t>::const_iterator it = keyCols.begin(); it != keyCols.end(); ++it)
    {
        const uint32_t col = *it;

        switch (getColType(col))
        {
            case execplan::CalpontSystemCatalog::VARCHAR:
            case execplan::CalpontSystemCatalog::CHAR:
            case execplan::CalpontSystemCatalog::TEXT:
            {
                CHARSET_INFO* cs = getCharset(col);

                if (cs->strnncollsp(getStringPointer(col), getStringLength(col),
                                    r2.getStringPointer(col), r2.getStringLength(col)) != 0)
                    return false;

                break;
            }

            case execplan::CalpontSystemCatalog::BLOB:
            {
                if (getStringLength(col) != r2.getStringLength(col))
                    return false;

                if (memcmp(getStringPointer(col), r2.getStringPointer(col), getStringLength(col)) != 0)
                    return false;

                break;
            }

            case execplan::CalpontSystemCatalog::LONGDOUBLE:
            {
                if (getLongDoubleField(col) != r2.getLongDoubleField(col))
                    return false;

                break;
            }

            default:
            {
                if (getUintField(col) != r2.getUintField(col))
                    return false;

                break;
            }
        }
    }

    return true;
}
// Returns true iff this row and `r2` agree on columns [0, lastCol].
//
// Fast path: when neither row has string columns (hasStrings) nor uses the
// string table (useStringTable), the columns are compared with one memcmp.
// Otherwise each column is compared according to its type, because binary
// equality is not equality for many charsets/collations.
bool Row::equals(const Row& r2, uint32_t lastCol) const
{
    // This check fires with empty r2 only.
    if (lastCol >= columnCount)
        return true;

    // hasStrings is true if there is any column of type CHAR, VARCHAR or TEXT.
    // useStringTable is true if any field is declared > max inline field size,
    // including BLOB. All four flags must be false for memcmp to be correct.
    const bool stringsInvolved = hasStrings || useStringTable || r2.hasStrings || r2.useStringTable;

    if (!stringsInvolved)
        return memcmp(&data[offsets[0]], &r2.data[offsets[0]], offsets[lastCol + 1] - offsets[0]) == 0;

    for (uint32_t col = 0; col <= lastCol; ++col)
    {
        switch (getColType(col))
        {
            case execplan::CalpontSystemCatalog::VARCHAR:
            case execplan::CalpontSystemCatalog::CHAR:
            case execplan::CalpontSystemCatalog::TEXT:
            {
                CHARSET_INFO* cs = getCharset(col);

                if (cs->strnncollsp(getStringPointer(col), getStringLength(col),
                                    r2.getStringPointer(col), r2.getStringLength(col)) != 0)
                    return false;

                break;
            }

            case execplan::CalpontSystemCatalog::BLOB:
            {
                if (getStringLength(col) != r2.getStringLength(col))
                    return false;

                if (memcmp(getStringPointer(col), r2.getStringPointer(col), getStringLength(col)) != 0)
                    return false;

                break;
            }

            case execplan::CalpontSystemCatalog::LONGDOUBLE:
            {
                if (getLongDoubleField(col) != r2.getLongDoubleField(col))
                    return false;

                break;
            }

            default:
            {
                if (getUintField(col) != r2.getUintField(col))
                    return false;

                break;
            }
        }
    }

    return true;
}
// Returns the charset for column `col`. The per-column slot in `charsets` is
// filled on first use by calling get_charset() with the column's charset
// number, and that stored pointer is returned on every later call.
// NOTE(review): the result of get_charset() is stored and returned without a
// NULL check -- confirm callers cannot see a failed lookup.
const CHARSET_INFO* Row::getCharset(uint32_t col) const
{
    CHARSET_INFO*& slot = charsets[col];

    if (slot == NULL)
        slot = get_charset(charsetNumbers[col], MYF(MY_WME));

    return slot;
}
RowGroup::RowGroup() : columnCount(0), data(NULL), rgData(NULL), strings(NULL),
useStringTable(true), hasLongStringField(false), sTableThreshold(20)
{
@ -1045,6 +1208,15 @@ RowGroup::RowGroup(uint32_t colCount,
}
else
stOffsets[i + 1] = stOffsets[i] + colWidths[i];
// Note whether the row group contains any CHAR, VARCHAR or TEXT column.
// There is deliberately no `break` here: this check sits in the same
// per-column block that assigns stOffsets[i + 1], so leaving the loop at the
// first string column would skip the offset setup for every later column.
execplan::CalpontSystemCatalog::ColDataType type = types[i];

if (type == execplan::CalpontSystemCatalog::CHAR ||
        type == execplan::CalpontSystemCatalog::VARCHAR ||
        type == execplan::CalpontSystemCatalog::TEXT)
{
    hasStrings = true;
}
}
useStringTable = (stringTable && hasLongStringField);
@ -1090,6 +1262,7 @@ RowGroup& RowGroup::operator=(const RowGroup& r)
rgData = r.rgData;
strings = r.strings;
useStringTable = r.useStringTable;
hasStrings = r.hasStrings;
hasLongStringField = r.hasLongStringField;
sTableThreshold = r.sTableThreshold;
forceInline = r.forceInline;
@ -1481,7 +1654,7 @@ void RowGroup::addToSysDataList(execplan::CalpontSystemCatalog::NJLSysDataList&
}
}
CHARSET_INFO* RowGroup::getCharset(uint32_t col)
const CHARSET_INFO* RowGroup::getCharset(uint32_t col)
{
if (charsets[col] == NULL)
{

View File

@ -108,7 +108,7 @@ public:
inline bool isEmpty() const;
inline uint64_t getSize() const;
inline bool isNullValue(uint64_t offset) const;
inline bool equals(const std::string& str, uint64_t offset) const;
bool equals(const std::string& str, uint64_t offset, CHARSET_INFO* cs) const;
void clear();
@ -325,7 +325,7 @@ public:
inline execplan::CalpontSystemCatalog::ColDataType* getColTypes();
inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const;
inline uint32_t getCharsetNumber(uint32_t colIndex) const;
// this returns true if the type is not CHAR or VARCHAR
inline bool isCharType(uint32_t colIndex) const;
inline bool isUnsigned(uint32_t colIndex) const;
@ -338,7 +338,7 @@ public:
inline int64_t getIntField(uint32_t colIndex) const;
template<int len> inline bool equals(uint64_t val, uint32_t colIndex) const;
inline bool equals(long double val, uint32_t colIndex) const;
inline bool equals(const std::string& val, uint32_t colIndex) const;
bool equals(const std::string& val, uint32_t colIndex) const;
inline double getDoubleField(uint32_t colIndex) const;
inline float getFloatField(uint32_t colIndex) const;
@ -387,7 +387,7 @@ public:
inline void setStringField(const uint8_t*, uint32_t len, uint32_t colIndex);
// support VARBINARY
// Add 2-byte length at the beginning of the field. NULL and zero length field are
// Add 2-byte length at the beginning of the field. NULL and zero length field are
// treated the same, could use one of the length bit to distinguish these two cases.
inline std::string getVarBinaryStringField(uint32_t colIndex) const;
inline void setVarBinaryField(const std::string& val, uint32_t colIndex);
@ -449,14 +449,17 @@ public:
inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol]
inline uint64_t hash() const; // generates a hash for all cols
inline bool equals(const Row&, const std::vector<uint32_t>& keyColumns) const;
inline bool equals(const Row&, uint32_t lastCol) const;
bool equals(const Row&, const std::vector<uint32_t>& keyColumns) const;
bool equals(const Row&, uint32_t lastCol) const;
inline bool equals(const Row&) const;
inline void setUserDataStore(UserDataStore* u)
{
userDataStore = u;
}
const CHARSET_INFO* getCharset(uint32_t col) const;
private:
uint32_t columnCount;
uint64_t baseRid;
@ -468,12 +471,14 @@ private:
uint32_t* colWidths;
execplan::CalpontSystemCatalog::ColDataType* types;
uint32_t* charsetNumbers;
CHARSET_INFO** charsets;
uint8_t* data;
uint32_t* scale;
uint32_t* precision;
StringStore* strings;
bool useStringTable;
bool hasStrings;
bool hasLongStringField;
uint32_t sTableThreshold;
boost::shared_array<bool> forceInline;
@ -634,18 +639,6 @@ inline bool Row::equals(long double val, uint32_t colIndex) const
{
return *((long double*) &data[offsets[colIndex]]) == val;
}
// Tests whether column `colIndex` equals `val`.
// Columns kept in the string table are compared through strings->equals(),
// using the 8-byte value stored in the row as the offset; all other columns
// are compared in place with strncmp(), limited to the column width.
inline bool Row::equals(const std::string& val, uint32_t colIndex) const
{
    if (!inStringTable(colIndex))
    {
        const char* inlineField = (char*) &data[offsets[colIndex]];
        return strncmp(val.c_str(), inlineField, getColumnWidth(colIndex)) == 0;
    }

    const uint64_t storeOffset = *((uint64_t*) &data[offsets[colIndex]]);
    return strings->equals(val, storeOffset);
}
template<int len>
inline uint64_t Row::getUintField(uint32_t colIndex) const
{
@ -1182,69 +1175,6 @@ inline uint64_t Row::hash(uint32_t lastCol) const
return ret;
}
// Returns true iff this row and `r2` agree on every column listed in `keyCols`.
//   * Long-string columns (isLongString): same length and identical bytes.
//   * LONGDOUBLE columns: compared as long double.
//   * All other columns: compared through getUintField().
//
// Every lookup uses the key column index `col` (keyCols[i]); the loop counter
// `i` is only a position inside keyCols and must not be used as a column index.
inline bool Row::equals(const Row& r2, const std::vector<uint32_t>& keyCols) const
{
    for (uint32_t i = 0; i < keyCols.size(); i++)
    {
        const uint32_t col = keyCols[i];

        if (isLongString(col))
        {
            if (getStringLength(col) != r2.getStringLength(col))
                return false;

            if (memcmp(getStringPointer(col), r2.getStringPointer(col), getStringLength(col)))
                return false;
        }
        else if (getColType(col) == execplan::CalpontSystemCatalog::LONGDOUBLE)
        {
            if (getLongDoubleField(col) != r2.getLongDoubleField(col))
                return false;
        }
        else if (getUintField(col) != r2.getUintField(col))
        {
            return false;
        }
    }

    return true;
}
// Returns true iff this row and `r2` agree on columns [0, lastCol].
// When neither row uses the string table the columns are compared with a
// single memcmp; otherwise each column is compared individually: long-string
// columns by length and bytes, LONGDOUBLE as long double, the rest through
// getUintField().
inline bool Row::equals(const Row& r2, uint32_t lastCol) const
{
    // This check fires with empty r2 only.
    if (lastCol >= columnCount)
        return true;

    if (!(useStringTable || r2.useStringTable))
        return memcmp(&data[offsets[0]], &r2.data[offsets[0]], offsets[lastCol + 1] - offsets[0]) == 0;

    for (uint32_t col = 0; col <= lastCol; ++col)
    {
        if (isLongString(col))
        {
            const uint32_t len = getStringLength(col);

            if (len != r2.getStringLength(col))
                return false;

            if (memcmp(getStringPointer(col), r2.getStringPointer(col), len) != 0)
                return false;

            continue;
        }

        if (getColType(col) == execplan::CalpontSystemCatalog::LONGDOUBLE)
        {
            if (getLongDoubleField(col) != r2.getLongDoubleField(col))
                return false;
        }
        else if (getUintField(col) != r2.getUintField(col))
        {
            return false;
        }
    }

    return true;
}
inline bool Row::equals(const Row& r2) const
{
return equals(r2, columnCount - 1);
@ -1414,7 +1344,7 @@ public:
inline void setStringStore(boost::shared_ptr<StringStore>);
CHARSET_INFO* getCharset(uint32_t col);
const CHARSET_INFO* getCharset(uint32_t col);
private:
uint32_t columnCount;
@ -1443,6 +1373,7 @@ private:
RGData* rgData;
StringStore* strings; // note, strings and data belong to rgData
bool useStringTable;
bool hasStrings;
bool hasLongStringField;
uint32_t sTableThreshold;
boost::shared_array<bool> forceInline;
@ -1569,6 +1500,7 @@ void RowGroup::initRow(Row* r, bool forceInlineData) const
r->colWidths = (uint32_t*) &colWidths[0];
r->types = (execplan::CalpontSystemCatalog::ColDataType*) & (types[0]);
r->charsetNumbers = (uint32_t*) & (charsetNumbers[0]);
r->charsets = (CHARSET_INFO**) & (charsets[0]);
r->scale = (uint32_t*) & (scale[0]);
r->precision = (uint32_t*) & (precision[0]);
}
@ -1591,6 +1523,7 @@ void RowGroup::initRow(Row* r, bool forceInlineData) const
r->hasLongStringField = hasLongStringField;
r->sTableThreshold = sTableThreshold;
r->forceInline = forceInline;
r->hasStrings = hasStrings;
}
inline uint32_t RowGroup::getRowSize() const
@ -1935,45 +1868,6 @@ inline bool StringStore::isNullValue(uint64_t off) const
return (memcmp(&mc->data[offset+4], joblist::CPNULLSTRMARK.c_str(), 8) == 0);
}
inline bool StringStore::equals(const std::string& str, uint64_t off) const
{
uint32_t length;
if (off == std::numeric_limits<uint64_t>::max())
return str == joblist::CPNULLSTRMARK;
MemChunk* mc;
if (off & 0x8000000000000000)
{
if (longStrings.size() <= (off & ~0x8000000000000000))
return false;
mc = (MemChunk*) longStrings[off & ~0x8000000000000000].get();
memcpy(&length, mc->data, 4);
// Not sure if this check it needed, but adds safety
if (length > mc->currentSize)
return false;
return (strncmp(str.c_str(), (const char*) mc->data + 4, length) == 0);
}
uint32_t chunk = off / CHUNK_SIZE;
uint32_t offset = off % CHUNK_SIZE;
if (mem.size() <= chunk)
return false;
mc = (MemChunk*) mem[chunk].get();
memcpy(&length, &mc->data[offset], 4);
if ((offset + length) > mc->currentSize)
return false;
return (strncmp(str.c_str(), (const char*) &mc->data[offset] + 4, length) == 0);
}
inline uint32_t StringStore::getStringLength(uint64_t off)
{
uint32_t length;