You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
MCOL-1396 Allow StringStore to hold more than 2GB
StringStore originally worked by returning a 32-bit pointer to a memory location and storing the length alongside that pointer. This allowed 4GB to be stored in 64KB blocks. With 1.1 we used the high bit to signify a TEXT/BLOB string of > 64KB, reducing the maximum capacity to 2GB, but without any bounds checking. So, if you went over the 2GB mark, the getter would think you were trying to get a long string instead of a short one, come up empty, and then return NULL. This patch uses 64-bit memory pointers, still retaining the high bit to signify long strings. It now also stores the length with the string rather than with the pointer, to allow the full 64 bits for pointers. It also adds a bounds check for small strings.
This commit is contained in:
@ -79,10 +79,10 @@ StringStore::~StringStore()
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
uint64_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
{
|
||||
MemChunk *lastMC = NULL;
|
||||
uint32_t ret = 0;
|
||||
uint64_t ret = 0;
|
||||
|
||||
empty = false; // At least a NULL is being stored.
|
||||
|
||||
@ -92,7 +92,7 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
|
||||
if ((len == 8 || len == 9) &&
|
||||
*((uint64_t *) data) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str()))
|
||||
return numeric_limits<uint32_t>::max();
|
||||
return numeric_limits<uint64_t>::max();
|
||||
|
||||
//@bug6065, make StringStore::storeString() thread safe
|
||||
boost::mutex::scoped_lock lk(fMutex, defer_lock);
|
||||
@ -102,20 +102,21 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
if (mem.size() > 0)
|
||||
lastMC = (MemChunk *) mem.back().get();
|
||||
|
||||
if (len >= CHUNK_SIZE)
|
||||
if ((len+4) >= CHUNK_SIZE)
|
||||
{
|
||||
shared_array<uint8_t> newOne(new uint8_t[len + sizeof(MemChunk)]);
|
||||
shared_array<uint8_t> newOne(new uint8_t[len + sizeof(MemChunk) + 4]);
|
||||
longStrings.push_back(newOne);
|
||||
lastMC = (MemChunk*) longStrings.back().get();
|
||||
lastMC->capacity = lastMC->currentSize = len;
|
||||
memcpy(lastMC->data, data, len);
|
||||
lastMC->capacity = lastMC->currentSize = len + 4;
|
||||
memcpy(lastMC->data, &len, 4);
|
||||
memcpy(lastMC->data + 4, data, len);
|
||||
// High bit to mark a long string
|
||||
ret = 0x80000000;
|
||||
ret = 0x8000000000000000;
|
||||
ret += longStrings.size() - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len))
|
||||
if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < (len + 4)))
|
||||
{
|
||||
// mem usage debugging
|
||||
//if (lastMC)
|
||||
@ -130,7 +131,11 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
|
||||
|
||||
ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize;
|
||||
memcpy(&(lastMC->data[lastMC->currentSize]), data, len);
|
||||
// If this ever happens then we have big problems
|
||||
if (ret & 0x8000000000000000)
|
||||
throw logic_error("StringStore memory exceeded.");
|
||||
memcpy(&(lastMC->data[lastMC->currentSize]), &len, 4);
|
||||
memcpy(&(lastMC->data[lastMC->currentSize]) + 4, data, len);
|
||||
/*
|
||||
cout << "stored: '" << hex;
|
||||
for (uint32_t i = 0; i < len ; i++) {
|
||||
@ -138,7 +143,7 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
}
|
||||
cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl;
|
||||
*/
|
||||
lastMC->currentSize += len;
|
||||
lastMC->currentSize += len + 4;
|
||||
}
|
||||
|
||||
return ret;
|
||||
@ -146,31 +151,31 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
|
||||
void StringStore::serialize(ByteStream &bs) const
|
||||
{
|
||||
uint32_t i;
|
||||
uint64_t i;
|
||||
MemChunk *mc;
|
||||
|
||||
bs << (uint32_t) mem.size();
|
||||
bs << (uint64_t) mem.size();
|
||||
bs << (uint8_t) empty;
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
mc = (MemChunk *) mem[i].get();
|
||||
bs << (uint32_t) mc->currentSize;
|
||||
bs << (uint64_t) mc->currentSize;
|
||||
//cout << "serialized " << mc->currentSize << " bytes\n";
|
||||
bs.append(mc->data, mc->currentSize);
|
||||
}
|
||||
bs << (uint32_t) longStrings.size();
|
||||
bs << (uint64_t) longStrings.size();
|
||||
for (i = 0; i < longStrings.size(); i++)
|
||||
{
|
||||
mc = (MemChunk *) longStrings[i].get();
|
||||
bs << (uint32_t) mc->currentSize;
|
||||
bs << (uint64_t) mc->currentSize;
|
||||
bs.append(mc->data, mc->currentSize);
|
||||
}
|
||||
}
|
||||
|
||||
void StringStore::deserialize(ByteStream &bs)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t count;
|
||||
uint32_t size;
|
||||
uint64_t i;
|
||||
uint64_t count;
|
||||
uint64_t size;
|
||||
uint8_t *buf;
|
||||
MemChunk *mc;
|
||||
uint8_t tmp8;
|
||||
@ -718,10 +723,9 @@ bool Row::isNullValue(uint32_t colIndex) const
|
||||
case CalpontSystemCatalog::STRINT: {
|
||||
uint32_t len = getColumnWidth(colIndex);
|
||||
if (inStringTable(colIndex)) {
|
||||
uint32_t offset, length;
|
||||
offset = *((uint32_t *) &data[offsets[colIndex]]);
|
||||
length = *((uint32_t *) &data[offsets[colIndex] + 4]);
|
||||
return strings->isNullValue(offset, length);
|
||||
uint64_t offset;
|
||||
offset = *((uint64_t *) &data[offsets[colIndex]]);
|
||||
return strings->isNullValue(offset);
|
||||
}
|
||||
if (data[offsets[colIndex]] == 0) // empty string
|
||||
return true;
|
||||
@ -757,10 +761,9 @@ bool Row::isNullValue(uint32_t colIndex) const
|
||||
case CalpontSystemCatalog::VARBINARY: {
|
||||
uint32_t pos = offsets[colIndex];
|
||||
if (inStringTable(colIndex)) {
|
||||
uint32_t offset, length;
|
||||
offset = *((uint32_t *) &data[pos]);
|
||||
length = *((uint32_t *) &data[pos+4]);
|
||||
return strings->isNullValue(offset, length);
|
||||
uint64_t offset;
|
||||
offset = *((uint64_t *) &data[pos]);
|
||||
return strings->isNullValue(offset);
|
||||
}
|
||||
if (*((uint16_t*) &data[pos]) == 0)
|
||||
return true;
|
||||
@ -1416,8 +1419,8 @@ RGData RowGroup::duplicate()
|
||||
|
||||
void Row::setStringField(const std::string &val, uint32_t colIndex)
|
||||
{
|
||||
uint32_t length;
|
||||
uint32_t offset;
|
||||
uint64_t offset;
|
||||
uint64_t length;
|
||||
|
||||
//length = strlen(val.c_str()) + 1;
|
||||
length = val.length();
|
||||
@ -1426,8 +1429,7 @@ void Row::setStringField(const std::string &val, uint32_t colIndex)
|
||||
|
||||
if (inStringTable(colIndex)) {
|
||||
offset = strings->storeString((const uint8_t *) val.data(), length);
|
||||
*((uint32_t *) &data[offsets[colIndex]]) = offset;
|
||||
*((uint32_t *) &data[offsets[colIndex] + 4]) = length;
|
||||
*((uint64_t *) &data[offsets[colIndex]]) = offset;
|
||||
// cout << " -- stored offset " << *((uint32_t *) &data[offsets[colIndex]])
|
||||
// << " length " << *((uint32_t *) &data[offsets[colIndex] + 4])
|
||||
// << endl;
|
||||
|
@ -92,13 +92,14 @@ public:
|
||||
StringStore();
|
||||
virtual ~StringStore();
|
||||
|
||||
inline std::string getString(uint32_t offset, uint32_t length) const;
|
||||
uint32_t storeString(const uint8_t *data, uint32_t length); //returns the offset
|
||||
inline const uint8_t * getPointer(uint32_t offset) const;
|
||||
inline std::string getString(uint64_t offset) const;
|
||||
uint64_t storeString(const uint8_t *data, uint32_t length); //returns the offset
|
||||
inline const uint8_t * getPointer(uint64_t offset) const;
|
||||
inline uint32_t getStringLength(uint64_t offset);
|
||||
inline bool isEmpty() const;
|
||||
inline uint64_t getSize() const;
|
||||
inline bool isNullValue(uint32_t offset, uint32_t length) const;
|
||||
inline bool equals(const std::string &str, uint32_t offset, uint32_t length) const;
|
||||
inline bool isNullValue(uint64_t offset) const;
|
||||
inline bool equals(const std::string &str, uint64_t offset) const;
|
||||
|
||||
void clear();
|
||||
|
||||
@ -541,9 +542,8 @@ inline bool Row::equals(uint64_t val, uint32_t colIndex) const
|
||||
inline bool Row::equals(const std::string &val, uint32_t colIndex) const
|
||||
{
|
||||
if (inStringTable(colIndex)) {
|
||||
uint32_t offset = *((uint32_t *) &data[offsets[colIndex]]);
|
||||
uint32_t length = *((uint32_t *) &data[offsets[colIndex] + 4]);
|
||||
return strings->equals(val, offset, length);
|
||||
uint64_t offset = *((uint64_t *) &data[offsets[colIndex]]);
|
||||
return strings->equals(val, offset);
|
||||
}
|
||||
else
|
||||
return (strncmp(val.c_str(), (char *) &data[offsets[colIndex]], getColumnWidth(colIndex)) == 0);
|
||||
@ -609,28 +609,27 @@ inline int64_t Row::getIntField(uint32_t colIndex) const
|
||||
inline const uint8_t * Row::getStringPointer(uint32_t colIndex) const
|
||||
{
|
||||
if (inStringTable(colIndex))
|
||||
return strings->getPointer(*((uint32_t *) &data[offsets[colIndex]]));
|
||||
return strings->getPointer(*((uint64_t *) &data[offsets[colIndex]]));
|
||||
return &data[offsets[colIndex]];
|
||||
}
|
||||
|
||||
inline uint32_t Row::getStringLength(uint32_t colIndex) const
|
||||
{
|
||||
if (inStringTable(colIndex))
|
||||
return *((uint32_t *) &data[offsets[colIndex] + 4]);
|
||||
return strings->getStringLength(*((uint64_t *) &data[offsets[colIndex]]));
|
||||
return strnlen((char *) &data[offsets[colIndex]], getColumnWidth(colIndex));
|
||||
}
|
||||
|
||||
inline void Row::setStringField(const uint8_t *strdata, uint32_t length, uint32_t colIndex)
|
||||
{
|
||||
uint32_t offset;
|
||||
uint64_t offset;
|
||||
|
||||
if (length > getColumnWidth(colIndex))
|
||||
length = getColumnWidth(colIndex);
|
||||
|
||||
if (inStringTable(colIndex)) {
|
||||
offset = strings->storeString(strdata, length);
|
||||
*((uint32_t *) &data[offsets[colIndex]]) = offset;
|
||||
*((uint32_t *) &data[offsets[colIndex] + 4]) = length;
|
||||
*((uint64_t *) &data[offsets[colIndex]]) = offset;
|
||||
// cout << " -- stored offset " << *((uint32_t *) &data[offsets[colIndex]])
|
||||
// << " length " << *((uint32_t *) &data[offsets[colIndex] + 4])
|
||||
// << endl;
|
||||
@ -645,8 +644,7 @@ inline void Row::setStringField(const uint8_t *strdata, uint32_t length, uint32_
|
||||
inline std::string Row::getStringField(uint32_t colIndex) const
|
||||
{
|
||||
if (inStringTable(colIndex))
|
||||
return strings->getString(*((uint32_t *) &data[offsets[colIndex]]),
|
||||
*((uint32_t *) &data[offsets[colIndex] + 4]));
|
||||
return strings->getString(*((uint64_t *) &data[offsets[colIndex]]));
|
||||
// Not all CHAR/VARCHAR are NUL terminated so use length
|
||||
return std::string((char *) &data[offsets[colIndex]],
|
||||
strnlen((char *) &data[offsets[colIndex]], getColumnWidth(colIndex)));
|
||||
@ -662,21 +660,21 @@ inline std::string Row::getVarBinaryStringField(uint32_t colIndex) const
|
||||
inline uint32_t Row::getVarBinaryLength(uint32_t colIndex) const
|
||||
{
|
||||
if (inStringTable(colIndex))
|
||||
return *((uint32_t *) &data[offsets[colIndex] + 4]);
|
||||
return strings->getStringLength(*((uint64_t *) &data[offsets[colIndex]]));;
|
||||
return *((uint16_t*) &data[offsets[colIndex]]);
|
||||
}
|
||||
|
||||
inline const uint8_t* Row::getVarBinaryField(uint32_t colIndex) const
|
||||
{
|
||||
if (inStringTable(colIndex))
|
||||
return strings->getPointer(*((uint32_t *) &data[offsets[colIndex]]));
|
||||
return strings->getPointer(*((uint64_t *) &data[offsets[colIndex]]));
|
||||
return &data[offsets[colIndex] + 2];
|
||||
}
|
||||
|
||||
inline const uint8_t* Row::getVarBinaryField(uint32_t& len, uint32_t colIndex) const
|
||||
{
|
||||
if (inStringTable(colIndex)) {
|
||||
len = *((uint32_t *) &data[offsets[colIndex] + 4]);
|
||||
len = strings->getStringLength(*((uint64_t *) &data[offsets[colIndex]]));
|
||||
return getVarBinaryField(colIndex);
|
||||
}
|
||||
else {
|
||||
@ -854,9 +852,8 @@ inline void Row::setVarBinaryField(const uint8_t *val, uint32_t len, uint32_t co
|
||||
if (len > getColumnWidth(colIndex))
|
||||
len = getColumnWidth(colIndex);
|
||||
if (inStringTable(colIndex)) {
|
||||
uint32_t offset = strings->storeString(val, len);
|
||||
*((uint32_t *) &data[offsets[colIndex]]) = offset;
|
||||
*((uint32_t *) &data[offsets[colIndex] + 4]) = len;
|
||||
uint64_t offset = strings->storeString(val, len);
|
||||
*((uint64_t *) &data[offsets[colIndex]]) = offset;
|
||||
}
|
||||
else {
|
||||
*((uint16_t*) &data[offsets[colIndex]]) = len;
|
||||
@ -1535,49 +1532,53 @@ inline void copyRow(const Row &in, Row *out)
|
||||
copyRow(in, out, std::min(in.getColumnCount(), out->getColumnCount()));
|
||||
}
|
||||
|
||||
inline std::string StringStore::getString(uint32_t off, uint32_t len) const
|
||||
inline std::string StringStore::getString(uint64_t off) const
|
||||
{
|
||||
if (off == std::numeric_limits<uint32_t>::max())
|
||||
uint32_t length;
|
||||
if (off == std::numeric_limits<uint64_t>::max())
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
if (off & 0x8000000000000000)
|
||||
{
|
||||
off = off - 0x80000000;
|
||||
off = off - 0x8000000000000000;
|
||||
if (longStrings.size() <= off)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
mc = (MemChunk*) longStrings[off].get();
|
||||
return std::string((char *) mc->data, len);
|
||||
memcpy(&length, mc->data, 4);
|
||||
return std::string((char *) mc->data+4, length);
|
||||
}
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
uint64_t chunk = off / CHUNK_SIZE;
|
||||
uint64_t offset = off % CHUNK_SIZE;
|
||||
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||
// what gets returned, it just can't go out of bounds.
|
||||
if (mem.size() <= chunk)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
|
||||
memcpy(&length, &mc->data[offset], 4);
|
||||
if ((offset + length) > mc->currentSize)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
return std::string((char *) &(mc->data[offset]), len);
|
||||
return std::string((char *) &(mc->data[offset])+4, length);
|
||||
}
|
||||
|
||||
inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
||||
inline const uint8_t * StringStore::getPointer(uint64_t off) const
|
||||
{
|
||||
if (off == std::numeric_limits<uint32_t>::max())
|
||||
if (off == std::numeric_limits<uint64_t>::max())
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
uint64_t chunk = off / CHUNK_SIZE;
|
||||
uint64_t offset = off % CHUNK_SIZE;
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
if (off & 0x8000000000000000)
|
||||
{
|
||||
off = off - 0x80000000;
|
||||
off = off - 0x8000000000000000;
|
||||
if (longStrings.size() <= off)
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
mc = (MemChunk*) longStrings[off].get();
|
||||
return mc->data;
|
||||
return mc->data+4;
|
||||
}
|
||||
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||
// what gets returned, it just can't go out of bounds.
|
||||
@ -1587,19 +1588,17 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
||||
if (offset > mc->currentSize)
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
return &(mc->data[offset]);
|
||||
return &(mc->data[offset]) + 4;
|
||||
}
|
||||
|
||||
inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
||||
inline bool StringStore::isNullValue(uint64_t off) const
|
||||
{
|
||||
if (off == std::numeric_limits<uint32_t>::max() || len == 0)
|
||||
uint32_t length;
|
||||
if (off == std::numeric_limits<uint64_t>::max())
|
||||
return true;
|
||||
|
||||
if (len < 8)
|
||||
return false;
|
||||
|
||||
// Long strings won't be NULL
|
||||
if (off & 0x80000000)
|
||||
if (off & 0x8000000000000000)
|
||||
return false;
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
@ -1609,31 +1608,38 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
||||
return true;
|
||||
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
memcpy(&length, &mc->data[offset], 4);
|
||||
if (length == 0)
|
||||
return true;
|
||||
if (mc->data[offset] == 0) // "" = NULL string for some reason...
|
||||
if (length < 8)
|
||||
return false;
|
||||
if ((offset + length) > mc->currentSize)
|
||||
return true;
|
||||
return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str()));
|
||||
if (mc->data[offset+4] == 0) // "" = NULL string for some reason...
|
||||
return true;
|
||||
return (*((uint64_t *) &mc->data[offset]+4) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str()));
|
||||
}
|
||||
|
||||
inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const
|
||||
inline bool StringStore::equals(const std::string &str, uint64_t off) const
|
||||
{
|
||||
if (off == std::numeric_limits<uint32_t>::max() || len == 0)
|
||||
uint32_t length;
|
||||
if (off == std::numeric_limits<uint64_t>::max())
|
||||
return str == joblist::CPNULLSTRMARK;
|
||||
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
if (off & 0x8000000000000000)
|
||||
{
|
||||
if (longStrings.size() <= (off - 0x80000000))
|
||||
if (longStrings.size() <= (off - 0x8000000000000000))
|
||||
return false;
|
||||
|
||||
mc = (MemChunk *) longStrings[off - 0x80000000].get();
|
||||
mc = (MemChunk *) longStrings[off - 0x8000000000000000].get();
|
||||
|
||||
memcpy(&length, mc->data, 4);
|
||||
// Not sure if this check it needed, but adds safety
|
||||
if (len > mc->currentSize)
|
||||
if (length > mc->currentSize)
|
||||
return false;
|
||||
|
||||
return (strncmp(str.c_str(), (const char*) mc->data, len) == 0);
|
||||
return (strncmp(str.c_str(), (const char*) mc->data+4, length) == 0);
|
||||
}
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
@ -1641,10 +1647,37 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l
|
||||
return false;
|
||||
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
memcpy(&length, &mc->data[offset], 4);
|
||||
if ((offset + length) > mc->currentSize)
|
||||
return false;
|
||||
|
||||
return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0);
|
||||
return (strncmp(str.c_str(), (const char *) &mc->data[offset]+4, length) == 0);
|
||||
}
|
||||
inline uint32_t StringStore::getStringLength(uint64_t off)
|
||||
{
|
||||
uint32_t length;
|
||||
MemChunk *mc;
|
||||
if (off == std::numeric_limits<uint64_t>::max())
|
||||
return 0;
|
||||
if (off & 0x8000000000000000)
|
||||
{
|
||||
off = off - 0x8000000000000000;
|
||||
if (longStrings.size() <= off)
|
||||
return 0;
|
||||
mc = (MemChunk*) longStrings[off].get();
|
||||
memcpy(&length, mc->data, 4);
|
||||
}
|
||||
else
|
||||
{
|
||||
uint64_t chunk = off / CHUNK_SIZE;
|
||||
uint64_t offset = off % CHUNK_SIZE;
|
||||
if (mem.size() <= chunk)
|
||||
return 0;
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
memcpy(&length, &mc->data[offset], 4);
|
||||
}
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
inline bool StringStore::isEmpty() const
|
||||
|
Reference in New Issue
Block a user