From ac3e702a3e038765772f662da029e4e7c61333c5 Mon Sep 17 00:00:00 2001 From: Andrew Hutchings Date: Tue, 8 May 2018 19:38:06 +0100 Subject: [PATCH] MCOL-1396 Allow StringStore to hold more than 2GB StringStore originally worked by returning a 32bit pointer to a memory location and storing the length with that pointer. This allowed 4GB to be stored in 64KB blocks. With 1.1 we used the high bit to signify a TEXT/BLOB string of > 64KB reducing the max capacity to 2GB but without any bounds checking. So, if you went over the 2GB mark the getter would think you are trying to get a long string instead of a short one and come up empty. It would then return NULL. This patch uses 64bit memory pointers, still retaining the high bit to signify long strings. It also now stores the length with the string rather than with the pointer to allow the full 64bits for pointers. It also adds a bounds check for small strings. --- utils/rowgroup/rowgroup.cpp | 64 ++++++++-------- utils/rowgroup/rowgroup.h | 147 ++++++++++++++++++++++-------------- 2 files changed, 123 insertions(+), 88 deletions(-) diff --git a/utils/rowgroup/rowgroup.cpp b/utils/rowgroup/rowgroup.cpp index 48bdd7031..ba64e3596 100755 --- a/utils/rowgroup/rowgroup.cpp +++ b/utils/rowgroup/rowgroup.cpp @@ -79,10 +79,10 @@ StringStore::~StringStore() #endif } -uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) +uint64_t StringStore::storeString(const uint8_t *data, uint32_t len) { MemChunk *lastMC = NULL; - uint32_t ret = 0; + uint64_t ret = 0; empty = false; // At least a NULL is being stored. 
@@ -92,7 +92,7 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) if ((len == 8 || len == 9) && *((uint64_t *) data) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str())) - return numeric_limits::max(); + return numeric_limits::max(); //@bug6065, make StringStore::storeString() thread safe boost::mutex::scoped_lock lk(fMutex, defer_lock); @@ -102,20 +102,21 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) if (mem.size() > 0) lastMC = (MemChunk *) mem.back().get(); - if (len >= CHUNK_SIZE) + if ((len+4) >= CHUNK_SIZE) { - shared_array newOne(new uint8_t[len + sizeof(MemChunk)]); + shared_array newOne(new uint8_t[len + sizeof(MemChunk) + 4]); longStrings.push_back(newOne); lastMC = (MemChunk*) longStrings.back().get(); - lastMC->capacity = lastMC->currentSize = len; - memcpy(lastMC->data, data, len); + lastMC->capacity = lastMC->currentSize = len + 4; + memcpy(lastMC->data, &len, 4); + memcpy(lastMC->data + 4, data, len); // High bit to mark a long string - ret = 0x80000000; + ret = 0x8000000000000000; ret += longStrings.size() - 1; } else { - if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len)) + if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < (len + 4))) { // mem usage debugging //if (lastMC) @@ -130,7 +131,11 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize; - memcpy(&(lastMC->data[lastMC->currentSize]), data, len); + // If this ever happens then we have big problems + if (ret & 0x8000000000000000) + throw logic_error("StringStore memory exceeded."); + memcpy(&(lastMC->data[lastMC->currentSize]), &len, 4); + memcpy(&(lastMC->data[lastMC->currentSize]) + 4, data, len); /* cout << "stored: '" << hex; for (uint32_t i = 0; i < len ; i++) { @@ -138,7 +143,7 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) } cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl; */ 
- lastMC->currentSize += len; + lastMC->currentSize += len + 4; } return ret; @@ -146,31 +151,31 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) void StringStore::serialize(ByteStream &bs) const { - uint32_t i; + uint64_t i; MemChunk *mc; - bs << (uint32_t) mem.size(); + bs << (uint64_t) mem.size(); bs << (uint8_t) empty; for (i = 0; i < mem.size(); i++) { mc = (MemChunk *) mem[i].get(); - bs << (uint32_t) mc->currentSize; + bs << (uint64_t) mc->currentSize; //cout << "serialized " << mc->currentSize << " bytes\n"; bs.append(mc->data, mc->currentSize); } - bs << (uint32_t) longStrings.size(); + bs << (uint64_t) longStrings.size(); for (i = 0; i < longStrings.size(); i++) { mc = (MemChunk *) longStrings[i].get(); - bs << (uint32_t) mc->currentSize; + bs << (uint64_t) mc->currentSize; bs.append(mc->data, mc->currentSize); } } void StringStore::deserialize(ByteStream &bs) { - uint32_t i; - uint32_t count; - uint32_t size; + uint64_t i; + uint64_t count; + uint64_t size; uint8_t *buf; MemChunk *mc; uint8_t tmp8; @@ -718,10 +723,9 @@ bool Row::isNullValue(uint32_t colIndex) const case CalpontSystemCatalog::STRINT: { uint32_t len = getColumnWidth(colIndex); if (inStringTable(colIndex)) { - uint32_t offset, length; - offset = *((uint32_t *) &data[offsets[colIndex]]); - length = *((uint32_t *) &data[offsets[colIndex] + 4]); - return strings->isNullValue(offset, length); + uint64_t offset; + offset = *((uint64_t *) &data[offsets[colIndex]]); + return strings->isNullValue(offset); } if (data[offsets[colIndex]] == 0) // empty string return true; @@ -757,10 +761,9 @@ bool Row::isNullValue(uint32_t colIndex) const case CalpontSystemCatalog::VARBINARY: { uint32_t pos = offsets[colIndex]; if (inStringTable(colIndex)) { - uint32_t offset, length; - offset = *((uint32_t *) &data[pos]); - length = *((uint32_t *) &data[pos+4]); - return strings->isNullValue(offset, length); + uint64_t offset; + offset = *((uint64_t *) &data[pos]); + return 
strings->isNullValue(offset); } if (*((uint16_t*) &data[pos]) == 0) return true; @@ -1416,8 +1419,8 @@ RGData RowGroup::duplicate() void Row::setStringField(const std::string &val, uint32_t colIndex) { - uint32_t length; - uint32_t offset; + uint64_t offset; + uint64_t length; //length = strlen(val.c_str()) + 1; length = val.length(); @@ -1426,8 +1429,7 @@ void Row::setStringField(const std::string &val, uint32_t colIndex) if (inStringTable(colIndex)) { offset = strings->storeString((const uint8_t *) val.data(), length); - *((uint32_t *) &data[offsets[colIndex]]) = offset; - *((uint32_t *) &data[offsets[colIndex] + 4]) = length; + *((uint64_t *) &data[offsets[colIndex]]) = offset; // cout << " -- stored offset " << *((uint32_t *) &data[offsets[colIndex]]) // << " length " << *((uint32_t *) &data[offsets[colIndex] + 4]) // << endl; diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h index 8b5ea75d7..7aca0c93f 100755 --- a/utils/rowgroup/rowgroup.h +++ b/utils/rowgroup/rowgroup.h @@ -92,13 +92,14 @@ public: StringStore(); virtual ~StringStore(); - inline std::string getString(uint32_t offset, uint32_t length) const; - uint32_t storeString(const uint8_t *data, uint32_t length); //returns the offset - inline const uint8_t * getPointer(uint32_t offset) const; + inline std::string getString(uint64_t offset) const; + uint64_t storeString(const uint8_t *data, uint32_t length); //returns the offset + inline const uint8_t * getPointer(uint64_t offset) const; + inline uint32_t getStringLength(uint64_t offset); inline bool isEmpty() const; inline uint64_t getSize() const; - inline bool isNullValue(uint32_t offset, uint32_t length) const; - inline bool equals(const std::string &str, uint32_t offset, uint32_t length) const; + inline bool isNullValue(uint64_t offset) const; + inline bool equals(const std::string &str, uint64_t offset) const; void clear(); @@ -541,9 +542,8 @@ inline bool Row::equals(uint64_t val, uint32_t colIndex) const inline bool 
Row::equals(const std::string &val, uint32_t colIndex) const { if (inStringTable(colIndex)) { - uint32_t offset = *((uint32_t *) &data[offsets[colIndex]]); - uint32_t length = *((uint32_t *) &data[offsets[colIndex] + 4]); - return strings->equals(val, offset, length); + uint64_t offset = *((uint64_t *) &data[offsets[colIndex]]); + return strings->equals(val, offset); } else return (strncmp(val.c_str(), (char *) &data[offsets[colIndex]], getColumnWidth(colIndex)) == 0); @@ -609,28 +609,27 @@ inline int64_t Row::getIntField(uint32_t colIndex) const inline const uint8_t * Row::getStringPointer(uint32_t colIndex) const { if (inStringTable(colIndex)) - return strings->getPointer(*((uint32_t *) &data[offsets[colIndex]])); + return strings->getPointer(*((uint64_t *) &data[offsets[colIndex]])); return &data[offsets[colIndex]]; } inline uint32_t Row::getStringLength(uint32_t colIndex) const { if (inStringTable(colIndex)) - return *((uint32_t *) &data[offsets[colIndex] + 4]); + return strings->getStringLength(*((uint64_t *) &data[offsets[colIndex]])); return strnlen((char *) &data[offsets[colIndex]], getColumnWidth(colIndex)); } inline void Row::setStringField(const uint8_t *strdata, uint32_t length, uint32_t colIndex) { - uint32_t offset; + uint64_t offset; if (length > getColumnWidth(colIndex)) length = getColumnWidth(colIndex); if (inStringTable(colIndex)) { offset = strings->storeString(strdata, length); - *((uint32_t *) &data[offsets[colIndex]]) = offset; - *((uint32_t *) &data[offsets[colIndex] + 4]) = length; + *((uint64_t *) &data[offsets[colIndex]]) = offset; // cout << " -- stored offset " << *((uint32_t *) &data[offsets[colIndex]]) // << " length " << *((uint32_t *) &data[offsets[colIndex] + 4]) // << endl; @@ -645,8 +644,7 @@ inline void Row::setStringField(const uint8_t *strdata, uint32_t length, uint32_ inline std::string Row::getStringField(uint32_t colIndex) const { if (inStringTable(colIndex)) - return strings->getString(*((uint32_t *) 
&data[offsets[colIndex]]), - *((uint32_t *) &data[offsets[colIndex] + 4])); + return strings->getString(*((uint64_t *) &data[offsets[colIndex]])); // Not all CHAR/VARCHAR are NUL terminated so use length return std::string((char *) &data[offsets[colIndex]], strnlen((char *) &data[offsets[colIndex]], getColumnWidth(colIndex))); @@ -662,21 +660,21 @@ inline std::string Row::getVarBinaryStringField(uint32_t colIndex) const inline uint32_t Row::getVarBinaryLength(uint32_t colIndex) const { if (inStringTable(colIndex)) - return *((uint32_t *) &data[offsets[colIndex] + 4]); + return strings->getStringLength(*((uint64_t *) &data[offsets[colIndex]]));; return *((uint16_t*) &data[offsets[colIndex]]); } inline const uint8_t* Row::getVarBinaryField(uint32_t colIndex) const { if (inStringTable(colIndex)) - return strings->getPointer(*((uint32_t *) &data[offsets[colIndex]])); + return strings->getPointer(*((uint64_t *) &data[offsets[colIndex]])); return &data[offsets[colIndex] + 2]; } inline const uint8_t* Row::getVarBinaryField(uint32_t& len, uint32_t colIndex) const { if (inStringTable(colIndex)) { - len = *((uint32_t *) &data[offsets[colIndex] + 4]); + len = strings->getStringLength(*((uint64_t *) &data[offsets[colIndex]])); return getVarBinaryField(colIndex); } else { @@ -854,9 +852,8 @@ inline void Row::setVarBinaryField(const uint8_t *val, uint32_t len, uint32_t co if (len > getColumnWidth(colIndex)) len = getColumnWidth(colIndex); if (inStringTable(colIndex)) { - uint32_t offset = strings->storeString(val, len); - *((uint32_t *) &data[offsets[colIndex]]) = offset; - *((uint32_t *) &data[offsets[colIndex] + 4]) = len; + uint64_t offset = strings->storeString(val, len); + *((uint64_t *) &data[offsets[colIndex]]) = offset; } else { *((uint16_t*) &data[offsets[colIndex]]) = len; @@ -1535,49 +1532,53 @@ inline void copyRow(const Row &in, Row *out) copyRow(in, out, std::min(in.getColumnCount(), out->getColumnCount())); } -inline std::string StringStore::getString(uint32_t off, 
uint32_t len) const +inline std::string StringStore::getString(uint64_t off) const { - if (off == std::numeric_limits::max()) + uint32_t length; + if (off == std::numeric_limits::max()) return joblist::CPNULLSTRMARK; MemChunk *mc; - if (off & 0x80000000) + if (off & 0x8000000000000000) { - off = off - 0x80000000; + off = off - 0x8000000000000000; if (longStrings.size() <= off) return joblist::CPNULLSTRMARK; mc = (MemChunk*) longStrings[off].get(); - return std::string((char *) mc->data, len); + memcpy(&length, mc->data, 4); + return std::string((char *) mc->data+4, length); } - uint32_t chunk = off / CHUNK_SIZE; - uint32_t offset = off % CHUNK_SIZE; + uint64_t chunk = off / CHUNK_SIZE; + uint64_t offset = off % CHUNK_SIZE; // this has to handle uninitialized data as well. If it's uninitialized it doesn't matter // what gets returned, it just can't go out of bounds. if (mem.size() <= chunk) return joblist::CPNULLSTRMARK; mc = (MemChunk *) mem[chunk].get(); - if ((offset + len) > mc->currentSize) + + memcpy(&length, &mc->data[offset], 4); + if ((offset + length) > mc->currentSize) return joblist::CPNULLSTRMARK; - - return std::string((char *) &(mc->data[offset]), len); + + return std::string((char *) &(mc->data[offset])+4, length); } -inline const uint8_t * StringStore::getPointer(uint32_t off) const +inline const uint8_t * StringStore::getPointer(uint64_t off) const { - if (off == std::numeric_limits::max()) + if (off == std::numeric_limits::max()) return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); - uint32_t chunk = off / CHUNK_SIZE; - uint32_t offset = off % CHUNK_SIZE; + uint64_t chunk = off / CHUNK_SIZE; + uint64_t offset = off % CHUNK_SIZE; MemChunk *mc; - if (off & 0x80000000) + if (off & 0x8000000000000000) { - off = off - 0x80000000; + off = off - 0x8000000000000000; if (longStrings.size() <= off) return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); mc = (MemChunk*) longStrings[off].get(); - return mc->data; + return mc->data+4; } // this has to 
handle uninitialized data as well. If it's uninitialized it doesn't matter // what gets returned, it just can't go out of bounds. @@ -1587,19 +1588,17 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const if (offset > mc->currentSize) return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); - return &(mc->data[offset]); + return &(mc->data[offset]) + 4; } -inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const +inline bool StringStore::isNullValue(uint64_t off) const { - if (off == std::numeric_limits::max() || len == 0) + uint32_t length; + if (off == std::numeric_limits::max()) return true; - if (len < 8) - return false; - // Long strings won't be NULL - if (off & 0x80000000) + if (off & 0x8000000000000000) return false; uint32_t chunk = off / CHUNK_SIZE; @@ -1609,31 +1608,38 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const return true; mc = (MemChunk *) mem[chunk].get(); - if ((offset + len) > mc->currentSize) + memcpy(&length, &mc->data[offset], 4); + if (length == 0) return true; - if (mc->data[offset] == 0) // "" = NULL string for some reason... + if (length < 8) + return false; + if ((offset + length) > mc->currentSize) return true; - return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str())); + if (mc->data[offset+4] == 0) // "" = NULL string for some reason... 
+ return true; + return (*((uint64_t *) &mc->data[offset]+4) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str())); } -inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const +inline bool StringStore::equals(const std::string &str, uint64_t off) const { - if (off == std::numeric_limits::max() || len == 0) + uint32_t length; + if (off == std::numeric_limits::max()) return str == joblist::CPNULLSTRMARK; MemChunk *mc; - if (off & 0x80000000) + if (off & 0x8000000000000000) { - if (longStrings.size() <= (off - 0x80000000)) + if (longStrings.size() <= (off - 0x8000000000000000)) return false; - mc = (MemChunk *) longStrings[off - 0x80000000].get(); + mc = (MemChunk *) longStrings[off - 0x8000000000000000].get(); + memcpy(&length, mc->data, 4); // Not sure if this check it needed, but adds safety - if (len > mc->currentSize) + if (length > mc->currentSize) return false; - return (strncmp(str.c_str(), (const char*) mc->data, len) == 0); + return (strncmp(str.c_str(), (const char*) mc->data+4, length) == 0); } uint32_t chunk = off / CHUNK_SIZE; uint32_t offset = off % CHUNK_SIZE; @@ -1641,10 +1647,37 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l return false; mc = (MemChunk *) mem[chunk].get(); - if ((offset + len) > mc->currentSize) + memcpy(&length, &mc->data[offset], 4); + if ((offset + length) > mc->currentSize) return false; - return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0); + return (strncmp(str.c_str(), (const char *) &mc->data[offset]+4, length) == 0); +} +inline uint32_t StringStore::getStringLength(uint64_t off) +{ + uint32_t length; + MemChunk *mc; + if (off == std::numeric_limits::max()) + return 0; + if (off & 0x8000000000000000) + { + off = off - 0x8000000000000000; + if (longStrings.size() <= off) + return 0; + mc = (MemChunk*) longStrings[off].get(); + memcpy(&length, mc->data, 4); + } + else + { + uint64_t chunk = off / CHUNK_SIZE; + uint64_t offset = off % 
CHUNK_SIZE; + if (mem.size() <= chunk) + return 0; + mc = (MemChunk *) mem[chunk].get(); + memcpy(&length, &mc->data[offset], 4); + } + + return length; } inline bool StringStore::isEmpty() const