From 3f040173d2aa2b6c69eef193e9941a043dd265bb Mon Sep 17 00:00:00 2001 From: Andrew Hutchings Date: Mon, 14 Aug 2017 21:47:04 +0100 Subject: [PATCH] MCOL-874 StringStore Mk.3 StringStore as a vector of std::string had a performance regressions and a rare crash. This new version of StringStore restores the original StringStore with the 64KB limitation and adds another vector to store strings that won't fit into the small string storage. --- utils/rowgroup/rowgroup.cpp | 103 +++++++++++++++++++++++++++------- utils/rowgroup/rowgroup.h | 108 ++++++++++++++++++++++++++++-------- 2 files changed, 169 insertions(+), 42 deletions(-) diff --git a/utils/rowgroup/rowgroup.cpp b/utils/rowgroup/rowgroup.cpp index 886035b98..bcfa3e9eb 100755 --- a/utils/rowgroup/rowgroup.cpp +++ b/utils/rowgroup/rowgroup.cpp @@ -69,10 +69,10 @@ StringStore::~StringStore() uint32_t i; uint64_t inUse = 0, allocated = 0; - for (i = 0; i < mem.size(); i++) { - std::string *tmp = mem.back().get(); - inUse += tmp->length(); - allocated += tmp->length(); + for (i = 0; i < mem.size(); i++) { + MemChunk *tmp = (MemChunk *) mem.back().get(); + inUse += tmp->currentSize; + allocated += tmp->capacity; } if (allocated > 0) cout << "~SS: " << inUse << "/" << allocated << " = " << (float) inUse/(float) allocated << endl; @@ -81,6 +81,7 @@ StringStore::~StringStore() uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) { + MemChunk *lastMC = NULL; uint32_t ret = 0; empty = false; // At least a NULL is being stored. @@ -98,10 +99,47 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) if (fUseStoreStringMutex) lk.lock(); - shared_ptr newString(new std::string((char*)data, len)); - mem.push_back(newString); + if (mem.size() > 0) + lastMC = (MemChunk *) mem.back().get(); - ret = mem.size(); + if (len >= CHUNK_SIZE) + { + shared_array newOne(new uint8_t[len + sizeof(MemChunk)]); + longStrings.push_back(newOne); + lastMC = (MemChunk*) longStrings.back().get(); + lastMC->capacity = lastMC->currentSize = len; + memcpy(lastMC->data, data, len); + // High bit to mark a long string + ret = 0x80000000; + ret += longStrings.size() - 1; + } + else + { + if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len)) + { + // mem usage debugging + //if (lastMC) + //cout << "Memchunk efficiency = " << lastMC->currentSize << "/" << lastMC->capacity << endl; + shared_array newOne(new uint8_t[CHUNK_SIZE + sizeof(MemChunk)]); + mem.push_back(newOne); + lastMC = (MemChunk *) mem.back().get(); + lastMC->currentSize = 0; + lastMC->capacity = CHUNK_SIZE; + memset(lastMC->data, 0, CHUNK_SIZE); + } + + + ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize; + memcpy(&(lastMC->data[lastMC->currentSize]), data, len); + /* + cout << "stored: '" << hex; + for (uint32_t i = 0; i < len ; i++) { + cout << (char) lastMC->data[lastMC->currentSize + i]; + } + cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl; + */ + lastMC->currentSize += len; + } return ret; } @@ -109,15 +147,22 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) void StringStore::serialize(ByteStream &bs) const { uint32_t i; + MemChunk *mc; bs << (uint32_t) mem.size(); bs << (uint8_t) empty; for (i = 0; i < mem.size(); i++) { - if (mem[i].get() == NULL) - bs << empty_str; - else - bs << *mem[i].get(); + mc = (MemChunk *) mem[i].get(); + bs << (uint32_t) mc->currentSize; //cout << "serialized " << mc->currentSize << " bytes\n"; + bs.append(mc->data, mc->currentSize); + } + bs << (uint32_t) longStrings.size(); + for (i = 0; i < longStrings.size(); i++) + { + mc = (MemChunk *) longStrings[i].get(); + bs << (uint32_t) mc->currentSize; + bs.append(mc->data, mc->currentSize); } } @@ -125,30 +170,48 @@ void StringStore::deserialize(ByteStream &bs) { uint32_t i; uint32_t count; - std::string buf; + uint32_t size; + uint8_t *buf; + MemChunk *mc; uint8_t tmp8; //mem.clear(); bs >> count; - mem.reserve(count); + mem.resize(count); bs >> tmp8; empty = (bool) tmp8; for (i = 0; i < count; i++) { + bs >> size; //cout << "deserializing " << size << " bytes\n"; - bs >> buf; - // We do this to avoid pre-C++11 zero copy hell but need to - // preserve all data including NULs so using c_str() is out. - shared_ptr newString(new std::string()); - newString->append(buf); - mem.push_back(newString); + buf = bs.buf(); + mem[i].reset(new uint8_t[size + sizeof(MemChunk)]); + mc = (MemChunk *) mem[i].get(); + mc->currentSize = size; + mc->capacity = size; + memcpy(mc->data, buf, size); + bs.advance(size); } + bs >> count; + longStrings.resize(count); + for (i = 0; i < count; i++) + { + bs >> size; + buf = bs.buf(); + longStrings[i].reset(new uint8_t[size + sizeof(MemChunk)]); + mc = (MemChunk *) longStrings[i].get(); + mc->capacity = mc->currentSize = size; + memcpy(mc->data, buf, size); + bs.advance(size); + } return; } void StringStore::clear() { - vector > emptyv; + vector > emptyv; + vector > emptyv2; mem.swap(emptyv); + longStrings.swap(emptyv2); empty = true; } diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h index c2504b99a..8b5ea75d7 100755 --- a/utils/rowgroup/rowgroup.h +++ b/utils/rowgroup/rowgroup.h @@ -118,9 +118,18 @@ private: // This is an overlay b/c the underlying data needs to be any size, // and alloc'd in one chunk. data can't be a separate dynamic chunk. - - std::vector > mem; - bool empty; + struct MemChunk + { + uint32_t currentSize; + uint32_t capacity; + uint8_t data[]; + }; + + std::vector > mem; + + // To store strings > 64KB (BLOB/TEXT) + std::vector > longStrings; + bool empty; bool fUseStoreStringMutex; //@bug6065, make StringStore::storeString() thread safe boost::mutex fMutex; }; @@ -1531,13 +1540,27 @@ inline std::string StringStore::getString(uint32_t off, uint32_t len) const if (off == std::numeric_limits::max()) return joblist::CPNULLSTRMARK; - if ((mem.size() < off) || off == 0) + MemChunk *mc; + if (off & 0x80000000) + { + off = off - 0x80000000; + if (longStrings.size() <= off) + return joblist::CPNULLSTRMARK; + mc = (MemChunk*) longStrings[off].get(); + return std::string((char *) mc->data, len); + } + + uint32_t chunk = off / CHUNK_SIZE; + uint32_t offset = off % CHUNK_SIZE; + // this has to handle uninitialized data as well. If it's uninitialized it doesn't matter + // what gets returned, it just can't go out of bounds. + if (mem.size() <= chunk) + return joblist::CPNULLSTRMARK; + mc = (MemChunk *) mem[chunk].get(); + if ((offset + len) > mc->currentSize) return joblist::CPNULLSTRMARK; - if (mem[off-1].get() == NULL) - return joblist::CPNULLSTRMARK; - - return *mem[off-1].get(); + return std::string((char *) &(mc->data[offset]), len); } inline const uint8_t * StringStore::getPointer(uint32_t off) const @@ -1545,15 +1568,26 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const if (off == std::numeric_limits::max()) return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); + uint32_t chunk = off / CHUNK_SIZE; + uint32_t offset = off % CHUNK_SIZE; + MemChunk *mc; + if (off & 0x80000000) + { + off = off - 0x80000000; + if (longStrings.size() <= off) + return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); + mc = (MemChunk*) longStrings[off].get(); + return mc->data; + } // this has to handle uninitialized data as well. If it's uninitialized it doesn't matter // what gets returned, it just can't go out of bounds. - if (UNLIKELY(mem.size() < off)) - return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); - - if (off == 0 || (mem[off-1].get() == NULL)) + if (UNLIKELY(mem.size() <= chunk)) return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); + mc = (MemChunk *) mem[chunk].get(); + if (offset > mc->currentSize) + return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); - return (uint8_t*)mem[off-1].get()->c_str(); + return &(mc->data[offset]); } inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const @@ -1564,15 +1598,22 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const if (len < 8) return false; - if ((mem.size() < off) || off == 0) + // Long strings won't be NULL + if (off & 0x80000000) + return false; + + uint32_t chunk = off / CHUNK_SIZE; + uint32_t offset = off % CHUNK_SIZE; + MemChunk *mc; + if (mem.size() <= chunk) return true; - if (mem[off-1].get() == NULL) + mc = (MemChunk *) mem[chunk].get(); + if ((offset + len) > mc->currentSize) return true; - - if (mem[off-1].get()->empty()) // Empty string is NULL + if (mc->data[offset] == 0) // "" = NULL string for some reason... return true; - return (mem[off-1].get()->compare(joblist::CPNULLSTRMARK) == 0); + return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str())); } inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const @@ -1580,13 +1621,30 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l if (off == std::numeric_limits::max() || len == 0) return str == joblist::CPNULLSTRMARK; - if ((mem.size() < off) || off == 0) + MemChunk *mc; + if (off & 0x80000000) + { + if (longStrings.size() <= (off - 0x80000000)) + return false; + + mc = (MemChunk *) longStrings[off - 0x80000000].get(); + + // Not sure if this check it needed, but adds safety + if (len > mc->currentSize) + return false; + + return (strncmp(str.c_str(), (const char*) mc->data, len) == 0); + } + uint32_t chunk = off / CHUNK_SIZE; + uint32_t offset = off % CHUNK_SIZE; + if (mem.size() <= chunk) return false; - if (mem[off-1].get() == NULL) + mc = (MemChunk *) mem[chunk].get(); + if ((offset + len) > mc->currentSize) return false; - return (mem[off-1].get()->compare(str) == 0); + return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0); } inline bool StringStore::isEmpty() const @@ -1598,10 +1656,16 @@ inline uint64_t StringStore::getSize() const { uint32_t i; uint64_t ret = 0; + MemChunk *mc; for (i = 0; i < mem.size(); i++) { - ret+= mem[i].get()->length(); + mc = (MemChunk *) mem[i].get(); + ret += mc->capacity; } + for (i = 0; i < longStrings.size(); i++) { + mc = (MemChunk *) longStrings[i].get(); + ret += mc->capacity; + } return ret; }