1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-874 StringStore Mk.3

StringStore as a vector of std::string had a performance regressions and
a rare crash.

This new version of StringStore restores the original StringStore with
the 64KB limitation and adds another vector to store strings that won't
fit into the small string storage.
This commit is contained in:
Andrew Hutchings
2017-08-14 21:47:04 +01:00
parent 553c3d5f27
commit 3f040173d2
2 changed files with 169 additions and 42 deletions

View File

@ -69,10 +69,10 @@ StringStore::~StringStore()
uint32_t i;
uint64_t inUse = 0, allocated = 0;
for (i = 0; i < mem.size(); i++) {
std::string *tmp = mem.back().get();
inUse += tmp->length();
allocated += tmp->length();
for (i = 0; i < mem.size(); i++) {
MemChunk *tmp = (MemChunk *) mem.back().get();
inUse += tmp->currentSize;
allocated += tmp->capacity;
}
if (allocated > 0)
cout << "~SS: " << inUse << "/" << allocated << " = " << (float) inUse/(float) allocated << endl;
@ -81,6 +81,7 @@ StringStore::~StringStore()
uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
{
MemChunk *lastMC = NULL;
uint32_t ret = 0;
empty = false; // At least a NULL is being stored.
@ -98,10 +99,47 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
if (fUseStoreStringMutex)
lk.lock();
shared_ptr<std::string> newString(new std::string((char*)data, len));
mem.push_back(newString);
if (mem.size() > 0)
lastMC = (MemChunk *) mem.back().get();
ret = mem.size();
if (len >= CHUNK_SIZE)
{
shared_array<uint8_t> newOne(new uint8_t[len + sizeof(MemChunk)]);
longStrings.push_back(newOne);
lastMC = (MemChunk*) longStrings.back().get();
lastMC->capacity = lastMC->currentSize = len;
memcpy(lastMC->data, data, len);
// High bit to mark a long string
ret = 0x80000000;
ret += longStrings.size() - 1;
}
else
{
if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len))
{
// mem usage debugging
//if (lastMC)
//cout << "Memchunk efficiency = " << lastMC->currentSize << "/" << lastMC->capacity << endl;
shared_array<uint8_t> newOne(new uint8_t[CHUNK_SIZE + sizeof(MemChunk)]);
mem.push_back(newOne);
lastMC = (MemChunk *) mem.back().get();
lastMC->currentSize = 0;
lastMC->capacity = CHUNK_SIZE;
memset(lastMC->data, 0, CHUNK_SIZE);
}
ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize;
memcpy(&(lastMC->data[lastMC->currentSize]), data, len);
/*
cout << "stored: '" << hex;
for (uint32_t i = 0; i < len ; i++) {
cout << (char) lastMC->data[lastMC->currentSize + i];
}
cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl;
*/
lastMC->currentSize += len;
}
return ret;
}
@ -109,15 +147,22 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
void StringStore::serialize(ByteStream &bs) const
{
uint32_t i;
MemChunk *mc;
bs << (uint32_t) mem.size();
bs << (uint8_t) empty;
for (i = 0; i < mem.size(); i++) {
if (mem[i].get() == NULL)
bs << empty_str;
else
bs << *mem[i].get();
mc = (MemChunk *) mem[i].get();
bs << (uint32_t) mc->currentSize;
//cout << "serialized " << mc->currentSize << " bytes\n";
bs.append(mc->data, mc->currentSize);
}
bs << (uint32_t) longStrings.size();
for (i = 0; i < longStrings.size(); i++)
{
mc = (MemChunk *) longStrings[i].get();
bs << (uint32_t) mc->currentSize;
bs.append(mc->data, mc->currentSize);
}
}
@ -125,30 +170,48 @@ void StringStore::deserialize(ByteStream &bs)
{
uint32_t i;
uint32_t count;
std::string buf;
uint32_t size;
uint8_t *buf;
MemChunk *mc;
uint8_t tmp8;
//mem.clear();
bs >> count;
mem.reserve(count);
mem.resize(count);
bs >> tmp8;
empty = (bool) tmp8;
for (i = 0; i < count; i++) {
bs >> size;
//cout << "deserializing " << size << " bytes\n";
bs >> buf;
// We do this to avoid pre-C++11 zero copy hell but need to
// preserve all data including NULs so using c_str() is out.
shared_ptr<std::string> newString(new std::string());
newString->append(buf);
mem.push_back(newString);
buf = bs.buf();
mem[i].reset(new uint8_t[size + sizeof(MemChunk)]);
mc = (MemChunk *) mem[i].get();
mc->currentSize = size;
mc->capacity = size;
memcpy(mc->data, buf, size);
bs.advance(size);
}
bs >> count;
longStrings.resize(count);
for (i = 0; i < count; i++)
{
bs >> size;
buf = bs.buf();
longStrings[i].reset(new uint8_t[size + sizeof(MemChunk)]);
mc = (MemChunk *) longStrings[i].get();
mc->capacity = mc->currentSize = size;
memcpy(mc->data, buf, size);
bs.advance(size);
}
return;
}
void StringStore::clear()
{
vector<shared_ptr<std::string> > emptyv;
vector<shared_array<uint8_t> > emptyv;
vector<shared_array<uint8_t> > emptyv2;
mem.swap(emptyv);
longStrings.swap(emptyv2);
empty = true;
}

View File

@ -118,9 +118,18 @@ private:
// This is an overlay b/c the underlying data needs to be any size,
// and alloc'd in one chunk. data can't be a separate dynamic chunk.
struct MemChunk
{
uint32_t currentSize;
uint32_t capacity;
uint8_t data[];
};
std::vector<boost::shared_ptr<std::string> > mem;
bool empty;
std::vector<boost::shared_array<uint8_t> > mem;
// To store strings > 64KB (BLOB/TEXT)
std::vector<boost::shared_array<uint8_t> > longStrings;
bool empty;
bool fUseStoreStringMutex; //@bug6065, make StringStore::storeString() thread safe
boost::mutex fMutex;
};
@ -1531,13 +1540,27 @@ inline std::string StringStore::getString(uint32_t off, uint32_t len) const
if (off == std::numeric_limits<uint32_t>::max())
return joblist::CPNULLSTRMARK;
if ((mem.size() < off) || off == 0)
MemChunk *mc;
if (off & 0x80000000)
{
off = off - 0x80000000;
if (longStrings.size() <= off)
return joblist::CPNULLSTRMARK;
mc = (MemChunk*) longStrings[off].get();
return std::string((char *) mc->data, len);
}
uint32_t chunk = off / CHUNK_SIZE;
uint32_t offset = off % CHUNK_SIZE;
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
// what gets returned, it just can't go out of bounds.
if (mem.size() <= chunk)
return joblist::CPNULLSTRMARK;
mc = (MemChunk *) mem[chunk].get();
if ((offset + len) > mc->currentSize)
return joblist::CPNULLSTRMARK;
if (mem[off-1].get() == NULL)
return joblist::CPNULLSTRMARK;
return *mem[off-1].get();
return std::string((char *) &(mc->data[offset]), len);
}
inline const uint8_t * StringStore::getPointer(uint32_t off) const
@ -1545,15 +1568,26 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const
if (off == std::numeric_limits<uint32_t>::max())
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
uint32_t chunk = off / CHUNK_SIZE;
uint32_t offset = off % CHUNK_SIZE;
MemChunk *mc;
if (off & 0x80000000)
{
off = off - 0x80000000;
if (longStrings.size() <= off)
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
mc = (MemChunk*) longStrings[off].get();
return mc->data;
}
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
// what gets returned, it just can't go out of bounds.
if (UNLIKELY(mem.size() < off))
if (UNLIKELY(mem.size() <= chunk))
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
mc = (MemChunk *) mem[chunk].get();
if (offset > mc->currentSize)
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
if (off == 0 || (mem[off-1].get() == NULL))
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
return (uint8_t*)mem[off-1].get()->c_str();
return &(mc->data[offset]);
}
inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
@ -1564,15 +1598,22 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
if (len < 8)
return false;
if ((mem.size() < off) || off == 0)
// Long strings won't be NULL
if (off & 0x80000000)
return false;
uint32_t chunk = off / CHUNK_SIZE;
uint32_t offset = off % CHUNK_SIZE;
MemChunk *mc;
if (mem.size() <= chunk)
return true;
if (mem[off-1].get() == NULL)
mc = (MemChunk *) mem[chunk].get();
if ((offset + len) > mc->currentSize)
return true;
if (mem[off-1].get()->empty()) // Empty string is NULL
if (mc->data[offset] == 0) // "" = NULL string for some reason...
return true;
return (mem[off-1].get()->compare(joblist::CPNULLSTRMARK) == 0);
return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str()));
}
inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const
@ -1580,13 +1621,30 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l
if (off == std::numeric_limits<uint32_t>::max() || len == 0)
return str == joblist::CPNULLSTRMARK;
if ((mem.size() < off) || off == 0)
MemChunk *mc;
if (off & 0x80000000)
{
if (longStrings.size() <= (off - 0x80000000))
return false;
mc = (MemChunk *) longStrings[off - 0x80000000].get();
// Not sure if this check it needed, but adds safety
if (len > mc->currentSize)
return false;
return (strncmp(str.c_str(), (const char*) mc->data, len) == 0);
}
uint32_t chunk = off / CHUNK_SIZE;
uint32_t offset = off % CHUNK_SIZE;
if (mem.size() <= chunk)
return false;
if (mem[off-1].get() == NULL)
mc = (MemChunk *) mem[chunk].get();
if ((offset + len) > mc->currentSize)
return false;
return (mem[off-1].get()->compare(str) == 0);
return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0);
}
inline bool StringStore::isEmpty() const
@ -1598,10 +1656,16 @@ inline uint64_t StringStore::getSize() const
{
uint32_t i;
uint64_t ret = 0;
MemChunk *mc;
for (i = 0; i < mem.size(); i++) {
ret+= mem[i].get()->length();
mc = (MemChunk *) mem[i].get();
ret += mc->capacity;
}
for (i = 0; i < longStrings.size(); i++) {
mc = (MemChunk *) longStrings[i].get();
ret += mc->capacity;
}
return ret;
}