You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
MCOL-874 StringStore Mk.3
StringStore as a vector of std::string had a performance regressions and a rare crash. This new version of StringStore restores the original StringStore with the 64KB limitation and adds another vector to store strings that won't fit into the small string storage.
This commit is contained in:
@ -70,9 +70,9 @@ StringStore::~StringStore()
|
||||
uint64_t inUse = 0, allocated = 0;
|
||||
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
std::string *tmp = mem.back().get();
|
||||
inUse += tmp->length();
|
||||
allocated += tmp->length();
|
||||
MemChunk *tmp = (MemChunk *) mem.back().get();
|
||||
inUse += tmp->currentSize;
|
||||
allocated += tmp->capacity;
|
||||
}
|
||||
if (allocated > 0)
|
||||
cout << "~SS: " << inUse << "/" << allocated << " = " << (float) inUse/(float) allocated << endl;
|
||||
@ -81,6 +81,7 @@ StringStore::~StringStore()
|
||||
|
||||
uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
{
|
||||
MemChunk *lastMC = NULL;
|
||||
uint32_t ret = 0;
|
||||
|
||||
empty = false; // At least a NULL is being stored.
|
||||
@ -98,10 +99,47 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
if (fUseStoreStringMutex)
|
||||
lk.lock();
|
||||
|
||||
shared_ptr<std::string> newString(new std::string((char*)data, len));
|
||||
mem.push_back(newString);
|
||||
if (mem.size() > 0)
|
||||
lastMC = (MemChunk *) mem.back().get();
|
||||
|
||||
ret = mem.size();
|
||||
if (len >= CHUNK_SIZE)
|
||||
{
|
||||
shared_array<uint8_t> newOne(new uint8_t[len + sizeof(MemChunk)]);
|
||||
longStrings.push_back(newOne);
|
||||
lastMC = (MemChunk*) longStrings.back().get();
|
||||
lastMC->capacity = lastMC->currentSize = len;
|
||||
memcpy(lastMC->data, data, len);
|
||||
// High bit to mark a long string
|
||||
ret = 0x80000000;
|
||||
ret += longStrings.size() - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len))
|
||||
{
|
||||
// mem usage debugging
|
||||
//if (lastMC)
|
||||
//cout << "Memchunk efficiency = " << lastMC->currentSize << "/" << lastMC->capacity << endl;
|
||||
shared_array<uint8_t> newOne(new uint8_t[CHUNK_SIZE + sizeof(MemChunk)]);
|
||||
mem.push_back(newOne);
|
||||
lastMC = (MemChunk *) mem.back().get();
|
||||
lastMC->currentSize = 0;
|
||||
lastMC->capacity = CHUNK_SIZE;
|
||||
memset(lastMC->data, 0, CHUNK_SIZE);
|
||||
}
|
||||
|
||||
|
||||
ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize;
|
||||
memcpy(&(lastMC->data[lastMC->currentSize]), data, len);
|
||||
/*
|
||||
cout << "stored: '" << hex;
|
||||
for (uint32_t i = 0; i < len ; i++) {
|
||||
cout << (char) lastMC->data[lastMC->currentSize + i];
|
||||
}
|
||||
cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl;
|
||||
*/
|
||||
lastMC->currentSize += len;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -109,15 +147,22 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
void StringStore::serialize(ByteStream &bs) const
|
||||
{
|
||||
uint32_t i;
|
||||
MemChunk *mc;
|
||||
|
||||
bs << (uint32_t) mem.size();
|
||||
bs << (uint8_t) empty;
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
if (mem[i].get() == NULL)
|
||||
bs << empty_str;
|
||||
else
|
||||
bs << *mem[i].get();
|
||||
mc = (MemChunk *) mem[i].get();
|
||||
bs << (uint32_t) mc->currentSize;
|
||||
//cout << "serialized " << mc->currentSize << " bytes\n";
|
||||
bs.append(mc->data, mc->currentSize);
|
||||
}
|
||||
bs << (uint32_t) longStrings.size();
|
||||
for (i = 0; i < longStrings.size(); i++)
|
||||
{
|
||||
mc = (MemChunk *) longStrings[i].get();
|
||||
bs << (uint32_t) mc->currentSize;
|
||||
bs.append(mc->data, mc->currentSize);
|
||||
}
|
||||
}
|
||||
|
||||
@ -125,30 +170,48 @@ void StringStore::deserialize(ByteStream &bs)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t count;
|
||||
std::string buf;
|
||||
uint32_t size;
|
||||
uint8_t *buf;
|
||||
MemChunk *mc;
|
||||
uint8_t tmp8;
|
||||
|
||||
//mem.clear();
|
||||
bs >> count;
|
||||
mem.reserve(count);
|
||||
mem.resize(count);
|
||||
bs >> tmp8;
|
||||
empty = (bool) tmp8;
|
||||
for (i = 0; i < count; i++) {
|
||||
bs >> size;
|
||||
//cout << "deserializing " << size << " bytes\n";
|
||||
bs >> buf;
|
||||
// We do this to avoid pre-C++11 zero copy hell but need to
|
||||
// preserve all data including NULs so using c_str() is out.
|
||||
shared_ptr<std::string> newString(new std::string());
|
||||
newString->append(buf);
|
||||
mem.push_back(newString);
|
||||
buf = bs.buf();
|
||||
mem[i].reset(new uint8_t[size + sizeof(MemChunk)]);
|
||||
mc = (MemChunk *) mem[i].get();
|
||||
mc->currentSize = size;
|
||||
mc->capacity = size;
|
||||
memcpy(mc->data, buf, size);
|
||||
bs.advance(size);
|
||||
}
|
||||
bs >> count;
|
||||
longStrings.resize(count);
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
bs >> size;
|
||||
buf = bs.buf();
|
||||
longStrings[i].reset(new uint8_t[size + sizeof(MemChunk)]);
|
||||
mc = (MemChunk *) longStrings[i].get();
|
||||
mc->capacity = mc->currentSize = size;
|
||||
memcpy(mc->data, buf, size);
|
||||
bs.advance(size);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void StringStore::clear()
|
||||
{
|
||||
vector<shared_ptr<std::string> > emptyv;
|
||||
vector<shared_array<uint8_t> > emptyv;
|
||||
vector<shared_array<uint8_t> > emptyv2;
|
||||
mem.swap(emptyv);
|
||||
longStrings.swap(emptyv2);
|
||||
empty = true;
|
||||
}
|
||||
|
||||
|
@ -118,8 +118,17 @@ private:
|
||||
|
||||
// This is an overlay b/c the underlying data needs to be any size,
|
||||
// and alloc'd in one chunk. data can't be a separate dynamic chunk.
|
||||
struct MemChunk
|
||||
{
|
||||
uint32_t currentSize;
|
||||
uint32_t capacity;
|
||||
uint8_t data[];
|
||||
};
|
||||
|
||||
std::vector<boost::shared_ptr<std::string> > mem;
|
||||
std::vector<boost::shared_array<uint8_t> > mem;
|
||||
|
||||
// To store strings > 64KB (BLOB/TEXT)
|
||||
std::vector<boost::shared_array<uint8_t> > longStrings;
|
||||
bool empty;
|
||||
bool fUseStoreStringMutex; //@bug6065, make StringStore::storeString() thread safe
|
||||
boost::mutex fMutex;
|
||||
@ -1531,13 +1540,27 @@ inline std::string StringStore::getString(uint32_t off, uint32_t len) const
|
||||
if (off == std::numeric_limits<uint32_t>::max())
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
if ((mem.size() < off) || off == 0)
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
{
|
||||
off = off - 0x80000000;
|
||||
if (longStrings.size() <= off)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
mc = (MemChunk*) longStrings[off].get();
|
||||
return std::string((char *) mc->data, len);
|
||||
}
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||
// what gets returned, it just can't go out of bounds.
|
||||
if (mem.size() <= chunk)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
if (mem[off-1].get() == NULL)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
return *mem[off-1].get();
|
||||
return std::string((char *) &(mc->data[offset]), len);
|
||||
}
|
||||
|
||||
inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
||||
@ -1545,15 +1568,26 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
||||
if (off == std::numeric_limits<uint32_t>::max())
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
{
|
||||
off = off - 0x80000000;
|
||||
if (longStrings.size() <= off)
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
mc = (MemChunk*) longStrings[off].get();
|
||||
return mc->data;
|
||||
}
|
||||
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||
// what gets returned, it just can't go out of bounds.
|
||||
if (UNLIKELY(mem.size() < off))
|
||||
if (UNLIKELY(mem.size() <= chunk))
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if (offset > mc->currentSize)
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
if (off == 0 || (mem[off-1].get() == NULL))
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
return (uint8_t*)mem[off-1].get()->c_str();
|
||||
return &(mc->data[offset]);
|
||||
}
|
||||
|
||||
inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
||||
@ -1564,15 +1598,22 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
||||
if (len < 8)
|
||||
return false;
|
||||
|
||||
if ((mem.size() < off) || off == 0)
|
||||
// Long strings won't be NULL
|
||||
if (off & 0x80000000)
|
||||
return false;
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
MemChunk *mc;
|
||||
if (mem.size() <= chunk)
|
||||
return true;
|
||||
|
||||
if (mem[off-1].get() == NULL)
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
return true;
|
||||
|
||||
if (mem[off-1].get()->empty()) // Empty string is NULL
|
||||
if (mc->data[offset] == 0) // "" = NULL string for some reason...
|
||||
return true;
|
||||
return (mem[off-1].get()->compare(joblist::CPNULLSTRMARK) == 0);
|
||||
return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str()));
|
||||
}
|
||||
|
||||
inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const
|
||||
@ -1580,13 +1621,30 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l
|
||||
if (off == std::numeric_limits<uint32_t>::max() || len == 0)
|
||||
return str == joblist::CPNULLSTRMARK;
|
||||
|
||||
if ((mem.size() < off) || off == 0)
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
{
|
||||
if (longStrings.size() <= (off - 0x80000000))
|
||||
return false;
|
||||
|
||||
if (mem[off-1].get() == NULL)
|
||||
mc = (MemChunk *) longStrings[off - 0x80000000].get();
|
||||
|
||||
// Not sure if this check it needed, but adds safety
|
||||
if (len > mc->currentSize)
|
||||
return false;
|
||||
|
||||
return (mem[off-1].get()->compare(str) == 0);
|
||||
return (strncmp(str.c_str(), (const char*) mc->data, len) == 0);
|
||||
}
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
if (mem.size() <= chunk)
|
||||
return false;
|
||||
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
return false;
|
||||
|
||||
return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0);
|
||||
}
|
||||
|
||||
inline bool StringStore::isEmpty() const
|
||||
@ -1598,9 +1656,15 @@ inline uint64_t StringStore::getSize() const
|
||||
{
|
||||
uint32_t i;
|
||||
uint64_t ret = 0;
|
||||
MemChunk *mc;
|
||||
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
ret+= mem[i].get()->length();
|
||||
mc = (MemChunk *) mem[i].get();
|
||||
ret += mc->capacity;
|
||||
}
|
||||
for (i = 0; i < longStrings.size(); i++) {
|
||||
mc = (MemChunk *) longStrings[i].get();
|
||||
ret += mc->capacity;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
Reference in New Issue
Block a user