You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
MCOL-874 StringStore Mk.3
StringStore as a vector of std::string had performance regressions and a rare crash. This new version of StringStore restores the original StringStore with the 64KB limitation and adds another vector to store strings that won't fit into the small string storage.
This commit is contained in:
@ -69,10 +69,10 @@ StringStore::~StringStore()
|
|||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint64_t inUse = 0, allocated = 0;
|
uint64_t inUse = 0, allocated = 0;
|
||||||
|
|
||||||
for (i = 0; i < mem.size(); i++) {
|
for (i = 0; i < mem.size(); i++) {
|
||||||
std::string *tmp = mem.back().get();
|
MemChunk *tmp = (MemChunk *) mem.back().get();
|
||||||
inUse += tmp->length();
|
inUse += tmp->currentSize;
|
||||||
allocated += tmp->length();
|
allocated += tmp->capacity;
|
||||||
}
|
}
|
||||||
if (allocated > 0)
|
if (allocated > 0)
|
||||||
cout << "~SS: " << inUse << "/" << allocated << " = " << (float) inUse/(float) allocated << endl;
|
cout << "~SS: " << inUse << "/" << allocated << " = " << (float) inUse/(float) allocated << endl;
|
||||||
@ -81,6 +81,7 @@ StringStore::~StringStore()
|
|||||||
|
|
||||||
uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||||
{
|
{
|
||||||
|
MemChunk *lastMC = NULL;
|
||||||
uint32_t ret = 0;
|
uint32_t ret = 0;
|
||||||
|
|
||||||
empty = false; // At least a NULL is being stored.
|
empty = false; // At least a NULL is being stored.
|
||||||
@ -98,10 +99,47 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
|||||||
if (fUseStoreStringMutex)
|
if (fUseStoreStringMutex)
|
||||||
lk.lock();
|
lk.lock();
|
||||||
|
|
||||||
shared_ptr<std::string> newString(new std::string((char*)data, len));
|
if (mem.size() > 0)
|
||||||
mem.push_back(newString);
|
lastMC = (MemChunk *) mem.back().get();
|
||||||
|
|
||||||
ret = mem.size();
|
if (len >= CHUNK_SIZE)
|
||||||
|
{
|
||||||
|
shared_array<uint8_t> newOne(new uint8_t[len + sizeof(MemChunk)]);
|
||||||
|
longStrings.push_back(newOne);
|
||||||
|
lastMC = (MemChunk*) longStrings.back().get();
|
||||||
|
lastMC->capacity = lastMC->currentSize = len;
|
||||||
|
memcpy(lastMC->data, data, len);
|
||||||
|
// High bit to mark a long string
|
||||||
|
ret = 0x80000000;
|
||||||
|
ret += longStrings.size() - 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len))
|
||||||
|
{
|
||||||
|
// mem usage debugging
|
||||||
|
//if (lastMC)
|
||||||
|
//cout << "Memchunk efficiency = " << lastMC->currentSize << "/" << lastMC->capacity << endl;
|
||||||
|
shared_array<uint8_t> newOne(new uint8_t[CHUNK_SIZE + sizeof(MemChunk)]);
|
||||||
|
mem.push_back(newOne);
|
||||||
|
lastMC = (MemChunk *) mem.back().get();
|
||||||
|
lastMC->currentSize = 0;
|
||||||
|
lastMC->capacity = CHUNK_SIZE;
|
||||||
|
memset(lastMC->data, 0, CHUNK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize;
|
||||||
|
memcpy(&(lastMC->data[lastMC->currentSize]), data, len);
|
||||||
|
/*
|
||||||
|
cout << "stored: '" << hex;
|
||||||
|
for (uint32_t i = 0; i < len ; i++) {
|
||||||
|
cout << (char) lastMC->data[lastMC->currentSize + i];
|
||||||
|
}
|
||||||
|
cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl;
|
||||||
|
*/
|
||||||
|
lastMC->currentSize += len;
|
||||||
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -109,15 +147,22 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
|||||||
void StringStore::serialize(ByteStream &bs) const
|
void StringStore::serialize(ByteStream &bs) const
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
|
MemChunk *mc;
|
||||||
|
|
||||||
bs << (uint32_t) mem.size();
|
bs << (uint32_t) mem.size();
|
||||||
bs << (uint8_t) empty;
|
bs << (uint8_t) empty;
|
||||||
for (i = 0; i < mem.size(); i++) {
|
for (i = 0; i < mem.size(); i++) {
|
||||||
if (mem[i].get() == NULL)
|
mc = (MemChunk *) mem[i].get();
|
||||||
bs << empty_str;
|
bs << (uint32_t) mc->currentSize;
|
||||||
else
|
|
||||||
bs << *mem[i].get();
|
|
||||||
//cout << "serialized " << mc->currentSize << " bytes\n";
|
//cout << "serialized " << mc->currentSize << " bytes\n";
|
||||||
|
bs.append(mc->data, mc->currentSize);
|
||||||
|
}
|
||||||
|
bs << (uint32_t) longStrings.size();
|
||||||
|
for (i = 0; i < longStrings.size(); i++)
|
||||||
|
{
|
||||||
|
mc = (MemChunk *) longStrings[i].get();
|
||||||
|
bs << (uint32_t) mc->currentSize;
|
||||||
|
bs.append(mc->data, mc->currentSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -125,30 +170,48 @@ void StringStore::deserialize(ByteStream &bs)
|
|||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t count;
|
uint32_t count;
|
||||||
std::string buf;
|
uint32_t size;
|
||||||
|
uint8_t *buf;
|
||||||
|
MemChunk *mc;
|
||||||
uint8_t tmp8;
|
uint8_t tmp8;
|
||||||
|
|
||||||
//mem.clear();
|
//mem.clear();
|
||||||
bs >> count;
|
bs >> count;
|
||||||
mem.reserve(count);
|
mem.resize(count);
|
||||||
bs >> tmp8;
|
bs >> tmp8;
|
||||||
empty = (bool) tmp8;
|
empty = (bool) tmp8;
|
||||||
for (i = 0; i < count; i++) {
|
for (i = 0; i < count; i++) {
|
||||||
|
bs >> size;
|
||||||
//cout << "deserializing " << size << " bytes\n";
|
//cout << "deserializing " << size << " bytes\n";
|
||||||
bs >> buf;
|
buf = bs.buf();
|
||||||
// We do this to avoid pre-C++11 zero copy hell but need to
|
mem[i].reset(new uint8_t[size + sizeof(MemChunk)]);
|
||||||
// preserve all data including NULs so using c_str() is out.
|
mc = (MemChunk *) mem[i].get();
|
||||||
shared_ptr<std::string> newString(new std::string());
|
mc->currentSize = size;
|
||||||
newString->append(buf);
|
mc->capacity = size;
|
||||||
mem.push_back(newString);
|
memcpy(mc->data, buf, size);
|
||||||
|
bs.advance(size);
|
||||||
}
|
}
|
||||||
|
bs >> count;
|
||||||
|
longStrings.resize(count);
|
||||||
|
for (i = 0; i < count; i++)
|
||||||
|
{
|
||||||
|
bs >> size;
|
||||||
|
buf = bs.buf();
|
||||||
|
longStrings[i].reset(new uint8_t[size + sizeof(MemChunk)]);
|
||||||
|
mc = (MemChunk *) longStrings[i].get();
|
||||||
|
mc->capacity = mc->currentSize = size;
|
||||||
|
memcpy(mc->data, buf, size);
|
||||||
|
bs.advance(size);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
void StringStore::clear()
|
void StringStore::clear()
|
||||||
{
|
{
|
||||||
vector<shared_ptr<std::string> > emptyv;
|
vector<shared_array<uint8_t> > emptyv;
|
||||||
|
vector<shared_array<uint8_t> > emptyv2;
|
||||||
mem.swap(emptyv);
|
mem.swap(emptyv);
|
||||||
|
longStrings.swap(emptyv2);
|
||||||
empty = true;
|
empty = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -118,9 +118,18 @@ private:
|
|||||||
|
|
||||||
// This is an overlay b/c the underlying data needs to be any size,
|
// This is an overlay b/c the underlying data needs to be any size,
|
||||||
// and alloc'd in one chunk. data can't be a separate dynamic chunk.
|
// and alloc'd in one chunk. data can't be a separate dynamic chunk.
|
||||||
|
struct MemChunk
|
||||||
std::vector<boost::shared_ptr<std::string> > mem;
|
{
|
||||||
bool empty;
|
uint32_t currentSize;
|
||||||
|
uint32_t capacity;
|
||||||
|
uint8_t data[];
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<boost::shared_array<uint8_t> > mem;
|
||||||
|
|
||||||
|
// To store strings > 64KB (BLOB/TEXT)
|
||||||
|
std::vector<boost::shared_array<uint8_t> > longStrings;
|
||||||
|
bool empty;
|
||||||
bool fUseStoreStringMutex; //@bug6065, make StringStore::storeString() thread safe
|
bool fUseStoreStringMutex; //@bug6065, make StringStore::storeString() thread safe
|
||||||
boost::mutex fMutex;
|
boost::mutex fMutex;
|
||||||
};
|
};
|
||||||
@ -1531,13 +1540,27 @@ inline std::string StringStore::getString(uint32_t off, uint32_t len) const
|
|||||||
if (off == std::numeric_limits<uint32_t>::max())
|
if (off == std::numeric_limits<uint32_t>::max())
|
||||||
return joblist::CPNULLSTRMARK;
|
return joblist::CPNULLSTRMARK;
|
||||||
|
|
||||||
if ((mem.size() < off) || off == 0)
|
MemChunk *mc;
|
||||||
|
if (off & 0x80000000)
|
||||||
|
{
|
||||||
|
off = off - 0x80000000;
|
||||||
|
if (longStrings.size() <= off)
|
||||||
|
return joblist::CPNULLSTRMARK;
|
||||||
|
mc = (MemChunk*) longStrings[off].get();
|
||||||
|
return std::string((char *) mc->data, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t chunk = off / CHUNK_SIZE;
|
||||||
|
uint32_t offset = off % CHUNK_SIZE;
|
||||||
|
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||||
|
// what gets returned, it just can't go out of bounds.
|
||||||
|
if (mem.size() <= chunk)
|
||||||
|
return joblist::CPNULLSTRMARK;
|
||||||
|
mc = (MemChunk *) mem[chunk].get();
|
||||||
|
if ((offset + len) > mc->currentSize)
|
||||||
return joblist::CPNULLSTRMARK;
|
return joblist::CPNULLSTRMARK;
|
||||||
|
|
||||||
if (mem[off-1].get() == NULL)
|
return std::string((char *) &(mc->data[offset]), len);
|
||||||
return joblist::CPNULLSTRMARK;
|
|
||||||
|
|
||||||
return *mem[off-1].get();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
||||||
@ -1545,15 +1568,26 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
|||||||
if (off == std::numeric_limits<uint32_t>::max())
|
if (off == std::numeric_limits<uint32_t>::max())
|
||||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||||
|
|
||||||
|
uint32_t chunk = off / CHUNK_SIZE;
|
||||||
|
uint32_t offset = off % CHUNK_SIZE;
|
||||||
|
MemChunk *mc;
|
||||||
|
if (off & 0x80000000)
|
||||||
|
{
|
||||||
|
off = off - 0x80000000;
|
||||||
|
if (longStrings.size() <= off)
|
||||||
|
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||||
|
mc = (MemChunk*) longStrings[off].get();
|
||||||
|
return mc->data;
|
||||||
|
}
|
||||||
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||||
// what gets returned, it just can't go out of bounds.
|
// what gets returned, it just can't go out of bounds.
|
||||||
if (UNLIKELY(mem.size() < off))
|
if (UNLIKELY(mem.size() <= chunk))
|
||||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
|
||||||
|
|
||||||
if (off == 0 || (mem[off-1].get() == NULL))
|
|
||||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||||
|
mc = (MemChunk *) mem[chunk].get();
|
||||||
|
if (offset > mc->currentSize)
|
||||||
|
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||||
|
|
||||||
return (uint8_t*)mem[off-1].get()->c_str();
|
return &(mc->data[offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
||||||
@ -1564,15 +1598,22 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
|||||||
if (len < 8)
|
if (len < 8)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if ((mem.size() < off) || off == 0)
|
// Long strings won't be NULL
|
||||||
|
if (off & 0x80000000)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
uint32_t chunk = off / CHUNK_SIZE;
|
||||||
|
uint32_t offset = off % CHUNK_SIZE;
|
||||||
|
MemChunk *mc;
|
||||||
|
if (mem.size() <= chunk)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (mem[off-1].get() == NULL)
|
mc = (MemChunk *) mem[chunk].get();
|
||||||
|
if ((offset + len) > mc->currentSize)
|
||||||
return true;
|
return true;
|
||||||
|
if (mc->data[offset] == 0) // "" = NULL string for some reason...
|
||||||
if (mem[off-1].get()->empty()) // Empty string is NULL
|
|
||||||
return true;
|
return true;
|
||||||
return (mem[off-1].get()->compare(joblist::CPNULLSTRMARK) == 0);
|
return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const
|
inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const
|
||||||
@ -1580,13 +1621,30 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l
|
|||||||
if (off == std::numeric_limits<uint32_t>::max() || len == 0)
|
if (off == std::numeric_limits<uint32_t>::max() || len == 0)
|
||||||
return str == joblist::CPNULLSTRMARK;
|
return str == joblist::CPNULLSTRMARK;
|
||||||
|
|
||||||
if ((mem.size() < off) || off == 0)
|
MemChunk *mc;
|
||||||
|
if (off & 0x80000000)
|
||||||
|
{
|
||||||
|
if (longStrings.size() <= (off - 0x80000000))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
mc = (MemChunk *) longStrings[off - 0x80000000].get();
|
||||||
|
|
||||||
|
// Not sure if this check is needed, but adds safety
|
||||||
|
if (len > mc->currentSize)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return (strncmp(str.c_str(), (const char*) mc->data, len) == 0);
|
||||||
|
}
|
||||||
|
uint32_t chunk = off / CHUNK_SIZE;
|
||||||
|
uint32_t offset = off % CHUNK_SIZE;
|
||||||
|
if (mem.size() <= chunk)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (mem[off-1].get() == NULL)
|
mc = (MemChunk *) mem[chunk].get();
|
||||||
|
if ((offset + len) > mc->currentSize)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
return (mem[off-1].get()->compare(str) == 0);
|
return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool StringStore::isEmpty() const
|
inline bool StringStore::isEmpty() const
|
||||||
@ -1598,10 +1656,16 @@ inline uint64_t StringStore::getSize() const
|
|||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint64_t ret = 0;
|
uint64_t ret = 0;
|
||||||
|
MemChunk *mc;
|
||||||
|
|
||||||
for (i = 0; i < mem.size(); i++) {
|
for (i = 0; i < mem.size(); i++) {
|
||||||
ret+= mem[i].get()->length();
|
mc = (MemChunk *) mem[i].get();
|
||||||
|
ret += mc->capacity;
|
||||||
}
|
}
|
||||||
|
for (i = 0; i < longStrings.size(); i++) {
|
||||||
|
mc = (MemChunk *) longStrings[i].get();
|
||||||
|
ret += mc->capacity;
|
||||||
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user