You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
Merge pull request #241 from mariadb-corporation/MCOL-874
MCOL-874 StringStore Mk.3
This commit is contained in:
@ -69,10 +69,10 @@ StringStore::~StringStore()
|
||||
uint32_t i;
|
||||
uint64_t inUse = 0, allocated = 0;
|
||||
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
std::string *tmp = mem.back().get();
|
||||
inUse += tmp->length();
|
||||
allocated += tmp->length();
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
MemChunk *tmp = (MemChunk *) mem.back().get();
|
||||
inUse += tmp->currentSize;
|
||||
allocated += tmp->capacity;
|
||||
}
|
||||
if (allocated > 0)
|
||||
cout << "~SS: " << inUse << "/" << allocated << " = " << (float) inUse/(float) allocated << endl;
|
||||
@ -81,6 +81,7 @@ StringStore::~StringStore()
|
||||
|
||||
uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
{
|
||||
MemChunk *lastMC = NULL;
|
||||
uint32_t ret = 0;
|
||||
|
||||
empty = false; // At least a NULL is being stored.
|
||||
@ -98,10 +99,47 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
if (fUseStoreStringMutex)
|
||||
lk.lock();
|
||||
|
||||
shared_ptr<std::string> newString(new std::string((char*)data, len));
|
||||
mem.push_back(newString);
|
||||
if (mem.size() > 0)
|
||||
lastMC = (MemChunk *) mem.back().get();
|
||||
|
||||
ret = mem.size();
|
||||
if (len >= CHUNK_SIZE)
|
||||
{
|
||||
shared_array<uint8_t> newOne(new uint8_t[len + sizeof(MemChunk)]);
|
||||
longStrings.push_back(newOne);
|
||||
lastMC = (MemChunk*) longStrings.back().get();
|
||||
lastMC->capacity = lastMC->currentSize = len;
|
||||
memcpy(lastMC->data, data, len);
|
||||
// High bit to mark a long string
|
||||
ret = 0x80000000;
|
||||
ret += longStrings.size() - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len))
|
||||
{
|
||||
// mem usage debugging
|
||||
//if (lastMC)
|
||||
//cout << "Memchunk efficiency = " << lastMC->currentSize << "/" << lastMC->capacity << endl;
|
||||
shared_array<uint8_t> newOne(new uint8_t[CHUNK_SIZE + sizeof(MemChunk)]);
|
||||
mem.push_back(newOne);
|
||||
lastMC = (MemChunk *) mem.back().get();
|
||||
lastMC->currentSize = 0;
|
||||
lastMC->capacity = CHUNK_SIZE;
|
||||
memset(lastMC->data, 0, CHUNK_SIZE);
|
||||
}
|
||||
|
||||
|
||||
ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize;
|
||||
memcpy(&(lastMC->data[lastMC->currentSize]), data, len);
|
||||
/*
|
||||
cout << "stored: '" << hex;
|
||||
for (uint32_t i = 0; i < len ; i++) {
|
||||
cout << (char) lastMC->data[lastMC->currentSize + i];
|
||||
}
|
||||
cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl;
|
||||
*/
|
||||
lastMC->currentSize += len;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -109,15 +147,22 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len)
|
||||
void StringStore::serialize(ByteStream &bs) const
|
||||
{
|
||||
uint32_t i;
|
||||
MemChunk *mc;
|
||||
|
||||
bs << (uint32_t) mem.size();
|
||||
bs << (uint8_t) empty;
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
if (mem[i].get() == NULL)
|
||||
bs << empty_str;
|
||||
else
|
||||
bs << *mem[i].get();
|
||||
mc = (MemChunk *) mem[i].get();
|
||||
bs << (uint32_t) mc->currentSize;
|
||||
//cout << "serialized " << mc->currentSize << " bytes\n";
|
||||
bs.append(mc->data, mc->currentSize);
|
||||
}
|
||||
bs << (uint32_t) longStrings.size();
|
||||
for (i = 0; i < longStrings.size(); i++)
|
||||
{
|
||||
mc = (MemChunk *) longStrings[i].get();
|
||||
bs << (uint32_t) mc->currentSize;
|
||||
bs.append(mc->data, mc->currentSize);
|
||||
}
|
||||
}
|
||||
|
||||
@ -125,30 +170,48 @@ void StringStore::deserialize(ByteStream &bs)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t count;
|
||||
std::string buf;
|
||||
uint32_t size;
|
||||
uint8_t *buf;
|
||||
MemChunk *mc;
|
||||
uint8_t tmp8;
|
||||
|
||||
//mem.clear();
|
||||
bs >> count;
|
||||
mem.reserve(count);
|
||||
mem.resize(count);
|
||||
bs >> tmp8;
|
||||
empty = (bool) tmp8;
|
||||
for (i = 0; i < count; i++) {
|
||||
bs >> size;
|
||||
//cout << "deserializing " << size << " bytes\n";
|
||||
bs >> buf;
|
||||
// We do this to avoid pre-C++11 zero copy hell but need to
|
||||
// preserve all data including NULs so using c_str() is out.
|
||||
shared_ptr<std::string> newString(new std::string());
|
||||
newString->append(buf);
|
||||
mem.push_back(newString);
|
||||
buf = bs.buf();
|
||||
mem[i].reset(new uint8_t[size + sizeof(MemChunk)]);
|
||||
mc = (MemChunk *) mem[i].get();
|
||||
mc->currentSize = size;
|
||||
mc->capacity = size;
|
||||
memcpy(mc->data, buf, size);
|
||||
bs.advance(size);
|
||||
}
|
||||
bs >> count;
|
||||
longStrings.resize(count);
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
bs >> size;
|
||||
buf = bs.buf();
|
||||
longStrings[i].reset(new uint8_t[size + sizeof(MemChunk)]);
|
||||
mc = (MemChunk *) longStrings[i].get();
|
||||
mc->capacity = mc->currentSize = size;
|
||||
memcpy(mc->data, buf, size);
|
||||
bs.advance(size);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void StringStore::clear()
|
||||
{
|
||||
vector<shared_ptr<std::string> > emptyv;
|
||||
vector<shared_array<uint8_t> > emptyv;
|
||||
vector<shared_array<uint8_t> > emptyv2;
|
||||
mem.swap(emptyv);
|
||||
longStrings.swap(emptyv2);
|
||||
empty = true;
|
||||
}
|
||||
|
||||
|
@ -118,9 +118,18 @@ private:
|
||||
|
||||
// This is an overlay b/c the underlying data needs to be any size,
|
||||
// and alloc'd in one chunk. data can't be a separate dynamic chunk.
|
||||
|
||||
std::vector<boost::shared_ptr<std::string> > mem;
|
||||
bool empty;
|
||||
struct MemChunk
|
||||
{
|
||||
uint32_t currentSize;
|
||||
uint32_t capacity;
|
||||
uint8_t data[];
|
||||
};
|
||||
|
||||
std::vector<boost::shared_array<uint8_t> > mem;
|
||||
|
||||
// To store strings > 64KB (BLOB/TEXT)
|
||||
std::vector<boost::shared_array<uint8_t> > longStrings;
|
||||
bool empty;
|
||||
bool fUseStoreStringMutex; //@bug6065, make StringStore::storeString() thread safe
|
||||
boost::mutex fMutex;
|
||||
};
|
||||
@ -1531,13 +1540,27 @@ inline std::string StringStore::getString(uint32_t off, uint32_t len) const
|
||||
if (off == std::numeric_limits<uint32_t>::max())
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
if ((mem.size() < off) || off == 0)
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
{
|
||||
off = off - 0x80000000;
|
||||
if (longStrings.size() <= off)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
mc = (MemChunk*) longStrings[off].get();
|
||||
return std::string((char *) mc->data, len);
|
||||
}
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||
// what gets returned, it just can't go out of bounds.
|
||||
if (mem.size() <= chunk)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
if (mem[off-1].get() == NULL)
|
||||
return joblist::CPNULLSTRMARK;
|
||||
|
||||
return *mem[off-1].get();
|
||||
return std::string((char *) &(mc->data[offset]), len);
|
||||
}
|
||||
|
||||
inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
||||
@ -1545,15 +1568,26 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const
|
||||
if (off == std::numeric_limits<uint32_t>::max())
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
{
|
||||
off = off - 0x80000000;
|
||||
if (longStrings.size() <= off)
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
mc = (MemChunk*) longStrings[off].get();
|
||||
return mc->data;
|
||||
}
|
||||
// this has to handle uninitialized data as well. If it's uninitialized it doesn't matter
|
||||
// what gets returned, it just can't go out of bounds.
|
||||
if (UNLIKELY(mem.size() < off))
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
if (off == 0 || (mem[off-1].get() == NULL))
|
||||
if (UNLIKELY(mem.size() <= chunk))
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if (offset > mc->currentSize)
|
||||
return (const uint8_t *) joblist::CPNULLSTRMARK.c_str();
|
||||
|
||||
return (uint8_t*)mem[off-1].get()->c_str();
|
||||
return &(mc->data[offset]);
|
||||
}
|
||||
|
||||
inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
||||
@ -1564,15 +1598,22 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const
|
||||
if (len < 8)
|
||||
return false;
|
||||
|
||||
if ((mem.size() < off) || off == 0)
|
||||
// Long strings won't be NULL
|
||||
if (off & 0x80000000)
|
||||
return false;
|
||||
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
MemChunk *mc;
|
||||
if (mem.size() <= chunk)
|
||||
return true;
|
||||
|
||||
if (mem[off-1].get() == NULL)
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
return true;
|
||||
|
||||
if (mem[off-1].get()->empty()) // Empty string is NULL
|
||||
if (mc->data[offset] == 0) // "" = NULL string for some reason...
|
||||
return true;
|
||||
return (mem[off-1].get()->compare(joblist::CPNULLSTRMARK) == 0);
|
||||
return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str()));
|
||||
}
|
||||
|
||||
inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const
|
||||
@ -1580,13 +1621,30 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l
|
||||
if (off == std::numeric_limits<uint32_t>::max() || len == 0)
|
||||
return str == joblist::CPNULLSTRMARK;
|
||||
|
||||
if ((mem.size() < off) || off == 0)
|
||||
MemChunk *mc;
|
||||
if (off & 0x80000000)
|
||||
{
|
||||
if (longStrings.size() <= (off - 0x80000000))
|
||||
return false;
|
||||
|
||||
mc = (MemChunk *) longStrings[off - 0x80000000].get();
|
||||
|
||||
// Not sure if this check it needed, but adds safety
|
||||
if (len > mc->currentSize)
|
||||
return false;
|
||||
|
||||
return (strncmp(str.c_str(), (const char*) mc->data, len) == 0);
|
||||
}
|
||||
uint32_t chunk = off / CHUNK_SIZE;
|
||||
uint32_t offset = off % CHUNK_SIZE;
|
||||
if (mem.size() <= chunk)
|
||||
return false;
|
||||
|
||||
if (mem[off-1].get() == NULL)
|
||||
mc = (MemChunk *) mem[chunk].get();
|
||||
if ((offset + len) > mc->currentSize)
|
||||
return false;
|
||||
|
||||
return (mem[off-1].get()->compare(str) == 0);
|
||||
return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0);
|
||||
}
|
||||
|
||||
inline bool StringStore::isEmpty() const
|
||||
@ -1598,10 +1656,16 @@ inline uint64_t StringStore::getSize() const
|
||||
{
|
||||
uint32_t i;
|
||||
uint64_t ret = 0;
|
||||
MemChunk *mc;
|
||||
|
||||
for (i = 0; i < mem.size(); i++) {
|
||||
ret+= mem[i].get()->length();
|
||||
mc = (MemChunk *) mem[i].get();
|
||||
ret += mc->capacity;
|
||||
}
|
||||
for (i = 0; i < longStrings.size(); i++) {
|
||||
mc = (MemChunk *) longStrings[i].get();
|
||||
ret += mc->capacity;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user