From b1d04c04fb176a6731d70a8562cdd3b11d6be5a2 Mon Sep 17 00:00:00 2001 From: Andrew Hutchings Date: Tue, 21 Mar 2017 17:22:31 +0000 Subject: [PATCH] MCOL-267 Fix LONGBLOB issues * Set max column length to a little under 2.1GB in DDL * Fix token edge case * Re-write RowGroup string handling to take more than 64KB in one string --- dbcon/ddlpackage/ddl.y | 4 +- primitives/primproc/dictstep.cpp | 12 ++--- utils/rowgroup/rowgroup.cpp | 67 +++++++++------------------- utils/rowgroup/rowgroup.h | 76 ++++++++++++-------------------- 4 files changed, 59 insertions(+), 100 deletions(-) diff --git a/dbcon/ddlpackage/ddl.y b/dbcon/ddlpackage/ddl.y index cc2481b21..ba3bf9a7c 100644 --- a/dbcon/ddlpackage/ddl.y +++ b/dbcon/ddlpackage/ddl.y @@ -895,7 +895,7 @@ blob_type: | LONGBLOB { $$ = new ColumnType(DDL_BLOB); - $$->fLength = 4294967295; + $$->fLength = 2100000000; } ; @@ -923,7 +923,7 @@ text_type: | LONGTEXT { $$ = new ColumnType(DDL_BLOB); - $$->fLength = 4294967295; + $$->fLength = 2100000000; } ; diff --git a/primitives/primproc/dictstep.cpp b/primitives/primproc/dictstep.cpp index 295cfa1af..b0fcbc007 100644 --- a/primitives/primproc/dictstep.cpp +++ b/primitives/primproc/dictstep.cpp @@ -435,7 +435,7 @@ void DictStep::_projectToRG(RowGroup &rg, uint32_t col) int64_t l_lbid=0; int64_t o_lbid=0; OldGetSigParams *pt; - StringPtr tmpStrings[LOGICAL_BLOCK_RIDS]; + StringPtr *tmpStrings = new StringPtr[LOGICAL_BLOCK_RIDS]; rowgroup::Row r; boost::scoped_array newRidList; @@ -524,12 +524,12 @@ void DictStep::_projectToRG(RowGroup &rg, uint32_t col) // If this is a multi-block blob, get all the blocks // We do string copy here, should maybe have a RowGroup // function to append strings or something? - if ((newRidList[i].token != 0xffffffffffffffffLL) && + if (((newRidList[i].token >> 46) < 0x3FFFF) && ((newRidList[i].token >> 46) > 0)) { StringPtr multi_part[1]; uint16_t old_offset = primMsg->tokens[0].offset; - string result((char*)tmpStrings[i].ptr, tmpStrings[i].len); + string *result = new string((char*)tmpStrings[i].ptr, tmpStrings[i].len); uint64_t origin_lbid = primMsg->LBID; uint32_t lbid_count = newRidList[i].token >> 46; primMsg->tokens[0].offset = 1; // first offset of a sig @@ -541,11 +541,12 @@ void DictStep::_projectToRG(RowGroup &rg, uint32_t col) primMsg->tokens[0].LBID = origin_lbid + j; issuePrimitive(false); projectResult(multi_part); - result.append((char*)multi_part[0].ptr, multi_part[0].len); + result->append((char*)multi_part[0].ptr, multi_part[0].len); } primMsg->tokens[0].offset = old_offset; tmpResultCounter = firstTmpResultCounter; - r.setVarBinaryField((unsigned char*)result.c_str(), result.length(), col); + r.setVarBinaryField((unsigned char*)result->c_str(), result->length(), col); + delete result; } else { @@ -559,6 +560,7 @@ void DictStep::_projectToRG(RowGroup &rg, uint32_t col) //cout << "_projectToRG() total length = " << totalResultLength << endl; idbassert(tmpResultCounter == bpp->ridCount); + delete [] tmpStrings; //cout << "DS: /projectingToRG l: " << (int64_t)primMsg->LBID // << " len: " << tmpResultCounter // << endl; diff --git a/utils/rowgroup/rowgroup.cpp b/utils/rowgroup/rowgroup.cpp index a6bccca66..484e22208 100644 --- a/utils/rowgroup/rowgroup.cpp +++ b/utils/rowgroup/rowgroup.cpp @@ -38,6 +38,7 @@ using namespace std; #include +#include using namespace boost; #include "bytestream.h" @@ -73,9 +74,9 @@ StringStore::~StringStore() uint64_t inUse = 0, allocated = 0; for (i = 0; i < mem.size(); i++) { - MemChunk *tmp = (MemChunk *) mem.back().get(); - inUse += tmp->currentSize; - allocated += tmp->capacity; + std::string *tmp = mem.back().get(); + inUse += tmp->length(); + allocated += tmp->length(); } if (allocated > 0) cout << "~SS: " << inUse << "/" << allocated << " = " << (float) inUse/(float) allocated << endl; @@ -84,7 +85,6 @@ StringStore::~StringStore() uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) { - MemChunk *lastMC = NULL; uint32_t ret = 0; empty = false; // At least a NULL is being stored. @@ -102,31 +102,10 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) if (fUseStoreStringMutex) lk.lock(); - if (mem.size() > 0) - lastMC = (MemChunk *) mem.back().get(); + shared_ptr newString(new std::string((char*)data, len)); + mem.push_back(newString); - if ((lastMC == NULL) || (lastMC->capacity - lastMC->currentSize < len)) { - // mem usage debugging - //if (lastMC) - //cout << "Memchunk efficiency = " << lastMC->currentSize << "/" << lastMC->capacity << endl; - shared_array newOne(new uint8_t[CHUNK_SIZE + sizeof(MemChunk)]); - mem.push_back(newOne); - lastMC = (MemChunk *) mem.back().get(); - lastMC->currentSize = 0; - lastMC->capacity = CHUNK_SIZE; - memset(lastMC->data, 0, CHUNK_SIZE); - } - - ret = ((mem.size()-1) * CHUNK_SIZE) + lastMC->currentSize; - memcpy(&(lastMC->data[lastMC->currentSize]), data, len); - /* - cout << "stored: '" << hex; - for (uint32_t i = 0; i < len ; i++) { - cout << (char) lastMC->data[lastMC->currentSize + i]; - } - cout << "' at position " << lastMC->currentSize << " len " << len << dec << endl; - */ - lastMC->currentSize += len; + ret = mem.size(); return ret; } @@ -134,16 +113,17 @@ uint32_t StringStore::storeString(const uint8_t *data, uint32_t len) void StringStore::serialize(ByteStream &bs) const { uint32_t i; - MemChunk *mc; - + std::string empty_str; + bs << (uint32_t) mem.size(); bs << (uint8_t) empty; for (i = 0; i < mem.size(); i++) { - mc = (MemChunk *) mem[i].get(); - bs << (uint32_t) mc->currentSize; + if (mem[i].get() == NULL) + bs << empty_str; + else + bs << *mem[i].get(); //cout << "serialized " << mc->currentSize << " bytes\n"; - bs.append(mc->data, mc->currentSize); - } + } } uint32_t StringStore::deserialize(ByteStream &bs) @@ -151,27 +131,22 @@ uint32_t StringStore::deserialize(ByteStream &bs) uint32_t i; uint32_t count; uint32_t size; - uint8_t *buf; - MemChunk *mc; + std::string buf; uint8_t tmp8; uint32_t ret = 0; //mem.clear(); bs >> count; - mem.resize(count); + mem.reserve(count); bs >> tmp8; empty = (bool) tmp8; ret += 5; for (i = 0; i < count; i++) { - bs >> size; //cout << "deserializing " << size << " bytes\n"; - buf = bs.buf(); - mem[i].reset(new uint8_t[size + sizeof(MemChunk)]); - mc = (MemChunk *) mem[i].get(); - mc->currentSize = size; - mc->capacity = size; - memcpy(mc->data, buf, size); - bs.advance(size); + bs >> buf; + shared_ptr newString(new std::string(buf)); + mem.push_back(newString); + //bs.advance(size); ret += (size + 4); } return ret; @@ -179,7 +154,7 @@ uint32_t StringStore::deserialize(ByteStream &bs) void StringStore::clear() { - vector > emptyv; + vector > emptyv; mem.swap(emptyv); empty = true; } diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h index b37a6f5f4..66d3ed8e2 100644 --- a/utils/rowgroup/rowgroup.h +++ b/utils/rowgroup/rowgroup.h @@ -119,14 +119,8 @@ private: // This is an overlay b/c the underlying data needs to be any size, // and alloc'd in one chunk. data can't be a sepatate dynamic chunk. - struct MemChunk - { - uint32_t currentSize; - uint32_t capacity; - uint8_t data[]; - }; - - std::vector > mem; + + std::vector > mem; bool empty; bool fUseStoreStringMutex; //@bug6065, make StringStore::storeString() thread safe boost::mutex fMutex; @@ -1447,17 +1441,13 @@ inline std::string StringStore::getString(uint32_t off, uint32_t len) const if (off == std::numeric_limits::max()) return joblist::CPNULLSTRMARK; - MemChunk *mc; - uint32_t chunk = off / CHUNK_SIZE; - uint32_t offset = off % CHUNK_SIZE; - // this has to handle uninitialized data as well. If it's uninitialized it doesn't matter - // what gets returned, it just can't go out of bounds. - if (mem.size() <= chunk) + if ((mem.size() < off) || off == 0) return joblist::CPNULLSTRMARK; - mc = (MemChunk *) mem[chunk].get(); - if ((offset + len) > mc->currentSize) - return joblist::CPNULLSTRMARK; - return std::string((char *) &(mc->data[offset]), len); + + if (mem[off-1].get() == NULL) + return joblist::CPNULLSTRMARK; + + return *mem[off-1].get(); } inline const uint8_t * StringStore::getPointer(uint32_t off) const @@ -1465,17 +1455,15 @@ inline const uint8_t * StringStore::getPointer(uint32_t off) const if (off == std::numeric_limits::max()) return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); - uint32_t chunk = off / CHUNK_SIZE; - uint32_t offset = off % CHUNK_SIZE; - MemChunk *mc; // this has to handle uninitialized data as well. If it's uninitialized it doesn't matter // what gets returned, it just can't go out of bounds. - if (UNLIKELY(mem.size() <= chunk)) + if (UNLIKELY(mem.size() < off)) return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); - mc = (MemChunk *) mem[chunk].get(); - if (offset > mc->currentSize) - return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); - return &(mc->data[offset]); + + if (off == 0 || (mem[off-1].get() == NULL)) + return (const uint8_t *) joblist::CPNULLSTRMARK.c_str(); + + return (uint8_t*)mem[off-1].get()->c_str(); } inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const @@ -1486,17 +1474,15 @@ inline bool StringStore::isNullValue(uint32_t off, uint32_t len) const if (len < 8) return false; - uint32_t chunk = off / CHUNK_SIZE; - uint32_t offset = off % CHUNK_SIZE; - MemChunk *mc; - if (mem.size() <= chunk) + if ((mem.size() < off) || off == 0) return true; - mc = (MemChunk *) mem[chunk].get(); - if ((offset + len) > mc->currentSize) - return true; - if (mc->data[offset] == 0) // "" = NULL string for some reason... - return true; - return (*((uint64_t *) &mc->data[offset]) == *((uint64_t *) joblist::CPNULLSTRMARK.c_str())); + + if (mem[off-1].get() == NULL) + return true; + + if (mem[off-1].get()->empty()) // Empty string is NULL + return true; + return (mem[off-1].get()->compare(joblist::CPNULLSTRMARK) == 0); } inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t len) const @@ -1504,15 +1490,13 @@ inline bool StringStore::equals(const std::string &str, uint32_t off, uint32_t l if (off == std::numeric_limits::max() || len == 0) return str == joblist::CPNULLSTRMARK; - uint32_t chunk = off / CHUNK_SIZE; - uint32_t offset = off % CHUNK_SIZE; - if (mem.size() <= chunk) - return false; - MemChunk *mc = (MemChunk *) mem[chunk].get(); - if ((offset + len) > mc->currentSize) + if ((mem.size() < off) || off == 0) return false; - return (strncmp(str.c_str(), (const char *) &mc->data[offset], len) == 0); + if (mem[off-1].get() == NULL) + return false; + + return (mem[off-1].get()->compare(str) == 0); } inline bool StringStore::isEmpty() const @@ -1524,11 +1508,9 @@ inline uint64_t StringStore::getSize() const { uint32_t i; uint64_t ret = 0; - MemChunk *mc; - + for (i = 0; i < mem.size(); i++) { - mc = (MemChunk *) mem[i].get(); - ret += mc->capacity; + ret+= mem[i].get()->length(); } return ret; }