1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-01 06:46:55 +03:00

[MCOL-4709] Disk-based aggregation

* Introduce multigeneration aggregation

* Do not save unused part of RGDatas to disk
* Add IO error explanation (strerror)

* Reduce memory usage while aggregating
* introduce in-memory generations for better memory utilization

* Try to keep the number of buckets at a low limit

* Refactor disk aggregation a bit
* pass calculated hash into RowAggregation
* try to keep some RGData with free space in memory

* do not dump more than half of the rowgroups to disk if generations are
  allowed; start a new generation instead
* for each thread shift the first processed bucket at each iteration,
  so the generations start more evenly

* Unify temp data location

* Explicitly create temp subdirectories
  regardless of whether disk aggregation/join is enabled
This commit is contained in:
Alexey Antipovsky
2021-01-15 18:52:13 +03:00
parent 3537c0d635
commit 475104e4d3
24 changed files with 5932 additions and 906 deletions

View File

@ -32,6 +32,7 @@
using namespace std;
#include <boost/shared_array.hpp>
#include <numeric>
using namespace boost;
#include "bytestream.h"
@ -405,6 +406,7 @@ RGData::RGData(const RowGroup& rg, uint32_t rowCount)
*/
memset(rowData.get(), 0, rg.getDataSize(rowCount)); // XXXPAT: make valgrind happy temporarily
#endif
memset(rowData.get(), 0, rg.getDataSize(rowCount)); // XXXPAT: make valgrind happy temporarily
}
RGData::RGData(const RowGroup& rg)
@ -481,7 +483,7 @@ void RGData::serialize(ByteStream& bs, uint32_t amount) const
bs << (uint8_t) 0;
}
void RGData::deserialize(ByteStream& bs, bool hasLenField)
void RGData::deserialize(ByteStream& bs, uint32_t defAmount)
{
uint32_t amount, sig;
uint8_t* buf;
@ -493,7 +495,7 @@ void RGData::deserialize(ByteStream& bs, bool hasLenField)
{
bs >> sig;
bs >> amount;
rowData.reset(new uint8_t[amount]);
rowData.reset(new uint8_t[std::max(amount, defAmount)]);
buf = bs.buf();
memcpy(rowData.get(), buf, amount);
bs.advance(amount);
@ -577,12 +579,13 @@ Row& Row::operator=(const Row& r)
return *this;
}
string Row::toString() const
string Row::toString(uint32_t rownum) const
{
ostringstream os;
uint32_t i;
//os << getRid() << ": ";
os << "[" << std::setw(5) << rownum << std::setw(0) << "]: ";
os << (int) useStringTable << ": ";
for (i = 0; i < columnCount; i++)
@ -1447,7 +1450,7 @@ uint32_t RowGroup::getColumnCount() const
return columnCount;
}
string RowGroup::toString() const
string RowGroup::toString(const std::vector<uint64_t>& used) const
{
ostringstream os;
ostream_iterator<int> oIter1(os, "\t");
@ -1479,6 +1482,8 @@ string RowGroup::toString() const
os << "uses a string table\n";
else
os << "doesn't use a string table\n";
if (!used.empty())
os << "sparse\n";
//os << "strings = " << hex << (int64_t) strings << "\n";
//os << "data = " << (int64_t) data << "\n" << dec;
@ -1488,14 +1493,25 @@ string RowGroup::toString() const
initRow(&r);
getRow(0, &r);
os << "rowcount = " << getRowCount() << endl;
if (!used.empty())
{
uint64_t cnt = std::accumulate(used.begin(), used.end(), 0ULL,
[](uint64_t a, uint64_t bits) {
return a + __builtin_popcountll(bits);
});
os << "sparse row count = " << cnt << endl;
}
os << "base rid = " << getBaseRid() << endl;
os << "status = " << getStatus() << endl;
os << "dbroot = " << getDBRoot() << endl;
os << "row data...\n";
for (uint32_t i = 0; i < getRowCount(); i++)
uint32_t max_cnt = used.empty() ? getRowCount() : (used.size() * 64);
for (uint32_t i = 0; i < max_cnt; i++)
{
os << r.toString() << endl;
if (!used.empty() && !(used[i/64] & (1ULL << (i%64))))
continue;
os << r.toString(i) << endl;
r.nextRow();
}
}