You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-29 08:21:15 +03:00
MCOL-5191 Refactor statistics.
Move the uniform distribution to the Statistics constructor and remove the row count.
This commit is contained in:
@ -17,15 +17,14 @@
|
|||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <random>
|
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
#include "statistics.h"
|
|
||||||
#include "IDBPolicy.h"
|
#include "IDBPolicy.h"
|
||||||
#include "brmtypes.h"
|
#include "brmtypes.h"
|
||||||
#include "hasher.h"
|
#include "hasher.h"
|
||||||
#include "messagequeue.h"
|
#include "messagequeue.h"
|
||||||
#include "configcpp.h"
|
#include "configcpp.h"
|
||||||
|
#include "statistics.h"
|
||||||
|
|
||||||
using namespace idbdatafile;
|
using namespace idbdatafile;
|
||||||
using namespace logging;
|
using namespace logging;
|
||||||
@ -60,17 +59,12 @@ void StatisticsManager::collectSample(const rowgroup::RowGroup& rowGroup)
|
|||||||
rowGroup.getRow(0, &r);
|
rowGroup.getRow(0, &r);
|
||||||
|
|
||||||
// Generate a uniform distribution.
|
// Generate a uniform distribution.
|
||||||
std::random_device randomDevice;
|
|
||||||
std::mt19937 gen32(randomDevice());
|
|
||||||
std::uniform_int_distribution<> uniformDistribution(0, currentRowIndex + rowCount - 1);
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < rowCount; ++i)
|
for (uint32_t i = 0; i < rowCount; ++i)
|
||||||
{
|
{
|
||||||
if (currentSampleSize < maxSampleSize)
|
if (currentSampleSize < maxSampleSize)
|
||||||
{
|
{
|
||||||
for (uint32_t j = 0; j < columnCount; ++j)
|
for (uint32_t j = 0; j < columnCount; ++j)
|
||||||
{
|
{
|
||||||
// FIXME: Handle null values as well.
|
|
||||||
if (!r.isNullValue(j))
|
if (!r.isNullValue(j))
|
||||||
columnGroups[oids[j]][currentSampleSize] = r.getIntField(j);
|
columnGroups[oids[j]][currentSampleSize] = r.getIntField(j);
|
||||||
}
|
}
|
||||||
@ -82,11 +76,11 @@ void StatisticsManager::collectSample(const rowgroup::RowGroup& rowGroup)
|
|||||||
if (index < maxSampleSize)
|
if (index < maxSampleSize)
|
||||||
{
|
{
|
||||||
for (uint32_t j = 0; j < columnCount; ++j)
|
for (uint32_t j = 0; j < columnCount; ++j)
|
||||||
columnGroups[oids[j]][index] = r.getIntField(j);
|
if (!r.isNullValue(j))
|
||||||
|
columnGroups[oids[j]][index] = r.getIntField(j);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
r.nextRow();
|
r.nextRow();
|
||||||
++currentRowIndex;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,7 +91,7 @@ void StatisticsManager::analyzeSample(bool traceOn)
|
|||||||
|
|
||||||
// PK_FK statistics.
|
// PK_FK statistics.
|
||||||
for (const auto& [oid, sample] : columnGroups)
|
for (const auto& [oid, sample] : columnGroups)
|
||||||
keyTypes[oid] = std::make_pair(KeyType::PK, currentRowIndex);
|
keyTypes[oid] = KeyType::PK;
|
||||||
|
|
||||||
for (const auto& [oid, sample] : columnGroups)
|
for (const auto& [oid, sample] : columnGroups)
|
||||||
{
|
{
|
||||||
@ -107,8 +101,8 @@ void StatisticsManager::analyzeSample(bool traceOn)
|
|||||||
{
|
{
|
||||||
const auto value = sample[i];
|
const auto value = sample[i];
|
||||||
// PK_FK statistics.
|
// PK_FK statistics.
|
||||||
if (columnsCache.count(value) && keyTypes[oid].first == KeyType::PK)
|
if (columnsCache.count(value) && keyTypes[oid] == KeyType::PK)
|
||||||
keyTypes[oid].first = KeyType::FK;
|
keyTypes[oid] = KeyType::FK;
|
||||||
else
|
else
|
||||||
columnsCache.insert(value);
|
columnsCache.insert(value);
|
||||||
|
|
||||||
@ -137,7 +131,6 @@ void StatisticsManager::analyzeSample(bool traceOn)
|
|||||||
// Clear sample.
|
// Clear sample.
|
||||||
columnGroups.clear();
|
columnGroups.clear();
|
||||||
currentSampleSize = 0;
|
currentSampleSize = 0;
|
||||||
currentRowIndex = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void StatisticsManager::output()
|
void StatisticsManager::output()
|
||||||
@ -147,20 +140,20 @@ void StatisticsManager::output()
|
|||||||
std::cout << "Statistics type [PK_FK]: " << std::endl;
|
std::cout << "Statistics type [PK_FK]: " << std::endl;
|
||||||
for (const auto& p : keyTypes)
|
for (const auto& p : keyTypes)
|
||||||
{
|
{
|
||||||
std::cout << "OID: " << p.first << " ";
|
std::cout << "[OID: " << p.first << ": ";
|
||||||
if (static_cast<uint32_t>(p.second.first) == 0)
|
if (static_cast<uint32_t>(p.second) == 0)
|
||||||
std::cout << "PK ";
|
std::cout << "PK] ";
|
||||||
else
|
else
|
||||||
std::cout << "FK ";
|
std::cout << "FK] ";
|
||||||
std::cout << "row count: " << p.second.second << std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << "Statistics type [MCV]: " << std::endl;
|
std::cout << "\nStatistics type [MCV]: " << std::endl;
|
||||||
for (const auto& [oid, columnMCV] : mcv)
|
for (const auto& [oid, columnMCV] : mcv)
|
||||||
{
|
{
|
||||||
std::cout << "OID: " << oid << std::endl;
|
std::cout << "[OID: " << oid << std::endl;
|
||||||
for (const auto& [value, count] : columnMCV)
|
for (const auto& [value, count] : columnMCV)
|
||||||
std::cout << value << ": " << count << std::endl;
|
std::cout << value << ": " << count << ", ";
|
||||||
|
cout << "]" << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -169,8 +162,8 @@ std::unique_ptr<char[]> StatisticsManager::convertStatsToDataStream(uint64_t& da
|
|||||||
{
|
{
|
||||||
// Number of pairs.
|
// Number of pairs.
|
||||||
uint64_t count = keyTypes.size();
|
uint64_t count = keyTypes.size();
|
||||||
// count, [[uid, keyType, rows count], ... ]
|
// count, [[uid, keyType], ... ]
|
||||||
dataStreamSize = sizeof(uint64_t) + count * (sizeof(uint32_t) + sizeof(KeyType) + sizeof(uint32_t));
|
dataStreamSize = sizeof(uint64_t) + count * (sizeof(uint32_t) + sizeof(KeyType));
|
||||||
|
|
||||||
// Count the size of the MCV.
|
// Count the size of the MCV.
|
||||||
for (const auto& [oid, mcvColumn] : mcv)
|
for (const auto& [oid, mcvColumn] : mcv)
|
||||||
@ -188,18 +181,15 @@ std::unique_ptr<char[]> StatisticsManager::convertStatsToDataStream(uint64_t& da
|
|||||||
std::memcpy(dataStream, reinterpret_cast<char*>(&count), sizeof(uint64_t));
|
std::memcpy(dataStream, reinterpret_cast<char*>(&count), sizeof(uint64_t));
|
||||||
offset += sizeof(uint64_t);
|
offset += sizeof(uint64_t);
|
||||||
|
|
||||||
// For each pair [oid, key type, rows count].
|
// For each pair [oid, key type].
|
||||||
for (const auto& p : keyTypes)
|
for (const auto& p : keyTypes)
|
||||||
{
|
{
|
||||||
uint32_t oid = p.first;
|
uint32_t oid = p.first;
|
||||||
std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&oid), sizeof(uint32_t));
|
std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&oid), sizeof(uint32_t));
|
||||||
offset += sizeof(uint32_t);
|
offset += sizeof(uint32_t);
|
||||||
KeyType keyType = p.second.first;
|
KeyType keyType = p.second;
|
||||||
std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&keyType), sizeof(KeyType));
|
std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&keyType), sizeof(KeyType));
|
||||||
offset += sizeof(KeyType);
|
offset += sizeof(KeyType);
|
||||||
uint32_t rowCount = p.second.second;
|
|
||||||
std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&rowCount), sizeof(uint32_t));
|
|
||||||
offset += sizeof(uint32_t);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// For each [oid, list size, list [value, count]].
|
// For each [oid, list size, list [value, count]].
|
||||||
@ -240,16 +230,13 @@ void StatisticsManager::convertStatsFromDataStream(std::unique_ptr<char[]> dataS
|
|||||||
// For each pair.
|
// For each pair.
|
||||||
for (uint64_t i = 0; i < count; ++i)
|
for (uint64_t i = 0; i < count; ++i)
|
||||||
{
|
{
|
||||||
uint32_t oid, rowCount;
|
uint32_t oid;
|
||||||
KeyType keyType;
|
KeyType keyType;
|
||||||
std::memcpy(reinterpret_cast<char*>(&oid), &dataStream[offset], sizeof(uint32_t));
|
std::memcpy(reinterpret_cast<char*>(&oid), &dataStream[offset], sizeof(uint32_t));
|
||||||
offset += sizeof(uint32_t);
|
offset += sizeof(uint32_t);
|
||||||
std::memcpy(reinterpret_cast<char*>(&keyType), &dataStream[offset], sizeof(KeyType));
|
std::memcpy(reinterpret_cast<char*>(&keyType), &dataStream[offset], sizeof(KeyType));
|
||||||
offset += sizeof(KeyType);
|
offset += sizeof(KeyType);
|
||||||
std::memcpy(reinterpret_cast<char*>(&rowCount), &dataStream[offset], sizeof(uint32_t));
|
keyTypes[oid] = keyType;
|
||||||
offset += sizeof(uint32_t);
|
|
||||||
// Insert pair.
|
|
||||||
keyTypes[oid] = std::make_pair(keyType, rowCount);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint64_t i = 0; i < count; ++i)
|
for (uint64_t i = 0; i < count; ++i)
|
||||||
@ -407,8 +394,7 @@ void StatisticsManager::serialize(messageqcpp::ByteStream& bs)
|
|||||||
for (const auto& keyType : keyTypes)
|
for (const auto& keyType : keyTypes)
|
||||||
{
|
{
|
||||||
bs << keyType.first;
|
bs << keyType.first;
|
||||||
bs << (uint32_t)keyType.second.first;
|
bs << (uint32_t)keyType.second;
|
||||||
bs << keyType.second.second;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// MCV
|
// MCV
|
||||||
@ -435,11 +421,10 @@ void StatisticsManager::unserialize(messageqcpp::ByteStream& bs)
|
|||||||
// PK_FK
|
// PK_FK
|
||||||
for (uint32_t i = 0; i < count; ++i)
|
for (uint32_t i = 0; i < count; ++i)
|
||||||
{
|
{
|
||||||
uint32_t oid, keyType, rowCount;
|
uint32_t oid, keyType;
|
||||||
bs >> oid;
|
bs >> oid;
|
||||||
bs >> keyType;
|
bs >> keyType;
|
||||||
bs >> rowCount;
|
keyTypes[oid] = static_cast<KeyType>(keyType);
|
||||||
keyTypes[oid] = std::make_pair(static_cast<KeyType>(keyType), rowCount);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// MCV
|
// MCV
|
||||||
@ -470,7 +455,7 @@ bool StatisticsManager::hasKey(uint32_t oid)
|
|||||||
|
|
||||||
KeyType StatisticsManager::getKeyType(uint32_t oid)
|
KeyType StatisticsManager::getKeyType(uint32_t oid)
|
||||||
{
|
{
|
||||||
return keyTypes[oid].first;
|
return keyTypes[oid];
|
||||||
}
|
}
|
||||||
|
|
||||||
StatisticsDistributor* StatisticsDistributor::instance()
|
StatisticsDistributor* StatisticsDistributor::instance()
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include <map>
|
#include <map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
// Represents a commands for `ExeMgr`.
|
// Represents a commands for `ExeMgr`.
|
||||||
#define ANALYZE_TABLE_EXECUTE 6
|
#define ANALYZE_TABLE_EXECUTE 6
|
||||||
@ -67,7 +68,7 @@ struct StatisticsFileHeader
|
|||||||
|
|
||||||
using ColumnsCache = std::unordered_map<uint32_t, std::unordered_set<uint64_t>>;
|
using ColumnsCache = std::unordered_map<uint32_t, std::unordered_set<uint64_t>>;
|
||||||
using ColumnGroup = std::unordered_map<uint32_t, std::vector<uint64_t>>;
|
using ColumnGroup = std::unordered_map<uint32_t, std::vector<uint64_t>>;
|
||||||
using KeyTypes = std::unordered_map<uint32_t, std::pair<KeyType, uint32_t>>;
|
using KeyTypes = std::unordered_map<uint32_t, KeyType>;
|
||||||
using MCVList = std::unordered_map<uint32_t, std::unordered_map<uint64_t, uint32_t>>;
|
using MCVList = std::unordered_map<uint32_t, std::unordered_map<uint64_t, uint32_t>>;
|
||||||
|
|
||||||
// This class is responsible for processing and storing statistics.
|
// This class is responsible for processing and storing statistics.
|
||||||
@ -104,14 +105,22 @@ class StatisticsManager
|
|||||||
KeyType getKeyType(uint32_t oid);
|
KeyType getKeyType(uint32_t oid);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
StatisticsManager() : currentSampleSize(0), currentRowIndex(0), epoch(0), version(1)
|
StatisticsManager() : currentSampleSize(0), epoch(0), version(1)
|
||||||
{
|
{
|
||||||
// Initialize plugins.
|
// Initialize plugins.
|
||||||
IDBPolicy::configIDBPolicy();
|
IDBPolicy::configIDBPolicy();
|
||||||
|
// Generate distribution once in range [0, UINT_MAX].
|
||||||
|
gen32 = std::mt19937(randomDevice());
|
||||||
|
uniformDistribution = std::uniform_int_distribution<uint32_t>(0, UINT_MAX);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<char[]> convertStatsToDataStream(uint64_t& dataStreamSize);
|
std::unique_ptr<char[]> convertStatsToDataStream(uint64_t& dataStreamSize);
|
||||||
void convertStatsFromDataStream(std::unique_ptr<char[]> dataStreamSmartPtr);
|
void convertStatsFromDataStream(std::unique_ptr<char[]> dataStreamSmartPtr);
|
||||||
|
|
||||||
|
std::random_device randomDevice;
|
||||||
|
std::mt19937 gen32;
|
||||||
|
std::uniform_int_distribution<uint32_t> uniformDistribution;
|
||||||
|
|
||||||
// Internal data represents a sample [OID, vector of values].
|
// Internal data represents a sample [OID, vector of values].
|
||||||
ColumnGroup columnGroups;
|
ColumnGroup columnGroups;
|
||||||
// Internal data for the PK/FK statistics [OID, bool value].
|
// Internal data for the PK/FK statistics [OID, bool value].
|
||||||
@ -122,7 +131,6 @@ class StatisticsManager
|
|||||||
// TODO: Think about sample size.
|
// TODO: Think about sample size.
|
||||||
const uint32_t maxSampleSize = 64000;
|
const uint32_t maxSampleSize = 64000;
|
||||||
uint32_t currentSampleSize;
|
uint32_t currentSampleSize;
|
||||||
uint32_t currentRowIndex;
|
|
||||||
uint32_t epoch;
|
uint32_t epoch;
|
||||||
uint32_t version;
|
uint32_t version;
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user