1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-04-18 21:44:02 +03:00
2025-02-21 20:02:38 +04:00

847 lines
20 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/******************************************************************************
* $Id: bucketdl.h 9655 2013-06-25 23:08:13Z xlou $
*
*****************************************************************************/
/** @file
* class BucketDL interface
*/
#include "wsdl.h"
#include "hasher.h"
#include "bucketreuse.h"
#include "tuplewsdl.h"
#include <boost/function.hpp>
#include <boost/function_equal.hpp>
#include <boost/shared_ptr.hpp>
#include <sstream>
#pragma once
namespace joblist
{
/** @brief class BucketDL
*
* A DataList that spreads inserted elements over a fixed number of buckets.
* The target bucket is chosen as hash(element) & bucketMask. Depending on
* element_t and the element mode, the buckets are TupleWSDL objects (when
* element_t is TupleType), WSDL<RIDElementType> objects (RID_MODE) or
* WSDL<element_t> objects (any other mode).
*/
template <typename element_t>
class BucketDL : public DataList<element_t>
{
typedef DataList<element_t> base;
typedef boost::shared_ptr<TupleWSDL> TWSSP;
typedef std::vector<TWSSP> TWSVec;
// Selects which bucket array is active for non-tuple element types.
enum ElementMode
{
// only the element's `first` member is stored, in rbuckets
RID_MODE,
// the whole element is stored, in buckets
RID_VALUE
};
public:
/** Main constructor.
* @param numBuckets The number of buckets to create
* @param numConsumers The number of consumers that will eventually read this DL.
* @param maxElementsPerBucket The maximum # of elements each bucket should keep in memory.
* @param rm Resource manager pointer handed to every bucket that gets created.
* @param hash The function object that calculates which bucket an elements goes into on insertion.
*/
BucketDL(uint32_t numBuckets, uint32_t numConsumers, uint32_t maxElementsPerBucket, ResourceManager* rm,
boost::function<uint32_t(const char* data, uint32_t len)> hash = utils::Hasher());
virtual ~BucketDL();
// datalist interface. insert() and endOfInput() are the only
// datalist function that makes sense. The consumer side functions
// are stubs. Consumers know they're consuming a BucketDL.
void insert(const element_t& e);
void insert(const std::vector<element_t>& e);
void insert(const element_t* array, uint64_t arrayCount);
void insert(TupleType& e);
void insert(std::vector<TupleType>& e);
void endOfInput();
// Stub: always throws std::logic_error; use getIterator(bucket) instead.
uint64_t getIterator();
// Stub: always throws std::logic_error; use next(bucket, it, e) instead.
bool next(uint64_t it, element_t* e);
/** Records the flag and forwards it to every active bucket */
void setMultipleProducers(bool);
// BucketDL consumer fcns
uint64_t getIterator(uint64_t bucket);
bool next(uint64_t bucket, uint64_t it, element_t* e);
/** Returns the size of the specified bucket */
uint64_t size(uint64_t bucket);
/** Returns the total number of elements stored */
uint64_t totalSize();
/** Returns the number of buckets */
uint64_t bucketCount();
/** Sets the value to pass down to element_t::getHashString().
* Also forwards the same value to setElementMode(). */
void setHashMode(uint64_t mode);
/** Sets the value to RID_only or rid_value mode */
void setElementMode(uint64_t mode);
/** Total number of files and filespace used for temp files */
void totalFileCounts(uint64_t& numFiles, uint64_t& numBytes) const;
/** Returns the hash length used as the initial length for hashing */
const uint32_t hashLen() const
{
return fHashLen;
}
/** Sets the hash length */
void hashLen(const uint32_t hashLen)
{
fHashLen = hashLen;
}
/** Returns the element length (rid size + data size).
* NOTE(review): fElementLen is declared uint64_t, so this return narrows to 32 bits. */
const uint32_t elementLen() const
{
return fElementLen;
}
/** Sets the element length to ridSize + dataSize and, for TupleType
* elements, forwards both sizes to every tuple bucket. */
void elementLen(const uint64_t ridSize, const uint64_t dataSize);
/** This class does employ temp disk */
virtual bool useDisk() const
{
return true;
}
/** Sets the size of the element components that are saved to disk */
virtual void setDiskElemSize(uint32_t size1st, uint32_t size2nd);
/** Accessor and mutator to the BucketReuseControlEntry */
void reuseControl(BucketReuseControlEntry* control, bool readonly);
BucketReuseControlEntry* reuseControl()
{
return fReuseControl;
}
/** Restores the buckets' set number and start positions */
void restoreBucketInformation();
/** @brief return tuple rid size */
const uint64_t ridSize() const
{
return fRidSize;
}
/** @brief return tuple data size */
const uint64_t dataSize() const
{
return fDataSize;
}
/** Enables disk I/O time logging */
void enableDiskIoTrace();
/** Sums the disk I/O time over all buckets.
* @param w receives the total write time in seconds
* @param r receives the total read time in seconds
* @return true if any bucket has at least one disk I/O record */
bool totalDiskIoTime(uint64_t& w, uint64_t& r);
/** Returns the reference of the disk I/O info list */
std::list<DiskIoInfo>& diskIoInfoList(uint64_t bucket);
protected:
private:
// Declare default constructors but don't define to disable their use
explicit BucketDL();
BucketDL(const BucketDL<element_t>&);
BucketDL<element_t>& operator=(const BucketDL<element_t>&);
// resource manager pointer passed to the bucket constructors
ResourceManager* fRm;
// full-element buckets; active for non-tuple types when elementMode != RID_MODE
WSDL<element_t>** buckets;
// rid-only buckets; active for non-tuple types when elementMode == RID_MODE
WSDL<RIDElementType>** rbuckets;
// tuple buckets; populated only when element_t is TupleType
TWSVec fTBuckets;
uint64_t numBuckets;
uint64_t numConsumers;
uint64_t maxElements;
// hash function used to pick the bucket for an element
boost::function<uint32_t(const char* data, uint32_t len)> hashFcn;
// value passed to getHashString() on insertion
uint64_t hashMode;
// ANDed with the hash value to obtain the bucket index
uint64_t bucketMask;
// when true, the vector/array insert overloads take the base-class lock
bool multiProducer;
// disk I/O tracing flag, propagated to the buckets
bool fTraceOn;
uint64_t elementMode;
uint32_t fHashLen; // @bug 844. hash length for tuple type
uint64_t fRidSize; // @bug 844.
uint64_t fDataSize;
uint64_t fElementLen;
// number of times a bucket iterator reported end-of-data while reuse is active
uint64_t bucketDoneCount;
// optional reuse bookkeeping entry; NULL when reuse is not in effect
BucketReuseControlEntry* fReuseControl;
};
/** Main constructor.
 *
 * Validates the bucket count, derives the bucket-selection mask and creates
 * the initial bucket containers: TupleWSDL objects when element_t is
 * TupleType, otherwise rid-only WSDL objects (the list starts in RID_MODE).
 *
 * @param nb   number of buckets; must be a non-zero power of 2
 * @param nc   number of consumers passed to every bucket
 * @param me   maximum number of in-memory elements passed to every bucket
 * @param rm   resource manager pointer passed to every bucket
 * @param hash function object used to select the bucket for an element
 * @throws std::runtime_error if nb is zero or not a power of 2
 */
template <typename element_t>
BucketDL<element_t>::BucketDL(uint32_t nb, uint32_t nc, uint32_t me, ResourceManager* rm,
                              boost::function<uint32_t(const char* data, uint32_t len)> hash)
 : base()
 , fRm(rm)
 , buckets(0)
 , rbuckets(0)
 , numBuckets(nb)
 , numConsumers(nc)
 , maxElements(me)
 , hashFcn(hash)
 , hashMode(0)
 , bucketMask(0)
 , multiProducer(false)
 , fTraceOn(false)
 , elementMode(RID_MODE)
 , fHashLen(0)
 , fRidSize(0)
 , fDataSize(0)
 , fElementLen(0)
 , bucketDoneCount(0)
 , fReuseControl(NULL)
{
  // Validate before allocating anything: the destructor does not run when a
  // constructor throws, so a late throw would leak every bucket created so far.
  // A power of 2 has exactly one bit set, hence n & (n - 1) == 0.
  if (numBuckets == 0 || (numBuckets & (numBuckets - 1)) != 0)
    throw std::runtime_error("BucketDL: The number of buckets should be a power of 2.");

  // For a power of 2 the mask is simply n - 1 (a single bucket yields mask 0,
  // so every element lands in bucket 0).
  bucketMask = numBuckets - 1;

  // initialize buckets
  if (typeid(element_t) == typeid(TupleType))
  {
    for (uint64_t i = 0; i < numBuckets; i++)
      fTBuckets.push_back(TWSSP(new TupleWSDL(numConsumers, maxElements, fRm)));
  }
  else
  {
    rbuckets = new WSDL<RIDElementType>*[numBuckets];

    for (uint64_t i = 0; i < numBuckets; i++)
      rbuckets[i] = new WSDL<RIDElementType>(numConsumers, maxElements, fRm);
  }
}
/** Destructor.
 *
 * Releases the raw bucket array that matches the current element mode.
 * Nothing is freed by hand for TupleType elements; those buckets live in
 * fTBuckets behind shared pointers.
 */
template <typename element_t>
BucketDL<element_t>::~BucketDL()
{
  if (typeid(element_t) != typeid(TupleType))
  {
    if (elementMode != RID_MODE)
    {
      for (uint64_t idx = 0; idx < numBuckets; ++idx)
        delete buckets[idx];

      delete[] buckets;
    }
    else
    {
      for (uint64_t idx = 0; idx < numBuckets; ++idx)
        delete rbuckets[idx];

      delete[] rbuckets;
    }
  }
}
/** Records the multiple-producer flag and forwards it to every active bucket.
 *
 * @param b the new flag value
 */
template <typename element_t>
void BucketDL<element_t>::setMultipleProducers(bool b)
{
  multiProducer = b;

  const bool tupleBuckets = (typeid(element_t) == typeid(TupleType));

  for (uint64_t idx = 0; idx < numBuckets; ++idx)
  {
    if (tupleBuckets)
      fTBuckets[idx]->setMultipleProducers(b);
    else if (elementMode == RID_MODE)
      rbuckets[idx]->setMultipleProducers(b);
    else
      buckets[idx]->setMultipleProducers(b);
  }
}
/** Inserts a single element into the bucket selected by its hash.
 *
 * The element type supplies the bytes to hash through getHashString()
 * (this is what conflicts with the standard meaning of "<<"; for the
 * currently-defined element types it would be the rid field only, not the
 * whole structure). In RID_MODE only e.first is stored.
 *
 * @param e the element to insert
 */
template <typename element_t>
void BucketDL<element_t>::insert(const element_t& e)
{
  uint64_t hashLength = fHashLen;
  const char* const key = e.getHashString(hashMode, &hashLength);
  const uint64_t target = hashFcn(key, hashLength) & bucketMask;

  if (elementMode != RID_MODE)
  {
    buckets[target]->insert(e);
    return;
  }

  RIDElementType ridOnly(e.first);
  rbuckets[target]->insert(ridOnly);
}
/** Inserts a single tuple into the tuple bucket selected by its hash.
 *
 * @param e the tuple to insert
 */
template <typename element_t>
void BucketDL<element_t>::insert(TupleType& e)
{
  uint64_t hashLength = fHashLen;
  const char* const key = e.getHashString(hashMode, &hashLength);
  const uint64_t target = hashFcn(key, hashLength) & bucketMask;

  fTBuckets[target]->insert(e);
}
/** Inserts a batch of tuples, hashing each tuple's `second` member over
 * fHashLen bytes to pick its bucket.
 *
 * When multiProducer is set the base-class lock is held for the whole batch
 * and released again if an insertion throws.
 *
 * @param v the tuples to insert
 */
template <typename element_t>
void BucketDL<element_t>::insert(std::vector<TupleType>& v)
{
  if (multiProducer)
    base::lock();

  try
  {
    for (std::vector<TupleType>::iterator cur = v.begin(), last = v.end(); cur != last; ++cur)
    {
      const uint64_t target = hashFcn(cur->second, fHashLen) & bucketMask;
      fTBuckets[target]->insert_nolock(*cur);
    }
  }
  catch (...)
  {
    // release the lock before letting the exception propagate
    if (multiProducer)
      base::unlock();

    throw;
  }

  if (multiProducer)
    base::unlock();
}
/** Inserts a batch of elements, each into the bucket selected by its hash.
 *
 * In RID_MODE only the `first` member of each element is stored. When
 * multiProducer is set the base-class lock is held for the whole batch and
 * released again if an insertion throws.
 *
 * @param v the elements to insert
 */
template <typename element_t>
void BucketDL<element_t>::insert(const std::vector<element_t>& v)
{
  // One length variable is shared by every getHashString() call in the batch.
  uint64_t hashLength = fHashLen;

  if (multiProducer)
    base::lock();

  try
  {
    typename std::vector<element_t>::const_iterator cur = v.begin();
    const typename std::vector<element_t>::const_iterator last = v.end();

    if (elementMode != RID_MODE)
    {
      for (; cur != last; ++cur)
      {
        const char* key = cur->getHashString(hashMode, &hashLength);
        buckets[hashFcn(key, hashLength) & bucketMask]->insert_nolock(*cur);
      }
    }
    else
    {
      RIDElementType ridOnly;

      for (; cur != last; ++cur)
      {
        const char* key = cur->getHashString(hashMode, &hashLength);
        ridOnly.first = cur->first;
        rbuckets[hashFcn(key, hashLength) & bucketMask]->insert_nolock(ridOnly);
      }
    }
  }
  catch (...)
  {
    // release the lock before letting the exception propagate
    if (multiProducer)
      base::unlock();

    throw;
  }

  if (multiProducer)
    base::unlock();
}
/** Inserts arrayCount elements from a contiguous array, each into the bucket
 * selected by its hash.
 *
 * In RID_MODE only the `first` member of each element is stored. When
 * multiProducer is set the base-class lock is held for the whole batch and
 * released again if an insertion throws.
 *
 * @param array      pointer to the first element
 * @param arrayCount number of elements to insert
 */
template <typename element_t>
void BucketDL<element_t>::insert(const element_t* array, uint64_t arrayCount)
{
  // One length variable is shared by every getHashString() call in the batch.
  uint64_t hashLength = fHashLen;

  if (multiProducer)
    base::lock();

  try
  {
    uint64_t pos = 0;

    if (elementMode != RID_MODE)
    {
      while (pos < arrayCount)
      {
        const element_t& item = array[pos];
        const char* key = item.getHashString(hashMode, &hashLength);
        buckets[hashFcn(key, hashLength) & bucketMask]->insert_nolock(item);
        ++pos;
      }
    }
    else
    {
      RIDElementType ridOnly;

      while (pos < arrayCount)
      {
        const element_t& item = array[pos];
        const char* key = item.getHashString(hashMode, &hashLength);
        ridOnly.first = item.first;
        rbuckets[hashFcn(key, hashLength) & bucketMask]->insert_nolock(ridOnly);
        ++pos;
      }
    }
  }
  catch (...)
  {
    // release the lock before letting the exception propagate
    if (multiProducer)
      base::unlock();

    throw;
  }

  if (multiProducer)
    base::unlock();
}
/** DataList-interface stub; a BucketDL is read one bucket at a time.
 *
 * @throws std::logic_error always; callers must use getIterator(bucket).
 */
template <typename element_t>
uint64_t BucketDL<element_t>::getIterator()
{
  // The message names the per-bucket overload with its declared parameter type.
  throw std::logic_error("don't call BucketDL::getIterator(), call getIterator(uint64_t)");
}
/** Signals end of input to every active bucket, then notifies the users of
 * the reuse control entry if one is attached and they have not been
 * notified yet.
 */
template <typename element_t>
void BucketDL<element_t>::endOfInput()
{
  if (typeid(element_t) != typeid(TupleType))
  {
    if (elementMode != RID_MODE)
    {
      for (uint64_t idx = 0; idx < numBuckets; ++idx)
        buckets[idx]->endOfInput();
    }
    else
    {
      for (uint64_t idx = 0; idx < numBuckets; ++idx)
        rbuckets[idx]->endOfInput();
    }
  }
  else
  {
    uint64_t savedBytes = 0;  // debug

    for (uint64_t idx = 0; idx < numBuckets; ++idx)
    {
      fTBuckets[idx]->endOfInput();
      savedBytes += fTBuckets[idx]->saveSize();
    }

    // std::cout << "bucketdl-" << this << " saveSize=" << savedBytes << std::endl;
  }

  if (fReuseControl != NULL && !fReuseControl->userNotified())
    fReuseControl->notifyUsers();
}
/** DataList-interface stub; a BucketDL is read one bucket at a time.
 *
 * @param it unused
 * @param e  unused
 * @throws std::logic_error always; callers must use next(bucket, it, e).
 */
template <typename element_t>
bool BucketDL<element_t>::next(uint64_t it, element_t* e)
{
  // The message names both overloads with their declared parameter types.
  throw std::logic_error(
      "don't call BucketDL::next(uint64_t, element_t*), call next(uint64_t, uint64_t, element_t*)");
}
/** Obtains a consumer iterator from the given bucket.
 *
 * @param bucket index of the bucket to read
 * @return the iterator id handed out by that bucket
 */
template <typename element_t>
uint64_t BucketDL<element_t>::getIterator(uint64_t bucket)
{
  if (typeid(element_t) != typeid(TupleType))
  {
    if (elementMode != RID_MODE)
      return buckets[bucket]->getIterator();

    return rbuckets[bucket]->getIterator();
  }

  return fTBuckets[bucket]->getIterator();
}
/** Fetches the next element of a bucket for the given iterator.
 *
 * In RID_MODE only e->first is filled in from the stored rid. When the
 * bucket reports no more data and a reuse control entry is attached, the
 * exhausted-iterator count is bumped under the base-class lock; once it
 * reaches numConsumers * numBuckets the entry is deregistered from the
 * BucketReuseManager.
 *
 * @param bucket index of the bucket to read
 * @param it     iterator id obtained from getIterator(bucket)
 * @param e      receives the element
 * @return the value returned by the bucket's own next()
 */
template <typename element_t>
bool BucketDL<element_t>::next(uint64_t bucket, uint64_t it, element_t* e)
{
  if (typeid(element_t) == typeid(TupleType))
    return fTBuckets[bucket]->next(it, reinterpret_cast<TupleType*>(e));

  bool more;

  if (elementMode != RID_MODE)
  {
    more = buckets[bucket]->next(it, e);
  }
  else
  {
    RIDElementType ridOnly;
    more = rbuckets[bucket]->next(it, &ridOnly);
    e->first = ridOnly.first;
  }

  if (more || fReuseControl == NULL)
    return more;

  // because not all the buckets are consumed at the same time,
  // getIterator(i) maybe called sequentially by one or more threads,
  // need to make sure all consumers are done with all the buckets
  base::lock();
  ++bucketDoneCount;

  if (bucketDoneCount == numConsumers * numBuckets)
    BucketReuseManager::instance()->userDeregister(fReuseControl);

  base::unlock();

  return more;
}
/** Reports the total size of one bucket.
 *
 * @param bucket index of the bucket to query
 * @return the bucket's totalSize()
 */
template <typename element_t>
uint64_t BucketDL<element_t>::size(uint64_t bucket)
{
  if (typeid(element_t) != typeid(TupleType))
  {
    if (elementMode != RID_MODE)
      return buckets[bucket]->totalSize();

    return rbuckets[bucket]->totalSize();
  }

  return fTBuckets[bucket]->totalSize();
}
/** Reports how many buckets this list holds.
 *
 * @return the bucket count given at construction
 */
template <typename element_t>
uint64_t BucketDL<element_t>::bucketCount()
{
  const uint64_t count = numBuckets;
  return count;
}
/** Sums totalSize() over all active buckets.
 *
 * @return the combined size of every bucket
 */
template <typename element_t>
uint64_t BucketDL<element_t>::totalSize()
{
  const bool tupleBuckets = (typeid(element_t) == typeid(TupleType));
  uint64_t total = 0;

  for (uint64_t idx = 0; idx < numBuckets; ++idx)
  {
    if (tupleBuckets)
      total += fTBuckets[idx]->totalSize();
    else if (elementMode == RID_MODE)
      total += rbuckets[idx]->totalSize();
    else
      total += buckets[idx]->totalSize();
  }

  return total;
}
/** Sets the mode passed to getHashString() on insertion.
 *
 * The element mode is switched to the same value; a caller that wants a
 * different element mode (filterstep, for example) has to call
 * setElementMode() explicitly afterwards.
 *
 * @param mode the new hash mode
 */
template <typename element_t>
void BucketDL<element_t>::setHashMode(uint64_t mode)
{
  this->hashMode = mode;
  this->setElementMode(mode);
}
/** Switches between rid-only and full-element storage.
 *
 * Does nothing for TupleType elements or when the mode is unchanged.
 * Otherwise the bucket array of the old mode is destroyed and a fresh array
 * for the new mode is created; each new bucket is built with the current
 * disk element sizes and inherits the multi-producer and trace flags.
 *
 * @param mode RID_MODE for rid-only buckets, any other value for full elements
 */
template <typename element_t>
void BucketDL<element_t>::setElementMode(uint64_t mode)
{
  if (typeid(element_t) == typeid(TupleType) || elementMode == mode)
    return;

  // Tear down the containers that belong to the mode being left.
  if (elementMode != RID_MODE)
  {
    for (uint64_t idx = 0; idx < numBuckets; ++idx)
      delete buckets[idx];

    delete[] buckets;
  }
  else
  {
    for (uint64_t idx = 0; idx < numBuckets; ++idx)
      delete rbuckets[idx];

    delete[] rbuckets;
  }

  elementMode = mode;

  // Build the containers for the mode being entered.
  if (elementMode != RID_MODE)
  {
    buckets = new WSDL<element_t>*[numBuckets];

    for (uint64_t idx = 0; idx < numBuckets; ++idx)
    {
      WSDL<element_t>*& slot = buckets[idx];
      slot = new WSDL<element_t>(numConsumers, maxElements, base::getDiskElemSize1st(),
                                 base::getDiskElemSize2nd(), fRm);
      slot->setMultipleProducers(multiProducer);
      slot->traceOn(fTraceOn);
    }
  }
  else
  {
    rbuckets = new WSDL<RIDElementType>*[numBuckets];

    for (uint64_t idx = 0; idx < numBuckets; ++idx)
    {
      WSDL<RIDElementType>*& slot = rbuckets[idx];
      slot = new WSDL<RIDElementType>(numConsumers, maxElements, base::getDiskElemSize1st(),
                                      base::getDiskElemSize2nd(), fRm);
      slot->setMultipleProducers(multiProducer);
      slot->traceOn(fTraceOn);
    }
  }

  // std::cout << "bucketdl-" << this << " setElementMode " << hashMode << std::endl;
}
//
// Reports how many temp files this bucket collection uses and how much
// space (in bytes) they take. Only buckets whose initial set count is
// greater than one contribute to the totals.
//
template <typename element_t>
void BucketDL<element_t>::totalFileCounts(uint64_t& numFiles, uint64_t& numBytes) const
{
  numBytes = 0;
  numFiles = 0;

  const bool tupleBuckets = (typeid(element_t) == typeid(TupleType));

  for (uint64_t idx = 0; idx < numBuckets; ++idx)
  {
    if (tupleBuckets)
    {
      const uint64_t sets = fTBuckets[idx]->initialSetCount();

      if (sets <= 1)
        continue;

      numFiles += fTBuckets[idx]->numberOfTempFiles();
      numBytes += fTBuckets[idx]->saveSize();
    }
    else if (elementMode == RID_MODE)
    {
      const uint64_t sets = rbuckets[idx]->initialSetCount();

      if (sets <= 1)
        continue;

      // std::cout << "BDL: bucket " << idx << " has " << sets <<
      // " sets" << std::endl;
      numFiles += rbuckets[idx]->numberOfTempFiles();
      numBytes += rbuckets[idx]->saveSize;
    }
    else
    {
      const uint64_t sets = buckets[idx]->initialSetCount();

      if (sets <= 1)
        continue;

      // std::cout << "BDL: bucket " << idx << " has " << sets <<
      // " sets" << std::endl;
      numFiles += buckets[idx]->numberOfTempFiles();
      numBytes += buckets[idx]->saveSize;
    }
  }
}
/** Sets the tuple element layout.
 *
 * Stores the rid and data sizes so that ridSize() and dataSize() report
 * them, sets the element length to their sum and, for TupleType elements,
 * forwards both sizes to every tuple bucket.
 *
 * @param ridSize  size of the rid part of an element
 * @param dataSize size of the data part of an element
 */
template <typename element_t>
void BucketDL<element_t>::elementLen(const uint64_t ridSize, const uint64_t dataSize)
{
  // Keep the individual sizes; without this the ridSize()/dataSize()
  // accessors have nothing meaningful to return.
  fRidSize = ridSize;
  fDataSize = dataSize;
  fElementLen = ridSize + dataSize;

  if (typeid(element_t) == typeid(TupleType))
  {
    for (uint64_t i = 0; i < numBuckets; i++)
      fTBuckets[i]->tupleSize(ridSize, dataSize);
  }
}
//
// Records the sizes used when elements are saved to disk.
// size1st is the size in bytes of element_t.first.
// size2nd is the size in bytes of element_t.second.
// For non-tuple elements the sizes are also passed on to every internal
// WSDL container and to the reuse control entry, if one is attached.
//
template <typename element_t>
void BucketDL<element_t>::setDiskElemSize(uint32_t size1st, uint32_t size2nd)
{
  base::fElemDiskFirstSize = size1st;
  base::fElemDiskSecondSize = size2nd;

  // @todo compress for tuplewsdl
  if (typeid(element_t) != typeid(TupleType))
  {
    for (uint64_t idx = 0; idx < numBuckets; ++idx)
    {
      if (elementMode == RID_MODE)
        rbuckets[idx]->setDiskElemSize(size1st, size2nd);
      else
        buckets[idx]->setDiskElemSize(size1st, size2nd);
    }

    if (fReuseControl != NULL)
    {
      fReuseControl->dataSize().first = size1st;
      fReuseControl->dataSize().second = size2nd;
    }
  }
}
/** Attaches a reuse control entry and hands each bucket its restore slot.
 *
 * Ignored for TupleType elements and for a NULL control pointer. The
 * entry's restore-info vector is resized to one slot per bucket, and bucket
 * i receives the name "<baseName>.<i>".
 *
 * @param control  the reuse control entry to attach
 * @param readonly passed through to every bucket's setReuseInfo()
 */
template <typename element_t>
void BucketDL<element_t>::reuseControl(BucketReuseControlEntry* control, bool readonly)
{
  // @todo reuse for tuplewsdl
  if (control == NULL || typeid(element_t) == typeid(TupleType))
    return;

  fReuseControl = control;

  std::vector<SetRestoreInfo>& restoreInfo = fReuseControl->restoreInfoVec();
  restoreInfo.resize(numBuckets);

  for (uint64_t idx = 0; idx < numBuckets; ++idx)
  {
    std::ostringstream nameBuilder;
    nameBuilder << control->baseName() << "." << idx;
    const std::string bucketName = nameBuilder.str();

    if (elementMode != RID_MODE)
      buckets[idx]->setReuseInfo(&(restoreInfo[idx]), bucketName.c_str(), readonly);
    else
      rbuckets[idx]->setReuseInfo(&(restoreInfo[idx]), bucketName.c_str(), readonly);
  }
}
/** Restores the buckets' set number and start positions.
 *
 * Each bucket is restored from its slot in the reuse control entry's
 * restore-info vector. Nothing happens for TupleType elements, or when no
 * reuse control entry has been attached (there is nothing to restore from).
 */
template <typename element_t>
void BucketDL<element_t>::restoreBucketInformation()
{
  if (typeid(element_t) == typeid(TupleType))
    return;

  // reuseControl(control, ...) ignores a NULL control, so the entry may
  // legitimately be absent; dereferencing it here would be undefined.
  if (fReuseControl == NULL)
    return;

  std::vector<SetRestoreInfo>& infoVec = fReuseControl->restoreInfoVec();

  if (elementMode == RID_MODE)
  {
    for (uint64_t i = 0; i < numBuckets; i++)
      rbuckets[i]->restoreSetForReuse(infoVec[i]);
  }
  else
  {
    for (uint64_t i = 0; i < numBuckets; i++)
      buckets[i]->restoreSetForReuse(infoVec[i]);
  }
}
/** Turns on disk I/O tracing and passes the flag to every active bucket. */
template <typename element_t>
void BucketDL<element_t>::enableDiskIoTrace()
{
  fTraceOn = true;

  const bool tupleBuckets = (typeid(element_t) == typeid(TupleType));

  for (uint64_t idx = 0; idx < numBuckets; ++idx)
  {
    if (tupleBuckets)
      fTBuckets[idx]->traceOn(fTraceOn);
    else if (elementMode == RID_MODE)
      rbuckets[idx]->traceOn(fTraceOn);
    else
      buckets[idx]->traceOn(fTraceOn);
  }
}
/** Adds up the recorded disk I/O durations of all buckets.
 *
 * @param w receives the accumulated write time in whole seconds
 * @param r receives the accumulated read time in whole seconds
 * @return true if at least one bucket has a disk I/O record
 */
template <typename element_t>
bool BucketDL<element_t>::totalDiskIoTime(uint64_t& w, uint64_t& r)
{
  boost::posix_time::time_duration writeSpan(0, 0, 0, 0);
  boost::posix_time::time_duration readSpan(0, 0, 0, 0);
  bool sawDiskIo = false;

  for (uint64_t idx = 0; idx < numBuckets; ++idx)
  {
    std::list<DiskIoInfo>& records = diskIoInfoList(idx);

    if (!records.empty())
      sawDiskIo = true;

    for (std::list<DiskIoInfo>::iterator rec = records.begin(); rec != records.end(); ++rec)
    {
      // each record counts toward either the write or the read total
      boost::posix_time::time_duration& span = (rec->fWrite == true) ? writeSpan : readSpan;
      span += rec->fEnd - rec->fStart;
    }
  }

  w = writeSpan.total_seconds();
  r = readSpan.total_seconds();

  return sawDiskIo;
}
/** Gives access to the disk I/O record list of one bucket.
 *
 * @param bucket index of the bucket to query
 * @return reference to that bucket's disk I/O list
 */
template <typename element_t>
std::list<DiskIoInfo>& BucketDL<element_t>::diskIoInfoList(uint64_t bucket)
{
  if (typeid(element_t) != typeid(TupleType))
  {
    if (elementMode != RID_MODE)
      return (buckets[bucket]->diskIoList());

    return (rbuckets[bucket]->diskIoList());
  }

  return (fTBuckets[bucket]->diskIoList());
}
} // namespace joblist