
MCOL-5166 This patch adds support for in-memory communication between EM and PP via a shared queue in the DEC class

JobList low-level code related to primitive jobs now passes shared pointers instead of ByteStream references when talking to DEC,
because same-node EM-PP communication now goes over a queue in DEC instead of a network hop.
PP now has a separate thread that processes primitive job messages from that DEC queue.
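The DEC-side queue itself is not part of this diff, but the mechanism described above can be sketched as a small blocking queue of shared ByteStream pointers (SBS) guarded by a mutex and a condition variable; addDataToOutput() and readLocalQueueMessagesOrWait() in the diff below play roughly the producer and consumer roles. The LocalPrimitiveQueue class and the SBS stand-in type below are illustrative assumptions, not the actual DEC implementation.

// Illustrative sketch only: a minimal blocking queue of shared message buffers,
// approximating the same-host EM<->PP path described above. The real DEC class,
// its SBS type, and its method names may differ.
#include <condition_variable>
#include <cstdint>
#include <deque>
#include <memory>
#include <mutex>
#include <vector>

using SBS = std::shared_ptr<std::vector<uint8_t>>;  // stand-in for a shared ByteStream

class LocalPrimitiveQueue
{
 public:
  // Producer side: EM-facing code enqueues a primitive message (cf. addDataToOutput()).
  void push(SBS msg)
  {
    {
      std::lock_guard<std::mutex> lk(mtx_);
      queue_.push_back(std::move(msg));
    }
    cv_.notify_one();
  }

  // Consumer side: the PP thread drains everything queued so far, blocking while
  // the queue is empty (cf. readLocalQueueMessagesOrWait()).
  std::vector<SBS>& readAllOrWait(std::vector<SBS>& out)
  {
    std::unique_lock<std::mutex> lk(mtx_);
    cv_.wait(lk, [this] { return !queue_.empty(); });
    out.assign(queue_.begin(), queue_.end());
    queue_.clear();
    return out;
  }

 private:
  std::mutex mtx_;
  std::condition_variable cv_;
  std::deque<SBS> queue_;
};

Passing SBS by value keeps a single heap-allocated message alive for both the enqueueing and the draining side, which is why the JobList code described above switches from ByteStream references to shared pointers.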
This commit is contained in:
Roman Nozdrin
2022-07-21 19:37:18 +00:00
committed by Leonid Fedorov
parent 9ef16c6ded
commit a9d8924683
11 changed files with 418 additions and 232 deletions


@@ -21,13 +21,16 @@
*
*
***********************************************************************/
#define _FILE_OFFSET_BITS 64
#define _LARGEFILE64_SOURCE
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <mutex>
#include <stdexcept>
//#define NDEBUG
#include <cassert>
#include <boost/thread.hpp>
@@ -49,7 +52,8 @@ using namespace std;
#include <boost/scoped_array.hpp>
#include <boost/thread.hpp>
using namespace boost;
#include "distributedenginecomm.h"
#include "serviceexemgr.h"
#include "primproc.h"
#include "primitiveserver.h"
#include "primitivemsg.h"
@@ -1055,7 +1059,7 @@ class DictScanJob : public threadpool::FairThreadPool::Functor
DictScanJob(SP_UM_IOSOCK ios, SBS bs, SP_UM_MUTEX writeLock);
virtual ~DictScanJob();
void write(const ByteStream&);
void write(const SBS&);
int operator()();
void catchHandler(const std::string& ex, uint32_t id, uint16_t code = logging::primitiveServerErr);
void sendErrorMsg(uint32_t id, uint16_t code);
@@ -1077,17 +1081,27 @@ DictScanJob::~DictScanJob()
{
}
void DictScanJob::write(const ByteStream& bs)
void DictScanJob::write(const SBS& sbs)
{
// This is the fast path for local EM-to-PM interaction: PM puts the result
// directly into EM's input DEC queue.
// A null fIos has 'same-host connection' semantics here.
if (!fIos)
{
auto* exeMgrDecPtr = exemgr::globServiceExeMgr->getDec();
exeMgrDecPtr->addDataToOutput(sbs);
return;
}
boost::mutex::scoped_lock lk(*fWriteLock);
fIos->write(bs);
fIos->write(*sbs);
}
int DictScanJob::operator()()
{
utils::setThreadName("PPDictScanJob");
uint8_t data[DATA_BLOCK_SIZE];
uint32_t output_buf_size = MAX_BUFFER_SIZE;
// Reducing this buffer size may cause unrelated issues in DictScanStep.
const uint32_t output_buf_size = MAX_BUFFER_SIZE;
uint32_t session;
uint32_t uniqueId = 0;
bool wasBlockInCache = false;
@@ -1095,7 +1109,6 @@ int DictScanJob::operator()()
uint16_t runCount;
boost::shared_ptr<DictEqualityFilter> eqFilter;
ByteStream results(output_buf_size);
TokenByScanRequestHeader* cmd;
PrimitiveProcessor pproc(gDebugLevel);
TokenByScanResultHeader* output;
@@ -1114,7 +1127,6 @@ int DictScanJob::operator()()
session = cmd->Hdr.SessionID;
uniqueId = cmd->Hdr.UniqueID;
runCount = cmd->Count;
output = (TokenByScanResultHeader*)results.getInputPtr();
#ifdef VALGRIND
memset(output, 0, sizeof(TokenByScanResultHeader));
#endif
@@ -1145,6 +1157,9 @@ int DictScanJob::operator()()
for (uint16_t i = 0; i < runCount; ++i)
{
SBS results(new ByteStream(output_buf_size));
output = (TokenByScanResultHeader*)results->getInputPtr();
loadBlock(cmd->LBID, verInfo, cmd->Hdr.TransactionID, cmd->CompType, data, &wasBlockInCache,
&blocksRead, fLBIDTraceOn, session);
pproc.setBlockPtr((int*)data);
@@ -1155,9 +1170,8 @@ int DictScanJob::operator()()
else
output->PhysicalIO += blocksRead;
results.advanceInputPtr(output->NBYTES);
results->advanceInputPtr(output->NBYTES);
write(results);
results.restart();
cmd->LBID++;
}
@@ -1199,9 +1213,9 @@ void DictScanJob::sendErrorMsg(uint32_t id, uint16_t code)
ism.Status = code;
ph.UniqueID = id;
ByteStream msg(sizeof(ISMPacketHeader) + sizeof(PrimitiveHeader));
msg.append((uint8_t*)&ism, sizeof(ism));
msg.append((uint8_t*)&ph, sizeof(ph));
SBS msg(new ByteStream(sizeof(ISMPacketHeader) + sizeof(PrimitiveHeader)));
msg->append((uint8_t*)&ism, sizeof(ism));
msg->append((uint8_t*)&ph, sizeof(ph));
write(msg);
}
@@ -1940,6 +1954,128 @@ struct ReadThread
ios->write(buildCacheOpResp(0));
}
static void dispatchPrimitive(SBS sbs, boost::shared_ptr<BPPHandler>& fBPPHandler,
boost::shared_ptr<threadpool::FairThreadPool>& procPoolPtr,
SP_UM_IOSOCK& outIos, SP_UM_MUTEX& writeLock, const uint32_t processorThreads,
const bool ptTrace)
{
const ISMPacketHeader* ismHdr = reinterpret_cast<const ISMPacketHeader*>(sbs->buf());
switch (ismHdr->Command)
{
case DICT_CREATE_EQUALITY_FILTER:
case DICT_DESTROY_EQUALITY_FILTER:
case BATCH_PRIMITIVE_CREATE:
case BATCH_PRIMITIVE_ADD_JOINER:
case BATCH_PRIMITIVE_END_JOINER:
case BATCH_PRIMITIVE_DESTROY:
case BATCH_PRIMITIVE_ABORT:
{
const uint8_t* buf = sbs->buf();
uint32_t pos = sizeof(ISMPacketHeader) - 2;
const uint32_t txnId = *((uint32_t*)&buf[pos + 2]);
const uint32_t stepID = *((uint32_t*)&buf[pos + 6]);
const uint32_t uniqueID = *((uint32_t*)&buf[pos + 10]);
const uint32_t weight = 1;
const uint32_t priority = 0;
uint32_t id = 0;
boost::shared_ptr<FairThreadPool::Functor> functor;
if (ismHdr->Command == DICT_CREATE_EQUALITY_FILTER)
{
functor.reset(new CreateEqualityFilter(sbs));
}
else if (ismHdr->Command == DICT_DESTROY_EQUALITY_FILTER)
{
functor.reset(new DestroyEqualityFilter(sbs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_CREATE)
{
functor.reset(new BPPHandler::Create(fBPPHandler, sbs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_ADD_JOINER)
{
functor.reset(new BPPHandler::AddJoiner(fBPPHandler, sbs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_END_JOINER)
{
id = fBPPHandler->getUniqueID(sbs, ismHdr->Command);
functor.reset(new BPPHandler::LastJoiner(fBPPHandler, sbs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_DESTROY)
{
id = fBPPHandler->getUniqueID(sbs, ismHdr->Command);
functor.reset(new BPPHandler::Destroy(fBPPHandler, sbs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_ABORT)
{
id = fBPPHandler->getUniqueID(sbs, ismHdr->Command);
functor.reset(new BPPHandler::Abort(fBPPHandler, sbs));
}
FairThreadPool::Job job(uniqueID, stepID, txnId, functor, weight, priority, id);
procPoolPtr->addJob(job);
break;
}
case DICT_TOKEN_BY_SCAN_COMPARE:
case BATCH_PRIMITIVE_RUN:
{
TokenByScanRequestHeader* hdr = nullptr;
boost::shared_ptr<FairThreadPool::Functor> functor;
uint32_t id = 0;
uint32_t weight = 0;
uint32_t priority = 0;
uint32_t txnId = 0;
uint32_t stepID = 0;
uint32_t uniqueID = 0;
if (ismHdr->Command == DICT_TOKEN_BY_SCAN_COMPARE)
{
idbassert(sbs->length() >= sizeof(TokenByScanRequestHeader));
hdr = (TokenByScanRequestHeader*)ismHdr;
functor.reset(new DictScanJob(outIos, sbs, writeLock));
id = hdr->Hdr.UniqueID;
weight = LOGICAL_BLOCK_RIDS;
priority = hdr->Hdr.Priority;
const uint8_t* buf = sbs->buf();
const uint32_t pos = sizeof(ISMPacketHeader) - 2;
txnId = *((uint32_t*)&buf[pos + 2]);
stepID = *((uint32_t*)&buf[pos + 6]);
uniqueID = *((uint32_t*)&buf[pos + 10]);
}
else if (ismHdr->Command == BATCH_PRIMITIVE_RUN)
{
functor.reset(new BPPSeeder(sbs, writeLock, outIos, processorThreads, ptTrace));
BPPSeeder* bpps = dynamic_cast<BPPSeeder*>(functor.get());
id = bpps->getID();
priority = bpps->priority();
const uint8_t* buf = sbs->buf();
const uint32_t pos = sizeof(ISMPacketHeader) - 2;
txnId = *((uint32_t*)&buf[pos + 2]);
stepID = *((uint32_t*)&buf[pos + 6]);
uniqueID = *((uint32_t*)&buf[pos + 10]);
weight = ismHdr->Size + *((uint32_t*)&buf[pos + 18]);
}
FairThreadPool::Job job(uniqueID, stepID, txnId, functor, outIos, weight, priority, id);
procPoolPtr->addJob(job);
break;
}
case BATCH_PRIMITIVE_ACK:
{
fBPPHandler->doAck(*sbs);
break;
}
default:
{
std::ostringstream os;
Logger log;
os << "unknown primitive cmd: " << ismHdr->Command;
log.logMessage(os.str());
break;
}
} // the switch stmt
}
void operator()()
{
utils::setThreadName("PPReadThread");
@@ -1994,9 +2130,6 @@ struct ReadThread
idbassert(bs->length() >= sizeof(ISMPacketHeader));
const ISMPacketHeader* ismHdr = reinterpret_cast<const ISMPacketHeader*>(bs->buf());
// uint64_t someVal = ismHdr->Command;
// std::cout << " PP read thread Command " << someVal << std::endl;
/* This switch is for the OOB commands */
switch (ismHdr->Command)
{
@@ -2037,139 +2170,8 @@ struct ReadThread
default: break;
}
switch (ismHdr->Command)
{
case DICT_CREATE_EQUALITY_FILTER:
case DICT_DESTROY_EQUALITY_FILTER:
case BATCH_PRIMITIVE_CREATE:
case BATCH_PRIMITIVE_ADD_JOINER:
case BATCH_PRIMITIVE_END_JOINER:
case BATCH_PRIMITIVE_DESTROY:
case BATCH_PRIMITIVE_ABORT:
{
const uint8_t* buf = bs->buf();
uint32_t pos = sizeof(ISMPacketHeader) - 2;
const uint32_t txnId = *((uint32_t*)&buf[pos + 2]);
const uint32_t stepID = *((uint32_t*)&buf[pos + 6]);
const uint32_t uniqueID = *((uint32_t*)&buf[pos + 10]);
const uint32_t weight = 1;
const uint32_t priority = 0;
uint32_t id = 0;
boost::shared_ptr<FairThreadPool::Functor> functor;
if (ismHdr->Command == DICT_CREATE_EQUALITY_FILTER)
{
functor.reset(new CreateEqualityFilter(bs));
}
else if (ismHdr->Command == DICT_DESTROY_EQUALITY_FILTER)
{
functor.reset(new DestroyEqualityFilter(bs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_CREATE)
{
functor.reset(new BPPHandler::Create(fBPPHandler, bs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_ADD_JOINER)
{
functor.reset(new BPPHandler::AddJoiner(fBPPHandler, bs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_END_JOINER)
{
id = fBPPHandler->getUniqueID(bs, ismHdr->Command);
functor.reset(new BPPHandler::LastJoiner(fBPPHandler, bs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_DESTROY)
{
id = fBPPHandler->getUniqueID(bs, ismHdr->Command);
functor.reset(new BPPHandler::Destroy(fBPPHandler, bs));
}
else if (ismHdr->Command == BATCH_PRIMITIVE_ABORT)
{
id = fBPPHandler->getUniqueID(bs, ismHdr->Command);
functor.reset(new BPPHandler::Abort(fBPPHandler, bs));
}
FairThreadPool::Job job(uniqueID, stepID, txnId, functor, outIos, weight, priority, id);
procPoolPtr->addJob(job);
break;
}
case DICT_TOKEN_BY_SCAN_COMPARE:
case BATCH_PRIMITIVE_RUN:
{
TokenByScanRequestHeader* hdr = nullptr;
boost::shared_ptr<FairThreadPool::Functor> functor;
uint32_t id = 0;
uint32_t weight = 0;
uint32_t priority = 0;
uint32_t txnId = 0;
uint32_t stepID = 0;
uint32_t uniqueID = 0;
if (bRotateDest)
{
if (!pUmSocketSelector->nextIOSocket(fIos, outIos, writeLock))
{
// If we ever fall into this part of the
// code we have a "bug" of some sort.
// See handleUmSockSelErr() for more info.
// We reset ios and mutex to defaults.
handleUmSockSelErr(string("default cmd"));
outIos = outIosDefault;
writeLock = writeLockDefault;
pUmSocketSelector->delConnection(fIos);
bRotateDest = false;
}
}
if (ismHdr->Command == DICT_TOKEN_BY_SCAN_COMPARE)
{
idbassert(bs->length() >= sizeof(TokenByScanRequestHeader));
hdr = (TokenByScanRequestHeader*)ismHdr;
functor.reset(new DictScanJob(outIos, bs, writeLock));
id = hdr->Hdr.UniqueID;
weight = LOGICAL_BLOCK_RIDS;
priority = hdr->Hdr.Priority;
const uint8_t* buf = bs->buf();
const uint32_t pos = sizeof(ISMPacketHeader) - 2;
txnId = *((uint32_t*)&buf[pos + 2]);
stepID = *((uint32_t*)&buf[pos + 6]);
uniqueID = *((uint32_t*)&buf[pos + 10]);
}
else if (ismHdr->Command == BATCH_PRIMITIVE_RUN)
{
functor.reset(new BPPSeeder(bs, writeLock, outIos,
fPrimitiveServerPtr->ProcessorThreads(),
fPrimitiveServerPtr->PTTrace()));
BPPSeeder* bpps = dynamic_cast<BPPSeeder*>(functor.get());
id = bpps->getID();
priority = bpps->priority();
const uint8_t* buf = bs->buf();
const uint32_t pos = sizeof(ISMPacketHeader) - 2;
txnId = *((uint32_t*)&buf[pos + 2]);
stepID = *((uint32_t*)&buf[pos + 6]);
uniqueID = *((uint32_t*)&buf[pos + 10]);
weight = ismHdr->Size + *((uint32_t*)&buf[pos + 18]);
}
FairThreadPool::Job job(uniqueID, stepID, txnId, functor, outIos, weight, priority, id);
procPoolPtr->addJob(job);
break;
}
case BATCH_PRIMITIVE_ACK:
{
fBPPHandler->doAck(*bs);
break;
}
default:
{
std::ostringstream os;
Logger log;
os << "unknown primitive cmd: " << ismHdr->Command;
log.logMessage(os.str());
break;
}
} // the switch stmt
dispatchPrimitive(bs, fBPPHandler, procPoolPtr, outIos, writeLock,
fPrimitiveServerPtr->ProcessorThreads(), fPrimitiveServerPtr->PTTrace());
}
else // bs.length() == 0
{
@@ -2213,8 +2215,6 @@ struct ReadThread
boost::shared_ptr<BPPHandler> fBPPHandler;
};
/** @brief accept a primitive command from the user module
*/
struct ServerThread
{
ServerThread(string serverName, PrimitiveServer* ps) : fServerName(serverName), fPrimitiveServerPtr(ps)
@@ -2351,9 +2351,43 @@ void PrimitiveServer::start(Service* service, utils::USpaceSpinLock& startupRaceLock)
fServerpool.invoke(ServerThread(oss.str(), this));
}
startupRaceLock.release();
service->NotifyServiceStarted();
std::thread sameHostServerThread(
[this]()
{
utils::setThreadName("PPSHServerThr");
auto* exeMgrDecPtr = exemgr::globServiceExeMgr->getDec();
while (!exeMgrDecPtr)
{
sleep(1);
exeMgrDecPtr = exemgr::globServiceExeMgr->getDec();
}
// These empty SPs have "same-host" messaging semantics.
SP_UM_IOSOCK outIos(nullptr);
SP_UM_MUTEX writeLock(nullptr);
auto procPoolPtr = this->getProcessorThreadPool();
boost::shared_ptr<BPPHandler> fBPPHandler(new BPPHandler(this));
for (;;)
{
joblist::DistributedEngineComm::SBSVector primitiveMsgs;
for (auto& sbs : exeMgrDecPtr->readLocalQueueMessagesOrWait(primitiveMsgs))
{
if (sbs->length() == 0)
{
std::cout << "PPSHServerThr got an empty ByteStream." << std::endl;
continue;
}
idbassert(sbs->length() >= sizeof(ISMPacketHeader));
ReadThread::dispatchPrimitive(sbs, fBPPHandler, procPoolPtr, outIos, writeLock,
this->ProcessorThreads(), this->PTTrace());
}
}
});
fServerpool.wait();
cerr << "PrimitiveServer::start() exiting!" << endl;