
MCOL-5263 Add support to ROLLBACK when PP were restarted.

DMLProc starts a ROLLBACK when the SELECT part of an UPDATE fails because the EM facility in PP was restarted.
Unfortunately, this ROLLBACK gets stuck if EM/PP are not yet available.
DMLProc must have a timeout and retry while doing the ROLLBACK.
Denis Khalikov
2022-12-07 17:24:45 +03:00
parent e243a5332b
commit d61780cab1
8 changed files with 280 additions and 221 deletions
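
The fix boils down to a bounded timeout-and-retry around the rollback call. The following is a rough standalone illustration only, not the code in the diff below: rollbackWithRetry, RollbackFn, and the 10-try/3-second budget are assumptions made for the sketch.

#include <chrono>
#include <cstdio>
#include <functional>
#include <thread>

// Hypothetical stand-in for DMLPackageProcessor::rollBackTransaction():
// returns 0 on success and non-zero while EM/PP is unreachable.
using RollbackFn = std::function<int()>;

// Bounded retry with a fixed delay, instead of letting the ROLLBACK hang
// forever while PrimProc restarts.
int rollbackWithRetry(const RollbackFn& tryRollback, int maxTries = 10,
                      std::chrono::milliseconds delay = std::chrono::seconds(3))
{
  int rc = 0;
  for (int attempt = 0; attempt < maxTries; ++attempt)
  {
    rc = tryRollback();
    if (rc == 0)
      return 0;  // rollback went through
    std::this_thread::sleep_for(delay);  // give EM/PP time to come back up
  }
  return rc;  // timeout budget exhausted, surface the last error
}

int main()
{
  int calls = 0;
  // Simulated rollback that only succeeds on the third attempt.
  int rc = rollbackWithRetry([&calls] { return ++calls < 3 ? 1 : 0; },
                             10, std::chrono::milliseconds(10));
  std::printf("rollback rc=%d after %d call(s)\n", rc, calls);
  return rc;
}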

View File

@ -37,6 +37,8 @@
#include "we_ddlcommandclient.h"
#include "oamcache.h"
#include "liboamcpp.h"
#include "resourcemanager.h"
using namespace std;
using namespace WriteEngine;
using namespace dmlpackage;
@ -370,7 +372,7 @@ DMLPackageProcessor::DMLResult CommandPackageProcessor::processPackage(
int weRc = 0;
// version rollback, Bulkrollback
weRc = rollBackTransaction(uniqueId, txnid, fSessionID, errorMsg);
weRc = tryToRollBackTransaction(uniqueId, txnid, fSessionID, errorMsg);
if (weRc == 0)
{
@ -413,7 +415,7 @@ DMLPackageProcessor::DMLResult CommandPackageProcessor::processPackage(
{
std::string errorMsg("");
logging::logCommand(cpackage.get_SessionID(), txnid.id, "ROLLBACK;");
int weRc = rollBackTransaction(uniqueId, txnid, fSessionID, errorMsg);
int weRc = tryToRollBackTransaction(uniqueId, txnid, fSessionID, errorMsg);
if (weRc != 0)
{

View File

@ -266,6 +266,27 @@ int DMLPackageProcessor::commitTransaction(uint64_t uniqueId, BRM::TxnID txnID)
return rc;
}
// Tries to roll back the transaction; on a network error it retries one more time.
// MCOL-5263.
int32_t DMLPackageProcessor::tryToRollBackTransaction(uint64_t uniqueId, BRM::TxnID txnID, uint32_t sessionID,
string& errorMsg)
{
auto weRc = rollBackTransaction(uniqueId, txnID, sessionID, errorMsg);
if (weRc)
{
weRc = rollBackTransaction(uniqueId, txnID, sessionID, errorMsg);
if (weRc == 0)
{
// Set up the connection in WE with PS.
joblist::ResourceManager* rm = joblist::ResourceManager::instance(true);
joblist::DistributedEngineComm* fEc = joblist::DistributedEngineComm::instance(rm);
weRc = fEc->Setup();
}
}
return weRc;
}
int DMLPackageProcessor::rollBackTransaction(uint64_t uniqueId, BRM::TxnID txnID, uint32_t sessionID,
std::string& errorMsg)
{

View File

@ -249,6 +249,9 @@ class DMLPackageProcessor
EXPORT int rollBackTransaction(uint64_t uniqueId, BRM::TxnID txnID, uint32_t sessionID,
std::string& errorMsg);
EXPORT int32_t tryToRollBackTransaction(uint64_t uniqueId, BRM::TxnID txnID, uint32_t sessionID,
std::string& errorMsg);
EXPORT int rollBackBatchAutoOnTransaction(uint64_t uniqueId, BRM::TxnID txnID, uint32_t sessionID,
const uint32_t tableOid, std::string& errorMsg);
/**

View File

@ -221,7 +221,7 @@ DistributedEngineComm::~DistributedEngineComm()
fInstance = 0;
}
void DistributedEngineComm::Setup()
int32_t DistributedEngineComm::Setup()
{
// This is here to ensure that this function does not get invoked multiple times simultaneously.
boost::mutex::scoped_lock setupLock(fSetupMutex);
@ -309,10 +309,9 @@ void DistributedEngineComm::Setup()
"Could not connect to PMS" + std::to_string(connectionId) + ": " + ex.what(),
LOG_TYPE_ERROR);
if (newPmCount == 0)
{
writeToLog(__FILE__, __LINE__, "No more PMs to try to connect to", LOG_TYPE_ERROR);
break;
}
return 1;
}
catch (...)
{
@ -322,10 +321,8 @@ void DistributedEngineComm::Setup()
writeToLog(__FILE__, __LINE__, "Could not connect to PMS" + std::to_string(connectionId),
LOG_TYPE_ERROR);
if (newPmCount == 0)
{
writeToLog(__FILE__, __LINE__, "No more PMs to try to connect to", LOG_TYPE_ERROR);
break;
}
return 1;
}
}
@ -362,6 +359,7 @@ void DistributedEngineComm::Setup()
newLocks.clear();
newClients.clear();
return 0;
}
int DistributedEngineComm::Close()
@ -428,8 +426,8 @@ Error:
decltype(pmCount) originalPMCount = pmCount;
// Re-establish if a remote PM restarted.
std::this_thread::sleep_for(std::chrono::seconds(3));
Setup();
if (originalPMCount != pmCount)
auto rc = Setup();
if (rc || originalPMCount != pmCount)
{
ostringstream os;
os << "DEC: lost connection to " << client->addr2String();
@ -889,7 +887,7 @@ void DistributedEngineComm::setFlowControl(bool enabled, uint32_t uniqueID, boos
writeToClient(localConnectionId, msg);
}
void DistributedEngineComm::write(uint32_t senderID, const SBS& msg)
int32_t DistributedEngineComm::write(uint32_t senderID, const SBS& msg)
{
ISMPacketHeader* ism = (ISMPacketHeader*)msg->buf();
uint32_t dest;
@ -914,6 +912,7 @@ void DistributedEngineComm::write(uint32_t senderID, const SBS& msg)
entries in the config file point to unique PMs */
{
uint32_t localConnectionId = std::numeric_limits<uint32_t>::max();
int32_t rc = 0;
for (uint32_t i = 0; i < pmCount; ++i)
{
@ -922,21 +921,25 @@ void DistributedEngineComm::write(uint32_t senderID, const SBS& msg)
localConnectionId = i;
continue;
}
writeToClient(i, msg, senderID);
rc = writeToClient(i, msg, senderID);
if (rc)
return rc;
}
if (localConnectionId < fPmConnections.size())
writeToClient(localConnectionId, msg);
rc = writeToClient(localConnectionId, msg);
return rc;
}
return;
case BATCH_PRIMITIVE_RUN:
case DICT_TOKEN_BY_SCAN_COMPARE:
{
// for efficiency, writeToClient() grabs the interleaving factor for the caller,
// and decides the final connection index because it already grabs the
// caller's queue information
dest = ism->Interleave;
writeToClient(dest, msg, senderID, true);
break;
return writeToClient(dest, msg, senderID, true);
}
default: idbassert_s(0, "Unknown message type");
}
@ -946,6 +949,7 @@ void DistributedEngineComm::write(uint32_t senderID, const SBS& msg)
writeToLog(__FILE__, __LINE__, "No PrimProcs are running", LOG_TYPE_DEBUG);
throw IDBExcept(ERR_NO_PRIMPROC);
}
return 0;
}
void DistributedEngineComm::write(messageqcpp::ByteStream& msg, uint32_t connection)
@ -1135,16 +1139,14 @@ int DistributedEngineComm::writeToClient(size_t aPMIndex, const SBS& bs, uint32_
}
}
ClientList::value_type client = fPmConnections[connectionId];
try
{
ClientList::value_type client = fPmConnections[connectionId];
if (!client->isAvailable())
return 0;
std::lock_guard lk(*(fWlock[connectionId]));
client->write(bs, NULL, senderStats);
return 0;
}
catch (...)
{
@ -1163,7 +1165,30 @@ int DistributedEngineComm::writeToClient(size_t aPMIndex, const SBS& bs, uint32_
map_tok->second->queue.push(sbs);
}
int tries = 0;
// Try to set up the connection with PS; it may be that PS is still starting.
// MCOL-5263.
while (tries < 10 && Setup())
{
++tries;
std::this_thread::sleep_for(std::chrono::seconds(3));
}
lk.unlock();
if (tries == 10)
{
ostringstream os;
os << "DEC: lost connection to " << client->addr2String();
writeToLog(__FILE__, __LINE__, os.str(), LOG_TYPE_ERROR);
if (!fIsExeMgr)
abort();
else
throw runtime_error("DistributedEngineComm::write: Broken Pipe error");
}
// Connection was re-established.
return 1;
/*
// reconfig the connection array
ClientList tempConns;
@ -1195,8 +1220,8 @@ int DistributedEngineComm::writeToClient(size_t aPMIndex, const SBS& bs, uint32_
alarmItem.append(" PrimProc");
alarmMgr.sendAlarmReport(alarmItem.c_str(), oam::CONN_FAILURE, SET);
*/
throw runtime_error("DistributedEngineComm::write: Broken Pipe error");
}
return 0;
}
uint32_t DistributedEngineComm::size(uint32_t key)

View File

@ -146,7 +146,7 @@ class DistributedEngineComm
* Writes a primitive message to a primitive server. Msg needs to contain an ISMPacketHeader. The
* LBID is extracted from the ISMPacketHeader and used to determine the actual P/M to send to.
*/
EXPORT void write(uint32_t key, const messageqcpp::SBS& msg);
EXPORT int32_t write(uint32_t key, const messageqcpp::SBS& msg);
// EXPORT void throttledWrite(const messageqcpp::ByteStream& msg);
@ -188,7 +188,7 @@ class DistributedEngineComm
*/
EXPORT uint32_t size(uint32_t key);
EXPORT void Setup();
EXPORT int32_t Setup();
EXPORT void addDECEventListener(DECEventListener*);
EXPORT void removeDECEventListener(DECEventListener*);