1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-09-08 10:32:09 +03:00

Fixes for cpimport primary node failover and locking issues.

This commit is contained in:
benthompson15
2020-04-23 05:08:59 -05:00
parent 1dff484c41
commit ee5ed53e77
3 changed files with 168 additions and 50 deletions

View File

@@ -1649,17 +1649,17 @@ void pingDeviceThread()
if (busy)
break;
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport");
// halt the dbrm
oam.dbrmctl("halt");
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
//send notification
@@ -1724,6 +1724,36 @@ void pingDeviceThread()
//set query system state ready
processManager.setQuerySystemState(true);
// waiting until dml are ACTIVE
// disableModule is going to trigger DMLProc to restart wait for it
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
goto break_case;
}
}
@@ -2016,6 +2046,7 @@ void pingDeviceThread()
log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
if ( amazon )
processManager.setSystemState(oam::FAILED);
else
@@ -2024,6 +2055,35 @@ void pingDeviceThread()
//set query system state ready
processManager.setQuerySystemState(true);
// waiting until dml are ACTIVE
// disableModule is going to trigger DMLProc to restart wait for it
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
//clear count
moduleInfoList[moduleName] = 0;
}
@@ -2085,17 +2145,22 @@ void pingDeviceThread()
Configuration config;
log.writeLog(__LINE__, "*** module is down: " + moduleName, LOG_TYPE_CRITICAL);
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport");
// halt the dbrm
oam.dbrmctl("halt");
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
//call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
//send notification
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
@@ -2149,6 +2214,36 @@ void pingDeviceThread()
//set query system state ready
processManager.setQuerySystemState(true);
// waiting until dml are ACTIVE
// disableModule is going to trigger DMLProc to restart wait for it
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
break;
}
}
@@ -2381,10 +2476,6 @@ void pingDeviceThread()
}
else
{
processManager.distributeConfigFile("system");
processManager.reinitProcesses();
// non-amazon
//call dbrm control
oam.dbrmctl("reload");

View File

@@ -3832,7 +3832,7 @@ int ProcessManager::disableModule(string target, bool manualFlag)
}
void ProcessManager::reinitProcesses()
void ProcessManager::reinitProcesses(std::string skipModule)
{
Oam oam;
@@ -3840,35 +3840,12 @@ void ProcessManager::reinitProcesses()
reinitProcessType("DBRMWorkerNode");
reinitProcessType("WriteEngineServer");
restartProcessType("ExeMgr");
restartProcessType("ExeMgr",skipModule);
sleep(1);
restartProcessType("DDLProc");
restartProcessType("DDLProc",skipModule);
sleep(1);
restartProcessType("DMLProc");
sleep(1);
// waiting until dml are ACTIVE
while (true)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
else
break;
// wait some more
sleep(2);
}
restartProcessType("DMLProc",skipModule);
sleep(3);
log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG);
}
@@ -4921,6 +4898,7 @@ int ProcessManager::reinitProcessType( std::string processName )
if ( systemprocessstatus.processstatus[i].ProcessName == "ServerMonitor" )
{
// found one, request reinit of it
log.writeLog(__LINE__, "reinitProcessType: cpimport" + systemprocessstatus.processstatus[i].Module, LOG_TYPE_DEBUG);
retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[i].Module,
"cpimport");
log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
@@ -9967,6 +9945,10 @@ int ProcessManager::OAMParentModuleChange()
++retryCount;
}
//run save.brm script
//Nope turns out this has to be done first...
processManager.saveBRM(false);
try
{
oam.autoMovePmDbroot(downOAMParentName);
@@ -9976,10 +9958,6 @@ int ProcessManager::OAMParentModuleChange()
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
}
//run save.brm script
//MCOL-3945 move saveBRM after autoMovePmDbroot as this will potentially mount the dbrm directory from dbroot1
processManager.saveBRM(true, false);
//distribute config file
distributeConfigFile("system");
@@ -10098,8 +10076,11 @@ int ProcessManager::OAMParentModuleChange()
status = startsystemthreadStatus;
}
reinitProcessType("cpimport");
// waiting until dml are ACTIVE
while (true)
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
@@ -10123,6 +10104,7 @@ int ProcessManager::OAMParentModuleChange()
// wait some more
sleep(2);
++retry;
}
@@ -10226,6 +10208,35 @@ int ProcessManager::OAMParentModuleChange()
processManager.reinitProcesses();
// waiting until dml are ACTIVE
retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
// clear alarm
aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR);
@@ -11151,7 +11162,15 @@ int ProcessManager::glusterAssign(std::string moduleName, std::string dbroot)
msg << dbroot;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
int retry = 0;
// Try this for a minute because in failover the node returning to service may not be listening yet
while(returnStatus != API_SUCCESS && retry < 60)
{
log.writeLog(__LINE__, "glusterAssign retrying...", LOG_TYPE_DEBUG);
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
sleep(1);
++retry;
}
if ( returnStatus == API_SUCCESS)
{
//log the success event
@@ -11181,7 +11200,15 @@ int ProcessManager::glusterUnassign(std::string moduleName, std::string dbroot)
msg << dbroot;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
int retry = 0;
// Try this for a minute because in failover the node returning to service may not be listening yet
while(returnStatus != API_SUCCESS && retry < 60)
{
log.writeLog(__LINE__, "glusterUnassign retrying...", LOG_TYPE_DEBUG);
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
sleep(1);
++retry;
}
if ( returnStatus == API_SUCCESS)
{
//log the success event

View File

@@ -302,7 +302,7 @@ public:
/**
*@brief reinit Processes trying to replace recycleProcess
*/
void reinitProcesses();
void reinitProcesses(std::string skipModule = "none");
/**
*@brief recycle Processes
*/