1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-09-08 10:32:09 +03:00

Fixes for cpimport primary node failover and locking issues.

This commit is contained in:
benthompson15
2020-04-23 05:08:59 -05:00
parent 1dff484c41
commit ee5ed53e77
3 changed files with 168 additions and 50 deletions

View File

@@ -1649,17 +1649,17 @@ void pingDeviceThread()
if (busy) if (busy)
break; break;
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport"); processManager.reinitProcessType("cpimport");
// halt the dbrm // halt the dbrm
oam.dbrmctl("halt"); oam.dbrmctl("halt");
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG); log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR); aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
//send notification //send notification
@@ -1724,6 +1724,36 @@ void pingDeviceThread()
//set query system state ready //set query system state ready
processManager.setQuerySystemState(true); processManager.setQuerySystemState(true);
// wait (up to 30 tries, 2s apart) until DMLProc is ACTIVE or FAILED
// disableModule is going to trigger a DMLProc restart; wait for it
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
goto break_case; goto break_case;
} }
} }
@@ -2016,6 +2046,7 @@ void pingDeviceThread()
log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL); log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
if ( amazon ) if ( amazon )
processManager.setSystemState(oam::FAILED); processManager.setSystemState(oam::FAILED);
else else
@@ -2024,6 +2055,35 @@ void pingDeviceThread()
//set query system state ready //set query system state ready
processManager.setQuerySystemState(true); processManager.setQuerySystemState(true);
// wait (up to 30 tries, 2s apart) until DMLProc is ACTIVE or FAILED
// disableModule is going to trigger a DMLProc restart; wait for it
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
//clear count //clear count
moduleInfoList[moduleName] = 0; moduleInfoList[moduleName] = 0;
} }
@@ -2085,17 +2145,22 @@ void pingDeviceThread()
Configuration config; Configuration config;
log.writeLog(__LINE__, "*** module is down: " + moduleName, LOG_TYPE_CRITICAL); log.writeLog(__LINE__, "*** module is down: " + moduleName, LOG_TYPE_CRITICAL);
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport"); processManager.reinitProcessType("cpimport");
// halt the dbrm // halt the dbrm
oam.dbrmctl("halt"); oam.dbrmctl("halt");
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG); log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
//set query system state not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
//call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
//send notification //send notification
oam.sendDeviceNotification(moduleName, MODULE_DOWN); oam.sendDeviceNotification(moduleName, MODULE_DOWN);
@@ -2149,6 +2214,36 @@ void pingDeviceThread()
//set query system state ready //set query system state ready
processManager.setQuerySystemState(true); processManager.setQuerySystemState(true);
// wait (up to 30 tries, 2s apart) until DMLProc is ACTIVE or FAILED
// disableModule is going to trigger a DMLProc restart; wait for it
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
break; break;
} }
} }
@@ -2381,10 +2476,6 @@ void pingDeviceThread()
} }
else else
{ {
processManager.distributeConfigFile("system");
processManager.reinitProcesses();
// non-amazon // non-amazon
//call dbrm control //call dbrm control
oam.dbrmctl("reload"); oam.dbrmctl("reload");

View File

@@ -3832,7 +3832,7 @@ int ProcessManager::disableModule(string target, bool manualFlag)
} }
void ProcessManager::reinitProcesses() void ProcessManager::reinitProcesses(std::string skipModule)
{ {
Oam oam; Oam oam;
@@ -3840,35 +3840,12 @@ void ProcessManager::reinitProcesses()
reinitProcessType("DBRMWorkerNode"); reinitProcessType("DBRMWorkerNode");
reinitProcessType("WriteEngineServer"); reinitProcessType("WriteEngineServer");
restartProcessType("ExeMgr"); restartProcessType("ExeMgr",skipModule);
sleep(1); sleep(1);
restartProcessType("DDLProc"); restartProcessType("DDLProc",skipModule);
sleep(1); sleep(1);
restartProcessType("DMLProc"); restartProcessType("DMLProc",skipModule);
sleep(1); sleep(3);
// waiting until dml are ACTIVE
while (true)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
else
break;
// wait some more
sleep(2);
}
log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG); log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG);
} }
@@ -4921,6 +4898,7 @@ int ProcessManager::reinitProcessType( std::string processName )
if ( systemprocessstatus.processstatus[i].ProcessName == "ServerMonitor" ) if ( systemprocessstatus.processstatus[i].ProcessName == "ServerMonitor" )
{ {
// found one, request reinit of it // found one, request reinit of it
log.writeLog(__LINE__, "reinitProcessType: cpimport" + systemprocessstatus.processstatus[i].Module, LOG_TYPE_DEBUG);
retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[i].Module, retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[i].Module,
"cpimport"); "cpimport");
log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG); log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
@@ -9967,6 +9945,10 @@ int ProcessManager::OAMParentModuleChange()
++retryCount; ++retryCount;
} }
//run save.brm script
//NOTE: it turns out this has to be done first, before autoMovePmDbroot below
processManager.saveBRM(false);
try try
{ {
oam.autoMovePmDbroot(downOAMParentName); oam.autoMovePmDbroot(downOAMParentName);
@@ -9976,10 +9958,6 @@ int ProcessManager::OAMParentModuleChange()
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR); log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
} }
//run save.brm script
//MCOL-3945 move saveBRM after autoMovePmDbroot as this will potentially mount the dbrm directory from dbroot1
processManager.saveBRM(true, false);
//distribute config file //distribute config file
distributeConfigFile("system"); distributeConfigFile("system");
@@ -10098,8 +10076,11 @@ int ProcessManager::OAMParentModuleChange()
status = startsystemthreadStatus; status = startsystemthreadStatus;
} }
reinitProcessType("cpimport");
// waiting until dml are ACTIVE // waiting until dml are ACTIVE
while (true) int retry = 0;
while (retry < 30)
{ {
ProcessStatus DMLprocessstatus; ProcessStatus DMLprocessstatus;
@@ -10123,6 +10104,7 @@ int ProcessManager::OAMParentModuleChange()
// wait some more // wait some more
sleep(2); sleep(2);
++retry;
} }
@@ -10226,6 +10208,35 @@ int ProcessManager::OAMParentModuleChange()
processManager.reinitProcesses(); processManager.reinitProcesses();
// wait (up to 30 tries, 2s apart) until DMLProc is ACTIVE or FAILED
retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
// clear alarm // clear alarm
aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR); aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR);
@@ -11151,7 +11162,15 @@ int ProcessManager::glusterAssign(std::string moduleName, std::string dbroot)
msg << dbroot; msg << dbroot;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 ); int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
int retry = 0;
// Retry up to 60 times, pausing 1s after each attempt, because in failover the node returning to service may not be listening yet
while(returnStatus != API_SUCCESS && retry < 60)
{
log.writeLog(__LINE__, "glusterAssign retrying...", LOG_TYPE_DEBUG);
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
sleep(1);
++retry;
}
if ( returnStatus == API_SUCCESS) if ( returnStatus == API_SUCCESS)
{ {
//log the success event //log the success event
@@ -11181,7 +11200,15 @@ int ProcessManager::glusterUnassign(std::string moduleName, std::string dbroot)
msg << dbroot; msg << dbroot;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 ); int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
int retry = 0;
// Retry up to 60 times, pausing 1s after each attempt, because in failover the node returning to service may not be listening yet
while(returnStatus != API_SUCCESS && retry < 60)
{
log.writeLog(__LINE__, "glusterUnassign retrying...", LOG_TYPE_DEBUG);
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
sleep(1);
++retry;
}
if ( returnStatus == API_SUCCESS) if ( returnStatus == API_SUCCESS)
{ {
//log the success event //log the success event

View File

@@ -302,7 +302,7 @@ public:
/** /**
*@brief reinit Processes trying to replace recycleProcess *@brief reinit Processes trying to replace recycleProcess
*/ */
void reinitProcesses(); void reinitProcesses(std::string skipModule = "none");
/** /**
*@brief recycle Processes *@brief recycle Processes
*/ */