You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-09-08 10:32:09 +03:00
Fixes for cpimport primary node failover and locking issues.
This commit is contained in:
119
procmgr/main.cpp
119
procmgr/main.cpp
@@ -1649,17 +1649,17 @@ void pingDeviceThread()
|
||||
if (busy)
|
||||
break;
|
||||
|
||||
//set query system state not ready
|
||||
processManager.setQuerySystemState(false);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
processManager.reinitProcessType("cpimport");
|
||||
|
||||
// halt the dbrm
|
||||
oam.dbrmctl("halt");
|
||||
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
||||
|
||||
//set query system state not ready
|
||||
processManager.setQuerySystemState(false);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
|
||||
|
||||
//send notification
|
||||
@@ -1724,6 +1724,36 @@ void pingDeviceThread()
|
||||
//set query system state ready
|
||||
processManager.setQuerySystemState(true);
|
||||
|
||||
// wait until DMLProc is ACTIVE
|
||||
// disableModule is going to trigger DMLProc to restart; wait for it
|
||||
int retry = 0;
|
||||
while (retry < 30)
|
||||
{
|
||||
ProcessStatus DMLprocessstatus;
|
||||
|
||||
try
|
||||
{
|
||||
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||
}
|
||||
catch (exception& ex)
|
||||
{}
|
||||
catch (...)
|
||||
{}
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||
break;
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||
break;
|
||||
|
||||
// wait some more
|
||||
sleep(2);
|
||||
++retry;
|
||||
}
|
||||
|
||||
goto break_case;
|
||||
}
|
||||
}
|
||||
@@ -2016,6 +2046,7 @@ void pingDeviceThread()
|
||||
|
||||
log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
|
||||
|
||||
|
||||
if ( amazon )
|
||||
processManager.setSystemState(oam::FAILED);
|
||||
else
|
||||
@@ -2024,6 +2055,35 @@ void pingDeviceThread()
|
||||
//set query system state ready
|
||||
processManager.setQuerySystemState(true);
|
||||
|
||||
// wait until DMLProc is ACTIVE
|
||||
// disableModule is going to trigger DMLProc to restart; wait for it
|
||||
int retry = 0;
|
||||
while (retry < 30)
|
||||
{
|
||||
ProcessStatus DMLprocessstatus;
|
||||
|
||||
try
|
||||
{
|
||||
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||
}
|
||||
catch (exception& ex)
|
||||
{}
|
||||
catch (...)
|
||||
{}
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||
break;
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||
break;
|
||||
|
||||
// wait some more
|
||||
sleep(2);
|
||||
++retry;
|
||||
}
|
||||
//clear count
|
||||
moduleInfoList[moduleName] = 0;
|
||||
}
|
||||
@@ -2085,17 +2145,22 @@ void pingDeviceThread()
|
||||
Configuration config;
|
||||
log.writeLog(__LINE__, "*** module is down: " + moduleName, LOG_TYPE_CRITICAL);
|
||||
|
||||
//set query system state not ready
|
||||
processManager.setQuerySystemState(false);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
processManager.reinitProcessType("cpimport");
|
||||
|
||||
// halt the dbrm
|
||||
oam.dbrmctl("halt");
|
||||
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
||||
|
||||
//set query system state not ready
|
||||
processManager.setQuerySystemState(false);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
//call dbrm control
|
||||
oam.dbrmctl("reload");
|
||||
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
||||
|
||||
|
||||
//send notification
|
||||
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
|
||||
|
||||
@@ -2149,6 +2214,36 @@ void pingDeviceThread()
|
||||
//set query system state ready
|
||||
processManager.setQuerySystemState(true);
|
||||
|
||||
// wait until DMLProc is ACTIVE
|
||||
// disableModule is going to trigger DMLProc to restart; wait for it
|
||||
int retry = 0;
|
||||
while (retry < 30)
|
||||
{
|
||||
ProcessStatus DMLprocessstatus;
|
||||
|
||||
try
|
||||
{
|
||||
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||
}
|
||||
catch (exception& ex)
|
||||
{}
|
||||
catch (...)
|
||||
{}
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||
break;
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||
break;
|
||||
|
||||
// wait some more
|
||||
sleep(2);
|
||||
++retry;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -2381,10 +2476,6 @@ void pingDeviceThread()
|
||||
}
|
||||
else
|
||||
{
|
||||
processManager.distributeConfigFile("system");
|
||||
|
||||
processManager.reinitProcesses();
|
||||
|
||||
// non-amazon
|
||||
//call dbrm control
|
||||
oam.dbrmctl("reload");
|
||||
|
@@ -3832,7 +3832,7 @@ int ProcessManager::disableModule(string target, bool manualFlag)
|
||||
}
|
||||
|
||||
|
||||
void ProcessManager::reinitProcesses()
|
||||
void ProcessManager::reinitProcesses(std::string skipModule)
|
||||
{
|
||||
Oam oam;
|
||||
|
||||
@@ -3840,35 +3840,12 @@ void ProcessManager::reinitProcesses()
|
||||
|
||||
reinitProcessType("DBRMWorkerNode");
|
||||
reinitProcessType("WriteEngineServer");
|
||||
restartProcessType("ExeMgr");
|
||||
restartProcessType("ExeMgr",skipModule);
|
||||
sleep(1);
|
||||
restartProcessType("DDLProc");
|
||||
restartProcessType("DDLProc",skipModule);
|
||||
sleep(1);
|
||||
restartProcessType("DMLProc");
|
||||
sleep(1);
|
||||
|
||||
// waiting until dml are ACTIVE
|
||||
while (true)
|
||||
{
|
||||
ProcessStatus DMLprocessstatus;
|
||||
|
||||
try
|
||||
{
|
||||
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||
}
|
||||
catch (exception& ex)
|
||||
{}
|
||||
catch (...)
|
||||
{}
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||
else
|
||||
break;
|
||||
|
||||
// wait some more
|
||||
sleep(2);
|
||||
}
|
||||
restartProcessType("DMLProc",skipModule);
|
||||
sleep(3);
|
||||
|
||||
log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG);
|
||||
}
|
||||
@@ -4921,6 +4898,7 @@ int ProcessManager::reinitProcessType( std::string processName )
|
||||
if ( systemprocessstatus.processstatus[i].ProcessName == "ServerMonitor" )
|
||||
{
|
||||
// found one, request reinit of it
|
||||
log.writeLog(__LINE__, "reinitProcessType: cpimport" + systemprocessstatus.processstatus[i].Module, LOG_TYPE_DEBUG);
|
||||
retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[i].Module,
|
||||
"cpimport");
|
||||
log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
|
||||
@@ -9967,6 +9945,10 @@ int ProcessManager::OAMParentModuleChange()
|
||||
++retryCount;
|
||||
}
|
||||
|
||||
//run save.brm script
|
||||
//Nope turns out this has to be done first...
|
||||
|
||||
processManager.saveBRM(false);
|
||||
try
|
||||
{
|
||||
oam.autoMovePmDbroot(downOAMParentName);
|
||||
@@ -9976,10 +9958,6 @@ int ProcessManager::OAMParentModuleChange()
|
||||
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
||||
}
|
||||
|
||||
//run save.brm script
|
||||
//MCOL-3945 move saveBRM after autoMovePmDbroot as this will potentially mount the dbrm directory from dbroot1
|
||||
processManager.saveBRM(true, false);
|
||||
|
||||
//distribute config file
|
||||
distributeConfigFile("system");
|
||||
|
||||
@@ -10098,8 +10076,11 @@ int ProcessManager::OAMParentModuleChange()
|
||||
status = startsystemthreadStatus;
|
||||
}
|
||||
|
||||
reinitProcessType("cpimport");
|
||||
|
||||
// wait until DMLProc is ACTIVE
|
||||
while (true)
|
||||
int retry = 0;
|
||||
while (retry < 30)
|
||||
{
|
||||
ProcessStatus DMLprocessstatus;
|
||||
|
||||
@@ -10123,6 +10104,7 @@ int ProcessManager::OAMParentModuleChange()
|
||||
|
||||
// wait some more
|
||||
sleep(2);
|
||||
++retry;
|
||||
}
|
||||
|
||||
|
||||
@@ -10226,6 +10208,35 @@ int ProcessManager::OAMParentModuleChange()
|
||||
|
||||
processManager.reinitProcesses();
|
||||
|
||||
// wait until DMLProc is ACTIVE
|
||||
retry = 0;
|
||||
while (retry < 30)
|
||||
{
|
||||
ProcessStatus DMLprocessstatus;
|
||||
|
||||
try
|
||||
{
|
||||
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||
}
|
||||
catch (exception& ex)
|
||||
{}
|
||||
catch (...)
|
||||
{}
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||
break;
|
||||
|
||||
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||
break;
|
||||
|
||||
// wait some more
|
||||
sleep(2);
|
||||
++retry;
|
||||
}
|
||||
|
||||
// clear alarm
|
||||
aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR);
|
||||
|
||||
@@ -11151,7 +11162,15 @@ int ProcessManager::glusterAssign(std::string moduleName, std::string dbroot)
|
||||
msg << dbroot;
|
||||
|
||||
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||
|
||||
int retry = 0;
|
||||
// Try this for a minute because in failover the node returning to service may not be listening yet
|
||||
while(returnStatus != API_SUCCESS && retry < 60)
|
||||
{
|
||||
log.writeLog(__LINE__, "glusterAssign retrying...", LOG_TYPE_DEBUG);
|
||||
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||
sleep(1);
|
||||
++retry;
|
||||
}
|
||||
if ( returnStatus == API_SUCCESS)
|
||||
{
|
||||
//log the success event
|
||||
@@ -11181,7 +11200,15 @@ int ProcessManager::glusterUnassign(std::string moduleName, std::string dbroot)
|
||||
msg << dbroot;
|
||||
|
||||
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||
|
||||
int retry = 0;
|
||||
// Try this for a minute because in failover the node returning to service may not be listening yet
|
||||
while(returnStatus != API_SUCCESS && retry < 60)
|
||||
{
|
||||
log.writeLog(__LINE__, "glusterUnassign retrying...", LOG_TYPE_DEBUG);
|
||||
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||
sleep(1);
|
||||
++retry;
|
||||
}
|
||||
if ( returnStatus == API_SUCCESS)
|
||||
{
|
||||
//log the success event
|
||||
|
@@ -302,7 +302,7 @@ public:
|
||||
/**
|
||||
*@brief reinit Processes trying to replace recycleProcess
|
||||
*/
|
||||
void reinitProcesses();
|
||||
void reinitProcesses(std::string skipModule = "none");
|
||||
/**
|
||||
*@brief recycle Processes
|
||||
*/
|
||||
|
Reference in New Issue
Block a user