You've already forked mariadb-columnstore-engine
Mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git (synced 2025-09-08 10:32:09 +03:00).
Fixes for cpimport primary node failover and locking issues.
This commit is contained in:
119
procmgr/main.cpp
119
procmgr/main.cpp
@@ -1649,17 +1649,17 @@ void pingDeviceThread()
|
|||||||
if (busy)
|
if (busy)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
//set query system state not ready
|
|
||||||
processManager.setQuerySystemState(false);
|
|
||||||
|
|
||||||
processManager.setSystemState(oam::BUSY_INIT);
|
|
||||||
|
|
||||||
processManager.reinitProcessType("cpimport");
|
processManager.reinitProcessType("cpimport");
|
||||||
|
|
||||||
// halt the dbrm
|
// halt the dbrm
|
||||||
oam.dbrmctl("halt");
|
oam.dbrmctl("halt");
|
||||||
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
||||||
|
|
||||||
|
//set query system state not ready
|
||||||
|
processManager.setQuerySystemState(false);
|
||||||
|
|
||||||
|
processManager.setSystemState(oam::BUSY_INIT);
|
||||||
|
|
||||||
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
|
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
|
||||||
|
|
||||||
//send notification
|
//send notification
|
||||||
@@ -1724,6 +1724,36 @@ void pingDeviceThread()
|
|||||||
//set query system state ready
|
//set query system state ready
|
||||||
processManager.setQuerySystemState(true);
|
processManager.setQuerySystemState(true);
|
||||||
|
|
||||||
|
// waiting until dml are ACTIVE
|
||||||
|
// disableModule is going to trigger DMLProc to restart wait for it
|
||||||
|
int retry = 0;
|
||||||
|
while (retry < 30)
|
||||||
|
{
|
||||||
|
ProcessStatus DMLprocessstatus;
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||||
|
}
|
||||||
|
catch (exception& ex)
|
||||||
|
{}
|
||||||
|
catch (...)
|
||||||
|
{}
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||||
|
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// wait some more
|
||||||
|
sleep(2);
|
||||||
|
++retry;
|
||||||
|
}
|
||||||
|
|
||||||
goto break_case;
|
goto break_case;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2016,6 +2046,7 @@ void pingDeviceThread()
|
|||||||
|
|
||||||
log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
|
log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
|
||||||
|
|
||||||
|
|
||||||
if ( amazon )
|
if ( amazon )
|
||||||
processManager.setSystemState(oam::FAILED);
|
processManager.setSystemState(oam::FAILED);
|
||||||
else
|
else
|
||||||
@@ -2024,6 +2055,35 @@ void pingDeviceThread()
|
|||||||
//set query system state ready
|
//set query system state ready
|
||||||
processManager.setQuerySystemState(true);
|
processManager.setQuerySystemState(true);
|
||||||
|
|
||||||
|
// waiting until dml are ACTIVE
|
||||||
|
// disableModule is going to trigger DMLProc to restart wait for it
|
||||||
|
int retry = 0;
|
||||||
|
while (retry < 30)
|
||||||
|
{
|
||||||
|
ProcessStatus DMLprocessstatus;
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||||
|
}
|
||||||
|
catch (exception& ex)
|
||||||
|
{}
|
||||||
|
catch (...)
|
||||||
|
{}
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||||
|
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// wait some more
|
||||||
|
sleep(2);
|
||||||
|
++retry;
|
||||||
|
}
|
||||||
//clear count
|
//clear count
|
||||||
moduleInfoList[moduleName] = 0;
|
moduleInfoList[moduleName] = 0;
|
||||||
}
|
}
|
||||||
@@ -2085,17 +2145,22 @@ void pingDeviceThread()
|
|||||||
Configuration config;
|
Configuration config;
|
||||||
log.writeLog(__LINE__, "*** module is down: " + moduleName, LOG_TYPE_CRITICAL);
|
log.writeLog(__LINE__, "*** module is down: " + moduleName, LOG_TYPE_CRITICAL);
|
||||||
|
|
||||||
//set query system state not ready
|
|
||||||
processManager.setQuerySystemState(false);
|
|
||||||
|
|
||||||
processManager.setSystemState(oam::BUSY_INIT);
|
|
||||||
|
|
||||||
processManager.reinitProcessType("cpimport");
|
processManager.reinitProcessType("cpimport");
|
||||||
|
|
||||||
// halt the dbrm
|
// halt the dbrm
|
||||||
oam.dbrmctl("halt");
|
oam.dbrmctl("halt");
|
||||||
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
||||||
|
|
||||||
|
//set query system state not ready
|
||||||
|
processManager.setQuerySystemState(false);
|
||||||
|
|
||||||
|
processManager.setSystemState(oam::BUSY_INIT);
|
||||||
|
|
||||||
|
//call dbrm control
|
||||||
|
oam.dbrmctl("reload");
|
||||||
|
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
||||||
|
|
||||||
|
|
||||||
//send notification
|
//send notification
|
||||||
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
|
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
|
||||||
|
|
||||||
@@ -2149,6 +2214,36 @@ void pingDeviceThread()
|
|||||||
//set query system state ready
|
//set query system state ready
|
||||||
processManager.setQuerySystemState(true);
|
processManager.setQuerySystemState(true);
|
||||||
|
|
||||||
|
// waiting until dml are ACTIVE
|
||||||
|
// disableModule is going to trigger DMLProc to restart wait for it
|
||||||
|
int retry = 0;
|
||||||
|
while (retry < 30)
|
||||||
|
{
|
||||||
|
ProcessStatus DMLprocessstatus;
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||||
|
}
|
||||||
|
catch (exception& ex)
|
||||||
|
{}
|
||||||
|
catch (...)
|
||||||
|
{}
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||||
|
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// wait some more
|
||||||
|
sleep(2);
|
||||||
|
++retry;
|
||||||
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2381,10 +2476,6 @@ void pingDeviceThread()
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
processManager.distributeConfigFile("system");
|
|
||||||
|
|
||||||
processManager.reinitProcesses();
|
|
||||||
|
|
||||||
// non-amazon
|
// non-amazon
|
||||||
//call dbrm control
|
//call dbrm control
|
||||||
oam.dbrmctl("reload");
|
oam.dbrmctl("reload");
|
||||||
|
@@ -3832,7 +3832,7 @@ int ProcessManager::disableModule(string target, bool manualFlag)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void ProcessManager::reinitProcesses()
|
void ProcessManager::reinitProcesses(std::string skipModule)
|
||||||
{
|
{
|
||||||
Oam oam;
|
Oam oam;
|
||||||
|
|
||||||
@@ -3840,35 +3840,12 @@ void ProcessManager::reinitProcesses()
|
|||||||
|
|
||||||
reinitProcessType("DBRMWorkerNode");
|
reinitProcessType("DBRMWorkerNode");
|
||||||
reinitProcessType("WriteEngineServer");
|
reinitProcessType("WriteEngineServer");
|
||||||
restartProcessType("ExeMgr");
|
restartProcessType("ExeMgr",skipModule);
|
||||||
sleep(1);
|
sleep(1);
|
||||||
restartProcessType("DDLProc");
|
restartProcessType("DDLProc",skipModule);
|
||||||
sleep(1);
|
sleep(1);
|
||||||
restartProcessType("DMLProc");
|
restartProcessType("DMLProc",skipModule);
|
||||||
sleep(1);
|
sleep(3);
|
||||||
|
|
||||||
// waiting until dml are ACTIVE
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
ProcessStatus DMLprocessstatus;
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
|
||||||
}
|
|
||||||
catch (exception& ex)
|
|
||||||
{}
|
|
||||||
catch (...)
|
|
||||||
{}
|
|
||||||
|
|
||||||
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
|
||||||
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
|
|
||||||
// wait some more
|
|
||||||
sleep(2);
|
|
||||||
}
|
|
||||||
|
|
||||||
log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG);
|
log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG);
|
||||||
}
|
}
|
||||||
@@ -4921,6 +4898,7 @@ int ProcessManager::reinitProcessType( std::string processName )
|
|||||||
if ( systemprocessstatus.processstatus[i].ProcessName == "ServerMonitor" )
|
if ( systemprocessstatus.processstatus[i].ProcessName == "ServerMonitor" )
|
||||||
{
|
{
|
||||||
// found one, request reinit of it
|
// found one, request reinit of it
|
||||||
|
log.writeLog(__LINE__, "reinitProcessType: cpimport" + systemprocessstatus.processstatus[i].Module, LOG_TYPE_DEBUG);
|
||||||
retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[i].Module,
|
retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[i].Module,
|
||||||
"cpimport");
|
"cpimport");
|
||||||
log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
|
log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
|
||||||
@@ -9967,6 +9945,10 @@ int ProcessManager::OAMParentModuleChange()
|
|||||||
++retryCount;
|
++retryCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//run save.brm script
|
||||||
|
//Nope turns out this has to be done first...
|
||||||
|
|
||||||
|
processManager.saveBRM(false);
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
oam.autoMovePmDbroot(downOAMParentName);
|
oam.autoMovePmDbroot(downOAMParentName);
|
||||||
@@ -9976,10 +9958,6 @@ int ProcessManager::OAMParentModuleChange()
|
|||||||
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
||||||
}
|
}
|
||||||
|
|
||||||
//run save.brm script
|
|
||||||
//MCOL-3945 move saveBRM after autoMovePmDbroot as this will potentially mount the dbrm directory from dbroot1
|
|
||||||
processManager.saveBRM(true, false);
|
|
||||||
|
|
||||||
//distribute config file
|
//distribute config file
|
||||||
distributeConfigFile("system");
|
distributeConfigFile("system");
|
||||||
|
|
||||||
@@ -10098,8 +10076,11 @@ int ProcessManager::OAMParentModuleChange()
|
|||||||
status = startsystemthreadStatus;
|
status = startsystemthreadStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
reinitProcessType("cpimport");
|
||||||
|
|
||||||
// waiting until dml are ACTIVE
|
// waiting until dml are ACTIVE
|
||||||
while (true)
|
int retry = 0;
|
||||||
|
while (retry < 30)
|
||||||
{
|
{
|
||||||
ProcessStatus DMLprocessstatus;
|
ProcessStatus DMLprocessstatus;
|
||||||
|
|
||||||
@@ -10123,6 +10104,7 @@ int ProcessManager::OAMParentModuleChange()
|
|||||||
|
|
||||||
// wait some more
|
// wait some more
|
||||||
sleep(2);
|
sleep(2);
|
||||||
|
++retry;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -10226,6 +10208,35 @@ int ProcessManager::OAMParentModuleChange()
|
|||||||
|
|
||||||
processManager.reinitProcesses();
|
processManager.reinitProcesses();
|
||||||
|
|
||||||
|
// waiting until dml are ACTIVE
|
||||||
|
retry = 0;
|
||||||
|
while (retry < 30)
|
||||||
|
{
|
||||||
|
ProcessStatus DMLprocessstatus;
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
|
||||||
|
}
|
||||||
|
catch (exception& ex)
|
||||||
|
{}
|
||||||
|
catch (...)
|
||||||
|
{}
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
|
||||||
|
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// wait some more
|
||||||
|
sleep(2);
|
||||||
|
++retry;
|
||||||
|
}
|
||||||
|
|
||||||
// clear alarm
|
// clear alarm
|
||||||
aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR);
|
aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR);
|
||||||
|
|
||||||
@@ -11151,7 +11162,15 @@ int ProcessManager::glusterAssign(std::string moduleName, std::string dbroot)
|
|||||||
msg << dbroot;
|
msg << dbroot;
|
||||||
|
|
||||||
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||||
|
int retry = 0;
|
||||||
|
// Try this for a minute because in failover the node returning to service may not be listening yet
|
||||||
|
while(returnStatus != API_SUCCESS && retry < 60)
|
||||||
|
{
|
||||||
|
log.writeLog(__LINE__, "glusterAssign retrying...", LOG_TYPE_DEBUG);
|
||||||
|
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||||
|
sleep(1);
|
||||||
|
++retry;
|
||||||
|
}
|
||||||
if ( returnStatus == API_SUCCESS)
|
if ( returnStatus == API_SUCCESS)
|
||||||
{
|
{
|
||||||
//log the success event
|
//log the success event
|
||||||
@@ -11181,7 +11200,15 @@ int ProcessManager::glusterUnassign(std::string moduleName, std::string dbroot)
|
|||||||
msg << dbroot;
|
msg << dbroot;
|
||||||
|
|
||||||
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||||
|
int retry = 0;
|
||||||
|
// Try this for a minute because in failover the node returning to service may not be listening yet
|
||||||
|
while(returnStatus != API_SUCCESS && retry < 60)
|
||||||
|
{
|
||||||
|
log.writeLog(__LINE__, "glusterUnassign retrying...", LOG_TYPE_DEBUG);
|
||||||
|
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
|
||||||
|
sleep(1);
|
||||||
|
++retry;
|
||||||
|
}
|
||||||
if ( returnStatus == API_SUCCESS)
|
if ( returnStatus == API_SUCCESS)
|
||||||
{
|
{
|
||||||
//log the success event
|
//log the success event
|
||||||
|
@@ -302,7 +302,7 @@ public:
|
|||||||
/**
|
/**
|
||||||
*@brief reinit Processes trying to replace recycleProcess
|
*@brief reinit Processes trying to replace recycleProcess
|
||||||
*/
|
*/
|
||||||
void reinitProcesses();
|
void reinitProcesses(std::string skipModule = "none");
|
||||||
/**
|
/**
|
||||||
*@brief recycle Processes
|
*@brief recycle Processes
|
||||||
*/
|
*/
|
||||||
|
Reference in New Issue
Block a user