diff --git a/procmgr/main.cpp b/procmgr/main.cpp index 6aae4cfb6..7de3a3d9f 100644 --- a/procmgr/main.cpp +++ b/procmgr/main.cpp @@ -1649,17 +1649,17 @@ void pingDeviceThread() if (busy) break; - //set query system state not ready - processManager.setQuerySystemState(false); - - processManager.setSystemState(oam::BUSY_INIT); - processManager.reinitProcessType("cpimport"); // halt the dbrm oam.dbrmctl("halt"); log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG); + //set query system state not ready + processManager.setQuerySystemState(false); + + processManager.setSystemState(oam::BUSY_INIT); + aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR); //send notification @@ -1724,6 +1724,36 @@ void pingDeviceThread() //set query system state ready processManager.setQuerySystemState(true); + // poll DMLProc (at most 30 times, sleep(2) between polls) until it reports ACTIVE or FAILED; getProcessStatus exceptions are ignored + // disableModule is going to trigger a DMLProc restart, so wait for it here + int retry = 0; + while (retry < 30) + { + ProcessStatus DMLprocessstatus; + + try + { + oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus); + } + catch (exception& ex) + {} + catch (...) 
+ {} + + if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) + log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG); + + if (DMLprocessstatus.ProcessOpState == oam::ACTIVE) + break; + + if (DMLprocessstatus.ProcessOpState == oam::FAILED) + break; + + // wait some more + sleep(2); + ++retry; + } + goto break_case; } } @@ -2016,6 +2046,7 @@ void pingDeviceThread() log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL); + if ( amazon ) processManager.setSystemState(oam::FAILED); else @@ -2024,6 +2055,35 @@ void pingDeviceThread() //set query system state ready processManager.setQuerySystemState(true); + // poll DMLProc (at most 30 times, sleep(2) between polls) until it reports ACTIVE or FAILED; getProcessStatus exceptions are ignored + // disableModule is going to trigger a DMLProc restart, so wait for it here + int retry = 0; + while (retry < 30) + { + ProcessStatus DMLprocessstatus; + + try + { + oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus); + } + catch (exception& ex) + {} + catch (...) + {} + + if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) + log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG); + + if (DMLprocessstatus.ProcessOpState == oam::ACTIVE) + break; + + if (DMLprocessstatus.ProcessOpState == oam::FAILED) + break; + + // wait some more + sleep(2); + ++retry; + } //clear count moduleInfoList[moduleName] = 0; } @@ -2085,17 +2145,22 @@ void pingDeviceThread() Configuration config; log.writeLog(__LINE__, "*** module is down: " + moduleName, LOG_TYPE_CRITICAL); - //set query system state not ready - processManager.setQuerySystemState(false); - - processManager.setSystemState(oam::BUSY_INIT); - processManager.reinitProcessType("cpimport"); // halt the dbrm oam.dbrmctl("halt"); log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG); + //set query system state not ready + processManager.setQuerySystemState(false); + + processManager.setSystemState(oam::BUSY_INIT); + + //call dbrm control + oam.dbrmctl("reload"); + log.writeLog(__LINE__, 
"'dbrmctl reload' done", LOG_TYPE_DEBUG); + + //send notification oam.sendDeviceNotification(moduleName, MODULE_DOWN); @@ -2149,6 +2214,36 @@ void pingDeviceThread() //set query system state ready processManager.setQuerySystemState(true); + // poll DMLProc (at most 30 times, sleep(2) between polls) until it reports ACTIVE or FAILED; getProcessStatus exceptions are ignored + // disableModule is going to trigger a DMLProc restart, so wait for it here + int retry = 0; + while (retry < 30) + { + ProcessStatus DMLprocessstatus; + + try + { + oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus); + } + catch (exception& ex) + {} + catch (...) + {} + + if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) + log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG); + + if (DMLprocessstatus.ProcessOpState == oam::ACTIVE) + break; + + if (DMLprocessstatus.ProcessOpState == oam::FAILED) + break; + + // wait some more + sleep(2); + ++retry; + } + break; } } @@ -2381,10 +2476,6 @@ void pingDeviceThread() } else { - processManager.distributeConfigFile("system"); - - processManager.reinitProcesses(); - // non-amazon //call dbrm control oam.dbrmctl("reload"); diff --git a/procmgr/processmanager.cpp b/procmgr/processmanager.cpp index 35d3a6a0a..4dd6ac9f9 100644 --- a/procmgr/processmanager.cpp +++ b/procmgr/processmanager.cpp @@ -3832,7 +3832,7 @@ int ProcessManager::disableModule(string target, bool manualFlag) } -void ProcessManager::reinitProcesses() +void ProcessManager::reinitProcesses(std::string skipModule) { Oam oam; @@ -3840,35 +3840,12 @@ void ProcessManager::reinitProcesses() reinitProcessType("DBRMWorkerNode"); reinitProcessType("WriteEngineServer"); - restartProcessType("ExeMgr"); + restartProcessType("ExeMgr",skipModule); sleep(1); - restartProcessType("DDLProc"); + restartProcessType("DDLProc",skipModule); sleep(1); - restartProcessType("DMLProc"); - sleep(1); - - // waiting until dml are ACTIVE - while (true) - { - ProcessStatus DMLprocessstatus; - - try - { - oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus); - 
 } - catch (exception& ex) - {} - catch (...) - {} - - if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) - log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG); - else - break; - - // wait some more - sleep(2); - } + restartProcessType("DMLProc",skipModule); + sleep(3); log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG); } @@ -4921,6 +4898,7 @@ int ProcessManager::reinitProcessType( std::string processName ) if ( systemprocessstatus.processstatus[i].ProcessName == "ServerMonitor" ) { // found one, request reinit of it + log.writeLog(__LINE__, "reinitProcessType: cpimport" + systemprocessstatus.processstatus[i].Module, LOG_TYPE_DEBUG); retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[i].Module, "cpimport"); log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG); @@ -9967,6 +9945,10 @@ int ProcessManager::OAMParentModuleChange() ++retryCount; } + //run save.brm script + //NOTE: saveBRM has to run before autoMovePmDbroot; this undoes the MCOL-3945 reordering that had placed it after 
+ + processManager.saveBRM(false); try { oam.autoMovePmDbroot(downOAMParentName); @@ -9976,10 +9958,6 @@ int ProcessManager::OAMParentModuleChange() log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR); } - //run save.brm script - //MCOL-3945 move saveBRM after autoMovePmDbroot as this will potentially mount the dbrm directory from dbroot1 - processManager.saveBRM(true, false); - //distribute config file distributeConfigFile("system"); @@ -10098,8 +10076,11 @@ int ProcessManager::OAMParentModuleChange() status = startsystemthreadStatus; } + reinitProcessType("cpimport"); + // waiting until dml are ACTIVE - while (true) + int retry = 0; + while (retry < 30) { ProcessStatus DMLprocessstatus; @@ -10123,6 +10104,7 @@ int ProcessManager::OAMParentModuleChange() // wait some more sleep(2); + ++retry; } @@ -10226,6 +10208,35 @@ int ProcessManager::OAMParentModuleChange() processManager.reinitProcesses(); + // poll DMLProc again (at most 30 times, sleep(2) between polls) until it reports ACTIVE or FAILED; getProcessStatus exceptions are ignored + retry = 0; + while (retry < 30) + { + ProcessStatus DMLprocessstatus; + + try + { + oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus); + } + catch (exception& ex) + {} + catch (...) 
+ {} + + if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) + log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG); + + if (DMLprocessstatus.ProcessOpState == oam::ACTIVE) + break; + + if (DMLprocessstatus.ProcessOpState == oam::FAILED) + break; + + // wait some more + sleep(2); + ++retry; + } + // clear alarm aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR); @@ -11151,7 +11162,15 @@ int ProcessManager::glusterAssign(std::string moduleName, std::string dbroot) msg << dbroot; int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 ); - + int retry = 0; + // Retry up to 60 times, with sleep(1) after each resend, because in failover the node returning to service may not be listening yet. NOTE(review): each resend also passes 30 (presumably a timeout) so this can run well past a minute - confirm + while(returnStatus != API_SUCCESS && retry < 60) + { + log.writeLog(__LINE__, "glusterAssign retrying...", LOG_TYPE_DEBUG); + returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 ); + sleep(1); + ++retry; + } if ( returnStatus == API_SUCCESS) { //log the success event @@ -11181,7 +11200,15 @@ int ProcessManager::glusterUnassign(std::string moduleName, std::string dbroot) msg << dbroot; int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 ); - + int retry = 0; + // Retry up to 60 times, with sleep(1) after each resend, because in failover the node returning to service may not be listening yet. NOTE(review): each resend also passes 30 (presumably a timeout) so this can run well past a minute - confirm + while(returnStatus != API_SUCCESS && retry < 60) + { + log.writeLog(__LINE__, "glusterUnassign retrying...", LOG_TYPE_DEBUG); + returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 ); + sleep(1); + ++retry; + } if ( returnStatus == API_SUCCESS) { //log the success event diff --git a/procmgr/processmanager.h b/procmgr/processmanager.h index 038b8ab40..76e473488 100644 --- a/procmgr/processmanager.h +++ b/procmgr/processmanager.h @@ -302,7 +302,7 @@ public: /** *@brief reinit Processes trying to replace recycleProcess */ - void reinitProcesses(); + void reinitProcesses(std::string skipModule = "none"); /** *@brief recycle Processes */