From 22dced568cd7be7c640a23602d401eb95425fe04 Mon Sep 17 00:00:00 2001 From: Ben Thompson Date: Wed, 8 Nov 2017 15:12:10 -0600 Subject: [PATCH 1/2] MCOL-976: Change how processes are restarted after losing or regaining contact with a down module. --- procmgr/main.cpp | 89 ++++++++++++++++++++++++++++++++++++-- procmgr/processmanager.cpp | 57 +++++------------------- 2 files changed, 95 insertions(+), 51 deletions(-) diff --git a/procmgr/main.cpp b/procmgr/main.cpp index f54ff09be..4b3fa1376 100644 --- a/procmgr/main.cpp +++ b/procmgr/main.cpp @@ -1635,9 +1635,16 @@ void pingDeviceThread() processManager.distributeConfigFile("system"); sleep(1); - // if a PM module was started successfully, restart ACTIVE ExeMgr(s) / mysqld + // if a PM module was started successfully, restart ACTIVE DBRM(s), ExeMgr(s) / mysqld if( moduleName.find("pm") == 0 ) { - processManager.restartProcessType("ExeMgr", moduleName); + processManager.restartProcessType("DBRMControllerNode", moduleName); + processManager.restartProcessType("DBRMWorkerNode"); + processManager.stopProcessType("DDLProc"); + processManager.stopProcessType("DMLProc"); + processManager.stopProcessType("ExeMgr"); + processManager.restartProcessType("PrimProc"); + sleep(1); + processManager.restartProcessType("ExeMgr"); } string moduleType = moduleName.substr(0,MAX_MODULE_TYPE_SIZE); @@ -1667,9 +1674,11 @@ void pingDeviceThread() // if a PM module was started successfully, DMLProc/DDLProc if( moduleName.find("pm") == 0 ) { - processManager.restartProcessType("DDLProc", moduleName); + processManager.restartProcessType("WriteEngineServer"); sleep(1); - processManager.restartProcessType("DMLProc", moduleName); + processManager.restartProcessType("DDLProc"); + sleep(1); + processManager.restartProcessType("DMLProc"); } //enable query stats @@ -1680,6 +1689,78 @@ void pingDeviceThread() processManager.setSystemState(oam::ACTIVE); + //reset standby module + string newStandbyModule = processManager.getStandbyModule(); + + //send message to start new Standby Process-Manager, if needed + if ( !newStandbyModule.empty() && newStandbyModule != "NONE") { + processManager.setStandbyModule(newStandbyModule); + } + else + { + Config* sysConfig = Config::makeConfig(); + + // clear Standby OAM Module + sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName); + sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr); + + //update Calpont Config table + try { + sysConfig->write(); + } + catch(...) + { + log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR); + } + } + + if ( moduletypeconfig.RunType == SIMPLEX ) { + //start SIMPLEX runtype processes on a SIMPLEX runtype module + string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE); + DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin(); + for( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++) + { + string launchModuleName = (*pt).DeviceName; + string launchModuletype = launchModuleName.substr(0,MAX_MODULE_TYPE_SIZE); + if ( moduletype != launchModuletype ) + continue; + + //skip if active pm module (local module) + if ( launchModuleName == config.moduleName() ) + continue; + + //check if module is active before starting any SIMPLEX STANDBY apps + try{ + int launchopState = oam::ACTIVE; + bool degraded; + oam.getModuleStatus(launchModuleName, launchopState, degraded); + + if (launchopState != oam::ACTIVE && launchopState != oam::STANDBY ) { + continue; + } + } + catch (exception& ex) + { +// string error = ex.what(); +// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR); + } + catch(...) + { +// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR); + } + + int status; + log.writeLog(__LINE__, "Starting up STANDBY process on module " + launchModuleName, LOG_TYPE_DEBUG); + for ( int j = 0 ; j < 20 ; j ++ ) + { + status = processManager.startModule(launchModuleName, oam::FORCEFUL, oam::AUTO_OFFLINE); + if ( status == API_SUCCESS) + break; + } + log.writeLog(__LINE__, "pingDeviceThread: ACK received from '" + launchModuleName + "' Process-Monitor, return status = " + oam.itoa(status), LOG_TYPE_DEBUG); + } + } + //clear count moduleInfoList[moduleName] = 0; } diff --git a/procmgr/processmanager.cpp b/procmgr/processmanager.cpp index 492dc9b05..c3cd330a9 100755 --- a/procmgr/processmanager.cpp +++ b/procmgr/processmanager.cpp @@ -3439,9 +3439,13 @@ void ProcessManager::recycleProcess(string module) //restart ExeMgrs/mysql if module is a pm if ( moduleType == "pm" ) { -// restartProcessType("DBRMWorkerNode"); -// restartProcessType("PrimProc"); -// restartProcessType("WriteEngineServer"); + restartProcessType("DBRMControllerNode", module); + restartProcessType("DBRMWorkerNode"); + stopProcessType("DDLProc"); + stopProcessType("DMLProc"); + stopProcessType("ExeMgr"); + restartProcessType("PrimProc"); + sleep(1); restartProcessType("ExeMgr"); restartProcessType("mysql"); } @@ -3457,54 +3461,13 @@ void ProcessManager::recycleProcess(string module) if( moduleType == "pm" && PrimaryUMModuleName != module) { -// restartProcessType("DBRMControllerNode", module); -// sleep(1); - reinitProcessType("DDLProc"); + restartProcessType("WriteEngineServer"); + sleep(1); + restartProcessType("DDLProc"); sleep(1); restartProcessType("DMLProc", module); } - //wait for DMLProc to go ACTIVE -/* uint16_t rtn = 0; - bool bfirst = true; - while (rtn == 0) - { - ProcessStatus DMLprocessstatus; - try { - oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus); - } - catch (exception& ex) - { -// string error = ex.what(); -// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR); - } - catch(...) - { -// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR); - } - - if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) { - if (bfirst) - { - log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback" , LOG_TYPE_INFO); - bfirst = false; - } - } - - if (DMLprocessstatus.ProcessOpState == oam::ACTIVE) { - rtn = oam::ACTIVE; - break; - } - - if (DMLprocessstatus.ProcessOpState == oam::FAILED) { - rtn = oam::FAILED; - break; - } - - // wait some more - sleep(2); - } -*/ return; } From e345d20f97314fa70e97c6708349bef3d2eae654 Mon Sep 17 00:00:00 2001 From: Ben Thompson Date: Mon, 13 Nov 2017 16:25:11 -0600 Subject: [PATCH 2/2] MCOL-976: Change to how restartProcessType restarts DDLProc and DMLProc. --- procmgr/processmanager.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/procmgr/processmanager.cpp b/procmgr/processmanager.cpp index c3cd330a9..7c147f9e4 100755 --- a/procmgr/processmanager.cpp +++ b/procmgr/processmanager.cpp @@ -3447,11 +3447,15 @@ void ProcessManager::recycleProcess(string module) restartProcessType("PrimProc"); sleep(1); restartProcessType("ExeMgr"); + sleep(1); restartProcessType("mysql"); } else + { + restartProcessType("DBRMControllerNode", module); + restartProcessType("DBRMWorkerNode"); restartProcessType("ExeMgr"); - + } if ( PrimaryUMModuleName == module ) { restartProcessType("DDLProc", module); @@ -4254,6 +4258,7 @@ int ProcessManager::restartProcessType( std::string processName, std::string ski SystemProcessStatus systemprocessstatus; ProcessStatus processstatus; int retStatus = API_SUCCESS; + bool setPMProcIPs = true; log.writeLog(__LINE__, "restartProcessType: Restart all " + processName, LOG_TYPE_DEBUG); @@ -4301,8 +4306,8 @@ int ProcessManager::restartProcessType( std::string processName, std::string ski ( systemprocessstatus.processstatus[i].ProcessOpState == oam::COLD_STANDBY && !manualFlag ) ) continue; - if( processName.find("DDLProc") == 0 || - processName.find("DMLProc") == 0 ) { + if ( (processName.find("DDLProc") == 0 || processName.find("DMLProc") == 0) && setPMProcIPs ) + { string procModuleType = systemprocessstatus.processstatus[i].Module.substr(0,MAX_MODULE_TYPE_SIZE); if ( procModuleType == "pm" && PMwithUM == "y" ) continue; @@ -4329,11 +4334,11 @@ int ProcessManager::restartProcessType( std::string processName, std::string ski // if DDL or DMLProc, change IP Address if ( retStatus == oam::API_SUCCESS ) { - if( processName.find("DDLProc") == 0 || - processName.find("DMLProc") == 0 ) { - + if ( (processName.find("DDLProc") == 0 || processName.find("DMLProc") == 0) && setPMProcIPs ) + { processManager.setPMProcIPs(systemprocessstatus.processstatus[i].Module, processName); - return retStatus; + setPMProcIPs = false; + continue; } } } @@ -8188,7 +8193,7 @@ int ProcessManager::setPMProcIPs( std::string moduleName, std::string processNam pthread_mutex_unlock(&THREAD_LOCK); - log.writeLog(__LINE__, "setPMProcIPs failed", LOG_TYPE_DEBUG); + //log.writeLog(__LINE__, "setPMProcIPs failed", LOG_TYPE_DEBUG); return API_SUCCESS;