diff --git a/dbcon/joblist/distributedenginecomm.cpp b/dbcon/joblist/distributedenginecomm.cpp index 1243cdcbe..17fc0d9c5 100644 --- a/dbcon/joblist/distributedenginecomm.cpp +++ b/dbcon/joblist/distributedenginecomm.cpp @@ -336,7 +336,7 @@ void DistributedEngineComm::Listen(boost::shared_ptr client, Error: // @bug 488 - error condition! push 0 length bs to messagequeuemap and // eventually let jobstep error out. - mutex::scoped_lock lk(fMlock); +/* mutex::scoped_lock lk(fMlock); //cout << "WARNING: DEC READ 0 LENGTH BS FROM " << client->otherEnd()<< endl; MessageQueueMap::iterator map_tok; @@ -370,7 +370,7 @@ Error: fPmConnections.swap(tempConns); pmCount = (pmCount == 0 ? 0 : pmCount - 1); //cout << "PMCOUNT=" << pmCount << endl; - +*/ // send alarm & log it ALARMManager alarmMgr; string alarmItem = client->addr2String(); @@ -861,7 +861,7 @@ int DistributedEngineComm::writeToClient(size_t index, const ByteStream& bs, uin { // @bug 488. error out under such condition instead of re-trying other connection, // by pushing 0 size bytestream to messagequeue and throw excpetion - SBS sbs; +/* SBS sbs; lk.lock(); //cout << "WARNING: DEC WRITE BROKEN PIPE. PMS index = " << index << endl; MessageQueueMap::iterator map_tok; @@ -894,7 +894,7 @@ int DistributedEngineComm::writeToClient(size_t index, const ByteStream& bs, uin fPmConnections.swap(tempConns); pmCount = (pmCount == 0 ? 0 : pmCount - 1); } - +*/ // send alarm ALARMManager alarmMgr; string alarmItem("UNKNOWN"); diff --git a/procmgr/main.cpp b/procmgr/main.cpp index b01641e70..228980f43 100644 --- a/procmgr/main.cpp +++ b/procmgr/main.cpp @@ -58,6 +58,7 @@ bool HDFS = false; string localHostName; string PMwithUM = "n"; string MySQLRep = "n"; +string DBRootStorageType = "internal"; // pushing the ACTIVE_ALARMS_FILE to all nodes every 10 seconds. const int ACTIVE_ALARMS_PUSHING_INTERVAL = 10; @@ -1365,6 +1366,9 @@ void pingDeviceThread() break; //set query system state not ready + BRM::DBRM dbrm; + dbrm.setSystemQueryReady(false); + processManager.setQuerySystemState(false); processManager.setSystemState(oam::BUSY_INIT); @@ -1380,19 +1384,19 @@ void pingDeviceThread() //send notification oam.sendDeviceNotification(config.moduleName(), MODULE_UP); - //set module to enable state - processManager.enableModule(moduleName, oam::AUTO_OFFLINE); - int status; - - // if pm, move dbroots back to pm - if ( ( moduleName.find("pm") == 0 && !amazon ) || + + // if shared pm, move dbroots back to pm + if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) || ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) || ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) { //restart to get the versionbuffer files closed so it can be unmounted processManager.restartProcessType("WriteEngineServer", moduleName); + //set module to enable state + processManager.enableModule(moduleName, oam::AUTO_OFFLINE); + downActiveOAMModule = false; int retry; for ( retry = 0 ; retry < 5 ; retry++ ) @@ -1484,6 +1488,9 @@ void pingDeviceThread() break; } } + else + //set module to enable state + processManager.enableModule(moduleName, oam::AUTO_OFFLINE); //restart module processes int retry = 0; @@ -1584,14 +1591,6 @@ void pingDeviceThread() continue; } - //call dbrm control, need to resume before start so the getdbrmfiles halt doesn't hang - oam.dbrmctl("reload"); - log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG); - - // resume the dbrm - oam.dbrmctl("resume"); - log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG); - // next, startmodule status = processManager.startModule(moduleName, oam::FORCEFUL, oam::AUTO_OFFLINE); if ( status == oam::API_SUCCESS ) @@ -1606,6 +1605,14 @@ void pingDeviceThread() if ( retry < ModuleProcMonWaitCount ) { // module successfully started + //call dbrm control, need to resume before start so the getdbrmfiles halt doesn't hang + oam.dbrmctl("reload"); + log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG); + + // resume the dbrm + oam.dbrmctl("resume"); + log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG); + //distribute config file processManager.distributeConfigFile("system"); sleep(1); @@ -1647,6 +1654,9 @@ void pingDeviceThread() processManager.restartProcessType("DMLProc", moduleName); } + //enable query stats + dbrm.setSystemQueryReady(true); + //set query system state ready processManager.setQuerySystemState(true); @@ -1664,8 +1674,9 @@ void pingDeviceThread() aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET); // if pm, move dbroots back to pm - if ( ( moduleName.find("pm") == 0 && !amazon ) || - ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ) { + if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) || + ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) || + ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) { //move dbroots to other modules try { log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG); @@ -1703,6 +1714,9 @@ void pingDeviceThread() else processManager.setSystemState(oam::ACTIVE); + //enable query stats + dbrm.setSystemQueryReady(true); + //set query system state ready processManager.setQuerySystemState(true); @@ -1741,8 +1755,13 @@ void pingDeviceThread() log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL); //set query system state not ready + BRM::DBRM dbrm; + dbrm.setSystemQueryReady(false); + processManager.setQuerySystemState(false); + processManager.setSystemState(oam::BUSY_INIT); + processManager.reinitProcessType("cpimport"); // halt the dbrm @@ -1771,25 +1790,24 @@ void pingDeviceThread() log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG); // if pm, move dbroots to other pms - if ( !amazon || - ( amazon && AmazonPMFailover == "y") ) { - if( moduleName.find("pm") == 0 ) { - try { - log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG); - oam.autoMovePmDbroot(moduleName); - log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG); - //distribute config file - processManager.distributeConfigFile("system"); - } - catch (exception& ex) - { - string error = ex.what(); - log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG); - } - catch(...) - { - log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR); - } + if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) || + ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) || + ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) { + try { + log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG); + oam.autoMovePmDbroot(moduleName); + log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG); + //distribute config file + processManager.distributeConfigFile("system"); + } + catch (exception& ex) + { + string error = ex.what(); + log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG); + } + catch(...) + { + log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR); } } @@ -1968,7 +1986,9 @@ void pingDeviceThread() processManager.removeModule(devicenetworklist, false); // if pm, move dbroots to other pms - if( moduleName.find("pm") == 0 ) { + if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) || + ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) || + ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) { try { log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG); oam.autoMovePmDbroot(moduleName); @@ -1990,6 +2010,9 @@ void pingDeviceThread() //set recycle process processManager.recycleProcess(moduleName); + //enable query stats + dbrm.setSystemQueryReady(true); + //set query system state ready processManager.setQuerySystemState(true); @@ -2004,6 +2027,9 @@ void pingDeviceThread() oam.dbrmctl("resume"); log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG); + //enable query stats + dbrm.setSystemQueryReady(true); + //set query system state ready processManager.setQuerySystemState(true); } @@ -2017,6 +2043,9 @@ void pingDeviceThread() //set recycle process processManager.recycleProcess(moduleName); + //enable query stats + dbrm.setSystemQueryReady(true); + //set query system state ready processManager.setQuerySystemState(true); diff --git a/procmgr/processmanager.cpp b/procmgr/processmanager.cpp index 8be2aeee5..44da73140 100644 --- a/procmgr/processmanager.cpp +++ b/procmgr/processmanager.cpp @@ -72,6 +72,7 @@ bool startsystemthreadRunning = false; string gdownActiveOAMModule; vector downModuleList; bool startFailOver = false; +extern string DBRootStorageType; string masterLogFile = oam::UnassignedName; string masterLogPos = oam::UnassignedName; @@ -2791,6 +2792,16 @@ void processMSG(messageqcpp::IOSocket* cfIos) log.writeLog(__LINE__, "MSG RECEIVED: Process Restarted on " + moduleName + "/" + processName); + //set query system states not ready + BRM::DBRM dbrm; + dbrm.setSystemQueryReady(false); + + processManager.setQuerySystemState(false); + + processManager.setSystemState(oam::BUSY_INIT); + + processManager.reinitProcessType("cpimport"); + //request reinit after Process is active for ( int i = 0; i < 600 ; i++ ) { try { @@ -2916,6 +2927,13 @@ void processMSG(messageqcpp::IOSocket* cfIos) break; } } + + //enable query stats + dbrm.setSystemQueryReady(true); + + processManager.setQuerySystemState(true); + + processManager.setSystemState(oam::ACTIVE); } break; @@ -8525,14 +8543,6 @@ int ProcessManager::switchParentOAMModule(std::string newActiveModuleName) log.writeLog(__LINE__, "switchParentOAMModule Function Started", LOG_TYPE_DEBUG); - string DBRootStorageType = "internal"; - { - try{ - oam.getSystemConfig("DBRootStorageType", DBRootStorageType); - } - catch(...) {} - } - if ( DBRootStorageType == "internal" && GlusterConfig == "n") { log.writeLog(__LINE__, "ERROR: DBRootStorageType = internal", LOG_TYPE_ERROR); pthread_mutex_unlock(&THREAD_LOCK); @@ -8818,15 +8828,6 @@ int ProcessManager::OAMParentModuleChange() log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR); } - // dbroot storage type, do different failover if internal - string DBRootStorageType = "internal"; - { - try{ - oam.getSystemConfig("DBRootStorageType", DBRootStorageType); - } - catch(...) {} - } - string cmdLine = "ping "; string cmdOption = " -c 1 -w 5 >> /dev/null"; string cmd; diff --git a/procmon/processmonitor.cpp b/procmon/processmonitor.cpp index 08a74d2ed..74f846929 100644 --- a/procmon/processmonitor.cpp +++ b/procmon/processmonitor.cpp @@ -595,7 +595,7 @@ void ProcessMonitor::processMessage(messageqcpp::ByteStream msg, messageqcpp::IO log.writeLog(__LINE__, "START: process already active " + processName); //Inform Process Manager that Process restart - processRestarted(processName); + //processRestarted(processName); ackMsg << (ByteStream::byte) ACK; ackMsg << (ByteStream::byte) START; @@ -694,7 +694,7 @@ void ProcessMonitor::processMessage(messageqcpp::ByteStream msg, messageqcpp::IO } //Inform Process Manager that Process restart - processRestarted(processName); + //processRestarted(processName); ackMsg << (ByteStream::byte) ACK; ackMsg << (ByteStream::byte) RESTART;