From 482047679aff4f37356fc7aefff4185935e6be4b Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 23 Aug 2016 16:50:56 -0500 Subject: [PATCH] MCOL-259 add some retry logic to the OAMCache system. Add that degraded is still valid for a PM. --- dbcon/joblist/pdictionaryscan.cpp | 5 +-- dbcon/joblist/tuple-bps.cpp | 5 +-- oam/oamcpp/liboamcpp.cpp | 1 + oam/oamcpp/liboamcpp.h | 53 ++++++++++++++++--------------- oam/oamcpp/oamcache.cpp | 36 +++++++++++++++------ procmon/processmonitor.cpp | 2 +- 6 files changed, 61 insertions(+), 41 deletions(-) diff --git a/dbcon/joblist/pdictionaryscan.cpp b/dbcon/joblist/pdictionaryscan.cpp index 25fabdcde..34c958ecf 100644 --- a/dbcon/joblist/pdictionaryscan.cpp +++ b/dbcon/joblist/pdictionaryscan.cpp @@ -367,12 +367,13 @@ void pDictionaryScan::sendPrimitiveMessages() if (dbRootConnectionMap->find(dbroot) == dbRootConnectionMap->end()) { // MCOL-259 force a reload of the xml. This usualy fixes it. - std::cout << "dictionary forcing reload of columnstore.xml for dbRootConnectionMap" << std::endl; + Logger log; + log.logMessage(logging::LOG_TYPE_DEBUG, "dictionary forcing reload of columnstore.xml for dbRootConnectionMap"); oamCache->forceReload(); dbRootConnectionMap = oamCache->getDBRootToConnectionMap(); if (dbRootConnectionMap->find(dbroot) == dbRootConnectionMap->end()) { - std::cout << "dictionary still not in dbRootConnectionMap" << std::endl; + log.logMessage(logging::LOG_TYPE_DEBUG, "dictionary still not in dbRootConnectionMap"); throw IDBExcept(ERR_DATA_OFFLINE); } } diff --git a/dbcon/joblist/tuple-bps.cpp b/dbcon/joblist/tuple-bps.cpp index a8d1cbfb9..30cca8e2e 100644 --- a/dbcon/joblist/tuple-bps.cpp +++ b/dbcon/joblist/tuple-bps.cpp @@ -1623,12 +1623,13 @@ void TupleBPS::makeJobs(vector *jobs) if (dbRootConnectionMap->find(scannedExtents[i].dbRoot) == dbRootConnectionMap->end()) { // MCOL-259 force a reload of the xml. This usualy fixes it. - std::cout << "forcing reload of columnstore.xml for dbRootConnectionMap" << std::endl; + Logger log; + log.logMessage(logging::LOG_TYPE_WARNING, "forcing reload of columnstore.xml for dbRootConnectionMap"); oamCache->forceReload(); dbRootConnectionMap = oamCache->getDBRootToConnectionMap(); if (dbRootConnectionMap->find(scannedExtents[i].dbRoot) == dbRootConnectionMap->end()) { - std::cout << "still not in dbRootConnectionMap" << std::endl; + log.logMessage(logging::LOG_TYPE_WARNING, "dbroot still not in dbRootConnectionMap"); throw IDBExcept(ERR_DATA_OFFLINE); } } diff --git a/oam/oamcpp/liboamcpp.cpp b/oam/oamcpp/liboamcpp.cpp index 472a80f37..fb2ae3c2c 100644 --- a/oam/oamcpp/liboamcpp.cpp +++ b/oam/oamcpp/liboamcpp.cpp @@ -1618,6 +1618,7 @@ namespace oam {} // no match found + state = oam::UNEQUIP; exceptionControl("getModuleStatus", API_INVALID_PARAMETER); } diff --git a/oam/oamcpp/liboamcpp.h b/oam/oamcpp/liboamcpp.h index 3d77c0086..7e568afb3 100644 --- a/oam/oamcpp/liboamcpp.h +++ b/oam/oamcpp/liboamcpp.h @@ -273,7 +273,8 @@ namespace oam STANDBY_INIT, // 18 = Standby init BUSY_INIT, // 19 = Busy init ROLLBACK_INIT, // 20 = Rollback during DML init - STATE_MAX // 21 = Max value + PID_UPDATE, // 21 = Assigning the pid + STATE_MAX // 22 = Max value }; /** @brief String State @@ -2390,17 +2391,17 @@ namespace oam */ EXPORT void dbrmctl(std::string command); - /** @brief Wait for system to close transactions - * - * When a Shutdown, stop, restart or suspend operation is - * requested but there are active transactions of some sort, - * We wait for all transactions to close before performing - * the action. - */ - EXPORT bool waitForSystem(PROC_MGT_MSG_REQUEST request, messageqcpp::IOSocket& ios, messageqcpp::ByteStream& stillWorkingMsg); + /** @brief Wait for system to close transactions + * + * When a Shutdown, stop, restart or suspend operation is + * requested but there are active transactions of some sort, + * We wait for all transactions to close before performing + * the action. + */ + EXPORT bool waitForSystem(PROC_MGT_MSG_REQUEST request, messageqcpp::IOSocket& ios, messageqcpp::ByteStream& stillWorkingMsg); - void amazonReattach(std::string toPM, dbrootList dbrootConfigList, bool attach = false); - void mountDBRoot(dbrootList dbrootConfigList, bool mount = true); + void amazonReattach(std::string toPM, dbrootList dbrootConfigList, bool attach = false); + void mountDBRoot(dbrootList dbrootConfigList, bool mount = true); /** *@brief gluster control @@ -2431,19 +2432,24 @@ namespace oam **/ EXPORT bool disableMySQLRep(); - /** @brief check Gluster Log after a Gluster control call - */ - EXPORT int checkGlusterLog(std::string logFile, std::string& errmsg); + /** @brief check Gluster Log after a Gluster control call + */ + EXPORT int checkGlusterLog(std::string logFile, std::string& errmsg); - /** @brief check and get mysql user password - */ - EXPORT std::string getMySQLPassword(bool bypassConfig = false); + /** @brief check and get mysql user password + */ + EXPORT std::string getMySQLPassword(bool bypassConfig = false); - /** @brief update fstab with dbroot mounts - */ - EXPORT std::string updateFstab(std::string device, std::string dbrootID); + /** @brief update fstab with dbroot mounts + */ + EXPORT std::string updateFstab(std::string device, std::string dbrootID); - private: + /** + * @brief Write the message to the log + */ + void writeLog(const std::string logContent, const logging::LOG_TYPE logType = logging::LOG_TYPE_INFO); + + private: int sendMsgToProcMgr3(messageqcpp::ByteStream::byte requestType, snmpmanager::AlarmList& alarmlist, const std::string date); @@ -2476,11 +2482,6 @@ namespace oam */ void sendStatusUpdate(messageqcpp::ByteStream obs, messageqcpp::ByteStream::byte returnRequestType); - /** - * @brief Write the message to the log - */ - void writeLog(const std::string logContent, const logging::LOG_TYPE logType = logging::LOG_TYPE_INFO); - std::string CalpontConfigFile; std::string AlarmConfigFile; std::string ProcessConfigFile; diff --git a/oam/oamcpp/oamcache.cpp b/oam/oamcpp/oamcache.cpp index 43ddf73ac..7f60338b0 100644 --- a/oam/oamcpp/oamcache.cpp +++ b/oam/oamcpp/oamcache.cpp @@ -106,20 +106,36 @@ void OamCache::checkReload() #if !defined(SKIP_OAM_INIT) { try { - int state = oam::ACTIVE; bool degraded; + int state = oam::MAN_INIT; + bool degraded; char num[80]; + int retry = 0; + // MCOL-259 retry for 5 seconds if the PM is in some INIT mode. + while (( state == oam::BUSY_INIT + || state == oam::MAN_INIT + || state == oam::PID_UPDATE) + && retry < 5) + { + snprintf(num, 80, "%d", *it); + try { + oam.getModuleStatus(string("pm") + num, state, degraded); + } + catch (...) {break;} - snprintf(num, 80, "%d", *it); - try { - oam.getModuleStatus(string("pm") + num, state, degraded); - } - catch (...) {} - - if (state == oam::ACTIVE) { - pmToConnectionMap[*it] = i++; - moduleIds.push_back(*it); + if (state == oam::ACTIVE || state == oam::DEGRADED) { + pmToConnectionMap[*it] = i++; + moduleIds.push_back(*it); + break; + } + sleep(1); //cout << "pm " << *it << " -> connection " << (i-1) << endl; } + if (state != oam::ACTIVE) + { + ostringstream os; + os << "OamCache::checkReload shows state for pm" << num << " as " << state; + oam.writeLog(os.str(), logging::LOG_TYPE_WARNING); + } } catch (...) { /* doesn't get added to the connection map */ } } diff --git a/procmon/processmonitor.cpp b/procmon/processmonitor.cpp index 7cc354929..f75fec17f 100644 --- a/procmon/processmonitor.cpp +++ b/procmon/processmonitor.cpp @@ -2573,7 +2573,7 @@ pid_t ProcessMonitor::startProcess(string processModuleType, string processName, DepProcessName, DepModuleName, LogFile); //Update Process Status: Update PID - updateProcessInfo(processName, STATE_MAX, newProcessID); + updateProcessInfo(processName, PID_UPDATE, newProcessID); } log.writeLog(__LINE__, processName + " PID is " + oam.itoa(newProcessID), LOG_TYPE_DEBUG);