diff --git a/oam/oamcpp/liboamcpp.cpp b/oam/oamcpp/liboamcpp.cpp index fb2ae3c2c..d1244d3ca 100644 --- a/oam/oamcpp/liboamcpp.cpp +++ b/oam/oamcpp/liboamcpp.cpp @@ -1368,10 +1368,7 @@ namespace oam void Oam::getSystemStatus(SystemStatus& systemstatus, bool systemStatusOnly) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - exceptionControl("getSystemStatus", API_FAILURE); + checkSystemRunning("getSystemStatus"); #ifdef _MSC_VER // TODO: Remove when we create OAM for Windows @@ -1386,81 +1383,67 @@ namespace oam DbrootStatus dbrootstatus; systemstatus.systemdbrootstatus.dbrootstatus.clear(); - for ( int i = 0 ; i < 2 ; i ++) - { - try + for ( int i = 0 ; i < 2 ; i ++) { - MessageQueueClient processor("ProcStatusControl"); - // processor.syncProto(false); - ByteStream::byte ModuleNumber; - ByteStream::byte ExtDeviceNumber; - ByteStream::byte dbrootNumber; - ByteStream::byte NICNumber; - ByteStream::byte state; - std::string name; - std::string date; - ByteStream obs, ibs; - - obs << (ByteStream::byte) GET_SYSTEM_STATUS; - if ( systemStatusOnly ) - obs << (ByteStream::byte) 1; - else - obs << (ByteStream::byte) 2; - - try { - struct timespec ts = { 3, 0 }; - processor.write(obs, &ts); - } - catch (exception& e) - { - processor.shutdown(); -// string error = e.what(); -// writeLog("getSystemStatus: write exception: " + error, LOG_TYPE_ERROR); - exceptionControl("getSystemStatus", API_FAILURE); - } - catch(...) - { - processor.shutdown(); -// writeLog("getSystemStatus: write exception: unknown", LOG_TYPE_ERROR); - exceptionControl("getSystemStatus", API_FAILURE); - } - - // wait 30 seconds for ACK from Process Monitor - try { - struct timespec ts = { 30, 0 }; - ibs = processor.read(&ts); - } - catch (exception& e) - { - processor.shutdown(); - string error = e.what(); -// writeLog("getSystemStatus: read exception: " + error, LOG_TYPE_ERROR); - exceptionControl("getSystemStatus", API_FAILURE); - } - catch(...) - { - processor.shutdown(); -// writeLog("getSystemStatus: read exception: unknown", LOG_TYPE_ERROR); - exceptionControl("getSystemStatus", API_FAILURE); - } - - if (ibs.length() > 0) + try { + MessageQueueClient processor("ProcStatusControl"); + // processor.syncProto(false); + ByteStream::byte ModuleNumber; + ByteStream::byte ExtDeviceNumber; + ByteStream::byte dbrootNumber; + ByteStream::byte NICNumber; + ByteStream::byte state; + std::string name; + std::string date; + ByteStream obs, ibs; + + obs << (ByteStream::byte) GET_SYSTEM_STATUS; if ( systemStatusOnly ) - { - ibs >> name; - ibs >> state; - ibs >> date; - if ( name.find("system") != string::npos ) { - systemstatus.SystemOpState = state; - systemstatus.StateChangeDate = date; - } - } + obs << (ByteStream::byte) 1; else + obs << (ByteStream::byte) 2; + + try { + struct timespec ts = { 3, 0 }; + processor.write(obs, &ts); + } + catch (exception& e) { - ibs >> ModuleNumber; - - for( int i=0 ; i < ModuleNumber ; ++i) + processor.shutdown(); + string error = e.what(); + writeLog("getSystemStatus: write exception: " + error, LOG_TYPE_ERROR); + exceptionControl("getSystemStatus write", API_FAILURE); + } + catch(...) + { + processor.shutdown(); + writeLog("getSystemStatus: write exception: unknown", LOG_TYPE_ERROR); + exceptionControl("getSystemStatus write", API_FAILURE); + } + + // wait 30 seconds for ACK from Process Monitor + try { + struct timespec ts = { 30, 0 }; + ibs = processor.read(&ts); + } + catch (exception& e) + { + processor.shutdown(); + string error = e.what(); + writeLog("getSystemStatus: read exception: " + error, LOG_TYPE_ERROR); + exceptionControl("getSystemStatus read", API_FAILURE); + } + catch(...) + { + processor.shutdown(); + writeLog("getSystemStatus: read exception: unknown", LOG_TYPE_ERROR); + exceptionControl("getSystemStatus read", API_FAILURE); + } + + if (ibs.length() > 0) + { + if ( systemStatusOnly ) { ibs >> name; ibs >> state; @@ -1469,68 +1452,93 @@ namespace oam systemstatus.SystemOpState = state; systemstatus.StateChangeDate = date; } - else + } + else + { + ibs >> ModuleNumber; + + for( int i=0 ; i < ModuleNumber ; ++i) { - modulestatus.Module = name; - modulestatus.ModuleOpState = state; - modulestatus.StateChangeDate = date; - systemstatus.systemmodulestatus.modulestatus.push_back(modulestatus); + ibs >> name; + ibs >> state; + ibs >> date; + if ( name.find("system") != string::npos ) { + systemstatus.SystemOpState = state; + systemstatus.StateChangeDate = date; + } + else + { + modulestatus.Module = name; + modulestatus.ModuleOpState = state; + modulestatus.StateChangeDate = date; + systemstatus.systemmodulestatus.modulestatus.push_back(modulestatus); + } + } + + ibs >> ExtDeviceNumber; + + for( int i=0 ; i < ExtDeviceNumber ; ++i) + { + ibs >> name; + ibs >> state; + ibs >> date; + extdevicestatus.Name = name; + extdevicestatus.OpState = state; + extdevicestatus.StateChangeDate = date; + systemstatus.systemextdevicestatus.extdevicestatus.push_back(extdevicestatus); + } + + ibs >> NICNumber; + + for( int i=0 ; i < NICNumber ; ++i) + { + ibs >> name; + ibs >> state; + ibs >> date; + nicstatus.HostName = name; + nicstatus.NICOpState = state; + nicstatus.StateChangeDate = date; + systemstatus.systemnicstatus.nicstatus.push_back(nicstatus); + } + + ibs >> dbrootNumber; + + for( int i=0 ; i < dbrootNumber ; ++i) + { + ibs >> name; + ibs >> state; + ibs >> date; + dbrootstatus.Name = name; + dbrootstatus.OpState = state; + dbrootstatus.StateChangeDate = date; + systemstatus.systemdbrootstatus.dbrootstatus.push_back(dbrootstatus); } } - - ibs >> ExtDeviceNumber; - - for( int i=0 ; i < ExtDeviceNumber ; ++i) - { - ibs >> name; - ibs >> state; - ibs >> date; - extdevicestatus.Name = name; - extdevicestatus.OpState = state; - extdevicestatus.StateChangeDate = date; - systemstatus.systemextdevicestatus.extdevicestatus.push_back(extdevicestatus); - } - - ibs >> NICNumber; - - for( int i=0 ; i < NICNumber ; ++i) - { - ibs >> name; - ibs >> state; - ibs >> date; - nicstatus.HostName = name; - nicstatus.NICOpState = state; - nicstatus.StateChangeDate = date; - systemstatus.systemnicstatus.nicstatus.push_back(nicstatus); - } - - ibs >> dbrootNumber; - - for( int i=0 ; i < dbrootNumber ; ++i) - { - ibs >> name; - ibs >> state; - ibs >> date; - dbrootstatus.Name = name; - dbrootstatus.OpState = state; - dbrootstatus.StateChangeDate = date; - systemstatus.systemdbrootstatus.dbrootstatus.push_back(dbrootstatus); - } + + processor.shutdown(); + return; } - + else + { + writeLog("getSystemStatus: ProcStatusControl returns 0 length", LOG_TYPE_ERROR); + } + // timeout ocurred, shutdown connection processor.shutdown(); - return; + writeLog("getSystemStatus: read 0 length", LOG_TYPE_ERROR); + exceptionControl("getSystemStatus read 0", API_FAILURE); + } + catch (exception& e) + { + string error = e.what(); + writeLog("getSystemStatus: final exception: " + error, LOG_TYPE_ERROR); + } + catch(...) + { + writeLog("getSystemStatus: final exception: unknown", LOG_TYPE_ERROR); } - // timeout ocurred, shutdown connection - processor.shutdown(); -// writeLog("getSystemStatus: read 0 length", LOG_TYPE_ERROR); - exceptionControl("getSystemStatus", API_FAILURE); } - catch(...) - {} - } - exceptionControl("getSystemStatus:MessageQueueClient-Error", API_FAILURE); + exceptionControl("getSystemStatus:MessageQueueClient-Error", API_FAILURE); } /******************************************************************** @@ -1572,6 +1580,7 @@ namespace oam ModuleConfig moduleconfig; std::vector NICstates; degraded = false; + state = oam::UNEQUIP; try { @@ -1595,8 +1604,19 @@ namespace oam getNICStatus((*pt1).HostName, state); NICstates.push_back(state); } - catch (...) - {} + catch (exception& e) + { + Oam oam; + ostringstream os; + os << "Oam::getModuleStatus exception while getNICStatus " << (*pt1).HostName << " " << e.what(); + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + } + catch (...) { + Oam oam; + ostringstream os; + os << "Oam::getModuleStatus exception while getNICStatus " << (*pt1).HostName; + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + } } vector::iterator pt = NICstates.begin(); @@ -1609,16 +1629,37 @@ namespace oam } return; } - catch (...) - {} + catch (exception& e) + { + Oam oam; + ostringstream os; + os << "Oam::getModuleStatus exception while getSystemConfig " << name << " " << e.what(); + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + } + catch (...) { + Oam oam; + ostringstream os; + os << "Oam::getModuleStatus exception while getSystemConfig " << name; + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + } } } } - catch(...) - {} + catch (exception& e) + { + Oam oam; + ostringstream os; + os << "Oam::getModuleStatus exception while getSystemStatus " << e.what(); + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + } + catch (...) { + Oam oam; + ostringstream os; + os << "Oam::getModuleStatus exception while getSystemStatus"; + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + } // no match found - state = oam::UNEQUIP; exceptionControl("getModuleStatus", API_INVALID_PARAMETER); } @@ -1794,8 +1835,12 @@ namespace oam } } } - catch (exception&) + catch (exception& e) { + Oam oam; + ostringstream os; + os << "Oam::getNICStatus exception while getSystemStatus for " << name << " " << e.what(); + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); exceptionControl("getNICStatus", API_FAILURE); } @@ -2091,10 +2136,7 @@ namespace oam void Oam::getProcessStatus(SystemProcessStatus& systemprocessstatus, string port) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - exceptionControl("getProcessStatus", API_FAILURE); + checkSystemRunning("getProcessStatus"); ProcessStatus processstatus; systemprocessstatus.processstatus.clear(); @@ -2194,10 +2236,7 @@ namespace oam return; #endif - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - exceptionControl("getProcessStatus", API_FAILURE); + checkSystemRunning("getProcessStatus"); for ( int i = 0 ; i < 5 ; i ++) { @@ -2292,10 +2331,7 @@ namespace oam void Oam::setProcessStatus(const std::string process, const std::string module, const int state, pid_t PID) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - exceptionControl("setProcessStatus", API_FAILURE); + checkSystemRunning("setProcessStatus"); //send and wait for ack and resend if not received //retry 5 time max @@ -2812,10 +2848,7 @@ namespace oam exceptionControl("getMyProcessStatus", API_FAILURE); } - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - exceptionControl("getMyProcessStatus", API_FAILURE); + checkSystemRunning("getMyProcessStatus"); for ( int i = 0 ; i < 5 ; i ++) { @@ -4652,20 +4685,31 @@ namespace oam { ifstream file (fileName.c_str()); - char line[400]; + if (!file.is_open()) + { + ostringstream os; + os << "checkLogStatus error while opening file " << fileName << " " << strerror(errno); + writeLog(os.str(), LOG_TYPE_ERROR ); + } string buf; - while (file.getline(line, 400)) + while (getline(file, buf)) { - buf = line; - string::size_type pos = buf.find(phrase,0); if (pos != string::npos) //found phrase return true; } + if (file.bad()) + { + ostringstream os; + os << "checkLogStatus error while reading file " << fileName << " " << strerror(errno); + writeLog(os.str(), LOG_TYPE_ERROR ); + } file.close(); - + ostringstream os; + os << "checkLogStatus failed " << fileName << " expected \"" << phrase.c_str() << "\" found \"" << buf.c_str() << "\""; + writeLog(os.str(), LOG_TYPE_ERROR ); return false; } @@ -4821,10 +4865,7 @@ namespace oam ********************************************************************/ bool Oam::switchParentOAMModule(std::string moduleName, GRACEFUL_FLAG gracefulflag) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - exceptionControl("switchParentOAMModule", API_FAILURE); + checkSystemRunning("switchParentOAMModule"); int returnStatus; // We assume that moduleName is a valid pm @@ -4837,7 +4878,7 @@ namespace oam string cmdLine = "ping "; string cmdOption = " -w 1 >> /dev/null"; - cmd = cmdLine + IPAddr + cmdOption; + string cmd = cmdLine + IPAddr + cmdOption; if ( system(cmd.c_str()) != 0 ) { //ping failure try{ @@ -6267,10 +6308,7 @@ namespace oam exceptionControl("sysConfig->write", API_FAILURE); } - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - return; + checkSystemRunning("addDbroot"); //get updated Columnstore.xml distributed distributeConfigFile("system"); @@ -8734,10 +8772,8 @@ namespace oam GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag, const std::string argument1, const std::string argument2, int timeout) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - return API_CONN_REFUSED; + if (!checkSystemRunning("")) + return API_CONN_REFUSED; int returnStatus = API_SUCCESS; //default ByteStream msg; @@ -8837,10 +8873,8 @@ namespace oam int Oam::sendMsgToProcMgr2(messageqcpp::ByteStream::byte requestType, DeviceNetworkList devicenetworklist, GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag, const std::string password, const std::string mysqlpw) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - return API_CONN_REFUSED; + if (!checkSystemRunning("")) + return API_CONN_REFUSED; int returnStatus = API_TIMEOUT; //default ByteStream msg; @@ -8953,10 +8987,8 @@ namespace oam int Oam::sendMsgToProcMgr3(messageqcpp::ByteStream::byte requestType, AlarmList& alarmlist, const std::string date) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) - return API_CONN_REFUSED; + if (!checkSystemRunning("")) + return API_CONN_REFUSED; int returnStatus = API_SUCCESS; //default ByteStream msg; @@ -9056,9 +9088,7 @@ namespace oam GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag, const std::string argument1, const std::string argument2, int timeout) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) + if (!checkSystemRunning("")) return API_CONN_REFUSED; int returnStatus = API_STILL_WORKING; @@ -9244,9 +9274,7 @@ namespace oam void Oam::sendStatusUpdate(ByteStream obs, ByteStream::byte returnRequestType) { - string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log"; - system(cmd.c_str()); - if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") ) + if (!checkSystemRunning("")) return; for ( int i = 0 ; i < 5 ; i ++) @@ -9622,6 +9650,33 @@ namespace oam return returnStatus; } + bool Oam::checkSystemRunning(const char* function) + { + struct stat st; + if (stat("/var/lock/subsys/columnstore", &st) == 0) + { + return true; + } + if (geteuid() != 0) + { + // not root user + // The stat above may fail for non-root because of permissions + // This is a non-optimal solution + string cmd = "pgrep ProcMon"; + if (system(cmd.c_str()) != 0) + { + return true; + } + } + ostringstream os; + os << function << " system is not running: " << strerror(errno); + writeLog(os.str(), LOG_TYPE_ERROR ); + if (strlen(function)) + { + throw runtime_error(os.str()); + } + return false; + } } //namespace oam diff --git a/oam/oamcpp/liboamcpp.h b/oam/oamcpp/liboamcpp.h index 7e568afb3..b1ed7ebae 100644 --- a/oam/oamcpp/liboamcpp.h +++ b/oam/oamcpp/liboamcpp.h @@ -2482,6 +2482,8 @@ namespace oam */ void sendStatusUpdate(messageqcpp::ByteStream obs, messageqcpp::ByteStream::byte returnRequestType); + bool checkSystemRunning(const char* function); + std::string CalpontConfigFile; std::string AlarmConfigFile; std::string ProcessConfigFile; diff --git a/oam/oamcpp/oamcache.cpp b/oam/oamcpp/oamcache.cpp index 4a2174ded..6987b7819 100644 --- a/oam/oamcpp/oamcache.cpp +++ b/oam/oamcpp/oamcache.cpp @@ -120,7 +120,19 @@ void OamCache::checkReload() try { oam.getModuleStatus(string("pm") + num, state, degraded); } - catch (...) {break;} + catch (std::exception& e) + { + ostringstream os; + os << "OamCache::checkReload exception while getModuleStatus pm" << num << " " << e.what(); + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + break; + } + catch (...) { + ostringstream os; + os << "OamCache::checkReload exception while getModuleStatus pm" << num; + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + break; + } if (state == oam::ACTIVE || state == oam::DEGRADED) { pmToConnectionMap[*it] = i++; @@ -134,10 +146,22 @@ void OamCache::checkReload() { ostringstream os; os << "OamCache::checkReload shows state for pm" << num << " as " << oamState[state]; - oam.writeLog(os.str(), logging::LOG_TYPE_WARNING); + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); } } - catch (...) { /* doesn't get added to the connection map */ } + catch (std::exception& e) + { + ostringstream os; + os << "OamCache::checkReload final exception while getModuleStatus " << e.what(); + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + break; + } + catch (...) { + ostringstream os; + os << "OamCache::checkReload final exception while getModuleStatus"; + oam.writeLog(os.str(), logging::LOG_TYPE_ERROR); + break; + } } #else moduleIds.push_back(*it); diff --git a/utils/messageqcpp/inetstreamsocket.cpp b/utils/messageqcpp/inetstreamsocket.cpp index 78c311466..b298843a2 100644 --- a/utils/messageqcpp/inetstreamsocket.cpp +++ b/utils/messageqcpp/inetstreamsocket.cpp @@ -437,7 +437,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO size_t mlread = 0; if (readToMagic(msecs, isTimeOut, stats) == false) //indicates a timeout or EOF + { + logIoError("InetStreamSocket::read: timeout during readToMagic", 0); return SBS(new ByteStream(0)); + } //FIXME: This seems like a lot of work to read 4 bytes... while (mlread < sizeof(msglen)) @@ -458,6 +461,7 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO { if (isTimeOut) *isTimeOut = true; + logIoError("InetStreamSocket::read: timeout during first poll", 0); return SBS(new ByteStream(0)); } } @@ -470,7 +474,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO if (t == 0) { if (timeout == NULL) + { + logIoError("InetStreamSocket::read: timeout during first read", 0); return SBS(new ByteStream(0)); // don't return an incomplete message + } else throw SocketClosed("InetStreamSocket::read: Remote is closed"); } @@ -518,7 +525,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO if (err == 0) // timeout { if (isTimeOut) + { + logIoError("InetStreamSocket::read: timeout during second poll", 0); *isTimeOut = true; + } if (stats) stats->dataRecvd(nread); return SBS(new ByteStream(0)); @@ -538,7 +548,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO if (timeout == NULL) return SBS(new ByteStream(0)); // don't return an incomplete message else + { + logIoError("InetStreamSocket::read: timeout during second read", 0); throw SocketClosed("InetStreamSocket::read: Remote is closed"); + } } if (t < 0) { ostringstream oss;