1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-05-30 00:27:08 +03:00
2017-11-30 15:15:01 +00:00

2741 lines
84 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
Copyright (C) 2016 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/*****************************************************************************************
* $Id: main.cpp 2203 2013-07-08 16:50:51Z bpaul $
*
*****************************************************************************************/
#include <clocale>
#include <boost/filesystem.hpp>
#include "processmanager.h"
#include "installdir.h"
#include "utils_utf8.h"
using namespace std;
using namespace logging;
using namespace messageqcpp;
using namespace processmanager;
using namespace oam;
using namespace alarmmanager;
using namespace threadpool;
//using namespace procheartbeat;
using namespace config;
// ---------------------------------------------------------------------------
// Process-wide state shared between main() and the worker threads in this file.
// ---------------------------------------------------------------------------

// true while this node runs as standby; main() sets it when gOAMParentModuleFlag
// is false, and messageThread() waits for it to become false before serving.
bool runStandby = false;
// not referenced in the visible part of this file
bool runCold = false;
// value of the "SystemName" configuration setting (read in main())
string systemName = "system";
// name of the local interface whose IPv4 address matches this module's first
// configured host IP address (resolved in main())
string iface_name;
// value of the "Cloud" configuration setting
string cloud;
// true when cloud is "amazon-ec2" or "amazon-vpc"
bool amazon = false;
// the next three are only read from the configuration when amazon is true
string PMInstanceType;
string UMInstanceType;
string AmazonPMFailover = "y";
// value of the "DataRedundancyConfig" setting; reset to "n" if it cannot be read
string DataRedundancyConfig = "n";
// false when the process's real user id (getuid()) is not 0
bool rootUser = true;
// taken from the USER environment variable when it is set and non-empty
string USER = "root";
// true when the "DBRootStorageType" setting equals "hdfs"
bool HDFS = false;
// HostName of this module's first host configuration entry
string localHostName;
// values of the "PMwithUM" / "MySQLRep" settings; reset to "n" if they cannot be read
string PMwithUM = "n";
string MySQLRep = "n";
// pushing the ACTIVE_ALARMS_FILE to all nodes every 10 seconds.
const int ACTIVE_ALARMS_PUSHING_INTERVAL = 10;
// module name -> integer counter; pingDeviceThread() seeds every entry with 0
// (presumably a missed-heartbeat count -- TODO confirm against the full thread body)
typedef map<string, int> moduleList;
moduleList moduleInfoList;
// Declared here, defined elsewhere.
extern HeartBeatProcList hbproclist;
extern pthread_mutex_t THREAD_LOCK;
extern bool startsystemthreadStop;
extern string gdownActiveOAMModule;
extern int startsystemthreadStatus;
extern vector<string> downModuleList;
extern bool startFailOver;
extern bool gOAMParentModuleFlag;
// Thread entry points and the signal handler defined later in this file.
// NOTE(review): messageThread is declared taking a Configuration by value but is
// started through a cast to void*(*)(void*) with a Configuration* argument.
static void messageThread(Configuration config);
static void sigUser1Handler(int sig);
static void startMgrProcessThread();
static void hdfsActiveAlarmsPushingThread();
//static void pingDeviceThread();
//static void heartbeatProcessThread();
//static void heartbeatMsgThread();
/*****************************************************************************************
* @brief	main
*
* purpose:	Process Manager entry point. Loads the system configuration into the
*		file-scope globals, resolves the local Ethernet interface, then starts
*		the worker threads for either the standby or the active role and
*		suspends forever.
*
* @param argc	unused
* @param argv	unused
* @return	never returns; the function ends in an endless sleep loop
*
*****************************************************************************************/
int main(int argc, char **argv)
{
#ifndef _MSC_VER
    setuid(0); // set effective ID to root; ignore return status
#endif
    // get and set locale language
    string systemLang = "C";
    setlocale(LC_ALL, systemLang.c_str());

    Oam oam;

    //check if root-user
    int user;
    user = getuid();

    if (user != 0)
        rootUser = false;

    char* p = getenv("USER");

    if (p && *p)
        USER = p;

    ProcessLog log;
    Configuration config;
    ProcessManager processManager(config, log);
    ALARMManager aManager;

    log.writeLog(__LINE__, " ");
    log.writeLog(__LINE__, "**********Process Manager Started**********");

    //Ignore SIGPIPE signals
    signal(SIGPIPE, SIG_IGN);

    //Ignore SIGHUP signals
    signal(SIGHUP, SIG_IGN);

    //create SIGUSR1 handler to get configuration updates
    signal(SIGUSR1, sigUser1Handler);

    // Get System Name
    try
    {
        oam.getSystemConfig("SystemName", systemName);
    }
    catch (...)
    {}

    //get cloud setting
    try
    {
        oam.getSystemConfig( "Cloud", cloud);
    }
    catch (...) {}

    //get amazon parameters
    if ( cloud == "amazon-ec2" || cloud == "amazon-vpc" )
    {
        oam.getSystemConfig("PMInstanceType", PMInstanceType);
        oam.getSystemConfig("UMInstanceType", UMInstanceType);
        oam.getSystemConfig("AmazonPMFailover", AmazonPMFailover);
        amazon = true;
    }

    //get gluster config
    try
    {
        oam.getSystemConfig( "DataRedundancyConfig", DataRedundancyConfig);
    }
    catch (...)
    {
        DataRedundancyConfig = "n";
    }

    //hdfs / hadoop config
    string DBRootStorageType;

    try
    {
        oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
    }
    catch (...) {}

    if ( DBRootStorageType == "hdfs" )
        HDFS = true;

    log.writeLog(__LINE__, "Main: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);

    //PMwithUM config
    try
    {
        oam.getSystemConfig( "PMwithUM", PMwithUM);
    }
    catch (...)
    {
        PMwithUM = "n";
    }

    try
    {
        oam.getSystemConfig("MySQLRep", MySQLRep);
    }
    catch (...)
    {
        MySQLRep = "n";
    }

    // decide the role of this node: active (OAM parent) or standby
    if ( gOAMParentModuleFlag )
    {
        log.writeLog(__LINE__, "Running Active");
        log.writeLog(__LINE__, "Running Active", LOG_TYPE_DEBUG);
    }
    else
    {
        log.writeLog(__LINE__, "Running Standby");
        log.writeLog(__LINE__, "Running Standby", LOG_TYPE_DEBUG);
        runStandby = true;
    }

    //get local module main IP address
    ModuleConfig moduleconfig;
    oam.getSystemConfig(config.moduleName(), moduleconfig);

    string localIPaddr;
    HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();

    // never dereference begin() of an empty host list
    if (pt1 != moduleconfig.hostConfigList.end())
    {
        localIPaddr = (*pt1).IPAddr;
        localHostName = (*pt1).HostName;
    }
    else
    {
        log.writeLog(__LINE__, "Main: local module has no host configuration entries", LOG_TYPE_ERROR);
    }

    // find the interface that carries the local module IP address
    struct ifaddrs* addrs = NULL;
    struct ifaddrs* iap;
    struct sockaddr_in* sa;
    char buf[32];

    // on failure getifaddrs() leaves addrs unusable, so only walk/free it on success
    if (getifaddrs(&addrs) == 0)
    {
        for (iap = addrs; iap != NULL; iap = iap->ifa_next)
        {
            if (iap->ifa_addr && (iap->ifa_flags & IFF_UP) && iap->ifa_addr->sa_family == AF_INET)
            {
                sa = (struct sockaddr_in*)(iap->ifa_addr);

                // buf is only valid when inet_ntop() succeeds
                if (inet_ntop(iap->ifa_addr->sa_family, (void*) & (sa->sin_addr), buf, sizeof(buf)) != NULL &&
                        !strcmp(localIPaddr.c_str(), buf))
                {
                    iface_name = iap->ifa_name;
                    break;
                }
            }
        }

        freeifaddrs(addrs);
    }
    else
    {
        log.writeLog(__LINE__, "Main: getifaddrs failed, unable to determine main Ethernet port", LOG_TYPE_ERROR);
    }

    log.writeLog(__LINE__, "Main Ethernet Port = " + iface_name, LOG_TYPE_DEBUG);

    //
    //start a thread to ping all system modules
    //
    if (runStandby)
    {
        //running standby after startup
        try
        {
            oam.processInitComplete("ProcessManager", oam::STANDBY);
            log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
        }
        catch (exception& ex)
        {
            string error = ex.what();
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
        }
        catch (...)
        {
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
        }

        // create message thread
        // NOTE(review): messageThread is declared with a by-value Configuration
        // parameter but receives &config through this cast -- confirm intent.
        pthread_t MessageThread;
        int ret = pthread_create (&MessageThread, NULL, (void* (*)(void*)) &messageThread, &config);

        if ( ret != 0 )
            log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);

        //monitor OAM Parent Module for failover
        while (true)
        {
            if ( processManager.OAMParentModuleChange() == oam::API_SUCCESS )
                break;

            log.writeLog(__LINE__, "OAMParentModuleChange failure", LOG_TYPE_WARNING);
            // GO TRY AGAIN
        }

        pthread_t srvThread;
        int status = pthread_create (&srvThread, NULL, (void* (*)(void*)) &pingDeviceThread, NULL);

        if ( status != 0 )
            log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
    }
    else
    {
        //running active after startup

        //Update DBRM section of Columnstore.xml
        processManager.updateWorkerNodeconfig();
//		processManager.distributeConfigFile("system");

        pthread_t srvThread;
        int status = pthread_create (&srvThread, NULL, (void* (*)(void*)) &pingDeviceThread, NULL);

        if ( status != 0 )
            log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);

        // if HDFS, create a thread to push an image of activeAlarms to HDFS filesystem
        if (HDFS)
        {
            pthread_t hdfsAlarmThread;
            int status = pthread_create(&hdfsAlarmThread, NULL, (void* (*)(void*)) &hdfsActiveAlarmsPushingThread, NULL);

            if ( status != 0 )
                log.writeLog(__LINE__, "hdfsActiveAlarmsPushingThread pthread_create failed, return code = " + oam.itoa(status), LOG_TYPE_ERROR);
        }

        sleep(5);

        SystemStatus systemstatus;

        try
        {
            oam.getSystemStatus(systemstatus);
        }
        catch (exception& ex)
        {
//			string error = ex.what();
//			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
        }
        catch (...)
        {
//			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
        }

        // only launch the startup thread when the system is neither manually
        // offline nor already active
        if (systemstatus.SystemOpState != oam::MAN_OFFLINE &&
                systemstatus.SystemOpState != oam::ACTIVE)
        {
            pthread_t mgrProcThread;
            int status = pthread_create (&mgrProcThread, NULL, (void* (*)(void*)) &startMgrProcessThread, NULL);

            if ( status != 0 )
                log.writeLog(__LINE__, "startMgrProcessThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
        }

        try
        {
            oam.processInitComplete("ProcessManager");
            log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
        }
        catch (exception& ex)
        {
            string error = ex.what();
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
        }
        catch (...)
        {
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
        }

        //make sure ProcMgr IP Address is configured correctly
        try
        {
            Config* sysConfig = Config::makeConfig();

            // get this module's first configured IP address
            ModuleConfig localModuleConfig;
            oam.getSystemConfig(config.moduleName(), localModuleConfig);
            HostConfigList::iterator hostIt = localModuleConfig.hostConfigList.begin();

            if (hostIt != localModuleConfig.hostConfigList.end())
            {
                string IPaddr = (*hostIt).IPAddr;

                sysConfig->setConfig("ProcMgr", "IPAddr", IPaddr);

                log.writeLog(__LINE__, "set ProcMgr IPaddr to " + IPaddr, LOG_TYPE_DEBUG);

                //update Calpont Config table
                try
                {
                    sysConfig->write();
                }
                catch (...)
                {
                    log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
                }
            }
            else
            {
                // nothing to write: avoid dereferencing an empty host list
                log.writeLog(__LINE__, "Main: local module has no host configuration entries, ProcMgr IPAddr not updated", LOG_TYPE_ERROR);
            }
        }
        catch (...)
        {
            log.writeLog(__LINE__, "ERROR: makeConfig failed", LOG_TYPE_ERROR);
        }

        try
        {
            oam.distributeConfigFile();
        }
        catch (...)
        {}

        // create message thread
        // NOTE(review): see the cast remark on the standby branch above.
        pthread_t MessageThread;
        int ret = pthread_create (&MessageThread, NULL, (void* (*)(void*)) &messageThread, &config);

        if ( ret != 0 )
            log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
    }

    //
    //start a thread to process heartbeat checks
    //
//	pthread_t heartThread;
//	pthread_create (&heartThread, NULL, (void*(*)(void*)) &heartbeatProcessThread, NULL);

    //
    //start a thread to read heartbeat messages
    //
//	pthread_t heartMsgThread;
//	pthread_create (&heartMsgThread, NULL, (void*(*)(void*)) &heartbeatMsgThread, NULL);

    // suspend forever
    while (true)
    {
        sleep(1000);
    }
}
/******************************************************************************************
* @brief	messageThread
*
* purpose:	Wait until this node is no longer standby, free the ProcMgr TCP port,
*		then accept connections on the "ProcMgr" queue forever and hand each
*		one to a new processMSG thread.
*
* @param config	configuration used to build this thread's ProcessManager
*
******************************************************************************************/
static void messageThread(Configuration config)
{
    ProcessLog log;
    ProcessManager processManager(config, log);
    Oam oam;

    // hold here for as long as this node is running standby
    while (runStandby)
        sleep (1);

    log.writeLog(__LINE__, "Message Thread started ..", LOG_TYPE_DEBUG);

    // best effort: kill whatever still holds the ProcMgr port before listening on it
    try
    {
        Config* cfg = Config::makeConfig();
        const string listenPort = cfg->getConfig("ProcMgr", "Port");
        const string killPrefix = rootUser ? "fuser -k " : "sudo fuser -k ";
        const string killCmd = killPrefix + listenPort + "/tcp >/dev/null 2>&1";

        system(killCmd.c_str());
    }
    catch (...)
    {
    }

    //
    // serve incoming requests
    //
    IOSocket fIos;

    while (true)
    {
        try
        {
            MessageQueueServer procmgr("ProcMgr");

            // accept loop: one worker thread per accepted connection
            while (true)
            {
                try
                {
                    fIos = procmgr.accept();

                    pthread_t worker;
                    const int rc = pthread_create (&worker, NULL, (void* (*)(void*)) &processMSG, &fIos);

                    if ( rc != 0 )
                        log.writeLog(__LINE__, "messagethread: pthread_create failed, return status = " + oam.itoa(rc), LOG_TYPE_ERROR);
                }
                catch (...)
                {}
            }
        }
        catch (exception& ex)
        {
            const string what = ex.what();
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + what, LOG_TYPE_ERROR);

            // sockets can take 2 - 4 minutes to be released; back off and retry
            sleep(60);
        }
        catch (...)
        {
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr: Caught unknown exception!", LOG_TYPE_ERROR);

            // sockets can take 2 - 4 minutes to be released; back off and retry
            sleep(60);
        }
    }
}
/******************************************************************************************
* @brief	sigUser1Handler
*
* purpose:	Handle the SIGUSR1 signal by requesting a failover (sets startFailOver)
*
* @param sig	signal number delivered to the handler (unused)
*
******************************************************************************************/
static void sigUser1Handler(int sig)
{
    (void) sig;

    // Raise the flag before doing anything else so the failover request is
    // recorded even if the logging below is slow or blocks.
    startFailOver = true;

    // The handler no longer builds Configuration / ProcessManager / Oam objects:
    // they were never used here, and constructing them inside a signal handler
    // is not async-signal-safe.
    // NOTE(review): ProcessLog::writeLog is presumably not async-signal-safe
    // either; it is kept only so the debug trace is not lost.
    ProcessLog log;
    log.writeLog(__LINE__, "SIGUSER1 received, set startFailOver = true", LOG_TYPE_DEBUG);
}
/*****************************************************************************************
* @brief Start Mgr Process by module Thread
*
* purpose: Send Messages to Module Process Monitors to start Processes
*
*****************************************************************************************/
static void startMgrProcessThread()
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
SystemModuleTypeConfig systemmoduletypeconfig;
ModuleTypeConfig PMSmoduletypeconfig;
ALARMManager aManager;
int waitTime = 180;
log.writeLog(__LINE__, "startMgrProcessThread launched", LOG_TYPE_DEBUG);
//get calpont software version and release
SystemSoftware systemsoftware;
string softwareVersion;
string softwareRelease;
try
{
oam.getSystemSoftware(systemsoftware);
softwareVersion = systemsoftware.Version;
softwareRelease = systemsoftware.Release;
}
catch (exception& e) {
cout << endl << "ProcMon Construct Error reading getSystemSoftware = " << e.what() << endl;
exit(-1);
}
string localSoftwareInfo = softwareVersion + softwareRelease;
//get systemStartupOffline
string systemStartupOffline = "n";
try {
Config* sysConfig = Config::makeConfig();
systemStartupOffline = sysConfig->getConfig("Installation", "SystemStartupOffline");
}
catch(...)
{
log.writeLog(__LINE__, "ERROR: Problem getting systemStartupOffline from the Calpont System Configuration file", LOG_TYPE_ERROR);
systemStartupOffline = "n";
}
if ( systemStartupOffline == "y" )
log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_INFO);
try{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
//get Distributed Install
string DistributedInstall = "y";
try
{
oam.getSystemConfig("DistributedInstall", DistributedInstall);
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: get DistributedInstall", LOG_TYPE_ERROR);
}
//Send out a start service just to make sure Columnstore is runing on remote nodes
//note this only works for systems with ssh-keys
/* for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
//skip OAM Parent module
if ( (*pt).DeviceName == config.moduleName() )
continue;
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
{
//run remote command script
string cmd = startup::StartUp::installDir() + "/bin/remote_command.sh " + (*pt1).IPAddr + " ssh '" + startup::StartUp::installDir() + "/bin/columnstore restart' 0";
system(cmd.c_str());
}
}
}
*/
//distribute system and process config files
processManager.distributeConfigFile("system");
processManager.distributeConfigFile("system", "ProcessConfig.xml");
//send out moduleName to remote nodes, this will be used to startup new installed nodes
{
int status = API_SUCCESS;
int k = 0;
for( ; k < waitTime ; k++ )
{
if ( startsystemthreadStop ) {
processManager.setSystemState(oam::MAN_OFFLINE);
// exit thread
log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
pthread_exit(0);
}
status = API_SUCCESS;
for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
string moduleName = (*pt).DeviceName;
//skip OAM Parent module
if ( (*pt).DeviceName == config.moduleName() )
continue;
if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
(*pt).DisableState == oam::AUTODISABLEDSTATE )
continue;
int ret = processManager.configureModule(moduleName);
if ( ret != API_SUCCESS )
status = ret;
}
}
//get out of loop if all modules updated
if( status == API_SUCCESS )
break;
//retry after sleeping for a bit
sleep(1);
}
if ( k == waitTime || status == API_FAILURE) {
// system didn't successfull restart
processManager.setSystemState(oam::FAILED);
// exit thread
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons running", LOG_TYPE_CRITICAL);
log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
pthread_exit(0);
}
}
//wait until all modules are up after a system reboot
int i = 0;
for( ; i < waitTime ; i++ )
{
if ( startsystemthreadStop ) {
processManager.setSystemState(oam::MAN_OFFLINE);
// exit thread
log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
pthread_exit(0);
}
int status = API_SUCCESS;
for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType == "pm" )
PMSmoduletypeconfig = systemmoduletypeconfig.moduletypeconfig[i];
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
string moduleName = (*pt).DeviceName;
// Is Module UP
try{
bool degraded;
int opState = oam::ACTIVE;
oam.getModuleStatus(moduleName, opState, degraded);
if ( opState == oam::MAN_DISABLED )
//mark all processes running on module man-offline except ProcMon
processManager.setProcessStates(moduleName, oam::MAN_OFFLINE);
if ( opState == oam::AUTO_DISABLED)
//mark all processes running on module auto-offline
processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
if (opState == oam::INITIAL ||
opState == oam::DOWN) {
//a module is not up
status = API_MINOR_FAILURE;
break;
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
}
if ( status == API_MINOR_FAILURE) {
sleep(1);
break;
}
}
if ( status == API_SUCCESS)
//all modules are up
break;
}
if ( i == waitTime ) {
// system didn't successfull restart
processManager.setSystemState(oam::FAILED);
// exit thread
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all modules are UP", LOG_TYPE_CRITICAL);
pthread_exit(0);
}
//configure the PMS settings
processManager.updatePMSconfig();
if (HDFS)
//distribute config file
processManager.distributeConfigFile("system");
//now wait until all procmons are ACTIVE and validate rpms on each module
int status = API_SUCCESS;
int k = 0;
for( ; k < waitTime ; k++ )
{
if ( startsystemthreadStop ) {
processManager.setSystemState(oam::MAN_OFFLINE);
// exit thread
log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
pthread_exit(0);
}
status = API_SUCCESS;
for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
string moduleName = (*pt).DeviceName;
if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
(*pt).DisableState == oam::AUTODISABLEDSTATE )
continue;
int moduleOpState = oam::ACTIVE;
// check module state
try{
bool degraded;
oam.getModuleStatus(moduleName, moduleOpState, degraded);
// if up, set to MAN_INIT
if ( HDFS &&
(moduleOpState == oam::UP) )
{
processManager.setModuleState(moduleName, oam::MAN_INIT);
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
// Is Module's ProcMon ACTIVE and module status has been updated
int opState = oam::ACTIVE;
try {
ProcessStatus procstat;
oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
opState = procstat.ProcessOpState;
if (opState != oam::ACTIVE) {
//skip if Not ACTIVE
log.writeLog(__LINE__, "Module ProcMon not active yet: " + moduleName, LOG_TYPE_DEBUG);
status = API_MINOR_FAILURE;
continue;
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
status = API_MINOR_FAILURE;
continue;
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
status = API_MINOR_FAILURE;
continue;
}
//skip OAM Parent module
if ( moduleName == config.moduleName() )
continue;
//ProcMon ACTIVE, validate the software release and version of that module
ByteStream msg;
ByteStream::byte requestID = GETSOFTWAREINFO;
msg << requestID;
string moduleSoftwareInfo = processManager.sendMsgProcMon1( moduleName, msg, requestID );
if ( moduleSoftwareInfo == "FAILED" )
continue;
if ( localSoftwareInfo != moduleSoftwareInfo ) {
// module not running on same Calpont Software build as this local Director
// alarm and fail the module
log.writeLog(__LINE__, "Software Version mismatch : " + moduleName + "/" + localSoftwareInfo + "/" + moduleSoftwareInfo, LOG_TYPE_CRITICAL);
aManager.sendAlarmReport(moduleName.c_str(), INVALID_SW_VERSION, SET);
processManager.setModuleState(moduleName, oam::FAILED);
status = API_FAILURE;
break;
}
}
}
//get out of loop if all modules ACTTVE or MAN_OFFLINE
if( status == API_SUCCESS ) {
if ( systemStartupOffline == "y" ) {
processManager.setSystemState(oam::MAN_OFFLINE);
log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_DEBUG);
}
break;
}
else
{
//get out of loop if start module failed
if( status == API_FAILURE )
break;
//retry after sleeping for a bit
sleep(1);
}
}
if ( k == waitTime || status == API_FAILURE) {
// system didn't successfull restart
processManager.setSystemState(oam::FAILED);
// exit thread
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons ACTIVE", LOG_TYPE_CRITICAL);
log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
pthread_exit(0);
}
else
{
//distribute config file
// processManager.distributeConfigFile("system");
if ( systemStartupOffline == "n" && status == API_SUCCESS ) {
oam::DeviceNetworkList devicenetworklist;
pthread_t startsystemthread;
int status = pthread_create (&startsystemthread, NULL, (void*(*)(void*)) &startSystemThread, &devicenetworklist);
if ( status != 0 ) {
log.writeLog(__LINE__, "STARTSYSTEMS: pthread_create failed, return status = " + oam.itoa(status));
status = API_FAILURE;
}
if (status == 0)
{
pthread_join(startsystemthread, NULL);
status = startsystemthreadStatus;
}
if ( status != API_SUCCESS ) {
// system didn't successfull restart
processManager.setSystemState(oam::FAILED);
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, error returned from startSystemThread", LOG_TYPE_CRITICAL);
}
else
//distribute config file
processManager.distributeConfigFile("system");
}
}
// exit thread
log.writeLog(__LINE__, "startMgrProcessThread Exit", LOG_TYPE_DEBUG);
pthread_exit(0);
}
/*****************************************************************************************
* @brief pingDeviceThread
*
* purpose: perform ping testing on the devices within the system
*
*****************************************************************************************/
void pingDeviceThread()
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
ModuleTypeConfig moduletypeconfig;
ALARMManager aManager;
BRM::DBRM dbrm;
log.writeLog(__LINE__, "pingDeviceThread launched", LOG_TYPE_DEBUG);
string cmdLine = "ping ";
string cmdOption = " -c 1 -w 5 >> /dev/null";
string cmd;
string deviceIP;
//
// Get Module Info
//
SystemModuleTypeConfig systemModuleTypeConfig;
try{
oam.getSystemConfig(systemModuleTypeConfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
//Build the initial list, clear module state
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0 )
// skip of no modules configured
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
moduleInfoList.insert(moduleList::value_type((*pt).DeviceName, 0));
}
}
typedef map<string, int> nicList;
nicList nicInfoList;
//Build the initial list, clear NIC state
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0 )
// skip of no modules configured
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
{
nicInfoList.insert(moduleList::value_type((*pt1).HostName, 0));
}
}
}
//
// Get ext device info
//
SystemExtDeviceConfig systemextdeviceconfig;
try{
oam.getSystemConfig(systemextdeviceconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
typedef map<string, int> extDeviceList;
extDeviceList extDeviceInfoList;
//Build the initial list, clear ext device state
for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count; i++)
{
string name = systemextdeviceconfig.extdeviceconfig[i].Name;
extDeviceInfoList.insert(extDeviceList::value_type(name, 0));
}
//storage config
string DBRootStorageType;
try {
oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
}
catch(...) {}
log.writeLog(__LINE__, "pingDeviceThread: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
int rtnCode = 0;
Configuration configData;
SystemStatus systemstatus;
bool enableModuleMonitor = true;
bool LANOUTAGEACTIVE = false;
bool HOTSTANDBYACTIVE = false;
bool downActiveOAMModule = false;
// monitor module and external device loop
while (true)
{
//don't peform module test if system is MAN_OFFLINE or not getting status's
while(true)
{
SystemStatus systemstatus;
try {
oam.getSystemStatus(systemstatus);
if (systemstatus.SystemOpState == oam::MAN_OFFLINE )
sleep(5);
else
break;
}
catch(...)
{
sleep(5);
}
}
// Module Heartbeat period and failure count
int ModuleHeartbeatPeriod;
int ModuleHeartbeatCount;
try {
oam.getSystemConfig("ModuleHeartbeatPeriod", ModuleHeartbeatPeriod);
oam.getSystemConfig("ModuleHeartbeatCount", ModuleHeartbeatCount);
ModuleHeartbeatPeriod = ModuleHeartbeatPeriod * 10;
}
catch (exception& ex) {
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
sleep(5);
continue;
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
sleep(5);
continue;
}
// skip testing if Heartbeat is disable
if( ModuleHeartbeatPeriod <= 0 ) {
if ( enableModuleMonitor )
log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to disabled", LOG_TYPE_DEBUG);
enableModuleMonitor = false;
}
else
{
if ( !enableModuleMonitor && moduleInfoList.size() > 1 )
log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to enabled", LOG_TYPE_DEBUG);
enableModuleMonitor = true;
}
//single server system
if ( moduleInfoList.size() <= 1)
enableModuleMonitor = false;
//
// ping NIC
//
// read each time to catch updates
pthread_mutex_lock(&THREAD_LOCK);
systemModuleTypeConfig.moduletypeconfig.clear();
try{
oam.getSystemConfig(systemModuleTypeConfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
sleep(5);
continue;
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
sleep(5);
continue;
}
pthread_mutex_unlock(&THREAD_LOCK);
bool LANOUTAGESUPPORT = true;
bool LOCALNICDOWN = false;
if (enableModuleMonitor)
{
//test main local Ethernet interface status
for ( int count = 0 ; ; count ++)
{
int sockfd;
struct ifreq ifr;
sockfd = socket(AF_INET, SOCK_DGRAM, 0);
if(sockfd == -1){
log.writeLog(__LINE__, "Could not get socket to check", LOG_TYPE_ERROR);
close(sockfd);
break;
}
/* get interface name */
strncpy(ifr.ifr_name, iface_name.c_str(), IFNAMSIZ);
/* Read interface flags */
if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0) {
// not supported
close(sockfd);
break;
}
if (ifr.ifr_flags & IFF_UP) {
// ethernet port is up, continue on
close(sockfd);
break;
}
else
{
// ethernet port is down
log.writeLog(__LINE__, "NIC #1 is DOWN", LOG_TYPE_WARNING);
if ( count >= ModuleHeartbeatCount ) {
LOCALNICDOWN = true;
close(sockfd);
break;
}
else
sleep(5);
}
close(sockfd);
}
}
// if the NIC is down, go directly to LAN outage processing
if ( !LOCALNICDOWN )
{
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
string moduleName = (*pt).DeviceName;
string ipAddr;
string hostName;
int moduleState = oam::INITIAL;
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
{
ipAddr = (*pt1).IPAddr;
hostName = (*pt1).HostName;
if (enableModuleMonitor)
{
// perform ping test
cmd = cmdLine + ipAddr + cmdOption;
rtnCode = system(cmd.c_str());
rtnCode = WEXITSTATUS(rtnCode);
}
else
rtnCode = 0;
int currentNICState = oam::UP;
try {
oam.getNICStatus(hostName, currentNICState);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
switch (rtnCode) {
case 0:
//NIC Ack ping
if ( currentNICState != oam::UP ) {
processManager.setNICState(hostName, oam::UP);
if( ModuleHeartbeatPeriod > 0 )
//Clear an alarm
aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, CLEAR);
}
//set LAN Outage indicator to false since a module is responding
if ( moduleState == oam::INITIAL)
if ( moduleName != config.moduleName())
LANOUTAGESUPPORT = false;
//set Module State
if ( moduleState == oam::INITIAL || moduleState == oam::UP)
moduleState = oam::UP;
break;
default:
//NIC failed to respond to ping
if ( currentNICState != oam::DOWN ) {
log.writeLog(__LINE__, "NIC failed to respond to ping: " + hostName, LOG_TYPE_WARNING);
processManager.setNICState(hostName, oam::DOWN);
if( ModuleHeartbeatPeriod > 0 )
//Issue an alarm
aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, SET);
}
//set Module State
if ( moduleState == oam::INITIAL || moduleState == oam::DOWN)
moduleState = oam::DOWN;
else
// NIC 1 is up and NIC 2 is down
moduleState = oam::DEGRADED;
break;
}
}
// if disable, default module state to up
if (!enableModuleMonitor)
moduleState = oam::UP;
// moduleState coming out of the NIC monitoring loop
// UP - ALL NICs passed ping test
// DEGRADED - NIC 1 passed, NIC 2 failed ping test
// DOWN - NIC 1 or ALL NICs failed ping test
int opState = oam::ACTIVE;
try{
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
// skip module check if not inuse or in FAILED state
if (opState == oam::MAN_OFFLINE ||
opState == oam::MAN_DISABLED ||
opState == oam::FAILED)
continue;
//fast track a restart of a downed failover modules
if ( gdownActiveOAMModule == moduleName ) {
moduleInfoList[moduleName] = ModuleHeartbeatCount-1;
gdownActiveOAMModule.clear();
moduleState = oam::DOWN;
downActiveOAMModule = true;
}
vector<string>::iterator pt2 = downModuleList.begin();
for( ; pt2 != downModuleList.end() ; pt2++)
{
if ( *pt2 == moduleName ) {
moduleInfoList[moduleName] = ModuleHeartbeatCount-1;
moduleState = oam::DOWN;
downModuleList.erase(pt2);
break;
}
}
switch (moduleState){
case oam::DEGRADED:
// do nothing for now
break;
case oam::UP:
// comment out, only come up when both nic are up, if not the pms list will not have the second nic in there
// case oam::DEGRADED:
if (opState == oam::DOWN || opState == oam::INITIAL
|| opState == oam::AUTO_DISABLED)
{
//Set the module state to up
processManager.setModuleState(moduleName, moduleState);
}
if ( moduleName == config.OAMStandbyName() )
HOTSTANDBYACTIVE = true;
// if LAN OUTAGE ACTIVE, skip module checks
if (LANOUTAGEACTIVE)
break;
try {
oam.getSystemConfig("MySQLRep", MySQLRep);
}
catch(...) {
MySQLRep = "n";
}
if (moduleInfoList[moduleName] >= ModuleHeartbeatCount ||
opState == oam::DOWN || opState == oam::AUTO_DISABLED)
{
log.writeLog(__LINE__, "Module alive, bring it back online: " + moduleName, LOG_TYPE_DEBUG);
string PrimaryUMModuleName = config.moduleName();
try {
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
}
catch(...) {}
bool busy = false;
for ( int retry = 0 ; retry < 20 ; retry++ )
{
busy = false;
ProcessStatus DMLprocessstatus;
try {
oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
if ( DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) {
log.writeLog(__LINE__, "DMLProc in BUSY_INIT, skip bringing module online " + moduleName, LOG_TYPE_DEBUG);
busy = true;
sleep(5);
}
else
break;
}
catch(...)
{
sleep(5);
}
}
if (busy)
break;
//set query system state not ready
BRM::DBRM dbrm;
dbrm.setSystemQueryReady(false);
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport");
// halt the dbrm
oam.dbrmctl("halt");
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
//send notification
oam.sendDeviceNotification(config.moduleName(), MODULE_UP);
int status;
DBRootConfigList dbrootConfigList;
// if shared pm, move dbroots back to pm
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
//restart to get the versionbuffer files closed so it can be unmounted
processManager.restartProcessType("WriteEngineServer", moduleName);
//set module to enable state
processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
downActiveOAMModule = false;
int retry;
for ( retry = 0 ; retry < 5 ; retry++ )
{
try {
log.writeLog(__LINE__, "Call autoUnMovePmDbroot", LOG_TYPE_DEBUG);
oam.autoUnMovePmDbroot(moduleName);
//check if any dbroots got assigned back to this module
// they could not be moved if there were busy on other pms
try
{
int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE,MAX_MODULE_ID_SIZE).c_str());
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
if ( dbrootConfigList.size() == 0 )
{
// no dbroots, fail module
log.writeLog(__LINE__, "autoUnMovePmDbroot left no dbroots mounted, failing module restart: " + moduleName, LOG_TYPE_WARNING);
//Issue an alarm
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
//set module to disable state
processManager.disableModule(moduleName, true);
//call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
// resume the dbrm
oam.dbrmctl("resume");
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
//clear count
moduleInfoList[moduleName] = 0;
processManager.setSystemState(oam::ACTIVE);
//set query system state ready
processManager.setQuerySystemState(true);
break;
}
}
catch(...)
{}
log.writeLog(__LINE__, "autoUnMovePmDbroot success", LOG_TYPE_DEBUG);
//distribute config file
processManager.distributeConfigFile("system");
break;
}
catch(...)
{
sleep(5);
}
}
if ( retry == 5 )
{
log.writeLog(__LINE__, "autoUnMovePmDbroot: Failed. Fail Module", LOG_TYPE_WARNING);
//Issue an alarm
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
//set module to disable state
processManager.disableModule(moduleName, true);
//call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
// resume the dbrm
oam.dbrmctl("resume");
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
//clear count
moduleInfoList[moduleName] = 0;
processManager.setSystemState(oam::ACTIVE);
//set query system state ready
processManager.setQuerySystemState(true);
break;
}
}
else
//set module to enable state
processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
//restart module processes
int retry = 0;
int ModuleProcMonWaitCount = 12;
try{
oam.getSystemConfig("ModuleProcMonWaitCount", ModuleProcMonWaitCount);
}
catch(...) {
ModuleProcMonWaitCount = 12;
}
for ( ; retry < ModuleProcMonWaitCount ; retry ++ )
{
// first, wait until module's ProcMon is ACTIVE
int opState = oam::ACTIVE;
try {
ProcessStatus procstat;
oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
opState = procstat.ProcessOpState;
if (opState != oam::ACTIVE) {
log.writeLog(__LINE__, "Waiting for Module ProcMon to go ACTIVE: " + moduleName, LOG_TYPE_DEBUG);
sleep(5);
continue;
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
sleep(5);
continue;
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
sleep(5);
continue;
}
//check and assign Elastic IP Address
int AmazonElasticIPCount = 0;
try{
oam.getSystemConfig("AmazonElasticIPCount", AmazonElasticIPCount);
}
catch(...) {
AmazonElasticIPCount = 0;
}
for ( int id = 1 ; id < AmazonElasticIPCount+1 ; id++ )
{
string AmazonElasticModule = "AmazonElasticModule" + oam.itoa(id);
string ELmoduleName;
try{
oam.getSystemConfig(AmazonElasticModule, ELmoduleName);
}
catch(...) {}
if ( ELmoduleName == moduleName )
{ //match found assign Elastic IP Address
string AmazonElasticIPAddr = "AmazonElasticIPAddr" + oam.itoa(id);
string ELIPaddress;
try{
oam.getSystemConfig(AmazonElasticIPAddr, ELIPaddress);
}
catch(...) {}
try{
oam.assignElasticIP(hostName, ELIPaddress);
log.writeLog(__LINE__, "Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_DEBUG);
}
catch(...) {
log.writeLog(__LINE__, "Failed to Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_ERROR);
}
break;
}
}
// next, stopmodule to start up clean
status = processManager.stopModule(moduleName, oam::FORCEFUL, false);
if ( status == oam::API_SUCCESS ) {
string newStandbyModule = processManager.getStandbyModule();
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
processManager.setStandbyModule(newStandbyModule);
}
else
{
if ( newStandbyModule == "NONE")
if ( moduleName.substr(0,MAX_MODULE_TYPE_SIZE) == "pm" )
processManager.setStandbyModule(moduleName);
}
DBRootConfigList::iterator pt = dbrootConfigList.begin();
if (( DBRootStorageType == "DataRedundancy") && (*pt == 1))
{
log.writeLog(__LINE__, "stopModule, " + config.moduleName(), LOG_TYPE_DEBUG);
processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
processManager.switchParentOAMModule(moduleName);
processManager.stopProcess(config.moduleName(), "ProcessManager", oam::FORCEFUL, true);
break;
}
}
else {
//stop failed, retry
log.writeLog(__LINE__, "stopModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
sleep(5);
continue;
}
// next, startmodule
status = processManager.startModule(moduleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
if ( status == oam::API_SUCCESS )
break;
log.writeLog(__LINE__, "startModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
//sleep and retry all over again
sleep (5);
} // end of the retry loop
if ( retry < ModuleProcMonWaitCount )
{ // module successfully started
//call dbrm control, need to resume before start so the getdbrmfiles halt doesn't hang
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
// resume the dbrm
oam.dbrmctl("resume");
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
//distribute config file
processManager.distributeConfigFile("system");
sleep(1);
// if a PM module was started successfully, restart ACTIVE DBRM(s), ExeMgr(s) / mysqld
if( moduleName.find("pm") == 0 ) {
processManager.restartProcessType("DBRMControllerNode", moduleName);
processManager.restartProcessType("DBRMWorkerNode");
processManager.stopProcessType("DDLProc");
processManager.stopProcessType("DMLProc");
processManager.stopProcessType("ExeMgr");
processManager.restartProcessType("PrimProc");
sleep(1);
processManager.restartProcessType("ExeMgr");
}
string moduleType = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);
if ( MySQLRep == "y" ) {
if ( moduleType == "um" ||
( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM ) ||
( moduleType == "pm" && PMwithUM == "y") ) {
//setup MySQL Replication for started modules
log.writeLog(__LINE__, "Setup MySQL Replication for module recovering from outage on " + moduleName, LOG_TYPE_DEBUG);
DeviceNetworkList devicenetworklist;
DeviceNetworkConfig devicenetworkconfig;
devicenetworkconfig.DeviceName = moduleName;
devicenetworklist.push_back(devicenetworkconfig);
processManager.setMySQLReplication(devicenetworklist);
}
}
else
{
if( moduleName.find("pm") == 0 ) {
processManager.restartProcessType("mysql", moduleName);
sleep(1);
}
}
// if a PM module was started successfully, DMLProc/DDLProc
if( moduleName.find("pm") == 0 ) {
processManager.restartProcessType("WriteEngineServer");
sleep(1);
processManager.restartProcessType("DDLProc");
sleep(1);
processManager.restartProcessType("DMLProc");
}
//enable query stats
dbrm.setSystemQueryReady(true);
//set query system state ready
processManager.setQuerySystemState(true);
processManager.setSystemState(oam::ACTIVE);
//reset standby module
string newStandbyModule = processManager.getStandbyModule();
//send message to start new Standby Process-Manager, if needed
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
processManager.setStandbyModule(newStandbyModule);
}
else
{
Config* sysConfig = Config::makeConfig();
// clear Standby OAM Module
sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
//update Calpont Config table
try {
sysConfig->write();
}
catch(...)
{
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
if ( moduletypeconfig.RunType == SIMPLEX ) {
//start SIMPLEX runtype processes on a SIMPLEX runtype module
string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);
DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
for( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
{
string launchModuleName = (*pt).DeviceName;
string launchModuletype = launchModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
if ( moduletype != launchModuletype )
continue;
//skip if active pm module (local module)
if ( launchModuleName == config.moduleName() )
continue;
//check if module is active before starting any SIMPLEX STANDBY apps
try{
int launchopState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(launchModuleName, launchopState, degraded);
if (launchopState != oam::ACTIVE && launchopState != oam::STANDBY ) {
continue;
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
int status;
log.writeLog(__LINE__, "Starting up STANDBY process on module " + launchModuleName, LOG_TYPE_DEBUG);
for ( int j = 0 ; j < 20 ; j ++ )
{
status = processManager.startModule(launchModuleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
if ( status == API_SUCCESS)
break;
}
log.writeLog(__LINE__, "pingDeviceThread: ACK received from '" + launchModuleName + "' Process-Monitor, return status = " + oam.itoa(status), LOG_TYPE_DEBUG);
}
}
//clear count
moduleInfoList[moduleName] = 0;
}
else
{ // module failed to restart, place back in disabled state
//Log failure, issue alarm, set moduleOpState
Configuration config;
//Issue an alarm
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
// if pm, move dbroots back to pm
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
//move dbroots to other modules
try {
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
oam.autoMovePmDbroot(moduleName);
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
//distribute config file
processManager.distributeConfigFile("system");
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
//set module to disable state
processManager.disableModule(moduleName, true);
//call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
// resume the dbrm
oam.dbrmctl("resume");
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
if ( amazon )
processManager.setSystemState(oam::FAILED);
else
processManager.setSystemState(oam::ACTIVE);
//enable query stats
dbrm.setSystemQueryReady(true);
//set query system state ready
processManager.setQuerySystemState(true);
//clear count
moduleInfoList[moduleName] = 0;
}
}
break;
case oam::DOWN:
// if initial state, skip
if (opState == oam::INITIAL)
break;
// if disabled and not amazon, skip
if (opState == oam::AUTO_DISABLED && !amazon)
break;
log.writeLog(__LINE__, "module failed to respond to pings: " + moduleName, LOG_TYPE_WARNING);
//bump module ping failure counter
moduleInfoList[moduleName]++;
if ( moduleName == config.OAMStandbyName() )
HOTSTANDBYACTIVE = false;
if (moduleInfoList[moduleName] == ModuleHeartbeatCount)
{
// if LAN OUTAGE ACTIVE,skip module checks
if (LANOUTAGEACTIVE)
break;
//Log failure, issue alarm, set moduleOpState
Configuration config;
log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL);
//set query system state not ready
BRM::DBRM dbrm;
dbrm.setSystemQueryReady(false);
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport");
// halt the dbrm
oam.dbrmctl("halt");
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
processManager.setSystemState(oam::BUSY_INIT);
//string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1";
//system(cmd.c_str());
//send notification
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
//Issue an alarm
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
//mark all processes running on module auto-offline
processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
//set module to disable state
processManager.disableModule(moduleName, false);
//call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
// if pm, move dbroots to other pms
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
try {
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
oam.autoMovePmDbroot(moduleName);
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
//distribute config file
processManager.distributeConfigFile("system");
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
// if Cloud Instance
// state = running, then instance is rebooting, monitor for recovery
// state = stopped, then try starting, if fail, remove/addmodule to launch new instance
// state = terminate or nothing, remove/addmodule to launch new instance
if ( amazon ) {
if ( moduleName.find("um") == 0 )
{
// resume the dbrm
oam.dbrmctl("resume");
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
//set recycle process
processManager.recycleProcess(moduleName);
}
// return values = 'ip address' for running or rebooting, stopped or terminated
string currentIPAddr = oam.getEC2InstanceIpAddress(hostName);
if ( currentIPAddr == "terminated")
{
//check if down module was Standby OAM, if so find another one
if ( moduleName == config.OAMStandbyName() ) {
//set down module ProcessManager to AOS
processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
//get another standby OAM module
string newStandbyModule = processManager.getStandbyModule();
//send message to start new Standby Process-Manager, if needed
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
processManager.setStandbyModule(newStandbyModule);
}
else
{
Config* sysConfig = Config::makeConfig();
// clear Standby OAM Module
sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
//update Calpont Config table
try {
sysConfig->write();
}
catch(...)
{
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
}
// remove/addmodule
log.writeLog(__LINE__, "Instance terminated, re-launching: " + hostName, LOG_TYPE_DEBUG);
// if pm, get assigned dbroots and deattach EBS
DBRootConfigList dbrootConfigList;
int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE,MAX_MODULE_ID_SIZE).c_str());
if( moduleName.find("pm") == 0 ) {
//get dbroots ids for to PM
try
{
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
}
catch (exception& e)
{
log.writeLog(__LINE__, "ERROR: getPmDbrootConfig error: " + moduleName, LOG_TYPE_DEBUG);
}
}
DeviceNetworkList devicenetworklist;
DeviceNetworkConfig devicenetworkconfig;
HostConfig hostconfig;
devicenetworkconfig.DeviceName = moduleName;
if (cloud == "amazon-vpc")
hostconfig.IPAddr = ipAddr;
else
hostconfig.IPAddr = oam::UnassignedName;
hostconfig.HostName = oam::UnassignedName;
hostconfig.NicID = 1;
devicenetworkconfig.hostConfigList.push_back(hostconfig);
devicenetworklist.push_back(devicenetworkconfig);
bool pass = true;
for ( int addRetry = 0 ; addRetry < 5 ; addRetry++ )
{
//remove module
int ret = processManager.removeModule(devicenetworklist, false);
if ( ret != oam::API_SUCCESS )
{
log.writeLog(__LINE__, "Instance failed to remove, retry: " + moduleName, LOG_TYPE_DEBUG);
}
else
{
pass = true;
log.writeLog(__LINE__, "Instance removed, module: " + moduleName, LOG_TYPE_DEBUG);
}
// add module
string password = oam::UnassignedName;
try
{
oam.getSystemConfig("rpw", password);
}
catch(...)
{
password = oam::UnassignedName;
}
ret = processManager.addModule(devicenetworklist, password, false);
if ( ret != oam::API_SUCCESS )
{
log.writeLog(__LINE__, "Instance failed to add, retry: " + moduleName, LOG_TYPE_CRITICAL);
pass = false;
}
else
{
pass = true;
log.writeLog(__LINE__, "New Instance Launched for " + moduleName, LOG_TYPE_DEBUG);
// if pm, config and attach EBS
if( moduleName.find("pm") == 0 && !dbrootConfigList.empty() ) {
try
{
oam.setPmDbrootConfig(moduleID, dbrootConfigList);
std::vector<std::string> dbrootList;
DBRootConfigList::iterator pt1 = dbrootConfigList.begin();
for( ; pt1 != dbrootConfigList.end() ; pt1++)
{
dbrootList.push_back(oam.itoa(*pt1));
}
//attach EBS
try
{
oam.amazonReattach(moduleName, dbrootList, true);
pass = true;
break;
}
catch (exception& e)
{
log.writeLog(__LINE__, "ERROR: amazonReattach error on " + moduleName, LOG_TYPE_ERROR);
pass = false;
}
}
catch (exception& e)
{
log.writeLog(__LINE__, "ERROR: setPmDbrootConfig error on " + moduleName, LOG_TYPE_ERROR);
pass = false;
}
}
else
{
pass = true;
break;
}
}
if (pass)
break;
}
if (pass)
//Set the module state so it will be brought back up
processManager.setModuleState(moduleName, oam::AUTO_DISABLED);
else
{
//new instance failed to get added
//remove and try auto moving dbroots to other pms
processManager.removeModule(devicenetworklist, false);
// if pm, move dbroots to other pms
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
try {
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
oam.autoMovePmDbroot(moduleName);
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
//distribute config file
processManager.distributeConfigFile("system");
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
//set recycle process
processManager.recycleProcess(moduleName);
//enable query stats
dbrm.setSystemQueryReady(true);
//set query system state ready
processManager.setQuerySystemState(true);
sleep(2);
processManager.setSystemState(oam::ACTIVE);
}
}
if ( moduleName.find("pm") == 0 )
{
// resume the dbrm
oam.dbrmctl("resume");
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
//enable query stats
dbrm.setSystemQueryReady(true);
//set query system state ready
processManager.setQuerySystemState(true);
}
}
else
{ // non-amazon
// resume the dbrm
oam.dbrmctl("resume");
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
//set recycle process
processManager.recycleProcess(moduleName);
//enable query stats
dbrm.setSystemQueryReady(true);
//set query system state ready
processManager.setQuerySystemState(true);
sleep(2);
//check if down module was Standby OAM, if so find another one
if ( moduleName == config.OAMStandbyName() ) {
//set down module ProcessManager to AOS
processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
//get another standby OAM module
string newStandbyModule = processManager.getStandbyModule();
//send message to start new Standby Process-Manager, if needed
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
processManager.setStandbyModule(newStandbyModule);
}
else
{
Config* sysConfig = Config::makeConfig();
// clear Standby OAM Module
sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
//update Calpont Config table
try {
sysConfig->write();
}
catch(...)
{
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
}
}
//start SIMPLEX runtype processes on a SIMPLEX runtype module
string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);
try{
oam.getSystemConfig(moduletype, moduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
if ( moduletypeconfig.RunType == SIMPLEX ) {
DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
for( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
{
string launchModuleName = (*pt).DeviceName;
string launchModuletype = launchModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
if ( moduletype != launchModuletype )
continue;
//skip if active pm module (local module)
if ( launchModuleName == config.moduleName() )
continue;
if( moduleName != launchModuleName ) {
//check if module is active before starting any SIMPLEX STANDBY apps
try{
int launchopState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(launchModuleName, launchopState, degraded);
if (launchopState != oam::ACTIVE && launchopState != oam::STANDBY ) {
continue;
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
int status;
log.writeLog(__LINE__, "Starting up STANDBY process on module " + launchModuleName, LOG_TYPE_DEBUG);
for ( int j = 0 ; j < 20 ; j ++ )
{
status = processManager.startModule(launchModuleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
if ( status == API_SUCCESS)
break;
}
log.writeLog(__LINE__, "pingDeviceThread: ACK received from '" + launchModuleName + "' Process-Monitor, return status = " + oam.itoa(status), LOG_TYPE_DEBUG);
}
}
}
}
break;
}
}
} //end of for loop
}
// check and take action if LAN outage is flagged
if (LANOUTAGESUPPORT && !LANOUTAGEACTIVE && LOCALNICDOWN)
{
log.writeLog(__LINE__, "LAN Failure detected", LOG_TYPE_CRITICAL);
oam.sendDeviceNotification(config.moduleName(), START_PM_MASTER_DOWN);
LANOUTAGEACTIVE = true;
log.writeLog(__LINE__, "Kill any cpimport running", LOG_TYPE_INFO);
system("pkill -9 cpimport");
//request stop of local module
int status = processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
if ( status != oam::API_SUCCESS )
log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
//stop snmptrap daemon process
processManager.stopProcess(config.moduleName(), "SNMPTrapDaemon", oam::FORCEFUL, false);
}
else
{
if ( LANOUTAGEACTIVE && HOTSTANDBYACTIVE && !LOCALNICDOWN)
{
// pthread_mutex_unlock(&THREAD_LOCK);
LANOUTAGEACTIVE = false;
log.writeLog(__LINE__, "LAN Failure recovery");
//check if this module still is active according to last know hot standby module
ByteStream msg;
ByteStream::byte requestID = GETPARENTOAMMODULE;
msg << requestID;
string parentOAMModule = processManager.sendMsgProcMon1( config.OAMStandbyName(), msg, requestID );
if ( parentOAMModule == config.moduleName() ||
parentOAMModule == "FAILED" ) {
//srestart to these guys incase they marked any PrimProcs offline
processManager.restartProcessType("ExeMgr");
processManager.reinitProcessType("DDLProc");
processManager.reinitProcessType("DMLProc");
}
else
{
//send message to local Process Monitor to run coldStandby
ByteStream msg;
ByteStream::byte requestID = OAMPARENTCOLD;
msg << requestID;
int returnStatus = processManager.sendMsgProcMon( config.moduleName(), msg, requestID );
log.writeLog(__LINE__, "sent OAM Parent Cold message to local Process-Monitor, status: " + oam.itoa(returnStatus) , LOG_TYPE_DEBUG);
//request stop of local module
int status = processManager.stopModule(config.moduleName(), oam::INSTALL, false);
if ( status != oam::API_SUCCESS )
log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
}
}
}
//
// ping ext devices
//
// read each time to catch updates
systemextdeviceconfig.extdeviceconfig.clear();
try{
oam.getSystemConfig(systemextdeviceconfig);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count ; i++ )
{
string extDeviceName = systemextdeviceconfig.extdeviceconfig[i].Name;
string ipAddr = systemextdeviceconfig.extdeviceconfig[i].IPAddr;
int opState = oam::ACTIVE;
try{
oam.getExtDeviceStatus(extDeviceName, opState);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
cmd = cmdLine + ipAddr + cmdOption;
rtnCode = system(cmd.c_str());
switch (WEXITSTATUS(rtnCode)){
case 0:
//Switch Ack ping, Check whether alarm have been issued
if (extDeviceInfoList[extDeviceName] >= ModuleHeartbeatCount)
{
aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, CLEAR);
}
extDeviceInfoList[extDeviceName] = 0;
if (opState != oam::ACTIVE)
{
//Set the switch state to active
processManager.setExtdeviceState(extDeviceName, oam::ACTIVE);
}
break;
default:
//extDevice failed to respond to ping
log.writeLog(__LINE__, "extDevice failed to respond to ping: " + extDeviceName, LOG_TYPE_WARNING);
extDeviceInfoList[extDeviceName]++;
if (extDeviceInfoList[extDeviceName] == ModuleHeartbeatCount)
{
//Log failure, issue alarm, set extDeviceOpState
log.writeLog(__LINE__, "extDevice is down: " + extDeviceName, LOG_TYPE_CRITICAL);
processManager.setExtdeviceState(extDeviceName, oam::AUTO_OFFLINE);
//Issue an alarm
aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, SET);
}
break;
}
} //end of for loop
// double check to make sure the system status is ACTIVE if all module status's are ACTIVE
try {
if (dbrm.isDBRMReady())
{
int systemReady = dbrm.getSystemReady(); // -1 == fail, 0 == not ready, 1 == ready
if (systemReady > 0)
{
bool updateActive = true;
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
string moduleName = (*pt).DeviceName;
int opState = oam::ACTIVE;
try
{
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if (opState == oam::ACTIVE ||
opState == oam::DEGRADED ||
opState == oam::MAN_DISABLED ||
opState == oam::AUTO_DISABLED )
continue;
updateActive = false;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
if (updateActive)
{
// log.writeLog(__LINE__, "Modules are ACTIVE, check system state ", LOG_TYPE_DEBUG);
string PrimaryUMModuleName;
try {
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
}
catch(...) {}
// log.writeLog(__LINE__, "PrimaryUMModuleName = " + PrimaryUMModuleName, LOG_TYPE_DEBUG);
ProcessStatus DMLprocessstatus;
try {
oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
// log.writeLog(__LINE__, "DMLPROC STATUS = " + oamState[DMLprocessstatus.ProcessOpState], LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE) {
//set the system status if a change has occurred
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
if ( systemstatus.SystemOpState != oam::ACTIVE )
{
processManager.setSystemState(oam::ACTIVE);
}
}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) {
//set the system status if a change has occurred
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
if ( systemstatus.SystemOpState != oam::BUSY_INIT )
{
processManager.setSystemState(oam::BUSY_INIT);
}
}
}
}
}
}
catch (...)
{
}
//go sleep for a bit
int sleepTime = ModuleHeartbeatPeriod / 10;
if (!enableModuleMonitor && systemextdeviceconfig.Count == 0)
sleep(60);
else
sleep(sleepTime);
}
return;
}
/******************************************************************************************
* @brief hdfsActiveAlarmsPushingThread
*
* purpose: Push an image of ActiveAlarms to HDFS for non-OAMParentModule to view.
*
******************************************************************************************/
static void hdfsActiveAlarmsPushingThread()
{
boost::filesystem::path filePath(ACTIVE_ALARM_FILE);
boost::filesystem::path dirPath = filePath.parent_path();
string dirName = boost::filesystem::canonical(dirPath).string();
if (boost::filesystem::exists("/etc/pdsh/machines"))
{
string cpCmd = "pdcp -a -x " + localHostName + " " + ACTIVE_ALARM_FILE + " " + dirName +
" > /dev/null 2>&1";
string rmCmd = "pdsh -a -x " + localHostName + " rm -f " + ACTIVE_ALARM_FILE +
" > /dev/null 2>&1";
while(1)
{
if (boost::filesystem::exists(filePath))
system(cpCmd.c_str());
else
system(rmCmd.c_str());
sleep(ACTIVE_ALARMS_PUSHING_INTERVAL);
}
}
return;
}
/*****************************************************************************************
* @brief Processor Heartbeat Msg Thread
*
* purpose: Read Heartbeat Messages from other Processes
*
*****************************************************************************************/
/*
static void heartbeatMsgThread()
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
//
//waiting for request
//
ByteStream receivedMSG;
IOSocket fIos;
for (;;)
{
try
{
MessageQueueServer procmgr("ProcHeartbeatControl");
for (;;)
{
try
{
fIos = procmgr.accept();
receivedMSG = fIos.read();
if (receivedMSG.length() > 0) {
processManager.processMSG(fIos, receivedMSG);
}
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: Caught unknown exception!", LOG_TYPE_ERROR);
}
fIos.close();
}
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + error, LOG_TYPE_ERROR);
// takes 2 - 4 minutes to free sockets, sleep and retry
sleep(60);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcHeartbeatControl: Caught unknown exception!", LOG_TYPE_ERROR);
// takes 2 - 4 minutes to free sockets, sleep and retry
sleep(60);
}
}
}
*/
/*****************************************************************************************
* @brief Processor Heartbeat Thread
*
* purpose: Check Heartbeat Messages from other Processes
*
*****************************************************************************************/
/*
static void heartbeatProcessThread()
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
ALARMManager aManager;
int processHeartbeatPeriod=60; //default value to 60 seconds
log.writeLog(__LINE__, "Thread Launched: Process Heartbeat!!!");
while (true)
{
//
// check and report on register process not sending heartbeats
//
// get process heartbeat period
try {
oam.getSystemConfig("ProcessHeartbeatPeriod", processHeartbeatPeriod);
processHeartbeatPeriod = processHeartbeatPeriod * 60;
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
Oam oam;
log.writeLog(__LINE__, "Process Heartbeat check started, Heartbeat period is " + oam.itoa(processHeartbeatPeriod), LOG_TYPE_DEBUG);
sleep(processHeartbeatPeriod);
HeartBeatProcList::iterator list = hbproclist.begin();
for( ; list != hbproclist.end() ; list++)
{
string moduleName = (*list).ModuleName;
string processName = (*list).ProcessName;
int id = (*list).ID;
// get Process state and only check if ACTIVE
ProcessStatus procstat;
try{
oam.getProcessStatus(processName, moduleName, procstat);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
procstat.ProcessOpState = oam::MAN_OFFLINE;
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
procstat.ProcessOpState = oam::MAN_OFFLINE;
}
if ( procstat.ProcessOpState == oam::ACTIVE ) {
// skip testing if Heartbeat is disabled
if( processHeartbeatPeriod != -1 ) {
//log.writeLog(__LINE__, "Heartbeat: Process being monitored: " + moduleName + " / " + processName + " / " + oam.itoa(id), LOG_TYPE_DEBUG);
if ( !(*list).receiveFlag ) {
// got a missing heartbeat, request a restart on the process
log.writeLog(__LINE__, "heartbeatProcessThread: Failure from process " + moduleName + " / " + processName+ " / " + oam.itoa(id), LOG_TYPE_WARNING);
oam.restartProcess(moduleName, processName, FORCEFUL, ACK_NO);
(*list).receiveFlag = true;
// reset all other entries for this process
HeartBeatProcList::iterator list1 = hbproclist.begin();
for( ; list1 != hbproclist.end() ; list1++)
{
string moduleName1 = (*list1).ModuleName;
string processName1 = (*list1).ProcessName;
if ( moduleName == moduleName1 && processName == processName1 )
(*list1).receiveFlag = true;
}
}
else
// reset receive heartbeat indication flag
(*list).receiveFlag = false;
}
else
// heartbeat is disabled
(*list).receiveFlag=true;
}
else
{ // registered process not active, remove from list
hbproclist.erase(list);
log.writeLog(__LINE__, "Removing OOS Process from Heartbeat Monitor list: " + moduleName + " / " + processName+ " / " + oam.itoa(id));
break;
}
}
} // end of while forever loop
}
*/
// vim:ts=4 sw=4: