mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-05-30 00:27:08 +03:00
2741 lines
84 KiB
C++
2741 lines
84 KiB
C++
/* Copyright (C) 2014 InfiniDB, Inc.
|
|
Copyright (C) 2016 MariaDB Corporation
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; version 2 of
|
|
the License.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
MA 02110-1301, USA. */
|
|
|
|
/*****************************************************************************************
|
|
* $Id: main.cpp 2203 2013-07-08 16:50:51Z bpaul $
|
|
*
|
|
*****************************************************************************************/
|
|
|
|
|
|
#include <clocale>
|
|
|
|
#include <boost/filesystem.hpp>
|
|
|
|
#include "processmanager.h"
|
|
#include "installdir.h"
|
|
|
|
#include "utils_utf8.h"
|
|
|
|
using namespace std;
|
|
using namespace logging;
|
|
using namespace messageqcpp;
|
|
using namespace processmanager;
|
|
using namespace oam;
|
|
using namespace alarmmanager;
|
|
using namespace threadpool;
|
|
//using namespace procheartbeat;
|
|
using namespace config;
|
|
|
|
bool runStandby = false;
|
|
bool runCold = false;
|
|
string systemName = "system";
|
|
string iface_name;
|
|
string cloud;
|
|
bool amazon = false;
|
|
string PMInstanceType;
|
|
string UMInstanceType;
|
|
string AmazonPMFailover = "y";
|
|
string DataRedundancyConfig = "n";
|
|
bool rootUser = true;
|
|
string USER = "root";
|
|
bool HDFS = false;
|
|
string localHostName;
|
|
string PMwithUM = "n";
|
|
string MySQLRep = "n";
|
|
|
|
// pushing the ACTIVE_ALARMS_FILE to all nodes every 10 seconds.
|
|
const int ACTIVE_ALARMS_PUSHING_INTERVAL = 10;
|
|
|
|
typedef map<string, int> moduleList;
|
|
moduleList moduleInfoList;
|
|
|
|
extern HeartBeatProcList hbproclist;
|
|
extern pthread_mutex_t THREAD_LOCK;
|
|
extern bool startsystemthreadStop;
|
|
extern string gdownActiveOAMModule;
|
|
extern int startsystemthreadStatus;
|
|
extern vector<string> downModuleList;
|
|
extern bool startFailOver;
|
|
extern bool gOAMParentModuleFlag;
|
|
|
|
static void messageThread(Configuration config);
|
|
static void sigUser1Handler(int sig);
|
|
static void startMgrProcessThread();
|
|
static void hdfsActiveAlarmsPushingThread();
|
|
//static void pingDeviceThread();
|
|
//static void heartbeatProcessThread();
|
|
//static void heartbeatMsgThread();
|
|
|
|
/*****************************************************************************************
|
|
* @brief main
|
|
*
|
|
* purpose: request launching of Mgr controlled processes and wait for incoming messages
|
|
*
|
|
*****************************************************************************************/
|
|
int main(int argc, char **argv)
|
|
{
|
|
#ifndef _MSC_VER
|
|
setuid(0); // set effective ID to root; ignore return status
|
|
#endif
|
|
// get and set locale language
|
|
string systemLang = "C";
|
|
|
|
setlocale(LC_ALL, systemLang.c_str());
|
|
|
|
Oam oam;
|
|
|
|
//check if root-user
|
|
int user;
|
|
user = getuid();
|
|
if (user != 0)
|
|
rootUser = false;
|
|
|
|
char* p= getenv("USER");
|
|
if (p && *p)
|
|
USER = p;
|
|
|
|
ProcessLog log;
|
|
Configuration config;
|
|
ProcessManager processManager(config, log);
|
|
ALARMManager aManager;
|
|
|
|
log.writeLog(__LINE__, " ");
|
|
log.writeLog(__LINE__, "**********Process Manager Started**********");
|
|
|
|
//Ignore SIGPIPE signals
|
|
signal(SIGPIPE, SIG_IGN);
|
|
|
|
//Ignore SIGHUP signals
|
|
signal(SIGHUP, SIG_IGN);
|
|
|
|
//create SIGUSR1 handler to get configuration updates
|
|
signal(SIGUSR1, sigUser1Handler);
|
|
|
|
// Get System Name
|
|
try{
|
|
oam.getSystemConfig("SystemName", systemName);
|
|
}
|
|
catch(...)
|
|
{}
|
|
|
|
//get cloud setting
|
|
try {
|
|
oam.getSystemConfig( "Cloud", cloud);
|
|
}
|
|
catch(...) {}
|
|
|
|
//get amazon parameters
|
|
if ( cloud == "amazon-ec2" || cloud == "amazon-vpc" )
|
|
{
|
|
oam.getSystemConfig("PMInstanceType", PMInstanceType);
|
|
oam.getSystemConfig("UMInstanceType", UMInstanceType);
|
|
oam.getSystemConfig("AmazonPMFailover", AmazonPMFailover);
|
|
|
|
amazon = true;
|
|
}
|
|
|
|
//get gluster config
|
|
try {
|
|
oam.getSystemConfig( "DataRedundancyConfig", DataRedundancyConfig);
|
|
}
|
|
catch(...)
|
|
{
|
|
DataRedundancyConfig = "n";
|
|
}
|
|
|
|
//hdfs / hadoop config
|
|
string DBRootStorageType;
|
|
try {
|
|
oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
|
|
}
|
|
catch(...) {}
|
|
|
|
if ( DBRootStorageType == "hdfs" )
|
|
HDFS = true;
|
|
|
|
log.writeLog(__LINE__, "Main: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
|
|
|
|
//PMwithUM config
|
|
try {
|
|
oam.getSystemConfig( "PMwithUM", PMwithUM);
|
|
}
|
|
catch(...) {
|
|
PMwithUM = "n";
|
|
}
|
|
|
|
try {
|
|
oam.getSystemConfig("MySQLRep", MySQLRep);
|
|
}
|
|
catch(...) {
|
|
MySQLRep = "n";
|
|
}
|
|
|
|
// get system uptime and alarm if this is a restart after module outage
|
|
if ( gOAMParentModuleFlag ) {
|
|
log.writeLog(__LINE__, "Running Active");
|
|
log.writeLog(__LINE__, "Running Active", LOG_TYPE_DEBUG);
|
|
}
|
|
else
|
|
{
|
|
log.writeLog(__LINE__, "Running Standby");
|
|
log.writeLog(__LINE__, "Running Standby", LOG_TYPE_DEBUG);
|
|
runStandby = true;
|
|
}
|
|
|
|
//get local module main IP address
|
|
ModuleConfig moduleconfig;
|
|
oam.getSystemConfig(config.moduleName(), moduleconfig);
|
|
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
|
|
string localIPaddr = (*pt1).IPAddr;
|
|
localHostName = (*pt1).HostName;
|
|
|
|
struct ifaddrs *addrs, *iap;
|
|
struct sockaddr_in *sa;
|
|
char buf[32];
|
|
|
|
getifaddrs(&addrs);
|
|
for (iap = addrs; iap != NULL; iap = iap->ifa_next)
|
|
{
|
|
|
|
if (iap->ifa_addr && (iap->ifa_flags & IFF_UP) && iap->ifa_addr->sa_family == AF_INET)
|
|
{
|
|
sa = (struct sockaddr_in *)(iap->ifa_addr);
|
|
inet_ntop(iap->ifa_addr->sa_family, (void *)&(sa->sin_addr), buf, sizeof(buf));
|
|
if (!strcmp(localIPaddr.c_str(), buf))
|
|
{
|
|
iface_name = iap->ifa_name;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
freeifaddrs(addrs);
|
|
log.writeLog(__LINE__, "Main Ethernet Port = " + iface_name, LOG_TYPE_DEBUG);
|
|
|
|
//
|
|
//start a thread to ping all system modules
|
|
//
|
|
if (runStandby) {
|
|
//running standby after startup
|
|
try {
|
|
oam.processInitComplete("ProcessManager", oam::STANDBY);
|
|
log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
// create message thread
|
|
pthread_t MessageThread;
|
|
int ret = pthread_create (&MessageThread, NULL, (void*(*)(void*)) &messageThread, &config);
|
|
if ( ret != 0 )
|
|
log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
|
|
|
|
//monitor OAM Parent Module for failover
|
|
while(true)
|
|
{
|
|
if ( processManager.OAMParentModuleChange() == oam::API_SUCCESS )
|
|
break;
|
|
log.writeLog(__LINE__, "OAMParentModuleChange failure", LOG_TYPE_WARNING);
|
|
// GO TRY AGAIN
|
|
}
|
|
|
|
pthread_t srvThread;
|
|
int status = pthread_create (&srvThread, NULL, (void*(*)(void*)) &pingDeviceThread, NULL);
|
|
|
|
if ( status != 0 )
|
|
log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
|
|
}
|
|
else
|
|
{ //running active after startup
|
|
//Update DBRM section of Columnstore.xml
|
|
processManager.updateWorkerNodeconfig();
|
|
// processManager.distributeConfigFile("system");
|
|
|
|
pthread_t srvThread;
|
|
int status = pthread_create (&srvThread, NULL, (void*(*)(void*)) &pingDeviceThread, NULL);
|
|
|
|
if ( status != 0 )
|
|
log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
|
|
|
|
// if HDFS, create a thread to push an image of activeAlarms to HDFS filesystem
|
|
if (HDFS) {
|
|
pthread_t hdfsAlarmThread;
|
|
int status = pthread_create(&hdfsAlarmThread, NULL, (void*(*)(void*)) &hdfsActiveAlarmsPushingThread, NULL);
|
|
if ( status != 0 )
|
|
log.writeLog(__LINE__, "hdfsActiveAlarmsPushingThread pthread_create failed, return code = " + oam.itoa(status), LOG_TYPE_ERROR);
|
|
}
|
|
|
|
sleep(5);
|
|
|
|
SystemStatus systemstatus;
|
|
try {
|
|
oam.getSystemStatus(systemstatus);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
if (systemstatus.SystemOpState != oam::MAN_OFFLINE &&
|
|
systemstatus.SystemOpState != oam::ACTIVE) {
|
|
pthread_t mgrProcThread;
|
|
int status = pthread_create (&mgrProcThread, NULL, (void*(*)(void*)) &startMgrProcessThread, NULL);
|
|
|
|
if ( status != 0 )
|
|
log.writeLog(__LINE__, "startMgrProcessThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
|
|
}
|
|
|
|
try {
|
|
oam.processInitComplete("ProcessManager");
|
|
log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
//make sure ProcMgr IP Address is configured correctly
|
|
try
|
|
{
|
|
Config* sysConfig = Config::makeConfig();
|
|
|
|
// get Standby IP address
|
|
ModuleConfig moduleconfig;
|
|
oam.getSystemConfig(config.moduleName(), moduleconfig);
|
|
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
|
|
string IPaddr = (*pt1).IPAddr;
|
|
|
|
sysConfig->setConfig("ProcMgr", "IPAddr", IPaddr);
|
|
|
|
log.writeLog(__LINE__, "set ProcMgr IPaddr to " + IPaddr, LOG_TYPE_DEBUG);
|
|
//update Calpont Config table
|
|
try {
|
|
sysConfig->write();
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: makeConfig failed", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
try {
|
|
oam.distributeConfigFile();
|
|
}
|
|
catch(...)
|
|
{}
|
|
|
|
// create message thread
|
|
pthread_t MessageThread;
|
|
int ret = pthread_create (&MessageThread, NULL, (void*(*)(void*)) &messageThread, &config);
|
|
if ( ret != 0 )
|
|
log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
|
|
}
|
|
|
|
//
|
|
//start a thread to process heartbeat checks
|
|
//
|
|
// pthread_t heartThread;
|
|
// pthread_create (&heartThread, NULL, (void*(*)(void*)) &heartbeatProcessThread, NULL);
|
|
|
|
//
|
|
//start a thread to read heartbeat messages
|
|
//
|
|
// pthread_t heartMsgThread;
|
|
// pthread_create (&heartMsgThread, NULL, (void*(*)(void*)) &heartbeatMsgThread, NULL);
|
|
|
|
// suspend forever
|
|
while(true)
|
|
{
|
|
sleep(1000);
|
|
}
|
|
}
|
|
|
|
/******************************************************************************************
|
|
* @brief messageThread
|
|
*
|
|
* purpose: Read incoming messages
|
|
*
|
|
******************************************************************************************/
|
|
static void messageThread(Configuration config)
{
    // Thread body: waits until this node is no longer standby, frees the
    // ProcMgr TCP port, then accepts connections forever and hands each one
    // to a processMSG thread. Never returns.
    ProcessLog log;
    ProcessManager processManager(config, log);
    Oam oam;

    // hold here, polling once a second, for as long as we are running standby
    while (runStandby)
        sleep (1);

    log.writeLog(__LINE__, "Message Thread started ..", LOG_TYPE_DEBUG);

    // kill whatever still holds the ProcMgr port before trying to use it;
    // non-root users go through sudo. Failures are ignored.
    try
    {
        Config* sysConfig = Config::makeConfig();
        const string port = sysConfig->getConfig("ProcMgr", "Port");
        const string killCmd = string(rootUser ? "" : "sudo ") + "fuser -k " + port + "/tcp >/dev/null 2>&1";

        system(killCmd.c_str());
    }
    catch (...)
    {
    }

    //
    //waiting for request
    //
    // NOTE(review): the same fIos object is handed by address to every
    // processMSG thread and is overwritten by the next accept(); presumably
    // processMSG copies it immediately - verify.
    IOSocket fIos;

    while (true)
    {
        // (re)create the listening server; on failure wait a minute and retry
        try
        {
            MessageQueueServer procmgr("ProcMgr");

            while (true)
            {
                try
                {
                    fIos = procmgr.accept();

                    pthread_t messagethread;
                    const int rc = pthread_create (&messagethread, NULL, (void* (*)(void*)) &processMSG, &fIos);

                    if (rc != 0)
                        log.writeLog(__LINE__, "messagethread: pthread_create failed, return status = " + oam.itoa(rc), LOG_TYPE_ERROR);
                }
                catch (...)
                {
                    // a failed accept is ignored; keep serving
                }
            }
        }
        catch (exception& ex)
        {
            const string error = ex.what();
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + error, LOG_TYPE_ERROR);

            // takes 2 - 4 minutes to free sockets, sleep and retry
            sleep(60);
        }
        catch (...)
        {
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr: Caught unknown exception!", LOG_TYPE_ERROR);

            // takes 2 - 4 minutes to free sockets, sleep and retry
            sleep(60);
        }
    }
}
|
|
|
|
/******************************************************************************************
|
|
* @brief sigUser1Handler
|
|
*
|
|
* purpose: Handle the SIGUSR1 signal and initiate failover
|
|
*
|
|
******************************************************************************************/
|
|
static void sigUser1Handler(int sig)
|
|
{
|
|
ProcessLog log;
|
|
Configuration config;
|
|
ProcessManager processManager(config, log);
|
|
Oam oam;
|
|
log.writeLog(__LINE__, "SIGUSER1 received, set startFailOver = true", LOG_TYPE_DEBUG);
|
|
|
|
startFailOver = true;
|
|
}
|
|
|
|
/*****************************************************************************************
|
|
* @brief Start Mgr Process by module Thread
|
|
*
|
|
* purpose: Send Messages to Module Process Monitors to start Processes
|
|
*
|
|
*****************************************************************************************/
|
|
static void startMgrProcessThread()
|
|
{
|
|
ProcessLog log;
|
|
Configuration config;
|
|
ProcessManager processManager(config, log);
|
|
Oam oam;
|
|
SystemModuleTypeConfig systemmoduletypeconfig;
|
|
ModuleTypeConfig PMSmoduletypeconfig;
|
|
ALARMManager aManager;
|
|
|
|
int waitTime = 180;
|
|
|
|
log.writeLog(__LINE__, "startMgrProcessThread launched", LOG_TYPE_DEBUG);
|
|
|
|
//get calpont software version and release
|
|
SystemSoftware systemsoftware;
|
|
string softwareVersion;
|
|
string softwareRelease;
|
|
|
|
try
|
|
{
|
|
oam.getSystemSoftware(systemsoftware);
|
|
|
|
softwareVersion = systemsoftware.Version;
|
|
softwareRelease = systemsoftware.Release;
|
|
}
|
|
catch (exception& e) {
|
|
cout << endl << "ProcMon Construct Error reading getSystemSoftware = " << e.what() << endl;
|
|
exit(-1);
|
|
}
|
|
|
|
string localSoftwareInfo = softwareVersion + softwareRelease;
|
|
|
|
//get systemStartupOffline
|
|
string systemStartupOffline = "n";
|
|
try {
|
|
Config* sysConfig = Config::makeConfig();
|
|
|
|
systemStartupOffline = sysConfig->getConfig("Installation", "SystemStartupOffline");
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: Problem getting systemStartupOffline from the Calpont System Configuration file", LOG_TYPE_ERROR);
|
|
systemStartupOffline = "n";
|
|
}
|
|
|
|
if ( systemStartupOffline == "y" )
|
|
log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_INFO);
|
|
|
|
try{
|
|
oam.getSystemConfig(systemmoduletypeconfig);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
//get Distributed Install
|
|
string DistributedInstall = "y";
|
|
|
|
try
|
|
{
|
|
oam.getSystemConfig("DistributedInstall", DistributedInstall);
|
|
}
|
|
catch (...)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: get DistributedInstall", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
//Send out a start service just to make sure Columnstore is runing on remote nodes
|
|
//note this only works for systems with ssh-keys
|
|
/* for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
|
|
{
|
|
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
|
|
if( moduleCount == 0)
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
|
|
{
|
|
//skip OAM Parent module
|
|
if ( (*pt).DeviceName == config.moduleName() )
|
|
continue;
|
|
|
|
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
|
|
for( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
|
|
{
|
|
//run remote command script
|
|
string cmd = startup::StartUp::installDir() + "/bin/remote_command.sh " + (*pt1).IPAddr + " ssh '" + startup::StartUp::installDir() + "/bin/columnstore restart' 0";
|
|
system(cmd.c_str());
|
|
}
|
|
}
|
|
}
|
|
*/
|
|
//distribute system and process config files
|
|
processManager.distributeConfigFile("system");
|
|
processManager.distributeConfigFile("system", "ProcessConfig.xml");
|
|
|
|
//send out moduleName to remote nodes, this will be used to startup new installed nodes
|
|
{
|
|
int status = API_SUCCESS;
|
|
int k = 0;
|
|
for( ; k < waitTime ; k++ )
|
|
{
|
|
if ( startsystemthreadStop ) {
|
|
processManager.setSystemState(oam::MAN_OFFLINE);
|
|
|
|
// exit thread
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
|
|
pthread_exit(0);
|
|
}
|
|
|
|
status = API_SUCCESS;
|
|
for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
|
|
{
|
|
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
|
|
if( moduleCount == 0)
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
|
|
{
|
|
string moduleName = (*pt).DeviceName;
|
|
|
|
//skip OAM Parent module
|
|
if ( (*pt).DeviceName == config.moduleName() )
|
|
continue;
|
|
|
|
if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
|
|
(*pt).DisableState == oam::AUTODISABLEDSTATE )
|
|
continue;
|
|
|
|
int ret = processManager.configureModule(moduleName);
|
|
if ( ret != API_SUCCESS )
|
|
status = ret;
|
|
}
|
|
}
|
|
|
|
//get out of loop if all modules updated
|
|
if( status == API_SUCCESS )
|
|
break;
|
|
|
|
//retry after sleeping for a bit
|
|
sleep(1);
|
|
}
|
|
|
|
if ( k == waitTime || status == API_FAILURE) {
|
|
// system didn't successfull restart
|
|
processManager.setSystemState(oam::FAILED);
|
|
// exit thread
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons running", LOG_TYPE_CRITICAL);
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
|
|
pthread_exit(0);
|
|
}
|
|
}
|
|
|
|
//wait until all modules are up after a system reboot
|
|
int i = 0;
|
|
for( ; i < waitTime ; i++ )
|
|
{
|
|
if ( startsystemthreadStop ) {
|
|
processManager.setSystemState(oam::MAN_OFFLINE);
|
|
|
|
// exit thread
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
|
|
pthread_exit(0);
|
|
}
|
|
|
|
int status = API_SUCCESS;
|
|
for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
|
|
{
|
|
if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType == "pm" )
|
|
PMSmoduletypeconfig = systemmoduletypeconfig.moduletypeconfig[i];
|
|
|
|
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
|
|
if( moduleCount == 0)
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
|
|
{
|
|
string moduleName = (*pt).DeviceName;
|
|
|
|
// Is Module UP
|
|
try{
|
|
bool degraded;
|
|
int opState = oam::ACTIVE;
|
|
oam.getModuleStatus(moduleName, opState, degraded);
|
|
|
|
if ( opState == oam::MAN_DISABLED )
|
|
//mark all processes running on module man-offline except ProcMon
|
|
processManager.setProcessStates(moduleName, oam::MAN_OFFLINE);
|
|
|
|
if ( opState == oam::AUTO_DISABLED)
|
|
//mark all processes running on module auto-offline
|
|
processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
|
|
|
|
if (opState == oam::INITIAL ||
|
|
opState == oam::DOWN) {
|
|
//a module is not up
|
|
status = API_MINOR_FAILURE;
|
|
break;
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
if ( status == API_MINOR_FAILURE) {
|
|
sleep(1);
|
|
break;
|
|
}
|
|
}
|
|
if ( status == API_SUCCESS)
|
|
//all modules are up
|
|
break;
|
|
}
|
|
|
|
if ( i == waitTime ) {
|
|
// system didn't successfull restart
|
|
processManager.setSystemState(oam::FAILED);
|
|
|
|
// exit thread
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all modules are UP", LOG_TYPE_CRITICAL);
|
|
pthread_exit(0);
|
|
}
|
|
|
|
//configure the PMS settings
|
|
processManager.updatePMSconfig();
|
|
|
|
if (HDFS)
|
|
//distribute config file
|
|
processManager.distributeConfigFile("system");
|
|
|
|
//now wait until all procmons are ACTIVE and validate rpms on each module
|
|
int status = API_SUCCESS;
|
|
int k = 0;
|
|
for( ; k < waitTime ; k++ )
|
|
{
|
|
if ( startsystemthreadStop ) {
|
|
processManager.setSystemState(oam::MAN_OFFLINE);
|
|
|
|
// exit thread
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
|
|
pthread_exit(0);
|
|
}
|
|
|
|
status = API_SUCCESS;
|
|
for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
|
|
{
|
|
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
|
|
if( moduleCount == 0)
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
|
|
{
|
|
string moduleName = (*pt).DeviceName;
|
|
if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
|
|
(*pt).DisableState == oam::AUTODISABLEDSTATE )
|
|
continue;
|
|
|
|
int moduleOpState = oam::ACTIVE;
|
|
// check module state
|
|
try{
|
|
bool degraded;
|
|
oam.getModuleStatus(moduleName, moduleOpState, degraded);
|
|
|
|
// if up, set to MAN_INIT
|
|
if ( HDFS &&
|
|
(moduleOpState == oam::UP) )
|
|
{
|
|
processManager.setModuleState(moduleName, oam::MAN_INIT);
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
// Is Module's ProcMon ACTIVE and module status has been updated
|
|
int opState = oam::ACTIVE;
|
|
try {
|
|
ProcessStatus procstat;
|
|
oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
|
|
opState = procstat.ProcessOpState;
|
|
|
|
if (opState != oam::ACTIVE) {
|
|
//skip if Not ACTIVE
|
|
log.writeLog(__LINE__, "Module ProcMon not active yet: " + moduleName, LOG_TYPE_DEBUG);
|
|
status = API_MINOR_FAILURE;
|
|
continue;
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
|
|
status = API_MINOR_FAILURE;
|
|
continue;
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
status = API_MINOR_FAILURE;
|
|
continue;
|
|
}
|
|
|
|
//skip OAM Parent module
|
|
if ( moduleName == config.moduleName() )
|
|
continue;
|
|
|
|
//ProcMon ACTIVE, validate the software release and version of that module
|
|
ByteStream msg;
|
|
ByteStream::byte requestID = GETSOFTWAREINFO;
|
|
msg << requestID;
|
|
|
|
string moduleSoftwareInfo = processManager.sendMsgProcMon1( moduleName, msg, requestID );
|
|
if ( moduleSoftwareInfo == "FAILED" )
|
|
continue;
|
|
|
|
if ( localSoftwareInfo != moduleSoftwareInfo ) {
|
|
// module not running on same Calpont Software build as this local Director
|
|
// alarm and fail the module
|
|
log.writeLog(__LINE__, "Software Version mismatch : " + moduleName + "/" + localSoftwareInfo + "/" + moduleSoftwareInfo, LOG_TYPE_CRITICAL);
|
|
|
|
aManager.sendAlarmReport(moduleName.c_str(), INVALID_SW_VERSION, SET);
|
|
processManager.setModuleState(moduleName, oam::FAILED);
|
|
status = API_FAILURE;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
//get out of loop if all modules ACTTVE or MAN_OFFLINE
|
|
if( status == API_SUCCESS ) {
|
|
if ( systemStartupOffline == "y" ) {
|
|
processManager.setSystemState(oam::MAN_OFFLINE);
|
|
log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_DEBUG);
|
|
}
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
//get out of loop if start module failed
|
|
if( status == API_FAILURE )
|
|
break;
|
|
|
|
//retry after sleeping for a bit
|
|
sleep(1);
|
|
}
|
|
}
|
|
|
|
if ( k == waitTime || status == API_FAILURE) {
|
|
// system didn't successfull restart
|
|
processManager.setSystemState(oam::FAILED);
|
|
// exit thread
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons ACTIVE", LOG_TYPE_CRITICAL);
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
|
|
pthread_exit(0);
|
|
}
|
|
else
|
|
{
|
|
//distribute config file
|
|
// processManager.distributeConfigFile("system");
|
|
|
|
if ( systemStartupOffline == "n" && status == API_SUCCESS ) {
|
|
oam::DeviceNetworkList devicenetworklist;
|
|
pthread_t startsystemthread;
|
|
int status = pthread_create (&startsystemthread, NULL, (void*(*)(void*)) &startSystemThread, &devicenetworklist);
|
|
|
|
if ( status != 0 ) {
|
|
log.writeLog(__LINE__, "STARTSYSTEMS: pthread_create failed, return status = " + oam.itoa(status));
|
|
status = API_FAILURE;
|
|
}
|
|
|
|
if (status == 0)
|
|
{
|
|
pthread_join(startsystemthread, NULL);
|
|
status = startsystemthreadStatus;
|
|
}
|
|
|
|
if ( status != API_SUCCESS ) {
|
|
// system didn't successfull restart
|
|
processManager.setSystemState(oam::FAILED);
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, error returned from startSystemThread", LOG_TYPE_CRITICAL);
|
|
}
|
|
else
|
|
//distribute config file
|
|
processManager.distributeConfigFile("system");
|
|
}
|
|
}
|
|
|
|
// exit thread
|
|
log.writeLog(__LINE__, "startMgrProcessThread Exit", LOG_TYPE_DEBUG);
|
|
pthread_exit(0);
|
|
}
|
|
|
|
|
|
/*****************************************************************************************
|
|
* @brief pingDeviceThread
|
|
*
|
|
* purpose: perform ping testing on the devices within the system
|
|
*
|
|
*****************************************************************************************/
|
|
void pingDeviceThread()
|
|
{
|
|
ProcessLog log;
|
|
Configuration config;
|
|
ProcessManager processManager(config, log);
|
|
Oam oam;
|
|
ModuleTypeConfig moduletypeconfig;
|
|
ALARMManager aManager;
|
|
BRM::DBRM dbrm;
|
|
|
|
log.writeLog(__LINE__, "pingDeviceThread launched", LOG_TYPE_DEBUG);
|
|
|
|
string cmdLine = "ping ";
|
|
string cmdOption = " -c 1 -w 5 >> /dev/null";
|
|
string cmd;
|
|
string deviceIP;
|
|
|
|
//
|
|
// Get Module Info
|
|
//
|
|
SystemModuleTypeConfig systemModuleTypeConfig;
|
|
|
|
try{
|
|
oam.getSystemConfig(systemModuleTypeConfig);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
//Build the initial list, clear module state
|
|
|
|
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
|
|
{
|
|
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
|
|
if ( moduleCount == 0 )
|
|
// skip of no modules configured
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
|
|
{
|
|
moduleInfoList.insert(moduleList::value_type((*pt).DeviceName, 0));
|
|
}
|
|
}
|
|
|
|
typedef map<string, int> nicList;
|
|
nicList nicInfoList;
|
|
|
|
//Build the initial list, clear NIC state
|
|
|
|
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
|
|
{
|
|
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
|
|
if ( moduleCount == 0 )
|
|
// skip of no modules configured
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
|
|
{
|
|
|
|
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
|
|
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
|
|
{
|
|
nicInfoList.insert(moduleList::value_type((*pt1).HostName, 0));
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Get ext device info
|
|
//
|
|
SystemExtDeviceConfig systemextdeviceconfig;
|
|
|
|
try{
|
|
oam.getSystemConfig(systemextdeviceconfig);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
typedef map<string, int> extDeviceList;
|
|
extDeviceList extDeviceInfoList;
|
|
|
|
//Build the initial list, clear ext device state
|
|
|
|
for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count; i++)
|
|
{
|
|
string name = systemextdeviceconfig.extdeviceconfig[i].Name;
|
|
extDeviceInfoList.insert(extDeviceList::value_type(name, 0));
|
|
}
|
|
|
|
//storage config
|
|
string DBRootStorageType;
|
|
try {
|
|
oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
|
|
}
|
|
catch(...) {}
|
|
|
|
log.writeLog(__LINE__, "pingDeviceThread: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
|
|
|
|
int rtnCode = 0;
|
|
Configuration configData;
|
|
SystemStatus systemstatus;
|
|
|
|
bool enableModuleMonitor = true;
|
|
|
|
bool LANOUTAGEACTIVE = false;
|
|
bool HOTSTANDBYACTIVE = false;
|
|
bool downActiveOAMModule = false;
|
|
|
|
// monitor module and external device loop
|
|
|
|
while (true)
|
|
{
|
|
//don't perform module test if system is MAN_OFFLINE or the system status can't be read
|
|
while(true)
|
|
{
|
|
SystemStatus systemstatus;
|
|
try {
|
|
oam.getSystemStatus(systemstatus);
|
|
|
|
if (systemstatus.SystemOpState == oam::MAN_OFFLINE )
|
|
sleep(5);
|
|
else
|
|
break;
|
|
}
|
|
catch(...)
|
|
{
|
|
sleep(5);
|
|
}
|
|
}
|
|
|
|
// Module Heartbeat period and failure count
|
|
int ModuleHeartbeatPeriod;
|
|
int ModuleHeartbeatCount;
|
|
|
|
try {
|
|
oam.getSystemConfig("ModuleHeartbeatPeriod", ModuleHeartbeatPeriod);
|
|
oam.getSystemConfig("ModuleHeartbeatCount", ModuleHeartbeatCount);
|
|
ModuleHeartbeatPeriod = ModuleHeartbeatPeriod * 10;
|
|
}
|
|
catch (exception& ex) {
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
|
|
// skip testing if Heartbeat is disabled
|
|
if( ModuleHeartbeatPeriod <= 0 ) {
|
|
if ( enableModuleMonitor )
|
|
log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to disabled", LOG_TYPE_DEBUG);
|
|
enableModuleMonitor = false;
|
|
}
|
|
else
|
|
{
|
|
if ( !enableModuleMonitor && moduleInfoList.size() > 1 )
|
|
log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to enabled", LOG_TYPE_DEBUG);
|
|
enableModuleMonitor = true;
|
|
}
|
|
|
|
//single server system
|
|
if ( moduleInfoList.size() <= 1)
|
|
enableModuleMonitor = false;
|
|
|
|
//
|
|
// ping NIC
|
|
//
|
|
|
|
// read each time to catch updates
|
|
pthread_mutex_lock(&THREAD_LOCK);
|
|
systemModuleTypeConfig.moduletypeconfig.clear();
|
|
try{
|
|
oam.getSystemConfig(systemModuleTypeConfig);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
|
|
pthread_mutex_unlock(&THREAD_LOCK);
|
|
|
|
bool LANOUTAGESUPPORT = true;
|
|
bool LOCALNICDOWN = false;
|
|
|
|
if (enableModuleMonitor)
|
|
{
|
|
//test main local Ethernet interface status
|
|
for ( int count = 0 ; ; count ++)
|
|
{
|
|
int sockfd;
|
|
struct ifreq ifr;
|
|
|
|
sockfd = socket(AF_INET, SOCK_DGRAM, 0);
|
|
if(sockfd == -1){
|
|
log.writeLog(__LINE__, "Could not get socket to check", LOG_TYPE_ERROR);
|
|
close(sockfd);
|
|
break;
|
|
}
|
|
|
|
/* get interface name */
|
|
strncpy(ifr.ifr_name, iface_name.c_str(), IFNAMSIZ);
|
|
|
|
/* Read interface flags */
|
|
if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0) {
|
|
// not supported
|
|
close(sockfd);
|
|
break;
|
|
}
|
|
|
|
if (ifr.ifr_flags & IFF_UP) {
|
|
// ethernet port is up, continue on
|
|
close(sockfd);
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
// ethernet port is down
|
|
log.writeLog(__LINE__, "NIC #1 is DOWN", LOG_TYPE_WARNING);
|
|
if ( count >= ModuleHeartbeatCount ) {
|
|
LOCALNICDOWN = true;
|
|
close(sockfd);
|
|
break;
|
|
}
|
|
else
|
|
sleep(5);
|
|
}
|
|
close(sockfd);
|
|
}
|
|
}
|
|
|
|
// if the NIC is down, go directly to LAN outage processing
|
|
if ( !LOCALNICDOWN )
|
|
{
|
|
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
|
|
{
|
|
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
|
|
if( moduleCount == 0)
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
|
|
{
|
|
string moduleName = (*pt).DeviceName;
|
|
string ipAddr;
|
|
string hostName;
|
|
int moduleState = oam::INITIAL;
|
|
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
|
|
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
|
|
{
|
|
ipAddr = (*pt1).IPAddr;
|
|
hostName = (*pt1).HostName;
|
|
|
|
if (enableModuleMonitor)
|
|
{
|
|
// perform ping test
|
|
cmd = cmdLine + ipAddr + cmdOption;
|
|
rtnCode = system(cmd.c_str());
|
|
rtnCode = WEXITSTATUS(rtnCode);
|
|
}
|
|
else
|
|
rtnCode = 0;
|
|
|
|
int currentNICState = oam::UP;
|
|
try {
|
|
oam.getNICStatus(hostName, currentNICState);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
switch (rtnCode) {
|
|
case 0:
|
|
//NIC Ack ping
|
|
if ( currentNICState != oam::UP ) {
|
|
processManager.setNICState(hostName, oam::UP);
|
|
|
|
if( ModuleHeartbeatPeriod > 0 )
|
|
//Clear an alarm
|
|
aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, CLEAR);
|
|
}
|
|
|
|
//set LAN Outage indicator to false since a module is responding
|
|
if ( moduleState == oam::INITIAL)
|
|
if ( moduleName != config.moduleName())
|
|
LANOUTAGESUPPORT = false;
|
|
|
|
//set Module State
|
|
if ( moduleState == oam::INITIAL || moduleState == oam::UP)
|
|
moduleState = oam::UP;
|
|
break;
|
|
|
|
default:
|
|
//NIC failed to respond to ping
|
|
if ( currentNICState != oam::DOWN ) {
|
|
log.writeLog(__LINE__, "NIC failed to respond to ping: " + hostName, LOG_TYPE_WARNING);
|
|
processManager.setNICState(hostName, oam::DOWN);
|
|
|
|
if( ModuleHeartbeatPeriod > 0 )
|
|
//Issue an alarm
|
|
aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, SET);
|
|
}
|
|
|
|
//set Module State
|
|
if ( moduleState == oam::INITIAL || moduleState == oam::DOWN)
|
|
moduleState = oam::DOWN;
|
|
else
|
|
// NIC 1 is up and NIC 2 is down
|
|
moduleState = oam::DEGRADED;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// if disabled, default module state to up
|
|
if (!enableModuleMonitor)
|
|
moduleState = oam::UP;
|
|
|
|
// moduleState coming out of the NIC monitoring loop
|
|
// UP - ALL NICs passed ping test
|
|
// DEGRADED - NIC 1 passed, NIC 2 failed ping test
|
|
// DOWN - NIC 1 or ALL NICs failed ping test
|
|
|
|
int opState = oam::ACTIVE;
|
|
try{
|
|
bool degraded;
|
|
oam.getModuleStatus(moduleName, opState, degraded);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
// skip module check if not inuse or in FAILED state
|
|
if (opState == oam::MAN_OFFLINE ||
|
|
opState == oam::MAN_DISABLED ||
|
|
opState == oam::FAILED)
|
|
continue;
|
|
|
|
//fast track a restart of a downed failover module
|
|
if ( gdownActiveOAMModule == moduleName ) {
|
|
moduleInfoList[moduleName] = ModuleHeartbeatCount-1;
|
|
gdownActiveOAMModule.clear();
|
|
moduleState = oam::DOWN;
|
|
downActiveOAMModule = true;
|
|
}
|
|
|
|
vector<string>::iterator pt2 = downModuleList.begin();
|
|
|
|
for( ; pt2 != downModuleList.end() ; pt2++)
|
|
{
|
|
if ( *pt2 == moduleName ) {
|
|
moduleInfoList[moduleName] = ModuleHeartbeatCount-1;
|
|
moduleState = oam::DOWN;
|
|
downModuleList.erase(pt2);
|
|
break;
|
|
}
|
|
}
|
|
|
|
switch (moduleState){
|
|
case oam::DEGRADED:
|
|
// do nothing for now
|
|
break;
|
|
case oam::UP:
|
|
// comment out, only come up when both nic are up, if not the pms list will not have the second nic in there
|
|
// case oam::DEGRADED:
|
|
if (opState == oam::DOWN || opState == oam::INITIAL
|
|
|| opState == oam::AUTO_DISABLED)
|
|
{
|
|
//Set the module state to up
|
|
processManager.setModuleState(moduleName, moduleState);
|
|
}
|
|
|
|
if ( moduleName == config.OAMStandbyName() )
|
|
HOTSTANDBYACTIVE = true;
|
|
|
|
// if LAN OUTAGE ACTIVE, skip module checks
|
|
if (LANOUTAGEACTIVE)
|
|
break;
|
|
|
|
try {
|
|
oam.getSystemConfig("MySQLRep", MySQLRep);
|
|
}
|
|
catch(...) {
|
|
MySQLRep = "n";
|
|
}
|
|
|
|
if (moduleInfoList[moduleName] >= ModuleHeartbeatCount ||
|
|
opState == oam::DOWN || opState == oam::AUTO_DISABLED)
|
|
{
|
|
log.writeLog(__LINE__, "Module alive, bring it back online: " + moduleName, LOG_TYPE_DEBUG);
|
|
|
|
string PrimaryUMModuleName = config.moduleName();
|
|
try {
|
|
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
|
|
}
|
|
catch(...) {}
|
|
|
|
bool busy = false;
|
|
for ( int retry = 0 ; retry < 20 ; retry++ )
|
|
{
|
|
busy = false;
|
|
ProcessStatus DMLprocessstatus;
|
|
try {
|
|
oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
|
|
|
|
if ( DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) {
|
|
log.writeLog(__LINE__, "DMLProc in BUSY_INIT, skip bringing module online " + moduleName, LOG_TYPE_DEBUG);
|
|
busy = true;
|
|
sleep(5);
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
catch(...)
|
|
{
|
|
sleep(5);
|
|
}
|
|
}
|
|
|
|
if (busy)
|
|
break;
|
|
|
|
//set query system state not ready
|
|
BRM::DBRM dbrm;
|
|
dbrm.setSystemQueryReady(false);
|
|
|
|
processManager.setQuerySystemState(false);
|
|
|
|
processManager.setSystemState(oam::BUSY_INIT);
|
|
|
|
processManager.reinitProcessType("cpimport");
|
|
|
|
// halt the dbrm
|
|
oam.dbrmctl("halt");
|
|
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
|
|
|
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
|
|
|
|
//send notification
|
|
oam.sendDeviceNotification(config.moduleName(), MODULE_UP);
|
|
|
|
int status;
|
|
DBRootConfigList dbrootConfigList;
|
|
|
|
// if shared pm, move dbroots back to pm
|
|
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
|
|
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
|
|
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
|
|
|
|
//restart to get the versionbuffer files closed so it can be unmounted
|
|
processManager.restartProcessType("WriteEngineServer", moduleName);
|
|
|
|
//set module to enable state
|
|
processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
|
|
|
|
downActiveOAMModule = false;
|
|
int retry;
|
|
for ( retry = 0 ; retry < 5 ; retry++ )
|
|
{
|
|
try {
|
|
log.writeLog(__LINE__, "Call autoUnMovePmDbroot", LOG_TYPE_DEBUG);
|
|
oam.autoUnMovePmDbroot(moduleName);
|
|
|
|
//check if any dbroots got assigned back to this module
|
|
// they could not be moved if they were busy on other pms
|
|
try
|
|
{
|
|
int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE,MAX_MODULE_ID_SIZE).c_str());
|
|
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
|
|
|
|
if ( dbrootConfigList.size() == 0 )
|
|
{
|
|
// no dbroots, fail module
|
|
log.writeLog(__LINE__, "autoUnMovePmDbroot left no dbroots mounted, failing module restart: " + moduleName, LOG_TYPE_WARNING);
|
|
|
|
//Issue an alarm
|
|
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
|
|
|
|
//set module to disable state
|
|
processManager.disableModule(moduleName, true);
|
|
|
|
//call dbrm control
|
|
oam.dbrmctl("reload");
|
|
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
|
|
|
// resume the dbrm
|
|
oam.dbrmctl("resume");
|
|
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
|
|
|
|
//clear count
|
|
moduleInfoList[moduleName] = 0;
|
|
|
|
processManager.setSystemState(oam::ACTIVE);
|
|
|
|
//set query system state ready
|
|
processManager.setQuerySystemState(true);
|
|
|
|
break;
|
|
}
|
|
}
|
|
catch(...)
|
|
{}
|
|
|
|
log.writeLog(__LINE__, "autoUnMovePmDbroot success", LOG_TYPE_DEBUG);
|
|
|
|
//distribute config file
|
|
processManager.distributeConfigFile("system");
|
|
|
|
break;
|
|
}
|
|
catch(...)
|
|
{
|
|
sleep(5);
|
|
}
|
|
}
|
|
|
|
if ( retry == 5 )
|
|
{
|
|
log.writeLog(__LINE__, "autoUnMovePmDbroot: Failed. Fail Module", LOG_TYPE_WARNING);
|
|
|
|
//Issue an alarm
|
|
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
|
|
|
|
//set module to disable state
|
|
processManager.disableModule(moduleName, true);
|
|
|
|
//call dbrm control
|
|
oam.dbrmctl("reload");
|
|
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
|
|
|
// resume the dbrm
|
|
oam.dbrmctl("resume");
|
|
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
|
|
|
|
//clear count
|
|
moduleInfoList[moduleName] = 0;
|
|
|
|
processManager.setSystemState(oam::ACTIVE);
|
|
|
|
//set query system state ready
|
|
processManager.setQuerySystemState(true);
|
|
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
//set module to enable state
|
|
processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
|
|
|
|
//restart module processes
|
|
int retry = 0;
|
|
|
|
int ModuleProcMonWaitCount = 12;
|
|
try{
|
|
oam.getSystemConfig("ModuleProcMonWaitCount", ModuleProcMonWaitCount);
|
|
}
|
|
catch(...) {
|
|
ModuleProcMonWaitCount = 12;
|
|
}
|
|
|
|
for ( ; retry < ModuleProcMonWaitCount ; retry ++ )
|
|
{
|
|
// first, wait until module's ProcMon is ACTIVE
|
|
int opState = oam::ACTIVE;
|
|
try {
|
|
ProcessStatus procstat;
|
|
oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
|
|
opState = procstat.ProcessOpState;
|
|
|
|
if (opState != oam::ACTIVE) {
|
|
log.writeLog(__LINE__, "Waiting for Module ProcMon to go ACTIVE: " + moduleName, LOG_TYPE_DEBUG);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
|
|
//check and assign Elastic IP Address
|
|
int AmazonElasticIPCount = 0;
|
|
try{
|
|
oam.getSystemConfig("AmazonElasticIPCount", AmazonElasticIPCount);
|
|
}
|
|
catch(...) {
|
|
AmazonElasticIPCount = 0;
|
|
}
|
|
|
|
for ( int id = 1 ; id < AmazonElasticIPCount+1 ; id++ )
|
|
{
|
|
string AmazonElasticModule = "AmazonElasticModule" + oam.itoa(id);
|
|
string ELmoduleName;
|
|
try{
|
|
oam.getSystemConfig(AmazonElasticModule, ELmoduleName);
|
|
}
|
|
catch(...) {}
|
|
|
|
if ( ELmoduleName == moduleName )
|
|
{ //match found assign Elastic IP Address
|
|
string AmazonElasticIPAddr = "AmazonElasticIPAddr" + oam.itoa(id);
|
|
string ELIPaddress;
|
|
try{
|
|
oam.getSystemConfig(AmazonElasticIPAddr, ELIPaddress);
|
|
}
|
|
catch(...) {}
|
|
|
|
try{
|
|
oam.assignElasticIP(hostName, ELIPaddress);
|
|
log.writeLog(__LINE__, "Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_DEBUG);
|
|
}
|
|
catch(...) {
|
|
log.writeLog(__LINE__, "Failed to Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_ERROR);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
// next, stopmodule to start up clean
|
|
status = processManager.stopModule(moduleName, oam::FORCEFUL, false);
|
|
if ( status == oam::API_SUCCESS ) {
|
|
string newStandbyModule = processManager.getStandbyModule();
|
|
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
|
|
processManager.setStandbyModule(newStandbyModule);
|
|
}
|
|
else
|
|
{
|
|
if ( newStandbyModule == "NONE")
|
|
if ( moduleName.substr(0,MAX_MODULE_TYPE_SIZE) == "pm" )
|
|
processManager.setStandbyModule(moduleName);
|
|
}
|
|
DBRootConfigList::iterator pt = dbrootConfigList.begin();
|
|
if (( DBRootStorageType == "DataRedundancy") && (*pt == 1))
|
|
{
|
|
log.writeLog(__LINE__, "stopModule, " + config.moduleName(), LOG_TYPE_DEBUG);
|
|
processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
|
|
processManager.switchParentOAMModule(moduleName);
|
|
processManager.stopProcess(config.moduleName(), "ProcessManager", oam::FORCEFUL, true);
|
|
break;
|
|
}
|
|
}
|
|
else {
|
|
//stop failed, retry
|
|
log.writeLog(__LINE__, "stopModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
|
|
sleep(5);
|
|
continue;
|
|
}
|
|
|
|
// next, startmodule
|
|
status = processManager.startModule(moduleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
|
|
if ( status == oam::API_SUCCESS )
|
|
break;
|
|
|
|
log.writeLog(__LINE__, "startModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
|
|
|
|
//sleep and retry all over again
|
|
sleep (5);
|
|
} // end of the retry loop
|
|
|
|
if ( retry < ModuleProcMonWaitCount )
|
|
{ // module successfully started
|
|
|
|
//call dbrm control, need to resume before start so the getdbrmfiles halt doesn't hang
|
|
oam.dbrmctl("reload");
|
|
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
|
|
|
// resume the dbrm
|
|
oam.dbrmctl("resume");
|
|
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
|
|
|
|
//distribute config file
|
|
processManager.distributeConfigFile("system");
|
|
sleep(1);
|
|
|
|
// if a PM module was started successfully, restart ACTIVE DBRM(s), ExeMgr(s) / mysqld
|
|
if( moduleName.find("pm") == 0 ) {
|
|
processManager.restartProcessType("DBRMControllerNode", moduleName);
|
|
processManager.restartProcessType("DBRMWorkerNode");
|
|
processManager.stopProcessType("DDLProc");
|
|
processManager.stopProcessType("DMLProc");
|
|
processManager.stopProcessType("ExeMgr");
|
|
processManager.restartProcessType("PrimProc");
|
|
sleep(1);
|
|
processManager.restartProcessType("ExeMgr");
|
|
}
|
|
|
|
string moduleType = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
|
|
|
if ( MySQLRep == "y" ) {
|
|
if ( moduleType == "um" ||
|
|
( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM ) ||
|
|
( moduleType == "pm" && PMwithUM == "y") ) {
|
|
|
|
//setup MySQL Replication for started modules
|
|
|
|
log.writeLog(__LINE__, "Setup MySQL Replication for module recovering from outage on " + moduleName, LOG_TYPE_DEBUG);
|
|
DeviceNetworkList devicenetworklist;
|
|
DeviceNetworkConfig devicenetworkconfig;
|
|
devicenetworkconfig.DeviceName = moduleName;
|
|
devicenetworklist.push_back(devicenetworkconfig);
|
|
processManager.setMySQLReplication(devicenetworklist);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if( moduleName.find("pm") == 0 ) {
|
|
processManager.restartProcessType("mysql", moduleName);
|
|
sleep(1);
|
|
}
|
|
}
|
|
|
|
// if a PM module was started successfully, DMLProc/DDLProc
|
|
if( moduleName.find("pm") == 0 ) {
|
|
processManager.restartProcessType("WriteEngineServer");
|
|
sleep(1);
|
|
processManager.restartProcessType("DDLProc");
|
|
sleep(1);
|
|
processManager.restartProcessType("DMLProc");
|
|
}
|
|
|
|
//enable query stats
|
|
dbrm.setSystemQueryReady(true);
|
|
|
|
//set query system state ready
|
|
processManager.setQuerySystemState(true);
|
|
|
|
processManager.setSystemState(oam::ACTIVE);
|
|
|
|
//reset standby module
|
|
string newStandbyModule = processManager.getStandbyModule();
|
|
|
|
//send message to start new Standby Process-Manager, if needed
|
|
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
|
|
processManager.setStandbyModule(newStandbyModule);
|
|
}
|
|
else
|
|
{
|
|
Config* sysConfig = Config::makeConfig();
|
|
|
|
// clear Standby OAM Module
|
|
sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
|
|
sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
|
|
|
|
//update Calpont Config table
|
|
try {
|
|
sysConfig->write();
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
|
|
if ( moduletypeconfig.RunType == SIMPLEX ) {
|
|
//start SIMPLEX runtype processes on a SIMPLEX runtype module
|
|
string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
|
DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
|
|
for( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
|
|
{
|
|
string launchModuleName = (*pt).DeviceName;
|
|
string launchModuletype = launchModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
|
if ( moduletype != launchModuletype )
|
|
continue;
|
|
|
|
//skip if active pm module (local module)
|
|
if ( launchModuleName == config.moduleName() )
|
|
continue;
|
|
|
|
//check if module is active before starting any SIMPLEX STANDBY apps
|
|
try{
|
|
int launchopState = oam::ACTIVE;
|
|
bool degraded;
|
|
oam.getModuleStatus(launchModuleName, launchopState, degraded);
|
|
|
|
if (launchopState != oam::ACTIVE && launchopState != oam::STANDBY ) {
|
|
continue;
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
int status;
|
|
log.writeLog(__LINE__, "Starting up STANDBY process on module " + launchModuleName, LOG_TYPE_DEBUG);
|
|
for ( int j = 0 ; j < 20 ; j ++ )
|
|
{
|
|
status = processManager.startModule(launchModuleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
|
|
if ( status == API_SUCCESS)
|
|
break;
|
|
}
|
|
log.writeLog(__LINE__, "pingDeviceThread: ACK received from '" + launchModuleName + "' Process-Monitor, return status = " + oam.itoa(status), LOG_TYPE_DEBUG);
|
|
}
|
|
}
|
|
|
|
//clear count
|
|
moduleInfoList[moduleName] = 0;
|
|
}
|
|
else
|
|
{ // module failed to restart, place back in disabled state
|
|
//Log failure, issue alarm, set moduleOpState
|
|
Configuration config;
|
|
|
|
//Issue an alarm
|
|
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
|
|
|
|
// if pm, move dbroots to other pms
|
|
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
|
|
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
|
|
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
|
|
//move dbroots to other modules
|
|
try {
|
|
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
|
|
oam.autoMovePmDbroot(moduleName);
|
|
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
|
|
//distribute config file
|
|
processManager.distributeConfigFile("system");
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
|
|
//set module to disable state
|
|
processManager.disableModule(moduleName, true);
|
|
|
|
//call dbrm control
|
|
oam.dbrmctl("reload");
|
|
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
|
|
|
// resume the dbrm
|
|
oam.dbrmctl("resume");
|
|
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
|
|
|
|
log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
|
|
|
|
if ( amazon )
|
|
processManager.setSystemState(oam::FAILED);
|
|
else
|
|
processManager.setSystemState(oam::ACTIVE);
|
|
|
|
//enable query stats
|
|
dbrm.setSystemQueryReady(true);
|
|
|
|
//set query system state ready
|
|
processManager.setQuerySystemState(true);
|
|
|
|
//clear count
|
|
moduleInfoList[moduleName] = 0;
|
|
}
|
|
}
|
|
|
|
break;
|
|
|
|
case oam::DOWN:
|
|
// if initial state, skip
|
|
if (opState == oam::INITIAL)
|
|
break;
|
|
|
|
// if disabled and not amazon, skip
|
|
if (opState == oam::AUTO_DISABLED && !amazon)
|
|
break;
|
|
|
|
log.writeLog(__LINE__, "module failed to respond to pings: " + moduleName, LOG_TYPE_WARNING);
|
|
|
|
//bump module ping failure counter
|
|
moduleInfoList[moduleName]++;
|
|
|
|
if ( moduleName == config.OAMStandbyName() )
|
|
HOTSTANDBYACTIVE = false;
|
|
|
|
if (moduleInfoList[moduleName] == ModuleHeartbeatCount)
|
|
{
|
|
// if LAN OUTAGE ACTIVE,skip module checks
|
|
if (LANOUTAGEACTIVE)
|
|
break;
|
|
|
|
//Log failure, issue alarm, set moduleOpState
|
|
Configuration config;
|
|
log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL);
|
|
|
|
//set query system state not ready
|
|
BRM::DBRM dbrm;
|
|
dbrm.setSystemQueryReady(false);
|
|
|
|
processManager.setQuerySystemState(false);
|
|
|
|
processManager.setSystemState(oam::BUSY_INIT);
|
|
|
|
processManager.reinitProcessType("cpimport");
|
|
|
|
// halt the dbrm
|
|
oam.dbrmctl("halt");
|
|
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
|
|
|
processManager.setSystemState(oam::BUSY_INIT);
|
|
|
|
//string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1";
|
|
//system(cmd.c_str());
|
|
|
|
//send notification
|
|
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
|
|
|
|
//Issue an alarm
|
|
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
|
|
|
|
//mark all processes running on module auto-offline
|
|
processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
|
|
|
|
//set module to disable state
|
|
processManager.disableModule(moduleName, false);
|
|
|
|
//call dbrm control
|
|
oam.dbrmctl("reload");
|
|
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
|
|
|
// if pm, move dbroots to other pms
|
|
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
|
|
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
|
|
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
|
|
try {
|
|
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
|
|
oam.autoMovePmDbroot(moduleName);
|
|
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
|
|
//distribute config file
|
|
processManager.distributeConfigFile("system");
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
|
|
// if Cloud Instance
|
|
// state = running, then instance is rebooting, monitor for recovery
|
|
// state = stopped, then try starting, if fail, remove/addmodule to launch new instance
|
|
// state = terminate or nothing, remove/addmodule to launch new instance
|
|
if ( amazon ) {
|
|
if ( moduleName.find("um") == 0 )
|
|
{
|
|
// resume the dbrm
|
|
oam.dbrmctl("resume");
|
|
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
|
|
|
|
//set recycle process
|
|
processManager.recycleProcess(moduleName);
|
|
}
|
|
|
|
// return values = 'ip address' for running or rebooting, stopped or terminated
|
|
string currentIPAddr = oam.getEC2InstanceIpAddress(hostName);
|
|
if ( currentIPAddr == "terminated")
|
|
{
|
|
//check if down module was Standby OAM, if so find another one
|
|
if ( moduleName == config.OAMStandbyName() ) {
|
|
|
|
//set down module ProcessManager to AOS
|
|
processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
|
|
|
|
//get another standby OAM module
|
|
string newStandbyModule = processManager.getStandbyModule();
|
|
|
|
//send message to start new Standby Process-Manager, if needed
|
|
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
|
|
processManager.setStandbyModule(newStandbyModule);
|
|
}
|
|
else
|
|
{
|
|
Config* sysConfig = Config::makeConfig();
|
|
|
|
// clear Standby OAM Module
|
|
sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
|
|
sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
|
|
|
|
//update Calpont Config table
|
|
try {
|
|
sysConfig->write();
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
}
|
|
|
|
// remove/addmodule
|
|
log.writeLog(__LINE__, "Instance terminated, re-launching: " + hostName, LOG_TYPE_DEBUG);
|
|
|
|
// if pm, get assigned dbroots and detach EBS
|
|
DBRootConfigList dbrootConfigList;
|
|
int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE,MAX_MODULE_ID_SIZE).c_str());
|
|
if( moduleName.find("pm") == 0 ) {
|
|
//get dbroot ids assigned to the PM
|
|
try
|
|
{
|
|
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
|
|
}
|
|
catch (exception& e)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: getPmDbrootConfig error: " + moduleName, LOG_TYPE_DEBUG);
|
|
}
|
|
}
|
|
|
|
DeviceNetworkList devicenetworklist;
|
|
DeviceNetworkConfig devicenetworkconfig;
|
|
HostConfig hostconfig;
|
|
|
|
devicenetworkconfig.DeviceName = moduleName;
|
|
if (cloud == "amazon-vpc")
|
|
hostconfig.IPAddr = ipAddr;
|
|
else
|
|
hostconfig.IPAddr = oam::UnassignedName;
|
|
|
|
hostconfig.HostName = oam::UnassignedName;
|
|
hostconfig.NicID = 1;
|
|
devicenetworkconfig.hostConfigList.push_back(hostconfig);
|
|
|
|
devicenetworklist.push_back(devicenetworkconfig);
|
|
|
|
bool pass = true;
|
|
for ( int addRetry = 0 ; addRetry < 5 ; addRetry++ )
|
|
{
|
|
//remove module
|
|
int ret = processManager.removeModule(devicenetworklist, false);
|
|
if ( ret != oam::API_SUCCESS )
|
|
{
|
|
log.writeLog(__LINE__, "Instance failed to remove, retry: " + moduleName, LOG_TYPE_DEBUG);
|
|
}
|
|
else
|
|
{
|
|
pass = true;
|
|
log.writeLog(__LINE__, "Instance removed, module: " + moduleName, LOG_TYPE_DEBUG);
|
|
}
|
|
|
|
// add module
|
|
string password = oam::UnassignedName;
|
|
try
|
|
{
|
|
oam.getSystemConfig("rpw", password);
|
|
}
|
|
catch(...)
|
|
{
|
|
password = oam::UnassignedName;
|
|
}
|
|
|
|
ret = processManager.addModule(devicenetworklist, password, false);
|
|
if ( ret != oam::API_SUCCESS )
|
|
{
|
|
log.writeLog(__LINE__, "Instance failed to add, retry: " + moduleName, LOG_TYPE_CRITICAL);
|
|
pass = false;
|
|
}
|
|
else
|
|
{
|
|
pass = true;
|
|
log.writeLog(__LINE__, "New Instance Launched for " + moduleName, LOG_TYPE_DEBUG);
|
|
|
|
// if pm, config and attach EBS
|
|
if( moduleName.find("pm") == 0 && !dbrootConfigList.empty() ) {
|
|
try
|
|
{
|
|
oam.setPmDbrootConfig(moduleID, dbrootConfigList);
|
|
|
|
std::vector<std::string> dbrootList;
|
|
DBRootConfigList::iterator pt1 = dbrootConfigList.begin();
|
|
for( ; pt1 != dbrootConfigList.end() ; pt1++)
|
|
{
|
|
dbrootList.push_back(oam.itoa(*pt1));
|
|
}
|
|
|
|
//attach EBS
|
|
try
|
|
{
|
|
oam.amazonReattach(moduleName, dbrootList, true);
|
|
pass = true;
|
|
break;
|
|
}
|
|
catch (exception& e)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: amazonReattach error on " + moduleName, LOG_TYPE_ERROR);
|
|
pass = false;
|
|
}
|
|
}
|
|
catch (exception& e)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: setPmDbrootConfig error on " + moduleName, LOG_TYPE_ERROR);
|
|
pass = false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
pass = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (pass)
|
|
break;
|
|
}
|
|
|
|
if (pass)
|
|
//Set the module state so it will be brought back up
|
|
processManager.setModuleState(moduleName, oam::AUTO_DISABLED);
|
|
else
|
|
{
|
|
//new instance failed to get added
|
|
//remove and try auto moving dbroots to other pms
|
|
processManager.removeModule(devicenetworklist, false);
|
|
|
|
// if pm, move dbroots to other pms
|
|
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
|
|
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
|
|
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
|
|
try {
|
|
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
|
|
oam.autoMovePmDbroot(moduleName);
|
|
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
|
|
//distribute config file
|
|
processManager.distributeConfigFile("system");
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
|
|
//set recycle process
|
|
processManager.recycleProcess(moduleName);
|
|
|
|
//enable query stats
|
|
dbrm.setSystemQueryReady(true);
|
|
|
|
//set query system state ready
|
|
processManager.setQuerySystemState(true);
|
|
|
|
sleep(2);
|
|
processManager.setSystemState(oam::ACTIVE);
|
|
}
|
|
}
|
|
|
|
if ( moduleName.find("pm") == 0 )
|
|
{
|
|
// resume the dbrm
|
|
oam.dbrmctl("resume");
|
|
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
|
|
|
|
//enable query stats
|
|
dbrm.setSystemQueryReady(true);
|
|
|
|
//set query system state ready
|
|
processManager.setQuerySystemState(true);
|
|
}
|
|
}
|
|
else
|
|
{ // non-amazon
|
|
// resume the dbrm
|
|
oam.dbrmctl("resume");
|
|
log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
|
|
|
|
//set recycle process
|
|
processManager.recycleProcess(moduleName);
|
|
|
|
//enable query stats
|
|
dbrm.setSystemQueryReady(true);
|
|
|
|
//set query system state ready
|
|
processManager.setQuerySystemState(true);
|
|
|
|
sleep(2);
|
|
//check if down module was Standby OAM, if so find another one
|
|
if ( moduleName == config.OAMStandbyName() ) {
|
|
|
|
//set down module ProcessManager to AOS
|
|
processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
|
|
|
|
//get another standby OAM module
|
|
string newStandbyModule = processManager.getStandbyModule();
|
|
|
|
//send message to start new Standby Process-Manager, if needed
|
|
if ( !newStandbyModule.empty() && newStandbyModule != "NONE") {
|
|
processManager.setStandbyModule(newStandbyModule);
|
|
}
|
|
else
|
|
{
|
|
Config* sysConfig = Config::makeConfig();
|
|
|
|
// clear Standby OAM Module
|
|
sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
|
|
sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
|
|
|
|
//update Calpont Config table
|
|
try {
|
|
sysConfig->write();
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//start SIMPLEX runtype processes on a SIMPLEX runtype module
|
|
string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
|
|
|
try{
|
|
oam.getSystemConfig(moduletype, moduletypeconfig);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
if ( moduletypeconfig.RunType == SIMPLEX ) {
|
|
DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
|
|
for( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
|
|
{
|
|
string launchModuleName = (*pt).DeviceName;
|
|
string launchModuletype = launchModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
|
if ( moduletype != launchModuletype )
|
|
continue;
|
|
|
|
//skip if active pm module (local module)
|
|
if ( launchModuleName == config.moduleName() )
|
|
continue;
|
|
|
|
if( moduleName != launchModuleName ) {
|
|
//check if module is active before starting any SIMPLEX STANDBY apps
|
|
try{
|
|
int launchopState = oam::ACTIVE;
|
|
bool degraded;
|
|
oam.getModuleStatus(launchModuleName, launchopState, degraded);
|
|
|
|
if (launchopState != oam::ACTIVE && launchopState != oam::STANDBY ) {
|
|
continue;
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
int status;
|
|
log.writeLog(__LINE__, "Starting up STANDBY process on module " + launchModuleName, LOG_TYPE_DEBUG);
|
|
for ( int j = 0 ; j < 20 ; j ++ )
|
|
{
|
|
status = processManager.startModule(launchModuleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
|
|
if ( status == API_SUCCESS)
|
|
break;
|
|
}
|
|
log.writeLog(__LINE__, "pingDeviceThread: ACK received from '" + launchModuleName + "' Process-Monitor, return status = " + oam.itoa(status), LOG_TYPE_DEBUG);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
} //end of for loop
|
|
}
|
|
|
|
// check and take action if LAN outage is flagged
|
|
if (LANOUTAGESUPPORT && !LANOUTAGEACTIVE && LOCALNICDOWN)
|
|
{
|
|
log.writeLog(__LINE__, "LAN Failure detected", LOG_TYPE_CRITICAL);
|
|
|
|
oam.sendDeviceNotification(config.moduleName(), START_PM_MASTER_DOWN);
|
|
|
|
LANOUTAGEACTIVE = true;
|
|
|
|
log.writeLog(__LINE__, "Kill any cpimport running", LOG_TYPE_INFO);
|
|
system("pkill -9 cpimport");
|
|
|
|
//request stop of local module
|
|
int status = processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
|
|
if ( status != oam::API_SUCCESS )
|
|
log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
|
|
|
|
//stop snmptrap daemon process
|
|
processManager.stopProcess(config.moduleName(), "SNMPTrapDaemon", oam::FORCEFUL, false);
|
|
}
|
|
else
|
|
{
|
|
if ( LANOUTAGEACTIVE && HOTSTANDBYACTIVE && !LOCALNICDOWN)
|
|
{
|
|
// pthread_mutex_unlock(&THREAD_LOCK);
|
|
LANOUTAGEACTIVE = false;
|
|
|
|
log.writeLog(__LINE__, "LAN Failure recovery");
|
|
|
|
//check if this module is still active according to the last known hot standby module
|
|
ByteStream msg;
|
|
ByteStream::byte requestID = GETPARENTOAMMODULE;
|
|
msg << requestID;
|
|
|
|
string parentOAMModule = processManager.sendMsgProcMon1( config.OAMStandbyName(), msg, requestID );
|
|
if ( parentOAMModule == config.moduleName() ||
|
|
parentOAMModule == "FAILED" ) {
|
|
|
|
//srestart to these guys incase they marked any PrimProcs offline
|
|
processManager.restartProcessType("ExeMgr");
|
|
processManager.reinitProcessType("DDLProc");
|
|
processManager.reinitProcessType("DMLProc");
|
|
}
|
|
else
|
|
{
|
|
//send message to local Process Monitor to run coldStandby
|
|
ByteStream msg;
|
|
ByteStream::byte requestID = OAMPARENTCOLD;
|
|
|
|
msg << requestID;
|
|
|
|
int returnStatus = processManager.sendMsgProcMon( config.moduleName(), msg, requestID );
|
|
log.writeLog(__LINE__, "sent OAM Parent Cold message to local Process-Monitor, status: " + oam.itoa(returnStatus) , LOG_TYPE_DEBUG);
|
|
|
|
//request stop of local module
|
|
int status = processManager.stopModule(config.moduleName(), oam::INSTALL, false);
|
|
if ( status != oam::API_SUCCESS )
|
|
log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// ping ext devices
|
|
//
|
|
|
|
// read each time to catch updates
|
|
systemextdeviceconfig.extdeviceconfig.clear();
|
|
try{
|
|
oam.getSystemConfig(systemextdeviceconfig);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count ; i++ )
|
|
{
|
|
string extDeviceName = systemextdeviceconfig.extdeviceconfig[i].Name;
|
|
string ipAddr = systemextdeviceconfig.extdeviceconfig[i].IPAddr;
|
|
|
|
int opState = oam::ACTIVE;
|
|
try{
|
|
oam.getExtDeviceStatus(extDeviceName, opState);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
cmd = cmdLine + ipAddr + cmdOption;
|
|
rtnCode = system(cmd.c_str());
|
|
|
|
switch (WEXITSTATUS(rtnCode)){
|
|
case 0:
|
|
//Switch Ack ping, Check whether alarm have been issued
|
|
if (extDeviceInfoList[extDeviceName] >= ModuleHeartbeatCount)
|
|
{
|
|
aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, CLEAR);
|
|
|
|
}
|
|
|
|
extDeviceInfoList[extDeviceName] = 0;
|
|
|
|
if (opState != oam::ACTIVE)
|
|
{
|
|
//Set the switch state to active
|
|
processManager.setExtdeviceState(extDeviceName, oam::ACTIVE);
|
|
}
|
|
break;
|
|
default:
|
|
//extDevice failed to respond to ping
|
|
log.writeLog(__LINE__, "extDevice failed to respond to ping: " + extDeviceName, LOG_TYPE_WARNING);
|
|
extDeviceInfoList[extDeviceName]++;
|
|
|
|
if (extDeviceInfoList[extDeviceName] == ModuleHeartbeatCount)
|
|
{
|
|
//Log failure, issue alarm, set extDeviceOpState
|
|
log.writeLog(__LINE__, "extDevice is down: " + extDeviceName, LOG_TYPE_CRITICAL);
|
|
|
|
processManager.setExtdeviceState(extDeviceName, oam::AUTO_OFFLINE);
|
|
|
|
//Issue an alarm
|
|
aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, SET);
|
|
}
|
|
break;
|
|
}
|
|
} //end of for loop
|
|
|
|
// double check to make sure the system status is ACTIVE if all module status's are ACTIVE
|
|
try {
|
|
if (dbrm.isDBRMReady())
|
|
{
|
|
int systemReady = dbrm.getSystemReady(); // -1 == fail, 0 == not ready, 1 == ready
|
|
if (systemReady > 0)
|
|
{
|
|
bool updateActive = true;
|
|
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
|
|
{
|
|
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
|
|
if ( moduleCount == 0)
|
|
continue;
|
|
|
|
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
|
|
for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
|
|
{
|
|
string moduleName = (*pt).DeviceName;
|
|
|
|
int opState = oam::ACTIVE;
|
|
try
|
|
{
|
|
bool degraded;
|
|
oam.getModuleStatus(moduleName, opState, degraded);
|
|
if (opState == oam::ACTIVE ||
|
|
opState == oam::DEGRADED ||
|
|
opState == oam::MAN_DISABLED ||
|
|
opState == oam::AUTO_DISABLED )
|
|
continue;
|
|
|
|
updateActive = false;
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch (...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (updateActive)
|
|
{
|
|
// log.writeLog(__LINE__, "Modules are ACTIVE, check system state ", LOG_TYPE_DEBUG);
|
|
|
|
string PrimaryUMModuleName;
|
|
try {
|
|
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
|
|
}
|
|
catch(...) {}
|
|
|
|
// log.writeLog(__LINE__, "PrimaryUMModuleName = " + PrimaryUMModuleName, LOG_TYPE_DEBUG);
|
|
|
|
ProcessStatus DMLprocessstatus;
|
|
try {
|
|
oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
// log.writeLog(__LINE__, "DMLPROC STATUS = " + oamState[DMLprocessstatus.ProcessOpState], LOG_TYPE_DEBUG);
|
|
|
|
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE) {
|
|
|
|
//set the system status if a change has occurred
|
|
SystemStatus systemstatus;
|
|
|
|
try
|
|
{
|
|
oam.getSystemStatus(systemstatus);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch (...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
if ( systemstatus.SystemOpState != oam::ACTIVE )
|
|
{
|
|
processManager.setSystemState(oam::ACTIVE);
|
|
}
|
|
}
|
|
|
|
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT) {
|
|
|
|
//set the system status if a change has occurred
|
|
SystemStatus systemstatus;
|
|
|
|
try
|
|
{
|
|
oam.getSystemStatus(systemstatus);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
// string error = ex.what();
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch (...)
|
|
{
|
|
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
if ( systemstatus.SystemOpState != oam::BUSY_INIT )
|
|
{
|
|
processManager.setSystemState(oam::BUSY_INIT);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
catch (...)
|
|
{
|
|
}
|
|
|
|
//go sleep for a bit
|
|
int sleepTime = ModuleHeartbeatPeriod / 10;
|
|
|
|
if (!enableModuleMonitor && systemextdeviceconfig.Count == 0)
|
|
sleep(60);
|
|
else
|
|
sleep(sleepTime);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/******************************************************************************************
|
|
* @brief hdfsActiveAlarmsPushingThread
|
|
*
|
|
* purpose: Push an image of ActiveAlarms to HDFS for non-OAMParentModule to view.
|
|
*
|
|
******************************************************************************************/
|
|
static void hdfsActiveAlarmsPushingThread()
|
|
{
|
|
boost::filesystem::path filePath(ACTIVE_ALARM_FILE);
|
|
boost::filesystem::path dirPath = filePath.parent_path();
|
|
string dirName = boost::filesystem::canonical(dirPath).string();
|
|
|
|
if (boost::filesystem::exists("/etc/pdsh/machines"))
|
|
{
|
|
string cpCmd = "pdcp -a -x " + localHostName + " " + ACTIVE_ALARM_FILE + " " + dirName +
|
|
" > /dev/null 2>&1";
|
|
string rmCmd = "pdsh -a -x " + localHostName + " rm -f " + ACTIVE_ALARM_FILE +
|
|
" > /dev/null 2>&1";
|
|
while(1)
|
|
{
|
|
if (boost::filesystem::exists(filePath))
|
|
system(cpCmd.c_str());
|
|
else
|
|
system(rmCmd.c_str());
|
|
|
|
sleep(ACTIVE_ALARMS_PUSHING_INTERVAL);
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
/*****************************************************************************************
|
|
* @brief Processor Heartbeat Msg Thread
|
|
*
|
|
* purpose: Read Heartbeat Messages from other Processes
|
|
*
|
|
*****************************************************************************************/
|
|
/*
|
|
static void heartbeatMsgThread()
|
|
{
|
|
ProcessLog log;
|
|
Configuration config;
|
|
ProcessManager processManager(config, log);
|
|
|
|
//
|
|
//waiting for request
|
|
//
|
|
ByteStream receivedMSG;
|
|
IOSocket fIos;
|
|
|
|
for (;;)
|
|
{
|
|
try
|
|
{
|
|
MessageQueueServer procmgr("ProcHeartbeatControl");
|
|
for (;;)
|
|
{
|
|
try
|
|
{
|
|
fIos = procmgr.accept();
|
|
receivedMSG = fIos.read();
|
|
|
|
if (receivedMSG.length() > 0) {
|
|
processManager.processMSG(fIos, receivedMSG);
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
fIos.close();
|
|
}
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + error, LOG_TYPE_ERROR);
|
|
// takes 2 - 4 minutes to free sockets, sleep and retry
|
|
sleep(60);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcHeartbeatControl: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
// takes 2 - 4 minutes to free sockets, sleep and retry
|
|
sleep(60);
|
|
}
|
|
}
|
|
|
|
}
|
|
*/
|
|
|
|
/*****************************************************************************************
|
|
* @brief Processor Heartbeat Thread
|
|
*
|
|
* purpose: Check Heartbeat Messages from other Processes
|
|
*
|
|
*****************************************************************************************/
|
|
/*
|
|
static void heartbeatProcessThread()
|
|
{
|
|
ProcessLog log;
|
|
Configuration config;
|
|
ProcessManager processManager(config, log);
|
|
Oam oam;
|
|
ALARMManager aManager;
|
|
|
|
int processHeartbeatPeriod=60; //default value to 60 seconds
|
|
|
|
log.writeLog(__LINE__, "Thread Launched: Process Heartbeat!!!");
|
|
|
|
while (true)
|
|
{
|
|
//
|
|
// check and report on register process not sending heartbeats
|
|
//
|
|
|
|
// get process heartbeat period
|
|
try {
|
|
oam.getSystemConfig("ProcessHeartbeatPeriod", processHeartbeatPeriod);
|
|
processHeartbeatPeriod = processHeartbeatPeriod * 60;
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
}
|
|
|
|
Oam oam;
|
|
log.writeLog(__LINE__, "Process Heartbeat check started, Heartbeat period is " + oam.itoa(processHeartbeatPeriod), LOG_TYPE_DEBUG);
|
|
|
|
sleep(processHeartbeatPeriod);
|
|
|
|
HeartBeatProcList::iterator list = hbproclist.begin();
|
|
for( ; list != hbproclist.end() ; list++)
|
|
{
|
|
string moduleName = (*list).ModuleName;
|
|
string processName = (*list).ProcessName;
|
|
int id = (*list).ID;
|
|
|
|
// get Process state and only check if ACTIVE
|
|
ProcessStatus procstat;
|
|
try{
|
|
oam.getProcessStatus(processName, moduleName, procstat);
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
string error = ex.what();
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
|
|
procstat.ProcessOpState = oam::MAN_OFFLINE;
|
|
}
|
|
catch(...)
|
|
{
|
|
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
|
|
procstat.ProcessOpState = oam::MAN_OFFLINE;
|
|
}
|
|
|
|
if ( procstat.ProcessOpState == oam::ACTIVE ) {
|
|
// skip testing if Heartbeat is disabled
|
|
if( processHeartbeatPeriod != -1 ) {
|
|
//log.writeLog(__LINE__, "Heartbeat: Process being monitored: " + moduleName + " / " + processName + " / " + oam.itoa(id), LOG_TYPE_DEBUG);
|
|
if ( !(*list).receiveFlag ) {
|
|
// got a missing heartbeat, request a restart on the process
|
|
log.writeLog(__LINE__, "heartbeatProcessThread: Failure from process " + moduleName + " / " + processName+ " / " + oam.itoa(id), LOG_TYPE_WARNING);
|
|
|
|
oam.restartProcess(moduleName, processName, FORCEFUL, ACK_NO);
|
|
(*list).receiveFlag = true;
|
|
// reset all other entries for this process
|
|
HeartBeatProcList::iterator list1 = hbproclist.begin();
|
|
for( ; list1 != hbproclist.end() ; list1++)
|
|
{
|
|
string moduleName1 = (*list1).ModuleName;
|
|
string processName1 = (*list1).ProcessName;
|
|
if ( moduleName == moduleName1 && processName == processName1 )
|
|
(*list1).receiveFlag = true;
|
|
}
|
|
}
|
|
else
|
|
// reset receive heartbeat indication flag
|
|
(*list).receiveFlag = false;
|
|
}
|
|
else
|
|
// heartbeat is disabled
|
|
(*list).receiveFlag=true;
|
|
}
|
|
else
|
|
{ // registered process not active, remove from list
|
|
hbproclist.erase(list);
|
|
log.writeLog(__LINE__, "Removing OOS Process from Heartbeat Monitor list: " + moduleName + " / " + processName+ " / " + oam.itoa(id));
|
|
break;
|
|
}
|
|
}
|
|
} // end of while forever loop
|
|
}
|
|
*/
|
|
// vim:ts=4 sw=4:
|