You've already forked mariadb-columnstore-engine
							
							
				mirror of
				https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
				synced 2025-11-03 17:13:17 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			3056 lines
		
	
	
		
			121 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			3056 lines
		
	
	
		
			121 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/* Copyright (C) 2014 InfiniDB, Inc.
 | 
						|
   Copyright (C) 2016 MariaDB Corporation
 | 
						|
 | 
						|
   This program is free software; you can redistribute it and/or
 | 
						|
   modify it under the terms of the GNU General Public License
 | 
						|
   as published by the Free Software Foundation; version 2 of
 | 
						|
   the License.
 | 
						|
 | 
						|
   This program is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
   GNU General Public License for more details.
 | 
						|
 | 
						|
 | 
						|
   You should have received a copy of the GNU General Public License
 | 
						|
   along with this program; if not, write to the Free Software
 | 
						|
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 | 
						|
   MA 02110-1301, USA. */
 | 
						|
 | 
						|
/*****************************************************************************************
 | 
						|
* $Id: main.cpp 2203 2013-07-08 16:50:51Z bpaul $
 | 
						|
*
 | 
						|
*****************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
#include <clocale>
 | 
						|
 | 
						|
#include <boost/filesystem.hpp>
 | 
						|
 | 
						|
#include "columnstoreversion.h"
 | 
						|
#include "processmanager.h"
 | 
						|
#include "installdir.h"
 | 
						|
 | 
						|
#include "utils_utf8.h"
 | 
						|
 | 
						|
#include "crashtrace.h"
 | 
						|
 | 
						|
using namespace std;
 | 
						|
using namespace logging;
 | 
						|
using namespace messageqcpp;
 | 
						|
using namespace processmanager;
 | 
						|
using namespace oam;
 | 
						|
using namespace alarmmanager;
 | 
						|
using namespace threadpool;
 | 
						|
//using namespace procheartbeat;
 | 
						|
using namespace config;
 | 
						|
 | 
						|
bool runStandby = false;
 | 
						|
bool runCold = false;
 | 
						|
string systemName = "system";
 | 
						|
string iface_name;
 | 
						|
string cloud;
 | 
						|
bool amazon = false;
 | 
						|
string PMInstanceType;
 | 
						|
string UMInstanceType;
 | 
						|
string AmazonPMFailover = "y";
 | 
						|
string DataRedundancyConfig = "n";
 | 
						|
bool rootUser = true;
 | 
						|
string USER = "root";
 | 
						|
bool HDFS = false;
 | 
						|
string localHostName;
 | 
						|
string PMwithUM = "n";
 | 
						|
string MySQLRep = "n";
 | 
						|
 | 
						|
// pushing the ACTIVE_ALARMS_FILE to all nodes every 10 seconds.
 | 
						|
const int ACTIVE_ALARMS_PUSHING_INTERVAL = 10;
 | 
						|
 | 
						|
typedef   map<string, int>	moduleList;
 | 
						|
moduleList	moduleInfoList;
 | 
						|
 | 
						|
extern HeartBeatProcList hbproclist;
 | 
						|
extern pthread_mutex_t THREAD_LOCK;
 | 
						|
extern bool startsystemthreadStop;
 | 
						|
extern string gdownActiveOAMModule;
 | 
						|
extern int startsystemthreadStatus;
 | 
						|
extern vector<string> downModuleList;
 | 
						|
extern bool startFailOver;
 | 
						|
extern bool gOAMParentModuleFlag;
 | 
						|
 | 
						|
static void messageThread(Configuration config);
 | 
						|
static void alarmMessageThread(Configuration config);
 | 
						|
static void sigUser1Handler(int sig);
 | 
						|
static void startMgrProcessThread();
 | 
						|
static void hdfsActiveAlarmsPushingThread();
 | 
						|
//static void pingDeviceThread();
 | 
						|
//static void heartbeatProcessThread();
 | 
						|
//static void heartbeatMsgThread();
 | 
						|
 | 
						|
/*****************************************************************************************
 | 
						|
* @brief	main
 | 
						|
*
 | 
						|
* purpose:	request launching of Mgr controlled processes and wait for incoming messages
 | 
						|
*
 | 
						|
*****************************************************************************************/
 | 
						|
int main(int argc, char** argv)
 | 
						|
{
 | 
						|
#ifndef _MSC_VER
 | 
						|
    setuid(0); // set effective ID to root; ignore return status
 | 
						|
#endif
 | 
						|
    // get and set locale language
 | 
						|
    string systemLang = "C";
 | 
						|
 | 
						|
    setlocale(LC_ALL, systemLang.c_str());
 | 
						|
 | 
						|
    // This is unset due to the way we start it
 | 
						|
    program_invocation_short_name = const_cast<char*>("ProcMgr");
 | 
						|
 | 
						|
    struct sigaction ign;
 | 
						|
    memset(&ign, 0, sizeof(ign));
 | 
						|
    ign.sa_handler = fatalHandler;
 | 
						|
    sigaction(SIGSEGV, &ign, 0);
 | 
						|
    sigaction(SIGABRT, &ign, 0);
 | 
						|
    sigaction(SIGFPE, &ign, 0);
 | 
						|
 | 
						|
    Oam oam;
 | 
						|
 | 
						|
    //check if root-user
 | 
						|
    int user;
 | 
						|
    user = getuid();
 | 
						|
 | 
						|
    if (user != 0)
 | 
						|
        rootUser = false;
 | 
						|
 | 
						|
    char* p = getenv("USER");
 | 
						|
 | 
						|
    if (p && *p)
 | 
						|
        USER = p;
 | 
						|
 | 
						|
    ProcessLog log;
 | 
						|
    Configuration config;
 | 
						|
    ProcessManager processManager(config, log);
 | 
						|
    ALARMManager aManager;
 | 
						|
 | 
						|
    log.writeLog(__LINE__, " ");
 | 
						|
    log.writeLog(__LINE__, "**********Process Manager Started**********");
 | 
						|
 | 
						|
    //Ignore SIGPIPE signals
 | 
						|
    signal(SIGPIPE, SIG_IGN);
 | 
						|
 | 
						|
    //Ignore SIGHUP signals
 | 
						|
    signal(SIGHUP, SIG_IGN);
 | 
						|
 | 
						|
    //create SIGUSR1 handler to get configuration updates
 | 
						|
    signal(SIGUSR1, sigUser1Handler);
 | 
						|
 | 
						|
    // Get System Name
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig("SystemName", systemName);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {}
 | 
						|
 | 
						|
    //get cloud setting
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig( "Cloud", cloud);
 | 
						|
    }
 | 
						|
    catch (...) {}
 | 
						|
 | 
						|
    //get amazon parameters
 | 
						|
    if ( cloud == "amazon-ec2" || cloud == "amazon-vpc" )
 | 
						|
    {
 | 
						|
        oam.getSystemConfig("PMInstanceType", PMInstanceType);
 | 
						|
        oam.getSystemConfig("UMInstanceType", UMInstanceType);
 | 
						|
        oam.getSystemConfig("AmazonPMFailover", AmazonPMFailover);
 | 
						|
 | 
						|
        amazon = true;
 | 
						|
    }
 | 
						|
 | 
						|
    //get gluster config
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig( "DataRedundancyConfig", DataRedundancyConfig);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
        DataRedundancyConfig = "n";
 | 
						|
    }
 | 
						|
 | 
						|
    //hdfs / hadoop config
 | 
						|
    string DBRootStorageType;
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
 | 
						|
    }
 | 
						|
    catch (...) {}
 | 
						|
 | 
						|
    if ( DBRootStorageType == "hdfs" )
 | 
						|
        HDFS = true;
 | 
						|
 | 
						|
    log.writeLog(__LINE__, "Main: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    //PMwithUM config
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig( "PMwithUM", PMwithUM);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
        PMwithUM = "n";
 | 
						|
    }
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig("MySQLRep", MySQLRep);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
        MySQLRep = "n";
 | 
						|
    }
 | 
						|
 | 
						|
    // get system uptime and alarm if this is a restart after module outage
 | 
						|
    if ( gOAMParentModuleFlag )
 | 
						|
    {
 | 
						|
        log.writeLog(__LINE__, "Running Active");
 | 
						|
        log.writeLog(__LINE__, "Running Active", LOG_TYPE_DEBUG);
 | 
						|
    }
 | 
						|
    else
 | 
						|
    {
 | 
						|
        log.writeLog(__LINE__, "Running Standby");
 | 
						|
        log.writeLog(__LINE__, "Running Standby", LOG_TYPE_DEBUG);
 | 
						|
        runStandby = true;
 | 
						|
    }
 | 
						|
 | 
						|
    //get local module main IP address
 | 
						|
    ModuleConfig moduleconfig;
 | 
						|
    oam.getSystemConfig(config.moduleName(), moduleconfig);
 | 
						|
    HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
 | 
						|
    string localIPaddr = (*pt1).IPAddr;
 | 
						|
    localHostName = (*pt1).HostName;
 | 
						|
 | 
						|
    struct ifaddrs* addrs, *iap;
 | 
						|
    struct sockaddr_in* sa;
 | 
						|
    char buf[32];
 | 
						|
 | 
						|
    getifaddrs(&addrs);
 | 
						|
 | 
						|
    for (iap = addrs; iap != NULL; iap = iap->ifa_next)
 | 
						|
    {
 | 
						|
 | 
						|
        if (iap->ifa_addr && (iap->ifa_flags & IFF_UP) && iap->ifa_addr->sa_family == AF_INET)
 | 
						|
        {
 | 
						|
            sa = (struct sockaddr_in*)(iap->ifa_addr);
 | 
						|
            inet_ntop(iap->ifa_addr->sa_family, (void*) & (sa->sin_addr), buf, sizeof(buf));
 | 
						|
 | 
						|
            if (!strcmp(localIPaddr.c_str(), buf))
 | 
						|
            {
 | 
						|
                iface_name = iap->ifa_name;
 | 
						|
                break;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    freeifaddrs(addrs);
 | 
						|
    log.writeLog(__LINE__, "Main Ethernet Port = " + iface_name, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    //
 | 
						|
    //start a thread to ping all system modules
 | 
						|
    //
 | 
						|
    if (runStandby)
 | 
						|
    {
 | 
						|
        //running standby after startup
 | 
						|
        try
 | 
						|
        {
 | 
						|
            oam.processInitComplete("ProcessManager", oam::STANDBY);
 | 
						|
            log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
            string error = ex.what();
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
 | 
						|
        // create message thread
 | 
						|
        pthread_t MessageThread;
 | 
						|
        int ret = pthread_create (&MessageThread, NULL, (void* (*)(void*)) &messageThread, &config);
 | 
						|
 | 
						|
        if ( ret != 0 )
 | 
						|
            log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
 | 
						|
 | 
						|
        // create alarm message thread
 | 
						|
        pthread_t AlarmMessageThread;
 | 
						|
        ret = pthread_create (&AlarmMessageThread, NULL, (void* (*)(void*)) &alarmMessageThread, &config);
 | 
						|
 | 
						|
        if ( ret != 0 )
 | 
						|
            log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
 | 
						|
 | 
						|
        //monitor OAM Parent Module for failover
 | 
						|
        while (true)
 | 
						|
        {
 | 
						|
            if ( processManager.OAMParentModuleChange() == oam::API_SUCCESS )
 | 
						|
                break;
 | 
						|
 | 
						|
            log.writeLog(__LINE__, "OAMParentModuleChange failure", LOG_TYPE_WARNING);
 | 
						|
            // GO TRY AGAIN
 | 
						|
        }
 | 
						|
 | 
						|
        pthread_t srvThread;
 | 
						|
        int status = pthread_create (&srvThread, NULL, (void* (*)(void*)) &pingDeviceThread, NULL);
 | 
						|
 | 
						|
        if ( status != 0 )
 | 
						|
            log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
    else
 | 
						|
    {
 | 
						|
        //running active after startup
 | 
						|
        //Update DBRM section of Columnstore.xml
 | 
						|
        processManager.updateWorkerNodeconfig();
 | 
						|
//		processManager.distributeConfigFile("system");
 | 
						|
 | 
						|
        pthread_t srvThread;
 | 
						|
        int status = pthread_create (&srvThread, NULL, (void* (*)(void*)) &pingDeviceThread, NULL);
 | 
						|
 | 
						|
        if ( status != 0 )
 | 
						|
            log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | 
						|
 | 
						|
        // if HDFS, create a thread to push an image of activeAlarms to HDFS filesystem
 | 
						|
        if (HDFS)
 | 
						|
        {
 | 
						|
            pthread_t hdfsAlarmThread;
 | 
						|
            int status = pthread_create(&hdfsAlarmThread, NULL, (void* (*)(void*)) &hdfsActiveAlarmsPushingThread, NULL);
 | 
						|
 | 
						|
            if ( status != 0 )
 | 
						|
                log.writeLog(__LINE__, "hdfsActiveAlarmsPushingThread pthread_create failed, return code = " + oam.itoa(status), LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
 | 
						|
        sleep(5);
 | 
						|
 | 
						|
        SystemStatus systemstatus;
 | 
						|
 | 
						|
        try
 | 
						|
        {
 | 
						|
            oam.getSystemStatus(systemstatus);
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
//			string error = ex.what();
 | 
						|
//			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
//			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
 | 
						|
        if (systemstatus.SystemOpState != oam::MAN_OFFLINE &&
 | 
						|
                systemstatus.SystemOpState != oam::ACTIVE)
 | 
						|
        {
 | 
						|
            pthread_t mgrProcThread;
 | 
						|
            int status = pthread_create (&mgrProcThread, NULL, (void* (*)(void*)) &startMgrProcessThread, NULL);
 | 
						|
 | 
						|
            if ( status != 0 )
 | 
						|
                log.writeLog(__LINE__, "startMgrProcessThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
 | 
						|
        try
 | 
						|
        {
 | 
						|
            oam.processInitComplete("ProcessManager");
 | 
						|
            log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
            string error = ex.what();
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
 | 
						|
        //make sure ProcMgr IP Address is configured correctly
 | 
						|
        try
 | 
						|
        {
 | 
						|
            Config* sysConfig = Config::makeConfig();
 | 
						|
 | 
						|
            // get Standby IP address
 | 
						|
            ModuleConfig moduleconfig;
 | 
						|
            oam.getSystemConfig(config.moduleName(), moduleconfig);
 | 
						|
            HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
 | 
						|
            string IPaddr = (*pt1).IPAddr;
 | 
						|
 | 
						|
            sysConfig->setConfig("ProcMgr", "IPAddr", IPaddr);
 | 
						|
            sysConfig->setConfig("ProcMgr_Alarm", "IPAddr", IPaddr);
 | 
						|
 | 
						|
            log.writeLog(__LINE__, "set ProcMgr IPaddr to " + IPaddr, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
            //update Calpont Config table
 | 
						|
            try
 | 
						|
            {
 | 
						|
                sysConfig->write();
 | 
						|
            }
 | 
						|
            catch (...)
 | 
						|
            {
 | 
						|
                log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
 | 
						|
            }
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "ERROR: makeConfig failed", LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
 | 
						|
        try
 | 
						|
        {
 | 
						|
            oam.distributeConfigFile();
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {}
 | 
						|
 | 
						|
        // create message thread
 | 
						|
        pthread_t MessageThread;
 | 
						|
        int ret = pthread_create (&MessageThread, NULL, (void* (*)(void*)) &messageThread, &config);
 | 
						|
 | 
						|
        if ( ret != 0 )
 | 
						|
            log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
 | 
						|
 | 
						|
        // create alarm message thread
 | 
						|
        pthread_t AlarmMessageThread;
 | 
						|
        ret = pthread_create (&AlarmMessageThread, NULL, (void* (*)(void*)) &alarmMessageThread, &config);
 | 
						|
 | 
						|
        if ( ret != 0 )
 | 
						|
            log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    //start a thread to process heartbeat checks
 | 
						|
    //
 | 
						|
//	pthread_t heartThread;
 | 
						|
//	pthread_create (&heartThread, NULL, (void*(*)(void*)) &heartbeatProcessThread, NULL);
 | 
						|
 | 
						|
    //
 | 
						|
    //start a thread to read heartbeat messages
 | 
						|
    //
 | 
						|
//	pthread_t heartMsgThread;
 | 
						|
//	pthread_create (&heartMsgThread, NULL, (void*(*)(void*)) &heartbeatMsgThread, NULL);
 | 
						|
 | 
						|
    // suspend forever
 | 
						|
    while (true)
 | 
						|
    {
 | 
						|
        sleep(1000);
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/******************************************************************************************
 | 
						|
* @brief	messageThread
 | 
						|
*
 | 
						|
* purpose:	Read incoming messages
 | 
						|
*
 | 
						|
******************************************************************************************/
 | 
						|
static void messageThread(Configuration config)
 | 
						|
{
 | 
						|
    ProcessLog log;
 | 
						|
    ProcessManager processManager(config, log);
 | 
						|
    Oam oam;
 | 
						|
 | 
						|
    //check for running active, then launch
 | 
						|
    while (true)
 | 
						|
    {
 | 
						|
        if ( !runStandby)
 | 
						|
            break;
 | 
						|
 | 
						|
        sleep (1);
 | 
						|
    }
 | 
						|
 | 
						|
    log.writeLog(__LINE__, "Message Thread started ..", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    //read and cleanup port before trying to use
 | 
						|
    try
 | 
						|
    {
 | 
						|
        Config* sysConfig = Config::makeConfig();
 | 
						|
        string port = sysConfig->getConfig("ProcMgr", "Port");
 | 
						|
        string cmd = "fuser -k " + port + "/tcp >/dev/null 2>&1";
 | 
						|
 | 
						|
        if ( !rootUser)
 | 
						|
            cmd = "sudo fuser -k " + port + "/tcp >/dev/null 2>&1";
 | 
						|
 | 
						|
 | 
						|
        system(cmd.c_str());
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    //waiting for request
 | 
						|
    //
 | 
						|
    IOSocket fIos;
 | 
						|
 | 
						|
    for (;;)
 | 
						|
    {
 | 
						|
        try
 | 
						|
        {
 | 
						|
            MessageQueueServer procmgr("ProcMgr");
 | 
						|
 | 
						|
            for (;;)
 | 
						|
            {
 | 
						|
                try
 | 
						|
                {
 | 
						|
                    fIos = procmgr.accept();
 | 
						|
 | 
						|
                    pthread_t messagethread;
 | 
						|
                    int status = pthread_create (&messagethread, NULL, (void* (*)(void*)) &processMSG, &fIos);
 | 
						|
 | 
						|
                    if ( status != 0 )
 | 
						|
                        log.writeLog(__LINE__, "messagethread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | 
						|
                }
 | 
						|
                catch (...)
 | 
						|
                {}
 | 
						|
 | 
						|
            }
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
            string error = ex.what();
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + error, LOG_TYPE_ERROR);
 | 
						|
 | 
						|
            // takes 2 - 4 minites to free sockets, sleep and retry
 | 
						|
            sleep(60);
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
 | 
						|
            // takes 2 - 4 minites to free sockets, sleep and retry
 | 
						|
            sleep(60);
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/******************************************************************************************
 | 
						|
* @brief	alarmMesssageThread
 | 
						|
*
 | 
						|
* purpose:	Read incoming alarm messages
 | 
						|
*
 | 
						|
******************************************************************************************/
 | 
						|
static void alarmMessageThread(Configuration config)
 | 
						|
{
 | 
						|
    ProcessLog log;
 | 
						|
    ProcessManager processManager(config, log);
 | 
						|
    Oam oam;
 | 
						|
 | 
						|
    ByteStream msg;
 | 
						|
 | 
						|
    //check for running active, then launch
 | 
						|
    while (true)
 | 
						|
    {
 | 
						|
        if ( !runStandby)
 | 
						|
            break;
 | 
						|
 | 
						|
        sleep (1);
 | 
						|
    }
 | 
						|
 | 
						|
    log.writeLog(__LINE__, "Alarm Message Thread started ..", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    //read and cleanup port before trying to use
 | 
						|
    try
 | 
						|
    {
 | 
						|
        Config* sysConfig = Config::makeConfig();
 | 
						|
        string port = sysConfig->getConfig("ProcMgr_Alarm", "Port");
 | 
						|
        string cmd = "fuser -k " + port + "/tcp >/dev/null 2>&1";
 | 
						|
 | 
						|
        if ( !rootUser)
 | 
						|
            cmd = "sudo fuser -k " + port + "/tcp >/dev/null 2>&1";
 | 
						|
 | 
						|
        system(cmd.c_str());
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    //waiting for request
 | 
						|
    //
 | 
						|
    IOSocket fIos;
 | 
						|
 | 
						|
    for (;;)
 | 
						|
    {
 | 
						|
        try
 | 
						|
        {
 | 
						|
            MessageQueueServer procmgr("ProcMgr_Alarm");
 | 
						|
 | 
						|
            for (;;)
 | 
						|
            {
 | 
						|
                try
 | 
						|
                {
 | 
						|
                    fIos = procmgr.accept();
 | 
						|
 | 
						|
                    try
 | 
						|
                    {
 | 
						|
                        msg = fIos.read();
 | 
						|
 | 
						|
                        if (msg.length() <= 0)
 | 
						|
                            continue;
 | 
						|
 | 
						|
                        //log.writeLog(__LINE__,  "MSG RECEIVED: Process Alarm Message");
 | 
						|
 | 
						|
                        ByteStream::byte alarmID;
 | 
						|
                        std::string componentID;
 | 
						|
                        ByteStream::byte state;
 | 
						|
                        std::string ModuleName;
 | 
						|
                        std::string processName;
 | 
						|
                        ByteStream::byte pid;
 | 
						|
                        ByteStream::byte tid;
 | 
						|
 | 
						|
                        msg >> alarmID;
 | 
						|
                        msg >> componentID;
 | 
						|
                        msg >> state;
 | 
						|
                        msg >> ModuleName;
 | 
						|
                        msg >> processName;
 | 
						|
                        msg >> pid;
 | 
						|
                        msg >> tid;
 | 
						|
 | 
						|
                        Alarm calAlarm;
 | 
						|
 | 
						|
                        calAlarm.setAlarmID (alarmID);
 | 
						|
                        calAlarm.setComponentID (componentID);
 | 
						|
                        calAlarm.setState (state);
 | 
						|
                        calAlarm.setSname (ModuleName);
 | 
						|
                        calAlarm.setPname (processName);
 | 
						|
                        calAlarm.setPid (pid);
 | 
						|
                        calAlarm.setTid (tid);
 | 
						|
 | 
						|
                        ALARMManager aManager;
 | 
						|
                        aManager.processAlarmReport(calAlarm);
 | 
						|
                    }
 | 
						|
                    catch (exception& ex)
 | 
						|
                    {
 | 
						|
                        string error = ex.what();
 | 
						|
                        log.writeLog(__LINE__, "EXCEPTION ERROR on read for ProcMgr_Alarm:" + error, LOG_TYPE_ERROR);
 | 
						|
                        continue;
 | 
						|
                    }
 | 
						|
                    catch (...)
 | 
						|
                    {
 | 
						|
                        log.writeLog(__LINE__, "EXCEPTION ERROR on read for ProcMgr_Alarm: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                        continue;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                catch (exception& ex)
 | 
						|
                {
 | 
						|
                    string error = ex.what();
 | 
						|
                    log.writeLog(__LINE__, "EXCEPTION ERROR on accept for ProcMgr_Alarm:" + error, LOG_TYPE_ERROR);
 | 
						|
                    continue;
 | 
						|
                }
 | 
						|
                catch (...)
 | 
						|
                {
 | 
						|
                    log.writeLog(__LINE__, "EXCEPTION ERROR on accept for ProcMgr_Alarm: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                    continue;
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
            string error = ex.what();
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr_Alarm:" + error, LOG_TYPE_ERROR);
 | 
						|
 | 
						|
            sleep(1);
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr_Alarm: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
 | 
						|
            sleep(1);
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/******************************************************************************************
 | 
						|
* @brief	sigUser1Handler
 | 
						|
*
 | 
						|
* purpose:	Handler SIGUSER1 signal and initial failover
 | 
						|
*
 | 
						|
******************************************************************************************/
 | 
						|
static void sigUser1Handler(int sig)
 | 
						|
{
 | 
						|
    ProcessLog log;
 | 
						|
    Configuration config;
 | 
						|
    ProcessManager processManager(config, log);
 | 
						|
    Oam oam;
 | 
						|
    log.writeLog(__LINE__, "SIGUSER1 received, set startFailOver = true", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    startFailOver = true;
 | 
						|
}
 | 
						|
 | 
						|
/*****************************************************************************************
 | 
						|
* @brief	Start Mgr Process by module Thread
 | 
						|
*
 | 
						|
* purpose:	Send Messages to Module Process Monitors to start Processes
 | 
						|
*
 | 
						|
*****************************************************************************************/
 | 
						|
static void startMgrProcessThread()
 | 
						|
{
 | 
						|
    ProcessLog log;
 | 
						|
    Configuration config;
 | 
						|
    ProcessManager processManager(config, log);
 | 
						|
    Oam oam;
 | 
						|
    SystemModuleTypeConfig systemmoduletypeconfig;
 | 
						|
    ModuleTypeConfig PMSmoduletypeconfig;
 | 
						|
    ALARMManager aManager;
 | 
						|
 | 
						|
    int waitTime = 180;
 | 
						|
 | 
						|
    log.writeLog(__LINE__, "startMgrProcessThread launched", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    //get calpont software version and release
 | 
						|
    string localSoftwareInfo = columnstore_version + columnstore_release;
 | 
						|
    //get systemStartupOffline
 | 
						|
    string systemStartupOffline = "n";
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        Config* sysConfig = Config::makeConfig();
 | 
						|
 | 
						|
        systemStartupOffline = sysConfig->getConfig("Installation", "SystemStartupOffline");
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
        log.writeLog(__LINE__, "ERROR: Problem getting systemStartupOffline from the Calpont System Configuration file", LOG_TYPE_ERROR);
 | 
						|
        systemStartupOffline = "n";
 | 
						|
    }
 | 
						|
 | 
						|
    if ( systemStartupOffline == "y" )
 | 
						|
        log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_INFO);
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig(systemmoduletypeconfig);
 | 
						|
    }
 | 
						|
    catch (exception& ex)
 | 
						|
    {
 | 
						|
        string error = ex.what();
 | 
						|
        log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
        log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
 | 
						|
    //get Distributed Install
 | 
						|
    string DistributedInstall = "y";
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig("DistributedInstall", DistributedInstall);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
        log.writeLog(__LINE__, "ERROR: get DistributedInstall", LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
 | 
						|
    //Send out a start service just to make sure Columnstore is runing on remote nodes
 | 
						|
    //note this only works for systems with ssh-keys
 | 
						|
    /*	for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | 
						|
    	{
 | 
						|
    		int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | 
						|
    		if( moduleCount == 0)
 | 
						|
    			continue;
 | 
						|
 | 
						|
    		DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
    		for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | 
						|
    		{
 | 
						|
    		      //skip OAM Parent module
 | 
						|
    		      if ( (*pt).DeviceName == config.moduleName() )
 | 
						|
    			      continue;
 | 
						|
 | 
						|
    		      HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
 | 
						|
    		      for( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
 | 
						|
    		      {
 | 
						|
    			      //run remote command script
 | 
						|
    			      string cmd = startup::StartUp::installDir() + "/bin/remote_command.sh " + (*pt1).IPAddr + " ssh '" + startup::StartUp::installDir() + "/bin/columnstore restart' 0";
 | 
						|
    			      system(cmd.c_str());
 | 
						|
    		      }
 | 
						|
    		}
 | 
						|
    	}
 | 
						|
    */
 | 
						|
    //distribute system and process config files
 | 
						|
    processManager.distributeConfigFile("system");
 | 
						|
    processManager.distributeConfigFile("system", "ProcessConfig.xml");
 | 
						|
 | 
						|
    //send out moduleName to remote nodes, this will be used to startup new installed nodes
 | 
						|
    {
 | 
						|
        int status = API_SUCCESS;
 | 
						|
        int k = 0;
 | 
						|
 | 
						|
        for ( ; k < waitTime ; k++ )
 | 
						|
        {
 | 
						|
            if ( startsystemthreadStop )
 | 
						|
            {
 | 
						|
                processManager.setSystemState(oam::MAN_OFFLINE);
 | 
						|
 | 
						|
                // exit thread
 | 
						|
                log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
 | 
						|
                pthread_exit(0);
 | 
						|
            }
 | 
						|
 | 
						|
            status = API_SUCCESS;
 | 
						|
 | 
						|
            for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | 
						|
            {
 | 
						|
                int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | 
						|
 | 
						|
                if ( moduleCount == 0)
 | 
						|
                    continue;
 | 
						|
 | 
						|
                DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
 | 
						|
                for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | 
						|
                {
 | 
						|
                    string moduleName = (*pt).DeviceName;
 | 
						|
 | 
						|
                    //skip OAM Parent module
 | 
						|
                    if ( (*pt).DeviceName == config.moduleName() )
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
 | 
						|
                            (*pt).DisableState == oam::AUTODISABLEDSTATE )
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    int ret = processManager.configureModule(moduleName);
 | 
						|
 | 
						|
                    if ( ret != API_SUCCESS )
 | 
						|
                        status = ret;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            //get out of loop if all modules updated
 | 
						|
            if ( status == API_SUCCESS )
 | 
						|
                break;
 | 
						|
 | 
						|
            //retry after sleeping for a bit
 | 
						|
            sleep(1);
 | 
						|
        }
 | 
						|
 | 
						|
        if ( k == waitTime || status == API_FAILURE)
 | 
						|
        {
 | 
						|
            // system didn't successfull restart
 | 
						|
            processManager.setSystemState(oam::FAILED);
 | 
						|
            // exit thread
 | 
						|
            log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons running", LOG_TYPE_CRITICAL);
 | 
						|
            log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
 | 
						|
            pthread_exit(0);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    //wait until all modules are up after a system reboot
 | 
						|
    int i = 0;
 | 
						|
 | 
						|
    for ( ; i < waitTime ; i++ )
 | 
						|
    {
 | 
						|
        if ( startsystemthreadStop )
 | 
						|
        {
 | 
						|
            processManager.setSystemState(oam::MAN_OFFLINE);
 | 
						|
 | 
						|
            // exit thread
 | 
						|
            log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
 | 
						|
            pthread_exit(0);
 | 
						|
        }
 | 
						|
 | 
						|
        int status = API_SUCCESS;
 | 
						|
 | 
						|
        for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | 
						|
        {
 | 
						|
            if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType == "pm" )
 | 
						|
                PMSmoduletypeconfig = systemmoduletypeconfig.moduletypeconfig[i];
 | 
						|
 | 
						|
            int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | 
						|
 | 
						|
            if ( moduleCount == 0)
 | 
						|
                continue;
 | 
						|
 | 
						|
            DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
 | 
						|
            for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | 
						|
            {
 | 
						|
                string moduleName = (*pt).DeviceName;
 | 
						|
 | 
						|
                // Is Module UP
 | 
						|
                try
 | 
						|
                {
 | 
						|
                    bool degraded;
 | 
						|
                    int opState = oam::ACTIVE;
 | 
						|
                    oam.getModuleStatus(moduleName, opState, degraded);
 | 
						|
 | 
						|
                    if ( opState == oam::MAN_DISABLED )
 | 
						|
                        //mark all processes running on module man-offline except ProcMon
 | 
						|
                        processManager.setProcessStates(moduleName, oam::MAN_OFFLINE);
 | 
						|
 | 
						|
                    if ( opState == oam::AUTO_DISABLED)
 | 
						|
                        //mark all processes running on module auto-offline
 | 
						|
                        processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
 | 
						|
 | 
						|
                    if (opState == oam::INITIAL ||
 | 
						|
                            opState == oam::DOWN)
 | 
						|
                    {
 | 
						|
                        //a module is not up
 | 
						|
                        status = API_MINOR_FAILURE;
 | 
						|
                        break;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                catch (exception& ex)
 | 
						|
                {
 | 
						|
//					string error = ex.what();
 | 
						|
//					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
 | 
						|
                }
 | 
						|
                catch (...)
 | 
						|
                {
 | 
						|
//					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            if ( status == API_MINOR_FAILURE)
 | 
						|
            {
 | 
						|
                sleep(1);
 | 
						|
                break;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        if ( status == API_SUCCESS)
 | 
						|
            //all modules are up
 | 
						|
            break;
 | 
						|
    }
 | 
						|
 | 
						|
    if ( i == waitTime )
 | 
						|
    {
 | 
						|
        // system didn't successfull restart
 | 
						|
        processManager.setSystemState(oam::FAILED);
 | 
						|
 | 
						|
        // exit thread
 | 
						|
        log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all modules are UP", LOG_TYPE_CRITICAL);
 | 
						|
        pthread_exit(0);
 | 
						|
    }
 | 
						|
 | 
						|
    //configure the PMS settings
 | 
						|
    processManager.updatePMSconfig();
 | 
						|
 | 
						|
    if (HDFS)
 | 
						|
        //distribute config file
 | 
						|
        processManager.distributeConfigFile("system");
 | 
						|
 | 
						|
    //now wait until all procmons are ACTIVE and validate rpms on each module
 | 
						|
    int status = API_SUCCESS;
 | 
						|
    int k = 0;
 | 
						|
 | 
						|
    for ( ; k < waitTime ; k++ )
 | 
						|
    {
 | 
						|
        if ( startsystemthreadStop )
 | 
						|
        {
 | 
						|
            processManager.setSystemState(oam::MAN_OFFLINE);
 | 
						|
 | 
						|
            // exit thread
 | 
						|
            log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
 | 
						|
            pthread_exit(0);
 | 
						|
        }
 | 
						|
 | 
						|
        status = API_SUCCESS;
 | 
						|
 | 
						|
        for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | 
						|
        {
 | 
						|
            int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | 
						|
 | 
						|
            if ( moduleCount == 0)
 | 
						|
                continue;
 | 
						|
 | 
						|
            DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
 | 
						|
            for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | 
						|
            {
 | 
						|
                string moduleName = (*pt).DeviceName;
 | 
						|
 | 
						|
                if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
 | 
						|
                        (*pt).DisableState == oam::AUTODISABLEDSTATE )
 | 
						|
                    continue;
 | 
						|
 | 
						|
                int moduleOpState = oam::ACTIVE;
 | 
						|
 | 
						|
                // check module state
 | 
						|
                try
 | 
						|
                {
 | 
						|
                    bool degraded;
 | 
						|
                    oam.getModuleStatus(moduleName, moduleOpState, degraded);
 | 
						|
 | 
						|
                    // if up, set to MAN_INIT
 | 
						|
                    if ( HDFS &&
 | 
						|
                            (moduleOpState == oam::UP) )
 | 
						|
                    {
 | 
						|
                        processManager.setModuleState(moduleName, oam::MAN_INIT);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                catch (exception& ex)
 | 
						|
                {
 | 
						|
//					string error = ex.what();
 | 
						|
//					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
 | 
						|
                }
 | 
						|
                catch (...)
 | 
						|
                {
 | 
						|
//					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                }
 | 
						|
 | 
						|
                // Is Module's ProcMon ACTIVE and module status has been updated
 | 
						|
                int opState = oam::ACTIVE;
 | 
						|
 | 
						|
                try
 | 
						|
                {
 | 
						|
                    ProcessStatus procstat;
 | 
						|
                    oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
 | 
						|
                    opState = procstat.ProcessOpState;
 | 
						|
 | 
						|
                    if (opState != oam::ACTIVE)
 | 
						|
                    {
 | 
						|
                        //skip if Not ACTIVE
 | 
						|
                        log.writeLog(__LINE__, "Module ProcMon not active yet: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                        status = API_MINOR_FAILURE;
 | 
						|
                        continue;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                catch (exception& ex)
 | 
						|
                {
 | 
						|
//					string error = ex.what();
 | 
						|
//					log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
                    status = API_MINOR_FAILURE;
 | 
						|
                    continue;
 | 
						|
                }
 | 
						|
                catch (...)
 | 
						|
                {
 | 
						|
//					log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                    status = API_MINOR_FAILURE;
 | 
						|
                    continue;
 | 
						|
                }
 | 
						|
 | 
						|
                //skip OAM Parent module
 | 
						|
                if ( moduleName == config.moduleName() )
 | 
						|
                    continue;
 | 
						|
 | 
						|
                //ProcMon ACTIVE, validate the software release and version of that module
 | 
						|
                ByteStream msg;
 | 
						|
                ByteStream::byte requestID = GETSOFTWAREINFO;
 | 
						|
                msg << requestID;
 | 
						|
 | 
						|
                string moduleSoftwareInfo = processManager.sendMsgProcMon1( moduleName, msg, requestID );
 | 
						|
 | 
						|
                if ( moduleSoftwareInfo == "FAILED" )
 | 
						|
                    continue;
 | 
						|
 | 
						|
                if ( localSoftwareInfo != moduleSoftwareInfo )
 | 
						|
                {
 | 
						|
                    // module not running on same Calpont Software build as this local Director
 | 
						|
                    // alarm and fail the module
 | 
						|
                    log.writeLog(__LINE__, "Software Version mismatch : " + moduleName + "/" + localSoftwareInfo + "/" + moduleSoftwareInfo, LOG_TYPE_CRITICAL);
 | 
						|
 | 
						|
                    aManager.sendAlarmReport(moduleName.c_str(), INVALID_SW_VERSION, SET);
 | 
						|
                    processManager.setModuleState(moduleName, oam::FAILED);
 | 
						|
                    status = API_FAILURE;
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        //get out of loop if all modules ACTTVE or MAN_OFFLINE
 | 
						|
        if ( status == API_SUCCESS )
 | 
						|
        {
 | 
						|
            if ( systemStartupOffline == "y" )
 | 
						|
            {
 | 
						|
                processManager.setSystemState(oam::MAN_OFFLINE);
 | 
						|
                log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_DEBUG);
 | 
						|
            }
 | 
						|
 | 
						|
            break;
 | 
						|
        }
 | 
						|
        else
 | 
						|
        {
 | 
						|
            //get out of loop if start module failed
 | 
						|
            if ( status == API_FAILURE )
 | 
						|
                break;
 | 
						|
 | 
						|
            //retry after sleeping for a bit
 | 
						|
            sleep(1);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    if ( k == waitTime || status == API_FAILURE)
 | 
						|
    {
 | 
						|
        // system didn't successfull restart
 | 
						|
        processManager.setSystemState(oam::FAILED);
 | 
						|
        // exit thread
 | 
						|
        log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons ACTIVE", LOG_TYPE_CRITICAL);
 | 
						|
        log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
 | 
						|
        pthread_exit(0);
 | 
						|
    }
 | 
						|
    else
 | 
						|
    {
 | 
						|
        //distribute config file
 | 
						|
//		processManager.distributeConfigFile("system");
 | 
						|
 | 
						|
        if ( systemStartupOffline == "n" && status == API_SUCCESS )
 | 
						|
        {
 | 
						|
            oam::DeviceNetworkList devicenetworklist;
 | 
						|
            pthread_t startsystemthread;
 | 
						|
            int status = pthread_create (&startsystemthread, NULL, (void* (*)(void*)) &startSystemThread, &devicenetworklist);
 | 
						|
 | 
						|
            if ( status != 0 )
 | 
						|
            {
 | 
						|
                log.writeLog(__LINE__, "STARTSYSTEMS: pthread_create failed, return status = " + oam.itoa(status));
 | 
						|
                status = API_FAILURE;
 | 
						|
            }
 | 
						|
 | 
						|
            if (status == 0)
 | 
						|
            {
 | 
						|
                pthread_join(startsystemthread, NULL);
 | 
						|
                status = startsystemthreadStatus;
 | 
						|
            }
 | 
						|
 | 
						|
            if ( status != API_SUCCESS )
 | 
						|
            {
 | 
						|
                // system didn't successfull restart
 | 
						|
                processManager.setSystemState(oam::FAILED);
 | 
						|
                log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, error returned from startSystemThread", LOG_TYPE_CRITICAL);
 | 
						|
            }
 | 
						|
            else
 | 
						|
                //distribute config file
 | 
						|
                processManager.distributeConfigFile("system");
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    // exit thread
 | 
						|
    log.writeLog(__LINE__, "startMgrProcessThread Exit", LOG_TYPE_DEBUG);
 | 
						|
    pthread_exit(0);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/*****************************************************************************************
 | 
						|
* @brief	pingDeviceThread
 | 
						|
*
 | 
						|
* purpose:	perform ping testing on the devices within the system
 | 
						|
*
 | 
						|
*****************************************************************************************/
 | 
						|
void pingDeviceThread()
 | 
						|
{
 | 
						|
    ProcessLog log;
 | 
						|
    Configuration config;
 | 
						|
    ProcessManager processManager(config, log);
 | 
						|
    Oam oam;
 | 
						|
    ModuleTypeConfig moduletypeconfig;
 | 
						|
    ALARMManager aManager;
 | 
						|
    BRM::DBRM dbrm;
 | 
						|
 | 
						|
    log.writeLog(__LINE__, "pingDeviceThread launched", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    string cmdLine = "ping ";
 | 
						|
    string cmdOption = " -c 1 -w 5 >> /dev/null";
 | 
						|
    string cmd;
 | 
						|
    string deviceIP;
 | 
						|
 | 
						|
    //
 | 
						|
    // Get Module Info
 | 
						|
    //
 | 
						|
    SystemModuleTypeConfig systemModuleTypeConfig;
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig(systemModuleTypeConfig);
 | 
						|
    }
 | 
						|
    catch (exception& ex)
 | 
						|
    {
 | 
						|
        string error = ex.what();
 | 
						|
        log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
        log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
 | 
						|
    //Build the initial list, clear module state
 | 
						|
 | 
						|
    for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | 
						|
    {
 | 
						|
        int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | 
						|
 | 
						|
        if ( moduleCount == 0 )
 | 
						|
            // skip of no modules configured
 | 
						|
            continue;
 | 
						|
 | 
						|
        DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
 | 
						|
        for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | 
						|
        {
 | 
						|
            moduleInfoList.insert(moduleList::value_type((*pt).DeviceName, 0));
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    typedef   map<string, int>	nicList;
 | 
						|
    nicList	nicInfoList;
 | 
						|
 | 
						|
    //Build the initial list, clear NIC state
 | 
						|
 | 
						|
    for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | 
						|
    {
 | 
						|
        int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | 
						|
 | 
						|
        if ( moduleCount == 0 )
 | 
						|
            // skip of no modules configured
 | 
						|
            continue;
 | 
						|
 | 
						|
        DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
 | 
						|
        for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | 
						|
        {
 | 
						|
 | 
						|
            HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
 | 
						|
 | 
						|
            for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
 | 
						|
            {
 | 
						|
                nicInfoList.insert(moduleList::value_type((*pt1).HostName, 0));
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    //
 | 
						|
    // Get ext device info
 | 
						|
    //
 | 
						|
    SystemExtDeviceConfig systemextdeviceconfig;
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig(systemextdeviceconfig);
 | 
						|
    }
 | 
						|
    catch (exception& ex)
 | 
						|
    {
 | 
						|
        string error = ex.what();
 | 
						|
        log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
    catch (...)
 | 
						|
    {
 | 
						|
//		log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
    }
 | 
						|
 | 
						|
    typedef   map<string, int>	extDeviceList;
 | 
						|
    extDeviceList	extDeviceInfoList;
 | 
						|
 | 
						|
    //Build the initial list, clear ext device state
 | 
						|
 | 
						|
    for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count; i++)
 | 
						|
    {
 | 
						|
        string name = systemextdeviceconfig.extdeviceconfig[i].Name;
 | 
						|
        extDeviceInfoList.insert(extDeviceList::value_type(name, 0));
 | 
						|
    }
 | 
						|
 | 
						|
    //storage config
 | 
						|
    string DBRootStorageType;
 | 
						|
 | 
						|
    try
 | 
						|
    {
 | 
						|
        oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
 | 
						|
    }
 | 
						|
    catch (...) {}
 | 
						|
 | 
						|
    log.writeLog(__LINE__, "pingDeviceThread: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
    int rtnCode = 0;
 | 
						|
    Configuration configData;
 | 
						|
    SystemStatus systemstatus;
 | 
						|
 | 
						|
    bool enableModuleMonitor = true;
 | 
						|
 | 
						|
    bool LANOUTAGEACTIVE = false;
 | 
						|
    bool HOTSTANDBYACTIVE = false;
 | 
						|
    bool downActiveOAMModule = false;
 | 
						|
 | 
						|
    // monitor module and external device loop
 | 
						|
 | 
						|
    while (true)
 | 
						|
    {
 | 
						|
        //don't peform module test if system is MAN_OFFLINE or not getting status's
 | 
						|
        while (true)
 | 
						|
        {
 | 
						|
            SystemStatus systemstatus;
 | 
						|
 | 
						|
            try
 | 
						|
            {
 | 
						|
                oam.getSystemStatus(systemstatus);
 | 
						|
 | 
						|
                if (systemstatus.SystemOpState == oam::MAN_OFFLINE )
 | 
						|
                    sleep(5);
 | 
						|
                else
 | 
						|
                    break;
 | 
						|
            }
 | 
						|
            catch (...)
 | 
						|
            {
 | 
						|
                sleep(5);
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        // Module Heartbeat period and failure count
 | 
						|
        int ModuleHeartbeatPeriod;
 | 
						|
        int ModuleHeartbeatCount;
 | 
						|
 | 
						|
        try
 | 
						|
        {
 | 
						|
            oam.getSystemConfig("ModuleHeartbeatPeriod", ModuleHeartbeatPeriod);
 | 
						|
            oam.getSystemConfig("ModuleHeartbeatCount", ModuleHeartbeatCount);
 | 
						|
            ModuleHeartbeatPeriod = ModuleHeartbeatPeriod * 10;
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
            string error = ex.what();
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
            sleep(5);
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
            sleep(5);
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        // skip testing if Heartbeat is disable
 | 
						|
        if ( ModuleHeartbeatPeriod <= 0 )
 | 
						|
        {
 | 
						|
            if ( enableModuleMonitor )
 | 
						|
                log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to disabled", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
            enableModuleMonitor = false;
 | 
						|
        }
 | 
						|
        else
 | 
						|
        {
 | 
						|
            if ( !enableModuleMonitor && moduleInfoList.size() > 1 )
 | 
						|
                log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to enabled", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
            enableModuleMonitor = true;
 | 
						|
        }
 | 
						|
 | 
						|
        //single server system
 | 
						|
        if ( moduleInfoList.size() <= 1)
 | 
						|
            enableModuleMonitor = false;
 | 
						|
 | 
						|
        //
 | 
						|
        // ping NIC
 | 
						|
        //
 | 
						|
 | 
						|
        // read each time to catch updates
 | 
						|
        pthread_mutex_lock(&THREAD_LOCK);
 | 
						|
        systemModuleTypeConfig.moduletypeconfig.clear();
 | 
						|
 | 
						|
        try
 | 
						|
        {
 | 
						|
            oam.getSystemConfig(systemModuleTypeConfig);
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
            string error = ex.what();
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
            sleep(5);
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
            sleep(5);
 | 
						|
            continue;
 | 
						|
        }
 | 
						|
 | 
						|
        pthread_mutex_unlock(&THREAD_LOCK);
 | 
						|
 | 
						|
        bool LANOUTAGESUPPORT = true;
 | 
						|
        bool LOCALNICDOWN = false;
 | 
						|
 | 
						|
        if (enableModuleMonitor)
 | 
						|
        {
 | 
						|
            //test main local Ethernet interface status
 | 
						|
            for ( int count = 0 ; ; count ++)
 | 
						|
            {
 | 
						|
                int sockfd;
 | 
						|
                struct ifreq ifr;
 | 
						|
 | 
						|
                sockfd = socket(AF_INET, SOCK_DGRAM, 0);
 | 
						|
 | 
						|
                if (sockfd == -1)
 | 
						|
                {
 | 
						|
                    log.writeLog(__LINE__, "Could not get socket to check", LOG_TYPE_ERROR);
 | 
						|
                    close(sockfd);
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
 | 
						|
                /* get interface name */
 | 
						|
                strncpy(ifr.ifr_name, iface_name.c_str(), IFNAMSIZ);
 | 
						|
 | 
						|
                /* Read interface flags */
 | 
						|
                if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0)
 | 
						|
                {
 | 
						|
                    // not supported
 | 
						|
                    close(sockfd);
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
 | 
						|
                if (ifr.ifr_flags & IFF_UP)
 | 
						|
                {
 | 
						|
                    // ethernet port is up, continue on
 | 
						|
                    close(sockfd);
 | 
						|
                    break;
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    // ethernet port is down
 | 
						|
                    log.writeLog(__LINE__, "NIC #1 is DOWN", LOG_TYPE_WARNING);
 | 
						|
 | 
						|
                    if ( count >= ModuleHeartbeatCount )
 | 
						|
                    {
 | 
						|
                        LOCALNICDOWN = true;
 | 
						|
                        close(sockfd);
 | 
						|
                        break;
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                        sleep(5);
 | 
						|
                }
 | 
						|
 | 
						|
                close(sockfd);
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        // if the NIC is down, go directly to LAN outage processing
 | 
						|
        if ( !LOCALNICDOWN )
 | 
						|
        {
 | 
						|
            for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | 
						|
            {
 | 
						|
                int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | 
						|
 | 
						|
                if ( moduleCount == 0)
 | 
						|
                    continue;
 | 
						|
 | 
						|
                DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
 | 
						|
                for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | 
						|
                {
 | 
						|
                    string moduleName = (*pt).DeviceName;
 | 
						|
                    string ipAddr;
 | 
						|
                    string hostName;
 | 
						|
                    int moduleState = oam::INITIAL;
 | 
						|
                    HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
 | 
						|
 | 
						|
                    for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
 | 
						|
                    {
 | 
						|
                        ipAddr = (*pt1).IPAddr;
 | 
						|
                        hostName = (*pt1).HostName;
 | 
						|
 | 
						|
                        if (enableModuleMonitor)
 | 
						|
                        {
 | 
						|
                            // perform ping test
 | 
						|
                            cmd = cmdLine + ipAddr + cmdOption;
 | 
						|
                            rtnCode = system(cmd.c_str());
 | 
						|
                            rtnCode = WEXITSTATUS(rtnCode);
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                            rtnCode = 0;
 | 
						|
 | 
						|
                        int currentNICState = oam::UP;
 | 
						|
 | 
						|
                        try
 | 
						|
                        {
 | 
						|
                            oam.getNICStatus(hostName, currentNICState);
 | 
						|
                        }
 | 
						|
                        catch (exception& ex)
 | 
						|
                        {
 | 
						|
//							string error = ex.what();
 | 
						|
//							log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
                        }
 | 
						|
                        catch (...)
 | 
						|
                        {
 | 
						|
//							log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                        }
 | 
						|
 | 
						|
                        switch (rtnCode)
 | 
						|
                        {
 | 
						|
                            case 0:
 | 
						|
 | 
						|
                                //NIC Ack ping
 | 
						|
                                if ( currentNICState != oam::UP )
 | 
						|
                                {
 | 
						|
                                    processManager.setNICState(hostName, oam::UP);
 | 
						|
 | 
						|
                                    if ( ModuleHeartbeatPeriod > 0 )
 | 
						|
                                        //Clear an alarm
 | 
						|
                                        aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, CLEAR);
 | 
						|
                                }
 | 
						|
 | 
						|
                                //set LAN Outage indicator to false since a module is responding
 | 
						|
                                if ( moduleState == oam::INITIAL)
 | 
						|
                                    if ( moduleName != config.moduleName())
 | 
						|
                                        LANOUTAGESUPPORT = false;
 | 
						|
 | 
						|
                                //set Module State
 | 
						|
                                if ( moduleState == oam::INITIAL || moduleState == oam::UP)
 | 
						|
                                    moduleState = oam::UP;
 | 
						|
 | 
						|
                                break;
 | 
						|
 | 
						|
                            default:
 | 
						|
 | 
						|
                                //NIC failed to respond to ping
 | 
						|
                                if ( currentNICState != oam::DOWN )
 | 
						|
                                {
 | 
						|
                                    log.writeLog(__LINE__, "NIC failed to respond to ping: " + hostName, LOG_TYPE_WARNING);
 | 
						|
                                    processManager.setNICState(hostName, oam::DOWN);
 | 
						|
 | 
						|
                                    if ( ModuleHeartbeatPeriod > 0 )
 | 
						|
                                        //Issue an alarm
 | 
						|
                                        aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, SET);
 | 
						|
                                }
 | 
						|
 | 
						|
                                //set Module State
 | 
						|
                                if ( moduleState == oam::INITIAL || moduleState == oam::DOWN)
 | 
						|
                                    moduleState = oam::DOWN;
 | 
						|
                                else
 | 
						|
                                    // NIC 1 is up and NIC 2 is down
 | 
						|
                                    moduleState = oam::DEGRADED;
 | 
						|
 | 
						|
                                break;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    // if module monitoring is disabled, default module state to up
 | 
						|
                    if (!enableModuleMonitor)
 | 
						|
                        moduleState = oam::UP;
 | 
						|
 | 
						|
                    // moduleState coming out of the NIC monitoring loop
 | 
						|
                    // UP - ALL NICs passed ping test
 | 
						|
                    // DEGRADED - NIC 1 passed, NIC 2 failed ping test
 | 
						|
                    // DOWN - NIC 1 or ALL NICs failed ping test
 | 
						|
 | 
						|
                    int opState = oam::ACTIVE;
 | 
						|
 | 
						|
                    try
 | 
						|
                    {
 | 
						|
                        bool degraded;
 | 
						|
                        oam.getModuleStatus(moduleName, opState, degraded);
 | 
						|
                    }
 | 
						|
                    catch (exception& ex)
 | 
						|
                    {
 | 
						|
//						string error = ex.what();
 | 
						|
//						log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
 | 
						|
                    }
 | 
						|
                    catch (...)
 | 
						|
                    {
 | 
						|
//						log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                    }
 | 
						|
 | 
						|
                    // skip module check if not in use or in FAILED state
 | 
						|
                    if (opState == oam::MAN_OFFLINE ||
 | 
						|
                            opState == oam::MAN_DISABLED ||
 | 
						|
                            opState == oam::FAILED)
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    //fast track a restart of a downed failover module
 | 
						|
                    if ( gdownActiveOAMModule == moduleName )
 | 
						|
                    {
 | 
						|
                        moduleInfoList[moduleName] = ModuleHeartbeatCount - 1;
 | 
						|
                        gdownActiveOAMModule.clear();
 | 
						|
                        moduleState = oam::DOWN;
 | 
						|
                        downActiveOAMModule = true;
 | 
						|
                    }
 | 
						|
 | 
						|
                    vector<string>::iterator pt2 = downModuleList.begin();
 | 
						|
 | 
						|
                    for ( ; pt2 != downModuleList.end() ; pt2++)
 | 
						|
                    {
 | 
						|
                        if ( *pt2 == moduleName )
 | 
						|
                        {
 | 
						|
                            moduleInfoList[moduleName] = ModuleHeartbeatCount - 1;
 | 
						|
                            moduleState = oam::DOWN;
 | 
						|
                            downModuleList.erase(pt2);
 | 
						|
                            break;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    switch (moduleState)
 | 
						|
                    {
 | 
						|
                        case oam::DEGRADED:
 | 
						|
                            // do nothing for now
 | 
						|
                            break;
 | 
						|
 | 
						|
                        case oam::UP:
 | 
						|
 | 
						|
// commented out: only come up when both NICs are up; otherwise the pms list will not have the second NIC in it
 | 
						|
//						case oam::DEGRADED:
 | 
						|
                            if (opState == oam::DOWN || opState == oam::INITIAL
 | 
						|
                                    || opState == oam::AUTO_DISABLED)
 | 
						|
                            {
 | 
						|
                                //Set the module state to up
 | 
						|
                                processManager.setModuleState(moduleName, moduleState);
 | 
						|
                            }
 | 
						|
 | 
						|
                            if ( moduleName == config.OAMStandbyName() )
 | 
						|
                                HOTSTANDBYACTIVE = true;
 | 
						|
 | 
						|
                            // if LAN OUTAGE ACTIVE, skip module checks
 | 
						|
                            if (LANOUTAGEACTIVE)
 | 
						|
                                break;
 | 
						|
 | 
						|
                            try
 | 
						|
                            {
 | 
						|
                                oam.getSystemConfig("MySQLRep", MySQLRep);
 | 
						|
                            }
 | 
						|
                            catch (...)
 | 
						|
                            {
 | 
						|
                                MySQLRep = "n";
 | 
						|
                            }
 | 
						|
 | 
						|
                            if (moduleInfoList[moduleName] >= ModuleHeartbeatCount ||
 | 
						|
                                    opState == oam::DOWN || opState == oam::AUTO_DISABLED)
 | 
						|
                            {
 | 
						|
                                log.writeLog(__LINE__, "Module alive, bring it back online: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                string PrimaryUMModuleName = config.moduleName();
 | 
						|
 | 
						|
                                try
 | 
						|
                                {
 | 
						|
                                    oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
 | 
						|
                                }
 | 
						|
                                catch (...) {}
 | 
						|
 | 
						|
                                bool busy = false;
 | 
						|
 | 
						|
                                for ( int retry = 0 ; retry < 20 ; retry++ )
 | 
						|
                                {
 | 
						|
                                    busy = false;
 | 
						|
                                    ProcessStatus DMLprocessstatus;
 | 
						|
 | 
						|
                                    try
 | 
						|
                                    {
 | 
						|
                                        oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
 | 
						|
 | 
						|
                                        if ( DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "DMLProc in BUSY_INIT, skip bringing module online " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                            busy = true;
 | 
						|
                                            sleep(5);
 | 
						|
                                        }
 | 
						|
                                        else
 | 
						|
                                            break;
 | 
						|
                                    }
 | 
						|
                                    catch (...)
 | 
						|
                                    {
 | 
						|
                                        sleep(5);
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
 | 
						|
                                if (busy)
 | 
						|
                                    break;
 | 
						|
 | 
						|
                                //set query system state not ready
 | 
						|
                                BRM::DBRM dbrm;
 | 
						|
                                dbrm.setSystemQueryReady(false);
 | 
						|
 | 
						|
                                processManager.setQuerySystemState(false);
 | 
						|
 | 
						|
                                processManager.setSystemState(oam::BUSY_INIT);
 | 
						|
 | 
						|
                                processManager.reinitProcessType("cpimport");
 | 
						|
 | 
						|
                                // halt the dbrm
 | 
						|
                                oam.dbrmctl("halt");
 | 
						|
                                log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
 | 
						|
 | 
						|
                                //send notification
 | 
						|
                                oam.sendDeviceNotification(config.moduleName(), MODULE_UP);
 | 
						|
 | 
						|
                                int status;
 | 
						|
                                DBRootConfigList dbrootConfigList;
 | 
						|
 | 
						|
                                // if shared pm, move dbroots back to pm
 | 
						|
                                if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | 
						|
                                        ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | 
						|
                                        ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | 
						|
                                {
 | 
						|
 | 
						|
                                    //restart to get the versionbuffer files closed so it can be unmounted
 | 
						|
                                    processManager.restartProcessType("WriteEngineServer", moduleName);
 | 
						|
 | 
						|
                                    //set module to enable state
 | 
						|
                                    processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
 | 
						|
 | 
						|
                                    downActiveOAMModule = false;
 | 
						|
                                    int retry;
 | 
						|
 | 
						|
                                    for ( retry = 0 ; retry < 5 ; retry++ )
 | 
						|
                                    {
 | 
						|
                                        try
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "Call autoUnMovePmDbroot", LOG_TYPE_DEBUG);
 | 
						|
                                            oam.autoUnMovePmDbroot(moduleName);
 | 
						|
 | 
						|
                                            //check if any dbroots got assigned back to this module
 | 
						|
                                            // they could not be moved if they were busy on other pms
 | 
						|
                                            try
 | 
						|
                                            {
 | 
						|
                                                int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
 | 
						|
                                                oam.getPmDbrootConfig(moduleID, dbrootConfigList);
 | 
						|
 | 
						|
                                                if (  dbrootConfigList.size() == 0 )
 | 
						|
                                                {
 | 
						|
                                                    // no dbroots, fail module
 | 
						|
                                                    log.writeLog(__LINE__, "autoUnMovePmDbroot left no dbroots mounted, failing module restart: " + moduleName, LOG_TYPE_WARNING);
 | 
						|
 | 
						|
                                                    //Issue an alarm
 | 
						|
                                                    aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | 
						|
 | 
						|
                                                    //set module to disable state
 | 
						|
                                                    processManager.disableModule(moduleName, true);
 | 
						|
 | 
						|
                                                    //call dbrm control
 | 
						|
                                                    oam.dbrmctl("reload");
 | 
						|
                                                    log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                                    // resume the dbrm
 | 
						|
                                                    oam.dbrmctl("resume");
 | 
						|
                                                    log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                                    //clear count
 | 
						|
                                                    moduleInfoList[moduleName] = 0;
 | 
						|
 | 
						|
                                                    processManager.setSystemState(oam::ACTIVE);
 | 
						|
 | 
						|
                                                    //set query system state ready
 | 
						|
                                                    processManager.setQuerySystemState(true);
 | 
						|
 | 
						|
                                                    break;
 | 
						|
                                                }
 | 
						|
                                            }
 | 
						|
                                            catch (...)
 | 
						|
                                            {}
 | 
						|
 | 
						|
                                            log.writeLog(__LINE__, "autoUnMovePmDbroot success", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                            //distribute config file
 | 
						|
                                            processManager.distributeConfigFile("system");
 | 
						|
 | 
						|
                                            break;
 | 
						|
                                        }
 | 
						|
                                        catch (...)
 | 
						|
                                        {
 | 
						|
                                            sleep(5);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    if ( retry == 5 )
 | 
						|
                                    {
 | 
						|
                                        log.writeLog(__LINE__, "autoUnMovePmDbroot: Failed. Fail Module", LOG_TYPE_WARNING);
 | 
						|
 | 
						|
                                        //Issue an alarm
 | 
						|
                                        aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | 
						|
 | 
						|
                                        //set module to disable state
 | 
						|
                                        processManager.disableModule(moduleName, true);
 | 
						|
 | 
						|
                                        //call dbrm control
 | 
						|
                                        oam.dbrmctl("reload");
 | 
						|
                                        log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                        // resume the dbrm
 | 
						|
                                        oam.dbrmctl("resume");
 | 
						|
                                        log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                        //clear count
 | 
						|
                                        moduleInfoList[moduleName] = 0;
 | 
						|
 | 
						|
                                        processManager.setSystemState(oam::ACTIVE);
 | 
						|
 | 
						|
                                        //set query system state ready
 | 
						|
                                        processManager.setQuerySystemState(true);
 | 
						|
 | 
						|
                                        break;
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
                                else
 | 
						|
                                    //set module to enable state
 | 
						|
                                    processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
 | 
						|
 | 
						|
                                //restart module processes
 | 
						|
                                int retry = 0;
 | 
						|
 | 
						|
                                int ModuleProcMonWaitCount = 12;
 | 
						|
 | 
						|
                                try
 | 
						|
                                {
 | 
						|
                                    oam.getSystemConfig("ModuleProcMonWaitCount", ModuleProcMonWaitCount);
 | 
						|
                                }
 | 
						|
                                catch (...)
 | 
						|
                                {
 | 
						|
                                    ModuleProcMonWaitCount = 12;
 | 
						|
                                }
 | 
						|
 | 
						|
                                for ( ; retry < ModuleProcMonWaitCount ; retry ++ )
 | 
						|
                                {
 | 
						|
                                    // first, wait until module's ProcMon is ACTIVE
 | 
						|
                                    int opState = oam::ACTIVE;
 | 
						|
 | 
						|
                                    try
 | 
						|
                                    {
 | 
						|
                                        ProcessStatus procstat;
 | 
						|
                                        oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
 | 
						|
                                        opState = procstat.ProcessOpState;
 | 
						|
 | 
						|
                                        if (opState != oam::ACTIVE)
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "Waiting for Module ProcMon to go ACTIVE: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                            sleep(5);
 | 
						|
                                            continue;
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
                                    catch (exception& ex)
 | 
						|
                                    {
 | 
						|
//										string error = ex.what();
 | 
						|
//										log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
                                        sleep(5);
 | 
						|
                                        continue;
 | 
						|
                                    }
 | 
						|
                                    catch (...)
 | 
						|
                                    {
 | 
						|
//										log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                                        sleep(5);
 | 
						|
                                        continue;
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    //check and assign Elastic IP Address
 | 
						|
                                    int AmazonElasticIPCount = 0;
 | 
						|
 | 
						|
                                    try
 | 
						|
                                    {
 | 
						|
                                        oam.getSystemConfig("AmazonElasticIPCount", AmazonElasticIPCount);
 | 
						|
                                    }
 | 
						|
                                    catch (...)
 | 
						|
                                    {
 | 
						|
                                        AmazonElasticIPCount = 0;
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    for ( int id = 1 ; id < AmazonElasticIPCount + 1 ; id++ )
 | 
						|
                                    {
 | 
						|
                                        string AmazonElasticModule = "AmazonElasticModule" + oam.itoa(id);
 | 
						|
                                        string ELmoduleName;
 | 
						|
 | 
						|
                                        try
 | 
						|
                                        {
 | 
						|
                                            oam.getSystemConfig(AmazonElasticModule, ELmoduleName);
 | 
						|
                                        }
 | 
						|
                                        catch (...) {}
 | 
						|
 | 
						|
                                        if ( ELmoduleName == moduleName )
 | 
						|
                                        {
 | 
						|
                                            //match found assign Elastic IP Address
 | 
						|
                                            string AmazonElasticIPAddr = "AmazonElasticIPAddr" + oam.itoa(id);
 | 
						|
                                            string ELIPaddress;
 | 
						|
 | 
						|
                                            try
 | 
						|
                                            {
 | 
						|
                                                oam.getSystemConfig(AmazonElasticIPAddr, ELIPaddress);
 | 
						|
                                            }
 | 
						|
                                            catch (...) {}
 | 
						|
 | 
						|
                                            try
 | 
						|
                                            {
 | 
						|
                                                oam.assignElasticIP(hostName, ELIPaddress);
 | 
						|
                                                log.writeLog(__LINE__, "Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_DEBUG);
 | 
						|
                                            }
 | 
						|
                                            catch (...)
 | 
						|
                                            {
 | 
						|
                                                log.writeLog(__LINE__, "Failed to Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_ERROR);
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            break;
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    // next, stopmodule to start up clean
 | 
						|
                                    status = processManager.stopModule(moduleName, oam::FORCEFUL, false);
 | 
						|
 | 
						|
                                    if ( status == oam::API_SUCCESS )
 | 
						|
                                    {
 | 
						|
                                        string newStandbyModule = processManager.getStandbyModule();
 | 
						|
 | 
						|
                                        if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
 | 
						|
                                        {
 | 
						|
                                            processManager.setStandbyModule(newStandbyModule);
 | 
						|
                                        }
 | 
						|
                                        else
 | 
						|
                                        {
 | 
						|
                                            if ( newStandbyModule == "NONE")
 | 
						|
                                                if ( moduleName.substr(0, MAX_MODULE_TYPE_SIZE) == "pm" )
 | 
						|
                                                    processManager.setStandbyModule(moduleName);
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        if ((moduleName.find("pm") == 0) && (dbrootConfigList.size() > 0))
 | 
						|
                                        {
 | 
						|
                                            DBRootConfigList::iterator pt = dbrootConfigList.begin();
 | 
						|
 | 
						|
                                            if (( DBRootStorageType == "DataRedundancy") && (*pt == 1))
 | 
						|
                                            {
 | 
						|
                                                log.writeLog(__LINE__, "stopModule, " + config.moduleName(), LOG_TYPE_DEBUG);
 | 
						|
                                                processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
 | 
						|
                                                processManager.switchParentOAMModule(moduleName);
 | 
						|
                                                processManager.stopProcess(config.moduleName(), "ProcessManager", oam::FORCEFUL, true);
 | 
						|
                                                break;
 | 
						|
                                            }
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
                                    else
 | 
						|
                                    {
 | 
						|
                                        //stop failed, retry
 | 
						|
                                        log.writeLog(__LINE__, "stopModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                        sleep(5);
 | 
						|
                                        continue;
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    // next, startmodule
 | 
						|
                                    status = processManager.startModule(moduleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
 | 
						|
 | 
						|
                                    if ( status == oam::API_SUCCESS )
 | 
						|
                                        break;
 | 
						|
 | 
						|
                                    log.writeLog(__LINE__, "startModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    //sleep and retry all over again
 | 
						|
                                    sleep (5);
 | 
						|
                                } // end of the retry loop
 | 
						|
 | 
						|
                                if ( retry < ModuleProcMonWaitCount )
 | 
						|
                                {
 | 
						|
                                    // module successfully started
 | 
						|
 | 
						|
                                    //call dbrm control, need to resume before start so the getdbrmfiles halt doesn't hang
 | 
						|
                                    oam.dbrmctl("reload");
 | 
						|
                                    log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    // resume the dbrm
 | 
						|
                                    oam.dbrmctl("resume");
 | 
						|
                                    log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    //set recycle process
 | 
						|
                                    processManager.recycleProcess(moduleName);
 | 
						|
 | 
						|
                                    //distribute config file
 | 
						|
                                    processManager.distributeConfigFile("system");
 | 
						|
                                    sleep(1);
 | 
						|
 | 
						|
                                    string moduleType = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
 | 
						|
 | 
						|
                                    if ( MySQLRep == "y" )
 | 
						|
                                    {
 | 
						|
                                        if ( moduleType == "um" ||
 | 
						|
                                                ( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM ) ||
 | 
						|
                                                ( moduleType == "pm" && PMwithUM == "y") )
 | 
						|
                                        {
 | 
						|
 | 
						|
                                            //setup MySQL Replication for started modules
 | 
						|
 | 
						|
                                            log.writeLog(__LINE__, "Setup MySQL Replication for module recovering from outage on " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                            DeviceNetworkList devicenetworklist;
 | 
						|
                                            DeviceNetworkConfig devicenetworkconfig;
 | 
						|
                                            devicenetworkconfig.DeviceName = moduleName;
 | 
						|
                                            devicenetworklist.push_back(devicenetworkconfig);
 | 
						|
                                            processManager.setMySQLReplication(devicenetworklist, oam::UnassignedName, false, true);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    //enable query stats
 | 
						|
                                    dbrm.setSystemQueryReady(true);
 | 
						|
 | 
						|
                                    //set query system state ready
 | 
						|
                                    processManager.setQuerySystemState(true);
 | 
						|
 | 
						|
                                    processManager.setSystemState(oam::ACTIVE);
 | 
						|
                                    //clear count
 | 
						|
                                    moduleInfoList[moduleName] = 0;
 | 
						|
                                }
 | 
						|
                                else
 | 
						|
                                {
 | 
						|
                                    // module failed to restart, place back in disabled state
 | 
						|
                                    //Log failure, issue alarm, set moduleOpState
 | 
						|
                                    Configuration config;
 | 
						|
 | 
						|
                                    //Issue an alarm
 | 
						|
                                    aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | 
						|
 | 
						|
                                    // if pm, move dbroots back to pm
 | 
						|
                                    if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | 
						|
                                            ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | 
						|
                                            ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | 
						|
                                    {
 | 
						|
                                        //move dbroots to other modules
 | 
						|
                                        try
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
 | 
						|
                                            oam.autoMovePmDbroot(moduleName);
 | 
						|
                                            log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
 | 
						|
                                            //distribute config file
 | 
						|
                                            processManager.distributeConfigFile("system");
 | 
						|
                                        }
 | 
						|
                                        catch (exception& ex)
 | 
						|
                                        {
 | 
						|
                                            string error = ex.what();
 | 
						|
                                            log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
 | 
						|
                                        }
 | 
						|
                                        catch (...)
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    //set module to disable state
 | 
						|
                                    processManager.disableModule(moduleName, true);
 | 
						|
 | 
						|
                                    //call dbrm control
 | 
						|
                                    oam.dbrmctl("reload");
 | 
						|
                                    log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    // resume the dbrm
 | 
						|
                                    oam.dbrmctl("resume");
 | 
						|
                                    log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
 | 
						|
 | 
						|
                                    if ( amazon )
 | 
						|
                                        processManager.setSystemState(oam::FAILED);
 | 
						|
                                    else
 | 
						|
                                        processManager.setSystemState(oam::ACTIVE);
 | 
						|
 | 
						|
                                    //enable query stats
 | 
						|
                                    dbrm.setSystemQueryReady(true);
 | 
						|
 | 
						|
                                    //set query system state ready
 | 
						|
                                    processManager.setQuerySystemState(true);
 | 
						|
 | 
						|
                                    //clear count
 | 
						|
                                    moduleInfoList[moduleName] = 0;
 | 
						|
                                }
 | 
						|
                            }
 | 
						|
 | 
						|
                            break;
 | 
						|
 | 
						|
                        case oam::DOWN:
 | 
						|
 | 
						|
                            // if initial state, skip
 | 
						|
                            if (opState == oam::INITIAL)
 | 
						|
                                break;
 | 
						|
 | 
						|
                            // if disabled and not amazon, skip
 | 
						|
                            if ( (opState == oam::AUTO_DISABLED) && !amazon)
 | 
						|
                                break;
 | 
						|
 | 
						|
                            // if disabled, amazon,and NOT terminated skip
 | 
						|
                            if ( (opState == oam::AUTO_DISABLED) && amazon)
 | 
						|
                            {
 | 
						|
                                // return values = 'ip address' for running or rebooting, stopped or terminated
 | 
						|
                                string currentIPAddr = oam.getEC2InstanceIpAddress(hostName);
 | 
						|
 | 
						|
                                if ( currentIPAddr != "terminated")
 | 
						|
                                    break;
 | 
						|
                            }
 | 
						|
 | 
						|
                            log.writeLog(__LINE__, "module failed to respond to pings: " + moduleName, LOG_TYPE_WARNING);
 | 
						|
 | 
						|
                            //bump module ping failure counter
 | 
						|
                            moduleInfoList[moduleName]++;
 | 
						|
 | 
						|
                            if ( moduleName == config.OAMStandbyName() )
 | 
						|
                                HOTSTANDBYACTIVE = false;
 | 
						|
 | 
						|
                            if (moduleInfoList[moduleName] == ModuleHeartbeatCount)
 | 
						|
                            {
 | 
						|
                                // if LAN OUTAGE ACTIVE,skip module checks
 | 
						|
                                if (LANOUTAGEACTIVE)
 | 
						|
                                    break;
 | 
						|
 | 
						|
                                //check if down module is PrimaryUMModuleName
 | 
						|
                                bool downPrimaryUM = false;
 | 
						|
                                string PrimaryUMModuleName;
 | 
						|
 | 
						|
                                try
 | 
						|
                                {
 | 
						|
                                    oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
 | 
						|
                                }
 | 
						|
                                catch (...) {}
 | 
						|
 | 
						|
                                if ( PrimaryUMModuleName == moduleName )
 | 
						|
                                    downPrimaryUM = true;
 | 
						|
 | 
						|
                                // only handle the outage if the module is not already auto-disabled
 | 
						|
                                if (opState != oam::AUTO_DISABLED )
 | 
						|
                                {
 | 
						|
                                    //Log failure, issue alarm, set moduleOpState
 | 
						|
                                    Configuration config;
 | 
						|
                                    log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL);
 | 
						|
 | 
						|
                                    //set query system state not ready
 | 
						|
                                    BRM::DBRM dbrm;
 | 
						|
                                    dbrm.setSystemQueryReady(false);
 | 
						|
 | 
						|
                                    processManager.setQuerySystemState(false);
 | 
						|
 | 
						|
                                    processManager.setSystemState(oam::BUSY_INIT);
 | 
						|
 | 
						|
                                    processManager.reinitProcessType("cpimport");
 | 
						|
 | 
						|
                                    // halt the dbrm
 | 
						|
                                    oam.dbrmctl("halt");
 | 
						|
                                    log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    processManager.setSystemState(oam::BUSY_INIT);
 | 
						|
 | 
						|
                                    //string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1";
 | 
						|
                                    //system(cmd.c_str());
 | 
						|
 | 
						|
                                    //send notification
 | 
						|
                                    oam.sendDeviceNotification(moduleName, MODULE_DOWN);
 | 
						|
 | 
						|
                                    //Issue an alarm
 | 
						|
                                    aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | 
						|
 | 
						|
                                    //mark all processes running on module auto-offline
 | 
						|
                                    processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
 | 
						|
 | 
						|
                                    //set module to disable state
 | 
						|
                                    processManager.disableModule(moduleName, false);
 | 
						|
 | 
						|
                                    //call dbrm control
 | 
						|
                                    oam.dbrmctl("reload");
 | 
						|
                                    log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    // if pm, move dbroots to other pms
 | 
						|
                                    if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | 
						|
                                            ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | 
						|
                                            ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | 
						|
                                    {
 | 
						|
                                        try
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
 | 
						|
                                            oam.autoMovePmDbroot(moduleName);
 | 
						|
                                            log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
 | 
						|
                                            //distribute config file
 | 
						|
                                            processManager.distributeConfigFile("system");
 | 
						|
                                        }
 | 
						|
                                        catch (exception& ex)
 | 
						|
                                        {
 | 
						|
                                            string error = ex.what();
 | 
						|
                                            log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
 | 
						|
                                        }
 | 
						|
                                        catch (...)
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
 | 
						|
                                // if Cloud Instance
 | 
						|
                                // state = terminate, remove/addmodule to launch new instance
 | 
						|
                                if ( amazon )
 | 
						|
                                {
 | 
						|
                                    if ( moduleName.find("um") == 0 )
 | 
						|
                                    {
 | 
						|
                                        // resume the dbrm
 | 
						|
                                        oam.dbrmctl("resume");
 | 
						|
                                        log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                        //set recycle process
 | 
						|
                                        processManager.recycleProcess(moduleName);
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    // return values = 'ip address' for running or rebooting, stopped or terminated
 | 
						|
                                    string currentIPAddr = oam.getEC2InstanceIpAddress(hostName);
 | 
						|
 | 
						|
                                    if ( currentIPAddr == "terminated")
 | 
						|
                                    {
 | 
						|
                                        //check if down module was Standby OAM, if so find another one
 | 
						|
                                        if ( moduleName == config.OAMStandbyName() )
 | 
						|
                                        {
 | 
						|
 | 
						|
                                            //set down module ProcessManager to AOS
 | 
						|
                                            processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
 | 
						|
 | 
						|
                                            //get another standby OAM module
 | 
						|
                                            string newStandbyModule = processManager.getStandbyModule();
 | 
						|
 | 
						|
                                            //send message to start new Standby Process-Manager, if needed
 | 
						|
                                            if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
 | 
						|
                                            {
 | 
						|
                                                processManager.setStandbyModule(newStandbyModule);
 | 
						|
                                            }
 | 
						|
                                            else
 | 
						|
                                            {
 | 
						|
                                                Config* sysConfig = Config::makeConfig();
 | 
						|
 | 
						|
                                                // clear Standby OAM Module
 | 
						|
                                                sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
 | 
						|
                                                sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
 | 
						|
 | 
						|
                                                //update Calpont Config table
 | 
						|
                                                try
 | 
						|
                                                {
 | 
						|
                                                    sysConfig->write();
 | 
						|
                                                }
 | 
						|
                                                catch (...)
 | 
						|
                                                {
 | 
						|
                                                    log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
 | 
						|
                                                }
 | 
						|
                                            }
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        // remove/addmodule
 | 
						|
                                        log.writeLog(__LINE__, "Instance terminated, re-launching: " + hostName, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                        // if pm, get assigned dbroots and detach EBS
 | 
						|
                                        DBRootConfigList dbrootConfigList;
 | 
						|
                                        int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
 | 
						|
 | 
						|
                                        if ( moduleName.find("pm") == 0 )
 | 
						|
                                        {
 | 
						|
                                            //get the dbroot ids assigned to this PM
 | 
						|
                                            try
 | 
						|
                                            {
 | 
						|
                                                oam.getPmDbrootConfig(moduleID, dbrootConfigList);
 | 
						|
                                            }
 | 
						|
                                            catch (exception& e)
 | 
						|
                                            {
 | 
						|
                                                log.writeLog(__LINE__, "ERROR: getPmDbrootConfig error: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                            }
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        DeviceNetworkList devicenetworklist;
 | 
						|
                                        DeviceNetworkConfig devicenetworkconfig;
 | 
						|
                                        HostConfig hostconfig;
 | 
						|
 | 
						|
                                        devicenetworkconfig.DeviceName = moduleName;
 | 
						|
 | 
						|
                                        if (cloud == "amazon-vpc")
 | 
						|
                                            hostconfig.IPAddr = ipAddr;
 | 
						|
                                        else
 | 
						|
                                            hostconfig.IPAddr = oam::UnassignedName;
 | 
						|
 | 
						|
                                        hostconfig.HostName = oam::UnassignedName;
 | 
						|
                                        hostconfig.NicID = 1;
 | 
						|
                                        devicenetworkconfig.hostConfigList.push_back(hostconfig);
 | 
						|
 | 
						|
                                        devicenetworklist.push_back(devicenetworkconfig);
 | 
						|
 | 
						|
                                        bool pass = true;
 | 
						|
 | 
						|
                                        for ( int addRetry = 0 ; addRetry < 5 ; addRetry++ )
 | 
						|
                                        {
 | 
						|
                                            //remove module
 | 
						|
                                            int ret = processManager.removeModule(devicenetworklist, false);
 | 
						|
 | 
						|
                                            if ( ret != oam::API_SUCCESS )
 | 
						|
                                            {
 | 
						|
                                                log.writeLog(__LINE__, "Instance failed to remove, retry: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                            }
 | 
						|
                                            else
 | 
						|
                                            {
 | 
						|
                                                pass = true;
 | 
						|
                                                log.writeLog(__LINE__, "Instance removed, module: " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            // add module
 | 
						|
                                            string password = oam::UnassignedName;
 | 
						|
 | 
						|
                                            try
 | 
						|
                                            {
 | 
						|
                                                oam.getSystemConfig("rpw", password);
 | 
						|
                                            }
 | 
						|
                                            catch (...)
 | 
						|
                                            {
 | 
						|
                                                password = oam::UnassignedName;
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            ret = processManager.addModule(devicenetworklist, password, false);
 | 
						|
 | 
						|
                                            if ( ret != oam::API_SUCCESS )
 | 
						|
                                            {
 | 
						|
                                                log.writeLog(__LINE__, "Instance failed to add, retry: " + moduleName, LOG_TYPE_CRITICAL);
 | 
						|
                                                pass = false;
 | 
						|
                                            }
 | 
						|
                                            else
 | 
						|
                                            {
 | 
						|
                                                pass = true;
 | 
						|
                                                log.writeLog(__LINE__, "New Instance Launched for " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                                // if pm, config and attach EBS
 | 
						|
                                                if ( moduleName.find("pm") == 0 && !dbrootConfigList.empty() )
 | 
						|
                                                {
 | 
						|
                                                    try
 | 
						|
                                                    {
 | 
						|
                                                        oam.setPmDbrootConfig(moduleID, dbrootConfigList);
 | 
						|
 | 
						|
                                                        std::vector<std::string> dbrootList;
 | 
						|
                                                        DBRootConfigList::iterator pt1 = dbrootConfigList.begin();
 | 
						|
 | 
						|
                                                        for ( ; pt1 != dbrootConfigList.end() ; pt1++)
 | 
						|
                                                        {
 | 
						|
                                                            dbrootList.push_back(oam.itoa(*pt1));
 | 
						|
                                                        }
 | 
						|
 | 
						|
                                                        //attach EBS
 | 
						|
                                                        try
 | 
						|
                                                        {
 | 
						|
                                                            oam.amazonReattach(moduleName, dbrootList, true);
 | 
						|
                                                            pass = true;
 | 
						|
                                                            break;
 | 
						|
                                                        }
 | 
						|
                                                        catch (exception& e)
 | 
						|
                                                        {
 | 
						|
                                                            log.writeLog(__LINE__, "ERROR: amazonReattach error on " + moduleName, LOG_TYPE_ERROR);
 | 
						|
                                                            pass = false;
 | 
						|
                                                        }
 | 
						|
                                                    }
 | 
						|
                                                    catch (exception& e)
 | 
						|
                                                    {
 | 
						|
                                                        log.writeLog(__LINE__, "ERROR: setPmDbrootConfig error on " + moduleName, LOG_TYPE_ERROR);
 | 
						|
                                                        pass = false;
 | 
						|
                                                    }
 | 
						|
                                                }
 | 
						|
                                                else
 | 
						|
                                                {
 | 
						|
                                                    pass = true;
 | 
						|
                                                    break;
 | 
						|
                                                }
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            if (pass)
 | 
						|
                                                break;
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        if (pass)
 | 
						|
                                            //Set the module state so it will be brought back up
 | 
						|
                                            processManager.setModuleState(moduleName, oam::AUTO_DISABLED);
 | 
						|
                                        else
 | 
						|
                                        {
 | 
						|
                                            //new instance failed to get added
 | 
						|
                                            //remove and try auto moving dbroots to other pms
 | 
						|
                                            processManager.removeModule(devicenetworklist, false);
 | 
						|
 | 
						|
                                            // if pm, move dbroots to other pms
 | 
						|
                                            if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | 
						|
                                                    ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | 
						|
                                                    ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | 
						|
                                            {
 | 
						|
                                                try
 | 
						|
                                                {
 | 
						|
                                                    log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
 | 
						|
                                                    oam.autoMovePmDbroot(moduleName);
 | 
						|
                                                    log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
 | 
						|
                                                    //distribute config file
 | 
						|
                                                    processManager.distributeConfigFile("system");
 | 
						|
                                                }
 | 
						|
                                                catch (exception& ex)
 | 
						|
                                                {
 | 
						|
                                                    string error = ex.what();
 | 
						|
                                                    log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
 | 
						|
                                                }
 | 
						|
                                                catch (...)
 | 
						|
                                                {
 | 
						|
                                                    log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                                                }
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            //set recycle process
 | 
						|
                                            processManager.recycleProcess(moduleName);
 | 
						|
 | 
						|
                                            //enable query stats
 | 
						|
                                            dbrm.setSystemQueryReady(true);
 | 
						|
 | 
						|
                                            //set query system state ready
 | 
						|
                                            processManager.setQuerySystemState(true);
 | 
						|
 | 
						|
                                            sleep(2);
 | 
						|
                                            processManager.setSystemState(oam::ACTIVE);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    if ( ( moduleName.find("pm") == 0 ) &&
 | 
						|
                                            ( opState != oam::AUTO_DISABLED ) )
 | 
						|
 | 
						|
                                    {
 | 
						|
                                        // resume the dbrm
 | 
						|
                                        oam.dbrmctl("resume");
 | 
						|
                                        log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                        //enable query stats
 | 
						|
                                        dbrm.setSystemQueryReady(true);
 | 
						|
 | 
						|
                                        //set query system state ready
 | 
						|
                                        processManager.setQuerySystemState(true);
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
                                else
 | 
						|
                                {
 | 
						|
                                    // non-amazon
 | 
						|
                                    // resume the dbrm
 | 
						|
                                    oam.dbrmctl("resume");
 | 
						|
                                    log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                    //set recycle process
 | 
						|
                                    processManager.recycleProcess(moduleName);
 | 
						|
 | 
						|
                                    //enable query stats
 | 
						|
                                    dbrm.setSystemQueryReady(true);
 | 
						|
 | 
						|
                                    //set query system state ready
 | 
						|
                                    processManager.setQuerySystemState(true);
 | 
						|
                                }
 | 
						|
 | 
						|
                                //check if down module was Standby OAM, if so find another one
 | 
						|
                                if ( moduleName == config.OAMStandbyName() )
 | 
						|
                                {
 | 
						|
 | 
						|
                                    //set down module ProcessManager to AOS
 | 
						|
                                    processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
 | 
						|
 | 
						|
                                    //get another standby OAM module
 | 
						|
                                    string newStandbyModule = processManager.getStandbyModule();
 | 
						|
 | 
						|
                                    //send message to start new Standby Process-Manager, if needed
 | 
						|
                                    if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
 | 
						|
                                    {
 | 
						|
                                        processManager.setStandbyModule(newStandbyModule);
 | 
						|
                                    }
 | 
						|
                                    else
 | 
						|
                                    {
 | 
						|
                                        Config* sysConfig = Config::makeConfig();
 | 
						|
 | 
						|
                                        // clear Standby OAM Module
 | 
						|
                                        sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
 | 
						|
                                        sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
 | 
						|
 | 
						|
                                        //update Calpont Config table
 | 
						|
                                        try
 | 
						|
                                        {
 | 
						|
                                            sysConfig->write();
 | 
						|
                                        }
 | 
						|
                                        catch (...)
 | 
						|
                                        {
 | 
						|
                                            log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
 | 
						|
                                // set up MySQL replication slaves again if the master changed
 | 
						|
                                if ( downPrimaryUM &&
 | 
						|
                                        ( MySQLRep == "y" ) )
 | 
						|
                                {
 | 
						|
                                    //setup MySQL Replication for started modules
 | 
						|
                                    log.writeLog(__LINE__, "Setup MySQL Replication for module outage on " + moduleName, LOG_TYPE_DEBUG);
 | 
						|
                                    DeviceNetworkList devicenetworklist;
 | 
						|
                                    processManager.setMySQLReplication(devicenetworklist);
 | 
						|
                                }
 | 
						|
 | 
						|
                                // if disabled and amazon, break out
 | 
						|
                                if ( (opState == oam::AUTO_DISABLED ) && amazon )
 | 
						|
                                    break;
 | 
						|
 | 
						|
                                //start SIMPLEX runtype processes on a SIMPLEX runtype module
 | 
						|
                                string moduletype = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
 | 
						|
 | 
						|
                                try
 | 
						|
                                {
 | 
						|
                                    oam.getSystemConfig(moduletype, moduletypeconfig);
 | 
						|
                                }
 | 
						|
                                catch (exception& ex)
 | 
						|
                                {
 | 
						|
                                    string error = ex.what();
 | 
						|
                                    log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
                                }
 | 
						|
                                catch (...)
 | 
						|
                                {
 | 
						|
                                    log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                                }
 | 
						|
 | 
						|
                                if ( moduletypeconfig.RunType == SIMPLEX )
 | 
						|
                                {
 | 
						|
                                    DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
 | 
						|
 | 
						|
                                    for ( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
 | 
						|
                                    {
 | 
						|
                                        string launchModuleName = (*pt).DeviceName;
 | 
						|
                                        string launchModuletype = launchModuleName.substr(0, MAX_MODULE_TYPE_SIZE);
 | 
						|
 | 
						|
                                        if ( moduletype != launchModuletype )
 | 
						|
                                            continue;
 | 
						|
 | 
						|
                                        //skip if active pm module (local module)
 | 
						|
                                        if ( launchModuleName == config.moduleName() )
 | 
						|
                                            continue;
 | 
						|
 | 
						|
                                        if ( moduleName != launchModuleName )
 | 
						|
                                        {
 | 
						|
                                            //check if module is active before starting any SIMPLEX STANDBY apps
 | 
						|
                                            try
 | 
						|
                                            {
 | 
						|
                                                int launchopState = oam::ACTIVE;
 | 
						|
                                                bool degraded;
 | 
						|
                                                oam.getModuleStatus(launchModuleName, launchopState, degraded);
 | 
						|
 | 
						|
                                                if (launchopState != oam::ACTIVE && launchopState != oam::STANDBY )
 | 
						|
                                                {
 | 
						|
                                                    continue;
 | 
						|
                                                }
 | 
						|
                                            }
 | 
						|
                                            catch (exception& ex)
 | 
						|
                                            {
 | 
						|
//												string error = ex.what();
 | 
						|
//												log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
 | 
						|
                                            }
 | 
						|
                                            catch (...)
 | 
						|
                                            {
 | 
						|
//												log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            int status;
 | 
						|
                                            log.writeLog(__LINE__, "Starting up STANDBY process on module " + launchModuleName, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                                            for ( int j = 0 ; j < 20 ; j ++ )
 | 
						|
                                            {
 | 
						|
                                                status = processManager.startModule(launchModuleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
 | 
						|
 | 
						|
                                                if ( status == API_SUCCESS)
 | 
						|
                                                    break;
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            log.writeLog(__LINE__, "pingDeviceThread: ACK received from '" + launchModuleName + "' Process-Monitor, return status = " + oam.itoa(status), LOG_TYPE_DEBUG);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
                            }
 | 
						|
 | 
						|
                            break;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            } //end of for loop
 | 
						|
        }
 | 
						|
 | 
						|
        // check and take action if LAN outage is flagged
 | 
						|
        if (LANOUTAGESUPPORT && !LANOUTAGEACTIVE && LOCALNICDOWN)
 | 
						|
        {
 | 
						|
            log.writeLog(__LINE__, "LAN Failure detected", LOG_TYPE_CRITICAL);
 | 
						|
 | 
						|
            oam.sendDeviceNotification(config.moduleName(), START_PM_MASTER_DOWN);
 | 
						|
 | 
						|
            LANOUTAGEACTIVE = true;
 | 
						|
 | 
						|
            log.writeLog(__LINE__, "Kill any cpimport running", LOG_TYPE_INFO);
 | 
						|
            system("pkill -9 cpimport");
 | 
						|
 | 
						|
            //request stop of local module
 | 
						|
            int status = processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
 | 
						|
 | 
						|
            if ( status != oam::API_SUCCESS )
 | 
						|
                log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
 | 
						|
 | 
						|
            //stop snmptrap daemon process
 | 
						|
            processManager.stopProcess(config.moduleName(), "SNMPTrapDaemon", oam::FORCEFUL, false);
 | 
						|
        }
 | 
						|
        else
 | 
						|
        {
 | 
						|
            if ( LANOUTAGEACTIVE && HOTSTANDBYACTIVE && !LOCALNICDOWN)
 | 
						|
            {
 | 
						|
//				pthread_mutex_unlock(&THREAD_LOCK);
 | 
						|
                LANOUTAGEACTIVE = false;
 | 
						|
 | 
						|
                log.writeLog(__LINE__, "LAN Failure recovery");
 | 
						|
 | 
						|
                //check if this module is still active according to the last known hot standby module
 | 
						|
                ByteStream msg;
 | 
						|
                ByteStream::byte requestID = GETPARENTOAMMODULE;
 | 
						|
                msg << requestID;
 | 
						|
 | 
						|
                string parentOAMModule = processManager.sendMsgProcMon1( config.OAMStandbyName(), msg, requestID );
 | 
						|
 | 
						|
                if ( parentOAMModule == config.moduleName() ||
 | 
						|
                        parentOAMModule == "FAILED" )
 | 
						|
                {
 | 
						|
 | 
						|
                    //restart/reinit these processes in case they marked any PrimProcs offline
 | 
						|
                    processManager.restartProcessType("ExeMgr");
 | 
						|
                    processManager.reinitProcessType("DDLProc");
 | 
						|
                    processManager.reinitProcessType("DMLProc");
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    //send message to local Process Monitor to run coldStandby
 | 
						|
                    ByteStream msg;
 | 
						|
                    ByteStream::byte requestID = OAMPARENTCOLD;
 | 
						|
 | 
						|
                    msg << requestID;
 | 
						|
 | 
						|
                    int returnStatus = processManager.sendMsgProcMon( config.moduleName(), msg, requestID );
 | 
						|
                    log.writeLog(__LINE__, "sent OAM Parent Cold message to local Process-Monitor, status: " + oam.itoa(returnStatus), LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                    //request stop of local module
 | 
						|
                    int status = processManager.stopModule(config.moduleName(), oam::INSTALL, false);
 | 
						|
 | 
						|
                    if ( status != oam::API_SUCCESS )
 | 
						|
                        log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        //
 | 
						|
        // ping ext devices
 | 
						|
        //
 | 
						|
 | 
						|
        // read each time to catch updates
 | 
						|
        systemextdeviceconfig.extdeviceconfig.clear();
 | 
						|
 | 
						|
        try
 | 
						|
        {
 | 
						|
            oam.getSystemConfig(systemextdeviceconfig);
 | 
						|
        }
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
            string error = ex.what();
 | 
						|
//			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
//			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
        }
 | 
						|
 | 
						|
        for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count ; i++ )
 | 
						|
        {
 | 
						|
            string extDeviceName = systemextdeviceconfig.extdeviceconfig[i].Name;
 | 
						|
            string ipAddr = systemextdeviceconfig.extdeviceconfig[i].IPAddr;
 | 
						|
 | 
						|
            int opState = oam::ACTIVE;
 | 
						|
 | 
						|
            try
 | 
						|
            {
 | 
						|
                oam.getExtDeviceStatus(extDeviceName, opState);
 | 
						|
            }
 | 
						|
            catch (exception& ex)
 | 
						|
            {
 | 
						|
//				string error = ex.what();
 | 
						|
//				log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
            }
 | 
						|
            catch (...)
 | 
						|
            {
 | 
						|
//				log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
            }
 | 
						|
 | 
						|
            cmd = cmdLine + ipAddr + cmdOption;
 | 
						|
            rtnCode = system(cmd.c_str());
 | 
						|
 | 
						|
            switch (WEXITSTATUS(rtnCode))
 | 
						|
            {
 | 
						|
                case 0:
 | 
						|
 | 
						|
                    //Switch Ack ping, Check whether alarm have been issued
 | 
						|
                    if (extDeviceInfoList[extDeviceName] >= ModuleHeartbeatCount)
 | 
						|
                    {
 | 
						|
                        aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, CLEAR);
 | 
						|
 | 
						|
                    }
 | 
						|
 | 
						|
                    extDeviceInfoList[extDeviceName] = 0;
 | 
						|
 | 
						|
                    if (opState != oam::ACTIVE)
 | 
						|
                    {
 | 
						|
                        //Set the switch state to active
 | 
						|
                        processManager.setExtdeviceState(extDeviceName, oam::ACTIVE);
 | 
						|
                    }
 | 
						|
 | 
						|
                    break;
 | 
						|
 | 
						|
                default:
 | 
						|
                    //extDevice failed to respond to ping
 | 
						|
                    log.writeLog(__LINE__, "extDevice failed to respond to ping: " + extDeviceName, LOG_TYPE_WARNING);
 | 
						|
                    extDeviceInfoList[extDeviceName]++;
 | 
						|
 | 
						|
                    if (extDeviceInfoList[extDeviceName] == ModuleHeartbeatCount)
 | 
						|
                    {
 | 
						|
                        //Log failure, issue alarm, set extDeviceOpState
 | 
						|
                        log.writeLog(__LINE__, "extDevice is down: " + extDeviceName, LOG_TYPE_CRITICAL);
 | 
						|
 | 
						|
                        processManager.setExtdeviceState(extDeviceName, oam::AUTO_OFFLINE);
 | 
						|
 | 
						|
                        //Issue an alarm
 | 
						|
                        aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, SET);
 | 
						|
                    }
 | 
						|
 | 
						|
                    break;
 | 
						|
            }
 | 
						|
        } //end of for loop
 | 
						|
 | 
						|
        // double check to make sure the system status is ACTIVE if all module status's are ACTIVE
 | 
						|
        try
 | 
						|
        {
 | 
						|
            if (dbrm.isDBRMReady())
 | 
						|
            {
 | 
						|
                int systemReady = dbrm.getSystemReady();    // -1 == fail, 0 == not ready, 1 == ready
 | 
						|
 | 
						|
                if (systemReady > 0)
 | 
						|
                {
 | 
						|
                    bool updateActive = true;
 | 
						|
 | 
						|
                    for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | 
						|
                    {
 | 
						|
                        int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | 
						|
 | 
						|
                        if ( moduleCount == 0)
 | 
						|
                            continue;
 | 
						|
 | 
						|
                        DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | 
						|
 | 
						|
                        for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | 
						|
                        {
 | 
						|
                            string moduleName = (*pt).DeviceName;
 | 
						|
 | 
						|
                            int opState = oam::ACTIVE;
 | 
						|
 | 
						|
                            try
 | 
						|
                            {
 | 
						|
                                bool degraded;
 | 
						|
                                oam.getModuleStatus(moduleName, opState, degraded);
 | 
						|
 | 
						|
                                if (opState == oam::ACTIVE ||
 | 
						|
                                        opState == oam::DEGRADED ||
 | 
						|
                                        opState == oam::MAN_DISABLED ||
 | 
						|
                                        opState == oam::AUTO_DISABLED )
 | 
						|
                                    continue;
 | 
						|
 | 
						|
                                updateActive = false;
 | 
						|
                            }
 | 
						|
                            catch (exception& ex)
 | 
						|
                            {
 | 
						|
                                //                            string error = ex.what();
 | 
						|
                                //                          log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
 | 
						|
                            }
 | 
						|
                            catch (...)
 | 
						|
                            {
 | 
						|
                                //                            log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    if (updateActive)
 | 
						|
                    {
 | 
						|
//						log.writeLog(__LINE__, "Modules are ACTIVE, check system state ", LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                        string PrimaryUMModuleName;
 | 
						|
 | 
						|
                        try
 | 
						|
                        {
 | 
						|
                            oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
 | 
						|
                        }
 | 
						|
                        catch (...) {}
 | 
						|
 | 
						|
//						log.writeLog(__LINE__, "PrimaryUMModuleName = " + PrimaryUMModuleName, LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                        ProcessStatus DMLprocessstatus;
 | 
						|
 | 
						|
                        try
 | 
						|
                        {
 | 
						|
                            oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
 | 
						|
                        }
 | 
						|
                        catch (exception& ex)
 | 
						|
                        {
 | 
						|
                            //						string error = ex.what();
 | 
						|
                            //						log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
                        }
 | 
						|
                        catch (...)
 | 
						|
                        {
 | 
						|
                            //						log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                        }
 | 
						|
 | 
						|
//						log.writeLog(__LINE__, "DMLPROC STATUS = " + oamState[DMLprocessstatus.ProcessOpState], LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
                        if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
 | 
						|
                        {
 | 
						|
 | 
						|
                            //set the system status if a change has occurred
 | 
						|
                            SystemStatus systemstatus;
 | 
						|
 | 
						|
                            try
 | 
						|
                            {
 | 
						|
                                oam.getSystemStatus(systemstatus);
 | 
						|
                            }
 | 
						|
                            catch (exception& ex)
 | 
						|
                            {
 | 
						|
                                //							string error = ex.what();
 | 
						|
                                //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
                            }
 | 
						|
                            catch (...)
 | 
						|
                            {
 | 
						|
                                //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                            }
 | 
						|
 | 
						|
                            if ( systemstatus.SystemOpState != oam::ACTIVE )
 | 
						|
                            {
 | 
						|
                                processManager.setSystemState(oam::ACTIVE);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
 | 
						|
                        if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
 | 
						|
                        {
 | 
						|
 | 
						|
                            //set the system status if a change has occurred
 | 
						|
                            SystemStatus systemstatus;
 | 
						|
 | 
						|
                            try
 | 
						|
                            {
 | 
						|
                                oam.getSystemStatus(systemstatus);
 | 
						|
                            }
 | 
						|
                            catch (exception& ex)
 | 
						|
                            {
 | 
						|
                                //							string error = ex.what();
 | 
						|
                                //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
                            }
 | 
						|
                            catch (...)
 | 
						|
                            {
 | 
						|
                                //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
                            }
 | 
						|
 | 
						|
                            if ( systemstatus.SystemOpState != oam::BUSY_INIT )
 | 
						|
                            {
 | 
						|
                                processManager.setSystemState(oam::BUSY_INIT);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
        catch (...)
 | 
						|
        {
 | 
						|
        }
 | 
						|
 | 
						|
        //go sleep for a bit
 | 
						|
        int sleepTime = ModuleHeartbeatPeriod / 10;
 | 
						|
 | 
						|
        if (!enableModuleMonitor && systemextdeviceconfig.Count == 0)
 | 
						|
            sleep(60);
 | 
						|
        else
 | 
						|
            sleep(sleepTime);
 | 
						|
    }
 | 
						|
 | 
						|
    return;
 | 
						|
}
 | 
						|
 | 
						|
/******************************************************************************************
 | 
						|
* @brief      hdfsActiveAlarmsPushingThread
 | 
						|
*
 | 
						|
* purpose:    Push an image of ActiveAlarms to HDFS for non-OAMParentModule to view.
 | 
						|
*
 | 
						|
******************************************************************************************/
 | 
						|
static void hdfsActiveAlarmsPushingThread()
 | 
						|
{
 | 
						|
    boost::filesystem::path filePath(ACTIVE_ALARM_FILE);
 | 
						|
    boost::filesystem::path dirPath = filePath.parent_path();
 | 
						|
    string dirName = boost::filesystem::canonical(dirPath).string();
 | 
						|
 | 
						|
    if (boost::filesystem::exists("/etc/pdsh/machines"))
 | 
						|
    {
 | 
						|
        string cpCmd =  "pdcp -a -x " + localHostName + " " + ACTIVE_ALARM_FILE + " " + dirName +
 | 
						|
                        " > /dev/null 2>&1";
 | 
						|
        string rmCmd =  "pdsh -a -x " + localHostName + " rm -f " + ACTIVE_ALARM_FILE +
 | 
						|
                        " > /dev/null 2>&1";
 | 
						|
 | 
						|
        while (1)
 | 
						|
        {
 | 
						|
            if (boost::filesystem::exists(filePath))
 | 
						|
                system(cpCmd.c_str());
 | 
						|
            else
 | 
						|
                system(rmCmd.c_str());
 | 
						|
 | 
						|
            sleep(ACTIVE_ALARMS_PUSHING_INTERVAL);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    return;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/*****************************************************************************************
 | 
						|
* @brief	Processor Heartbeat Msg Thread
 | 
						|
*
 | 
						|
* purpose:	Read Heartbeat Messages from other Processes
 | 
						|
*
 | 
						|
*****************************************************************************************/
 | 
						|
/*
 | 
						|
static void heartbeatMsgThread()
 | 
						|
{
 | 
						|
	ProcessLog log;
 | 
						|
	Configuration config;
 | 
						|
	ProcessManager processManager(config, log);
 | 
						|
 | 
						|
	//
 | 
						|
	//waiting for request
 | 
						|
	//
 | 
						|
	ByteStream receivedMSG;
 | 
						|
	IOSocket fIos;
 | 
						|
 | 
						|
	for (;;)
 | 
						|
	{
 | 
						|
		try
 | 
						|
		{
 | 
						|
			MessageQueueServer procmgr("ProcHeartbeatControl");
 | 
						|
			for (;;)
 | 
						|
			{
 | 
						|
				try
 | 
						|
				{
 | 
						|
					fIos = procmgr.accept();
 | 
						|
					receivedMSG = fIos.read();
 | 
						|
 | 
						|
					if (receivedMSG.length() > 0) {
 | 
						|
						processManager.processMSG(fIos, receivedMSG);
 | 
						|
					}
 | 
						|
				}
 | 
						|
				catch (exception& ex)
 | 
						|
				{
 | 
						|
					string error = ex.what();
 | 
						|
					log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: " + error, LOG_TYPE_ERROR);
 | 
						|
				}
 | 
						|
				catch(...)
 | 
						|
				{
 | 
						|
					log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
				}
 | 
						|
 | 
						|
				fIos.close();
 | 
						|
			}
 | 
						|
		}
 | 
						|
        catch (exception& ex)
 | 
						|
        {
 | 
						|
			string error = ex.what();
 | 
						|
			log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + error, LOG_TYPE_ERROR);
 | 
						|
			// takes 2 - 4 minutes to free sockets, sleep and retry
 | 
						|
			sleep(60);
 | 
						|
        }
 | 
						|
        catch(...)
 | 
						|
        {
 | 
						|
			log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcHeartbeatControl: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
			// takes 2 - 4 minutes to free sockets, sleep and retry
 | 
						|
			sleep(60);
 | 
						|
        }
 | 
						|
	}
 | 
						|
 | 
						|
}
 | 
						|
*/
 | 
						|
 | 
						|
/*****************************************************************************************
 | 
						|
* @brief	Processor Heartbeat Thread
 | 
						|
*
 | 
						|
* purpose:	Check Heartbeat Messages from other Processes
 | 
						|
*
 | 
						|
*****************************************************************************************/
 | 
						|
/*
 | 
						|
static void heartbeatProcessThread()
 | 
						|
{
 | 
						|
	ProcessLog log;
 | 
						|
	Configuration config;
 | 
						|
	ProcessManager processManager(config, log);
 | 
						|
	Oam oam;
 | 
						|
	ALARMManager aManager;
 | 
						|
 | 
						|
	int processHeartbeatPeriod=60;	//default value to 60 seconds
 | 
						|
 | 
						|
	log.writeLog(__LINE__, "Thread Launched: Process Heartbeat!!!");
 | 
						|
 | 
						|
	while (true)
 | 
						|
	{
 | 
						|
		//
 | 
						|
		// check and report on register process not sending heartbeats
 | 
						|
		//
 | 
						|
 | 
						|
		// get process heartbeat period
 | 
						|
		try {
 | 
						|
			oam.getSystemConfig("ProcessHeartbeatPeriod", processHeartbeatPeriod);
 | 
						|
			processHeartbeatPeriod = processHeartbeatPeriod * 60;
 | 
						|
		}
 | 
						|
		catch (exception& ex)
 | 
						|
		{
 | 
						|
			string error = ex.what();
 | 
						|
			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | 
						|
		}
 | 
						|
		catch(...)
 | 
						|
		{
 | 
						|
			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
		}
 | 
						|
 | 
						|
		Oam oam;
 | 
						|
		log.writeLog(__LINE__, "Process Heartbeat check started, Heartbeat period is " + oam.itoa(processHeartbeatPeriod), LOG_TYPE_DEBUG);
 | 
						|
 | 
						|
		sleep(processHeartbeatPeriod);
 | 
						|
 | 
						|
		HeartBeatProcList::iterator list = hbproclist.begin();
 | 
						|
		for( ; list != hbproclist.end() ; list++)
 | 
						|
		{
 | 
						|
			string moduleName = (*list).ModuleName;
 | 
						|
			string processName = (*list).ProcessName;
 | 
						|
			int id = (*list).ID;
 | 
						|
 | 
						|
			// get Process state and only check if ACTIVE
 | 
						|
			ProcessStatus procstat;
 | 
						|
			try{
 | 
						|
				oam.getProcessStatus(processName, moduleName, procstat);
 | 
						|
			}
 | 
						|
			catch (exception& ex)
 | 
						|
			{
 | 
						|
				string error = ex.what();
 | 
						|
				log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | 
						|
				procstat.ProcessOpState = oam::MAN_OFFLINE;
 | 
						|
			}
 | 
						|
			catch(...)
 | 
						|
			{
 | 
						|
				log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | 
						|
				procstat.ProcessOpState = oam::MAN_OFFLINE;
 | 
						|
			}
 | 
						|
 | 
						|
			if ( procstat.ProcessOpState == oam::ACTIVE ) {
 | 
						|
				// skip testing if Heartbeat is disable
 | 
						|
				if( processHeartbeatPeriod != -1 ) {
 | 
						|
//log.writeLog(__LINE__, "Heartbeat: Process being monitored: " + moduleName + " / " + processName + " / " + oam.itoa(id), LOG_TYPE_DEBUG);
 | 
						|
					if ( !(*list).receiveFlag ) {
 | 
						|
						// got a missing heartbeat, request a restart on the process
 | 
						|
						log.writeLog(__LINE__, "heartbeatProcessThread: Failure from process " + moduleName + " / " + processName+ " / " + oam.itoa(id), LOG_TYPE_WARNING);
 | 
						|
 | 
						|
						oam.restartProcess(moduleName, processName, FORCEFUL, ACK_NO);
 | 
						|
						(*list).receiveFlag = true;
 | 
						|
						// reset all other entries for this process
 | 
						|
						HeartBeatProcList::iterator list1 = hbproclist.begin();
 | 
						|
						for( ; list1 != hbproclist.end() ; list1++)
 | 
						|
						{
 | 
						|
							string moduleName1 = (*list1).ModuleName;
 | 
						|
							string processName1 = (*list1).ProcessName;
 | 
						|
							if ( moduleName == moduleName1 && processName == processName1 )
 | 
						|
								(*list1).receiveFlag = true;
 | 
						|
						}
 | 
						|
					}
 | 
						|
					else
 | 
						|
						// reset receive heartbeat indication flag
 | 
						|
						(*list).receiveFlag = false;
 | 
						|
				}
 | 
						|
				else
 | 
						|
					// heartbeat is disabled
 | 
						|
					(*list).receiveFlag=true;
 | 
						|
			}
 | 
						|
			else
 | 
						|
			{	// registered process not active, remove from list
 | 
						|
				hbproclist.erase(list);
 | 
						|
				log.writeLog(__LINE__, "Removing OOS Process from Heartbeat Monitor list: " + moduleName + " / " + processName+ " / " + oam.itoa(id));
 | 
						|
				break;
 | 
						|
			}
 | 
						|
		}
 | 
						|
	} // end of while forever loop
 | 
						|
}
 | 
						|
*/
 | 
						|
// vim:ts=4 sw=4:
 |