You've already forked mariadb-columnstore-engine
							
							
				mirror of
				https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
				synced 2025-10-31 18:30:33 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			2888 lines
		
	
	
		
			115 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			2888 lines
		
	
	
		
			115 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /* Copyright (C) 2014 InfiniDB, Inc.
 | |
|    Copyright (C) 2016 MariaDB Corporation
 | |
| 
 | |
|    This program is free software; you can redistribute it and/or
 | |
|    modify it under the terms of the GNU General Public License
 | |
|    as published by the Free Software Foundation; version 2 of
 | |
|    the License.
 | |
| 
 | |
|    This program is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU General Public License for more details.
 | |
| 
 | |
| 
 | |
|    You should have received a copy of the GNU General Public License
 | |
|    along with this program; if not, write to the Free Software
 | |
|    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 | |
|    MA 02110-1301, USA. */
 | |
| 
 | |
| /*****************************************************************************************
 | |
| * $Id: main.cpp 2203 2013-07-08 16:50:51Z bpaul $
 | |
| *
 | |
| *****************************************************************************************/
 | |
| 
 | |
| 
 | |
| #include <clocale>
 | |
| 
 | |
| #include <boost/filesystem.hpp>
 | |
| 
 | |
| #include "processmanager.h"
 | |
| #include "installdir.h"
 | |
| 
 | |
| #include "utils_utf8.h"
 | |
| 
 | |
| using namespace std;
 | |
| using namespace logging;
 | |
| using namespace messageqcpp;
 | |
| using namespace processmanager;
 | |
| using namespace oam;
 | |
| using namespace alarmmanager;
 | |
| using namespace threadpool;
 | |
| //using namespace procheartbeat;
 | |
| using namespace config;
 | |
| 
 | |
| bool runStandby = false;  // set true in main() when gOAMParentModuleFlag is false; messageThread() waits while it is true
 | |
| bool runCold = false;
 | |
| string systemName = "system";  // overwritten in main() from the "SystemName" config value when it can be read
 | |
| string iface_name;  // set in main(): name of the UP IPv4 interface whose address equals the local module's configured IP
 | |
| string cloud;  // "Cloud" config value, read in main()
 | |
| bool amazon = false;  // set true in main() when cloud is "amazon-ec2" or "amazon-vpc"
 | |
| string PMInstanceType;  // "PMInstanceType" config value; only read in main() for the amazon cloud settings
 | |
| string UMInstanceType;  // "UMInstanceType" config value; only read in main() for the amazon cloud settings
 | |
| string AmazonPMFailover = "y";  // "AmazonPMFailover" config value; only read in main() for the amazon cloud settings
 | |
| string DataRedundancyConfig = "n";  // "DataRedundancyConfig" config value; main() falls back to "n" if the read throws
 | |
| bool rootUser = true;  // cleared in main() when getuid() != 0; messageThread() prefixes its fuser command with sudo when false
 | |
| string USER = "root";  // replaced in main() by the USER environment variable when that is set and non-empty
 | |
| bool HDFS = false;  // set true in main() when the "DBRootStorageType" config value is "hdfs"
 | |
| string localHostName;  // set in main() from the HostName of the first host entry of the local module's config
 | |
| string PMwithUM = "n";  // "PMwithUM" config value; main() falls back to "n" if the read throws
 | |
| string MySQLRep = "n";  // "MySQLRep" config value; main() falls back to "n" if the read throws
 | |
| 
 | |
| // pushing the ACTIVE_ALARMS_FILE to all nodes every 10 seconds.
 | |
| const int ACTIVE_ALARMS_PUSHING_INTERVAL = 10;
 | |
| 
 | |
| typedef   map<string, int>	moduleList;
 | |
| moduleList	moduleInfoList;
 | |
| 
 | |
| extern HeartBeatProcList hbproclist;
 | |
| extern pthread_mutex_t THREAD_LOCK;
 | |
| extern bool startsystemthreadStop;  // polled by startMgrProcessThread(); when true it sets the system MAN_OFFLINE and exits the thread
 | |
| extern string gdownActiveOAMModule;
 | |
| extern int startsystemthreadStatus;
 | |
| extern vector<string> downModuleList;
 | |
| extern bool startFailOver;  // set to true by sigUser1Handler() when SIGUSR1 arrives
 | |
| extern bool gOAMParentModuleFlag;  // main() runs active when this is true and standby otherwise
 | |
| 
 | |
| static void messageThread(Configuration config);  // started through pthread_create() in main()
 | |
| static void sigUser1Handler(int sig);  // installed for SIGUSR1 in main()
 | |
| static void startMgrProcessThread();  // started through pthread_create() in main() on the active path
 | |
| static void hdfsActiveAlarmsPushingThread();  // started through pthread_create() in main() when HDFS is true
 | |
| //static void pingDeviceThread();
 | |
| //static void heartbeatProcessThread();
 | |
| //static void heartbeatMsgThread();
 | |
| 
 | |
| /*****************************************************************************************
 | |
| * @brief	main
 | |
| *
 | |
| * purpose:	request launching of Mgr controlled processes and wait for incoming messages
 | |
| *
 | |
| *****************************************************************************************/
 | |
| int main(int argc, char** argv)
 | |
| {
 | |
| #ifndef _MSC_VER
 | |
|     setuid(0); // set effective ID to root; ignore return status
 | |
| #endif
 | |
|     // get and set locale language
 | |
|     string systemLang = "C";
 | |
| 
 | |
|     setlocale(LC_ALL, systemLang.c_str());
 | |
| 
 | |
|     Oam oam;
 | |
| 
 | |
|     //check if root-user
 | |
|     int user;
 | |
|     user = getuid();
 | |
| 
 | |
|     if (user != 0)
 | |
|         rootUser = false;
 | |
| 
 | |
|     char* p = getenv("USER");
 | |
| 
 | |
|     if (p && *p)
 | |
|         USER = p;
 | |
| 
 | |
|     ProcessLog log;
 | |
|     Configuration config;
 | |
|     ProcessManager processManager(config, log);
 | |
|     ALARMManager aManager;
 | |
| 
 | |
|     log.writeLog(__LINE__, " ");
 | |
|     log.writeLog(__LINE__, "**********Process Manager Started**********");
 | |
| 
 | |
|     //Ignore SIGPIPE signals
 | |
|     signal(SIGPIPE, SIG_IGN);
 | |
| 
 | |
|     //Ignore SIGHUP signals
 | |
|     signal(SIGHUP, SIG_IGN);
 | |
| 
 | |
|     //create SIGUSR1 handler to get configuration updates
 | |
|     signal(SIGUSR1, sigUser1Handler);
 | |
| 
 | |
|     // Get System Name
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig("SystemName", systemName);
 | |
|     }
 | |
|     catch (...)
 | |
|     {}
 | |
| 
 | |
|     //get cloud setting
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig( "Cloud", cloud);
 | |
|     }
 | |
|     catch (...) {}
 | |
| 
 | |
|     //get amazon parameters
 | |
|     if ( cloud == "amazon-ec2" || cloud == "amazon-vpc" )
 | |
|     {
 | |
|         oam.getSystemConfig("PMInstanceType", PMInstanceType);
 | |
|         oam.getSystemConfig("UMInstanceType", UMInstanceType);
 | |
|         oam.getSystemConfig("AmazonPMFailover", AmazonPMFailover);
 | |
| 
 | |
|         amazon = true;
 | |
|     }
 | |
| 
 | |
|     //get gluster config
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig( "DataRedundancyConfig", DataRedundancyConfig);
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|         DataRedundancyConfig = "n";
 | |
|     }
 | |
| 
 | |
|     //hdfs / hadoop config
 | |
|     string DBRootStorageType;
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
 | |
|     }
 | |
|     catch (...) {}
 | |
| 
 | |
|     if ( DBRootStorageType == "hdfs" )
 | |
|         HDFS = true;
 | |
| 
 | |
|     log.writeLog(__LINE__, "Main: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
 | |
| 
 | |
|     //PMwithUM config
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig( "PMwithUM", PMwithUM);
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|         PMwithUM = "n";
 | |
|     }
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig("MySQLRep", MySQLRep);
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|         MySQLRep = "n";
 | |
|     }
 | |
| 
 | |
|     // get system uptime and alarm if this is a restart after module outage
 | |
|     if ( gOAMParentModuleFlag )
 | |
|     {
 | |
|         log.writeLog(__LINE__, "Running Active");
 | |
|         log.writeLog(__LINE__, "Running Active", LOG_TYPE_DEBUG);
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|         log.writeLog(__LINE__, "Running Standby");
 | |
|         log.writeLog(__LINE__, "Running Standby", LOG_TYPE_DEBUG);
 | |
|         runStandby = true;
 | |
|     }
 | |
| 
 | |
|     //get local module main IP address
 | |
|     ModuleConfig moduleconfig;
 | |
|     oam.getSystemConfig(config.moduleName(), moduleconfig);
 | |
|     HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
 | |
|     string localIPaddr = (*pt1).IPAddr;
 | |
|     localHostName = (*pt1).HostName;
 | |
| 
 | |
|     struct ifaddrs* addrs, *iap;
 | |
|     struct sockaddr_in* sa;
 | |
|     char buf[32];
 | |
| 
 | |
|     getifaddrs(&addrs);
 | |
| 
 | |
|     for (iap = addrs; iap != NULL; iap = iap->ifa_next)
 | |
|     {
 | |
| 
 | |
|         if (iap->ifa_addr && (iap->ifa_flags & IFF_UP) && iap->ifa_addr->sa_family == AF_INET)
 | |
|         {
 | |
|             sa = (struct sockaddr_in*)(iap->ifa_addr);
 | |
|             inet_ntop(iap->ifa_addr->sa_family, (void*) & (sa->sin_addr), buf, sizeof(buf));
 | |
| 
 | |
|             if (!strcmp(localIPaddr.c_str(), buf))
 | |
|             {
 | |
|                 iface_name = iap->ifa_name;
 | |
|                 break;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     freeifaddrs(addrs);
 | |
|     log.writeLog(__LINE__, "Main Ethernet Port = " + iface_name, LOG_TYPE_DEBUG);
 | |
| 
 | |
|     //
 | |
|     //start a thread to ping all system modules
 | |
|     //
 | |
|     if (runStandby)
 | |
|     {
 | |
|         //running standby after startup
 | |
|         try
 | |
|         {
 | |
|             oam.processInitComplete("ProcessManager", oam::STANDBY);
 | |
|             log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
 | |
|         }
 | |
|         catch (exception& ex)
 | |
|         {
 | |
|             string error = ex.what();
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|         }
 | |
| 
 | |
|         // create message thread
 | |
|         pthread_t MessageThread;
 | |
|         int ret = pthread_create (&MessageThread, NULL, (void* (*)(void*)) &messageThread, &config);
 | |
| 
 | |
|         if ( ret != 0 )
 | |
|             log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
 | |
| 
 | |
|         //monitor OAM Parent Module for failover
 | |
|         while (true)
 | |
|         {
 | |
|             if ( processManager.OAMParentModuleChange() == oam::API_SUCCESS )
 | |
|                 break;
 | |
| 
 | |
|             log.writeLog(__LINE__, "OAMParentModuleChange failure", LOG_TYPE_WARNING);
 | |
|             // GO TRY AGAIN
 | |
|         }
 | |
| 
 | |
|         pthread_t srvThread;
 | |
|         int status = pthread_create (&srvThread, NULL, (void* (*)(void*)) &pingDeviceThread, NULL);
 | |
| 
 | |
|         if ( status != 0 )
 | |
|             log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|         //running active after startup
 | |
|         //Update DBRM section of Columnstore.xml
 | |
|         processManager.updateWorkerNodeconfig();
 | |
| //		processManager.distributeConfigFile("system");
 | |
| 
 | |
|         pthread_t srvThread;
 | |
|         int status = pthread_create (&srvThread, NULL, (void* (*)(void*)) &pingDeviceThread, NULL);
 | |
| 
 | |
|         if ( status != 0 )
 | |
|             log.writeLog(__LINE__, "pingDeviceThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | |
| 
 | |
|         // if HDFS, create a thread to push an image of activeAlarms to HDFS filesystem
 | |
|         if (HDFS)
 | |
|         {
 | |
|             pthread_t hdfsAlarmThread;
 | |
|             int status = pthread_create(&hdfsAlarmThread, NULL, (void* (*)(void*)) &hdfsActiveAlarmsPushingThread, NULL);
 | |
| 
 | |
|             if ( status != 0 )
 | |
|                 log.writeLog(__LINE__, "hdfsActiveAlarmsPushingThread pthread_create failed, return code = " + oam.itoa(status), LOG_TYPE_ERROR);
 | |
|         }
 | |
| 
 | |
|         sleep(5);
 | |
| 
 | |
|         SystemStatus systemstatus;
 | |
| 
 | |
|         try
 | |
|         {
 | |
|             oam.getSystemStatus(systemstatus);
 | |
|         }
 | |
|         catch (exception& ex)
 | |
|         {
 | |
| //			string error = ex.what();
 | |
| //			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
| //			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|         }
 | |
| 
 | |
|         if (systemstatus.SystemOpState != oam::MAN_OFFLINE &&
 | |
|                 systemstatus.SystemOpState != oam::ACTIVE)
 | |
|         {
 | |
|             pthread_t mgrProcThread;
 | |
|             int status = pthread_create (&mgrProcThread, NULL, (void* (*)(void*)) &startMgrProcessThread, NULL);
 | |
| 
 | |
|             if ( status != 0 )
 | |
|                 log.writeLog(__LINE__, "startMgrProcessThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | |
|         }
 | |
| 
 | |
|         try
 | |
|         {
 | |
|             oam.processInitComplete("ProcessManager");
 | |
|             log.writeLog(__LINE__, "processInitComplete Successfully Called", LOG_TYPE_DEBUG);
 | |
|         }
 | |
|         catch (exception& ex)
 | |
|         {
 | |
|             string error = ex.what();
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: " + error, LOG_TYPE_ERROR);
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on processInitComplete: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|         }
 | |
| 
 | |
|         //make sure ProcMgr IP Address is configured correctly
 | |
|         try
 | |
|         {
 | |
|             Config* sysConfig = Config::makeConfig();
 | |
| 
 | |
|             // get Standby IP address
 | |
|             ModuleConfig moduleconfig;
 | |
|             oam.getSystemConfig(config.moduleName(), moduleconfig);
 | |
|             HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
 | |
|             string IPaddr = (*pt1).IPAddr;
 | |
| 
 | |
|             sysConfig->setConfig("ProcMgr", "IPAddr", IPaddr);
 | |
| 
 | |
|             log.writeLog(__LINE__, "set ProcMgr IPaddr to " + IPaddr, LOG_TYPE_DEBUG);
 | |
| 
 | |
|             //update Calpont Config table
 | |
|             try
 | |
|             {
 | |
|                 sysConfig->write();
 | |
|             }
 | |
|             catch (...)
 | |
|             {
 | |
|                 log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
 | |
|             }
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
|             log.writeLog(__LINE__, "ERROR: makeConfig failed", LOG_TYPE_ERROR);
 | |
|         }
 | |
| 
 | |
|         try
 | |
|         {
 | |
|             oam.distributeConfigFile();
 | |
|         }
 | |
|         catch (...)
 | |
|         {}
 | |
| 
 | |
|         // create message thread
 | |
|         pthread_t MessageThread;
 | |
|         int ret = pthread_create (&MessageThread, NULL, (void* (*)(void*)) &messageThread, &config);
 | |
| 
 | |
|         if ( ret != 0 )
 | |
|             log.writeLog(__LINE__, "pthread_create failed, return code = " + oam.itoa(ret), LOG_TYPE_ERROR);
 | |
|     }
 | |
| 
 | |
|     //
 | |
|     //start a thread to process heartbeat checks
 | |
|     //
 | |
| //	pthread_t heartThread;
 | |
| //	pthread_create (&heartThread, NULL, (void*(*)(void*)) &heartbeatProcessThread, NULL);
 | |
| 
 | |
|     //
 | |
|     //start a thread to read heartbeat messages
 | |
|     //
 | |
| //	pthread_t heartMsgThread;
 | |
| //	pthread_create (&heartMsgThread, NULL, (void*(*)(void*)) &heartbeatMsgThread, NULL);
 | |
| 
 | |
|     // suspend forever
 | |
|     while (true)
 | |
|     {
 | |
|         sleep(1000);
 | |
|     }
 | |
| }
 | |
| 
 | |
| /******************************************************************************************
 | |
| * @brief	messageThread
 | |
| *
 | |
| * purpose:	Read incoming messages
 | |
| *
 | |
| ******************************************************************************************/
 | |
| static void messageThread(Configuration config)
 | |
| {
 | |
|     ProcessLog log;
 | |
|     ProcessManager processManager(config, log);
 | |
|     Oam oam;
 | |
| 
 | |
|     //check for running active, then launch
 | |
|     while (true)
 | |
|     {
 | |
|         if ( !runStandby)
 | |
|             break;
 | |
| 
 | |
|         sleep (1);
 | |
|     }
 | |
| 
 | |
|     log.writeLog(__LINE__, "Message Thread started ..", LOG_TYPE_DEBUG);
 | |
| 
 | |
|     //read and cleanup port before trying to use
 | |
|     try
 | |
|     {
 | |
|         Config* sysConfig = Config::makeConfig();
 | |
|         string port = sysConfig->getConfig("ProcMgr", "Port");
 | |
|         string cmd = "fuser -k " + port + "/tcp >/dev/null 2>&1";
 | |
| 
 | |
|         if ( !rootUser)
 | |
|             cmd = "sudo fuser -k " + port + "/tcp >/dev/null 2>&1";
 | |
| 
 | |
| 
 | |
|         system(cmd.c_str());
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|     }
 | |
| 
 | |
|     //
 | |
|     //waiting for request
 | |
|     //
 | |
|     IOSocket fIos;
 | |
| 
 | |
|     for (;;)
 | |
|     {
 | |
|         try
 | |
|         {
 | |
|             MessageQueueServer procmgr("ProcMgr");
 | |
| 
 | |
|             for (;;)
 | |
|             {
 | |
|                 try
 | |
|                 {
 | |
|                     fIos = procmgr.accept();
 | |
| 
 | |
|                     pthread_t messagethread;
 | |
|                     int status = pthread_create (&messagethread, NULL, (void* (*)(void*)) &processMSG, &fIos);
 | |
| 
 | |
|                     if ( status != 0 )
 | |
|                         log.writeLog(__LINE__, "messagethread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
 | |
|                 }
 | |
|                 catch (...)
 | |
|                 {}
 | |
| 
 | |
|             }
 | |
|         }
 | |
|         catch (exception& ex)
 | |
|         {
 | |
|             string error = ex.what();
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + error, LOG_TYPE_ERROR);
 | |
| 
 | |
|             // takes 2 - 4 minites to free sockets, sleep and retry
 | |
|             sleep(60);
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
| 
 | |
|             // takes 2 - 4 minites to free sockets, sleep and retry
 | |
|             sleep(60);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| /******************************************************************************************
 | |
| * @brief	sigUser1Handler
 | |
| *
 | |
| * purpose:	Handler SIGUSER1 signal and initial failover
 | |
| *
 | |
| ******************************************************************************************/
 | |
| static void sigUser1Handler(int sig)
 | |
| {
 | |
|     ProcessLog log;
 | |
|     Configuration config;
 | |
|     ProcessManager processManager(config, log);
 | |
|     Oam oam;
 | |
|     log.writeLog(__LINE__, "SIGUSER1 received, set startFailOver = true", LOG_TYPE_DEBUG);
 | |
| 
 | |
|     startFailOver = true;
 | |
| }
 | |
| 
 | |
| /*****************************************************************************************
 | |
| * @brief	Start Mgr Process by module Thread
 | |
| *
 | |
| * purpose:	Send Messages to Module Process Monitors to start Processes
 | |
| *
 | |
| *****************************************************************************************/
 | |
| static void startMgrProcessThread()
 | |
| {
 | |
|     ProcessLog log;
 | |
|     Configuration config;
 | |
|     ProcessManager processManager(config, log);
 | |
|     Oam oam;
 | |
|     SystemModuleTypeConfig systemmoduletypeconfig;
 | |
|     ModuleTypeConfig PMSmoduletypeconfig;
 | |
|     ALARMManager aManager;
 | |
| 
 | |
|     int waitTime = 60;
 | |
| 
 | |
|     log.writeLog(__LINE__, "startMgrProcessThread launched", LOG_TYPE_DEBUG);
 | |
| 
 | |
|     //get calpont software version and release
 | |
|     SystemSoftware systemsoftware;
 | |
|     string softwareVersion;
 | |
|     string softwareRelease;
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemSoftware(systemsoftware);
 | |
| 
 | |
|         softwareVersion = systemsoftware.Version;
 | |
|         softwareRelease = systemsoftware.Release;
 | |
|     }
 | |
|     catch (exception& e)
 | |
|     {
 | |
|         cout << endl << "ProcMon Construct Error reading getSystemSoftware = " << e.what() << endl;
 | |
|         exit(-1);
 | |
|     }
 | |
| 
 | |
|     string localSoftwareInfo = softwareVersion + softwareRelease;
 | |
| 
 | |
|     //get systemStartupOffline
 | |
|     string systemStartupOffline = "n";
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         Config* sysConfig = Config::makeConfig();
 | |
| 
 | |
|         systemStartupOffline = sysConfig->getConfig("Installation", "SystemStartupOffline");
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|         log.writeLog(__LINE__, "ERROR: Problem getting systemStartupOffline from the Calpont System Configuration file", LOG_TYPE_ERROR);
 | |
|         systemStartupOffline = "n";
 | |
|     }
 | |
| 
 | |
|     if ( systemStartupOffline == "y" )
 | |
|         log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_INFO);
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig(systemmoduletypeconfig);
 | |
|     }
 | |
|     catch (exception& ex)
 | |
|     {
 | |
|         string error = ex.what();
 | |
|         log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|         log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|     }
 | |
| 
 | |
|     //get Distributed Install
 | |
|     string DistributedInstall = "y";
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig("DistributedInstall", DistributedInstall);
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|         log.writeLog(__LINE__, "addModule - ERROR: get DistributedInstall", LOG_TYPE_ERROR);
 | |
|     }
 | |
| 
 | |
|     //Send out a start service just to make sure Columnstore is running on remote nodes
 | |
|     //note this only works for systems with ssh-keys
 | |
|     /*	for( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | |
|     	{
 | |
|     		int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | |
|     		if( moduleCount == 0)
 | |
|     			continue;
 | |
| 
 | |
|     		DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
|     		for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | |
|     		{
 | |
|     		      //skip OAM Parent module
 | |
|     		      if ( (*pt).DeviceName == config.moduleName() )
 | |
|     			      continue;
 | |
| 
 | |
|     		      HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
 | |
|     		      for( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
 | |
|     		      {
 | |
|     			      //run remote command script
 | |
|     			      string cmd = startup::StartUp::installDir() + "/bin/remote_command.sh " + (*pt1).IPAddr + " ssh '" + startup::StartUp::installDir() + "/bin/columnstore restart' 0";
 | |
|     			      system(cmd.c_str());
 | |
|     		      }
 | |
|     		}
 | |
|     	}
 | |
|     */
 | |
|     //distribute system and process config files
 | |
|     processManager.distributeConfigFile("system");
 | |
|     processManager.distributeConfigFile("system", "ProcessConfig.xml");
 | |
| 
 | |
|     //send out moduleName to remote nodes, this will be used to startup new installed nodes
 | |
|     {
 | |
|         int status = API_SUCCESS;
 | |
|         int k = 0;
 | |
| 
 | |
|         for ( ; k < waitTime ; k++ )
 | |
|         {
 | |
|             if ( startsystemthreadStop )
 | |
|             {
 | |
|                 processManager.setSystemState(oam::MAN_OFFLINE);
 | |
| 
 | |
|                 // exit thread
 | |
|                 log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
 | |
|                 pthread_exit(0);
 | |
|             }
 | |
| 
 | |
|             status = API_SUCCESS;
 | |
| 
 | |
|             for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | |
|             {
 | |
|                 int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | |
| 
 | |
|                 if ( moduleCount == 0)
 | |
|                     continue;
 | |
| 
 | |
|                 DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
| 
 | |
|                 for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | |
|                 {
 | |
|                     string moduleName = (*pt).DeviceName;
 | |
| 
 | |
|                     //skip OAM Parent module
 | |
|                     if ( (*pt).DeviceName == config.moduleName() )
 | |
|                         continue;
 | |
| 
 | |
|                     if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
 | |
|                             (*pt).DisableState == oam::AUTODISABLEDSTATE )
 | |
|                         continue;
 | |
| 
 | |
|                     int ret = processManager.configureModule(moduleName);
 | |
| 
 | |
|                     if ( ret != API_SUCCESS )
 | |
|                         status = ret;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             //get out of loop if all modules updated
 | |
|             if ( status == API_SUCCESS )
 | |
|                 break;
 | |
| 
 | |
|             //retry after sleeping for a bit
 | |
|             sleep(1);
 | |
|         }
 | |
| 
 | |
|         if ( k == waitTime || status == API_FAILURE)
 | |
|         {
 | |
|             // system didn't restart successfully
 | |
|             processManager.setSystemState(oam::FAILED);
 | |
|             // exit thread
 | |
|             log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons running", LOG_TYPE_CRITICAL);
 | |
|             log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
 | |
|             pthread_exit(0);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     //wait until all modules are up after a system reboot
 | |
|     int i = 0;
 | |
| 
 | |
|     for ( ; i < waitTime ; i++ )
 | |
|     {
 | |
|         if ( startsystemthreadStop )
 | |
|         {
 | |
|             processManager.setSystemState(oam::MAN_OFFLINE);
 | |
| 
 | |
|             // exit thread
 | |
|             log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
 | |
|             pthread_exit(0);
 | |
|         }
 | |
| 
 | |
|         int status = API_SUCCESS;
 | |
| 
 | |
|         for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | |
|         {
 | |
|             if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType == "pm" )
 | |
|                 PMSmoduletypeconfig = systemmoduletypeconfig.moduletypeconfig[i];
 | |
| 
 | |
|             int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | |
| 
 | |
|             if ( moduleCount == 0)
 | |
|                 continue;
 | |
| 
 | |
|             DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
| 
 | |
|             for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | |
|             {
 | |
|                 string moduleName = (*pt).DeviceName;
 | |
| 
 | |
|                 // Is Module UP
 | |
|                 try
 | |
|                 {
 | |
|                     bool degraded;
 | |
|                     int opState = oam::ACTIVE;
 | |
|                     oam.getModuleStatus(moduleName, opState, degraded);
 | |
| 
 | |
|                     if ( opState == oam::MAN_DISABLED )
 | |
|                         //mark all processes running on module man-offline except ProcMon
 | |
|                         processManager.setProcessStates(moduleName, oam::MAN_OFFLINE);
 | |
| 
 | |
|                     if ( opState == oam::AUTO_DISABLED)
 | |
|                         //mark all processes running on module auto-offline
 | |
|                         processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
 | |
| 
 | |
|                     if (opState == oam::INITIAL ||
 | |
|                             opState == oam::DOWN)
 | |
|                     {
 | |
|                         //a module is not up
 | |
|                         status = API_MINOR_FAILURE;
 | |
|                         break;
 | |
|                     }
 | |
|                 }
 | |
|                 catch (exception& ex)
 | |
|                 {
 | |
| //					string error = ex.what();
 | |
| //					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
 | |
|                 }
 | |
|                 catch (...)
 | |
|                 {
 | |
| //					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             if ( status == API_MINOR_FAILURE)
 | |
|             {
 | |
|                 sleep(1);
 | |
|                 break;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         if ( status == API_SUCCESS)
 | |
|             //all modules are up
 | |
|             break;
 | |
|     }
 | |
| 
 | |
|     if ( i == waitTime )
 | |
|     {
 | |
|         // system didn't restart successfully
 | |
|         processManager.setSystemState(oam::FAILED);
 | |
| 
 | |
|         // exit thread
 | |
|         log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all modules are UP", LOG_TYPE_CRITICAL);
 | |
|         pthread_exit(0);
 | |
|     }
 | |
| 
 | |
|     //configure the PMS settings
 | |
|     processManager.updatePMSconfig();
 | |
| 
 | |
|     if (HDFS)
 | |
|         //distribute config file
 | |
|         processManager.distributeConfigFile("system");
 | |
| 
 | |
|     //now wait until all procmons are ACTIVE and validate rpms on each module
 | |
|     int status = API_SUCCESS;
 | |
|     int k = 0;
 | |
| 
 | |
|     for ( ; k < waitTime ; k++ )
 | |
|     {
 | |
|         if ( startsystemthreadStop )
 | |
|         {
 | |
|             processManager.setSystemState(oam::MAN_OFFLINE);
 | |
| 
 | |
|             // exit thread
 | |
|             log.writeLog(__LINE__, "startMgrProcessThread Exit with a stop system flag", LOG_TYPE_DEBUG);
 | |
|             pthread_exit(0);
 | |
|         }
 | |
| 
 | |
|         status = API_SUCCESS;
 | |
| 
 | |
|         for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
 | |
|         {
 | |
|             int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
 | |
| 
 | |
|             if ( moduleCount == 0)
 | |
|                 continue;
 | |
| 
 | |
|             DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
| 
 | |
|             for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
 | |
|             {
 | |
|                 string moduleName = (*pt).DeviceName;
 | |
| 
 | |
|                 if ( (*pt).DisableState == oam::MANDISABLEDSTATE ||
 | |
|                         (*pt).DisableState == oam::AUTODISABLEDSTATE )
 | |
|                     continue;
 | |
| 
 | |
|                 int moduleOpState = oam::ACTIVE;
 | |
| 
 | |
|                 // check module state
 | |
|                 try
 | |
|                 {
 | |
|                     bool degraded;
 | |
|                     oam.getModuleStatus(moduleName, moduleOpState, degraded);
 | |
| 
 | |
|                     // if up, set to MAN_INIT
 | |
|                     if ( HDFS &&
 | |
|                             (moduleOpState == oam::UP) )
 | |
|                     {
 | |
|                         processManager.setModuleState(moduleName, oam::MAN_INIT);
 | |
|                     }
 | |
|                 }
 | |
|                 catch (exception& ex)
 | |
|                 {
 | |
| //					string error = ex.what();
 | |
| //					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
 | |
|                 }
 | |
|                 catch (...)
 | |
|                 {
 | |
| //					log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                 }
 | |
| 
 | |
|                 // Is Module's ProcMon ACTIVE and module status has been updated
 | |
|                 int opState = oam::ACTIVE;
 | |
| 
 | |
|                 try
 | |
|                 {
 | |
|                     ProcessStatus procstat;
 | |
|                     oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
 | |
|                     opState = procstat.ProcessOpState;
 | |
| 
 | |
|                     if (opState != oam::ACTIVE)
 | |
|                     {
 | |
|                         //skip if Not ACTIVE
 | |
|                         log.writeLog(__LINE__, "Module ProcMon not active yet: " + moduleName, LOG_TYPE_DEBUG);
 | |
|                         status = API_MINOR_FAILURE;
 | |
|                         continue;
 | |
|                     }
 | |
|                 }
 | |
|                 catch (exception& ex)
 | |
|                 {
 | |
| //					string error = ex.what();
 | |
| //					log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | |
|                     status = API_MINOR_FAILURE;
 | |
|                     continue;
 | |
|                 }
 | |
|                 catch (...)
 | |
|                 {
 | |
| //					log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                     status = API_MINOR_FAILURE;
 | |
|                     continue;
 | |
|                 }
 | |
| 
 | |
|                 //skip OAM Parent module
 | |
|                 if ( moduleName == config.moduleName() )
 | |
|                     continue;
 | |
| 
 | |
|                 //ProcMon ACTIVE, validate the software release and version of that module
 | |
|                 ByteStream msg;
 | |
|                 ByteStream::byte requestID = GETSOFTWAREINFO;
 | |
|                 msg << requestID;
 | |
| 
 | |
|                 string moduleSoftwareInfo = processManager.sendMsgProcMon1( moduleName, msg, requestID );
 | |
| 
 | |
|                 if ( moduleSoftwareInfo == "FAILED" )
 | |
|                     continue;
 | |
| 
 | |
|                 if ( localSoftwareInfo != moduleSoftwareInfo )
 | |
|                 {
 | |
|                     // module not running on same Calpont Software build as this local Director
 | |
|                     // alarm and fail the module
 | |
|                     log.writeLog(__LINE__, "Software Version mismatch : " + moduleName + "/" + localSoftwareInfo + "/" + moduleSoftwareInfo, LOG_TYPE_CRITICAL);
 | |
| 
 | |
|                     aManager.sendAlarmReport(moduleName.c_str(), INVALID_SW_VERSION, SET);
 | |
|                     processManager.setModuleState(moduleName, oam::FAILED);
 | |
|                     status = API_FAILURE;
 | |
|                     break;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         //get out of loop if all modules ACTIVE or MAN_OFFLINE
 | |
|         if ( status == API_SUCCESS )
 | |
|         {
 | |
|             if ( systemStartupOffline == "y" )
 | |
|             {
 | |
|                 processManager.setSystemState(oam::MAN_OFFLINE);
 | |
|                 log.writeLog(__LINE__, "SystemStartupOffline set to 'y', Not starting up Calpont Database Processes", LOG_TYPE_DEBUG);
 | |
|             }
 | |
| 
 | |
|             break;
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|             //get out of loop if start module failed
 | |
|             if ( status == API_FAILURE )
 | |
|                 break;
 | |
| 
 | |
|             //retry after sleeping for a bit
 | |
|             sleep(1);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     if ( k == waitTime || status == API_FAILURE)
 | |
|     {
 | |
|         // system didn't successfully restart
 | |
|         processManager.setSystemState(oam::FAILED);
 | |
|         // exit thread
 | |
|         log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, not all ProcMons ACTIVE", LOG_TYPE_CRITICAL);
 | |
|         log.writeLog(__LINE__, "startMgrProcessThread Exit - failure", LOG_TYPE_DEBUG);
 | |
|         pthread_exit(0);
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|         //distribute config file
 | |
| //		processManager.distributeConfigFile("system");
 | |
| 
 | |
|         if ( systemStartupOffline == "n" && status == API_SUCCESS )
 | |
|         {
 | |
|             oam::DeviceNetworkList devicenetworklist;
 | |
|             pthread_t startsystemthread;
 | |
|             int status = pthread_create (&startsystemthread, NULL, (void* (*)(void*)) &startSystemThread, &devicenetworklist);
 | |
| 
 | |
|             if ( status != 0 )
 | |
|             {
 | |
|                 log.writeLog(__LINE__, "STARTSYSTEMS: pthread_create failed, return status = " + oam.itoa(status));
 | |
|                 status = API_FAILURE;
 | |
|             }
 | |
| 
 | |
|             if (status == 0)
 | |
|             {
 | |
|                 pthread_join(startsystemthread, NULL);
 | |
|                 status = startsystemthreadStatus;
 | |
|             }
 | |
| 
 | |
|             if ( status != API_SUCCESS )
 | |
|             {
 | |
|                 // system didn't successfully restart
 | |
|                 processManager.setSystemState(oam::FAILED);
 | |
|                 log.writeLog(__LINE__, "startMgrProcessThread Exit with a failure, error returned from startSystemThread", LOG_TYPE_CRITICAL);
 | |
|             }
 | |
|             else
 | |
|                 //distribute config file
 | |
|                 processManager.distributeConfigFile("system");
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     // exit thread
 | |
|     log.writeLog(__LINE__, "startMgrProcessThread Exit", LOG_TYPE_DEBUG);
 | |
|     pthread_exit(0);
 | |
| }
 | |
| 
 | |
| 
 | |
| /*****************************************************************************************
 | |
| * @brief	pingDeviceThread
 | |
| *
 | |
| * purpose:	perform ping testing on the devices within the system and update NIC/module state and alarms
 | |
| *
 | |
| *****************************************************************************************/
 | |
| void pingDeviceThread()
 | |
| {
 | |
|     ProcessLog log;
 | |
|     Configuration config;
 | |
|     ProcessManager processManager(config, log);
 | |
|     Oam oam;
 | |
|     ModuleTypeConfig moduletypeconfig;
 | |
|     ALARMManager aManager;
 | |
|     BRM::DBRM dbrm;
 | |
| 
 | |
|     log.writeLog(__LINE__, "pingDeviceThread launched", LOG_TYPE_DEBUG);
 | |
| 
 | |
|     string cmdLine = "ping ";
 | |
|     string cmdOption = " -c 1 -w 5 >> /dev/null";
 | |
|     string cmd;
 | |
|     string deviceIP;
 | |
| 
 | |
|     //
 | |
|     // Get Module Info
 | |
|     //
 | |
|     SystemModuleTypeConfig systemModuleTypeConfig;
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig(systemModuleTypeConfig);
 | |
|     }
 | |
|     catch (exception& ex)
 | |
|     {
 | |
|         string error = ex.what();
 | |
|         log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
|         log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|     }
 | |
| 
 | |
|     //Build the initial list, clear module state
 | |
| 
 | |
|     for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | |
|     {
 | |
|         int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | |
| 
 | |
|         if ( moduleCount == 0 )
 | |
|             // skip if no modules configured
 | |
|             continue;
 | |
| 
 | |
|         DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
| 
 | |
|         for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | |
|         {
 | |
|             moduleInfoList.insert(moduleList::value_type((*pt).DeviceName, 0));
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     typedef   map<string, int>	nicList;
 | |
|     nicList	nicInfoList;
 | |
| 
 | |
|     //Build the initial list, clear NIC state
 | |
| 
 | |
|     for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | |
|     {
 | |
|         int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | |
| 
 | |
|         if ( moduleCount == 0 )
 | |
|             // skip if no modules configured
 | |
|             continue;
 | |
| 
 | |
|         DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
| 
 | |
|         for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | |
|         {
 | |
| 
 | |
|             HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
 | |
| 
 | |
|             for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
 | |
|             {
 | |
|                 nicInfoList.insert(moduleList::value_type((*pt1).HostName, 0));
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     //
 | |
|     // Get ext device info
 | |
|     //
 | |
|     SystemExtDeviceConfig systemextdeviceconfig;
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig(systemextdeviceconfig);
 | |
|     }
 | |
|     catch (exception& ex)
 | |
|     {
 | |
|         string error = ex.what();
 | |
|         log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
|     }
 | |
|     catch (...)
 | |
|     {
 | |
| //		log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|     }
 | |
| 
 | |
|     typedef   map<string, int>	extDeviceList;
 | |
|     extDeviceList	extDeviceInfoList;
 | |
| 
 | |
|     //Build the initial list, clear ext device state
 | |
| 
 | |
|     for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count; i++)
 | |
|     {
 | |
|         string name = systemextdeviceconfig.extdeviceconfig[i].Name;
 | |
|         extDeviceInfoList.insert(extDeviceList::value_type(name, 0));
 | |
|     }
 | |
| 
 | |
|     //storage config
 | |
|     string DBRootStorageType;
 | |
| 
 | |
|     try
 | |
|     {
 | |
|         oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
 | |
|     }
 | |
|     catch (...) {}
 | |
| 
 | |
|     log.writeLog(__LINE__, "pingDeviceThread: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
 | |
| 
 | |
|     int rtnCode = 0;
 | |
|     Configuration configData;
 | |
|     SystemStatus systemstatus;
 | |
| 
 | |
|     bool enableModuleMonitor = true;
 | |
| 
 | |
|     bool LANOUTAGEACTIVE = false;
 | |
|     bool HOTSTANDBYACTIVE = false;
 | |
|     bool downActiveOAMModule = false;
 | |
| 
 | |
|     // monitor module and external device loop
 | |
| 
 | |
|     while (true)
 | |
|     {
 | |
|         //don't perform module test if system is MAN_OFFLINE or system status cannot be retrieved
 | |
|         while (true)
 | |
|         {
 | |
|             SystemStatus systemstatus;
 | |
| 
 | |
|             try
 | |
|             {
 | |
|                 oam.getSystemStatus(systemstatus);
 | |
| 
 | |
|                 if (systemstatus.SystemOpState == oam::MAN_OFFLINE )
 | |
|                     sleep(5);
 | |
|                 else
 | |
|                     break;
 | |
|             }
 | |
|             catch (...)
 | |
|             {
 | |
|                 sleep(5);
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // Module Heartbeat period and failure count
 | |
|         int ModuleHeartbeatPeriod;
 | |
|         int ModuleHeartbeatCount;
 | |
| 
 | |
|         try
 | |
|         {
 | |
|             oam.getSystemConfig("ModuleHeartbeatPeriod", ModuleHeartbeatPeriod);
 | |
|             oam.getSystemConfig("ModuleHeartbeatCount", ModuleHeartbeatCount);
 | |
|             ModuleHeartbeatPeriod = ModuleHeartbeatPeriod * 10;
 | |
|         }
 | |
|         catch (exception& ex)
 | |
|         {
 | |
|             string error = ex.what();
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
|             sleep(5);
 | |
|             continue;
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|             sleep(5);
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         // skip testing if Heartbeat is disabled
 | |
|         if ( ModuleHeartbeatPeriod <= 0 )
 | |
|         {
 | |
|             if ( enableModuleMonitor )
 | |
|                 log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to disabled", LOG_TYPE_DEBUG);
 | |
| 
 | |
|             enableModuleMonitor = false;
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|             if ( !enableModuleMonitor && moduleInfoList.size() > 1 )
 | |
|                 log.writeLog(__LINE__, "ModuleHeartbeatPeriod set to enabled", LOG_TYPE_DEBUG);
 | |
| 
 | |
|             enableModuleMonitor = true;
 | |
|         }
 | |
| 
 | |
|         //single server system
 | |
|         if ( moduleInfoList.size() <= 1)
 | |
|             enableModuleMonitor = false;
 | |
| 
 | |
|         //
 | |
|         // ping NIC
 | |
|         //
 | |
| 
 | |
|         // read each time to catch updates
 | |
|         pthread_mutex_lock(&THREAD_LOCK);
 | |
|         systemModuleTypeConfig.moduletypeconfig.clear();
 | |
| 
 | |
|         try
 | |
|         {
 | |
|             oam.getSystemConfig(systemModuleTypeConfig);
 | |
|         }
 | |
|         catch (exception& ex)
 | |
|         {
 | |
|             string error = ex.what();
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
|             sleep(5);
 | |
|             continue;
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
|             log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|             sleep(5);
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         pthread_mutex_unlock(&THREAD_LOCK);
 | |
| 
 | |
|         bool LANOUTAGESUPPORT = true;
 | |
|         bool LOCALNICDOWN = false;
 | |
| 
 | |
|         if (enableModuleMonitor)
 | |
|         {
 | |
|             //test main local Ethernet interface status
 | |
|             for ( int count = 0 ; ; count ++)
 | |
|             {
 | |
|                 int sockfd;
 | |
|                 struct ifreq ifr;
 | |
| 
 | |
|                 sockfd = socket(AF_INET, SOCK_DGRAM, 0);
 | |
| 
 | |
|                 if (sockfd == -1)
 | |
|                 {
 | |
|                     log.writeLog(__LINE__, "Could not get socket to check", LOG_TYPE_ERROR);
 | |
|                     close(sockfd);
 | |
|                     break;
 | |
|                 }
 | |
| 
 | |
|                 /* get interface name */
 | |
|                 strncpy(ifr.ifr_name, iface_name.c_str(), IFNAMSIZ);
 | |
| 
 | |
|                 /* Read interface flags */
 | |
|                 if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0)
 | |
|                 {
 | |
|                     // not supported
 | |
|                     close(sockfd);
 | |
|                     break;
 | |
|                 }
 | |
| 
 | |
|                 if (ifr.ifr_flags & IFF_UP)
 | |
|                 {
 | |
|                     // ethernet port is up, continue on
 | |
|                     close(sockfd);
 | |
|                     break;
 | |
|                 }
 | |
|                 else
 | |
|                 {
 | |
|                     // ethernet port is down
 | |
|                     log.writeLog(__LINE__, "NIC #1 is DOWN", LOG_TYPE_WARNING);
 | |
| 
 | |
|                     if ( count >= ModuleHeartbeatCount )
 | |
|                     {
 | |
|                         LOCALNICDOWN = true;
 | |
|                         close(sockfd);
 | |
|                         break;
 | |
|                     }
 | |
|                     else
 | |
|                         sleep(5);
 | |
|                 }
 | |
| 
 | |
|                 close(sockfd);
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // if the NIC is down, go directly to LAN outage processing
 | |
|         if ( !LOCALNICDOWN )
 | |
|         {
 | |
|             for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | |
|             {
 | |
|                 int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | |
| 
 | |
|                 if ( moduleCount == 0)
 | |
|                     continue;
 | |
| 
 | |
|                 DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
| 
 | |
|                 for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | |
|                 {
 | |
|                     string moduleName = (*pt).DeviceName;
 | |
|                     string ipAddr;
 | |
|                     string hostName;
 | |
|                     int moduleState = oam::INITIAL;
 | |
|                     HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
 | |
| 
 | |
|                     for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
 | |
|                     {
 | |
|                         ipAddr = (*pt1).IPAddr;
 | |
|                         hostName = (*pt1).HostName;
 | |
| 
 | |
|                         if (enableModuleMonitor)
 | |
|                         {
 | |
|                             // perform ping test
 | |
|                             cmd = cmdLine + ipAddr + cmdOption;
 | |
|                             rtnCode = system(cmd.c_str());
 | |
|                             rtnCode = WEXITSTATUS(rtnCode);
 | |
|                         }
 | |
|                         else
 | |
|                             rtnCode = 0;
 | |
| 
 | |
|                         int currentNICState = oam::UP;
 | |
| 
 | |
|                         try
 | |
|                         {
 | |
|                             oam.getNICStatus(hostName, currentNICState);
 | |
|                         }
 | |
|                         catch (exception& ex)
 | |
|                         {
 | |
| //							string error = ex.what();
 | |
| //							log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: " + error, LOG_TYPE_ERROR);
 | |
|                         }
 | |
|                         catch (...)
 | |
|                         {
 | |
| //							log.writeLog(__LINE__, "EXCEPTION ERROR on getNICStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                         }
 | |
| 
 | |
|                         switch (rtnCode)
 | |
|                         {
 | |
|                             case 0:
 | |
| 
 | |
|                                 //NIC Ack ping
 | |
|                                 if ( currentNICState != oam::UP )
 | |
|                                 {
 | |
|                                     processManager.setNICState(hostName, oam::UP);
 | |
| 
 | |
|                                     if ( ModuleHeartbeatPeriod > 0 )
 | |
|                                         //Clear an alarm
 | |
|                                         aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, CLEAR);
 | |
|                                 }
 | |
| 
 | |
|                                 //set LAN Outage indicator to false since a module is responding
 | |
|                                 if ( moduleState == oam::INITIAL)
 | |
|                                     if ( moduleName != config.moduleName())
 | |
|                                         LANOUTAGESUPPORT = false;
 | |
| 
 | |
|                                 //set Module State
 | |
|                                 if ( moduleState == oam::INITIAL || moduleState == oam::UP)
 | |
|                                     moduleState = oam::UP;
 | |
| 
 | |
|                                 break;
 | |
| 
 | |
|                             default:
 | |
| 
 | |
|                                 //NIC failed to respond to ping
 | |
|                                 if ( currentNICState != oam::DOWN )
 | |
|                                 {
 | |
|                                     log.writeLog(__LINE__, "NIC failed to respond to ping: " + hostName, LOG_TYPE_WARNING);
 | |
|                                     processManager.setNICState(hostName, oam::DOWN);
 | |
| 
 | |
|                                     if ( ModuleHeartbeatPeriod > 0 )
 | |
|                                         //Issue an alarm
 | |
|                                         aManager.sendAlarmReport(hostName.c_str(), NIC_DOWN_AUTO, SET);
 | |
|                                 }
 | |
| 
 | |
|                                 //set Module State
 | |
|                                 if ( moduleState == oam::INITIAL || moduleState == oam::DOWN)
 | |
|                                     moduleState = oam::DOWN;
 | |
|                                 else
 | |
|                                     // NIC 1 is up and NIC 2 is down
 | |
|                                     moduleState = oam::DEGRADED;
 | |
| 
 | |
|                                 break;
 | |
|                         }
 | |
|                     }
 | |
| 
 | |
|                     // if disable, default module state to up
 | |
|                     if (!enableModuleMonitor)
 | |
|                         moduleState = oam::UP;
 | |
| 
 | |
|                     // moduleState coming out of the NIC monitoring loop
 | |
|                     // UP - ALL NICs passed ping test
 | |
|                     // DEGRADED - NIC 1 passed, NIC 2 failed ping test
 | |
|                     // DOWN - NIC 1 or ALL NICs failed ping test
 | |
| 
 | |
|                     int opState = oam::ACTIVE;
 | |
| 
 | |
|                     try
 | |
|                     {
 | |
|                         bool degraded;
 | |
|                         oam.getModuleStatus(moduleName, opState, degraded);
 | |
|                     }
 | |
|                     catch (exception& ex)
 | |
|                     {
 | |
| //						string error = ex.what();
 | |
| //						log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
 | |
|                     }
 | |
|                     catch (...)
 | |
|                     {
 | |
| //						log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                     }
 | |
| 
 | |
|                     // skip module check if not inuse or in FAILED state
 | |
|                     if (opState == oam::MAN_OFFLINE ||
 | |
|                             opState == oam::MAN_DISABLED ||
 | |
|                             opState == oam::FAILED)
 | |
|                         continue;
 | |
| 
 | |
|                     //fast track a restart of a downed failover module
 | |
|                     if ( gdownActiveOAMModule == moduleName )
 | |
|                     {
 | |
|                         moduleInfoList[moduleName] = ModuleHeartbeatCount - 1;
 | |
|                         gdownActiveOAMModule.clear();
 | |
|                         moduleState = oam::DOWN;
 | |
|                         downActiveOAMModule = true;
 | |
|                     }
 | |
| 
 | |
|                     vector<string>::iterator pt2 = downModuleList.begin();
 | |
| 
 | |
|                     for ( ; pt2 != downModuleList.end() ; pt2++)
 | |
|                     {
 | |
|                         if ( *pt2 == moduleName )
 | |
|                         {
 | |
|                             moduleInfoList[moduleName] = ModuleHeartbeatCount - 1;
 | |
|                             moduleState = oam::DOWN;
 | |
|                             downModuleList.erase(pt2);
 | |
|                             break;
 | |
|                         }
 | |
|                     }
 | |
| 
 | |
|                     switch (moduleState)
 | |
|                     {
 | |
|                         case oam::DEGRADED:
 | |
|                             // do nothing for now
 | |
|                             break;
 | |
| 
 | |
|                         case oam::UP:
 | |
| 
 | |
| // comment out, only come up when both nic are up, if not the pms list will not have the second nic in there
 | |
| //						case oam::DEGRADED:
 | |
|                             if (opState == oam::DOWN || opState == oam::INITIAL
 | |
|                                     || opState == oam::AUTO_DISABLED)
 | |
|                             {
 | |
|                                 //Set the module state to up
 | |
|                                 processManager.setModuleState(moduleName, moduleState);
 | |
|                             }
 | |
| 
 | |
|                             if ( moduleName == config.OAMStandbyName() )
 | |
|                                 HOTSTANDBYACTIVE = true;
 | |
| 
 | |
|                             // if LAN OUTAGE ACTIVE, skip module checks
 | |
|                             if (LANOUTAGEACTIVE)
 | |
|                                 break;
 | |
| 
 | |
|                             try
 | |
|                             {
 | |
|                                 oam.getSystemConfig("MySQLRep", MySQLRep);
 | |
|                             }
 | |
|                             catch (...)
 | |
|                             {
 | |
|                                 MySQLRep = "n";
 | |
|                             }
 | |
| 
 | |
|                             if (moduleInfoList[moduleName] >= ModuleHeartbeatCount ||
 | |
|                                     opState == oam::DOWN || opState == oam::AUTO_DISABLED)
 | |
|                             {
 | |
|                                 log.writeLog(__LINE__, "Module alive, bring it back online: " + moduleName, LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                 string PrimaryUMModuleName = config.moduleName();
 | |
| 
 | |
|                                 try
 | |
|                                 {
 | |
|                                     oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
 | |
|                                 }
 | |
|                                 catch (...) {}
 | |
| 
 | |
|                                 bool busy = false;
 | |
| 
 | |
|                                 for ( int retry = 0 ; retry < 20 ; retry++ )
 | |
|                                 {
 | |
|                                     busy = false;
 | |
|                                     ProcessStatus DMLprocessstatus;
 | |
| 
 | |
|                                     try
 | |
|                                     {
 | |
|                                         oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
 | |
| 
 | |
|                                         if ( DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
 | |
|                                         {
 | |
|                                             log.writeLog(__LINE__, "DMLProc in BUSY_INIT, skip bringing module online " + moduleName, LOG_TYPE_DEBUG);
 | |
|                                             busy = true;
 | |
|                                             sleep(5);
 | |
|                                         }
 | |
|                                         else
 | |
|                                             break;
 | |
|                                     }
 | |
|                                     catch (...)
 | |
|                                     {
 | |
|                                         sleep(5);
 | |
|                                     }
 | |
|                                 }
 | |
| 
 | |
|                                 if (busy)
 | |
|                                     break;
 | |
| 
 | |
|                                 //set query system state not ready
 | |
|                                 BRM::DBRM dbrm;
 | |
|                                 dbrm.setSystemQueryReady(false);
 | |
| 
 | |
|                                 processManager.setQuerySystemState(false);
 | |
| 
 | |
|                                 processManager.setSystemState(oam::BUSY_INIT);
 | |
| 
 | |
|                                 processManager.reinitProcessType("cpimport");
 | |
| 
 | |
|                                 // halt the dbrm
 | |
|                                 oam.dbrmctl("halt");
 | |
|                                 log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                 aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, CLEAR);
 | |
| 
 | |
|                                 //send notification
 | |
|                                 oam.sendDeviceNotification(config.moduleName(), MODULE_UP);
 | |
| 
 | |
|                                 int status;
 | |
|                                 DBRootConfigList dbrootConfigList;
 | |
| 
 | |
|                                 // if shared pm, move dbroots back to pm
 | |
|                                 if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | |
|                                         ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | |
|                                         ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | |
|                                 {
 | |
| 
 | |
|                                     //restart to get the versionbuffer files closed so it can be unmounted
 | |
|                                     processManager.restartProcessType("WriteEngineServer", moduleName);
 | |
| 
 | |
|                                     //set module to enable state
 | |
|                                     processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
 | |
| 
 | |
|                                     downActiveOAMModule = false;
 | |
|                                     int retry;
 | |
| 
 | |
|                                     for ( retry = 0 ; retry < 5 ; retry++ )
 | |
|                                     {
 | |
|                                         try
 | |
|                                         {
 | |
|                                             log.writeLog(__LINE__, "Call autoUnMovePmDbroot", LOG_TYPE_DEBUG);
 | |
|                                             oam.autoUnMovePmDbroot(moduleName);
 | |
| 
 | |
|                                             //check if any dbroots got assigned back to this module
 | |
|                                             // they could not be moved if they were busy on other pms
 | |
|                                             try
 | |
|                                             {
 | |
|                                                 int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
 | |
|                                                 oam.getPmDbrootConfig(moduleID, dbrootConfigList);
 | |
| 
 | |
|                                                 if (  dbrootConfigList.size() == 0 )
 | |
|                                                 {
 | |
|                                                     // no dbroots, fail module
 | |
|                                                     log.writeLog(__LINE__, "autoUnMovePmDbroot left no dbroots mounted, failing module restart: " + moduleName, LOG_TYPE_WARNING);
 | |
| 
 | |
|                                                     //Issue an alarm
 | |
|                                                     aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | |
| 
 | |
|                                                     //set module to disable state
 | |
|                                                     processManager.disableModule(moduleName, true);
 | |
| 
 | |
|                                                     //call dbrm control
 | |
|                                                     oam.dbrmctl("reload");
 | |
|                                                     log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                                     // resume the dbrm
 | |
|                                                     oam.dbrmctl("resume");
 | |
|                                                     log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                                     //clear count
 | |
|                                                     moduleInfoList[moduleName] = 0;
 | |
| 
 | |
|                                                     processManager.setSystemState(oam::ACTIVE);
 | |
| 
 | |
|                                                     //set query system state ready
 | |
|                                                     processManager.setQuerySystemState(true);
 | |
| 
 | |
|                                                     break;
 | |
|                                                 }
 | |
|                                             }
 | |
|                                             catch (...)
 | |
|                                             {}
 | |
| 
 | |
|                                             log.writeLog(__LINE__, "autoUnMovePmDbroot success", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                             //distribute config file
 | |
|                                             processManager.distributeConfigFile("system");
 | |
| 
 | |
|                                             break;
 | |
|                                         }
 | |
|                                         catch (...)
 | |
|                                         {
 | |
|                                             sleep(5);
 | |
|                                         }
 | |
|                                     }
 | |
| 
 | |
|                                     if ( retry == 5 )
 | |
|                                     {
 | |
|                                         log.writeLog(__LINE__, "autoUnMovePmDbroot: Failed. Fail Module", LOG_TYPE_WARNING);
 | |
| 
 | |
|                                         //Issue an alarm
 | |
|                                         aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | |
| 
 | |
|                                         //set module to disable state
 | |
|                                         processManager.disableModule(moduleName, true);
 | |
| 
 | |
|                                         //call dbrm control
 | |
|                                         oam.dbrmctl("reload");
 | |
|                                         log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                         // resume the dbrm
 | |
|                                         oam.dbrmctl("resume");
 | |
|                                         log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                         //clear count
 | |
|                                         moduleInfoList[moduleName] = 0;
 | |
| 
 | |
|                                         processManager.setSystemState(oam::ACTIVE);
 | |
| 
 | |
|                                         //set query system state ready
 | |
|                                         processManager.setQuerySystemState(true);
 | |
| 
 | |
|                                         break;
 | |
|                                     }
 | |
|                                 }
 | |
|                                 else
 | |
|                                     //set module to enable state
 | |
|                                     processManager.enableModule(moduleName, oam::AUTO_OFFLINE);
 | |
| 
 | |
|                                 //restart module processes
 | |
|                                 int retry = 0;
 | |
| 
 | |
|                                 int ModuleProcMonWaitCount = 6;
 | |
| 
 | |
|                                 try
 | |
|                                 {
 | |
|                                     oam.getSystemConfig("ModuleProcMonWaitCount", ModuleProcMonWaitCount);
 | |
|                                 }
 | |
|                                 catch (...)
 | |
|                                 {
 | |
|                                     ModuleProcMonWaitCount = 6;
 | |
|                                 }
 | |
| 
 | |
|                                 for ( ; retry < ModuleProcMonWaitCount ; retry ++ )
 | |
|                                 {
 | |
|                                     // first, wait until module's ProcMon is ACTIVE
 | |
|                                     int opState = oam::ACTIVE;
 | |
| 
 | |
|                                     try
 | |
|                                     {
 | |
|                                         ProcessStatus procstat;
 | |
|                                         oam.getProcessStatus("ProcessMonitor", moduleName, procstat);
 | |
|                                         opState = procstat.ProcessOpState;
 | |
| 
 | |
|                                         if (opState != oam::ACTIVE)
 | |
|                                         {
 | |
|                                             log.writeLog(__LINE__, "Waiting for Module ProcMon to go ACTIVE: " + moduleName, LOG_TYPE_DEBUG);
 | |
|                                             sleep(5);
 | |
|                                             continue;
 | |
|                                         }
 | |
|                                     }
 | |
|                                     catch (exception& ex)
 | |
|                                     {
 | |
| //										string error = ex.what();
 | |
| //										log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | |
|                                         sleep(5);
 | |
|                                         continue;
 | |
|                                     }
 | |
|                                     catch (...)
 | |
|                                     {
 | |
| //										log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                                         sleep(5);
 | |
|                                         continue;
 | |
|                                     }
 | |
| 
 | |
|                                     //check and assign Elastic IP Address
 | |
|                                     int AmazonElasticIPCount = 0;
 | |
| 
 | |
|                                     try
 | |
|                                     {
 | |
|                                         oam.getSystemConfig("AmazonElasticIPCount", AmazonElasticIPCount);
 | |
|                                     }
 | |
|                                     catch (...)
 | |
|                                     {
 | |
|                                         AmazonElasticIPCount = 0;
 | |
|                                     }
 | |
| 
 | |
|                                     for ( int id = 1 ; id < AmazonElasticIPCount + 1 ; id++ )
 | |
|                                     {
 | |
|                                         string AmazonElasticModule = "AmazonElasticModule" + oam.itoa(id);
 | |
|                                         string ELmoduleName;
 | |
| 
 | |
|                                         try
 | |
|                                         {
 | |
|                                             oam.getSystemConfig(AmazonElasticModule, ELmoduleName);
 | |
|                                         }
 | |
|                                         catch (...) {}
 | |
| 
 | |
|                                         if ( ELmoduleName == moduleName )
 | |
|                                         {
 | |
|                                             //match found assign Elastic IP Address
 | |
|                                             string AmazonElasticIPAddr = "AmazonElasticIPAddr" + oam.itoa(id);
 | |
|                                             string ELIPaddress;
 | |
| 
 | |
|                                             try
 | |
|                                             {
 | |
|                                                 oam.getSystemConfig(AmazonElasticIPAddr, ELIPaddress);
 | |
|                                             }
 | |
|                                             catch (...) {}
 | |
| 
 | |
|                                             try
 | |
|                                             {
 | |
|                                                 oam.assignElasticIP(hostName, ELIPaddress);
 | |
|                                                 log.writeLog(__LINE__, "Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_DEBUG);
 | |
|                                             }
 | |
|                                             catch (...)
 | |
|                                             {
 | |
|                                                 log.writeLog(__LINE__, "Failed to Set Elastic IP Address: " + hostName + "/" + ELIPaddress, LOG_TYPE_ERROR);
 | |
|                                             }
 | |
| 
 | |
|                                             break;
 | |
|                                         }
 | |
|                                     }
 | |
| 
 | |
|                                     // next, stopmodule to start up clean
 | |
|                                     status = processManager.stopModule(moduleName, oam::FORCEFUL, false);
 | |
| 
 | |
|                                     if ( status == oam::API_SUCCESS )
 | |
|                                     {
 | |
|                                         string newStandbyModule = processManager.getStandbyModule();
 | |
| 
 | |
|                                         if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
 | |
|                                         {
 | |
|                                             processManager.setStandbyModule(newStandbyModule);
 | |
|                                         }
 | |
|                                         else
 | |
|                                         {
 | |
|                                             if ( newStandbyModule == "NONE")
 | |
|                                                 if ( moduleName.substr(0, MAX_MODULE_TYPE_SIZE) == "pm" )
 | |
|                                                     processManager.setStandbyModule(moduleName);
 | |
|                                         }
 | |
| 
 | |
|                                         DBRootConfigList::iterator pt = dbrootConfigList.begin();
 | |
| 
 | |
|                                         if (( DBRootStorageType == "DataRedundancy") && (*pt == 1))
 | |
|                                         {
 | |
|                                             log.writeLog(__LINE__, "stopModule, " + config.moduleName(), LOG_TYPE_DEBUG);
 | |
|                                             processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
 | |
|                                             processManager.switchParentOAMModule(moduleName);
 | |
|                                             processManager.stopProcess(config.moduleName(), "ProcessManager", oam::FORCEFUL, true);
 | |
|                                             break;
 | |
|                                         }
 | |
|                                     }
 | |
|                                     else
 | |
|                                     {
 | |
|                                         //stop failed, retry
 | |
|                                         log.writeLog(__LINE__, "stopModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
 | |
|                                         sleep(5);
 | |
|                                         continue;
 | |
|                                     }
 | |
| 
 | |
|                                     // next, startmodule
 | |
|                                     status = processManager.startModule(moduleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
 | |
| 
 | |
|                                     if ( status == oam::API_SUCCESS )
 | |
|                                         break;
 | |
| 
 | |
|                                     log.writeLog(__LINE__, "startModule, failed will retry: " + moduleName, LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                     //sleep and retry all over again
 | |
|                                     sleep (5);
 | |
|                                 } // end of the retry loop
 | |
| 
 | |
|                                 if ( retry < ModuleProcMonWaitCount )
 | |
|                                 {
 | |
|                                     // module successfully started
 | |
| 
 | |
|                                     //call dbrm control, need to resume before start so the getdbrmfiles halt doesn't hang
 | |
|                                     oam.dbrmctl("reload");
 | |
|                                     log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                     // resume the dbrm
 | |
|                                     oam.dbrmctl("resume");
 | |
|                                     log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                     //distribute config file
 | |
|                                     processManager.distributeConfigFile("system");
 | |
|                                     sleep(1);
 | |
| 
 | |
|                                     // if a PM module was started successfully, restart ACTIVE ExeMgr(s) / mysqld
 | |
|                                     if ( moduleName.find("pm") == 0 )
 | |
|                                     {
 | |
|                                         processManager.restartProcessType("ExeMgr", moduleName);
 | |
|                                     }
 | |
| 
 | |
|                                     string moduleType = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
 | |
| 
 | |
|                                     if ( MySQLRep == "y" )
 | |
|                                     {
 | |
|                                         if ( moduleType == "um" ||
 | |
|                                                 ( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM ) ||
 | |
|                                                 ( moduleType == "pm" && PMwithUM == "y") )
 | |
|                                         {
 | |
| 
 | |
|                                             //setup MySQL Replication for started modules
 | |
| 
 | |
|                                             log.writeLog(__LINE__, "Setup MySQL Replication for module recovering from outage on " + moduleName, LOG_TYPE_DEBUG);
 | |
|                                             DeviceNetworkList devicenetworklist;
 | |
|                                             DeviceNetworkConfig devicenetworkconfig;
 | |
|                                             devicenetworkconfig.DeviceName = moduleName;
 | |
|                                             devicenetworklist.push_back(devicenetworkconfig);
 | |
|                                             processManager.setMySQLReplication(devicenetworklist);
 | |
|                                         }
 | |
|                                     }
 | |
|                                     else
 | |
|                                     {
 | |
|                                         if ( moduleName.find("pm") == 0 )
 | |
|                                         {
 | |
|                                             processManager.restartProcessType("mysql", moduleName);
 | |
|                                             sleep(1);
 | |
|                                         }
 | |
|                                     }
 | |
| 
 | |
|                                     // if a PM module was started successfully, restart DDLProc/DMLProc
 | |
|                                     if ( moduleName.find("pm") == 0 )
 | |
|                                     {
 | |
|                                         processManager.restartProcessType("DDLProc", moduleName);
 | |
|                                         sleep(1);
 | |
|                                         processManager.restartProcessType("DMLProc", moduleName);
 | |
|                                     }
 | |
| 
 | |
|                                     //enable query stats
 | |
|                                     dbrm.setSystemQueryReady(true);
 | |
| 
 | |
|                                     //set query system state ready
 | |
|                                     processManager.setQuerySystemState(true);
 | |
| 
 | |
|                                     processManager.setSystemState(oam::ACTIVE);
 | |
| 
 | |
|                                     //clear count
 | |
|                                     moduleInfoList[moduleName] = 0;
 | |
|                                 }
 | |
|                                 else
 | |
|                                 {
 | |
|                                     // module failed to restart, place back in disabled state
 | |
|                                     //Log failure, issue alarm, set moduleOpState
 | |
|                                     Configuration config;
 | |
| 
 | |
|                                     //Issue an alarm
 | |
|                                     aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | |
| 
 | |
|                                     // if pm, move dbroots back to pm
 | |
|                                     if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | |
|                                             ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | |
|                                             ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | |
|                                     {
 | |
|                                         //move dbroots to other modules
 | |
|                                         try
 | |
|                                         {
 | |
|                                             log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
 | |
|                                             oam.autoMovePmDbroot(moduleName);
 | |
|                                             log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
 | |
|                                             //distribute config file
 | |
|                                             processManager.distributeConfigFile("system");
 | |
|                                         }
 | |
|                                         catch (exception& ex)
 | |
|                                         {
 | |
|                                             string error = ex.what();
 | |
|                                             log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
 | |
|                                         }
 | |
|                                         catch (...)
 | |
|                                         {
 | |
|                                             log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                                         }
 | |
|                                     }
 | |
| 
 | |
|                                     //set module to disable state
 | |
|                                     processManager.disableModule(moduleName, true);
 | |
| 
 | |
|                                     //call dbrm control
 | |
|                                     oam.dbrmctl("reload");
 | |
|                                     log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                     // resume the dbrm
 | |
|                                     oam.dbrmctl("resume");
 | |
|                                     log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                     log.writeLog(__LINE__, "Module failed to auto start: " + moduleName, LOG_TYPE_CRITICAL);
 | |
| 
 | |
|                                     if ( amazon )
 | |
|                                         processManager.setSystemState(oam::FAILED);
 | |
|                                     else
 | |
|                                         processManager.setSystemState(oam::ACTIVE);
 | |
| 
 | |
|                                     //enable query stats
 | |
|                                     dbrm.setSystemQueryReady(true);
 | |
| 
 | |
|                                     //set query system state ready
 | |
|                                     processManager.setQuerySystemState(true);
 | |
| 
 | |
|                                     //clear count
 | |
|                                     moduleInfoList[moduleName] = 0;
 | |
|                                 }
 | |
|                             }
 | |
| 
 | |
|                             break;
 | |
| 
 | |
|                         case oam::DOWN:
 | |
| 
 | |
|                             // if initial state, skip
 | |
|                             if (opState == oam::INITIAL)
 | |
|                                 break;
 | |
| 
 | |
|                             // if disabled and not amazon, skip
 | |
|                             if (opState == oam::AUTO_DISABLED && !amazon)
 | |
|                                 break;
 | |
| 
 | |
|                             log.writeLog(__LINE__, "module failed to respond to pings: " + moduleName, LOG_TYPE_WARNING);
 | |
| 
 | |
|                             //bump module ping failure counter
 | |
|                             moduleInfoList[moduleName]++;
 | |
| 
 | |
|                             if ( moduleName == config.OAMStandbyName() )
 | |
|                                 HOTSTANDBYACTIVE = false;
 | |
| 
 | |
|                             if (moduleInfoList[moduleName] == ModuleHeartbeatCount)
 | |
|                             {
 | |
|                                 // if LAN OUTAGE ACTIVE,skip module checks
 | |
|                                 if (LANOUTAGEACTIVE)
 | |
|                                     break;
 | |
| 
 | |
|                                 //Log failure, issue alarm, set moduleOpState
 | |
|                                 Configuration config;
 | |
|                                 log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL);
 | |
| 
 | |
|                                 //set query system state not ready
 | |
|                                 BRM::DBRM dbrm;
 | |
|                                 dbrm.setSystemQueryReady(false);
 | |
| 
 | |
|                                 processManager.setQuerySystemState(false);
 | |
| 
 | |
|                                 processManager.setSystemState(oam::BUSY_INIT);
 | |
| 
 | |
|                                 processManager.reinitProcessType("cpimport");
 | |
| 
 | |
|                                 // halt the dbrm
 | |
|                                 oam.dbrmctl("halt");
 | |
|                                 log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                 processManager.setSystemState(oam::BUSY_INIT);
 | |
| 
 | |
|                                 //string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1";
 | |
|                                 //system(cmd.c_str());
 | |
| 
 | |
|                                 //send notification
 | |
|                                 oam.sendDeviceNotification(moduleName, MODULE_DOWN);
 | |
| 
 | |
|                                 //Issue an alarm
 | |
|                                 aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
 | |
| 
 | |
|                                 //mark all processes running on module auto-offline
 | |
|                                 processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
 | |
| 
 | |
|                                 //set module to disable state
 | |
|                                 processManager.disableModule(moduleName, false);
 | |
| 
 | |
|                                 //call dbrm control
 | |
|                                 oam.dbrmctl("reload");
 | |
|                                 log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                 // if pm, move dbroots to other pms
 | |
|                                 if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | |
|                                         ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | |
|                                         ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | |
|                                 {
 | |
|                                     try
 | |
|                                     {
 | |
|                                         log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
 | |
|                                         oam.autoMovePmDbroot(moduleName);
 | |
|                                         log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
 | |
|                                         //distribute config file
 | |
|                                         processManager.distributeConfigFile("system");
 | |
|                                     }
 | |
|                                     catch (exception& ex)
 | |
|                                     {
 | |
|                                         string error = ex.what();
 | |
|                                         log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
 | |
|                                     }
 | |
|                                     catch (...)
 | |
|                                     {
 | |
|                                         log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                                     }
 | |
|                                 }
 | |
| 
 | |
|                                 // if Cloud Instance
 | |
|                                 // state = running, then instance is rebooting, monitor for recovery
 | |
|                                 // state = stopped, then try starting, if fail, remove/addmodule to launch new instance
 | |
|                                 // state = terminate or nothing, remove/addmodule to launch new instance
 | |
|                                 if ( amazon )
 | |
|                                 {
 | |
|                                     if ( moduleName.find("um") == 0 )
 | |
|                                     {
 | |
|                                         // resume the dbrm
 | |
|                                         oam.dbrmctl("resume");
 | |
|                                         log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                         //set recycle process
 | |
|                                         processManager.recycleProcess(moduleName);
 | |
|                                     }
 | |
| 
 | |
|                                     // return values = 'ip address' for running or rebooting, stopped or terminated
 | |
|                                     string currentIPAddr = oam.getEC2InstanceIpAddress(hostName);
 | |
| 
 | |
|                                     if ( currentIPAddr == "terminated")
 | |
|                                     {
 | |
|                                         //check if down module was Standby OAM, if so find another one
 | |
|                                         if ( moduleName == config.OAMStandbyName() )
 | |
|                                         {
 | |
| 
 | |
|                                             //set down module ProcessManager to AOS
 | |
|                                             processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
 | |
| 
 | |
|                                             //get another standby OAM module
 | |
|                                             string newStandbyModule = processManager.getStandbyModule();
 | |
| 
 | |
|                                             //send message to start new Standby Process-Manager, if needed
 | |
|                                             if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
 | |
|                                             {
 | |
|                                                 processManager.setStandbyModule(newStandbyModule);
 | |
|                                             }
 | |
|                                             else
 | |
|                                             {
 | |
|                                                 Config* sysConfig = Config::makeConfig();
 | |
| 
 | |
|                                                 // clear Standby OAM Module
 | |
|                                                 sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
 | |
|                                                 sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
 | |
| 
 | |
|                                                 //update Calpont Config table
 | |
|                                                 try
 | |
|                                                 {
 | |
|                                                     sysConfig->write();
 | |
|                                                 }
 | |
|                                                 catch (...)
 | |
|                                                 {
 | |
|                                                     log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
 | |
|                                                 }
 | |
|                                             }
 | |
|                                         }
 | |
| 
 | |
|                                         // remove/addmodule
 | |
|                                         log.writeLog(__LINE__, "Instance terminated, re-launching: " + hostName, LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                         // if pm, get assigned dbroots and detach EBS
 | |
|                                         DBRootConfigList dbrootConfigList;
 | |
|                                         int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
 | |
| 
 | |
|                                         if ( moduleName.find("pm") == 0 )
 | |
|                                         {
 | |
|                                             //get dbroot ids assigned to the PM
 | |
|                                             try
 | |
|                                             {
 | |
|                                                 oam.getPmDbrootConfig(moduleID, dbrootConfigList);
 | |
|                                             }
 | |
|                                             catch (exception& e)
 | |
|                                             {
 | |
|                                                 log.writeLog(__LINE__, "ERROR: getPmDbrootConfig error: " + moduleName, LOG_TYPE_DEBUG);
 | |
|                                             }
 | |
|                                         }
 | |
| 
 | |
|                                         DeviceNetworkList devicenetworklist;
 | |
|                                         DeviceNetworkConfig devicenetworkconfig;
 | |
|                                         HostConfig hostconfig;
 | |
| 
 | |
|                                         devicenetworkconfig.DeviceName = moduleName;
 | |
| 
 | |
|                                         if (cloud == "amazon-vpc")
 | |
|                                             hostconfig.IPAddr = ipAddr;
 | |
|                                         else
 | |
|                                             hostconfig.IPAddr = oam::UnassignedName;
 | |
| 
 | |
|                                         hostconfig.HostName = oam::UnassignedName;
 | |
|                                         hostconfig.NicID = 1;
 | |
|                                         devicenetworkconfig.hostConfigList.push_back(hostconfig);
 | |
| 
 | |
|                                         devicenetworklist.push_back(devicenetworkconfig);
 | |
| 
 | |
|                                         bool pass = true;
 | |
| 
 | |
|                                         for ( int addRetry = 0 ; addRetry < 5 ; addRetry++ )
 | |
|                                         {
 | |
|                                             //remove module
 | |
|                                             int ret = processManager.removeModule(devicenetworklist, false);
 | |
| 
 | |
|                                             if ( ret != oam::API_SUCCESS )
 | |
|                                             {
 | |
|                                                 log.writeLog(__LINE__, "Instance failed to remove, retry: " + moduleName, LOG_TYPE_DEBUG);
 | |
|                                             }
 | |
|                                             else
 | |
|                                             {
 | |
|                                                 pass = true;
 | |
|                                                 log.writeLog(__LINE__, "Instance removed, module: " + moduleName, LOG_TYPE_DEBUG);
 | |
|                                             }
 | |
| 
 | |
|                                             // add module
 | |
|                                             string password = oam::UnassignedName;
 | |
| 
 | |
|                                             try
 | |
|                                             {
 | |
|                                                 oam.getSystemConfig("rpw", password);
 | |
|                                             }
 | |
|                                             catch (...)
 | |
|                                             {
 | |
|                                                 password = oam::UnassignedName;
 | |
|                                             }
 | |
| 
 | |
|                                             ret = processManager.addModule(devicenetworklist, password, false);
 | |
| 
 | |
|                                             if ( ret != oam::API_SUCCESS )
 | |
|                                             {
 | |
|                                                 log.writeLog(__LINE__, "Instance failed to add, retry: " + moduleName, LOG_TYPE_CRITICAL);
 | |
|                                                 pass = false;
 | |
|                                             }
 | |
|                                             else
 | |
|                                             {
 | |
|                                                 pass = true;
 | |
|                                                 log.writeLog(__LINE__, "New Instance Launched for " + moduleName, LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                                 // if pm, config and attach EBS
 | |
|                                                 if ( moduleName.find("pm") == 0 && !dbrootConfigList.empty() )
 | |
|                                                 {
 | |
|                                                     try
 | |
|                                                     {
 | |
|                                                         oam.setPmDbrootConfig(moduleID, dbrootConfigList);
 | |
| 
 | |
|                                                         std::vector<std::string> dbrootList;
 | |
|                                                         DBRootConfigList::iterator pt1 = dbrootConfigList.begin();
 | |
| 
 | |
|                                                         for ( ; pt1 != dbrootConfigList.end() ; pt1++)
 | |
|                                                         {
 | |
|                                                             dbrootList.push_back(oam.itoa(*pt1));
 | |
|                                                         }
 | |
| 
 | |
|                                                         //attach EBS
 | |
|                                                         try
 | |
|                                                         {
 | |
|                                                             oam.amazonReattach(moduleName, dbrootList, true);
 | |
|                                                             pass = true;
 | |
|                                                             break;
 | |
|                                                         }
 | |
|                                                         catch (exception& e)
 | |
|                                                         {
 | |
|                                                             log.writeLog(__LINE__, "ERROR: amazonReattach error on " + moduleName, LOG_TYPE_ERROR);
 | |
|                                                             pass = false;
 | |
|                                                         }
 | |
|                                                     }
 | |
|                                                     catch (exception& e)
 | |
|                                                     {
 | |
|                                                         log.writeLog(__LINE__, "ERROR: setPmDbrootConfig error on " + moduleName, LOG_TYPE_ERROR);
 | |
|                                                         pass = false;
 | |
|                                                     }
 | |
|                                                 }
 | |
|                                                 else
 | |
|                                                 {
 | |
|                                                     pass = true;
 | |
|                                                     break;
 | |
|                                                 }
 | |
|                                             }
 | |
| 
 | |
|                                             if (pass)
 | |
|                                                 break;
 | |
|                                         }
 | |
| 
 | |
|                                         if (pass)
 | |
|                                             //Set the module state so it will be brought back up
 | |
|                                             processManager.setModuleState(moduleName, oam::AUTO_DISABLED);
 | |
|                                         else
 | |
|                                         {
 | |
|                                             //new instance failed to get added
 | |
|                                             //remove and try auto moving dbroots to other pms
 | |
|                                             processManager.removeModule(devicenetworklist, false);
 | |
| 
 | |
|                                             // if pm, move dbroots to other pms
 | |
|                                             if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
 | |
|                                                     ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
 | |
|                                                     ( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") )
 | |
|                                             {
 | |
|                                                 try
 | |
|                                                 {
 | |
|                                                     log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
 | |
|                                                     oam.autoMovePmDbroot(moduleName);
 | |
|                                                     log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
 | |
|                                                     //distribute config file
 | |
|                                                     processManager.distributeConfigFile("system");
 | |
|                                                 }
 | |
|                                                 catch (exception& ex)
 | |
|                                                 {
 | |
|                                                     string error = ex.what();
 | |
|                                                     log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
 | |
|                                                 }
 | |
|                                                 catch (...)
 | |
|                                                 {
 | |
|                                                     log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                                                 }
 | |
|                                             }
 | |
| 
 | |
|                                             //set recycle process
 | |
|                                             processManager.recycleProcess(moduleName);
 | |
| 
 | |
|                                             //enable query stats
 | |
|                                             dbrm.setSystemQueryReady(true);
 | |
| 
 | |
|                                             //set query system state ready
 | |
|                                             processManager.setQuerySystemState(true);
 | |
| 
 | |
|                                             sleep(2);
 | |
|                                             processManager.setSystemState(oam::ACTIVE);
 | |
|                                         }
 | |
|                                     }
 | |
| 
 | |
|                                     if ( moduleName.find("pm") == 0 )
 | |
|                                     {
 | |
|                                         // resume the dbrm
 | |
|                                         oam.dbrmctl("resume");
 | |
|                                         log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                         //enable query stats
 | |
|                                         dbrm.setSystemQueryReady(true);
 | |
| 
 | |
|                                         //set query system state ready
 | |
|                                         processManager.setQuerySystemState(true);
 | |
|                                     }
 | |
|                                 }
 | |
|                                 else
 | |
|                                 {
 | |
|                                     // non-amazon
 | |
|                                     // resume the dbrm
 | |
|                                     oam.dbrmctl("resume");
 | |
|                                     log.writeLog(__LINE__, "'dbrmctl resume' done", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                     //set recycle process
 | |
|                                     processManager.recycleProcess(moduleName);
 | |
| 
 | |
|                                     //enable query stats
 | |
|                                     dbrm.setSystemQueryReady(true);
 | |
| 
 | |
|                                     //set query system state ready
 | |
|                                     processManager.setQuerySystemState(true);
 | |
| 
 | |
|                                     sleep(2);
 | |
| 
 | |
|                                     //check if down module was Standby OAM, if so find another one
 | |
|                                     if ( moduleName == config.OAMStandbyName() )
 | |
|                                     {
 | |
| 
 | |
|                                         //set down module ProcessManager to AOS
 | |
|                                         processManager.setProcessState(moduleName, "ProcessManager", oam::AUTO_OFFLINE, 0);
 | |
| 
 | |
|                                         //get another standby OAM module
 | |
|                                         string newStandbyModule = processManager.getStandbyModule();
 | |
| 
 | |
|                                         //send message to start new Standby Process-Manager, if needed
 | |
|                                         if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
 | |
|                                         {
 | |
|                                             processManager.setStandbyModule(newStandbyModule);
 | |
|                                         }
 | |
|                                         else
 | |
|                                         {
 | |
|                                             Config* sysConfig = Config::makeConfig();
 | |
| 
 | |
|                                             // clear Standby OAM Module
 | |
|                                             sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
 | |
|                                             sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
 | |
| 
 | |
|                                             //update Calpont Config table
 | |
|                                             try
 | |
|                                             {
 | |
|                                                 sysConfig->write();
 | |
|                                             }
 | |
|                                             catch (...)
 | |
|                                             {
 | |
|                                                 log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
 | |
|                                             }
 | |
|                                         }
 | |
|                                     }
 | |
|                                 }
 | |
| 
 | |
|                                 //start SIMPLEX runtype processes on a SIMPLEX runtype module
 | |
|                                 string moduletype = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
 | |
| 
 | |
|                                 try
 | |
|                                 {
 | |
|                                     oam.getSystemConfig(moduletype, moduletypeconfig);
 | |
|                                 }
 | |
|                                 catch (exception& ex)
 | |
|                                 {
 | |
|                                     string error = ex.what();
 | |
|                                     log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
|                                 }
 | |
|                                 catch (...)
 | |
|                                 {
 | |
|                                     log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                                 }
 | |
| 
 | |
|                                 if ( moduletypeconfig.RunType == SIMPLEX )
 | |
|                                 {
 | |
|                                     DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
 | |
| 
 | |
|                                     for ( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
 | |
|                                     {
 | |
|                                         string launchModuleName = (*pt).DeviceName;
 | |
|                                         string launchModuletype = launchModuleName.substr(0, MAX_MODULE_TYPE_SIZE);
 | |
| 
 | |
|                                         if ( moduletype != launchModuletype )
 | |
|                                             continue;
 | |
| 
 | |
|                                         //skip if active pm module (local module)
 | |
|                                         if ( launchModuleName == config.moduleName() )
 | |
|                                             continue;
 | |
| 
 | |
|                                         if ( moduleName != launchModuleName )
 | |
|                                         {
 | |
|                                             //check if module is active before starting any SIMPLEX STANDBY apps
 | |
|                                             try
 | |
|                                             {
 | |
|                                                 int launchopState = oam::ACTIVE;
 | |
|                                                 bool degraded;
 | |
|                                                 oam.getModuleStatus(launchModuleName, launchopState, degraded);
 | |
| 
 | |
|                                                 if (launchopState != oam::ACTIVE && launchopState != oam::STANDBY )
 | |
|                                                 {
 | |
|                                                     continue;
 | |
|                                                 }
 | |
|                                             }
 | |
|                                             catch (exception& ex)
 | |
|                                             {
 | |
| //												string error = ex.what();
 | |
| //												log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
 | |
|                                             }
 | |
|                                             catch (...)
 | |
|                                             {
 | |
| //												log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                                             }
 | |
| 
 | |
|                                             int status;
 | |
|                                             log.writeLog(__LINE__, "Starting up STANDBY process on module " + launchModuleName, LOG_TYPE_DEBUG);
 | |
| 
 | |
|                                             for ( int j = 0 ; j < 20 ; j ++ )
 | |
|                                             {
 | |
|                                                 status = processManager.startModule(launchModuleName, oam::FORCEFUL, oam::AUTO_OFFLINE);
 | |
| 
 | |
|                                                 if ( status == API_SUCCESS)
 | |
|                                                     break;
 | |
|                                             }
 | |
| 
 | |
|                                             log.writeLog(__LINE__, "pingDeviceThread: ACK received from '" + launchModuleName + "' Process-Monitor, return status = " + oam.itoa(status), LOG_TYPE_DEBUG);
 | |
|                                         }
 | |
|                                     }
 | |
|                                 }
 | |
|                             }
 | |
| 
 | |
|                             break;
 | |
|                     }
 | |
|                 }
 | |
|             } //end of for loop
 | |
|         }
 | |
| 
 | |
|         // check and take action if LAN outage is flagged
 | |
|         if (LANOUTAGESUPPORT && !LANOUTAGEACTIVE && LOCALNICDOWN)
 | |
|         {
 | |
|             log.writeLog(__LINE__, "LAN Failure detected", LOG_TYPE_CRITICAL);
 | |
| 
 | |
|             oam.sendDeviceNotification(config.moduleName(), START_PM_MASTER_DOWN);
 | |
| 
 | |
|             LANOUTAGEACTIVE = true;
 | |
| 
 | |
|             log.writeLog(__LINE__, "Kill any cpimport running", LOG_TYPE_INFO);
 | |
|             system("pkill -9 cpimport");
 | |
| 
 | |
|             //request stop of local module
 | |
|             int status = processManager.stopModule(config.moduleName(), oam::FORCEFUL, false);
 | |
| 
 | |
|             if ( status != oam::API_SUCCESS )
 | |
|                 log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
 | |
| 
 | |
|             //stop snmptrap daemon process
 | |
|             processManager.stopProcess(config.moduleName(), "SNMPTrapDaemon", oam::FORCEFUL, false);
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|             if ( LANOUTAGEACTIVE && HOTSTANDBYACTIVE && !LOCALNICDOWN)
 | |
|             {
 | |
| //				pthread_mutex_unlock(&THREAD_LOCK);
 | |
|                 LANOUTAGEACTIVE = false;
 | |
| 
 | |
|                 log.writeLog(__LINE__, "LAN Failure recovery");
 | |
| 
 | |
|                 //check if this module is still active according to the last known hot standby module
 | |
|                 ByteStream msg;
 | |
|                 ByteStream::byte requestID = GETPARENTOAMMODULE;
 | |
|                 msg << requestID;
 | |
| 
 | |
|                 string parentOAMModule = processManager.sendMsgProcMon1( config.OAMStandbyName(), msg, requestID );
 | |
| 
 | |
|                 if ( parentOAMModule == config.moduleName() ||
 | |
|                         parentOAMModule == "FAILED" )
 | |
|                 {
 | |
| 
 | |
|                     //restart/reinit these processes in case they marked any PrimProcs offline
 | |
|                     processManager.restartProcessType("ExeMgr");
 | |
|                     processManager.reinitProcessType("DDLProc");
 | |
|                     processManager.reinitProcessType("DMLProc");
 | |
|                 }
 | |
|                 else
 | |
|                 {
 | |
|                     //send message to local Process Monitor to run coldStandby
 | |
|                     ByteStream msg;
 | |
|                     ByteStream::byte requestID = OAMPARENTCOLD;
 | |
| 
 | |
|                     msg << requestID;
 | |
| 
 | |
|                     int returnStatus = processManager.sendMsgProcMon( config.moduleName(), msg, requestID );
 | |
|                     log.writeLog(__LINE__, "sent OAM Parent Cold message to local Process-Monitor, status: " + oam.itoa(returnStatus), LOG_TYPE_DEBUG);
 | |
| 
 | |
|                     //request stop of local module
 | |
|                     int status = processManager.stopModule(config.moduleName(), oam::INSTALL, false);
 | |
| 
 | |
|                     if ( status != oam::API_SUCCESS )
 | |
|                         log.writeLog(__LINE__, "stopmodule failed", LOG_TYPE_ERROR);
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         //
 | |
|         // ping ext devices
 | |
|         //
 | |
| 
 | |
|         // read each time to catch updates
 | |
|         systemextdeviceconfig.extdeviceconfig.clear();
 | |
| 
 | |
|         try
 | |
|         {
 | |
|             oam.getSystemConfig(systemextdeviceconfig);
 | |
|         }
 | |
|         catch (exception& ex)
 | |
|         {
 | |
|             string error = ex.what();
 | |
| //			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
| //			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|         }
 | |
| 
 | |
|         for ( unsigned int i = 0 ; i < systemextdeviceconfig.Count ; i++ )
 | |
|         {
 | |
|             string extDeviceName = systemextdeviceconfig.extdeviceconfig[i].Name;
 | |
|             string ipAddr = systemextdeviceconfig.extdeviceconfig[i].IPAddr;
 | |
| 
 | |
|             int opState = oam::ACTIVE;
 | |
| 
 | |
|             try
 | |
|             {
 | |
|                 oam.getExtDeviceStatus(extDeviceName, opState);
 | |
|             }
 | |
|             catch (exception& ex)
 | |
|             {
 | |
| //				string error = ex.what();
 | |
| //				log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: " + error, LOG_TYPE_ERROR);
 | |
|             }
 | |
|             catch (...)
 | |
|             {
 | |
| //				log.writeLog(__LINE__, "EXCEPTION ERROR on getExtDeviceStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|             }
 | |
| 
 | |
|             cmd = cmdLine + ipAddr + cmdOption;
 | |
|             rtnCode = system(cmd.c_str());
 | |
| 
 | |
|             switch (WEXITSTATUS(rtnCode))
 | |
|             {
 | |
|                 case 0:
 | |
| 
 | |
|                     //Switch ACKed ping, check whether an alarm has been issued
 | |
|                     if (extDeviceInfoList[extDeviceName] >= ModuleHeartbeatCount)
 | |
|                     {
 | |
|                         aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, CLEAR);
 | |
| 
 | |
|                     }
 | |
| 
 | |
|                     extDeviceInfoList[extDeviceName] = 0;
 | |
| 
 | |
|                     if (opState != oam::ACTIVE)
 | |
|                     {
 | |
|                         //Set the switch state to active
 | |
|                         processManager.setExtdeviceState(extDeviceName, oam::ACTIVE);
 | |
|                     }
 | |
| 
 | |
|                     break;
 | |
| 
 | |
|                 default:
 | |
|                     //extDevice failed to respond to ping
 | |
|                     log.writeLog(__LINE__, "extDevice failed to respond to ping: " + extDeviceName, LOG_TYPE_WARNING);
 | |
|                     extDeviceInfoList[extDeviceName]++;
 | |
| 
 | |
|                     if (extDeviceInfoList[extDeviceName] == ModuleHeartbeatCount)
 | |
|                     {
 | |
|                         //Log failure, issue alarm, set extDeviceOpState
 | |
|                         log.writeLog(__LINE__, "extDevice is down: " + extDeviceName, LOG_TYPE_CRITICAL);
 | |
| 
 | |
|                         processManager.setExtdeviceState(extDeviceName, oam::AUTO_OFFLINE);
 | |
| 
 | |
|                         //Issue an alarm
 | |
|                         aManager.sendAlarmReport(extDeviceName.c_str(), EXT_DEVICE_DOWN_AUTO, SET);
 | |
|                     }
 | |
| 
 | |
|                     break;
 | |
|             }
 | |
|         } //end of for loop
 | |
| 
 | |
|         // double check to make sure the system status is ACTIVE if all module statuses are ACTIVE
 | |
|         try
 | |
|         {
 | |
|             if (dbrm.isDBRMReady())
 | |
|             {
 | |
|                 int systemReady = dbrm.getSystemReady();    // -1 == fail, 0 == not ready, 1 == ready
 | |
| 
 | |
|                 if (systemReady > 0)
 | |
|                 {
 | |
|                     bool updateActive = true;
 | |
| 
 | |
|                     for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
 | |
|                     {
 | |
|                         int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
 | |
| 
 | |
|                         if ( moduleCount == 0)
 | |
|                             continue;
 | |
| 
 | |
|                         DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
 | |
| 
 | |
|                         for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
 | |
|                         {
 | |
|                             string moduleName = (*pt).DeviceName;
 | |
| 
 | |
|                             int opState = oam::ACTIVE;
 | |
| 
 | |
|                             try
 | |
|                             {
 | |
|                                 bool degraded;
 | |
|                                 oam.getModuleStatus(moduleName, opState, degraded);
 | |
| 
 | |
|                                 if (opState == oam::ACTIVE ||
 | |
|                                         opState == oam::DEGRADED ||
 | |
|                                         opState == oam::MAN_DISABLED ||
 | |
|                                         opState == oam::AUTO_DISABLED )
 | |
|                                     continue;
 | |
| 
 | |
|                                 updateActive = false;
 | |
|                             }
 | |
|                             catch (exception& ex)
 | |
|                             {
 | |
|                                 //                            string error = ex.what();
 | |
|                                 //                          log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
 | |
|                             }
 | |
|                             catch (...)
 | |
|                             {
 | |
|                                 //                            log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                             }
 | |
|                         }
 | |
|                     }
 | |
| 
 | |
|                     if (updateActive)
 | |
|                     {
 | |
| //						log.writeLog(__LINE__, "Modules are ACTIVE, check system state ", LOG_TYPE_DEBUG);
 | |
| 
 | |
|                         string PrimaryUMModuleName;
 | |
| 
 | |
|                         try
 | |
|                         {
 | |
|                             oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
 | |
|                         }
 | |
|                         catch (...) {}
 | |
| 
 | |
| //						log.writeLog(__LINE__, "PrimaryUMModuleName = " + PrimaryUMModuleName, LOG_TYPE_DEBUG);
 | |
| 
 | |
|                         ProcessStatus DMLprocessstatus;
 | |
| 
 | |
|                         try
 | |
|                         {
 | |
|                             oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
 | |
|                         }
 | |
|                         catch (exception& ex)
 | |
|                         {
 | |
|                             //						string error = ex.what();
 | |
|                             //						log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | |
|                         }
 | |
|                         catch (...)
 | |
|                         {
 | |
|                             //						log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                         }
 | |
| 
 | |
| //						log.writeLog(__LINE__, "DMLPROC STATUS = " + oamState[DMLprocessstatus.ProcessOpState], LOG_TYPE_DEBUG);
 | |
| 
 | |
|                         if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
 | |
|                         {
 | |
| 
 | |
|                             //set the system status if a change has occurred
 | |
|                             SystemStatus systemstatus;
 | |
| 
 | |
|                             try
 | |
|                             {
 | |
|                                 oam.getSystemStatus(systemstatus);
 | |
|                             }
 | |
|                             catch (exception& ex)
 | |
|                             {
 | |
|                                 //							string error = ex.what();
 | |
|                                 //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
 | |
|                             }
 | |
|                             catch (...)
 | |
|                             {
 | |
|                                 //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                             }
 | |
| 
 | |
|                             if ( systemstatus.SystemOpState != oam::ACTIVE )
 | |
|                             {
 | |
|                                 processManager.setSystemState(oam::ACTIVE);
 | |
|                             }
 | |
|                         }
 | |
| 
 | |
|                         if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
 | |
|                         {
 | |
| 
 | |
|                             //set the system status if a change has occurred
 | |
|                             SystemStatus systemstatus;
 | |
| 
 | |
|                             try
 | |
|                             {
 | |
|                                 oam.getSystemStatus(systemstatus);
 | |
|                             }
 | |
|                             catch (exception& ex)
 | |
|                             {
 | |
|                                 //							string error = ex.what();
 | |
|                                 //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
 | |
|                             }
 | |
|                             catch (...)
 | |
|                             {
 | |
|                                 //							log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
|                             }
 | |
| 
 | |
|                             if ( systemstatus.SystemOpState != oam::BUSY_INIT )
 | |
|                             {
 | |
|                                 processManager.setSystemState(oam::BUSY_INIT);
 | |
|                             }
 | |
|                         }
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         catch (...)
 | |
|         {
 | |
|         }
 | |
| 
 | |
|         //go sleep for a bit
 | |
|         int sleepTime = ModuleHeartbeatPeriod / 10;
 | |
| 
 | |
|         if (!enableModuleMonitor && systemextdeviceconfig.Count == 0)
 | |
|             sleep(60);
 | |
|         else
 | |
|             sleep(sleepTime);
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| /******************************************************************************************
 | |
| * @brief      hdfsActiveAlarmsPushingThread
 | |
| *
 | |
| * purpose:    Push an image of ActiveAlarms to HDFS for non-OAMParentModule to view.
 | |
| *
 | |
| ******************************************************************************************/
 | |
| static void hdfsActiveAlarmsPushingThread()
 | |
| {
 | |
|     boost::filesystem::path filePath(ACTIVE_ALARM_FILE);
 | |
|     boost::filesystem::path dirPath = filePath.parent_path();
 | |
|     string dirName = boost::filesystem::canonical(dirPath).string();
 | |
| 
 | |
|     if (boost::filesystem::exists("/etc/pdsh/machines"))
 | |
|     {
 | |
|         string cpCmd =  "pdcp -a -x " + localHostName + " " + ACTIVE_ALARM_FILE + " " + dirName +
 | |
|                         " > /dev/null 2>&1";
 | |
|         string rmCmd =  "pdsh -a -x " + localHostName + " rm -f " + ACTIVE_ALARM_FILE +
 | |
|                         " > /dev/null 2>&1";
 | |
| 
 | |
|         while (1)
 | |
|         {
 | |
|             if (boost::filesystem::exists(filePath))
 | |
|                 system(cpCmd.c_str());
 | |
|             else
 | |
|                 system(rmCmd.c_str());
 | |
| 
 | |
|             sleep(ACTIVE_ALARMS_PUSHING_INTERVAL);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return;
 | |
| }
 | |
| 
 | |
| 
 | |
| /*****************************************************************************************
 | |
| * @brief	Processor Heartbeat Msg Thread
 | |
| *
 | |
| * purpose:	Read Heartbeat Messages from other Processes
 | |
| *
 | |
| *****************************************************************************************/
 | |
| /*
 | |
| static void heartbeatMsgThread()
 | |
| {
 | |
| 	ProcessLog log;
 | |
| 	Configuration config;
 | |
| 	ProcessManager processManager(config, log);
 | |
| 
 | |
| 	//
 | |
| 	//waiting for request
 | |
| 	//
 | |
| 	ByteStream receivedMSG;
 | |
| 	IOSocket fIos;
 | |
| 
 | |
| 	for (;;)
 | |
| 	{
 | |
| 		try
 | |
| 		{
 | |
| 			MessageQueueServer procmgr("ProcHeartbeatControl");
 | |
| 			for (;;)
 | |
| 			{
 | |
| 				try
 | |
| 				{
 | |
| 					fIos = procmgr.accept();
 | |
| 					receivedMSG = fIos.read();
 | |
| 
 | |
| 					if (receivedMSG.length() > 0) {
 | |
| 						processManager.processMSG(fIos, receivedMSG);
 | |
| 					}
 | |
| 				}
 | |
| 				catch (exception& ex)
 | |
| 				{
 | |
| 					string error = ex.what();
 | |
| 					log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: " + error, LOG_TYPE_ERROR);
 | |
| 				}
 | |
| 				catch(...)
 | |
| 				{
 | |
| 					log.writeLog(__LINE__, "EXCEPTION ERROR on ProcHeartbeatControl.accept: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
| 				}
 | |
| 
 | |
| 				fIos.close();
 | |
| 			}
 | |
| 		}
 | |
|         catch (exception& ex)
 | |
|         {
 | |
| 			string error = ex.what();
 | |
| 			log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcMgr:" + error, LOG_TYPE_ERROR);
 | |
| 			// takes 2 - 4 minutes to free sockets, sleep and retry
 | |
| 			sleep(60);
 | |
|         }
 | |
|         catch(...)
 | |
|         {
 | |
| 			log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueServer for ProcHeartbeatControl: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
| 			// takes 2 - 4 minutes to free sockets, sleep and retry
 | |
| 			sleep(60);
 | |
|         }
 | |
| 	}
 | |
| 
 | |
| }
 | |
| */
 | |
| 
 | |
| /*****************************************************************************************
 | |
| * @brief	Processor Heartbeat Thread
 | |
| *
 | |
| * purpose:	Check Heartbeat Messages from other Processes
 | |
| *
 | |
| *****************************************************************************************/
 | |
| /*
 | |
| static void heartbeatProcessThread()
 | |
| {
 | |
| 	ProcessLog log;
 | |
| 	Configuration config;
 | |
| 	ProcessManager processManager(config, log);
 | |
| 	Oam oam;
 | |
| 	ALARMManager aManager;
 | |
| 
 | |
| 	int processHeartbeatPeriod=60;	//default value to 60 seconds
 | |
| 
 | |
| 	log.writeLog(__LINE__, "Thread Launched: Process Heartbeat!!!");
 | |
| 
 | |
| 	while (true)
 | |
| 	{
 | |
| 		//
 | |
| 		// check and report on registered processes not sending heartbeats
 | |
| 		//
 | |
| 
 | |
| 		// get process heartbeat period
 | |
| 		try {
 | |
| 			oam.getSystemConfig("ProcessHeartbeatPeriod", processHeartbeatPeriod);
 | |
| 			processHeartbeatPeriod = processHeartbeatPeriod * 60;
 | |
| 		}
 | |
| 		catch (exception& ex)
 | |
| 		{
 | |
| 			string error = ex.what();
 | |
| 			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
 | |
| 		}
 | |
| 		catch(...)
 | |
| 		{
 | |
| 			log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
| 		}
 | |
| 
 | |
| 		Oam oam;
 | |
| 		log.writeLog(__LINE__, "Process Heartbeat check started, Heartbeat period is " + oam.itoa(processHeartbeatPeriod), LOG_TYPE_DEBUG);
 | |
| 
 | |
| 		sleep(processHeartbeatPeriod);
 | |
| 
 | |
| 		HeartBeatProcList::iterator list = hbproclist.begin();
 | |
| 		for( ; list != hbproclist.end() ; list++)
 | |
| 		{
 | |
| 			string moduleName = (*list).ModuleName;
 | |
| 			string processName = (*list).ProcessName;
 | |
| 			int id = (*list).ID;
 | |
| 
 | |
| 			// get Process state and only check if ACTIVE
 | |
| 			ProcessStatus procstat;
 | |
| 			try{
 | |
| 				oam.getProcessStatus(processName, moduleName, procstat);
 | |
| 			}
 | |
| 			catch (exception& ex)
 | |
| 			{
 | |
| 				string error = ex.what();
 | |
| 				log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
 | |
| 				procstat.ProcessOpState = oam::MAN_OFFLINE;
 | |
| 			}
 | |
| 			catch(...)
 | |
| 			{
 | |
| 				log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
 | |
| 				procstat.ProcessOpState = oam::MAN_OFFLINE;
 | |
| 			}
 | |
| 
 | |
| 			if ( procstat.ProcessOpState == oam::ACTIVE ) {
 | |
| 				// skip testing if Heartbeat is disabled
 | |
| 				if( processHeartbeatPeriod != -1 ) {
 | |
| //log.writeLog(__LINE__, "Heartbeat: Process being monitored: " + moduleName + " / " + processName + " / " + oam.itoa(id), LOG_TYPE_DEBUG);
 | |
| 					if ( !(*list).receiveFlag ) {
 | |
| 						// got a missing heartbeat, request a restart on the process
 | |
| 						log.writeLog(__LINE__, "heartbeatProcessThread: Failure from process " + moduleName + " / " + processName+ " / " + oam.itoa(id), LOG_TYPE_WARNING);
 | |
| 
 | |
| 						oam.restartProcess(moduleName, processName, FORCEFUL, ACK_NO);
 | |
| 						(*list).receiveFlag = true;
 | |
| 						// reset all other entries for this process
 | |
| 						HeartBeatProcList::iterator list1 = hbproclist.begin();
 | |
| 						for( ; list1 != hbproclist.end() ; list1++)
 | |
| 						{
 | |
| 							string moduleName1 = (*list1).ModuleName;
 | |
| 							string processName1 = (*list1).ProcessName;
 | |
| 							if ( moduleName == moduleName1 && processName == processName1 )
 | |
| 								(*list1).receiveFlag = true;
 | |
| 						}
 | |
| 					}
 | |
| 					else
 | |
| 						// reset receive heartbeat indication flag
 | |
| 						(*list).receiveFlag = false;
 | |
| 				}
 | |
| 				else
 | |
| 					// heartbeat is disabled
 | |
| 					(*list).receiveFlag=true;
 | |
| 			}
 | |
| 			else
 | |
| 			{	// registered process not active, remove from list
 | |
| 				hbproclist.erase(list);
 | |
| 				log.writeLog(__LINE__, "Removing OOS Process from Heartbeat Monitor list: " + moduleName + " / " + processName+ " / " + oam.itoa(id));
 | |
| 				break;
 | |
| 			}
 | |
| 		}
 | |
| 	} // end of while forever loop
 | |
| }
 | |
| */
 | |
| // vim:ts=4 sw=4:
 |