1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-09-03 23:42:03 +03:00
Files
mariadb-columnstore-engine/procmgr/processmanager.cpp
2020-11-17 15:03:10 +03:00

11263 lines
383 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
Copyright (C) 2016 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/******************************************************************************************
* $Id: processmanager.cpp 2216 2013-08-13 14:34:10Z dhill $
*
******************************************************************************************/
//#define NDEBUG
#include <cassert>
#include "columnstoreversion.h"
#include "mcsconfig.h"
#include "processmanager.h"
#include "installdir.h"
#include "dbrm.h"
#include "cacheutils.h"
#include "ddlcleanuputil.h"
#include "IDBFileSystem.h"
#include "IDBDataFile.h"
#include "IDBPolicy.h"
#include <boost/filesystem/path.hpp>
using namespace cacheutils;
using namespace std;
using namespace processmanager;
using namespace messageqcpp;
using namespace oam;
using namespace logging;
using namespace alarmmanager;
using namespace config;
using namespace idbdatafile;
// File-scope mutexes defined in this translation unit.
// NOTE(review): what each lock protects is not visible in this section -- confirm at the use sites.
pthread_mutex_t STATUS_LOCK;
pthread_mutex_t THREAD_LOCK;
// Settings and flags declared extern here; their definitions are not in this section.
extern string cloud;
extern bool amazon;
extern bool runStandby;
extern bool MsgThreadActive;
extern string iface_name;
extern string PMInstanceType;
extern string UMInstanceType;
extern string DataRedundancyConfig;
extern bool rootUser;
extern string USER;
// When set, the STOPSYSTEM and SHUTDOWNSYSTEM handlers in processMSG take a separate code path.
extern bool HDFS;
// Used by the SHUTDOWNSYSTEM handler as the host excluded from its "pdsh -a -x" command.
extern string localHostName;
extern string PMwithUM;
extern string AmazonPMFailover;
extern string tmpLogDir;
typedef map<string, int> moduleList;
extern moduleList moduleInfoList;
// Assigned by the Configuration constructor from field 4 of Oam::getModuleInfo()
// (the same value it stores in fOAMParentModuleFlag).
bool gOAMParentModuleFlag;
// Cleared and refilled by the STARTMODULE/RESTARTMODULE handlers, then handed to startSystemThread.
oam::DeviceNetworkList startdevicenetworklist;
// Status codes of the worker threads, all starting out as API_SUCCESS.
// processMSG reads startsystemthreadStatus after joining the start-system thread.
int upgradethreadStatus = oam::API_SUCCESS;
int startsystemthreadStatus = oam::API_SUCCESS;
int stopsystemthreadStatus = oam::API_SUCCESS;
int startmodulethreadStatus = oam::API_SUCCESS;
// Set to true by STOPSYSTEM for the duration of the stop (reset to false when it finishes and
// by STARTMODULE/RESTARTMODULE) so that running start-system threads exit.
bool startsystemthreadStop = false;
// When true, STOPSYSTEM sleeps 5 seconds before proceeding.
bool startsystemthreadRunning = false;
// NOTE(review): the remaining globals are not referenced in this section; their roles
// should be confirmed where they are used.
string gdownActiveOAMModule;
vector<string> downModuleList;
bool startFailOver = false;
string masterLogFile = oam::UnassignedName;
string masterLogPos = oam::UnassignedName;
HeartBeatProcList hbproclist;
namespace processmanager
{
/******************************************************************************************
* @brief Configuration Constructor
*
* purpose: Configuration Constructor
*
******************************************************************************************/
// Loads the local module's identity from Oam::getModuleInfo() into the
// Configuration members and mirrors the parent-module flag into the global
// gOAMParentModuleFlag. Any std::exception raised while doing so is reported
// on stdout and terminates the process with exit(-1).
Configuration::Configuration()
{
    Oam oamApi;
    oamModuleInfo_t moduleInfo;

    try
    {
        moduleInfo = oamApi.getModuleInfo();

        // Local module identity.
        flocalModuleName = boost::get<0>(moduleInfo);
        flocalModuleType = boost::get<1>(moduleInfo);
        flocalModuleID = boost::get<2>(moduleInfo);

        // OAM parent module.
        fOAMParentModuleName = boost::get<3>(moduleInfo);
        fOAMParentModuleFlag = boost::get<4>(moduleInfo);

        fserverInstallType = boost::get<5>(moduleInfo);

        // OAM standby module.
        fOAMStandbyModuleName = boost::get<6>(moduleInfo);
        fOAMStandbyModuleFlag = boost::get<7>(moduleInfo);

        // Keep the file-scope copy of the parent flag in step with the member.
        gOAMParentModuleFlag = boost::get<4>(moduleInfo);
    }
    catch (exception& err)
    {
        cout << endl << "ProcMgr Construct Error = " << err.what() << endl;
        exit(-1);
    }
}
/******************************************************************************************
* @brief Configuration Destructor
*
* purpose: Configuration Destructor
*
******************************************************************************************/
// Intentionally empty: no explicit cleanup is performed here.
Configuration::~Configuration() {}
/******************************************************************************************
* @brief getstateInfo
*
* purpose: Return the module opstate tag
*
******************************************************************************************/
// Returns the entry stored in stateInfoList under moduleName.
// The lookup goes through operator[] on stateInfoList.
string Configuration::getstateInfo(string moduleName)
{
    const string& stateTag = stateInfoList[moduleName];
    return stateTag;
}
/******************************************************************************************
* @brief ProcessLog Constructor
*
* purpose: ProcessLog Constructor
*
******************************************************************************************/
// Intentionally empty: construction performs no work of its own.
ProcessLog::ProcessLog() {}
/******************************************************************************************
* @brief ProcessLog Destructor
*
* purpose: ProcessLog Destructor
*
******************************************************************************************/
// Intentionally empty: no explicit cleanup is performed here.
ProcessLog::~ProcessLog() {}
/******************************************************************************************
* @brief writeLog
*
* purpose: Write the message to the log
*
******************************************************************************************/
void ProcessLog::writeLog(const int lineNumber, const string logContent, const LOG_TYPE logType)
{
LoggingID lid(17);
MessageLog ml(lid);
Message msg;
Message::Args args;
if (logType == LOG_TYPE_ERROR)
{
args.add("line:");
args.add(lineNumber);
}
args.add(logContent);
msg.format(args);
switch (logType)
{
case LOG_TYPE_DEBUG:
try
{
ml.logDebugMessage(msg);
}
catch (...) {}
break;
case LOG_TYPE_INFO:
try
{
ml.logInfoMessage(msg);
}
catch (...) {}
break;
case LOG_TYPE_WARNING:
try
{
ml.logWarningMessage(msg);
}
catch (...) {}
break;
case LOG_TYPE_ERROR:
try
{
ml.logErrorMessage(msg);
}
catch (...) {}
break;
case LOG_TYPE_CRITICAL:
try
{
ml.logCriticalMessage(msg);
}
catch (...) {}
break;
}
return;
}
/******************************************************************************************
* @brief writeLog
*
* purpose: Write the message to the log
*
******************************************************************************************/
// Logs an integer value at the severity given by logType.
//
// lineNumber  caller's source line; only included in the message for LOG_TYPE_ERROR
// logContent  integer value to log
// logType     selects which MessageLog call is used; unknown values log nothing
//
// Logging is best effort: any exception thrown by the logger is swallowed,
// the same way the string overload of writeLog handles it.
void ProcessLog::writeLog(const int lineNumber, const int logContent, const LOG_TYPE logType)
{
    LoggingID lid(17);
    MessageLog ml(lid);
    Message msg;
    Message::Args args;

    // The line information must be part of args before msg.format() is called;
    // arguments added after formatting are not part of the logged message.
    if (logType == LOG_TYPE_ERROR)
    {
        args.add("line:");
        args.add(lineNumber);
    }

    args.add(logContent);
    msg.format(args);

    try
    {
        switch (logType)
        {
            case LOG_TYPE_DEBUG:
                ml.logDebugMessage(msg);
                break;

            case LOG_TYPE_INFO:
                ml.logInfoMessage(msg);
                break;

            case LOG_TYPE_WARNING:
                ml.logWarningMessage(msg);
                break;

            case LOG_TYPE_ERROR:
                ml.logErrorMessage(msg);
                break;

            case LOG_TYPE_CRITICAL:
                ml.logCriticalMessage(msg);
                break;
        }
    }
    catch (...)
    {
        // A failure to write a log entry must not disturb the caller.
    }
}
/******************************************************************************************
* @brief setSysLogData
*
* purpose: Placeholder for setting the sysLogData; currently does nothing
*
******************************************************************************************/
// Currently a no-op: the body performs no work.
void ProcessLog::setSysLogData()
{
}
/******************************************************************************************
* @brief getSysLogData
*
* purpose: return the sysLogData
*
******************************************************************************************/
// Always returns an empty string; no stored data is consulted.
string ProcessLog::getSysLogData()
{
    return string();
}
/******************************************************************************************
* @brief writeSystemLog
*
* purpose: log process status change into system log
*
******************************************************************************************/
// Currently a no-op: the body performs no work.
void ProcessLog::writeSystemLog() {}
/******************************************************************************************
* @brief ProcessManager Constructor
*
* purpose: ProcessManager Constructor
*
******************************************************************************************/
// Initializes the config and log members from the caller's Configuration and
// ProcessLog objects; no other work is done.
ProcessManager::ProcessManager(Configuration& aconfig, ProcessLog& alog)
    : config(aconfig),
      log(alog)
{}
/******************************************************************************************
* @brief ProcessManager Destructor
*
* purpose: ProcessManager Destructor
*
******************************************************************************************/
// Intentionally empty: no explicit cleanup is performed here.
ProcessManager::~ProcessManager() {}
/******************************************************************************************
* @brief processMSG
*
* purpose: Process the received message
*
******************************************************************************************/
//void ProcessManager::processMSG( messageqcpp::IOSocket fIos, messageqcpp::ByteStream msg)
void* processMSG(messageqcpp::IOSocket* cfIos)
{
messageqcpp::IOSocket fIos = *cfIos;
pthread_t ThreadId;
ThreadId = pthread_self();
ByteStream msg;
try
{
msg = fIos.read();
}
catch (...)
{
pthread_detach (ThreadId);
pthread_exit(0);
}
if (msg.length() <= 0)
{
fIos.close();
pthread_detach (ThreadId);
pthread_exit(0);
}
ByteStream::byte msgType;
msg >> msgType;
Oam oam;
ProcessLog log;
// log.writeLog(__LINE__, "** processMSG msg type: " + oam.itoa(msgType), LOG_TYPE_DEBUG);
Configuration config;
ProcessManager processManager(config, log);
ByteStream::byte actionType;
string target;
ByteStream::byte graceful;
ByteStream::byte ackIndicator = 0;
ByteStream::byte manualFlag;
ByteStream ackMsg;
ByteStream::byte status = 0;
ALARMManager aManager;
SystemModuleTypeConfig systemmoduletypeconfig;
SystemProcessConfig systemprocessconfig;
try
{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
switch (msgType)
{
case REQUEST:
msg >> actionType;
msg >> target;
msg >> graceful;
msg >> ackIndicator;
msg >> manualFlag;
switch (actionType)
{
case STOPMODULE:
{
uint16_t count, hostConfigCount;
string value;
oam::DeviceNetworkConfig devicenetworkconfig;
oam::DeviceNetworkList devicenetworklist;
//get module count to remove
msg >> count;
if ( count > 0 )
{
for (int i = 0; i < count; i++)
{
msg >> value;
devicenetworkconfig.DeviceName = value;
msg >> value;
devicenetworkconfig.UserTempDeviceName = value;
msg >> value;
devicenetworkconfig.DisableState = value;
devicenetworklist.push_back(devicenetworkconfig);
msg >> hostConfigCount;
}
string password;
msg >> password;
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
log.writeLog(__LINE__, "MSG RECEIVED: Stop Module request on " + moduleName );
string moduletype = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
status = API_SUCCESS;
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(moduleName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
{
status = API_DISABLED;
log.writeLog(__LINE__, "Stop Module requested Ignored on a Disabled " + moduleName);
}
else
{
status = processManager.stopModule(moduleName, graceful, manualFlag);
log.writeLog(__LINE__, "Stop Module Completed on " + moduleName, LOG_TYPE_INFO);
Configuration config;
if ( moduleName == config.OAMStandbyName() )
{
string newStandbyModule = processManager.getStandbyModule();
if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
processManager.setStandbyModule(newStandbyModule);
else
{
Config* sysConfig = Config::makeConfig();
// clear Standby OAM Module
sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
//update Calpont Config table
try
{
sysConfig->write();
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
}
}
}
}
else
{
status = oam::API_INVALID_PARAMETER;
log.writeLog(__LINE__, "STOPMODULE: Module Count invalid = " + oam.itoa(count));
}
log.writeLog(__LINE__, "STOPMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
if (ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "STOPMODULE: ACK back to sender");
}
break;
}
case SHUTDOWNMODULE:
{
uint16_t count, hostConfigCount;
string value;
oam::DeviceNetworkConfig devicenetworkconfig;
oam::DeviceNetworkList devicenetworklist;
//get module count to remove
msg >> count;
if ( count > 0 )
{
for (int i = 0; i < count; i++)
{
msg >> value;
devicenetworkconfig.DeviceName = value;
msg >> value;
devicenetworkconfig.UserTempDeviceName = value;
msg >> value;
devicenetworkconfig.DisableState = value;
devicenetworklist.push_back(devicenetworkconfig);
msg >> hostConfigCount;
}
string password;
msg >> password;
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
log.writeLog(__LINE__, "MSG RECEIVED: Shutdown Module request on " + moduleName );
status = API_SUCCESS;
log.writeLog(__LINE__, "Shutdown Module Requested on " + moduleName, LOG_TYPE_INFO);
processManager.shutdownModule(moduleName, graceful, manualFlag, 0);
//check for SIMPLEX Processes on mate might need to be started
processManager.checkSimplexModule(moduleName);
Configuration config;
if ( moduleName == config.OAMStandbyName() )
{
string newStandbyModule = processManager.getStandbyModule();
if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
processManager.setStandbyModule(newStandbyModule);
}
}
}
else
{
status = oam::API_INVALID_PARAMETER;
log.writeLog(__LINE__, "SHUTDOWNMODULE: Module Count invalid = " + oam.itoa(count));
}
if (ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "SHUTDOWNMODULE: ACK back to sender, return status = " + oam.itoa(status));
}
break;
}
case STARTMODULE:
{
log.writeLog(__LINE__, "MSG RECEIVED: Start Module request" );
startsystemthreadStop = false;
uint16_t count, hostConfigCount;
string value;
oam::DeviceNetworkConfig devicenetworkconfig;
startdevicenetworklist.clear();
//get module count to remove
msg >> count;
if ( count > 0 )
{
string module = oam::UnassignedName;
for (int i = 0; i < count; i++)
{
msg >> value;
devicenetworkconfig.DeviceName = value;
module = value;
msg >> value;
devicenetworkconfig.UserTempDeviceName = value;
msg >> value;
devicenetworkconfig.DisableState = value;
startdevicenetworklist.push_back(devicenetworkconfig);
msg >> hostConfigCount;
}
string password;
msg >> password;
pthread_t startsystemthread;
status = pthread_create (&startsystemthread, NULL, (void* (*)(void*)) &startSystemThread, &startdevicenetworklist);
if ( status != 0 )
{
log.writeLog(__LINE__, "STARTMODULE: pthread_create failed, return status = " + oam.itoa(status));
status = API_FAILURE;
}
if (status == 0 && ackIndicator)
{
pthread_join(startsystemthread, NULL);
status = startsystemthreadStatus;
}
if ( status == API_SUCCESS)
{
processManager.setSystemState(oam::BUSY_INIT);
//set query system state not ready
processManager.setQuerySystemState(false);
//set recycle process
processManager.recycleProcess(target, true);
//distribute config file
processManager.distributeConfigFile("system");
//set query system state ready
processManager.setQuerySystemState(true);
processManager.setSystemState(oam::ACTIVE);
}
}
else
{
status = oam::API_INVALID_PARAMETER;
log.writeLog(__LINE__, "STARTMODULE: Module Count invalid = " + oam.itoa(count));
}
log.writeLog(__LINE__, "STARTMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
if (ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "STARTMODULE: ACK back to sender");
}
break;
}
case RESTARTMODULE:
{
uint16_t count, hostConfigCount;
string value;
oam::DeviceNetworkConfig devicenetworkconfig;
startdevicenetworklist.clear();
startsystemthreadStop = false;
//get module count to remove
msg >> count;
if ( count > 0 )
{
for (int i = 0; i < count; i++)
{
msg >> value;
devicenetworkconfig.DeviceName = value;
msg >> value;
devicenetworkconfig.UserTempDeviceName = value;
msg >> value;
devicenetworkconfig.DisableState = value;
startdevicenetworklist.push_back(devicenetworkconfig);
msg >> hostConfigCount;
}
string password;
msg >> password;
DeviceNetworkList::iterator listPT = startdevicenetworklist.begin();
for ( ; listPT != startdevicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
log.writeLog(__LINE__, "MSG RECEIVED: Restart Module request on " + moduleName );
status = API_SUCCESS;
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(moduleName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
if (opState != oam::MAN_DISABLED)
{
status = processManager.stopModule(moduleName, graceful, manualFlag);
log.writeLog(__LINE__, "Stop Module Completed on " + moduleName, LOG_TYPE_INFO);
Configuration config;
if ( moduleName == config.OAMStandbyName() )
{
string newStandbyModule = processManager.getStandbyModule();
if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
processManager.setStandbyModule(newStandbyModule);
}
}
else
{
status = API_DISABLED;
log.writeLog(__LINE__, "Stop Module requested Ignored on a Disabled " + moduleName);
}
}
pthread_t startsystemthread;
status = pthread_create (&startsystemthread, NULL, (void* (*)(void*)) &startSystemThread, &startdevicenetworklist);
if ( status != 0 )
{
log.writeLog(__LINE__, "RESTARTMODULE: pthread_create failed, return status = " + oam.itoa(status));
status = API_FAILURE;
}
if (status == 0 && ackIndicator)
{
pthread_join(startsystemthread, NULL);
status = startsystemthreadStatus;
}
if ( status == API_SUCCESS)
{
//distribute config file
processManager.distributeConfigFile("system");
processManager.restartProcessType("ExeMgr");
}
}
else
{
status = oam::API_INVALID_PARAMETER;
log.writeLog(__LINE__, "RESTARTMODULE: Module Count invalid = " + oam.itoa(count));
}
log.writeLog(__LINE__, "RESTARTMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
if (ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << (ByteStream::byte) status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "RESTARTMODULE: ACK back to sender");
}
break;
}
case DISABLEMODULE:
{
uint16_t count, hostConfigCount;
string value;
oam::DeviceNetworkConfig devicenetworkconfig;
oam::DeviceNetworkList devicenetworklist;
//get module count to remove
msg >> count;
if ( count > 0 )
{
for (int i = 0; i < count; i++)
{
msg >> value;
devicenetworkconfig.DeviceName = value;
msg >> value;
devicenetworkconfig.UserTempDeviceName = value;
msg >> value;
devicenetworkconfig.DisableState = value;
devicenetworklist.push_back(devicenetworkconfig);
msg >> hostConfigCount;
}
string password;
msg >> password;
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
log.writeLog(__LINE__, "MSG RECEIVED: Disable Module request on " + moduleName );
// check module status, Disable module
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(moduleName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
//don't allow disable of current Parent OAM Module
if ( moduleName == config.moduleName() )
{
log.writeLog(__LINE__, "ERROR: can't disable Parent OAM module", LOG_TYPE_ERROR);
status = API_INVALID_PARAMETER;
break;
}
if (opState == oam::MAN_OFFLINE || opState == oam::MAN_DISABLED
|| opState == oam::AUTO_DISABLED || opState == oam::AUTO_OFFLINE)
{
processManager.setSystemState(oam::BUSY_INIT);
//set query system state not ready
processManager.setQuerySystemState(false);
status = processManager.disableModule(moduleName, true);
log.writeLog(__LINE__, "Disable Module Completed on " + moduleName, LOG_TYPE_INFO);
//check for SIMPLEX Processes on mate might need to be started
processManager.checkSimplexModule(moduleName);
processManager.setSystemState(oam::ACTIVE);
//set query system state ready
processManager.setQuerySystemState(true);
}
else
{
log.writeLog(__LINE__, "ERROR: module not stopped, state = " + oam.itoa(opState), LOG_TYPE_ERROR);
status = API_FAILURE;
break;
}
}
}
else
{
status = oam::API_INVALID_PARAMETER;
log.writeLog(__LINE__, "DISABLEMODULE: Module Count invalid = " + oam.itoa(count));
}
log.writeLog(__LINE__, "DISABLEMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
if (ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "DISABLEMODULE: ACK back to sender");
}
break;
}
case ENABLEMODULE:
{
uint16_t count, hostConfigCount;
string value;
oam::DeviceNetworkConfig devicenetworkconfig;
oam::DeviceNetworkList devicenetworklist;
//get module count to remove
msg >> count;
if ( count > 0 )
{
for (int i = 0; i < count; i++)
{
msg >> value;
devicenetworkconfig.DeviceName = value;
msg >> value;
devicenetworkconfig.UserTempDeviceName = value;
msg >> value;
devicenetworkconfig.DisableState = value;
devicenetworklist.push_back(devicenetworkconfig);
msg >> hostConfigCount;
}
string password;
msg >> password;
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
// do stopmodule then enable
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
log.writeLog(__LINE__, "MSG RECEIVED: Enable Module request on " + moduleName );
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(moduleName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
if (opState == oam::MAN_DISABLED)
{
processManager.stopModule(moduleName, graceful, manualFlag);
log.writeLog(__LINE__, "stop Module Completed on " + moduleName, LOG_TYPE_INFO);
status = processManager.enableModule(moduleName, oam::MAN_OFFLINE);
log.writeLog(__LINE__, "Enable Module Completed on " + moduleName, LOG_TYPE_INFO);
}
else
{
log.writeLog(__LINE__, "ERROR: module name not Disabled", LOG_TYPE_ERROR);
status = API_INVALID_STATE;
break;
}
}
}
else
{
status = oam::API_INVALID_PARAMETER;
log.writeLog(__LINE__, "ENABLEMODULE: Module Count invalid = " + oam.itoa(count));
}
log.writeLog(__LINE__, "ENABLEMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
if (ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "ENABLEMODULE: ACK back to sender");
}
break;
}
case STOPSYSTEM:
{
log.writeLog(__LINE__, "MSG RECEIVED: Stop System request..." );
// GRACEFUL_WAIT means that we are shutting down, but waiting for
// all transactions to finish or rollback as commanded. This is only set if
// there are, in fact, transactions active (or cpimport).
if (graceful == GRACEFUL_WAIT)
{
ByteStream stillWorkingMsg;
stillWorkingMsg << (ByteStream::byte) oam::ACK;
stillWorkingMsg << actionType;
stillWorkingMsg << target;
stillWorkingMsg << (ByteStream::byte) API_STILL_WORKING;
// This wait can take a while. We wait for table locks to release and open transactions to commit.
if (oam.waitForSystem(STOPSYSTEM, fIos, stillWorkingMsg))
{
graceful = GRACEFUL; // ProcMonitor doesn't know GRACEFUL_WAIT.
// Send an ack back to say we're done waiting and are now shutting down.
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) API_TRANSACTIONS_COMPLETE;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "STOPSYSTEM: ACK transactions complete back to sender, return status = " + oam.itoa(API_TRANSACTIONS_COMPLETE));
}
else
{
// We've been cancelled.
if (ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) API_CANCELLED;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "STOPSYSTEM: ACK back to sender (canceled)");
break;
}
}
}
//set the flag to have any startsystemthreads to exit out before stop is done
startsystemthreadStop = true;
if ( startsystemthreadRunning )
sleep(5);
//stop by process type first, if system is ACTIVE
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus);
}
catch (...)
{}
//set system status
processManager.setSystemState(oam::MAN_INIT);
if (HDFS)
{
oam::DeviceNetworkList devicenetworklist;
pthread_t stopsystemthread;
status = pthread_create (&stopsystemthread, NULL, (void* (*)(void*)) &stopSystemThread, &devicenetworklist);
if ( status != 0 )
{
log.writeLog(__LINE__, "STOPSYSTEMS: pthread_create failed, return status = " + oam.itoa(status));
status = API_FAILURE;
}
if (status == 0 && ackIndicator)
{
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "STOPSYSTEM: ACK back to sender");
}
break;
}
//call to update module status and send notification message
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus((*pt).DeviceName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
processManager.stopModule((*pt).DeviceName, STATUS_UPDATE, manualFlag, 0);
}
}
//set query system state not ready
processManager.setQuerySystemState(false);
if (systemstatus.SystemOpState == ACTIVE && graceful == oam::GRACEFUL)
processManager.stopProcessTypes(manualFlag);
//stop all of processes..
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
//skip OAM Parent module, do at the end
if ( (*pt).DeviceName == config.moduleName() )
continue;
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus((*pt).DeviceName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
log.writeLog(__LINE__, "STOPSYSTEM: Request Stop Module on " + (*pt).DeviceName );
// int retStatus = processManager.stopModule((*pt).DeviceName, graceful, manualFlag, 0);
processManager.stopModule((*pt).DeviceName, graceful, manualFlag, 0);
// log.writeLog(__LINE__, "STOPSYSTEM: ACK received from Process-Monitor, return status = " + oam.itoa(status));
// if (retStatus != API_SUCCESS)
// status = retStatus;
}
}
//wait until all child modules are offline or A FAILURE HAS OCCURRED
bool failure = false;
bool stopped = true;
for ( int retry = 0 ; retry < 30 ; retry++ )
{
sleep(1);
stopped = true;
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
string moduleName = (*pt).DeviceName;
//skip OAM Parent module, do at the end
if ( moduleName == config.moduleName() )
continue;
int opState = oam::ACTIVE;
try
{
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if (opState == oam::FAILED)
{
failure = true;
log.writeLog(__LINE__, "STOPSYSTEM: Failed, failure on module " + moduleName, LOG_TYPE_ERROR);
break;
}
if (opState == oam::MAN_OFFLINE ||
opState == oam::MAN_DISABLED ||
opState == oam::AUTO_DISABLED )
continue;
stopped = false;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on : " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
}
if ( failure )
break;
}
if ( failure)
break;
if ( stopped )
break;
}
if ( failure )
{
processManager.setSystemState(oam::FAILED);
}
else
{
if ( !stopped)
{
//timeout waiting for system to stop, error out
log.writeLog(__LINE__, "STOPSYSTEM: Failed, timeout waiting for module to stop", LOG_TYPE_ERROR);
processManager.setSystemState(oam::FAILED);
}
else
{
/* XXXPAT: saveBRM requires StorageManager being up at the time.
A couple options. 1) start/stop SM around saveBRM(). Will work but it means SM would go
down-up-down for this single operation. 2) add a special path to stopModule()
to NOT stop SM in the first call, then after saveBRM(), stop SM.
Neither option is great. The least invasive is option 1, so going with that
for now.
*/
//now stop local module
processManager.stopModule(config.moduleName(), graceful, manualFlag );
//run save brm script
string storageType = Config::makeConfig()->getConfig("Installation", "DBRootStorageType");
if (storageType == "storagemanager")
processManager.startProcess(config.moduleName(), "StorageManager", FORCEFUL);
processManager.saveBRM(false);
if (storageType == "storagemanager")
processManager.stopProcess(config.moduleName(), "StorageManager", GRACEFUL, false);
log.writeLog(__LINE__, "Stop System Completed Success", LOG_TYPE_INFO);
processManager.setSystemState(oam::MAN_OFFLINE);
//clearout auto move dbroots files
string cmd = "rm -f /var/lib/columnstore/local/moveDbrootTransactionLog";
system(cmd.c_str());
cmd = "touch /var/lib/columnstore/local/moveDbrootTransactionLog";
system(cmd.c_str());
}
}
if (ackIndicator)
{
ackMsg.reset();
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) API_SUCCESS;
fIos.write(ackMsg);
log.writeLog(__LINE__, "STOPSYSTEM: ACK back to sender");
}
//set query system state ready
processManager.setQuerySystemState(true);
startsystemthreadStop = false;
break;
}
case SHUTDOWNSYSTEM:
{
    // Shut the whole system down.
    //  - non-HDFS installs: request shutdown of every enabled remote module,
    //    ACK the sender, clear the standby OAM settings, clean local state
    //    and finally shut down the local module;
    //  - HDFS installs: ACK the sender, clear the standby OAM settings and
    //    stop all other hosts through pdsh.
    log.writeLog(__LINE__, "MSG RECEIVED: Shutdown System request..." );

    if (!HDFS)
    {
        for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
        {
            if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleCount == 0)
                continue;

            DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();

            for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
            {
                //do local module last
                if ( (*pt).DeviceName == config.moduleName() )
                    continue;

                // Defaults to ACTIVE so that a failed status lookup still
                // results in a shutdown request for the module.
                int opState = oam::ACTIVE;
                bool degraded = false;

                try
                {
                    oam.getModuleStatus((*pt).DeviceName, opState, degraded);
                }
                catch (...)
                {
                    // Deliberately ignored: keep the ACTIVE default above.
                }

                if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
                    continue;

                processManager.shutdownModule((*pt).DeviceName, graceful, manualFlag, 0);
            }
        }
    }

    if (ackIndicator)
    {
        ackMsg.reset();
        ackMsg << (ByteStream::byte) oam::ACK;
        ackMsg << actionType;
        ackMsg << target;
        ackMsg << (ByteStream::byte) status;

        try
        {
            fIos.write(ackMsg);
        }
        catch (...) {}

        // Log the status that was actually put on the wire.
        log.writeLog(__LINE__, "SHUTDOWNSYSTEM: ACK back to sender, return status = " + oam.itoa(status));
    }

    Config* sysConfig = Config::makeConfig();

    // clear Standby OAM Module
    sysConfig->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
    sysConfig->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);

    //update Calpont Config table
    try
    {
        sysConfig->write();
    }
    catch (...)
    {
        log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
    }

    if (HDFS)
    {
        // HDFS installs stop every other host through pdsh and are done.
        string cmd = "pdsh -a -x " + localHostName + " 'columnstore stop' > /dev/null 2>&1";
        system(cmd.c_str());
        break;
    }

    //clearout auto move dbroots files
    string cmd = "rm -f /var/lib/columnstore/local/moveDbrootTransactionLog";
    system(cmd.c_str());
    cmd = "touch /var/lib/columnstore/local/moveDbrootTransactionLog";
    system(cmd.c_str());

    //clear shared memory
    cmd = "clearShm > /dev/null 2>&1";
    int rtnCode = system(cmd.c_str());

    // Only an exit status of exactly 1 is reported as a clearShm failure.
    if (WEXITSTATUS(rtnCode) != 1)
        log.writeLog(__LINE__, "Successfully ran DBRM clearShm", LOG_TYPE_DEBUG);
    else
        log.writeLog(__LINE__, "Error running DBRM clearShm", LOG_TYPE_ERROR);

    // now do local module
    processManager.shutdownModule(config.moduleName(), graceful, manualFlag);
    break;
}
case STARTSYSTEM:
{
    // Start the system by running startSystemThread. When the sender asked
    // for an ACK the thread is joined and its result is reported back.
    log.writeLog(__LINE__, "MSG RECEIVED: Start System request...ackIndicator=" + oam.itoa(ackIndicator));
    startsystemthreadStop = false;

    // get system status and don't process if already in-progress
    try
    {
        SystemStatus systemstatus;
        oam.getSystemStatus(systemstatus);

        if (systemstatus.SystemOpState == MAN_INIT)
        {
            log.writeLog(__LINE__, "STARTSYSTEM: Start already in-progess");

            if (ackIndicator)
            {
                ackMsg << (ByteStream::byte) oam::ACK;
                ackMsg << actionType;
                ackMsg << target;
                ackMsg << (ByteStream::byte) API_ALREADY_IN_PROGRESS;

                try
                {
                    fIos.write(ackMsg);
                }
                catch (...) {}

                log.writeLog(__LINE__, "STARTSYSTEM: ACK back to sender");
            }

            break;
        }
    }
    catch (...)
    {
        // Deliberately ignored: if the system status cannot be read the
        // start request is processed anyway.
    }

    // NOTE(review): the thread receives the address of this stack-local list;
    // when ackIndicator is not set the thread is never joined, so the list
    // may go out of scope while the thread is still running - confirm.
    oam::DeviceNetworkList devicenetworklist;
    pthread_t startsystemthread;
    status = pthread_create (&startsystemthread, NULL, (void* (*)(void*)) &startSystemThread, &devicenetworklist);

    if ( status != 0 )
    {
        log.writeLog(__LINE__, "STARTSYSTEM: pthread_create failed, return status = " + oam.itoa(status));
        status = API_FAILURE;
    }
    else if (ackIndicator)
    {
        // Wait for the start to finish and pick up the result of the start
        // thread (the same variable RESTARTSYSTEM reads after its join).
        pthread_join(startsystemthread, NULL);
        status = startsystemthreadStatus;
    }

    if (ackIndicator)
    {
        // Always answer a sender that asked for an ACK, including the case
        // where the thread could not be created.
        ackMsg << (ByteStream::byte) oam::ACK;
        ackMsg << actionType;
        ackMsg << target;
        ackMsg << (ByteStream::byte) status;

        try
        {
            fIos.write(ackMsg);
        }
        catch (...) {}

        log.writeLog(__LINE__, "STARTSYSTEM: ACK back to sender");
    }

    log.writeLog(__LINE__, "STARTSYSTEM: Start System Request Completed with status = " + oam.itoa(status));
    break;
}
case RESTARTSYSTEM:
{
    // Restart the system: optionally wait for open transactions, stop every
    // enabled module (the local one last), save the BRM, then start the
    // system again through startSystemThread.
    log.writeLog(__LINE__, "MSG RECEIVED: Restart System request..." );
    startsystemthreadStop = false;

    // GRACEFUL_WAIT means that we are shutting down, but waiting for
    // all transactions to finish or rollback as commanded. This is only set if
    // there are, in fact, transactions active (or cpimport).
    if (graceful == GRACEFUL_WAIT)
    {
        ByteStream stillWorkingMsg;
        stillWorkingMsg << (ByteStream::byte) oam::ACK;
        stillWorkingMsg << actionType;
        stillWorkingMsg << target;
        stillWorkingMsg << (ByteStream::byte) API_STILL_WORKING;

        // This wait can take a while. We wait for table locks to release and open transactions to commit.
        if (oam.waitForSystem(RESTARTSYSTEM, fIos, stillWorkingMsg))
        {
            graceful = GRACEFUL; // ProcMonitor doesn't know GRACEFUL_WAIT.
            // Send an ack back to say we're done waiting and are now shutting down.
            ackMsg << (ByteStream::byte) oam::ACK;
            ackMsg << actionType;
            ackMsg << target;
            ackMsg << (ByteStream::byte) API_TRANSACTIONS_COMPLETE;

            try
            {
                fIos.write(ackMsg);
            }
            catch (...) {}

            log.writeLog(__LINE__, "RESTARTSYSTEM: ACK transactions complete back to sender, return status = " + oam.itoa(API_TRANSACTIONS_COMPLETE));
        }
        else
        {
            // We've been cancelled.
            if (ackIndicator)
            {
                ackMsg << (ByteStream::byte) oam::ACK;
                ackMsg << actionType;
                ackMsg << target;
                ackMsg << (ByteStream::byte) API_CANCELLED;

                try
                {
                    fIos.write(ackMsg);
                }
                catch (...) {}

                log.writeLog(__LINE__, "RESTARTSYSTEM: ACK back to sender (canceled)");
            }

            // A cancelled request must never restart the system, whether or
            // not the sender asked for an ACK.
            break;
        }
    }

    //set the flag to have any startsystemthreads to exit out before stop is done
    startsystemthreadStop = true;

    if ( startsystemthreadRunning )
        sleep(5);

    //get system status
    SystemStatus systemstatus;

    try
    {
        oam.getSystemStatus(systemstatus);
    }
    catch (...)
    {}

    //set system status
    processManager.setSystemState(oam::MAN_OFFLINE);

    //call to update module status and send notification message
    //stop all of processes..
    for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
    {
        if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleCount == 0)
            continue;

        DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();

        for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
        {
            // Defaults to ACTIVE so a failed lookup does not skip the module.
            int opState = oam::ACTIVE;
            bool degraded = false;

            try
            {
                oam.getModuleStatus((*pt).DeviceName, opState, degraded);
            }
            catch (...)
            {
                // Deliberately ignored: keep the ACTIVE default.
            }

            if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
                continue;

            processManager.stopModule((*pt).DeviceName, STATUS_UPDATE, manualFlag);
        }
    }

    //stop by process type first, if system is ACTIVE
    if (systemstatus.SystemOpState == ACTIVE)
        processManager.stopProcessTypes(manualFlag);

    status = API_SUCCESS;

    // stop modules
    for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
    {
        if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleCount == 0)
            continue;

        DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();

        for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
        {
            //skip OAM Parent module, do at the end
            if ( (*pt).DeviceName == config.moduleName() )
                continue;

            int opState = oam::ACTIVE;
            bool degraded = false;

            try
            {
                oam.getModuleStatus((*pt).DeviceName, opState, degraded);
            }
            catch (...)
            {
                // Deliberately ignored: keep the ACTIVE default.
            }

            if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
                continue;

            log.writeLog(__LINE__, "RESTARTSYSTEM: Request Stop Module on " + (*pt).DeviceName );
            int retStatus = processManager.stopModule((*pt).DeviceName, graceful, manualFlag);
            // Report the status of this stop request, not the accumulated one.
            log.writeLog(__LINE__, "RESTARTSYSTEM: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus));

            // Remember a failure; it prevents the start phase below.
            if (retStatus != API_SUCCESS)
                status = retStatus;
        }
    }

    //now stop local module
    processManager.stopModule(config.moduleName(), graceful, manualFlag );

    //run save.brm script
    // With the storagemanager storage type, StorageManager is started around
    // saveBRM() and stopped again afterwards.
    string storageType = Config::makeConfig()->getConfig("Installation", "DBRootStorageType");

    if (storageType == "storagemanager")
        processManager.startProcess(config.moduleName(), "StorageManager", FORCEFUL);

    processManager.saveBRM(false);

    if (storageType == "storagemanager")
        processManager.stopProcess(config.moduleName(), "StorageManager", GRACEFUL, false);

    log.writeLog(__LINE__, "RESTARTSYSTEM: ACK received from Process-Monitor for stopModule requests, return status = " + oam.itoa(status));

    startsystemthreadStop = false;

    if (status == API_SUCCESS )
    {
        //distribute config file
        processManager.distributeConfigFile("system");

        // NOTE(review): the thread receives the address of this block-local
        // list and is only joined when ackIndicator is set - confirm the
        // lifetime is sufficient in the non-ACK case.
        oam::DeviceNetworkList devicenetworklist;
        pthread_t startsystemthread;
        status = pthread_create (&startsystemthread, NULL, (void*(*)(void*)) &startSystemThread, &devicenetworklist);

        if ( status != 0 )
        {
            log.writeLog(__LINE__, "RESTARTSYSTEM: pthread_create failed, return status = " + oam.itoa(status));
            status = API_FAILURE;
        }

        if (status == 0 && ackIndicator)
        {
            pthread_join(startsystemthread, NULL);
            status = startsystemthreadStatus;
        }

        // setup MySQL Replication after FORCE restart command
        if ( (status == API_SUCCESS) &&
                (graceful == oam::FORCEFUL) )
        {
            log.writeLog(__LINE__, "Setup MySQL Replication for restartSystem FORCE", LOG_TYPE_DEBUG);
            oam::DeviceNetworkList replicationModules;
            processManager.setMySQLReplication(replicationModules, oam::UnassignedName, true);
        }

        log.writeLog(__LINE__, "RESTARTSYSTEM: Start System Request Completed", LOG_TYPE_INFO);
    }

    if (ackIndicator)
    {
        // ackMsg may still hold the TRANSACTIONS_COMPLETE ack from above.
        ackMsg.reset();
        ackMsg << (ByteStream::byte) oam::ACK;
        ackMsg << actionType;
        ackMsg << target;
        ackMsg << (ByteStream::byte) status;

        try
        {
            fIos.write(ackMsg);
        }
        catch (...) {}

        log.writeLog(__LINE__, "RESTARTSYSTEM: ACK back to sender");
    }

    log.writeLog(__LINE__, "Restart System Completed, status = " + oam.itoa(status), LOG_TYPE_INFO);
    break;
}
case STOPPROCESS:
{
    // Stop one process (target) on the module named in the request payload.
    log.writeLog(__LINE__, "MSG RECEIVED: Stop Process request on " + target );

    string moduleName;
    msg >> moduleName;

    status = processManager.stopProcess(moduleName, target, graceful, manualFlag);

    log.writeLog(__LINE__, "STOPPROCESS: ACK received from Process-Monitor, return status = " + oam.itoa(status));
    log.writeLog(__LINE__, "Stop Process Completed on " + moduleName + " / " + target, LOG_TYPE_INFO );

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "STOPPROCESS: ACK back to sender");
    break;
}
case STARTPROCESS:
{
    // Start one process (target) on the module named in the request payload.
    log.writeLog(__LINE__, "MSG RECEIVED: Start Process request on " + target);

    string moduleName;
    msg >> moduleName;

    status = processManager.startProcess(moduleName, target, graceful);

    log.writeLog(__LINE__, "STARTPROCESS: ACK received from Process-Monitor, return status = " + oam.itoa(status));
    log.writeLog(__LINE__, "Start Process Completed on " + moduleName + " / " + target, LOG_TYPE_INFO );

    // if a PrimProc was restarted, restart ACTIVE ExeMgr(s) and DDL/DMLProc
#if 0 // A RESTARTPROCESS message is about to arrive, so this is redundant.
    if ( target.find("PrimProc") == 0)
    {
        //distribute config file
        processManager.distributeConfigFile("system");
        processManager.reinitProcessType("WriteEngineServer");
        processManager.restartProcessType("ExeMgr");
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }
    // if a WriteEngineServer was restarted, restart DDL/DMLProc
    if ( target.find("WriteEngineServer") == 0)
    {
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }
#endif

    // DDLProc and DMLProc need their IP address updated after a start.
    const bool isDdlProc = (target.find("DDLProc") == 0);
    const bool isDmlProc = (target.find("DMLProc") == 0);

    if (isDdlProc || isDmlProc)
        processManager.setPMProcIPs(moduleName, target);

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "STARTPROCESS: ACK back to sender");
    break;
}
case RESTARTPROCESS:
{
    // Restart one process (target) on the module named in the request
    // payload, then refresh the process types that depend on it.
    log.writeLog(__LINE__, "MSG RECEIVED: Restart Process request on " + target );

    string moduleName;
    msg >> moduleName;

    status = processManager.restartProcess(moduleName, target, graceful, manualFlag);

    const bool primProcRestarted = (target.find("PrimProc") == 0);
    const bool writeEngineRestarted = (target.find("WriteEngineServer") == 0);

    if (primProcRestarted)
    {
        // PrimProc restart: redistribute the config file, then reinit
        // WriteEngineServer, restart ExeMgr and reinit DDL/DMLProc.
        processManager.distributeConfigFile("system");
        processManager.reinitProcessType("WriteEngineServer");
        processManager.restartProcessType("ExeMgr");
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }

    if (writeEngineRestarted)
    {
        // WriteEngineServer restart: reinit DDL/DMLProc.
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }

    log.writeLog(__LINE__, "RESTARTPROCESS: ACK received from Process-Monitor, return status = " + oam.itoa(status));
    log.writeLog(__LINE__, "Restart Process Completed on " + moduleName + " / " + target, LOG_TYPE_INFO );

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "RESTARTPROCESS: ACK back to sender");
    break;
}
case UPDATELOG:
{
    // Apply a logging action/level either to every module ("system") or to
    // the single module named by target. An ACK is always sent.
    string action;
    string level;
    msg >> action;
    msg >> level;

    log.writeLog(__LINE__, "MSG RECEIVED: " + action + " logging on " + target + " for level " + level );

    status = API_SUCCESS;

    const bool wholeSystem = (target == "system");
    bool moduleKnown = false;

    for ( unsigned int typeIdx = 0; typeIdx < systemmoduletypeconfig.moduletypeconfig.size(); typeIdx++)
    {
        // module types without any module are skipped
        if ( systemmoduletypeconfig.moduletypeconfig[typeIdx].ModuleCount == 0)
            continue;

        for ( DeviceNetworkList::iterator module = systemmoduletypeconfig.moduletypeconfig[typeIdx].ModuleNetworkList.begin();
                module != systemmoduletypeconfig.moduletypeconfig[typeIdx].ModuleNetworkList.end();
                ++module)
        {
            if (wholeSystem)
            {
                // send logging message to all modules; a failure is remembered
                const int moduleStatus = processManager.updateLog(action, module->DeviceName, level);

                if ( moduleStatus != API_SUCCESS)
                    status = moduleStatus;
            }
            else if (module->DeviceName == target)
            {
                // the requested module exists: forward the request to it
                status = processManager.updateLog(action, target, level);
                moduleKnown = true;
                break;
            }
        }
    }

    if ( !wholeSystem && !moduleKnown )
    {
        log.writeLog(__LINE__, "ERROR: Invalid module name: " + target, LOG_TYPE_ERROR);
        status = API_INVALID_PARAMETER;
    }

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "UPDATELOG: ACK back to sender, return status = " + oam.itoa(status));
    break;
}
case GETCONFIGLOG:
{
    // Request the log configuration of the module named by target.
    // An ACK is always sent.
    log.writeLog(__LINE__, "MSG RECEIVED: Get Log Configuation" );

    status = API_SUCCESS;

    // validate module name and make request
    bool moduleKnown = false;

    for ( unsigned int typeIdx = 0; typeIdx < systemmoduletypeconfig.moduletypeconfig.size(); typeIdx++)
    {
        // module types without any module are skipped
        if ( systemmoduletypeconfig.moduletypeconfig[typeIdx].ModuleCount == 0)
            continue;

        for ( DeviceNetworkList::iterator module = systemmoduletypeconfig.moduletypeconfig[typeIdx].ModuleNetworkList.begin();
                module != systemmoduletypeconfig.moduletypeconfig[typeIdx].ModuleNetworkList.end();
                ++module)
        {
            if (module->DeviceName != target)
                continue;

            status = processManager.getConfigLog(target);
            moduleKnown = true;
            break;
        }
    }

    if ( !moduleKnown )
    {
        log.writeLog(__LINE__, "ERROR: Invalid module name: " + target, LOG_TYPE_ERROR);
        status = API_INVALID_PARAMETER;
    }

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "GETCONFIGLOG: ACK back to sender, return status = " + oam.itoa(status));
    break;
}
case REINITPROCESS:
{
    // Re-initialise one process (target) on the module named in the request
    // payload, after pushing the config file to that module.
    log.writeLog(__LINE__, "MSG RECEIVED: Re-Init Process request..." );

    string moduleName;
    msg >> moduleName;

    //distribute config file
    processManager.distributeConfigFile(moduleName);

    status = processManager.reinitProcess(moduleName, target);
    log.writeLog(__LINE__, "REINITPROCESS: ACK received from Process-Monitor, return status = " + oam.itoa(status));

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "REINITPROCESS: ACK back to sender");
    break;
}
case UPDATECONFIG:
{
    // Distribute ProcessConfig.xml to the whole system and ask every module
    // to reload its process configuration. The last non-success status of a
    // module is what gets reported back.
    log.writeLog(__LINE__, "MSG RECEIVED: Update Process Configuation" );

    status = API_SUCCESS;

    //distribute update of process config file
    processManager.distributeConfigFile("system", "ProcessConfig.xml");

    for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
    {
        if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleCount == 0)
            continue;

        DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();

        for (; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
        {
            int retStatus = processManager.updateConfig((*pt).DeviceName);

            if (retStatus != API_SUCCESS)
                status = retStatus;
        }
    }

    // Actually send the ACK that the log line below announces, in the same
    // unconditional form used by the UPDATELOG and GETCONFIGLOG handlers.
    ackMsg << (ByteStream::byte) oam::ACK;
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << (ByteStream::byte) status;

    try
    {
        fIos.write(ackMsg);
    }
    catch (...) {}

    log.writeLog(__LINE__, "UPDATECONFIG: ACK back to sender, return status = " + oam.itoa(status));
    break;
}
case BUILDSYSTEMTABLES:
{
    // Forward a build-system-tables request to target and report the result.
    log.writeLog(__LINE__, "MSG RECEIVED: Send Build System Table request to " + target);

    status = processManager.buildSystemTables(target);
    log.writeLog(__LINE__, "BUILDSYSTEMTABLES: ACK received from Process-Monitor, return status = " + oam.itoa(status));

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "BUILDSYSTEMTABLES: ACK back to sender");
    break;
}
case ADDMODULE:
{
    // Decode the list of modules to add from the request and hand it to
    // processManager.addModule().
    // Payload order as read below: store-hostnames flag (1 byte), module
    // count, then per module: device name, user temp device name, disable
    // state, NIC count and per NIC: IP address, host name, NIC id; finally
    // the password.
    log.writeLog(__LINE__, "MSG RECEIVED: Add Module request");

    string field;
    uint16_t moduleTotal, nicId, nicTotal;
    uint8_t hostnameFlag;
    oam::DeviceNetworkConfig moduleConfig;
    oam::DeviceNetworkList moduleList;
    oam::HostConfig nicConfig;

    msg >> hostnameFlag;
    const bool storeHostnames = (hostnameFlag != 0);

    //get module count to add
    msg >> moduleTotal;

    if ( moduleTotal == 0 )
    {
        status = oam::API_INVALID_PARAMETER;
        log.writeLog(__LINE__, "ADDMODULE: Module Count invalid = " + oam.itoa(moduleTotal));
    }
    else
    {
        for (int moduleIdx = 0; moduleIdx < moduleTotal; ++moduleIdx)
        {
            msg >> field;
            moduleConfig.DeviceName = field;
            msg >> field;
            moduleConfig.UserTempDeviceName = field;
            msg >> field;
            moduleConfig.DisableState = field;

            msg >> nicTotal;

            for (int nicIdx = 0; nicIdx < nicTotal; ++nicIdx)
            {
                msg >> field;
                nicConfig.IPAddr = field;
                msg >> field;
                nicConfig.HostName = field;
                msg >> nicId;
                nicConfig.NicID = nicId;
                moduleConfig.hostConfigList.push_back(nicConfig);
            }

            moduleList.push_back(moduleConfig);
            // the same config object is reused for the next module
            moduleConfig.hostConfigList.clear();
        }

        string password;
        msg >> password;

        status = processManager.addModule(moduleList, password, storeHostnames);
        log.writeLog(__LINE__, "ADDMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
    }

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    // NOTE(review): unlike most other handlers this ACK carries no target and
    // writes status without a byte cast - confirm against the receiver.
    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << status;

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "ADDMODULE: ACK back to sender");
    break;
}
case REMOVEMODULE:
{
    // Decode the list of modules to remove from the request and hand it to
    // processManager.removeModule().
    log.writeLog(__LINE__, "MSG RECEIVED: Remove Module request");

    uint16_t moduleTotal, hostConfigTotal;
    string field;
    oam::DeviceNetworkConfig moduleConfig;
    oam::DeviceNetworkList moduleList;

    //get module count to remove
    msg >> moduleTotal;

    if ( moduleTotal == 0 )
    {
        status = oam::API_INVALID_PARAMETER;
        log.writeLog(__LINE__, "REMOVEMODULE: Module Count invalid = " + oam.itoa(moduleTotal));
    }
    else
    {
        for (int moduleIdx = 0; moduleIdx < moduleTotal; ++moduleIdx)
        {
            msg >> field;
            moduleConfig.DeviceName = field;
            msg >> field;
            moduleConfig.UserTempDeviceName = field;
            msg >> field;
            moduleConfig.DisableState = field;
            moduleList.push_back(moduleConfig);

            // the host config count is consumed from the stream but not used
            msg >> hostConfigTotal;
        }

        // the password is consumed from the stream but not used here
        string password;
        msg >> password;

        status = processManager.removeModule(moduleList);
        log.writeLog(__LINE__, "REMOVEMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
        log.writeLog(__LINE__, "Remove Module Completed", LOG_TYPE_INFO);
    }

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << status;

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "REMOVEMODULE: ACK back to sender");
    break;
}
case RECONFIGUREMODULE:
{
    // Decode the list of modules to reconfigure from the request and hand it
    // to processManager.reconfigureModule().
    log.writeLog(__LINE__, "MSG RECEIVED: Reconfigure Module request");

    string field;
    uint16_t moduleTotal, nicId, nicTotal;
    oam::DeviceNetworkConfig moduleConfig;
    oam::DeviceNetworkList moduleList;
    oam::HostConfig nicConfig;

    //get module count
    msg >> moduleTotal;

    if ( moduleTotal == 0 )
    {
        status = oam::API_INVALID_PARAMETER;
        log.writeLog(__LINE__, "RECONFIGUREMODULE: Module Count invalid = " + oam.itoa(moduleTotal));
    }
    else
    {
        for (int moduleIdx = 0; moduleIdx < moduleTotal; ++moduleIdx)
        {
            msg >> field;
            moduleConfig.DeviceName = field;
            msg >> field;
            moduleConfig.UserTempDeviceName = field;
            msg >> field;
            moduleConfig.DisableState = field;

            msg >> nicTotal;

            for (int nicIdx = 0; nicIdx < nicTotal; ++nicIdx)
            {
                msg >> field;
                nicConfig.IPAddr = field;
                msg >> field;
                nicConfig.HostName = field;
                msg >> nicId;
                nicConfig.NicID = nicId;
                moduleConfig.hostConfigList.push_back(nicConfig);
            }

            moduleList.push_back(moduleConfig);
            // the same config object is reused for the next module
            moduleConfig.hostConfigList.clear();
        }

        // the password is consumed from the stream but not used here
        string password;
        msg >> password;

        status = processManager.reconfigureModule(moduleList);
        log.writeLog(__LINE__, "RECONFIGUREMODULE: ACK received from Process-Monitor, return status = " + oam.itoa(status));
    }

    // Nothing more to do unless the sender asked for an acknowledgement.
    if (!ackIndicator)
        break;

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << status;

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    log.writeLog(__LINE__, "RECONFIGUREMODULE: ACK back to sender");
    break;
}
case STOPPROCESSTYPE:
{
    // Stop every process of the given type. No ACK is sent for this request.
    log.writeLog(__LINE__, "MSG RECEIVED: Stop Process Type request: " + target);

    if ( target != "DBRM" )
    {
        processManager.stopProcessType(target);
    }
    else
    {
        // "DBRM" stands for both the controller and the worker node types
        processManager.stopProcessType("DBRMControllerNode");
        processManager.stopProcessType("DBRMWorkerNode");
    }

    log.writeLog(__LINE__, "Stop Process Type Completed", LOG_TYPE_INFO );
    break;
}
case STARTPROCESSTYPE:
{
    // Start every process of the given type and refresh the process types
    // that depend on it. No ACK is sent for this request.
    log.writeLog(__LINE__, "MSG RECEIVED: Start Process Type request: " + target);

    if ( target != "DBRM" )
    {
        processManager.startProcessType(target);
    }
    else
    {
        // "DBRM" stands for both the controller and the worker node types
        processManager.startProcessType("DBRMControllerNode");
        processManager.startProcessType("DBRMWorkerNode");
    }

    const bool primProcStarted = (target == "PrimProc");
    const bool writeEngineStarted = (target.find("WriteEngineServer") == 0);

    if (primProcStarted)
    {
        // PrimProc: redistribute the config file, then reinit
        // WriteEngineServer, restart ExeMgr and reinit DDL/DMLProc.
        processManager.distributeConfigFile("system");
        processManager.reinitProcessType("WriteEngineServer");
        processManager.restartProcessType("ExeMgr");
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }

    if (writeEngineStarted)
    {
        // WriteEngineServer: reinit DDL/DMLProc.
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }

    log.writeLog(__LINE__, "Start Process Type Completed", LOG_TYPE_INFO );
    break;
}
case RESTARTPROCESSTYPE:
{
    // Restart every process of the given type and refresh the process types
    // that depend on it. No ACK is sent for this request.
    log.writeLog(__LINE__, "MSG RECEIVED: Restart Process Type request: " + target);

    const bool isDbrm = (target == "DBRM");

    if (isDbrm)
    {
        // "DBRM" stands for both the controller and the worker node types
        processManager.restartProcessType("DBRMControllerNode");
        processManager.restartProcessType("DBRMWorkerNode");
    }
    else
    {
        processManager.restartProcessType(target);
    }

    if ( !isDbrm && target == "PrimProc" )
    {
        // PrimProc: redistribute the config file, then reinit
        // WriteEngineServer, restart ExeMgr and reinit DDL/DMLProc.
        processManager.distributeConfigFile("system");
        processManager.reinitProcessType("WriteEngineServer");
        processManager.restartProcessType("ExeMgr");
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }

    if ( target.find("WriteEngineServer") == 0)
    {
        // WriteEngineServer: reinit DDL/DMLProc.
        processManager.reinitProcessType("DDLProc");
        processManager.reinitProcessType("DMLProc");
    }

    log.writeLog(__LINE__, "Restart Process Type Completed", LOG_TYPE_INFO );
    break;
}
case REINITPROCESSTYPE:
{
    // Re-initialise every process of the given type; ACK only on request.
    log.writeLog(__LINE__, "MSG RECEIVED: Reinit Process Type request: " + target);

    status = processManager.reinitProcessType(target);

    if (ackIndicator)
    {
        ackMsg << static_cast<ByteStream::byte>(oam::ACK);
        ackMsg << actionType;
        ackMsg << target;
        ackMsg << static_cast<ByteStream::byte>(status);

        try
        {
            fIos.write(ackMsg);
        }
        catch (...)
        {
            // a failed ACK write is ignored
        }
    }

    log.writeLog(__LINE__, "Reinit Process Type Completed, return status = " + oam.itoa(status));
    break;
}
case DISTRIBUTECONFIG:
{
    // Distribute the config file named in the request payload to target.
    // The ACK, when requested, always reports API_SUCCESS.
    string fileName;
    msg >> fileName;

    log.writeLog(__LINE__, "MSG RECEIVED: Distribute Config File " + target + "/" + fileName);

    processManager.distributeConfigFile(target, fileName);

    if (ackIndicator)
    {
        ackMsg << static_cast<ByteStream::byte>(oam::ACK);
        ackMsg << actionType;
        ackMsg << target;
        ackMsg << static_cast<ByteStream::byte>(oam::API_SUCCESS);

        try
        {
            fIos.write(ackMsg);
        }
        catch (...)
        {
            // a failed ACK write is ignored
        }
    }

    log.writeLog(__LINE__, "Distribute Config File Completed " + target + "/" + fileName);
    break;
}
case SWITCHOAMPARENT:
{
    // Switch the OAM parent role to the module named by target, optionally
    // waiting for open transactions first, then stop this ProcessManager.
    log.writeLog(__LINE__, "MSG RECEIVED: Switch OAM Parent to : " + target);

    // GRACEFUL_WAIT means that we are shutting down, but waiting for
    // all transactions to finish or rollback as commanded. This is only set if
    // there are, in fact, transactions active (or cpimport).
    if (graceful == GRACEFUL_WAIT)
    {
        ByteStream stillWorkingMsg;
        stillWorkingMsg << (ByteStream::byte) oam::ACK;
        stillWorkingMsg << actionType;
        stillWorkingMsg << target;
        stillWorkingMsg << (ByteStream::byte) API_STILL_WORKING;

        // This wait can take a while. We wait for table locks to release and open transactions to commit.
        // NOTE(review): the wait is requested with RESTARTSYSTEM rather than
        // SWITCHOAMPARENT - kept as is, confirm this is intentional.
        if (oam.waitForSystem(RESTARTSYSTEM, fIos, stillWorkingMsg))
        {
            graceful = GRACEFUL; // ProcMonitor doesn't know GRACEFUL_WAIT.
            // Send an ack back to say we're done waiting and are now shutting down.
            ackMsg << (ByteStream::byte) oam::ACK;
            ackMsg << actionType;
            ackMsg << target;
            ackMsg << (ByteStream::byte) API_TRANSACTIONS_COMPLETE;

            try
            {
                fIos.write(ackMsg);
            }
            catch (...) {}

            log.writeLog(__LINE__, "SWITCHOAMPARENT: ACK transactions complete back to sender, return status = " + oam.itoa(API_TRANSACTIONS_COMPLETE));
        }
        else
        {
            // We've been cancelled.
            if (ackIndicator)
            {
                ackMsg << (ByteStream::byte) oam::ACK;
                ackMsg << actionType;
                ackMsg << target;
                ackMsg << (ByteStream::byte) API_CANCELLED;

                try
                {
                    fIos.write(ackMsg);
                }
                catch (...) {}

                log.writeLog(__LINE__, "SWITCHOAMPARENT: ACK back to sender (canceled)");
            }

            // A cancelled request must never switch the parent, whether or
            // not the sender asked for an ACK.
            break;
        }
    }

    status = processManager.switchParentOAMModule(target);
    log.writeLog(__LINE__, "Switch OAM Parent Completed", LOG_TYPE_INFO );

    // ackMsg may still hold the TRANSACTIONS_COMPLETE ack written above;
    // start from an empty buffer so only the final status is sent.
    ackMsg.reset();
    ackMsg << (ByteStream::byte) oam::ACK;
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << (ByteStream::byte) status;

    try
    {
        fIos.write(ackMsg);
    }
    catch (...) {}

    // stop myself
    processManager.stopProcess(config.moduleName(), "ProcessManager", oam::FORCEFUL, true);

    break;
}
case UNMOUNT:
{
    // Unmount the dbroot named by target; an ACK is always sent.
    log.writeLog(__LINE__, "MSG RECEIVED: Unmount dbroot : " + target);

    status = processManager.unmountDBRoot(target);

    log.writeLog(__LINE__, "UnMount Completed status: " + oam.itoa(status) );

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    break;
}
case MOUNT:
{
    // Mount the dbroot named by target; an ACK is always sent.
    log.writeLog(__LINE__, "MSG RECEIVED: mount dbroot : " + target);

    status = processManager.mountDBRoot(target);

    log.writeLog(__LINE__, "Mount Completed status: " + oam.itoa(status) );

    ackMsg << static_cast<ByteStream::byte>(oam::ACK);
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << static_cast<ByteStream::byte>(status);

    try
    {
        fIos.write(ackMsg);
    }
    catch (...)
    {
        // a failed ACK write is ignored
    }

    break;
}
case SUSPENDWRITES:
{
    // Suspend database writes: optionally wait for open transactions, mark
    // the system suspended, save the BRM and, for the storagemanager storage
    // type, sync the file systems of all enabled pm modules. Exactly one
    // final ACK carrying the overall result is sent.
    ByteStream::byte ackResponse = API_FAILURE;
    log.writeLog(__LINE__, "MSG RECEIVED: suspend database writes");
    string storageType = Config::makeConfig()->getConfig("Installation", "DBRootStorageType");

    // GRACEFUL_WAIT means that we are Suspending writes, but waiting for all
    // transactions to finish or rollback as commanded. This is only set if there
    // are, in fact, transactions active (or cpimport).
    if (graceful == GRACEFUL_WAIT)
    {
        ByteStream stillWorkingMsg;
        stillWorkingMsg << (ByteStream::byte) oam::ACK;
        stillWorkingMsg << actionType;
        stillWorkingMsg << target;
        stillWorkingMsg << (ByteStream::byte) API_STILL_WORKING;

        // This wait can take a while. We wait for table locks to release and open transactions to commit.
        if (oam.waitForSystem(SUSPENDWRITES, fIos, stillWorkingMsg))
        {
            graceful = GRACEFUL; // ProcMonitor doesn't know GRACEFUL_WAIT.
            // Send an ack back to say we're done waiting and are now shutting down.
            ackMsg << (ByteStream::byte) oam::ACK;
            ackMsg << actionType;
            ackMsg << target;
            ackMsg << (ByteStream::byte) API_TRANSACTIONS_COMPLETE;

            try
            {
                fIos.write(ackMsg);
            }
            catch (...) {}

            log.writeLog(__LINE__, "SUSPENDWRITES: ACK transactions complete back to sender, return status = " + oam.itoa(API_TRANSACTIONS_COMPLETE));
        }
        else
        {
            // We've been cancelled.
            if (ackIndicator)
            {
                ackMsg << (ByteStream::byte) oam::ACK;
                ackMsg << actionType;
                ackMsg << target;
                ackMsg << (ByteStream::byte) API_CANCELLED;

                try
                {
                    fIos.write(ackMsg);
                }
                catch (...) {}

                log.writeLog(__LINE__, "SUSPENDWRITES: ACK back to sender (canceled)");
            }

            // A cancelled request must never suspend writes, whether or not
            // the sender asked for an ACK.
            break;
        }
    }

    BRM::DBRM dbrm;
    dbrm.setSystemSuspended(true);
    // Wait for everything to settle down
    sleep(5);

    // Save the BRM. This command presages a system backup. Best to have a current BRM on disk
    string logdir("/var/log/mariadb/columnstore");

    // Fall back to the temporary log directory when the default is not writable.
    if (access(logdir.c_str(), W_OK) != 0) logdir = tmpLogDir;

    string cmd = "save_brm > " + logdir + "/save_brm.log1 2>&1";
    int rtnCode = system(cmd.c_str());

    if (WEXITSTATUS(rtnCode) == 0)
    {
        ackResponse = API_SUCCESS;
    }
    else
    {
        // save_brm failed: report a DB error and lift the suspension again.
        ackResponse = API_FAILURE_DB_ERROR;
        dbrm.setSystemSuspended(false);
    }

    if (storageType == "storagemanager")
    {
        // Set once a module fails to sync; stops both loops so that the
        // failure is reported through the single final ACK below.
        bool syncFailed = false;

        //sync fs on all pm nodes if up
        for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size() && !syncFailed; i++)
        {
            if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType != "pm" )
                continue;

            if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleCount == 0)
                continue;

            DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();

            for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
            {
                // Defaults to ACTIVE so a failed lookup does not skip the module.
                int opState = oam::ACTIVE;
                bool degraded = false;

                try
                {
                    oam.getModuleStatus((*pt).DeviceName, opState, degraded);
                }
                catch (...)
                {
                    log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": Caught unknown exception!", LOG_TYPE_ERROR);
                }

                if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
                    continue;

                int returnStatus = processManager.syncFsAll( (*pt).DeviceName );

                if (returnStatus != API_SUCCESS)
                {
                    // NOTE(review): as before, the suspended flag is left
                    // untouched on a sync failure - confirm that is intended.
                    ackResponse = API_FAILURE;
                    syncFailed = true;
                    log.writeLog(__LINE__, "SUSPENDWRITES: API_FAILURE filestemSync() on module " + (*pt).DeviceName,LOG_TYPE_ERROR);
                    break;
                }
            }
        }
    }

    // ackMsg may still hold the TRANSACTIONS_COMPLETE ack written above.
    ackMsg.reset();
    ackMsg << (ByteStream::byte) oam::ACK;
    ackMsg << actionType;
    ackMsg << target;
    ackMsg << ackResponse;

    try
    {
        fIos.write(ackMsg);
    }
    catch (...) {}

    log.writeLog(__LINE__, "SUSPENDWRITES: ACK back to sender, return status = " + oam.itoa(ackResponse));
    break;
}
case FSTABUPDATE:
{
log.writeLog(__LINE__, "MSG RECEIVED: Distribute Fstab update" );
//get fstab entry
string entry;
msg >> entry;
status = API_SUCCESS;
if ( target == "system" )
{
//send out to all pms except local module
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType != "pm" )
continue;
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for (; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
if ( (*pt).DeviceName == config.moduleName() )
continue;
int retStatus = processManager.updateFstab((*pt).DeviceName, entry);
if (retStatus != API_SUCCESS)
status = retStatus;
}
}
}
else
{
int retStatus = processManager.updateFstab(target, entry);
if (retStatus != API_SUCCESS)
status = retStatus;
}
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
log.writeLog(__LINE__, "FSTABUPDATE: ACK back to sender, return status = " + oam.itoa(status));
break;
}
case ENABLEMYSQLREP:
{
log.writeLog(__LINE__, "MSG RECEIVED: Enable MySQL Replication");
// target = root password
oam::DeviceNetworkList devicenetworklist;
status = processManager.setMySQLReplication(devicenetworklist, oam::UnassignedName, true, target);
log.writeLog(__LINE__, "Enable MySQL Replication status: " + oam.itoa(status) );
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
break;
}
case DISABLEMYSQLREP:
{
log.writeLog(__LINE__, "MSG RECEIVED: Disable MySQL Replication");
// target = root password
oam::DeviceNetworkList devicenetworklist;
status = processManager.setMySQLReplication(devicenetworklist, oam::UnassignedName, false, target, false);
log.writeLog(__LINE__, "Disable MySQL Replication status: " + oam.itoa(status) );
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
break;
}
case GLUSTERASSIGN:
{
string dbroot;
msg >> dbroot;
log.writeLog(__LINE__, "MSG RECEIVED: Gluster Assign DBRoot: " + dbroot);
status = processManager.glusterAssign(target, dbroot);
log.writeLog(__LINE__, "Gluster Assign DBRoot status: " + oam.itoa(status) );
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
break;
}
case GLUSTERUNASSIGN:
{
string dbroot;
msg >> dbroot;
log.writeLog(__LINE__, "MSG RECEIVED: Gluster Unassign DBRoot: " + dbroot);
status = processManager.glusterUnassign(target, dbroot);
log.writeLog(__LINE__, "Gluster Unassign DBRoot status: " + oam.itoa(status) );
ackMsg << (ByteStream::byte) oam::ACK;
ackMsg << actionType;
ackMsg << target;
ackMsg << (ByteStream::byte) status;
try
{
fIos.write(ackMsg);
}
catch (...) {}
break;
}
default:
log.writeLog(__LINE__, "MSG RECEIVED: Invalid type" );
break;
}
break;
case HEARTBEAT_REGISTER:
{
string moduleName;
string processName;
ByteStream::byte id;
msg >> moduleName;
msg >> processName;
msg >> id;
HeartBeatProc hbproc;
hbproc.ModuleName = moduleName;
hbproc.ProcessName = processName;
hbproc.ID = id;
hbproc.receiveFlag = true;
HeartBeatProcList::iterator list = hbproclist.begin();
for ( ; list != hbproclist.end() ; list++)
{
if ( (*list).ModuleName == moduleName
&& (*list).ProcessName == processName
&& (*list).ID == id)
{
// already in the list
break;
}
}
if ( list == hbproclist.end() )
{
// add to list
hbproclist.push_front(hbproc);
log.writeLog(__LINE__, "Adding Process to Heartbeat Monitor list: " + moduleName + " / " + processName + " / " + oam.itoa(id));
}
}
break;
case HEARTBEAT_DEREGISTER:
{
string moduleName;
string processName;
ByteStream::byte id;
msg >> moduleName;
msg >> processName;
msg >> id;
HeartBeatProcList::iterator list = hbproclist.begin();
for ( ; list != hbproclist.end() ; list++)
{
if ( (*list).ModuleName == moduleName
&& (*list).ProcessName == processName
&& (*list).ID == id)
{
hbproclist.erase(list);
log.writeLog(__LINE__, "Removing Process from Heartbeat Monitor list: " + moduleName + " / " + processName + " / " + oam.itoa(id));
break;
}
}
}
break;
case HEARTBEAT_SEND:
{
string moduleName;
string processName;
string timeStamp;
ByteStream::byte id;
ByteStream::byte ackFlag;
msg >> moduleName;
msg >> processName;
msg >> timeStamp;
msg >> id;
msg >> ackFlag;
if ( ackFlag == oam::ACK_YES )
{
// send back an ack msg
ackMsg << (ByteStream::byte) HEARTBEAT_SEND;
try
{
fIos.write(ackMsg);
}
catch (...) {}
//log.writeLog(__LINE__, "Heartbeat Ack message sent", LOG_TYPE_DEBUG);
}
HeartBeatProcList::iterator list = hbproclist.begin();
for ( ; list != hbproclist.end() ; list++)
{
if ( (*list).ModuleName == moduleName
&& (*list).ProcessName == processName
&& (*list).ID == id)
{
(*list).receiveFlag = true;
//log.writeLog(__LINE__, "Heartbeat Received: " + moduleName + " / " + processName + " / " + oam.itoa(id) + ", timestamp: " + timeStamp, LOG_TYPE_DEBUG);
break;
}
}
if ( list == hbproclist.end() )
{
// not found, add to list
HeartBeatProc hbproc;
hbproc.ModuleName = moduleName;
hbproc.ProcessName = processName;
hbproc.ID = id;
hbproc.receiveFlag = true;
hbproclist.push_front(hbproc);
log.writeLog(__LINE__, "Adding Process to Heartbeat Monitor list: " + moduleName + " / " + processName + " / " + oam.itoa(id));
}
}
break;
case PROCESSRESTART:
{
string moduleName;
string processName;
ByteStream::byte manual;
msg >> moduleName;
msg >> processName;
msg >> manual;
log.writeLog(__LINE__, "MSG RECEIVED: Process Restarted on " + moduleName + "/" + processName);
//set query system states not ready
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport");
//request reinit after Process is active
for ( int i = 0; i < 10 ; i++ ) {
try {
ProcessStatus procstat;
oam.getProcessStatus(processName, moduleName, procstat);
if (procstat.ProcessOpState == oam::COLD_STANDBY)
break;
if ( (procstat.ProcessOpState == oam::ACTIVE) ||
(procstat.ProcessOpState == oam::STANDBY) ) {
// if a PrimProc was restarted, reinit ACTIVE ExeMgr(s) and DDL/DMLProc
if ( processName == "PrimProc")
{
//distribute config file
processManager.distributeConfigFile("system");
processManager.reinitProcessType("WriteEngineServer");
processManager.restartProcessType("ExeMgr");
processManager.reinitProcessType("DDLProc");
processManager.reinitProcessType("DMLProc");
}
// if a WriteEngineServer was restarted, restart DDL/DMLProc
if ( processName == "WriteEngineServer")
{
processManager.reinitProcessType("DDLProc");
processManager.reinitProcessType("DMLProc");
}
// if a ControllerNode was restarted, restart DMLProc
if ( processName == "DBRMControllerNode")
{
// sleep(5);
// processManager.reinitProcessType("DBRMWorkerNode");
// Wait for DBRMControllerNode to go active
ProcessStatus procstat;
uint16_t state = AUTO_OFFLINE;
while (state == oam::MAN_OFFLINE
|| state == oam::AUTO_OFFLINE
|| state == oam::MAN_INIT
|| state == oam::AUTO_INIT)
{
oam.getProcessStatus("DBRMControllerNode", config.OAMParentName(), procstat);
state = procstat.ProcessOpState;
if ( procstat.ProcessOpState == oam::ACTIVE)
break;
sleep(1);
}
processManager.restartProcessType("DDLProc");
processManager.restartProcessType("DMLProc");
sleep(1);
string DMLmodule = config.OAMParentName();
if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM )
{
string PrimaryUMModuleName;
try
{
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
}
catch (...) {}
if ( !PrimaryUMModuleName.empty() )
DMLmodule = PrimaryUMModuleName;
}
// Wait for DMLProc to be ACTIVE
BRM::DBRM dbrm;
state = AUTO_OFFLINE;
while (state == oam::MAN_OFFLINE
|| state == oam::AUTO_OFFLINE
|| state == oam::MAN_INIT
|| state == oam::AUTO_INIT
|| state == oam::ROLLBACK_INIT)
{
oam.getProcessStatus("DMLProc", DMLmodule, procstat);
state = procstat.ProcessOpState;
if ( procstat.ProcessOpState == oam::ACTIVE)
break;
sleep(1);
}
processManager.setQuerySystemState(true);
}
// if a DDLProc was restarted, restart DMLProc
if ( processName == "DDLProc")
{
processManager.reinitProcessType("DMLProc");
//set query system states ready
processManager.setQuerySystemState(true);
processManager.setSystemState(oam::ACTIVE);
}
//only run on auto process restart
if (manual == 0 )
{
//get dbhealth flag
string DBHealthMonitorFlag = "n";
string DBFunctionalMonitorFlag;
try
{
oam.getSystemConfig( "DBHealthMonitorFlag", DBHealthMonitorFlag);
}
catch (...)
{
DBHealthMonitorFlag = "n";
}
//check the db health
if (DBHealthMonitorFlag == "y" )
{
log.writeLog(__LINE__, "Call the check DB Health API", LOG_TYPE_DEBUG);
try
{
oam.checkDBFunctional();
log.writeLog(__LINE__, "check DB Health passed", LOG_TYPE_DEBUG);
}
catch (...)
{
log.writeLog(__LINE__, "check DB Health FAILED", LOG_TYPE_ERROR);
}
}
}
break;
}
sleep(1);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
break;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
break;
}
}
//set query system states ready
processManager.setQuerySystemState(true);
processManager.setSystemState(oam::ACTIVE);
log.writeLog(__LINE__, "MSG RECEIVED: Process Restarted Completed");
}
break;
case GETDBRMDATA:
{
log.writeLog(__LINE__, "MSG RECEIVED: Get DBRM Data Files");
string moduleName;
msg >> moduleName;
int ret = processManager.getDBRMData(fIos, moduleName);
if ( ret == oam::API_SUCCESS )
log.writeLog(__LINE__, "Get DBRM Data Files Completed");
else
log.writeLog(__LINE__, "Get DBRM Data Files Failed");
}
break;
case GETALARMDATA:
{
log.writeLog(__LINE__, "MSG RECEIVED: Get Alarm Data Files");
string date;
msg >> date;
processManager.getAlarmData(fIos, GETALARMDATA, date);
log.writeLog(__LINE__, "Get Alarm Data Files Completed");
}
break;
case GETACTIVEALARMDATA:
{
// log.writeLog(__LINE__, "MSG RECEIVED: Get Active Alarm Data Files");
//pull off, but don't need
string date;
msg >> date;
processManager.getAlarmData(fIos, GETACTIVEALARMDATA, "");
// log.writeLog(__LINE__, "Get Active Alarm Data Files Completed");
}
break;
default:
break;
}
sleep(5);
fIos.close();
pthread_detach (ThreadId);
pthread_exit(0);
return NULL;
}
/******************************************************************************************
* @brief getAlarmData
*
* purpose: get Alarm Data and send to requester
*
******************************************************************************************/
int ProcessManager::getAlarmData(messageqcpp::IOSocket fIos, int type, std::string date)
{
    // Collect alarms and stream them back to the requester over fIos.
    //
    // fIos : socket the reply is written to
    // type : GETALARMDATA selects the alarms returned by getAlarm() for 'date';
    //        any other value selects the active alarms ('date' is then unused)
    // date : forwarded to ALARMManager::getAlarm() for GETALARMDATA requests
    //
    // Returns oam::API_FAILURE (after attempting to send a failure ACK) when the
    // alarm lookup throws, oam::API_SUCCESS otherwise. Errors while writing to
    // the socket are swallowed and do not change the return value.
    ByteStream reply;
    Oam oam;
    int returnStatus = oam::API_SUCCESS;
    AlarmList alarms;

    try
    {
        ALARMManager alarmSource;

        if ( type == GETALARMDATA )
            alarmSource.getAlarm(date, alarms);
        else
            alarmSource.getActiveAlarm(alarms);
    }
    catch (...)
    {
        // Lookup failed: report the failure to the requester and give up.
        reply << (ByteStream::byte) oam::ACK;
        reply << (ByteStream::byte) type;
        reply << (ByteStream::byte) oam::API_FAILURE;

        try
        {
            fIos.write(reply);
        }
        catch (...) {}

        return oam::API_FAILURE;
    }

    // Reply header: ACK, request type, status, then the number of alarms.
    reply << (ByteStream::byte) oam::ACK;
    reply << (ByteStream::byte) type;
    reply << (ByteStream::byte) oam::API_SUCCESS;
    // NOTE(review): the count is cast to ByteStream::byte before being sent;
    // if that is an 8-bit type, more than 255 alarms would wrap - confirm
    // against the receiving side before changing.
    reply << (ByteStream::byte) alarms.size();

    // One record per alarm, fields in the order the receiver reads them.
    for (AlarmList::iterator entry = alarms.begin(); entry != alarms.end(); ++entry)
    {
        reply << (ByteStream::doublebyte) entry->second.getAlarmID();
        reply << entry->second.getDesc();
        reply << (ByteStream::doublebyte) entry->second.getSeverity();
        reply << entry->second.getTimestamp();
        reply << entry->second.getSname();
        reply << entry->second.getPname();
        reply << entry->second.getComponentID();
    }

    try
    {
        fIos.write(reply);
    }
    catch (...) {}

    return returnStatus;
}
/******************************************************************************************
* @brief buildRequestMessage
*
* purpose: Build a request message
*
******************************************************************************************/
ByteStream ProcessManager::buildRequestMessage(ByteStream::byte requestID,
        ByteStream::byte actionIndicator, string processName, bool manualFlag)
{
    // Assemble a REQUEST message. Field order on the wire:
    //   REQUEST marker, requestID, actionIndicator,
    //   processName (only when non-empty), manualFlag as a byte.
    ByteStream request;

    request << (ByteStream::byte) REQUEST;
    request << requestID;
    request << actionIndicator;

    // An empty process name is omitted from the message entirely.
    if ( !processName.empty() )
        request << processName;

    request << (ByteStream::byte) manualFlag;

    return request;
}
/******************************************************************************************
* @brief startModule
*
* purpose: Start all processes on the specified module
*
******************************************************************************************/
int ProcessManager::startModule(string target, messageqcpp::ByteStream::byte actionIndicator, uint16_t startType, bool systemStart)
{
    // Send a STARTALL request to the module's Process Monitor and update the
    // module state / alarms according to the outcome.
    //
    // target          : module to start
    // actionIndicator : forwarded in the request message
    // startType       : oam::MAN_OFFLINE marks the module MAN_INIT while
    //                   starting, anything else marks it AUTO_INIT
    // systemStart     : when true, failures other than API_FAILURE /
    //                   API_FAILURE_DB_ERROR do not mark the module FAILED
    //
    // Returns the status reported by sendMsgProcMon().
    const ByteStream::byte requestID = STARTALL;
    Oam oam;

    if ( startType == oam::MAN_OFFLINE )
        setModuleState(target, oam::MAN_INIT);
    else
        setModuleState(target, oam::AUTO_INIT);

    ByteStream request = buildRequestMessage(requestID, actionIndicator, string());
    const int returnStatus = sendMsgProcMon( target, request, requestID );

    if ( returnStatus != API_SUCCESS )
    {
        const bool hardFailure = ( returnStatus == oam::API_FAILURE
                                   || returnStatus == API_FAILURE_DB_ERROR );

        if ( hardFailure || !systemStart )
            setModuleState(target, oam::FAILED);

        //log the event
        log.writeLog(__LINE__, target + " module failed to start!!", LOG_TYPE_DEBUG);
        return returnStatus;
    }

    setModuleState(target, oam::ACTIVE);

    //log the event
    log.writeLog(__LINE__, target + " module is started by request.", LOG_TYPE_DEBUG);

    // The module is up again: clear both flavours of the module-down alarm.
    ALARMManager alarmSender;
    alarmSender.sendAlarmReport(target.c_str(), MODULE_DOWN_MANUAL, CLEAR);
    alarmSender.sendAlarmReport(target.c_str(), MODULE_DOWN_AUTO, CLEAR);

    return returnStatus;
}
/******************************************************************************************
* @brief stopModule
*
* purpose: Stop all processes on the specified module
*
******************************************************************************************/
int ProcessManager::stopModule(string target, ByteStream::byte actionIndicator, bool manualFlag, int timeout)
{
    // Send a STOPALL request to the module's Process Monitor.
    //
    // target          : module to stop
    // actionIndicator : forwarded in the request; INSTALL on the OAM parent
    //                   module and STATUS_UPDATE get special handling below
    // manualFlag      : selects the manual vs. automatic state / alarm
    // timeout         : forwarded to sendMsgProcMon()
    //
    // Returns the status reported by sendMsgProcMon().
    Configuration config;
    ProcessManager processManager(config, log);

    const ByteStream::byte requestID = STOPALL;
    ByteStream request = buildRequestMessage(requestID, actionIndicator, string(), manualFlag);

    const string msgPort = target + "_ProcessMonitor";

    const bool stoppingParentForInstall =
        ( actionIndicator == INSTALL && target == config.OAMParentName() );

    log.writeLog(__LINE__, target + " module is stopped by request.", LOG_TYPE_DEBUG);

    if ( stoppingParentForInstall )
    {
        // Process Manager will be taken down, do the updates now:
        // mark the module offline and raise the module-down alarm up front.
        if ( manualFlag )
            setModuleState(target, oam::MAN_OFFLINE);
        else
            setModuleState(target, oam::AUTO_OFFLINE);

        ALARMManager earlyAlarm;
        earlyAlarm.sendAlarmReport(target.c_str(),
                                   manualFlag ? MODULE_DOWN_MANUAL : MODULE_DOWN_AUTO,
                                   SET);
    }
    else if ( manualFlag )
    {
        setModuleState(target, oam::MAN_INIT);
    }
    else
    {
        setModuleState(target, oam::AUTO_INIT);
    }

    const int returnStatus = sendMsgProcMon( target, request, requestID, timeout );

    // STATUS_UPDATE requests produce no further logging or alarms.
    if ( actionIndicator == STATUS_UPDATE )
        return returnStatus;

    if ( returnStatus != API_SUCCESS )
    {
        //log the event
        log.writeLog(__LINE__, target + " module failed to stop!!", LOG_TYPE_WARNING);
        return returnStatus;
    }

    //Issue an alarm, log the event
    log.writeLog(__LINE__, target + " module is successfully stopped.", LOG_TYPE_DEBUG);

    ALARMManager stoppedAlarm;
    stoppedAlarm.sendAlarmReport(target.c_str(),
                                 manualFlag ? MODULE_DOWN_MANUAL : MODULE_DOWN_AUTO,
                                 SET);

    return returnStatus;
}
/******************************************************************************************
* @brief shutdownModule
*
* purpose: power off the specified module,
*
******************************************************************************************/
int ProcessManager::shutdownModule(string target, ByteStream::byte actionIndicator, bool manualFlag, int timeout)
{
    // Send a SHUTDOWNMODULE request to the module's Process Monitor.
    //
    // On success the module and all of its processes are marked MAN_OFFLINE or
    // AUTO_OFFLINE (per manualFlag) and the matching module-down alarm is set.
    // On failure the module is marked FAILED.
    //
    // Returns the status reported by sendMsgProcMon().
    const ByteStream::byte requestID = SHUTDOWNMODULE;
    ByteStream request = buildRequestMessage(requestID, actionIndicator, string(), manualFlag);

    const int returnStatus = sendMsgProcMon( target, request, requestID, timeout );

    if ( returnStatus != API_SUCCESS )
    {
        setModuleState(target, oam::FAILED);

        //log the event
        log.writeLog(__LINE__, target + " module failed to shutdown!!", LOG_TYPE_WARNING);
        return returnStatus;
    }

    //Issue an alarm, log the event
    log.writeLog(__LINE__, target + " module is shutdown by request.", LOG_TYPE_DEBUG);

    if ( manualFlag )
    {
        setModuleState(target, oam::MAN_OFFLINE);
        //mark all processes running on module man-offline
        setProcessStates(target, oam::MAN_OFFLINE);
    }
    else
    {
        setModuleState(target, oam::AUTO_OFFLINE);
        //mark all processes running on module auto-offline
        setProcessStates(target, oam::AUTO_OFFLINE);
    }

    //Issue an alarm
    ALARMManager alarmSender;
    alarmSender.sendAlarmReport(target.c_str(),
                                manualFlag ? MODULE_DOWN_MANUAL : MODULE_DOWN_AUTO,
                                SET);

    return returnStatus;
}
/******************************************************************************************
* @brief disableModule
*
* purpose: Set the Disable State on a specified module
*
******************************************************************************************/
int ProcessManager::disableModule(string target, bool manualFlag)
{
// Put a module into the MAN_DISABLED (manualFlag == true) or AUTO_DISABLED
// (manualFlag == false) state and propagate the change.
//
// Returns API_SUCCESS when:
//   - the module is already in the requested state or already MAN_DISABLED
//     (nothing is changed), or
//   - an AUTO_DISABLED module is upgraded to MAN_DISABLED, or
//   - the full disable sequence below completes.
// Returns API_FAILURE when updatePMSconfig() or updateWorkerNodeconfig()
// does not report API_SUCCESS.
Oam oam;
ProcessManager processManager(config, log);
ModuleConfig moduleconfig;
log.writeLog(__LINE__, "disableModule request for " + target, LOG_TYPE_DEBUG);
// The leading characters of the module name are used as its module type.
string moduleType = target.substr(0, MAX_MODULE_TYPE_SIZE);
// THREAD_LOCK is held for the status check and the AUTO->MAN upgrade only;
// every exit path from that section unlocks it before continuing or returning.
pthread_mutex_lock(&THREAD_LOCK);
int newState;
string SnewState;
if ( manualFlag )
{
newState = oam::MAN_DISABLED;
SnewState = oam::MANDISABLEDSTATE;
}
else
{
newState = oam::AUTO_DISABLED;
SnewState = oam::AUTODISABLEDSTATE;
}
// skip if module is already in the requested DISABLED state or in MAN_DISABLED state
try
{
// Default used if getModuleStatus() does not update the value.
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(target, opState, degraded);
if (opState == newState || opState == oam::MAN_DISABLED)
{
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
// if current state is AUTO_DISABLED and new state is MAN_DISABLED
// update state to MAN_DISABLED
if (opState == oam::AUTO_DISABLED && newState == oam::MAN_DISABLED)
{
//removemodule to get process in MAN_OFFLINE
stopModule(target, REMOVE, true);
try
{
oam.getSystemConfig(target, moduleconfig);
moduleconfig.DisableState = oam::MANDISABLEDSTATE;
try
{
oam.setSystemConfig(target, moduleconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on setSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on setSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
// Configuration errors above are only logged; the upgrade still reports
// success. This path returns without running the PMS / worker-node update
// and config distribution steps further down.
pthread_mutex_unlock(&THREAD_LOCK);
setModuleState(target, oam::MAN_DISABLED);
return API_SUCCESS;
}
}
catch (exception& ex)
{
// Status lookup failed: deliberately ignored, the disable sequence proceeds.
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + target + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + target + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
pthread_mutex_unlock(&THREAD_LOCK);
setModuleState(target, newState);
//set Columnstore.xml enable state
setEnableState( target, SnewState);
//sleep a bit to give time for the state change to apply
sleep(1);
//update PMS area if PM was disabled
if ( moduleType == "pm" )
{
if ( updatePMSconfig() != API_SUCCESS )
return API_FAILURE;
}
//Update DBRM section of Columnstore.xml
if ( updateWorkerNodeconfig() != API_SUCCESS )
{
return API_FAILURE;
}
//distribute config file
distributeConfigFile("system");
// Called on the local ProcessManager instance constructed at the top of this function.
processManager.reinitProcesses();
log.writeLog(__LINE__, "disableModule successfully complete for " + target, LOG_TYPE_DEBUG);
return API_SUCCESS;
}
void ProcessManager::reinitProcesses(std::string skipModule)
{
    // Re-initialise DBRMWorkerNode and WriteEngineServer, then restart ExeMgr,
    // DDLProc and DMLProc in that order with a pause after each restart.
    // skipModule is forwarded unchanged to restartProcessType().
    Oam oam;

    log.writeLog(__LINE__, "reinitProcesses... ", LOG_TYPE_DEBUG);

    reinitProcessType("DBRMWorkerNode");
    reinitProcessType("WriteEngineServer");

    // Process type to restart and the number of seconds to wait afterwards.
    static const char* const restartOrder[] = { "ExeMgr", "DDLProc", "DMLProc" };
    static const unsigned int pauseSeconds[] = { 1, 1, 3 };
    const unsigned int stepCount = sizeof(restartOrder) / sizeof(restartOrder[0]);

    for (unsigned int step = 0; step < stepCount; ++step)
    {
        restartProcessType(restartOrder[step], skipModule);
        sleep(pauseSeconds[step]);
    }

    log.writeLog(__LINE__, "reinitProcesses complete", LOG_TYPE_DEBUG);
}
/******************************************************************************************
* @brief recycleProcess
*
* purpose: recycle processes, done after disable/enable module
*
******************************************************************************************/
void ProcessManager::recycleProcess(string module, bool enableModule)
{
    // Stop every managed process type, then start them again in dependency
    // order with settle delays between some of the starts.
    //
    // module       : only used for the log message and the (unused) module type
    // enableModule : not read by this function
    Oam oam;
    ModuleConfig moduleconfig;

    log.writeLog(__LINE__, "recycleProcess request after module status update: " + module, LOG_TYPE_DEBUG);

    string moduleType = module.substr(0, MAX_MODULE_TYPE_SIZE);
    string PrimaryUMModuleName;

    try
    {
        oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
    }
    catch (...) {}

    // Shutdown order.
    static const char* const stopOrder[] =
    {
        "WriteEngineServer", "ExeMgr", "PrimProc", "DBRMControllerNode",
        "DBRMWorkerNode", "DDLProc", "DMLProc", "mysqld"
    };
    const unsigned int stopCount = sizeof(stopOrder) / sizeof(stopOrder[0]);

    for (unsigned int s = 0; s < stopCount; ++s)
        stopProcessType(stopOrder[s]);

    // Startup order, with the number of seconds to wait after each start
    // (0 means continue immediately).
    static const char* const startOrder[] =
    {
        "DBRMControllerNode", "DBRMWorkerNode", "PrimProc", "WriteEngineServer",
        "ExeMgr", "DDLProc", "DMLProc", "mysqld"
    };
    static const unsigned int waitAfterStart[] = { 0, 0, 5, 3, 0, 1, 0, 0 };
    const unsigned int startCount = sizeof(startOrder) / sizeof(startOrder[0]);

    for (unsigned int s = 0; s < startCount; ++s)
    {
        startProcessType(startOrder[s]);

        if ( waitAfterStart[s] != 0 )
            sleep(waitAfterStart[s]);
    }
}
/******************************************************************************************
* @brief enableModule
*
* purpose: Clear the Disable State on a specified module
*
******************************************************************************************/
int ProcessManager::enableModule(string target, int state, bool failover)
{
    // Clear the disabled state of a module and propagate the change.
    //
    // target   : module to enable
    // state    : module state to set once the enable flag has been written
    // failover : not read by this function
    //
    // Returns API_FAILURE when setEnableState(), updatePMSconfig() or
    // updateWorkerNodeconfig() does not report API_SUCCESS; API_SUCCESS otherwise.
    Oam oam;
    ModuleConfig moduleconfig;

    log.writeLog(__LINE__, "enableModule request for " + target, LOG_TYPE_DEBUG);

    const string moduleType = target.substr(0, MAX_MODULE_TYPE_SIZE);

    const int enableStatus = setEnableState( target, oam::ENABLEDSTATE);

    if ( enableStatus != API_SUCCESS )
        return API_FAILURE;

    setModuleState(target, state);

    //sleep a bit to give time for the state change to apply
    sleep(5);

    //update PMS area if PM was disabled
    const bool isPerformanceModule = ( moduleType == "pm" );

    if ( isPerformanceModule )
    {
        const int pmsStatus = updatePMSconfig();

        if ( pmsStatus != API_SUCCESS )
            return API_FAILURE;

        log.writeLog(__LINE__, "enableModule - Updated PM server Count", LOG_TYPE_DEBUG);
    }

    //Update DBRM section of Columnstore.xml
    const int workerStatus = updateWorkerNodeconfig();

    if ( workerStatus != API_SUCCESS )
        return API_FAILURE;

    //distribute config file
    distributeConfigFile("system");

    //check if new module should be hot-standby
    const string standbyCandidate = getStandbyModule();

    if ( standbyCandidate == target )
        setStandbyModule(standbyCandidate);

    log.writeLog(__LINE__, "enableModule request for " + target + " completed", LOG_TYPE_DEBUG);
    return API_SUCCESS;
}
/******************************************************************************************
* @brief startMgrProcesses
*
* purpose: start all Mgr Controlled processes for a module
*
******************************************************************************************/
void ProcessManager::startMgrProcesses(std::string moduleName)
{
// Request a start of every MGR_LAUNCH process configured for the given module
// that is currently in the INITIAL state. Requests are written to the
// "<moduleName>_ProcessMonitor" message queue; no reply is read here.
Oam oam;
SystemProcessConfig systemprocessconfig;
vector<ProcessConfig>::iterator itor;
ByteStream msg;
string modulePortName = moduleName + "_ProcessMonitor";
try
{
oam.getProcessConfig(systemprocessconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
// The leading characters of the module name are used as its module type.
string moduleType = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
// Repeat the scan until a pass ends with status == true (see note below).
while (true)
{
bool status = true;
for (itor = systemprocessconfig.processconfig.begin();
itor != systemprocessconfig.processconfig.end(); ++itor)
{
// NOTE(review): status is reset for every entry, so only the outcome of the
// last entry visited decides whether the outer loop repeats. If the intent
// was "repeat while any start request was sent", this reset looks
// unintended - confirm before changing, since it affects how long this
// function keeps looping.
status = true;
if ((*itor).BootLaunch == MGR_LAUNCH)
{
// Process applies to this module if its configured type matches the module
// type, is one of the child OAM module types, or is the parent OAM module
// type and this module is the OAM parent.
if ((*itor).ModuleType == moduleType
|| (*itor).ModuleType == "ChildExtOAMModule"
|| ( (*itor).ModuleType == "ChildOAMModule")
|| ((*itor).ModuleType == "ParentOAMModule" && moduleName == config.OAMParentName()) )
{
int state = oam::ACTIVE;
try
{
ProcessStatus procstat;
oam.getProcessStatus((*itor).ProcessName, moduleName, procstat);
state = procstat.ProcessOpState;
}
catch (exception& ex)
{
// Status unavailable: skip this process for the current pass.
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
continue;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
continue;
}
if ( state == oam::INITIAL )
{
msg = buildRequestMessage(START, FORCEFUL, (*itor).ProcessName);
log.writeLog(__LINE__, "Request Start of Process/Module: " + (*itor).ProcessName + " / " + moduleName, LOG_TYPE_DEBUG);
try
{
MessageQueueClient mqRequest(modulePortName);
mqRequest.write(msg);
mqRequest.shutdown();
// sleep(2);
// A request was sent; flag the pass so the scan may be repeated.
status = false;
}
catch (exception& ex)
{
// Send failures are ignored; status stays true for this entry.
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
}
} //end of for loop
if (status)
return;
} //end of while
}
/******************************************************************************************
* @brief stopProcess
*
* purpose: Stop a Process on the specified module
*
******************************************************************************************/
int ProcessManager::stopProcess(string moduleName, string processName,
                                messageqcpp::ByteStream::byte actionIndicator, bool manualFlag, int timeout)
{
    // Send a STOP request for processName to the Process Monitor of moduleName
    // and log the outcome. Returns the status reported by sendMsgProcMon().
    const ByteStream::byte requestID = STOP;
    ByteStream request = buildRequestMessage(requestID, actionIndicator, processName, manualFlag);

    const int returnStatus = sendMsgProcMon( moduleName, request, requestID, timeout );

    if ( returnStatus != API_SUCCESS )
    {
        //log the event
        log.writeLog(__LINE__, processName + " process failed to stop!!", LOG_TYPE_WARNING);
        return returnStatus;
    }

    //log the event
    log.writeLog(__LINE__, processName + " process is stopped by request.", LOG_TYPE_DEBUG);
    return returnStatus;
}
/******************************************************************************************
* @brief startProcess
*
* purpose: Start a Process on the specified module
*
******************************************************************************************/
int ProcessManager::startProcess(string moduleName, string processName,
                                 messageqcpp::ByteStream::byte actionIndicator)
{
    // Send a START request for processName to the Process Monitor of moduleName.
    //
    // Unless actionIndicator is oam::STATUS_UPDATE, the request is skipped (and
    // API_SUCCESS returned) when the module is MAN_DISABLED or AUTO_DISABLED.
    //
    // Returns API_SUCCESS for a skipped request, otherwise the status reported
    // by sendMsgProcMon().
    Oam oam;

    if ( actionIndicator != oam::STATUS_UPDATE )
    {
        //skip if module is DISABLED
        // Initialise to ACTIVE so that a failed status lookup is treated as
        // "not disabled" instead of reading an indeterminate value.
        int opState = oam::ACTIVE;
        bool degraded = false;

        try
        {
            oam.getModuleStatus(moduleName, opState, degraded);
        }
        catch (...)
        {
            // Status unavailable: keep the ACTIVE default and attempt the start.
        }

        //check if disabled
        if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
            return API_SUCCESS;
    }

    ByteStream msg;
    ByteStream::byte requestID = START;

    msg = buildRequestMessage(requestID, actionIndicator, processName);

    int returnStatus = sendMsgProcMon( moduleName, msg, requestID );

    if ( returnStatus == API_SUCCESS)
        //log the event
        log.writeLog(__LINE__, moduleName + "/" + processName + " process is started by request.", LOG_TYPE_DEBUG);
    else
        //log the event
        log.writeLog(__LINE__, moduleName + "/" + processName + " process failed to start!!", LOG_TYPE_WARNING);

    return returnStatus;
}
/******************************************************************************************
* @brief restartProcess
*
* purpose: Restart a Process on the specified module
*
******************************************************************************************/
int ProcessManager::restartProcess(string moduleName, string processName,
                                   messageqcpp::ByteStream::byte actionIndicator, bool manualFlag)
{
    // Send a RESTART request for processName to the Process Monitor of
    // moduleName, retrying up to five times with a 2 second pause after each
    // failed attempt.
    //
    // The request is skipped (and API_SUCCESS returned) when the module is
    // MAN_DISABLED or AUTO_DISABLED.
    //
    // Returns API_SUCCESS for a skipped or successful request, otherwise the
    // status of the last failed attempt.
    Oam oam;

    //skip if module is DISABLED
    // Initialise to ACTIVE so that a failed status lookup is treated as
    // "not disabled" instead of reading an indeterminate value.
    int opState = oam::ACTIVE;
    bool degraded = false;

    try
    {
        oam.getModuleStatus(moduleName, opState, degraded);
    }
    catch (...)
    {
        // Status unavailable: keep the ACTIVE default and attempt the restart.
    }

    //check if disabled
    if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
        return API_SUCCESS;

    ByteStream msg;
    ByteStream::byte requestID = RESTART;

    msg = buildRequestMessage(requestID, actionIndicator, processName, manualFlag);

    // Always overwritten by the first attempt; initialised so the variable
    // never holds an indeterminate value.
    int returnStatus = API_FAILURE;

    // need retry due to the depend process checks
    for ( int retry = 0 ; retry < 5 ; retry++)
    {
        returnStatus = sendMsgProcMon( moduleName, msg, requestID );

        if ( returnStatus == API_SUCCESS)
        {
            log.writeLog(__LINE__, processName + " process is restarted by request.", LOG_TYPE_DEBUG);
            return returnStatus;
        }
        else
            log.writeLog(__LINE__, processName + " process failed to restart, will retry!!", LOG_TYPE_WARNING);

        sleep(2);
    }

    return returnStatus;
}
/******************************************************************************************
* @brief reinitProcess
*
* purpose: Reinit a Process on the specified module
*
******************************************************************************************/
int ProcessManager::reinitProcess(string moduleName, string processName)
{
    // Send a forceful PROCREINITPROCESS request for processName to the Process
    // Monitor of moduleName (timeout argument 0).
    //
    // The request is skipped (and API_SUCCESS returned) when the module is
    // MAN_DISABLED or AUTO_DISABLED.
    //
    // Returns API_SUCCESS for a skipped request, otherwise the status reported
    // by sendMsgProcMon().
    Oam oam;

    //skip if module is DISABLED
    // Initialise to ACTIVE so that a failed status lookup is treated as
    // "not disabled" instead of reading an indeterminate value.
    int opState = oam::ACTIVE;
    bool degraded = false;

    try
    {
        oam.getModuleStatus(moduleName, opState, degraded);
    }
    catch (...)
    {
        // Status unavailable: keep the ACTIVE default and attempt the reinit.
    }

    //check if disabled
    if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
        return API_SUCCESS;

    ByteStream msg;
    ByteStream::byte requestID = PROCREINITPROCESS;
    ByteStream::byte actionIndicator = FORCEFUL;

    msg = buildRequestMessage(requestID, actionIndicator, processName);

    int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 0 );

    if ( returnStatus == API_SUCCESS)
        //log the event
        log.writeLog(__LINE__, processName + " process is reinited by request.", LOG_TYPE_DEBUG);
    else
        //log the event
        log.writeLog(__LINE__, processName + " process failed to reinit!!", LOG_TYPE_WARNING);

    return returnStatus;
}
/******************************************************************************************
* @brief setSystemState
*
* purpose: set System State and process required alarms
*
******************************************************************************************/
void ProcessManager::setSystemState(uint16_t state)
{
ProcessLog log;
Oam oam;
ALARMManager aManager;
Configuration config;
ProcessManager processManager(config, log);
log.writeLog(__LINE__, "Set System State = " + oamState[state], LOG_TYPE_DEBUG);
pthread_mutex_lock(&STATUS_LOCK);
try
{
oam.setSystemStatus(state);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: Caught unknown exception!", LOG_TYPE_ERROR);
}
// Process Alarms
string system = "System";
if( state == oam::ACTIVE ) {
//set query system states ready
processManager.setQuerySystemState(true);
//clear alarms if set
aManager.sendAlarmReport(system.c_str(), SYSTEM_DOWN_AUTO, CLEAR);
aManager.sendAlarmReport(system.c_str(), SYSTEM_DOWN_MANUAL, CLEAR);
}
else
{
if ( state == oam::MAN_OFFLINE )
aManager.sendAlarmReport(system.c_str(), SYSTEM_DOWN_MANUAL, SET);
else if ( state == oam::AUTO_OFFLINE )
aManager.sendAlarmReport(system.c_str(), SYSTEM_DOWN_AUTO, SET);
aManager.sendAlarmReport(system.c_str(), CONN_FAILURE, CLEAR);
}
pthread_mutex_unlock(&STATUS_LOCK);
return;
}
/******************************************************************************************
* @brief setModuleState
*
* purpose: set Module State of a specific module
*
******************************************************************************************/
void ProcessManager::setModuleState(string moduleName, uint16_t state)
{
    // Stores 'state' as the status of module 'moduleName' through Oam while holding
    // STATUS_LOCK. Errors raised by Oam are swallowed.
    ProcessLog log;
    Oam oam;

    const string logLine = "Set Module " + moduleName + " State = " + oam.itoa(state);
    log.writeLog(__LINE__, logLine, LOG_TYPE_DEBUG);

    pthread_mutex_lock(&STATUS_LOCK);

    try
    {
        oam.setModuleStatus(moduleName, state);
    }
    catch (...)
    {
        // deliberately ignored: the status update is best effort
    }

    pthread_mutex_unlock(&STATUS_LOCK);
}
/******************************************************************************************
* @brief setExtdeviceState
*
* purpose: set the state of a specific external device (e.g. a switch)
*
******************************************************************************************/
void ProcessManager::setExtdeviceState(string extDeviceName, uint16_t state)
{
    // Stores 'state' as the status of external device 'extDeviceName' through Oam while
    // holding STATUS_LOCK. Errors raised by Oam are swallowed.
    ProcessLog log;
    Oam oam;

    const string logLine = "Set Ext Device " + extDeviceName + " State = " + oam.itoa(state);
    log.writeLog(__LINE__, logLine, LOG_TYPE_DEBUG);

    pthread_mutex_lock(&STATUS_LOCK);

    try
    {
        oam.setExtDeviceStatus(extDeviceName, state);
    }
    catch (...)
    {
        // deliberately ignored: the status update is best effort
    }

    pthread_mutex_unlock(&STATUS_LOCK);
}
/******************************************************************************************
* @brief setNICState
*
* purpose: set NIC State of a specific host's NIC
*
******************************************************************************************/
void ProcessManager::setNICState(string hostName, uint16_t state)
{
    // Stores 'state' as the NIC status for 'hostName' through Oam while holding
    // STATUS_LOCK. Errors raised by Oam are swallowed.
    ProcessLog log;
    Oam oam;

    const string logLine = "Set NIC " + hostName + " State = " + oam.itoa(state);
    log.writeLog(__LINE__, logLine, LOG_TYPE_DEBUG);

    pthread_mutex_lock(&STATUS_LOCK);

    try
    {
        oam.setNICStatus(hostName, state);
    }
    catch (...)
    {
        // deliberately ignored: the status update is best effort
    }

    pthread_mutex_unlock(&STATUS_LOCK);
}
/******************************************************************************************
* @brief setProcessState
*
* purpose: set Process State of a specific Process
*
******************************************************************************************/
int ProcessManager::setProcessState(string moduleName, string processName, uint16_t state, pid_t PID)
{
    // Stores 'state' and 'PID' as the status of 'processName' on 'moduleName' through Oam.
    // Returns oam::API_SUCCESS, or oam::API_FAILURE when Oam throws.
    ProcessLog log;
    Oam oam;

    const string logLine = "StatusUpdate of Process " + processName + " State = " + oam.itoa(state);
    log.writeLog(__LINE__, logLine, LOG_TYPE_DEBUG);

    int result = oam::API_SUCCESS;

    try
    {
        oam.setProcessStatus(processName, moduleName, state, PID);
    }
    catch (...)
    {
        // any exception, std or otherwise, is reported to the caller as a plain failure
        result = oam::API_FAILURE;
    }

    return result;
}
/******************************************************************************************
* @brief setProcessStates
*
* purpose: set all processes running on a module to requested state
*
******************************************************************************************/
void ProcessManager::setProcessStates(std::string moduleName, uint16_t state, std::string processNameSkip )
{
ProcessLog log;
Oam oam;
log.writeLog(__LINE__, "Set All NON-MAN_OFFLINE Process for module " + moduleName + " = " + oam.itoa(state), LOG_TYPE_DEBUG);
SystemProcessConfig systemprocessconfig;
vector<ProcessConfig>::iterator itor;
//PMwithUM config
string PMwithUM = "n";
try
{
oam.getSystemConfig( "PMwithUM", PMwithUM);
}
catch (...)
{
PMwithUM = "n";
}
string moduleType = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
try
{
oam.getProcessConfig(systemprocessconfig);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
string moduleTypeSet = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
for (itor = systemprocessconfig.processconfig.begin();
itor != systemprocessconfig.processconfig.end(); ++itor)
{
if ( (*itor).ModuleType == moduleType
|| (*itor).ModuleType == "ChildExtOAMModule"
|| ( (*itor).ModuleType == "ChildOAMModule" )
|| ((*itor).ModuleType == "ParentOAMModule") )
{
if ( (*itor).ProcessName == processNameSkip )
continue;
ProcessStatus processstatus;
try
{
oam.getProcessStatus((*itor).ProcessName, moduleName, processstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
if (processstatus.ProcessOpState != oam::MAN_OFFLINE)
{
setProcessState(moduleName, (*itor).ProcessName, state, 0);
if ( (*itor).ProcessName == "ExeMgr" || state == oam::AUTO_OFFLINE )
setProcessState(moduleName, "mysqld", state, 0);
}
}
else
{
//for for umwithpm apps, which is ExeMgr now
if ( moduleTypeSet == "pm" && PMwithUM == "y" )
{
ProcessStatus processstatus;
try
{
oam.getProcessStatus("ExeMgr", moduleName, processstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
if (processstatus.ProcessOpState != oam::MAN_OFFLINE)
{
setProcessState(moduleName, "ExeMgr", state, 0);
if ( state == oam::AUTO_OFFLINE )
setProcessState(moduleName, "mysqld", state, 0);
}
}
}
}
}
/******************************************************************************************
* @brief updateLog
*
* purpose: updatelog on a specific module
*
******************************************************************************************/
int ProcessManager::updateLog(std::string action, std::string moduleName, std::string level)
{
ByteStream msg;
ByteStream::byte requestID = PROCUPDATELOG;
msg << requestID;
msg << action;
msg << level;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
if ( returnStatus == API_SUCCESS)
{
//log the success event
log.writeLog(__LINE__, moduleName + " updateLog by request.", LOG_TYPE_DEBUG);
}
else
{
//log the error event
log.writeLog(__LINE__, moduleName + " updateLog failed!!", LOG_TYPE_WARNING);
}
return returnStatus;
}
/******************************************************************************************
* @brief getConfigLog
*
* purpose: get Log Configuration on a specific module
*
******************************************************************************************/
int ProcessManager::getConfigLog(std::string moduleName)
{
ByteStream msg;
ByteStream::byte requestID = PROCGETCONFIGLOG;
msg << requestID;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
return returnStatus;
}
/******************************************************************************************
* @brief updateConfig
*
* purpose: Send Msg to Process-Monitor to re-read updated Configuration data
*
******************************************************************************************/
int ProcessManager::updateConfig(std::string moduleName)
{
ByteStream msg;
ByteStream::byte requestID = PROCUPDATECONFIG;
msg << requestID;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
return returnStatus;
}
/******************************************************************************************
* @brief buildSystemTables
*
* purpose: Send a Message to 'pm1' to check and build System Table
*
******************************************************************************************/
int ProcessManager::buildSystemTables(string target)
{
    // Sends a PROCBUILDSYSTEMTABLES request to the Process-Monitor of module 'target'
    // and returns the status reported by sendMsgProcMon().
    const ByteStream::byte requestID = PROCBUILDSYSTEMTABLES;

    ByteStream request;
    request << requestID;

    return sendMsgProcMon( target, request, requestID );
}
/******************************************************************************************
* @brief updateFstab
*
* purpose: send Fstab Update to a specific module
*
******************************************************************************************/
int ProcessManager::updateFstab(std::string moduleName, std::string entry)
{
ByteStream msg;
ByteStream::byte requestID = PROCFSTABUPDATE;
msg << requestID;
msg << entry;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
return returnStatus;
}
/******************************************************************************************
* @brief stopProcessType
*
* purpose: Stops a type of process within the system
*
******************************************************************************************/
int ProcessManager::stopProcessType( std::string processName, bool manualFlag )
{
    // Requests a GRACEFUL stop of every ACTIVE instance of 'processName' found in the
    // system process status list. Returns API_FAILURE if anything throws while reading
    // the status list or issuing the stops, API_SUCCESS otherwise; the status of the
    // individual stopProcess() calls is not inspected.
    ProcessLog log;
    Configuration config;
    ProcessManager processManager(config, log);
    Oam oam;
    SystemProcessStatus systemprocessstatus;
    ProcessStatus processstatus;

    log.writeLog(__LINE__, "stopProcessType: Stop all " + processName, LOG_TYPE_DEBUG);

    int result = API_SUCCESS;

    try
    {
        oam.getProcessStatus(systemprocessstatus);

        for ( unsigned int idx = 0 ; idx < systemprocessstatus.processstatus.size(); ++idx )
        {
            const bool sameProcess = ( systemprocessstatus.processstatus[idx].ProcessName == processName );
            const bool isActive = ( systemprocessstatus.processstatus[idx].ProcessOpState == oam::ACTIVE );

            // only ACTIVE instances of the requested process are stopped
            if ( !sameProcess || !isActive )
                continue;

            // found one, request stop of it
            processManager.stopProcess(systemprocessstatus.processstatus[idx].Module,
                                       processName,
                                       GRACEFUL,
                                       manualFlag, 0);
        }
    }
    catch (...)
    {
        // std and non-std exceptions alike are reported as a plain failure
        result = API_FAILURE;
    }

    return result;
}
/******************************************************************************************
* @brief startProcessType
*
* purpose: Starts a type of process within the system
*
******************************************************************************************/
int ProcessManager::startProcessType( std::string processName )
{
    // Requests a FORCEFUL start of every instance of 'processName' found in the system
    // process status list, whatever its current state. Returns API_FAILURE if anything
    // throws, API_SUCCESS otherwise; individual start results are only logged.
    ProcessLog log;
    Configuration config;
    ProcessManager processManager(config, log);
    Oam oam;
    SystemProcessStatus systemprocessstatus;
    ProcessStatus processstatus;

    log.writeLog(__LINE__, "StartProcessType: Start all " + processName, LOG_TYPE_DEBUG);

    int result = API_SUCCESS;

    try
    {
        oam.getProcessStatus(systemprocessstatus);

        for ( unsigned int idx = 0 ; idx < systemprocessstatus.processstatus.size(); ++idx )
        {
            if ( systemprocessstatus.processstatus[idx].ProcessName != processName )
                continue;

            // found one, request start of it
            const int startStatus = processManager.startProcess(systemprocessstatus.processstatus[idx].Module,
                                    processName,
                                    FORCEFUL);

            log.writeLog(__LINE__, "StartProcessType: Start ACK received from Process-Monitor, return status = " + oam.itoa(startStatus), LOG_TYPE_DEBUG);
        }
    }
    catch (...)
    {
        // std and non-std exceptions alike are reported as a plain failure
        result = API_FAILURE;
    }

    return result;
}
/******************************************************************************************
* @brief restartProcessType
*
* purpose: Restarts ACTIVE type of process within the system
*
******************************************************************************************/
int ProcessManager::restartProcessType( std::string processName, std::string skipModule, bool manualFlag )
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
SystemProcessStatus systemprocessstatus;
ProcessStatus processstatus;
int retStatus = API_SUCCESS;
log.writeLog(__LINE__, "restartProcessType: Restart all " + processName, LOG_TYPE_DEBUG);
//PMwithUM config
string PMwithUM = "n";
try
{
oam.getSystemConfig( "PMwithUM", PMwithUM);
}
catch (...)
{
PMwithUM = "n";
}
// If mysqld is the processName, then send to modules were ExeMgr is running
try
{
oam.getProcessStatus(systemprocessstatus);
for ( unsigned int i = 0 ; i < systemprocessstatus.processstatus.size(); i++)
{
//check for skipModule
if ( systemprocessstatus.processstatus[i].Module == skipModule )
continue;
if ( processName == "mysqld" )
{
if ( systemprocessstatus.processstatus[i].ProcessName == "ExeMgr")
{
ProcessStatus procstat;
oam.getProcessStatus("mysqld", systemprocessstatus.processstatus[i].Module, procstat);
int state = procstat.ProcessOpState;
if ( state == ACTIVE )
{
retStatus = processManager.restartProcess(systemprocessstatus.processstatus[i].Module,
processName,
FORCEFUL,
true);
log.writeLog(__LINE__, "restartProcessType: Start ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
}
}
}
else
{
if ( systemprocessstatus.processstatus[i].ProcessName == processName )
{
//skip if in a BUSY_INIT state
// if ( systemprocessstatus.processstatus[i].ProcessOpState == oam::BUSY_INIT ||
// systemprocessstatus.processstatus[i].ProcessOpState == oam::MAN_OFFLINE ||
// systemprocessstatus.processstatus[i].ProcessOpState == oam::AUTO_OFFLINE ||
// systemprocessstatus.processstatus[i].ProcessOpState == oam::AUTO_INIT ||
// systemprocessstatus.processstatus[i].ProcessOpState == oam::MAN_INIT ||
// ( systemprocessstatus.processstatus[i].ProcessOpState == oam::COLD_STANDBY && !manualFlag ) )
// continue;
if ( systemprocessstatus.processstatus[i].ProcessOpState != oam::ACTIVE )
continue;
if ( (processName.find("DDLProc") == 0 || processName.find("DMLProc") == 0) )
{
string procModuleType = systemprocessstatus.processstatus[i].Module.substr(0, MAX_MODULE_TYPE_SIZE);
if ( procModuleType == "pm" && PMwithUM == "y" )
continue;
try
{
oam.setSystemConfig("PrimaryUMModuleName", systemprocessstatus.processstatus[i].Module);
processManager.setPMProcIPs(systemprocessstatus.processstatus[i].Module);
//distribute config file
processManager.distributeConfigFile("system");
sleep(1);
}
catch (...) {}
}
// found one, request restart of it
retStatus = processManager.restartProcess(systemprocessstatus.processstatus[i].Module,
processName,
FORCEFUL,
true);
log.writeLog(__LINE__, "restartProcessType: Start ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
// if DDL or DMLProc, change IP Address
if ( retStatus == oam::API_SUCCESS )
{
// sleep(5);
ProcessStatus procstat;
oam.getProcessStatus(processName, systemprocessstatus.processstatus[i].Module, procstat);
if ( (processName.find("DDLProc") == 0 || processName.find("DMLProc") == 0) )
{
processManager.setPMProcIPs(systemprocessstatus.processstatus[i].Module, processName);
break;
}
}
}
}
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
return API_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
return API_FAILURE;
}
return retStatus;
}
/******************************************************************************************
* @brief reinitProcessType
*
* purpose: Reinit ACTIVE type of process within the system
*
******************************************************************************************/
int ProcessManager::reinitProcessType( std::string processName )
{
    // Requests a reinit of 'processName' across the system process status list.
    //  - "cpimport": a reinit is sent to every module that has a ServerMonitor entry,
    //    regardless of that entry's state.
    //  - anything else: a reinit is sent for each ACTIVE entry of that process.
    // Returns the status of the last reinit attempted (API_SUCCESS if none), or
    // API_FAILURE if anything throws.
    ProcessLog log;
    Configuration config;
    ProcessManager processManager(config, log);
    Oam oam;
    SystemProcessStatus systemprocessstatus;
    ProcessStatus processstatus;
    int retStatus = API_SUCCESS;

    log.writeLog(__LINE__, "reinitProcessType: ReInit all " + processName, LOG_TYPE_DEBUG);

    try
    {
        oam.getProcessStatus(systemprocessstatus);

        // re-init cpimport on all nodes
        const bool cpimportRequest = ( processName == "cpimport" );

        for ( unsigned int idx = 0 ; idx < systemprocessstatus.processstatus.size(); ++idx )
        {
            if ( cpimportRequest )
            {
                if ( systemprocessstatus.processstatus[idx].ProcessName != "ServerMonitor" )
                    continue;

                log.writeLog(__LINE__, "reinitProcessType: cpimport" + systemprocessstatus.processstatus[idx].Module, LOG_TYPE_DEBUG);
            }
            else if ( systemprocessstatus.processstatus[idx].ProcessName != processName ||
                      systemprocessstatus.processstatus[idx].ProcessOpState != oam::ACTIVE )
            {
                continue;
            }

            // found one, request reinit of it (processName is "cpimport" in the cpimport case)
            retStatus = processManager.reinitProcess(systemprocessstatus.processstatus[idx].Module,
                        processName);

            log.writeLog(__LINE__, "reinitProcessType: ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
        }
    }
    catch (...)
    {
        // std and non-std exceptions alike are reported as a plain failure
        return API_FAILURE;
    }

    return retStatus;
}
/******************************************************************************************
* @brief addModule
*
* purpose: Add Module to system configuration
*
******************************************************************************************/
int ProcessManager::addModule(oam::DeviceNetworkList devicenetworklist, std::string password, bool storeHostnames,
bool manualFlag)
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
SystemModuleTypeConfig systemmoduletypeconfig;
ModuleTypeConfig moduletypeconfig;
ModuleTypeConfig setmoduletypeconfig;
DeviceNetworkConfig devicenetworkconfig;
Oam oam;
string Section;
pthread_mutex_lock(&THREAD_LOCK);
int AddModuleCount = devicenetworklist.size();
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
string moduleType = (*listPT).DeviceName.substr(0, MAX_MODULE_TYPE_SIZE);
//
//Check hostname and IP Address for availibility
//
try
{
oam.getSystemConfig(systemmoduletypeconfig);
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType.empty() )
// end of list
break;
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
string moduletype = systemmoduletypeconfig.moduletypeconfig[i].ModuleType;
if ( moduleCount > 0 )
{
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
{
string hostname = (*pt1).HostName;
if ( hostname == oam::UnassignedName )
continue;
string ipAddr = (*pt1).IPAddr;
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
string newHostName = (*pt1).HostName;
string newIPAddr = (*pt1).IPAddr;
if ( newIPAddr == ipAddr || newHostName == hostname )
{
log.writeLog(__LINE__, "addModule - ERROR: hostName or IP address already in-use: " + newIPAddr + "/" + newHostName, LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_INVALID_PARAMETER;
}
}
}
}
}
}
}
catch (exception& e)
{
log.writeLog(__LINE__, "addModule - ERROR: getSystemConfig", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
string calpontPackage;
string systemID;
string packageType = "rpm";
try
{
oam.getSystemConfig("EEPackageType", packageType);
}
catch (...)
{
log.writeLog(__LINE__, "addModule - ERROR: get EEPackageType", LOG_TYPE_ERROR);
}
//
// check for RPM package
//
SystemSoftware systemsoftware;
try
{
oam.getSystemSoftware(systemsoftware);
}
catch (exception& e)
{
log.writeLog(__LINE__, "addModule - ERROR: getSystemSoftware", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
string homedir = "/root";
if (!rootUser)
{
char* p = getenv("HOME");
if (p && *p)
homedir = p;
}
//clear out the known_host file, sometimes causes a failure on amazon during addModule
if ( amazon )
{
string cmd = "unlink " + homedir + ".ssh/know_hosts > /dev/null 2>&1";
system(cmd.c_str());
}
if ( packageType == "rpm")
calpontPackage = homedir + "/mariadb-columnstore*" + columnstore_version + "-" + columnstore_release + "*.rpm";
else if ( packageType == "deb")
calpontPackage = homedir + "/mariadb-columnstore*" + columnstore_version + "-" + columnstore_release + "*.deb";
else
calpontPackage = homedir + "/mariadb-columnstore*" + columnstore_version + "-" + columnstore_release + "*.bin.tar.gz";
//
//Get System Configuration file
//
try
{
oam.getSystemConfig(moduleType, moduletypeconfig);
}
catch (...)
{
log.writeLog(__LINE__, "addModule - ERROR: getSystemConfig", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
setmoduletypeconfig = moduletypeconfig;
// update Module Type Count
int oldModuleCount = moduletypeconfig.ModuleCount;
int newModuleCount = oldModuleCount + AddModuleCount;
setmoduletypeconfig.ModuleCount = newModuleCount;
//add new IP Addresses and Hostnames
listPT = devicenetworklist.begin();
HostConfig hostconfig;
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
devicenetworkconfig.DeviceName = (*listPT).DeviceName;
devicenetworkconfig.DisableState = oam::MANDISABLEDSTATE;
HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
for ( ; pt1 != (*listPT).hostConfigList.end() ; pt1++)
{
string hostName = (*pt1).HostName;
string IPAddr = (*pt1).IPAddr;
//if cloud and unassigned, launch a new Instance
if ( ( cloud == "amazon-ec2" && hostName == oam::UnassignedName ) ||
( cloud == "amazon-vpc" && hostName == oam::UnassignedName ) )
{
string UMinstanceType;
string UMSecurityGroup;
if ( moduleType == "um")
{
try
{
oam.getSystemConfig("UMInstanceType", UMinstanceType);
oam.getSystemConfig("UMSecurityGroup", UMSecurityGroup);
}
catch (...) {}
}
log.writeLog(__LINE__, "addModule - Launching a new Instance for: " + moduleName, LOG_TYPE_DEBUG);
if ( moduleType == "um" )
hostName = oam.launchEC2Instance(moduleName, IPAddr, UMinstanceType, UMSecurityGroup);
else
hostName = oam.launchEC2Instance(moduleName, IPAddr);
if ( hostName == "failed" )
{
log.writeLog(__LINE__, "addModule - Launch New Instance Failure", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
// add instance tag
string systemName;
string AmazonAutoTagging;
{
try
{
oam.getSystemConfig("SystemName", systemName);
oam.getSystemConfig("AmazonAutoTagging", AmazonAutoTagging);
}
catch (...) {}
}
if ( AmazonAutoTagging == "y" )
{
string tagValue = systemName + "-" + moduleName;
oam.createEC2tag( hostName, "Name", tagValue );
}
//wait until login is success until continuing or fail if can't login
log.writeLog(__LINE__, "addModule - Successfully Launch of new Instance, perform login test: " + moduleName, LOG_TYPE_DEBUG);
int retry = 0;
for ( ; retry < 30 ; retry++)
{
IPAddr = oam.getEC2InstanceIpAddress(hostName);
if (IPAddr == "terminated")
{
log.writeLog(__LINE__, "addModule - Failed to log in to Instance, it was terminated: " + hostName, LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
if (IPAddr == "stopped")
{
sleep(5);
continue;
}
string loginTmp = tmpLogDir + "/login_test.log";
string cmd = "remote_command.sh " + IPAddr + " " + password + " 'ls' 1 > " + loginTmp;
system(cmd.c_str());
if (!oam.checkLogStatus(loginTmp, "README")) {
//check for RSA KEY ISSUE and fix
if (oam.checkLogStatus(loginTmp, "Host key verification failed"))
{
log.writeLog(__LINE__, "addModule - login failed, Host key verification failed, try fixing: " + moduleName, LOG_TYPE_DEBUG);
cmd = "rm -f " + homedir + "/.ssh/known_hosts";
system(cmd.c_str());
}
log.writeLog(__LINE__, "addModule - login failed, retry login test: " + moduleName, LOG_TYPE_DEBUG);
sleep(5);
continue;
}
// logged in
break;
}
if ( retry >= 30 )
{
log.writeLog(__LINE__, "addModule - Failed to log in to Instance: " + hostName, LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
log.writeLog(__LINE__, "addModule - Successful loggin: " + hostName, LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "addModule - Launched new Instance: " + hostName + "/" + IPAddr, LOG_TYPE_DEBUG);
(*pt1).HostName = hostName;
(*pt1).IPAddr = IPAddr;
//check if any volumes need to be attached
if ( moduleType == "um" )
{
string UMStorageType = "internal";
{
try
{
oam.getSystemConfig("UMStorageType", UMStorageType);
}
catch (...) {}
}
if ( UMStorageType == "external" )
{
//check if volume already assigned or need to create a new one
int moduleID = atoi((*listPT).DeviceName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
string volumeNameID = "UMVolumeName" + oam.itoa(moduleID);
string volumeName = oam::UnassignedName;
string deviceNameID = "UMVolumeDeviceName" + oam.itoa(moduleID);
string deviceName = oam::UnassignedName;
try
{
oam.getSystemConfig( volumeNameID, volumeName);
oam.getSystemConfig( deviceNameID, deviceName);
}
catch (...)
{}
if ( volumeName.empty() || volumeName == oam::UnassignedName )
{
// need to create a new one
string device;
try
{
oam.addUMdisk(moduleID, volumeName, device);
}
catch (...)
{
log.writeLog(__LINE__, "addModule: volume create failed for um: " + moduleName, LOG_TYPE_CRITICAL);
pthread_mutex_unlock(&THREAD_LOCK);
}
//attach to UM
log.writeLog(__LINE__, "addModule - attach new Volume to " + moduleName, LOG_TYPE_DEBUG);
if (!oam.attachEC2Volume(volumeName, device, hostName))
{
log.writeLog(__LINE__, "addModule: volume failed to attach to um: " + moduleName, LOG_TYPE_CRITICAL);
pthread_mutex_unlock(&THREAD_LOCK);
}
try
{
Config* sysConfig = Config::makeConfig();
sysConfig->setConfig("Installation", volumeNameID, volumeName);
sysConfig->setConfig("Installation", deviceNameID, device);
sysConfig->write();
}
catch (...)
{}
log.writeLog(__LINE__, "addModule - create/attach new volume: " + volumeName + "/" + device, LOG_TYPE_DEBUG);
}
else
{
// one exist, detach and reattach it
oam.detachEC2Volume( volumeName );
if (!oam.attachEC2Volume(volumeName, deviceName, hostName))
{
log.writeLog(__LINE__, "addModule: volume failed to attached: " + volumeName, LOG_TYPE_CRITICAL);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
log.writeLog(__LINE__, "addModule - attach existing volume: " + volumeName + "/" + deviceName, LOG_TYPE_DEBUG);
}
}
}
}
hostconfig.HostName = hostName;
if (storeHostnames)
hostconfig.IPAddr = hostName;
else
hostconfig.IPAddr = IPAddr;
hostconfig.NicID = (*pt1).NicID;
devicenetworkconfig.hostConfigList.push_back(hostconfig);
}
setmoduletypeconfig.ModuleNetworkList.push_back(devicenetworkconfig);
}
Config* sysConfig = Config::makeConfig();
//Add additional Process Ports
// all nodes: ProcessMonitor, ServerMonitor
// dm: NONE
// um: ExeMgr
// pm: NONE
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
Section = (*listPT).DeviceName + "_ProcessMonitor";
HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
sysConfig->setConfig(Section, "IPAddr", (*pt1).IPAddr);
sysConfig->setConfig(Section, "Port", "8800");
Section = (*listPT).DeviceName + "_ServerMonitor";
sysConfig->setConfig(Section, "IPAddr", (*pt1).IPAddr);
sysConfig->setConfig(Section, "Port", "8622");
}
if ( moduleType == "um" ||
( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM ) ||
( moduleType == "pm" && PMwithUM == "y") )
{
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
int moduleID = atoi((*listPT).DeviceName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
int exemgrID = moduleID;
if ( PMwithUM == "y" )
{
// then go check for next available ID
exemgrID = 0;
for ( int id = 2 ; ; id++ )
{
string Section = "ExeMgr" + oam.itoa(id);
string moduleName;
try
{
Config* sysConfig = Config::makeConfig();
moduleName = sysConfig->getConfig(Section, "Module");
}
catch (...) {}
if ( moduleName.empty() )
{
exemgrID = id;
break;
}
}
}
Section = "ExeMgr" + oam.itoa(exemgrID);
HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
sysConfig->setConfig(Section, "IPAddr", (*pt1).IPAddr);
sysConfig->setConfig(Section, "Port", "8601");
sysConfig->setConfig(Section, "Module", (*listPT).DeviceName);
}
}
if ( moduleType == "pm" )
{
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
Section = (*listPT).DeviceName + "_WriteEngineServer";
HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
sysConfig->setConfig(Section, "IPAddr", (*pt1).IPAddr);
sysConfig->setConfig(Section, "Port", "8630");
}
}
log.writeLog(__LINE__, "addModule - Updated Process Ports", LOG_TYPE_DEBUG);
string parentOAMModuleHostName;
string parentOAMModuleIPAddr;
//setup dbroot entries
if (moduleType == "pm" && manualFlag)
{
const string MODULE_DBROOTID = "ModuleDBRootID";
const string MODULE_DBROOT_COUNT = "ModuleDBRootCount";
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleID = (*listPT).DeviceName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE);
string ModuleDBRootCount = MODULE_DBROOT_COUNT + moduleID + "-3";
sysConfig->setConfig("SystemModuleConfig", ModuleDBRootCount, "0");
string ModuleDBrootID = MODULE_DBROOTID + moduleID + "-1-3";
sysConfig->setConfig("SystemModuleConfig", ModuleDBrootID, oam::UnassignedName);
}
}
//update Calpont Config table
try
{
sysConfig->write();
}
catch (...)
{
log.writeLog(__LINE__, "addModule - ERROR: sysConfig->write", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
//write Columnstore.xml Module section
try
{
oam.setSystemConfig(moduleType, setmoduletypeconfig);
log.writeLog(__LINE__, "addModule - Updated Module Section of Config file", LOG_TYPE_DEBUG);
}
catch (...)
{
log.writeLog(__LINE__, "addModule - ERROR: setSystemConfig", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
pthread_mutex_unlock(&THREAD_LOCK);
//check if any added modules are Active OAM
bool activeOAM = false;
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
if ( (*listPT).DeviceName == config.OAMParentName() )
{
activeOAM = true;
break;
}
}
//
//send message to Process Monitor to add module/processes to shared memory
//
if ( !activeOAM )
{
try
{
ByteStream obs;
obs << (ByteStream::byte) ADD_MODULE;
obs << (ByteStream::byte) AddModuleCount;
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
obs << (*listPT).DeviceName;
}
//pass NIC Hostnames
vector<string> nicHostNames;
listPT = devicenetworklist.begin();
HostConfig hostconfig;
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
for ( ; pt1 != (*listPT).hostConfigList.end() ; pt1++)
{
nicHostNames.push_back((*pt1).HostName);
}
}
obs << (ByteStream::byte) nicHostNames.size();
vector<string>::iterator pt2 = nicHostNames.begin();
for ( ; pt2 != nicHostNames.end() ; pt2++)
{
obs << *pt2;
}
sendStatusUpdate(obs, ADD_MODULE);
log.writeLog(__LINE__, "addModule - Updated Shared Memory", LOG_TYPE_DEBUG);
}
catch (...)
{
log.writeLog(__LINE__, "addModule - ERROR: sendStatusUpdate error", LOG_TYPE_ERROR);
return API_FAILURE;
}
}
//distribute config file
distributeConfigFile("system");
string cmd = "rm -f " + homedir + "/.ssh/known_hosts > /dev/null 2>&1";
system(cmd.c_str());
listPT = devicenetworklist.begin();
//distribute config file
distributeConfigFile("system");
distributeConfigFile("system", "ProcessConfig.xml");
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string remoteModuleName = (*listPT).DeviceName;
string remoteModuleType = remoteModuleName.substr(0, MAX_MODULE_TYPE_SIZE);
HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
string remoteModuleIP = (*pt1).IPAddr;
string remoteHostName = (*pt1).HostName;
string dir = "/var/lib/columnstore/local/etc" + remoteModuleName;
cmd = "mkdir " + dir + " > /dev/null 2>&1";
system(cmd.c_str());
if ( remoteModuleType == "um" )
{
cmd = "cp /var/lib/columnstore/local/etc/um1/* " + dir + "/.";
system(cmd.c_str());
}
else if ( remoteModuleType == "pm" )
{
cmd = "cp /var/lib/columnstore/local/etc/pm1/* " + dir + "/.";
system(cmd.c_str());
}
log.writeLog(__LINE__, "addModule - created directory and custom OS files for " + remoteModuleName, LOG_TYPE_DEBUG);
//create module file
if ( !createModuleFile(remoteModuleName) )
{
log.writeLog(__LINE__, "addModule - ERROR: createModuleFile failed", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
log.writeLog(__LINE__, "addModule - create module file for " + remoteModuleName, LOG_TYPE_DEBUG);
if ( remoteModuleType == "pm" )
{
//setup Standby OAM Parent, if needed
if ( config.OAMStandbyName() == oam::UnassignedName )
setStandbyModule(remoteModuleName, false);
}
string logFile = tmpLogDir + "/" + remoteModuleName + "_mcs_module_installer.log";
log.writeLog(__LINE__, "addModule - mcs_module_installer run for " + remoteModuleName, LOG_TYPE_DEBUG);
cmd = "mcs_module_installer.sh " + remoteModuleName + " " + remoteModuleIP + " " + password + " 1 >" + logFile;
log.writeLog(__LINE__, "addModule cmd: " + cmd, LOG_TYPE_DEBUG);
int rtnCode = system(cmd.c_str());
if (WEXITSTATUS(rtnCode) != 0)
{
log.writeLog(__LINE__, "addModule - ERROR: " + logFile + " failed, retry", LOG_TYPE_DEBUG);
DeviceNetworkList devicenetworklistR;
DeviceNetworkConfig devicenetworkconfigR;
HostConfig hostconfig;
devicenetworkconfigR.DeviceName = remoteModuleName;
hostconfig.IPAddr = oam::UnassignedName;
hostconfig.HostName = oam::UnassignedName;
hostconfig.NicID = 1;
devicenetworkconfigR.hostConfigList.push_back(hostconfig);
devicenetworklistR.push_back(devicenetworkconfigR);
processManager.removeModule(devicenetworklistR, false);
log.writeLog(__LINE__, "addModule - Remove Module Completed", LOG_TYPE_DEBUG);
pthread_mutex_unlock(&THREAD_LOCK);
cmd = "/bin/cp -f " + logFile + " " + logFile + "failed";
system(cmd.c_str());
processManager.setModuleState(remoteModuleName, oam::FAILED);
return API_FAILURE;
}
if (manualFlag)
//set new module to disable state if manual add
disableModule(remoteModuleName, true);
// add to monitor list
moduleInfoList.insert(moduleList::value_type(remoteModuleName, 0));
processManager.configureModule(remoteModuleName);
}
//delay to give time for ProcMon to start after the config is sent and procmon restarts
log.writeLog(__LINE__, "addModule - sleep 60 - give ProcMon time to CONFIGURE and restart", LOG_TYPE_DEBUG);
sleep(60);
//start mysqld on the new modules so mysql replication can be setup
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
processManager.startProcess((*listPT).DeviceName, "mysqld", oam::STATUS_UPDATE);
}
log.writeLog(__LINE__, "Setup MySQL Replication for new Modules being Added", LOG_TYPE_DEBUG);
processManager.setMySQLReplication(devicenetworklist, oam::UnassignedName, true, password, true, true );
//stop mysqld
listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
processManager.stopProcess((*listPT).DeviceName, "mysqld", oam::FORCEFUL, true );
}
return API_SUCCESS;
}
/******************************************************************************************
* @brief	removeModule
*
* purpose:	Remove one or more modules from the system configuration
*
* All entries of devicenetworklist are treated as being of the module type of
* the FIRST entry (the type is taken from the first DeviceName only).
*
* @param devicenetworklist	modules to remove; only DeviceName is read from each entry
* @param manualFlag		when true, each module is first stopped with the REMOVE option
*
* @return	API_SUCCESS on success,
*		API_INVALID_PARAMETER for an empty list, a remove count larger than the
*		configured module count, an invalid module name, or an attempt to remove
*		the Active Parent OAM module,
*		API_FAILURE when a configuration read/write or a follow-up update fails
*
* THREAD_LOCK is held while the configuration is being modified; it is released
* temporarily around checkSimplexModule() and before the shared-memory update.
******************************************************************************************/
int ProcessManager::removeModule(oam::DeviceNetworkList devicenetworklist, bool manualFlag)
{
    ProcessLog log;
    Configuration config;
    ProcessManager processManager(config, log);
    ModuleTypeConfig moduletypeconfig;
    ModuleTypeConfig setmoduletypeconfig;
    Oam oam;
    string Section;

    // The module type is derived from the first entry below; an empty list
    // cannot be processed and dereferencing begin() on it would be undefined.
    if ( devicenetworklist.empty() )
    {
        log.writeLog(__LINE__, "removeModule - ERROR: no modules specified", LOG_TYPE_ERROR);
        return API_INVALID_PARAMETER;
    }

    pthread_mutex_lock(&THREAD_LOCK);

    //get module count being removed
    int RemoveModuleCount = devicenetworklist.size();

    DeviceNetworkList::iterator listPT = devicenetworklist.begin();

    //
    //Get System Configuration
    //
    string moduleType = (*listPT).DeviceName.substr(0, MAX_MODULE_TYPE_SIZE);

    try
    {
        oam.getSystemConfig(moduleType, moduletypeconfig);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "removeModule - ERROR: getSystemConfig", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    // work on a copy; it is written back with setSystemConfig() further down
    setmoduletypeconfig = moduletypeconfig;

    // get current Module Type Count and validate request
    int oldModuleCount = moduletypeconfig.ModuleCount;

    if ( oldModuleCount < RemoveModuleCount )
    {
        log.writeLog(__LINE__, "removeModule - ERROR: remove count is larger than ModuleType count", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_INVALID_PARAMETER;
    }

    //validate the module list to be removed
    listPT = devicenetworklist.begin();

    for ( ; listPT != devicenetworklist.end() ; listPT++)
    {
        int returnStatus = oam.validateModule((*listPT).DeviceName);

        if (returnStatus != API_SUCCESS)
        {
            log.writeLog(__LINE__, "removeModule - ERROR: invalid module: " + (*listPT).DeviceName, LOG_TYPE_ERROR);
            pthread_mutex_unlock(&THREAD_LOCK);
            return API_INVALID_PARAMETER;
        }
    }

    if (manualFlag)
    {
        //stopModules being removed with the REMOVE option, which will stop process
        listPT = devicenetworklist.begin();

        for ( ; listPT != devicenetworklist.end() ; listPT++)
        {
            string moduleName = (*listPT).DeviceName;
            log.writeLog(__LINE__, "removeModule - stopping module: " + moduleName, LOG_TYPE_DEBUG);

            //don't allow remove of Active PM Module
            if ( moduleName == config.OAMParentName() )
            {
                log.writeLog(__LINE__, "removeModule - ERROR: can't remove current module (Active Parent OAM) ", LOG_TYPE_ERROR);
                pthread_mutex_unlock(&THREAD_LOCK);
                return API_INVALID_PARAMETER;
            }

            int status;
            status = stopModule(moduleName, REMOVE, true);

            if (status == API_SUCCESS)
            {
                log.writeLog(__LINE__, "removeModule - stopModule Successfully " + moduleName, LOG_TYPE_DEBUG);

                //check for SIMPLEX Processes on mate might need to be started
                // (the lock is released for the duration of this call)
                pthread_mutex_unlock(&THREAD_LOCK);
                checkSimplexModule(moduleName);
                pthread_mutex_lock(&THREAD_LOCK);
            }
            else
                log.writeLog(__LINE__, "removeModule - stopModule " + moduleName, LOG_TYPE_ERROR);
        }
    }

    int newModuleCount = oldModuleCount - RemoveModuleCount;
    setmoduletypeconfig.ModuleCount = newModuleCount;

    // best effort: both values stay empty when they cannot be read
    string systemName;
    string AmazonAutoTagging;
    {
        try
        {
            oam.getSystemConfig("SystemName", systemName);
            oam.getSystemConfig("AmazonAutoTagging", AmazonAutoTagging);
        }
        catch (...) {}
    }

    //Clear out Module IP and Hostnames
    listPT = devicenetworklist.begin();

    for ( ; listPT != devicenetworklist.end() ; listPT++)
    {
        string moduleName = (*listPT).DeviceName;
        log.writeLog(__LINE__, "removeModule - removing module: " + moduleName, LOG_TYPE_DEBUG);

        //don't allow remove of Active PM Module
        if ( moduleName == config.OAMParentName() )
        {
            log.writeLog(__LINE__, "removeModule - ERROR: can't remove current module (Active Parent OAM) ", LOG_TYPE_ERROR);
            pthread_mutex_unlock(&THREAD_LOCK);
            return API_INVALID_PARAMETER;
        }

        DeviceNetworkList::iterator pt = setmoduletypeconfig.ModuleNetworkList.begin();

        for ( ; pt != setmoduletypeconfig.ModuleNetworkList.end() ; pt++)
        {
            if ( moduleName == (*pt).DeviceName )
            {
                HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();

                for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
                {
                    //if cloud, delete instance
                    if (amazon)
                    {
                        log.writeLog(__LINE__, "removeModule - terminate instance: " + (*pt1).HostName, LOG_TYPE_DEBUG);
                        oam.terminateEC2Instance( (*pt1).HostName );

                        // update instance tag
                        if ( AmazonAutoTagging == "y" )
                        {
                            string tagValue = systemName + "-" + moduleName + "-terminated";
                            oam.createEC2tag( (*pt1).HostName, "Name", tagValue );
                        }

                        //check if any volumes need to be deleted
                        if ( moduleType == "um" )
                        {
                            string UMStorageType = "internal";
                            {
                                try
                                {
                                    oam.getSystemConfig("UMStorageType", UMStorageType);
                                }
                                catch (...) {}
                            }

                            if ( UMStorageType == "external" )
                            {
                                //look up the volume/device assigned to this UM
                                int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());

                                string volumeNameID = "UMVolumeName" + oam.itoa(moduleID);
                                string volumeName = oam::UnassignedName;
                                string deviceNameID = "UMVolumeDeviceName" + oam.itoa(moduleID);
                                string deviceName = oam::UnassignedName;

                                try
                                {
                                    oam.getSystemConfig( volumeNameID, volumeName);
                                    oam.getSystemConfig( deviceNameID, deviceName);
                                }
                                catch (...)
                                {}

                                // Only touch EC2 when a real volume is assigned. The previous
                                // '||' made this condition always true, so detach/delete was
                                // attempted even for an empty or unassigned volume name.
                                if ( !volumeName.empty() && volumeName != oam::UnassignedName )
                                {
                                    log.writeLog(__LINE__, "removeModule - detach / remove volume: " + volumeName + "/" + deviceName, LOG_TYPE_DEBUG);
                                    oam.detachEC2Volume( volumeName );
                                    oam.deleteEC2Volume( volumeName );

                                    // best effort: failure to clear the entries is ignored
                                    try
                                    {
                                        Config* volumeConfig = Config::makeConfig();
                                        volumeConfig->setConfig("Installation", volumeNameID, oam::UnassignedName);
                                        volumeConfig->setConfig("Installation", deviceNameID, oam::UnassignedName);
                                        volumeConfig->write();
                                    }
                                    catch (...)
                                    {}
                                }
                            }
                        }
                    }

                    clearNICAlarms((*pt1).HostName);

                    (*pt1).IPAddr = oam::UnassignedIpAddr;
                    (*pt1).HostName = oam::UnassignedName;
                }

                break;
            }
        }
    }

    //Remove Process Ports
    //	 all nodes: ProcessMonitor, ServerMonitor
    //	 dm: NONE
    //	 um: ExeMgr
    //	 pm: NONE
    Config* sysConfig = Config::makeConfig();

    listPT = devicenetworklist.begin();

    for ( ; listPT != devicenetworklist.end() ; listPT++)
    {
        Section = (*listPT).DeviceName + "_ProcessMonitor";
        sysConfig->setConfig(Section, "IPAddr", oam::UnassignedName);

        Section = (*listPT).DeviceName + "_ServerMonitor";
        sysConfig->setConfig(Section, "IPAddr", oam::UnassignedName);
    }

    if ( moduleType == "um" ||
            ( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM ) ||
            ( moduleType == "um" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM ) ||
            ( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_PM_UM ) ||
            ( moduleType == "pm" && PMwithUM == "y" ) )
    {
        listPT = devicenetworklist.begin();

        for ( ; listPT != devicenetworklist.end() ; listPT++)
        {
            // go find ExeMgr ID by moduleName; the scan ends at the first
            // ExeMgr<id> section whose "Module" entry is empty (or unreadable)
            for ( int id = 1 ; ; id++ )
            {
                string exeMgrSection = "ExeMgr" + oam.itoa(id);
                string moduleName;

                try
                {
                    Config* exeMgrConfig = Config::makeConfig();
                    moduleName = exeMgrConfig->getConfig(exeMgrSection, "Module");

                    if ( moduleName == (*listPT).DeviceName )
                    {
                        // match
                        exeMgrConfig->setConfig(exeMgrSection, "IPAddr", oam::UnassignedName);
                        exeMgrConfig->setConfig(exeMgrSection, "Module", oam::UnassignedName);
                        break;
                    }
                }
                catch (...) {}

                if ( moduleName.empty() )
                    break;
            }
        }
    }

    log.writeLog(__LINE__, "removeModule - Updated Process Ports", LOG_TYPE_DEBUG);

    //unassign dbroot entries
    if (moduleType == "pm")
    {
        const string MODULE_DBROOTID = "ModuleDBRootID";
        const string MODULE_DBROOT_COUNT = "ModuleDBRootCount";

        listPT = devicenetworklist.begin();

        for ( ; listPT != devicenetworklist.end() ; listPT++)
        {
            string moduleID = (*listPT).DeviceName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE);

            string ModuleDBRootCount = MODULE_DBROOT_COUNT + moduleID + "-3";
            sysConfig->setConfig("SystemModuleConfig", ModuleDBRootCount, oam::UnassignedName);

            string ModuleDBrootID = MODULE_DBROOTID + moduleID + "-1-3";
            sysConfig->setConfig("SystemModuleConfig", ModuleDBrootID, oam::UnassignedName);
        }
    }

    log.writeLog(__LINE__, "removeModule - Updated DBRoot paramaters", LOG_TYPE_DEBUG);

    //update Calpont Config table
    try
    {
        sysConfig->write();
    }
    catch (...)
    {
        log.writeLog(__LINE__, "removeModule - ERROR: sysConfig->write", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    //write Columnstore.xml Module section
    try
    {
        oam.setSystemConfig(moduleType, setmoduletypeconfig);
        log.writeLog(__LINE__, "removeModule - Updated Module Section of Config file", LOG_TYPE_DEBUG);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "removeModule - ERROR: setSystemConfig", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    //clear out the known_hosts file, sometimes causes a failure on amazon during addModule
    if ( amazon )
    {
        string homedir = "/root";

        if (!rootUser)
        {
            char* p = getenv("HOME");

            if (p && *p)
                homedir = p;
        }

        // The path previously lacked the '/' separator and misspelled the file
        // name ("know_hosts"), so the file was never removed.
        string cmd = "rm -f " + homedir + "/.ssh/known_hosts > /dev/null 2>&1";
        system(cmd.c_str());
    }

    // configuration changes are done; no return path below may unlock again
    pthread_mutex_unlock(&THREAD_LOCK);

    //check if any removed modules was Standby OAM or Active OAM
    bool activeOAM = false;
    listPT = devicenetworklist.begin();

    for ( ; listPT != devicenetworklist.end() ; listPT++)
    {
        if ( (*listPT).DeviceName == config.OAMStandbyName() )
            clearStandbyModule();
        else if ( (*listPT).DeviceName == config.OAMParentName() )
            activeOAM = true;
    }

    //
    //send message to Process Monitor to remove module/processes to shared memory
    //
    if ( !activeOAM )
    {
        try
        {
            ByteStream obs;

            obs << (ByteStream::byte) REMOVE_MODULE;
            obs << (ByteStream::byte) RemoveModuleCount;

            listPT = devicenetworklist.begin();

            for ( ; listPT != devicenetworklist.end() ; listPT++)
            {
                obs << (*listPT).DeviceName;
            }

            sendStatusUpdate(obs, REMOVE_MODULE);

            log.writeLog(__LINE__, "removeModule - Updated Shared Memory", LOG_TYPE_DEBUG);
        }
        catch (...)
        {
            log.writeLog(__LINE__, "removeModule - ERROR: sendStatusUpdate error", LOG_TYPE_ERROR);
            return API_FAILURE;
        }
    }

    if ( moduleType == "pm" )
    {
        if ( updatePMSconfig() != API_SUCCESS )
            return API_FAILURE;
    }

    //Update DBRM section of Columnstore.xml
    if ( updateWorkerNodeconfig() != API_SUCCESS )
        return API_FAILURE;

    // remove all associated alarms for this modules being removed
    listPT = devicenetworklist.begin();

    for ( ; listPT != devicenetworklist.end() ; listPT++)
    {
        clearModuleAlarms( (*listPT).DeviceName );
        log.writeLog(__LINE__, "removeModule - successfully removed module: " + (*listPT).DeviceName, LOG_TYPE_DEBUG);
    }

    //distribute config file
    distributeConfigFile("system");

    return API_SUCCESS;
}
/******************************************************************************************
* @brief	reconfigureModule
*
* purpose:	Reconfigure Module in system configuration
*
* The module named by the first list entry is stopped and re-registered under
* the name (and module type) given by the second list entry.
*
* @param devicenetworklist	must hold at least two entries:
*				[0] DeviceName = module being reconfigured
*				[1] DeviceName = new module name; if its hostConfigList is
*				    not empty, the first host entry is added as an extra
*				    NIC of the reconfigured module
*
* @return	API_SUCCESS on success,
*		API_INVALID_PARAMETER when fewer than two entries are supplied,
*		API_FAILURE when a configuration read/write, the ProcMon request or a
*		follow-up update fails
*
* THREAD_LOCK is held while the configuration is being modified and released
* before the shared-memory updates are sent.
******************************************************************************************/
int ProcessManager::reconfigureModule(oam::DeviceNetworkList devicenetworklist)
{
    ModuleTypeConfig reconfiguremoduletypeconfig;
    ModuleTypeConfig setreconfiguremoduletypeconfig;
    ModuleTypeConfig moduletypeconfig;
    DeviceNetworkConfig devicenetworkconfig;
    Oam oam;
    string Section;

    // Both the current and the new module entry are dereferenced below;
    // reject short lists instead of walking past the end of the container.
    if ( devicenetworklist.size() < 2 )
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: module list must contain 2 entries", LOG_TYPE_ERROR);
        return API_INVALID_PARAMETER;
    }

    pthread_mutex_lock(&THREAD_LOCK);

    DeviceNetworkList::iterator listPT = devicenetworklist.begin();

    //get module name being reconfigured
    string moduleName = (*listPT).DeviceName;
    string moduleType = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);

    //get module type being configured as
    listPT++;
    string reconfigureModuleName = (*listPT).DeviceName;
    string reconfigureModuleType = reconfigureModuleName.substr(0, MAX_MODULE_TYPE_SIZE);

    // optional secondary NIC information carried by the second entry
    string reconfigureHostName2;
    string reconfigureIpAddr2;
    int reconfigureNicId2 = 0;

    if ( !(*listPT).hostConfigList.empty())
    {
        HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin();
        reconfigureHostName2 = (*pt1).HostName;
        reconfigureIpAddr2 = (*pt1).IPAddr;
        reconfigureNicId2 = (*pt1).NicID;
    }

    int status = stopModule(moduleName, GRACEFUL, true);

    if (status == API_SUCCESS)
    {
        log.writeLog(__LINE__, "reconfigureModule - stopModule Successfully " + moduleName, LOG_TYPE_DEBUG);

        //check for SIMPLEX Processes on mate might need to be started
        // (the lock is released for the duration of this call)
        pthread_mutex_unlock(&THREAD_LOCK);
        checkSimplexModule(moduleName);
        pthread_mutex_lock(&THREAD_LOCK);
    }
    else
        log.writeLog(__LINE__, "reconfigureModule - stopModule " + moduleName, LOG_TYPE_ERROR);

    //
    //Get Module Configuration
    //
    try
    {
        oam.getSystemConfig(moduleType, moduletypeconfig);
        oam.getSystemConfig(reconfigureModuleType, reconfiguremoduletypeconfig);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: getSystemConfig", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    setreconfiguremoduletypeconfig = reconfiguremoduletypeconfig;

    // update Module Type Counts
    setreconfiguremoduletypeconfig.ModuleCount++;

    Config* sysConfig = Config::makeConfig();

    //Move Module IP and Hostnames
    string IPaddress = oam::UnassignedIpAddr;
    HostConfig hostconfig;

    DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();

    for ( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
    {
        if ( moduleName == (*pt).DeviceName )
        {
            devicenetworkconfig.DeviceName = reconfigureModuleName;
            HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();

            for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
            {
                if ( pt1 == (*pt).hostConfigList.begin() )
                    //save first IP for Process Port usage
                    IPaddress = (*pt1).IPAddr;

                hostconfig.IPAddr = (*pt1).IPAddr;
                hostconfig.HostName = (*pt1).HostName;
                hostconfig.NicID = (*pt1).NicID;
                devicenetworkconfig.hostConfigList.push_back(hostconfig);
            }

            //configure any secondary NIC info passed from console
            if ( ! reconfigureHostName2.empty() )
            {
                hostconfig.IPAddr = reconfigureIpAddr2;
                hostconfig.HostName = reconfigureHostName2;
                hostconfig.NicID = reconfigureNicId2;
                devicenetworkconfig.hostConfigList.push_back(hostconfig);
            }

            setreconfiguremoduletypeconfig.ModuleNetworkList.push_back(devicenetworkconfig);
            break;
        }
    }

    // still unassigned when the module was not found or has no host entries
    if ( IPaddress == oam::UnassignedIpAddr )
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: module IP is unassigned", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    //Update Process Ports
    //	 all nodes: ProcessMonitor, ServerMonitor
    //	 dm: NONE
    //	 um: ExeMgr
    //	 pm: NONE
    Section = reconfigureModuleName + "_ProcessMonitor";
    sysConfig->setConfig(Section, "IPAddr", IPaddress);
    sysConfig->setConfig(Section, "Port", "8800");

    Section = reconfigureModuleName + "_ServerMonitor";
    sysConfig->setConfig(Section, "IPAddr", IPaddress);
    sysConfig->setConfig(Section, "Port", "8622");

    if ( moduleType == "um" ||
            ( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM ) ||
            ( moduleType == "pm" && config.ServerInstallType() == oam::INSTALL_COMBINE_PM_UM ) )
    {
        // the old module's ExeMgr entry is unassigned
        int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());

        Section = "ExeMgr" + oam.itoa(moduleID);
        sysConfig->setConfig(Section, "IPAddr", oam::UnassignedIpAddr);
    }
    else
    {
        //PM TO UM
        int moduleID = atoi(reconfigureModuleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());

        Section = "ExeMgr" + oam.itoa(moduleID);
        sysConfig->setConfig(Section, "IPAddr", IPaddress);
        sysConfig->setConfig(Section, "Port", "8601");
    }

    log.writeLog(__LINE__, "reconfigureModule - Updated Process Ports", LOG_TYPE_DEBUG);

    //update Calpont Config table
    try
    {
        sysConfig->write();
    }
    catch (...)
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: sysConfig->write", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    //write Columnstore.xml Module section
    try
    {
        oam.setSystemConfig(reconfigureModuleType, setreconfiguremoduletypeconfig);
        log.writeLog(__LINE__, "reconfigureModule - Updated Module Section of Config file", LOG_TYPE_DEBUG);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: setSystemConfig", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    //distribute config file
    distributeConfigFile(moduleName);

    //
    //Send Reconfigure msg to Module's Process-Monitor being reconfigured
    //
    ByteStream msg;
    ByteStream::byte requestID = RECONFIGURE;

    msg << requestID;
    msg << reconfigureModuleName;

    int returnStatus = sendMsgProcMon( moduleName, msg, requestID );

    if ( returnStatus == API_SUCCESS)
        //log the event
        log.writeLog(__LINE__, "reconfigureModule - procmon reconfigure successful", LOG_TYPE_DEBUG);
    else
    {
        log.writeLog(__LINE__, "reconfigureModule - procmon reconfigure failed", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    // re-read the old module type's configuration and take the module out of it
    ModuleTypeConfig setmoduletypeconfig;

    try
    {
        oam.getSystemConfig(moduleType, setmoduletypeconfig);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: getSystemConfig", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    // update Module Type Counts
    setmoduletypeconfig.ModuleCount--;

    //Clear Module IP and Hostnames
    pt = setmoduletypeconfig.ModuleNetworkList.begin();

    for ( ; pt != setmoduletypeconfig.ModuleNetworkList.end() ; pt++)
    {
        if ( moduleName == (*pt).DeviceName )
        {
            HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();

            for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
            {
                (*pt1).IPAddr = oam::UnassignedIpAddr;
                (*pt1).HostName = oam::UnassignedName;
            }

            break;
        }
    }

    //Update Process Ports
    //	 all nodes: ProcessMonitor, ServerMonitor
    //	 dm: NONE
    //	 um: ExeMgr
    //	 pm: NONE
    Section = moduleName + "_ProcessMonitor";
    sysConfig->setConfig(Section, "IPAddr", oam::UnassignedIpAddr);

    Section = moduleName + "_ServerMonitor";
    sysConfig->setConfig(Section, "IPAddr", oam::UnassignedIpAddr);

    log.writeLog(__LINE__, "reconfigureModule - Updated Process Ports", LOG_TYPE_DEBUG);

    //update Calpont Config table
    try
    {
        sysConfig->write();
    }
    catch (...)
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: sysConfig->write", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    //write Columnstore.xml Module section
    try
    {
        oam.setSystemConfig(moduleType, setmoduletypeconfig);
        log.writeLog(__LINE__, "reconfigureModule - Updated Module Section of Config file", LOG_TYPE_DEBUG);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: setSystemConfig", LOG_TYPE_ERROR);
        pthread_mutex_unlock(&THREAD_LOCK);
        return API_FAILURE;
    }

    // configuration changes are done; no return path below may unlock again
    pthread_mutex_unlock(&THREAD_LOCK);

    //
    //send message to Process Monitor to remove/add module/processes to shared memory
    //
    try
    {
        ByteStream obs;

        obs << (ByteStream::byte) REMOVE_MODULE;
        obs << (ByteStream::byte) 1;
        obs << moduleName;

        sendStatusUpdate(obs, REMOVE_MODULE);

        log.writeLog(__LINE__, "reconfigureModule - module removed from Shared Memory", LOG_TYPE_DEBUG);
    }
    catch (...)
    {
        // THREAD_LOCK was already released above; unlocking it a second time
        // here (as the code used to do) is undefined for a default mutex.
        log.writeLog(__LINE__, "reconfigureModule - ERROR: sendStatusUpdate error", LOG_TYPE_ERROR);
        return API_FAILURE;
    }

    try
    {
        ByteStream obs;

        obs << (ByteStream::byte) ADD_MODULE;
        obs << (ByteStream::byte) 1;
        obs << reconfigureModuleName;

        //pass NIC Hostnames
        if ( ! reconfigureHostName2.empty() )
        {
            // hostconfig still holds the secondary NIC entry set up above
            obs << (ByteStream::byte) 1;
            obs << hostconfig.HostName;
        }
        else
            obs << (ByteStream::byte) 0;

        sendStatusUpdate(obs, ADD_MODULE);

        log.writeLog(__LINE__, "reconfigureModule - module added from Shared Memory", LOG_TYPE_DEBUG);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "reconfigureModule - ERROR: sendStatusUpdate error", LOG_TYPE_ERROR);
        return API_FAILURE;
    }

    if ( moduleType == "pm" )
    {
        if ( updatePMSconfig() != API_SUCCESS )
            return API_FAILURE;
    }

    //Update DBRM section of Columnstore.xml
    if ( updateWorkerNodeconfig() != API_SUCCESS )
        return API_FAILURE;

    // remove all associated alarms for this modules being removed
    clearModuleAlarms( moduleName );

    //distribute config file
    distributeConfigFile("system");

    return API_SUCCESS;
}
/******************************************************************************************
* @brief configureModule
*
* purpose: Configure Module sends message to procmon to setup modulename
*
******************************************************************************************/
int ProcessManager::configureModule(std::string moduleName)
{
log.writeLog(__LINE__, "configureModule: Process module " + moduleName, LOG_TYPE_DEBUG);
//distribute config file
distributeConfigFile(moduleName);
distributeConfigFile(moduleName, "ProcessConfig.xml");
//
//Send Configure msg to Module's Process-Monitor being reconfigured
//
ByteStream msg;
ByteStream::byte requestID = CONFIGURE;
msg << requestID;
msg << moduleName;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID );
if ( returnStatus == API_SUCCESS)
//log the event
log.writeLog(__LINE__, "configureModule - procmon configure successful", LOG_TYPE_DEBUG);
else
{
log.writeLog(__LINE__, "configureModule - procmon configure failed", LOG_TYPE_ERROR);
return API_FAILURE;
}
return API_SUCCESS;
}
/******************************************************************************************
* @brief sendMsgProcMon
*
* purpose: Sends a Msg to ProcMon
*
******************************************************************************************/
int ProcessManager::sendMsgProcMon( std::string module, ByteStream msg, int requestID, int timeout )
{
string msgPort;
int returnStatus = API_FAILURE;
Oam oam;
if ( module != config.moduleName() )
{
msgPort = module + "_ProcessMonitor";
// do a ping test to determine a quick failure
Config* sysConfig = Config::makeConfig();
string IPAddr = sysConfig->getConfig(msgPort, "IPAddr");
if ( IPAddr == oam::UnassignedIpAddr )
{
log.writeLog(__LINE__, "sendMsgProcMon ping failure " + module + " " + IPAddr, LOG_TYPE_ERROR);
return oam::API_SUCCESS;
}
string cmdLine = "ping ";
string cmdOption = " -c 1 -w 5 >> /dev/null";
string cmd = cmdLine + IPAddr + cmdOption;
if ( system(cmd.c_str()) != 0)
{
//ping failure
log.writeLog(__LINE__, "sendMsgProcMon ping failure " + module + " " + IPAddr, LOG_TYPE_ERROR);
return oam::API_SUCCESS;
}
}
else
// use the localhost IP Address
msgPort = "localhost_ProcessMonitor";
log.writeLog(__LINE__, "sendMsgProcMon: Process module " + module, LOG_TYPE_DEBUG);
try
{
MessageQueueClient mqRequest(msgPort);
mqRequest.write(msg);
if ( timeout > 0 )
{
// wait for response
ByteStream::byte returnACK;
ByteStream::byte returnRequestID;
ByteStream::byte requestStatus;
ByteStream receivedMSG;
struct timespec ts = { timeout, 0 };
// get current time in seconds
time_t startTimeSec;
time (&startTimeSec);
while (true)
{
try
{
receivedMSG = mqRequest.read(&ts);
}
catch (SocketClosed& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read, module " + module + " : " + error, LOG_TYPE_ERROR);
return returnStatus;
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: Caught unknown exception! module " + module, LOG_TYPE_ERROR);
return returnStatus;
}
if (receivedMSG.length() > 0)
{
receivedMSG >> returnACK;
receivedMSG >> returnRequestID;
receivedMSG >> requestStatus;
if ( requestID == oam::MASTERREP )
{
receivedMSG >> masterLogFile;
receivedMSG >> masterLogPos;
}
if ( returnACK == oam::ACK && returnRequestID == requestID)
{
// ACK for this request
returnStatus = requestStatus;
break;
}
else
log.writeLog(__LINE__, "sendMsgProcMon: invalid message " + module, LOG_TYPE_ERROR);
}
else
{
//api timeout occurred, check if retry should be done
// get current time in seconds
time_t endTimeSec;
time (&endTimeSec);
if ( timeout <= (endTimeSec - startTimeSec) )
{
log.writeLog(__LINE__, "sendMsgProcMon: ProcMon Msg timeout on module " + module, LOG_TYPE_ERROR);
break;
}
}
}
}
else
returnStatus = oam::API_SUCCESS;
mqRequest.shutdown();
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: Caught unknown exception!", LOG_TYPE_ERROR);
}
return returnStatus;
}
/******************************************************************************************
* @brief sendMsgProcMon1
*
* purpose: Sends a Msg to ProcMon
*
******************************************************************************************/
std::string ProcessManager::sendMsgProcMon1( std::string module, ByteStream msg, int requestID )
{
string msgPort;
string returnStatus = "FAILED";
if ( module != config.moduleName() )
{
msgPort = module + "_ProcessMonitor";
// do a ping test to determine a quick failure
Config* sysConfig = Config::makeConfig();
string IPAddr = sysConfig->getConfig(msgPort, "IPAddr");
string cmdLine = "ping ";
string cmdOption = " -c 1 -w 5 >> /dev/null";
string cmd = cmdLine + IPAddr + cmdOption;
if ( system(cmd.c_str()) != 0 )
{
//ping failure
log.writeLog(__LINE__, "sendMsgProcMon ping failure", LOG_TYPE_ERROR);
return returnStatus;
}
}
else
// use the localhost IP Address
msgPort = "localhost_ProcessMonitor";
try
{
MessageQueueClient mqRequest(msgPort);
mqRequest.write(msg);
// wait 30 seconds for response
ByteStream::byte returnACK;
ByteStream::byte returnRequestID;
string requestStatus;
ByteStream receivedMSG;
struct timespec ts = { 30, 0 };
try
{
receivedMSG = mqRequest.read(&ts);
}
catch (SocketClosed& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: " + error, LOG_TYPE_ERROR);
return returnStatus;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: Caught unknown exception!", LOG_TYPE_ERROR);
return returnStatus;
}
if (receivedMSG.length() > 0)
{
receivedMSG >> returnACK;
receivedMSG >> returnRequestID;
receivedMSG >> requestStatus;
if ( returnACK == oam::ACK && returnRequestID == requestID)
{
// ACK for this request
returnStatus = requestStatus;
}
}
else
log.writeLog(__LINE__, "sendMsgProcMon1: ProcMon Msg timeout on module " + module, LOG_TYPE_ERROR);
mqRequest.shutdown();
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: Caught unknown exception!", LOG_TYPE_ERROR);
}
return returnStatus;
}
/******************************************************************************************
* @brief	saveBRM
*
* purpose:	Execute the reset_locks then save BRM data script
*
* @param skipSession	when true, reset_locks is run with the "-s" option
* @param clearshm	when true, "clearShm -c" is run after save_brm
*
* Output of reset_locks and save_brm is written to /var/log/mariadb/columnstore,
* or to tmpLogDir when that directory is not writable. Failures are only logged;
* nothing is reported back to the caller.
******************************************************************************************/
void ProcessManager::saveBRM(bool skipSession, bool clearshm)
{
    Oam oam;

    string logdir("/var/log/mariadb/columnstore");

    if (access(logdir.c_str(), W_OK) != 0) logdir = tmpLogDir;

    log.writeLog(__LINE__, "Running reset_locks", LOG_TYPE_DEBUG);

    string skip = " ";

    if ( skipSession )
        skip = "-s";

    string cmd = "reset_locks " + skip + " > " + logdir + "/reset_locks.log1 2>&1";
    // the outcome of reset_locks is not evaluated
    int rtnCode = system(cmd.c_str());

    log.writeLog(__LINE__, "Ran reset_locks", LOG_TYPE_DEBUG);

    log.writeLog(__LINE__, "Running DBRM save_brm", LOG_TYPE_DEBUG);

    cmd = "save_brm > " + logdir + "/save_brm.log1 2>&1";
    rtnCode = system(cmd.c_str());

    // Success means the command ran and exited with status 0. The former
    // "exit status != 1" test also reported success when the command could not
    // be started (system() == -1), was not found (127) or died from a signal.
    if ( rtnCode != -1 && WIFEXITED(rtnCode) && WEXITSTATUS(rtnCode) == 0 )
    {
        log.writeLog(__LINE__, "Successfully ran DBRM save_brm", LOG_TYPE_DEBUG);
    }
    else
        log.writeLog(__LINE__, "Error running DBRM save_brm", LOG_TYPE_ERROR);

    if ( clearshm )
    {
        cmd = "clearShm -c > /dev/null 2>&1";
        rtnCode = system(cmd.c_str());

        if ( rtnCode != -1 && WIFEXITED(rtnCode) && WEXITSTATUS(rtnCode) == 0 )
        {
            log.writeLog(__LINE__, "Successfully ran DBRM clearShm", LOG_TYPE_DEBUG);
        }
        else
            log.writeLog(__LINE__, "Error running DBRM clearShm", LOG_TYPE_ERROR);
    }
}
/******************************************************************************************
* @brief setQuerySystemState
*
* purpose: set the DBRM system-query-ready and system-ready flags to the given value
*
******************************************************************************************/
void ProcessManager::setQuerySystemState(bool set)
{
Oam oam;
BRM::DBRM dbrm;
try
{
dbrm.setSystemQueryReady(set);
log.writeLog(__LINE__, "setSystemQueryReady = " + oam.itoa(set), LOG_TYPE_DEBUG);
try {
dbrm.setSystemReady(set);
log.writeLog(__LINE__, "setSystemReady = " + oam.itoa(set), LOG_TYPE_DEBUG);
}
catch(...)
{
log.writeLog(__LINE__, "setSystemReady failed", LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "setSystemReady failed", LOG_TYPE_ERROR);
}
}
catch(...)
{
log.writeLog(__LINE__, "setSystemQueryReady failed", LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "setSystemQueryReady failed", LOG_TYPE_ERROR);
}
}
/******************************************************************************************
* @brief createModuleFile
*
* purpose: Create a module file for remote server
*
******************************************************************************************/
/**
 * Writes the module-name file for a remote module.
 *
 * The file /var/lib/columnstore/local/etc/<remoteModuleName>/module is
 * recreated and filled with the module name followed by a newline.
 *
 * @param remoteModuleName name of the remote module (also the directory name)
 * @return true when the file was written, false when it could not be opened
 *         or the write failed (the failure is logged)
 */
bool ProcessManager::createModuleFile(string remoteModuleName)
{
    const string fileName = "/var/lib/columnstore/local/etc/" + remoteModuleName + "/module";

    // Drop any previous copy so the file is always created fresh.
    unlink(fileName.c_str());

    // Write through the stream directly rather than spawning a shell with
    // "echo"; this avoids passing the module name through shell parsing and
    // lets a failure be detected.
    ofstream newFile(fileName.c_str());

    if (!newFile)
    {
        log.writeLog(__LINE__, "createModuleFile: failed to open " + fileName, LOG_TYPE_ERROR);
        return false;
    }

    newFile << remoteModuleName << '\n';
    newFile.close();

    if (newFile.fail())
    {
        log.writeLog(__LINE__, "createModuleFile: failed to write " + fileName, LOG_TYPE_ERROR);
        return false;
    }

    return true;
}
/*****************************************************************************************
* @brief startSystemThread
*
* purpose: Send Messages to Module Process Monitors to start Processes
*
*****************************************************************************************/
void* startSystemThread(oam::DeviceNetworkList* Devicenetworklist)
{
// Thread entry point that brings the system up.
// Outline of what the code below does:
//   1. copies the caller's device list and refuses to run when the system is
//      already in AUTO_INIT / MAN_INIT
//   2. validates the DBRoot assignments of the PMs
//   3. loads the module configuration, updates worker-node / PMS settings
//   4. launches one startModuleThread per module (from the given list, or
//      for every configured module when the list is empty)
//   5. polls module status (up to 1200 passes, 5 seconds apart)
//   6. on success waits for DMLProc on the primary UM, then sets the final
//      system state
// The outcome is published through startsystemthreadStatus and
// startsystemthreadRunning, which are declared outside this function.
// Every exit path detaches the thread and leaves through pthread_exit().
assert(Devicenetworklist);
// work on a private copy of the caller's list
oam::DeviceNetworkList devicenetworklist = *Devicenetworklist;
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
SystemModuleTypeConfig systemmoduletypeconfig;
ALARMManager aManager;
int status = API_SUCCESS;
bool exitThread = false;
int exitThreadStatus = oam::API_SUCCESS;
pthread_t ThreadId;
ThreadId = pthread_self();
log.writeLog(__LINE__, "startSystemThread launched", LOG_TYPE_DEBUG);
// get system status and exit thread if in AUTO_INIT OR MAN_INIT
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus);
if (systemstatus.SystemOpState == AUTO_INIT ||
systemstatus.SystemOpState == MAN_INIT)
{
log.writeLog(__LINE__, "Start already in-progess, exit startSystemThread", LOG_TYPE_DEBUG);
startsystemthreadStatus = oam::API_ALREADY_IN_PROGRESS;
exitThread = true;
exitThreadStatus = oam::API_ALREADY_IN_PROGRESS;
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: " + error, LOG_TYPE_ERROR);
// status could not be read: mark the system MAN_OFFLINE and give up
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::MAN_OFFLINE);
exitThread = true;
exitThreadStatus = oam::API_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemStatus: Caught unknown exception!", LOG_TYPE_ERROR);
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::MAN_OFFLINE);
exitThread = true;
exitThreadStatus = oam::API_FAILURE;
}
// the exit is done outside the try/catch above, using the flag set in it
if ( exitThread )
{
pthread_detach (ThreadId);
pthread_exit(reinterpret_cast<void*>(static_cast<ptrdiff_t>(exitThreadStatus)));
}
// enter the INIT state that matches the previous state:
// AUTO_OFFLINE -> AUTO_INIT, anything else -> MAN_INIT
if (systemstatus.SystemOpState == AUTO_OFFLINE)
processManager.setSystemState(oam::AUTO_INIT);
else
processManager.setSystemState(oam::MAN_INIT);
//validate the dbroots assignments
//make sure no 1 ID is assigned to 2 PMs
//and a dbroot not assigned to a DISABLED PM
try
{
systemStorageInfo_t t;
t = oam.getStorageConfig();
// two copies of the same list, so every module can be compared with every other
DeviceDBRootList moduledbrootlist1 = boost::get<2>(t);
DeviceDBRootList moduledbrootlist2 = boost::get<2>(t);
DeviceDBRootList::iterator pt1 = moduledbrootlist1.begin();
for ( ; pt1 != moduledbrootlist1.end() ; pt1++)
{
string moduleID1 = oam.itoa((*pt1).DeviceID);
string moduleName = "pm" + moduleID1;
// check DISABLED modules
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(moduleName, opState, degraded);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
// module status unavailable: this module is not validated
continue;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
continue;
}
//check if disabled
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
{
if ( (*pt1).dbrootConfigList.size() != 0 )
{
//issue log and Set the alarm
log.writeLog(__LINE__, "startSystemThread failed: Disabled Module '" + moduleName + "' has DBRoots assigned to it", LOG_TYPE_CRITICAL);
aManager.sendAlarmReport(config.moduleName().c_str(), STARTUP_DIAGNOTICS_FAILURE, SET);
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
pthread_detach (ThreadId);
pthread_exit((void*) oam::API_FAILURE);
}
// disabled module without DBRoots: nothing more to validate
continue;
}
// if module has no dbroots assigned, fail startSystem
if ( (*pt1).dbrootConfigList.size() == 0 )
{
//issue log and Set the alarm
log.writeLog(__LINE__, "startSystemThread failed: Module '" + moduleName + "' has no DBRoots assigned to it", LOG_TYPE_CRITICAL);
aManager.sendAlarmReport(config.moduleName().c_str(), STARTUP_DIAGNOTICS_FAILURE, SET);
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
pthread_detach (ThreadId);
pthread_exit((void*) oam::API_FAILURE);
}
// compare each DBRoot of this module with the DBRoots of all other modules
DBRootConfigList::iterator pt1a = (*pt1).dbrootConfigList.begin();
for ( ; pt1a != (*pt1).dbrootConfigList.end() ; pt1a++)
{
DeviceDBRootList::iterator pt2 = moduledbrootlist2.begin();
for ( ; pt2 != moduledbrootlist2.end() ; pt2++)
{
string moduleID2 = oam.itoa((*pt2).DeviceID);
// do not compare a module with itself
if ( moduleID1 == moduleID2 )
continue;
DBRootConfigList::iterator pt2a = (*pt2).dbrootConfigList.begin();
for ( ; pt2a != (*pt2).dbrootConfigList.end() ; pt2a++)
{
if ( *pt1a == *pt2a)
{
log.writeLog(__LINE__, "ERROR: DBRoot ID " + oam.itoa(*pt1a) + " configured on 2 pms: 'pm" + moduleID1 + "' and 'pm" + moduleID2 + "'", LOG_TYPE_CRITICAL);
//Set the alarm
aManager.sendAlarmReport(config.moduleName().c_str(), STARTUP_DIAGNOTICS_FAILURE, SET);
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
pthread_detach (ThreadId);
pthread_exit((void*) oam::API_FAILURE);
}
}
}
}
}
}
// NOTE(review): a std::exception raised during the validation above is
// swallowed without a log entry, so the validation is silently skipped
// and the start continues.
catch (exception& e)
{}
// load the module configuration used by the launch and polling code below
try
{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
exitThread = true;
exitThreadStatus = oam::API_FAILURE;
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
exitThread = true;
exitThreadStatus = oam::API_FAILURE;
}
if ( exitThread )
{
pthread_detach (ThreadId);
pthread_exit(reinterpret_cast<void*>(static_cast<ptrdiff_t>(exitThreadStatus)));
}
// NOTE(review): this repeats the state transition already performed before
// the DBRoot validation; confirm whether the second call is needed.
if (systemstatus.SystemOpState == AUTO_OFFLINE)
processManager.setSystemState(oam::AUTO_INIT);
else
processManager.setSystemState(oam::MAN_INIT);
startsystemthreadRunning = true;
string newStandbyModule = processManager.getStandbyModule();
if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
processManager.setStandbyModule(newStandbyModule);
//update workernode section
processManager.updateWorkerNodeconfig();
//configure PMS ports
if ( processManager.updatePMSconfig() != API_SUCCESS )
{
startsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
pthread_detach (ThreadId);
pthread_exit((void*) oam::API_FAILURE);
}
if ( devicenetworklist.size() != 0 )
{
//distribute config file
processManager.distributeConfigFile("system");
// start modules from devicenetworklist
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
//launch start module threads, starting with local module
pthread_t startmodulethread;
string moduleName = config.moduleName();
// NOTE(review): the new thread receives the address of a local string and
// copies it as its first action (see startModuleThread); the sleep that
// follows presumably exists to give it time to do so - confirm.
int status = pthread_create (&startmodulethread, NULL, (void* (*)(void*)) &startModuleThread, &moduleName);
if ( status != 0 )
log.writeLog(__LINE__, "startModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
sleep(5);
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
// skip local module name
if ( moduleName == config.moduleName() )
continue;
// bypass DISABLED modules
try
{
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
//skip
continue;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
// a module whose status could not be read is still started
pthread_t startmodulethread;
int status = pthread_create (&startmodulethread, NULL, (void* (*)(void*)) &startModuleThread, &moduleName);
if ( status != 0 )
log.writeLog(__LINE__, "startModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
sleep(5);
}
}
else
{
// start all modules, like on a systemStart command
//launch start module threads, starting with local module
if ( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM )
{
// combined install: the OAM parent module is recorded as the primary UM
try
{
oam.setSystemConfig("PrimaryUMModuleName", config.OAMParentName());
}
catch (...) {}
processManager.setPMProcIPs(config.OAMParentName());
}
//distribute config file
processManager.distributeConfigFile("system");
pthread_t startmodulethread;
string moduleName = config.moduleName();
int status = pthread_create (&startmodulethread, NULL, (void* (*)(void*)) &startModuleThread, &moduleName);
if ( status != 0 )
log.writeLog(__LINE__, "startModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
sleep(5);
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
string moduleName = (*pt).DeviceName;
// skip local module name
if ( moduleName == config.moduleName() )
continue;
// bypass DISABLED modules
try
{
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
//skip
continue;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
//setup primary User Module, DML/DDL only start on this module
if ( moduleName.find("um") == 0 && config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM)
{
string PrimaryUMModuleName;
try
{
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
}
catch (...) {}
// the first "um" module seen while the setting is still unassigned becomes the primary UM
if ( PrimaryUMModuleName == oam::UnassignedName )
{
try
{
oam.setSystemConfig("PrimaryUMModuleName", moduleName);
}
catch (...) {}
processManager.setPMProcIPs(moduleName);
//distribute config file
processManager.distributeConfigFile("system");
}
}
pthread_t startmodulethread;
string name = moduleName;
// NOTE(review): 'name' goes out of scope at the end of this iteration; only
// the sleep below separates thread creation from its destruction.
int status = pthread_create (&startmodulethread, NULL, (void* (*)(void*)) &startModuleThread, &name);
if ( status != 0 )
log.writeLog(__LINE__, "startModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
if ( !HDFS )
sleep(5);
else
//usleep(100000);
sleep(1);
}
}
}
// check status and process accordingly
// up to 1200 passes with a 5 second sleep between unsuccessful passes
int k = 0;
for ( ; k < 1200 ; k++ )
{
// startsystemthreadStop is set by startModuleThread on a major failure
if ( startsystemthreadStop )
{
log.writeLog(__LINE__, "startSystemThread exit early, startsystemthreadStop set", LOG_TYPE_DEBUG);
if ( startmodulethreadStatus != API_SUCCESS )
{
startsystemthreadStatus = startmodulethreadStatus;
processManager.setSystemState(oam::FAILED);
}
else
{
startsystemthreadStatus = API_FAILURE;
processManager.setSystemState(oam::MAN_OFFLINE);
}
startsystemthreadRunning = false;
pthread_detach (ThreadId);
pthread_exit((void*) oam::API_FAILURE);
}
string moduleName;
// assume success for this pass; any module that is not done changes it below
status = API_SUCCESS;
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
moduleName = (*pt).DeviceName;
// get module status
try
{
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if ( opState == oam::FAILED )
{
if ( startmodulethreadStatus != API_SUCCESS )
status = startmodulethreadStatus;
else
status = API_FAILURE;
// leaves only the inner (per-module) loop
break;
}
// MAN_OFFLINE counts as finished on every pass except the first one
if (opState == oam::ACTIVE ||
opState == oam::MAN_DISABLED ||
opState == oam::AUTO_DISABLED ||
(opState == oam::MAN_OFFLINE && k > 0) )
//skip
continue;
// any other state: this module is still starting
status = API_ALREADY_IN_PROGRESS;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
continue;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
continue;
}
}
// NOTE(review): only API_FAILURE stops the scan of module types; a different
// failure code copied from startmodulethreadStatus does not.
if ( status == API_FAILURE )
break;
}
//get out of loop if all modules started successfully
if ( status == API_SUCCESS )
{
//send message to start new Standby Process-Manager, if needed
string newStandbyModule = processManager.getStandbyModule();
if ( !newStandbyModule.empty() && newStandbyModule != "NONE")
{
// get standby IP address and update entries
processManager.setStandbyModule(newStandbyModule);
//distribute config file
processManager.distributeConfigFile("system");
}
break;
}
else
{
//get out of loop if start module failed
if ( status == API_FAILURE )
{
//set system status
log.writeLog(__LINE__, "startSystemThread: Module failed, Set System State to FAILED: " + moduleName, LOG_TYPE_CRITICAL);
processManager.setSystemState(oam::FAILED);
break;
}
}
sleep(5);
}
// k reaches 1200 only when the loop above ran out of passes without a break
if ( k == 1200 )
{
// system didn't Successfully restart
log.writeLog(__LINE__, "startSystemThread: Modules failed to start after 1200 tries, Set System State to FAILED", LOG_TYPE_CRITICAL);
processManager.setSystemState(oam::FAILED);
status = oam::API_FAILURE;
}
//set query system state not ready
processManager.setQuerySystemState(false);
// Bug 4554: Wait until DMLProc is finished with rollback
if (status == oam::API_SUCCESS)
{
BRM::DBRM dbrm;
uint16_t rtn = 0;
bool bfirst = true;
SystemProcessStatus systemprocessstatus;
string PrimaryUMModuleName;
try
{
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
}
catch (...) {}
// NOTE(review): this tests for an empty string, while the launch code above
// compares the same setting with oam::UnassignedName; also, status is still
// API_SUCCESS here, so startsystemthreadStatus reports success on this
// failure path - confirm both are intended.
if ( PrimaryUMModuleName.empty() )
{
log.writeLog(__LINE__, "startSystemThread: Failed, PrimaryUMModuleName is unassigned", LOG_TYPE_CRITICAL);
rtn = oam::FAILED;
log.writeLog(__LINE__, "startSystemThread Exit", LOG_TYPE_DEBUG);
processManager.setSystemState(oam::FAILED);
startsystemthreadStatus = status;
startsystemthreadRunning = false;
pthread_detach (ThreadId);
pthread_exit(0);
}
// waiting until dml are ACTIVE, then mark system ACTIVE
// NOTE(review): this loop has no timeout; it ends only when DMLProc reports
// ACTIVE or FAILED.
while (rtn == 0)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", PrimaryUMModuleName, DMLprocessstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
{
// log the wait message only once
if (bfirst)
{
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_INFO);
bfirst = false;
}
}
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
{
rtn = oam::ACTIVE;
break;
}
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
{
rtn = oam::FAILED;
status = oam::API_FAILURE;
break;
}
// wait some more
sleep(2);
}
// This was logical error and possible source of many problems.
if ( rtn == oam::ACTIVE )
//set query system state ready
processManager.setQuerySystemState(true);
// rtn is oam::ACTIVE or oam::FAILED at this point
processManager.setSystemState(rtn);
}
else
processManager.setSystemState(oam::FAILED);
// exit thread
log.writeLog(__LINE__, "startSystemThread Exit", LOG_TYPE_DEBUG);
startsystemthreadStatus = status;
startsystemthreadRunning = false;
pthread_detach (ThreadId);
pthread_exit(0);
}
/*****************************************************************************************
* @brief startModuleThread
*
* purpose: Send Messages to Module Process Monitors to start Processes
*
*****************************************************************************************/
void* startModuleThread(string* module)
{
    // Thread entry point: asks the module's Process-Monitor to start the
    // module and retries for as long as the attempt ends in
    // API_MINOR_FAILURE. A major failure is reported through
    // startmodulethreadStatus / startsystemthreadStop. The thread always
    // detaches itself and leaves through pthread_exit().
    assert(module);

    // take a private copy of the name before doing anything else
    string moduleName = *module;

    ProcessLog log;
    Configuration config;
    ProcessManager processManager(config, log);
    Oam oam;

    const pthread_t self = pthread_self();

    if (moduleName.empty())
    {
        log.writeLog(__LINE__, "startModuleThread received on invalid module name", LOG_TYPE_ERROR);
        pthread_detach(self);
        pthread_exit(0);
    }

    log.writeLog(__LINE__, "Start Module " + moduleName, LOG_TYPE_DEBUG);

    // becomes true once a status query has found the module in need of a start
    bool startPending = false;

    for (;;)
    {
        uint16_t startType = oam::MAN_OFFLINE;
        bool nothingToStart = false;

        // get module status; a failed query is ignored and the start is attempted
        try
        {
            int moduleState = oam::ACTIVE;
            bool degraded;
            oam.getModuleStatus(moduleName, moduleState, degraded);

            if (moduleState == oam::AUTO_OFFLINE || moduleState == oam::AUTO_INIT)
                startType = oam::AUTO_OFFLINE;

            nothingToStart = moduleState == oam::ACTIVE ||
                             moduleState == oam::MAN_DISABLED ||
                             moduleState == oam::AUTO_DISABLED ||
                             (moduleState == oam::MAN_OFFLINE && startPending);

            if (!nothingToStart)
                startPending = true;
        }
        catch (...)
        {
        }

        if (nothingToStart)
            break;

        if (startsystemthreadStop)
        {
            // set status and exit this thread
            processManager.setModuleState(moduleName, oam::MAN_OFFLINE);
            log.writeLog(__LINE__, "startModuleThread early exit on " + moduleName, LOG_TYPE_DEBUG);
            pthread_detach(self);
            pthread_exit(0);
        }

        const int retStatus = processManager.startModule(moduleName, oam::FORCEFUL, startType, true);
        log.writeLog(__LINE__, "ACK received from '" + moduleName + "' Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);

        if (retStatus == API_SUCCESS)
            break;

        // a minor failure is retried; anything else is a major failure
        if (retStatus == API_MINOR_FAILURE)
            continue;

        //major failure, set stopsystem flag and exit this thread
        startmodulethreadStatus = retStatus;
        startsystemthreadStop = true;
        break;
    }

    // exit thread
    log.writeLog(__LINE__, "startModuleThread Exit on " + moduleName, LOG_TYPE_DEBUG);
    pthread_detach(self);
    pthread_exit(0);
}
/*****************************************************************************************
* @brief stopSystemThread
*
* purpose: Send Messages to Module Process Monitors to stop Processes
*
*****************************************************************************************/
void* stopSystemThread(oam::DeviceNetworkList* Devicenetworklist)
{
assert(Devicenetworklist);
oam::DeviceNetworkList devicenetworklist = *Devicenetworklist;
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
SystemModuleTypeConfig systemmoduletypeconfig;
ALARMManager aManager;
int status = API_SUCCESS;
//bool exitThread = false;
//int exitThreadStatus = oam::API_SUCCESS;
pthread_t ThreadId;
ThreadId = pthread_self();
log.writeLog(__LINE__, "stopSystemThread launched", LOG_TYPE_DEBUG);
try
{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
stopsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
//exitThread = true;
//exitThreadStatus = oam::API_FAILURE;
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
stopsystemthreadStatus = oam::API_FAILURE;
processManager.setSystemState(oam::FAILED);
//exitThread = true;
//exitThreadStatus = oam::API_FAILURE;
}
if ( devicenetworklist.size() != 0 )
{
// stop modules from devicenetworklist
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
//launch start module threads, starting with local module
pthread_t stopmodulethread;
string moduleName = config.moduleName();
int status = pthread_create (&stopmodulethread, NULL, (void* (*)(void*)) &stopModuleThread, &moduleName);
if ( status != 0 )
log.writeLog(__LINE__, "stopModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string moduleName = (*listPT).DeviceName;
// bypass DISABLED modules
try
{
int opState;
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
//skip
continue;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
pthread_t stopmodulethread;
int status = pthread_create (&stopmodulethread, NULL, (void* (*)(void*)) &stopModuleThread, &moduleName);
if ( status != 0 )
log.writeLog(__LINE__, "stopModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
sleep(5);
}
}
else
{
// stop all modules, like on a systemStart command
//launch stop module threads, stoping with local module
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
string moduleName = (*pt).DeviceName;
// bypass DISABLED modules
try
{
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
//skip
continue;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
pthread_t stopmodulethread;
string name = moduleName;
int status = pthread_create (&stopmodulethread, NULL, (void* (*)(void*)) &stopModuleThread, &name);
if ( status != 0 )
log.writeLog(__LINE__, "stopModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
usleep(50000);
}
}
}
// check status and process accordingly
int k = 0;
for ( ; k < 1200 ; k++ )
{
string moduleName;
status = API_SUCCESS;
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
moduleName = (*pt).DeviceName;
// get module status
try
{
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if ( opState == oam::FAILED )
{
status = API_FAILURE;
break;
}
if (opState == oam::MAN_DISABLED ||
opState == oam::AUTO_DISABLED ||
opState == oam::MAN_OFFLINE)
//skip
continue;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
continue;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
continue;
}
}
if ( status == API_FAILURE )
break;
}
//get out of loop if all modules stopped successfully
if ( status == API_SUCCESS )
{
break;
}
else
{
//get out of loop if stop module failed
if ( status == API_FAILURE )
{
//set system status
log.writeLog(__LINE__, "stopSystemThread: Module failed, Set System State to FAILED: " + moduleName, LOG_TYPE_CRITICAL);
processManager.setSystemState(oam::FAILED);
break;
}
}
sleep(5);
}
if ( k == 1200 )
{
// system didn't Successfully restart
log.writeLog(__LINE__, "stopSystemThread: Modules failed to stop after 1200 tries, Set System State to FAILED", LOG_TYPE_CRITICAL);
processManager.setSystemState(oam::FAILED);
status = oam::API_FAILURE;
}
else
{
processManager.setSystemState(oam::MAN_OFFLINE);
status = oam::API_SUCCESS;
}
// exit thread
stopsystemthreadStatus = status;
log.writeLog(__LINE__, "stopSystemThread Exit", LOG_TYPE_DEBUG);
pthread_detach (ThreadId);
pthread_exit(0);
}
/*****************************************************************************************
* @brief stopModuleThread
*
* purpose: Send Messages to Module Process Monitors to stop Processes
*
*****************************************************************************************/
void* stopModuleThread(string* module)
{
    // Thread entry point: asks the module's Process-Monitor to stop the
    // module gracefully and retries for as long as the attempt ends in
    // API_MINOR_FAILURE. The thread always detaches itself and leaves
    // through pthread_exit(0).
    assert(module);

    // take a private copy of the name before doing anything else
    string moduleName = *module;

    ProcessLog log;
    Configuration config;
    ProcessManager processManager(config, log);
    Oam oam;

    const pthread_t self = pthread_self();

    if (moduleName.empty())
    {
        log.writeLog(__LINE__, "stopModuleThread received on invalid module name", LOG_TYPE_ERROR);
        pthread_detach(self);
        pthread_exit(0);
    }

    log.writeLog(__LINE__, "Stop Module " + moduleName, LOG_TYPE_DEBUG);

    bool finished = false;

    while (!finished)
    {
        bool alreadyOffline = false;

        // get module status; a failed query is ignored and the stop is attempted
        try
        {
            int moduleState = oam::ACTIVE;
            bool degraded;
            oam.getModuleStatus(moduleName, moduleState, degraded);
            alreadyOffline = (moduleState == oam::MAN_OFFLINE);
        }
        catch (...)
        {
        }

        // nothing to stop
        if (alreadyOffline)
            break;

        const int retStatus = processManager.stopModule(moduleName, oam::GRACEFUL, true);
        log.writeLog(__LINE__, "ACK received from '" + moduleName + "' Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);

        // only a minor failure is retried; success and major failures end the loop
        finished = (retStatus != API_MINOR_FAILURE);
    }

    // exit thread
    log.writeLog(__LINE__, "stopModuleThread Exit on " + moduleName, LOG_TYPE_DEBUG);
    pthread_detach(self);
    pthread_exit(0);
}
/*****************************************************************************************
* @brief checkSimplexModule
*
* purpose: Check for simplex module run-type and start mate processes if needed
*
*****************************************************************************************/
void ProcessManager::checkSimplexModule(std::string moduleName)
{
// For the module type of moduleName: when that type is configured with a
// SIMPLEX run-type, look at the other modules of the same type and, for each
// one that is ACTIVE or DEGRADED, start its SIMPLEX processes that are
// currently in COLD_STANDBY.
// Returns nothing; failures are logged or silently ignored as marked below.
//
// NOTE(review): the locals below are separate objects from this instance;
// 'log' presumably shadows the member used by other methods of this class.
// setPMProcIPs() and distributeConfigFile() are called on this instance,
// while startProcess() and setMySQLReplication() go through the local
// processManager - confirm this mix is intended.
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
SystemModuleTypeConfig systemmoduletypeconfig;
SystemProcessConfig systemprocessconfig;
log.writeLog(__LINE__, "checkSimplexModule called for " + moduleName, LOG_TYPE_DEBUG);
try
{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
return;
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
return;
}
// the module type is the leading MAX_MODULE_TYPE_SIZE characters of the module name
string moduletype = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
for ( unsigned int i = 0; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
if ( moduletype == systemmoduletypeconfig.moduletypeconfig[i].ModuleType )
{
// no modules of this type configured: nothing to do
if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleCount == 0)
return;
//check for SIMPLEX Processes on mate might need to be started
if ( systemmoduletypeconfig.moduletypeconfig[i].RunType == SIMPLEX )
{
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
// every module of this type other than moduleName is treated as a mate
if ((*pt).DeviceName != moduleName)
{
//mate module, check for module ACTIVE and SIMPLEX processes
int opState = oam::ACTIVE;
try
{
bool degraded;
oam.getModuleStatus((*pt).DeviceName, opState, degraded);
if (opState == oam::ACTIVE ||
opState == oam::DEGRADED )
{
//start COLD_STANDBY processes
try
{
oam.getProcessConfig(systemprocessconfig);
for ( unsigned int j = 0 ; j < systemprocessconfig.processconfig.size(); j++)
{
// only SIMPLEX processes configured for this module type are considered
if ( systemprocessconfig.processconfig[j].ModuleType == moduletype &&
systemprocessconfig.processconfig[j].RunType == oam::SIMPLEX )
{
int state = oam::ACTIVE;
try
{
ProcessStatus procstat;
oam.getProcessStatus(systemprocessconfig.processconfig[j].ProcessName,
(*pt).DeviceName, procstat);
state = procstat.ProcessOpState;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
// process status unavailable: move on to the next process
continue;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
continue;
}
if ( state == oam::COLD_STANDBY )
{
//process DDL/DMLProc
if ( systemprocessconfig.processconfig[j].ProcessName == "DDLProc")
{
// the mate becomes the primary UM before DDLProc is started on it
setPMProcIPs((*pt).DeviceName);
log.writeLog(__LINE__, "Set Primary UM Module = " + (*pt).DeviceName, LOG_TYPE_DEBUG);
oam.setSystemConfig("PrimaryUMModuleName", (*pt).DeviceName);
//distribute config file
distributeConfigFile("system");
sleep(2);
}
int status = processManager.startProcess((*pt).DeviceName,
systemprocessconfig.processconfig[j].ProcessName,
FORCEFUL);
if ( status == API_SUCCESS )
{
log.writeLog(__LINE__, "checkSimplexModule: mate process started: " + (*pt).DeviceName + "/" + systemprocessconfig.processconfig[j].ProcessName, LOG_TYPE_DEBUG);
// after any successful start, DMLProc is started on the mate as well
status = processManager.startProcess((*pt).DeviceName,
"DMLProc",
FORCEFUL);
if ( status == API_SUCCESS ) {
log.writeLog(__LINE__, "checkSimplexModule: mate process started: " + (*pt).DeviceName + "/DMLProc", LOG_TYPE_DEBUG);
}
else
log.writeLog(__LINE__, "checkSimplexModule: mate process failed to start: " + (*pt).DeviceName + "/DMLProc", LOG_TYPE_DEBUG);
}
else
log.writeLog(__LINE__, "checkSimplexModule: mate process failed to start: " + (*pt).DeviceName + "/" + systemprocessconfig.processconfig[j].ProcessName, LOG_TYPE_DEBUG);
//setup new MariaDB Replication Master
// runs whether or not the start above succeeded; the device list passed is empty
if ( systemprocessconfig.processconfig[j].ProcessName == "DMLProc" ) {
log.writeLog(__LINE__, "Setup MySQL Replication for COLD_STANDBY DMLProc going ACTIVE", LOG_TYPE_DEBUG);
oam::DeviceNetworkList devicenetworklist;
processManager.setMySQLReplication(devicenetworklist, (*pt).DeviceName);
}
}
else
{
// NOTE(review): when a SIMPLEX process is already ACTIVE on the mate this
// returns from the whole function, it does not just move on to the next
// process - confirm that this is intended.
if ( state == oam::ACTIVE )
return;
}
}
}
}
// these handlers also receive exceptions thrown by the calls inside the loop
// above (for example setSystemConfig), not only by getProcessConfig
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "checkSimplexModule: EXCEPTION ERROR on getProcessConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "checkSimplexModule: EXCEPTION ERROR on getProcessConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
// a failed getModuleStatus is ignored; the next mate module is examined
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + moduleName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
}
}
}
return;
}
/******************************************************************************************
* @brief updatePMSconfig
*
* purpose: Update PMS Configuration in System Configuration file
*
******************************************************************************************/
int ProcessManager::updatePMSconfig( bool check )
{
Oam oam;
int minPmPorts = 32;
vector<string> IpAddrs;
vector<int> nicIDs;
pthread_mutex_lock(&THREAD_LOCK);
ModuleTypeConfig moduletypeconfig;
oam.getSystemConfig("pm", moduletypeconfig);
Config* sysConfig = Config::makeConfig();
string pmsIPAddr = sysConfig->getConfig("PMS1", "IPAddr");
//exit out if PMS already setup
if ( pmsIPAddr != oam::UnassignedIpAddr &&
check)
{
log.writeLog(__LINE__, "updatePMSconfig: no update needed, exiting function", LOG_TYPE_DEBUG);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
//exit out if PM module count is 1 or less
if ( moduletypeconfig.ModuleCount <= 1 &&
check)
{
log.writeLog(__LINE__, "updatePMSconfig: no update needed, exiting function", LOG_TYPE_DEBUG);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
int maxPMNicID = atoi(sysConfig->getConfig("PrimitiveServers", "ConnectionsPerPrimProc").c_str()) / 2;
int pmCount = 0;
//get Perfomance module IP addresses
DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
for ( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
{
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus((*pt).DeviceName, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
pmCount++;
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
{
if ( (*pt1).IPAddr == oam::UnassignedIpAddr )
continue;
else
{
//check NIC status and don't assigned if down
try
{
int state = oam::UP;
oam.getNICStatus((*pt1).HostName, state);
if ( state == oam::UP || state == oam::INITIAL)
{
IpAddrs.push_back((*pt1).IPAddr);
nicIDs.push_back((*pt1).NicID);
}
}
catch (...)
{
IpAddrs.push_back((*pt1).IPAddr);
nicIDs.push_back((*pt1).NicID);
}
}
}
}
if ( IpAddrs.empty())
{
log.writeLog(__LINE__, "updatePMSconfig: No up NICS found, exiting function", LOG_TYPE_DEBUG);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
if ( pmCount == 0)
{
log.writeLog(__LINE__, "updatePMSconfig: No PM modules Enabled, exiting function", LOG_TYPE_DEBUG);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
if ( pmCount == 1 &&
pmsIPAddr != oam::UnassignedIpAddr &&
check )
{
log.writeLog(__LINE__, "updatePMSconfig: no update needed, exiting function", LOG_TYPE_DEBUG);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
Configuration config;
//retry 5 times loop just in case
for (int i = 0 ; i < 5; i++)
{
Config* sysConfig1 = Config::makeConfig();
//update PM count if needed
sysConfig1->setConfig("PrimitiveServers", "Count", oam.itoa(pmCount));
int pmPorts = pmCount * (maxPMNicID * 2);
if ( pmPorts < minPmPorts )
pmPorts = minPmPorts;
const string PM = "PMS";
int nicID = 1;
for ( int pmsID = 1; pmsID < pmPorts + 1 ; )
{
vector<string>::iterator pt = IpAddrs.begin();
vector<int>::iterator pt1 = nicIDs.begin();
for ( ; pt != IpAddrs.end() ; pt++, pt1++)
{
if ( *pt1 == nicID )
{
string pmsName = PM + oam.itoa(pmsID);
sysConfig1->setConfig(pmsName, "IPAddr", *pt);
pmsID++;
}
if ( pmsID > pmPorts )
break;
}
if ( pmsID > pmPorts )
break;
nicID++;
if ( nicID > maxPMNicID )
nicID = 1;
}
//update Calpont Config table
try
{
sysConfig1->write();
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
catch (...)
{
log.writeLog(__LINE__, "updatePMSconfig - ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
pthread_mutex_unlock(&THREAD_LOCK);
log.writeLog(__LINE__, "updatePMSconfig failed", LOG_TYPE_DEBUG);
return API_FAILURE;
}
/******************************************************************************************
* @brief updateWorkerNodeconfig
*
* purpose: Update WorkerNode Configuration in System Configuration file
*
******************************************************************************************/
int ProcessManager::updateWorkerNodeconfig()
{
Oam oam;
vector <string> module;
vector <string> ipadr;
pthread_mutex_lock(&THREAD_LOCK);
//setup current module as work-node #1 by entering it in first
module.push_back(config.moduleName());
// get my IP address and update entries
ModuleConfig moduleconfig;
oam.getSystemConfig(config.moduleName(), moduleconfig);
HostConfigList::iterator pt0 = moduleconfig.hostConfigList.begin();
idbassert(pt0 != moduleconfig.hostConfigList.end());
ipadr.push_back(pt0->IPAddr);
SystemModuleTypeConfig systemmoduletypeconfig;
try
{
oam.getSystemConfig(systemmoduletypeconfig);
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
if ( systemmoduletypeconfig.moduletypeconfig[i].ModuleType.empty() )
// end of list
break;
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount > 0 )
{
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
//skip current module
if ( (*pt).DeviceName == config.moduleName() )
continue;
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus((*pt).DeviceName, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
module.push_back((*pt).DeviceName);
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
ipadr.push_back((*pt1).IPAddr);
}
}
}
}
catch (...)
{
log.writeLog(__LINE__, "updateWorkerNodeconfig: getSystemNetworkConfig Failed", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
Configuration config;
for ( int i = 1 ; i < 5 ; i++ )
{
Config* sysConfig3 = Config::makeConfig();;
//update Columnstore.xml
sysConfig3->setConfig("DBRM_Controller", "NumWorkers", oam.itoa(module.size()));
std::vector<std::string>::iterator pt = module.begin();
std::vector<std::string>::iterator pt1 = ipadr.begin();
int id = 1;
for ( ; pt != module.end() ; pt++, pt1++, id++)
{
string Section = "DBRM_Worker" + oam.itoa(id);
sysConfig3->setConfig(Section, "IPAddr", *pt1);
sysConfig3->setConfig(Section, "Module", *pt);
string moduleName = *pt;
sysConfig3->setConfig(Section, "Port", "8700");
}
//clear out any leftovers
for ( ; id < MAX_MODULE ; id++ )
{
string Section = "DBRM_Worker" + oam.itoa(id);
if ( sysConfig3->getConfig(Section, "IPAddr") != oam::UnassignedIpAddr &&
!sysConfig3->getConfig(Section, "IPAddr").empty())
sysConfig3->setConfig(Section, "IPAddr", oam::UnassignedIpAddr);
if ( sysConfig3->getConfig(Section, "Module") != oam::UnassignedIpAddr &&
!sysConfig3->getConfig(Section, "Module").empty())
sysConfig3->setConfig(Section, "Module", oam::UnassignedName);
}
try
{
sysConfig3->write();
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
catch (...)
{
log.writeLog(__LINE__, "updateWorkerNodeconfig - ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
pthread_mutex_unlock(&THREAD_LOCK);
log.writeLog(__LINE__, "updateWorkerNodeconfig failed", LOG_TYPE_DEBUG);
return API_FAILURE;
}
/******************************************************************************************
* @brief clearModuleAlarms
*
* purpose: Clears all alarms related to a module
*
******************************************************************************************/
void ProcessManager::clearModuleAlarms(std::string moduleName)
{
ALARMManager aManager;
AlarmList alarmList;
aManager.getActiveAlarm (alarmList);
AlarmList::iterator i;
for (i = alarmList.begin(); i != alarmList.end(); ++i)
{
// check if the same fault component on same module
if (moduleName.compare((i->second).getComponentID()) == 0 ||
moduleName.compare((i->second).getSname()) == 0)
{
// match, go clear it
aManager.sendAlarmReport((i->second).getComponentID().c_str(),
(i->second).getAlarmID(),
CLEAR,
(i->second).getSname().c_str(),
"ProcessManager");
}
}
}
/******************************************************************************************
* @brief clearNICAlarms
*
* purpose: Clears all alarms related to a NIC hostName
*
******************************************************************************************/
void ProcessManager::clearNICAlarms(std::string hostName)
{
ALARMManager aManager;
AlarmList alarmList;
aManager.getActiveAlarm (alarmList);
AlarmList::iterator i;
for (i = alarmList.begin(); i != alarmList.end(); ++i)
{
// check if the same fault component on same module
if (hostName.compare((i->second).getComponentID()) == 0)
{
// match, go clear it
aManager.sendAlarmReport((i->second).getComponentID().c_str(),
(i->second).getAlarmID(),
CLEAR,
(i->second).getSname().c_str(),
"ProcessManager");
}
}
}
/******************************************************************************************
* @brief updateExtentMap
*
* purpose: update Extent Map section in Columnstore.xml
*
******************************************************************************************/
bool ProcessManager::updateExtentMap()
{
string fileName = std::string(MCSSYSCONFDIR) + "/columnstore/Columnstore.xml";
ifstream oldFile (fileName.c_str());
if (!oldFile) return false;
vector <string> lines;
char line[200];
string buf;
string newLine;
string start = "</Installation>";
string firstComment = "<!--";
string end = "</ExtentMap>";
string lastComment = "-->";
while (oldFile.getline(line, 200))
{
buf = line;
string::size_type pos = buf.find(start, 0);
if (pos != string::npos)
{
//output to temp file and skip next line
lines.push_back(buf);
oldFile.getline(line, 200);
buf = line;
pos = buf.find(firstComment, 0);
if (pos == string::npos)
{
return true;
}
}
else
{
pos = buf.find(end, 0);
if (pos != string::npos)
{
//output to temp file and skip next line
lines.push_back(buf);
oldFile.getline(line, 200);
buf = line;
pos = buf.find(lastComment, 0);
if (pos == string::npos)
{
return true;
}
}
else
//output to temp file
lines.push_back(buf);
}
}
oldFile.close();
unlink (fileName.c_str());
ofstream newFile (fileName.c_str());
//create new file
int fd = open(fileName.c_str(), O_RDWR | O_CREAT, 0664);
copy(lines.begin(), lines.end(), ostream_iterator<string>(newFile, "\n"));
newFile.close();
close(fd);
return true;
}
/******************************************************************************************
* @brief makeXMInittab
*
* purpose: Make inittab to auto-launch ProcMon
*
******************************************************************************************/
bool ProcessManager::makeXMInittab(std::string moduleName, std::string systemID, std::string parentOAMModuleHostName)
{
string fileName = "/var/lib/columnstore/local/etc/" + moduleName + "/inittab.calpont";
vector <string> lines;
string init1 = "1" + systemID + ":2345:respawn:ProcMon " + parentOAMModuleHostName;
lines.push_back(init1);
unlink (fileName.c_str());
ofstream newFile (fileName.c_str());
//create new file
int fd = open(fileName.c_str(), O_RDWR | O_CREAT, 0664);
copy(lines.begin(), lines.end(), ostream_iterator<string>(newFile, "\n"));
newFile.close();
close(fd);
return true;
}
/******************************************************************************************
* @brief setPMProcIPs
*
* purpose: Updates the Columnstore.xml file for DDL/DMLProc IPs during PM switchover
*
*
******************************************************************************************/
int ProcessManager::setPMProcIPs( std::string moduleName, std::string processName )
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
ModuleConfig moduleconfig;
pthread_mutex_lock(&THREAD_LOCK);
if ( processName == oam::UnassignedName || processName == "DDLProc")
{
for ( int i = 1 ; i < 5 ; i ++)
{
//get Module IP address
try
{
oam.getSystemConfig(moduleName, moduleconfig);
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
string ipAdd = (*pt1).IPAddr;
Config* sysConfig2 = Config::makeConfig();
//check if IP address if different than current value, don't update if it is
if ( sysConfig2->getConfig("DDLProc", "IPAddr") == ipAdd )
{
log.writeLog(__LINE__, "setPMProcIPs for DDLProc: no update needed", LOG_TYPE_DEBUG);
break;
}
sysConfig2->setConfig("DDLProc", "IPAddr", ipAdd);
try
{
sysConfig2->write();
pthread_mutex_unlock(&THREAD_LOCK);
log.writeLog(__LINE__, "setPMProcIPs: DDLProc to " + ipAdd, LOG_TYPE_DEBUG);
}
catch (...)
{
log.writeLog(__LINE__, "setPMProcIPs - ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "setPMProcIPs: EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "setPMProcIPs: EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
if ( processName == oam::UnassignedName || processName == "DMLProc")
{
for ( int i = 1 ; i < 5 ; i ++)
{
//get Module IP address
try
{
oam.getSystemConfig(moduleName, moduleconfig);
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
string ipAdd = (*pt1).IPAddr;
Config* sysConfig2 = Config::makeConfig();
//check if IP address if different than current value, don't update if it is
if ( sysConfig2->getConfig("DMLProc", "IPAddr") == ipAdd )
{
log.writeLog(__LINE__, "setPMProcIPs for DMLProc: no update needed, exiting function", LOG_TYPE_DEBUG);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
sysConfig2->setConfig("DMLProc", "IPAddr", ipAdd);
try
{
sysConfig2->write();
pthread_mutex_unlock(&THREAD_LOCK);
log.writeLog(__LINE__, "setPMProcIPs: DMLProc to " + ipAdd, LOG_TYPE_DEBUG);
}
catch (...)
{
log.writeLog(__LINE__, "setPMProcIPs - ERROR: sysConfig->write", LOG_TYPE_ERROR);
}
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "setPMProcIPs: EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "setPMProcIPs: EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
pthread_mutex_unlock(&THREAD_LOCK);
//log.writeLog(__LINE__, "setPMProcIPs failed", LOG_TYPE_DEBUG);
return API_SUCCESS;
}
/******************************************************************************************
* @brief distributeConfigFile
*
* purpose: Distribute Calpont Config File to system modules
*
******************************************************************************************/
int ProcessManager::distributeConfigFile(std::string name, std::string file)
{
ByteStream msg;
ByteStream::byte requestID = UPDATECONFIGFILE;
Oam oam;
int returnStatus = oam::API_SUCCESS;
string dirName = std::string(MCSSYSCONFDIR) + "/columnstore/";
string fileName = dirName + file;
ifstream in (fileName.c_str());
if (!in)
{
log.writeLog(__LINE__, "distributeConfigFile failed, file doesn't exist: " + fileName, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
//skip any file of size 0
in.seekg(0, std::ios::end);
int size = in.tellg();
if ( size == 0 )
{
log.writeLog(__LINE__, "distributeConfigFile failed, file doesn't exist: " + fileName, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
// distribute using hdfs call, make sure host names are in /etc/pdsh/machines
ifstream in1 ("/etc/pdsh/machines");
if (in1)
{
if ( HDFS )
{
if ( name == "system" )
{
string cmd = "pdcp -a -x " + localHostName + " " + fileName + " " + dirName;
int rtnCode = system(cmd.c_str());
if (WEXITSTATUS(rtnCode) == 0)
{
log.writeLog(__LINE__, "distributeConfigFile using pdcp successful on " + fileName, LOG_TYPE_DEBUG);
return returnStatus;
}
else
{
log.writeLog(__LINE__, "distributeConfigFile using pdcp failed on " + fileName, LOG_TYPE_ERROR);
}
}
else
{
// get module hostname
ModuleConfig moduleconfig;
oam.getSystemConfig(name, moduleconfig);
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
string hostName = (*pt1).HostName;
string cmd = "pdcp -w " + hostName + " " + fileName + " " + dirName;
int rtnCode = system(cmd.c_str());
if (WEXITSTATUS(rtnCode) == 0)
{
log.writeLog(__LINE__, "distributeConfigFile using pdcp successful on " + fileName, LOG_TYPE_DEBUG);
return returnStatus;
}
else
{
log.writeLog(__LINE__, "distributeConfigFile using pdcp failed on " + fileName, LOG_TYPE_ERROR);
}
}
}
}
//send via tcp messaging
msg << requestID;
msg << fileName;
in.seekg(0, std::ios::beg);
in >> msg;
SystemModuleTypeConfig systemmoduletypeconfig;
try
{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
if ( name == "system" )
{
// send config file to all modules
for ( unsigned int i = 0 ; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
//skip local module
if ( (*pt).DeviceName == config.moduleName() )
continue;
//skip if AOS
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus((*pt).DeviceName, opState, degraded);
}
catch (...)
{}
if (opState == oam::AUTO_DISABLED)
continue;
returnStatus = sendMsgProcMon( (*pt).DeviceName, msg, requestID, 0 );
if ( returnStatus == API_SUCCESS)
{
//log the success event
log.writeLog(__LINE__, (*pt).DeviceName + " distributeConfigFile success.", LOG_TYPE_DEBUG);
}
else
{
//log the error event
log.writeLog(__LINE__, (*pt).DeviceName + " distributeConfigFile failed!!", LOG_TYPE_WARNING);
}
}
}
}
else
{
returnStatus = sendMsgProcMon( name, msg, requestID, 0 );
if ( returnStatus == API_SUCCESS)
{
//log the success event
log.writeLog(__LINE__, name + " distributeConfigFile success.", LOG_TYPE_DEBUG);
}
else
{
//log the error event
log.writeLog(__LINE__, name + " distributeConfigFile failed!!", LOG_TYPE_WARNING);
}
}
return returnStatus;
}
/******************************************************************************************
* @brief getDBRMData
*
* purpose: get DBRM Data and send to requester
*
******************************************************************************************/
// Sends the DBRM data files to the requester connected on fIos.
//
// Message sequence written to the socket:
//   - "initial" alone when no <DBRMRoot>_current file exists or no non-empty
//     snapshot files are found;
//   - otherwise "files", then the file count, then for each file its name
//     followed by its contents, then the leading message once more.
//
// fIos:       socket of the requester.
// moduleName: not used by this function.
//
// Returns API_SUCCESS when every message was written; API_FAILURE_DB_ERROR
// when the current file exists but the OID bitmap file cannot be opened;
// API_FAILURE on any open, read or socket write failure.
int ProcessManager::getDBRMData(messageqcpp::IOSocket fIos, std::string moduleName)
{
    // Holds THREAD_LOCK for the rest of the call and releases it on every exit
    // path, including exceptions thrown by the calls below.
    struct ThreadLockGuard
    {
        ThreadLockGuard()
        {
            pthread_mutex_lock(&THREAD_LOCK);
        }
        ~ThreadLockGuard()
        {
            pthread_mutex_unlock(&THREAD_LOCK);
        }
    };

    ByteStream msg;
    Oam oam;

    ThreadLockGuard lockGuard;

    messageqcpp::IOSocket cfIos = fIos;

    // Writes one message to the requester. Any exception is logged against the
    // caller's line number and reported as false, so standard and unknown
    // exceptions are treated the same way.
    auto sendMsg = [&](ByteStream& bs, int line) -> bool
    {
        try
        {
            cfIos.write(bs);
            return true;
        }
        catch (exception& ex)
        {
            string error = ex.what();
            log.writeLog(line, "EXCEPTION ERROR on cfIos.write: " + error, LOG_TYPE_ERROR);
        }
        catch (...)
        {
            log.writeLog(line, "EXCEPTION ERROR on cfIos.write: Unknown exception", LOG_TYPE_ERROR);
        }

        return false;
    };

    string DBRMroot;
    oam.getSystemConfig("DBRMRoot", DBRMroot);

    string currentFileName = DBRMroot + "_current";
    string journalFileName = DBRMroot + "_journal";

    string oidFile;
    oam.getSystemConfig("OIDBitmapFile", oidFile);

    // StorageManager: Need to make these existence checks use an idbfilesystem op if we
    // decide to put the BRM-managed files in cloud storage
    IDBFileSystem &fs = IDBPolicy::getFs(currentFileName.c_str());

    if (!fs.exists(currentFileName.c_str()))
    {
        log.writeLog(__LINE__, "getDBRMData: no DBRM current file found, must be initial install", LOG_TYPE_DEBUG);
        msg << "initial";
        return sendMsg(msg, __LINE__) ? oam::API_SUCCESS : oam::API_FAILURE;
    }

    {
        // current file found, check for OIDBitmapFile
        boost::scoped_ptr<IDBDataFile> mapFile(IDBDataFile::open(IDBPolicy::getType(oidFile.c_str(),
                                               IDBPolicy::WRITEENG),
                                               oidFile.c_str(), "r", 0));

        if (!mapFile)
        {
            // no OIDBitmapFile, with current file, dbrm files are hosed
            log.writeLog(__LINE__, "getDBRMData: DBRM data files error, current file exist without OIDBitmapFile", LOG_TYPE_CRITICAL);
            return oam::API_FAILURE_DB_ERROR;
        }
    }

    string currentDbrmFile;

    {
        boost::scoped_ptr<IDBDataFile> oldFile(IDBDataFile::open(IDBPolicy::getType(currentFileName.c_str(),
                                               IDBPolicy::WRITEENG),
                                               currentFileName.c_str(), "r", 0));

        if (!oldFile)
        {
            log.writeLog(__LINE__, "getDBRMData: unable to open " + currentFileName, LOG_TYPE_ERROR);
            return oam::API_FAILURE;
        }

        char line[200];
        memset(line, 0, 200);
        int err = oldFile->read(line, 200);

        // XXXPAT.  HACK!  This is brittle, need to fix later.  Need to eat a \n char.  Need to move forward now.
        if (err > 0)
            line[err - 1] = 0;

        // MCOL-1558.  Handle absolute and relative paths.
        if (line[0] == '/')
            currentDbrmFile = line;
        else
            currentDbrmFile = DBRMroot.substr(0, DBRMroot.find_last_of('/') + 1) + line;
    }

    // collect every non-empty file named <currentDbrmFile>_* in its directory
    boost::filesystem::path pCurrentDbrmFile(currentDbrmFile + "_");
    boost::filesystem::path dbrmDir(pCurrentDbrmFile.parent_path());
    list<string> fileListing;
    vector<string> dbrmFiles;

    fs.listDirectory(dbrmDir.string().c_str(), fileListing);

    for (const auto &file : fileListing)
    {
        if (file.find(pCurrentDbrmFile.filename().string()) == 0 &&
                fs.size((dbrmDir / file).string().c_str()) != 0)
        {
            log.writeLog(__LINE__, "adding " + (dbrmDir / file).string() + " to dbrmFiles", LOG_TYPE_DEBUG);
            dbrmFiles.push_back((dbrmDir / file).string());
        }
    }

    fileListing.clear();

    if ( dbrmFiles.size() < 1 )
    {
        log.writeLog(__LINE__, "getDBRMData: dbrmFiles size = 0, must be initial install", LOG_TYPE_DEBUG);
        msg << "initial";
        return sendMsg(msg, __LINE__) ? oam::API_SUCCESS : oam::API_FAILURE;
    }

    // put oid file and current file in list
    dbrmFiles.push_back(currentFileName);

    if (fs.exists(journalFileName.c_str()) && fs.size(journalFileName.c_str()) > 0)
        dbrmFiles.push_back(journalFileName);

    if (fs.exists(oidFile.c_str()) && fs.size(oidFile.c_str()) > 0)
        dbrmFiles.push_back(oidFile);

    //type
    msg << "files";

    if (!sendMsg(msg, __LINE__))
        return oam::API_FAILURE;

    // number of files
    // NOTE(review): the count travels as a single ByteStream::byte, presumably
    // 8 bits, so a larger file count would wrap - confirm against the receiver.
    ByteStream fcmsg;
    fcmsg << (ByteStream::byte) dbrmFiles.size();

    if (!sendMsg(fcmsg, __LINE__))
        return oam::API_FAILURE;

    for (std::vector<std::string>::iterator pt1 = dbrmFiles.begin(); pt1 != dbrmFiles.end() ; pt1++)
    {
        ByteStream fnmsg, fdmsg;
        string fileName = *pt1;

        //Goal of the stuff below is to load a file's data into fdmsg
        //and it's filename into fnmsg.
        boost::scoped_ptr<IDBDataFile> in(IDBDataFile::open(
                                              IDBPolicy::getType(fileName.c_str(),
                                                      IDBPolicy::WRITEENG),
                                              fileName.c_str(), "r", 0));

        if (!in)
        {
            log.writeLog(__LINE__, "getDBRMData(): failed opening " + fileName, LOG_TYPE_ERROR);
            return oam::API_FAILURE;
        }

        ssize_t size = in->size();

        if (size < 0)
        {
            log.writeLog(__LINE__, "getDBRMData(): failed getting the size of " + fileName, LOG_TYPE_ERROR);
            return oam::API_FAILURE;
        }

        fdmsg.needAtLeast(size);
        uint8_t* buf = fdmsg.getInputPtr();
        ssize_t progress = 0;
        ssize_t err;
        char errbuf[80];

        // read() may return fewer bytes than requested, so loop until the
        // whole file is in the buffer
        while (progress < size)
        {
            err = in->read(&buf[progress], size - progress);

            if (err < 0)
            {
                int saved_errno = errno;
                log.writeLog(__LINE__, "getDBRMData(): failed reading " + fileName + ", got " +
                             strerror_r(saved_errno, errbuf, 80), LOG_TYPE_ERROR);
                return oam::API_FAILURE;
            }
            else if (err == 0)
            {
                log.writeLog(__LINE__, "getDBRMData(): failed reading " + fileName + ", got early EOF", LOG_TYPE_ERROR);
                return oam::API_FAILURE;
            }

            progress += err;
        }

        fdmsg.advanceInputPtr(size);

        log.writeLog(__LINE__, fileName, LOG_TYPE_DEBUG);
        fnmsg << fileName;

        if (!sendMsg(fnmsg, __LINE__))
            return oam::API_FAILURE;

        log.writeLog(__LINE__, "Sending " + to_string(fdmsg.length()) + " bytes.", LOG_TYPE_DEBUG);

        if (!sendMsg(fdmsg, __LINE__))
            return oam::API_FAILURE;
    }

    // NOTE(review): msg is not modified by this function after the "files"
    // marker was appended, so the same message is written again here -
    // presumably an end-of-transfer marker; confirm against the receiver.
    return sendMsg(msg, __LINE__) ? oam::API_SUCCESS : oam::API_FAILURE;
}
/******************************************************************************************
* @brief switchParentOAMModule
*
* purpose: Switch OAM Parent Module
*
******************************************************************************************/
int ProcessManager::switchParentOAMModule(std::string newActiveModuleName)
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
int returnStatus = oam::API_SUCCESS;
ALARMManager aManager;
log.writeLog(__LINE__, "switchParentOAMModule Function Started", LOG_TYPE_DEBUG);
//storage config
string DBRootStorageType;
try
{
oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
}
catch (...) {}
log.writeLog(__LINE__, "switchParentOAMModule: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
if ( DBRootStorageType == "internal" && DataRedundancyConfig == "n")
{
log.writeLog(__LINE__, "ERROR: DBRootStorageType = internal", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_INVALID_PARAMETER;
}
// set alarm
aManager.sendAlarmReport(newActiveModuleName.c_str(), MODULE_SWITCH_ACTIVE, SET);
//clear run standby flag;
runStandby = false;
int retryCount = 0;
//sleep, give time for message thread to startup
while (!MsgThreadActive && retryCount < 10)
{
log.writeLog(__LINE__, "Waiting for Message Thread...", LOG_TYPE_DEBUG);
sleep(5);
++retryCount;
}
int moduleID = atoi(newActiveModuleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
// update Columnstore.xml entries
string newActiveIPaddr;
try
{
pthread_mutex_lock(&THREAD_LOCK);
//move a newparent dbroot to old parent for balancing
DBRootConfigList residedbrootConfigList;
bool doDBRootMove = true;
try
{
oam.getPmDbrootConfig(moduleID, residedbrootConfigList);
if ( residedbrootConfigList.size() > 0 )
{
DBRootConfigList::iterator pt = residedbrootConfigList.begin();
if (*pt != 1)
{
try
{
oam.manualMovePmDbroot(newActiveModuleName, oam.itoa(*pt), config.OAMParentName());
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: manualMovePmDbroot Failed", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
}
else
{
doDBRootMove = false;
}
}
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: getPmDbrootConfig Failed", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
//move dbroot #1 to new parent
if (doDBRootMove)
{
try
{
oam.manualMovePmDbroot(config.OAMParentName(), "1", newActiveModuleName);
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: manualMovePmDbroot Failed", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
}
Config* sysConfig4 = Config::makeConfig();
// get new Active address
ModuleConfig moduleconfig;
oam.getSystemConfig(newActiveModuleName, moduleconfig);
HostConfigList::iterator pt2 = moduleconfig.hostConfigList.begin();
newActiveIPaddr = (*pt2).IPAddr;
sysConfig4->setConfig("ProcMgr", "IPAddr", newActiveIPaddr);
sysConfig4->setConfig("ProcMgr_Alarm", "IPAddr", newActiveIPaddr);
sysConfig4->setConfig("ProcStatusControl", "IPAddr", newActiveIPaddr);
sysConfig4->setConfig("DBRM_Controller", "IPAddr", newActiveIPaddr);
// update Parent OAM Module name to current module name
sysConfig4->setConfig("SystemConfig", "ParentOAMModuleName", newActiveModuleName);
// clear Standby OAM Module
sysConfig4->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
sysConfig4->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
//update Calpont Config table
try
{
sysConfig4->write();
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
pthread_mutex_unlock(&THREAD_LOCK);
if ( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM )
{
//set DDL/DMLproc IPs to new module
setPMProcIPs(newActiveModuleName);
//set Primary UM to new module
try
{
oam.setSystemConfig("PrimaryUMModuleName", newActiveModuleName);
}
catch (...) {}
}
log.writeLog(__LINE__, "Columnstore.xml entries update to local IP address of " + newActiveIPaddr, LOG_TYPE_DEBUG);
//distribute config file
processManager.distributeConfigFile("system");
sleep(1);
//change master MySQL Replication setup
log.writeLog(__LINE__, "Setup MySQL Replication for new Parent Module during switch-over", LOG_TYPE_DEBUG);
oam::DeviceNetworkList devicenetworklist;
processManager.setMySQLReplication(devicenetworklist, newActiveModuleName, false, oam::UnassignedName);
}
catch (exception& ex)
{
pthread_mutex_unlock(&THREAD_LOCK);
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
return API_FAILURE;
}
catch (...)
{
pthread_mutex_unlock(&THREAD_LOCK);
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
return API_FAILURE;
}
//send message to local Process Monitor for OAM Cold Activation
ByteStream msg1;
ByteStream::byte requestID = OAMPARENTCOLD;
msg1 << requestID;
while (true)
{
int returnStatus = sendMsgProcMon( config.moduleName(), msg1, requestID );
log.writeLog(__LINE__, "sent OAM Parent Cold message to local Process-Monitor, status: " + oam.itoa(returnStatus), LOG_TYPE_DEBUG);
if ( returnStatus == oam::API_SUCCESS)
break;
}
//send message to new Active Process Monitor for OAM Parent Activation
ByteStream msg;
requestID = OAMPARENTACTIVE;
msg << requestID;
while (true)
{
int returnStatus = sendMsgProcMon( newActiveModuleName, msg, requestID );
log.writeLog(__LINE__, "sent OAM Parent Activate message to New Active Process-Monitor, status: " + oam.itoa(returnStatus), LOG_TYPE_DEBUG);
if ( returnStatus == oam::API_SUCCESS)
break;
}
// start processmanager on new active node
startProcess(newActiveModuleName, "ProcessManager", oam::FORCEFUL);
// clear alarm
aManager.sendAlarmReport(newActiveModuleName.c_str(), MODULE_SWITCH_ACTIVE, CLEAR);
//DOING THIS JUST TO UPDATE THE TIMESTAMP OF THE CALPONT.XML FILE AS A WORK-AROUND FIX
//BECAUSE PROCMON ISN'T READING UPDATES FROM DISK ON HDFS SYSTEMS
if (HDFS)
{
sleep(60);
Config* sysConfig = Config::makeConfig();
try
{
sysConfig->write();
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
}
return returnStatus;
}
/******************************************************************************************
* @brief OAMParentModuleChange
*
* purpose: OAM Parent Module Change-over
* The module will take over running as the OAM Parent module
* after a detected outage
*
*
******************************************************************************************/
int ProcessManager::OAMParentModuleChange()
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
//
//monitor OAM Parent module for outage
//
log.writeLog(__LINE__, "OAMParentModuleChange Function Started", LOG_TYPE_DEBUG);
// Get Module Info
SystemModuleTypeConfig systemModuleTypeConfig;
try
{
oam.getSystemConfig(systemModuleTypeConfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
string downOAMParentIPAddress;
string downOAMParentHostname;
string downOAMParentName = config.OAMParentName();
//Build module list
vector<string> moduleNameList;
vector<string> moduleIPAddrList;
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0 )
// skip of no modules configured
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
//get parent module IP address
if ( (*pt).DeviceName == downOAMParentName )
{
downOAMParentIPAddress = (*pt1).IPAddr;
downOAMParentHostname = (*pt1).HostName;
continue;
}
//store the other modules
if ( (*pt).DeviceName != config.moduleName() )
{
moduleNameList.push_back((*pt).DeviceName);
moduleIPAddrList.push_back((*pt1).IPAddr);
}
}
}
string HA_IPAddr;
if ( moduleIPAddrList.empty() )
{
//get HA IP Address
Config* sysConfig = Config::makeConfig();
HA_IPAddr = sysConfig->getConfig("ProcMgr_HA", "IPAddr");
log.writeLog(__LINE__, "Get HA_IPAddr = " + HA_IPAddr, LOG_TYPE_DEBUG);
if ( !HA_IPAddr.empty() )
{
moduleNameList.push_back("HA_device");
moduleIPAddrList.push_back(HA_IPAddr);
}
}
int ModuleHeartbeatCount;
try
{
oam.getSystemConfig("ModuleHeartbeatCount", ModuleHeartbeatCount);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
string cmdLine = "ping ";
string cmdOption = " -c 1 -w 5 >> /dev/null";
string cmd;
int pingFailure = 0;
bool failover = false;
bool recoveryTest = false;
int disableCount = 0;
int noAckCount = 0;
bool amazonParentRestart = false;
while (!failover)
{
// check if a signal was received to start failover
if (startFailOver)
{
//send notification going from standby to active
oam.sendDeviceNotification(config.moduleName(), START_STANDBY_TO_MASTER);
break;
}
// perform ping test of Active Parent Module
string cmd = cmdLine + downOAMParentIPAddress + cmdOption;
int rtnCode = system(cmd.c_str());
switch (WEXITSTATUS(rtnCode))
{
case 0:
{
//Ack ping
pingFailure = 0;
if ( noAckCount != 0 )
oam.sendDeviceNotification(config.moduleName(), MODULE_UP);
noAckCount = 0;
//if Amazon Parent PM is restarting, monitor when back active and take needed actions
if (amazonParentRestart)
{
log.writeLog(__LINE__, "Amazon Parent pinging, waiting until it's active", LOG_TYPE_DEBUG);
sleep(60);
while (true)
{
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus);
}
catch (...)
{}
if (systemstatus.SystemOpState == ACTIVE)
{
log.writeLog(__LINE__, "System Active, restart needed processes", LOG_TYPE_DEBUG);
processManager.restartProcessType("mysqld");
processManager.restartProcessType("ExeMgr");
processManager.restartProcessType("WriteEngineServer");
processManager.reinitProcessType("DBRMWorkerNode");
sleep(1);
processManager.restartProcessType("DDLProc");
sleep(1);
processManager.restartProcessType("DMLProc");
amazonParentRestart = false;
break;
}
sleep(5);
}
}
sleep(1);
break;
}
default:
{
//failed to respond to ping
pingFailure++;
log.writeLog(__LINE__, "OAMParentModule ping failure (" + downOAMParentName + ")", LOG_TYPE_WARNING);
if ( pingFailure >= ModuleHeartbeatCount )
{
bool ack = false;
bool noack = false;
//check NIC #1 status
int sockfd;
struct ifreq ifr;
sockfd = socket(AF_INET, SOCK_DGRAM, 0);
if (sockfd == -1)
{
log.writeLog(__LINE__, "Could not get socket to check NIC #1", LOG_TYPE_ERROR);
close(sockfd);
break;
}
/* get interface name */
strncpy(ifr.ifr_name, iface_name.c_str(), IFNAMSIZ);
/* Read interface flags */
if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0)
{
// not supported
close(sockfd);
break;
}
if (ifr.ifr_flags & IFF_UP)
{
log.writeLog(__LINE__, "Local Interface is UP", LOG_TYPE_INFO);
// any additional devices/modules to test
if ( !moduleNameList.empty())
{
// Active Parent not talking, check other modules or HA IP address
for ( int count = 0 ; count <= ModuleHeartbeatCount ; count++ )
{
vector<string>::iterator pt1 = moduleNameList.begin();
vector<string>::iterator pt2 = moduleIPAddrList.begin();
for ( ; pt1 != moduleNameList.end() ; pt1++, pt2++)
{
string cmd = cmdLine + *pt2 + cmdOption;
int rtnCode = system(cmd.c_str());
switch (WEXITSTATUS(rtnCode))
{
case 0:
{
//Ack ping
log.writeLog(__LINE__, *pt1 + " ping successful", LOG_TYPE_DEBUG);
ack = true;
break;
}
default:
{
// ping failure
log.writeLog(__LINE__, *pt1 + " ping failure", LOG_TYPE_WARNING);
noack = true;
//save module name
if ( *pt1 != "HA_device" )
downModuleList.push_back(*pt1);
break;
}
}
// exit loop if ping was successfuly
if ( ack )
break;
sleep (2);
}
// exit loop if ping was successfuly
if ( ack )
break;
}
}
else
{
// NIC #1 up, procede with failover
failover = true;
}
}
else
{
log.writeLog(__LINE__, "NIC #1 is DOWN", LOG_TYPE_WARNING);
// NIC #1 down, dont switch
noack = true;
if ( noAckCount == 0 )
oam.sendDeviceNotification(config.moduleName(), MODULE_DOWN);
noAckCount++;
}
close(sockfd);
//check if all modules are not responding to ping
if ( !ack && noack )
{
// yes, go into hold state by setting local module to cold-state
ByteStream msg;
ByteStream::byte requestID = OAMPARENTCOLD;
msg << requestID;
int returnStatus = processManager.sendMsgProcMon( config.moduleName(), msg, requestID );
log.writeLog(__LINE__, "sent OAM Parent Cold message to local Process-Monitor, status: " + oam.itoa(returnStatus), LOG_TYPE_DEBUG);
}
else
{
if ( ack && !noack )
{
// all other modules ACK, only parent failed, procede with failover
failover = true;
break;
}
else
{
if ( ack && noack && !recoveryTest)
{
// some other modules ACK, some didn't
// try 1 more time and mark sure didn't catch in the middle of a LAN recovery
recoveryTest = true;
}
else
{
if ( ack && noack && recoveryTest)
{
// some other modules ACK, some didn't, partial outage, do failover
failover = true;
break;
}
}
}
}
}
}
}
if ( !failover )
{
sleep(5);
downModuleList.clear();
}
else
{
// PARENT PM OUTAGE DETECTED
// check if disable flag is set, if so call the notification API
string activePmFailoverDisabled;
try
{
oam.getSystemConfig("ActivePmFailoverDisabled", activePmFailoverDisabled);
if ( activePmFailoverDisabled == "y" )
{
log.writeLog(__LINE__, "ActivePmFailoverDisabled is set, send notication", LOG_TYPE_DEBUG);
oam.sendDeviceNotification(downOAMParentName, PM_MASTER_FAILED_DISABLED);
failover = false;
sleep(5);
disableCount++;
if ( disableCount > 4 )
{
//no manually failover has been called, go ahead and do auto-failover
//send notification going from standby to active
log.writeLog(__LINE__, "ActivePmFailoverDisabled is set, but no manual action has been taken. Do Auto-Failover", LOG_TYPE_DEBUG);
oam.sendDeviceNotification(config.moduleName(), START_STANDBY_TO_MASTER);
}
}
else
{
//send notification going from standby to active
oam.sendDeviceNotification(config.moduleName(), START_STANDBY_TO_MASTER);
}
}
catch (exception& ex)
{}
//do amazon failover
if (amazon && AmazonPMFailover == "n")
{
log.writeLog(__LINE__, " ", LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "*** OAMParentModule outage, AmazonPMFailover not set, waiting for instance to restart ***", LOG_TYPE_DEBUG);
string currentIPAddr = oam.getEC2InstanceIpAddress(downOAMParentHostname);
if (currentIPAddr == "stopped")
{
// start instance
int retryCount = 6; // 1 minutes
if ( PMInstanceType == "m2.4xlarge" )
retryCount = 15; // 2.5 minutes
log.writeLog(__LINE__, "Instance in stopped state, try starting it: " + downOAMParentHostname, LOG_TYPE_DEBUG);
int retry = 0;
for ( ; retry < retryCount ; retry++ )
{
if ( oam.startEC2Instance(downOAMParentHostname) )
{
log.writeLog(__LINE__, "Instance started, sleep for 30 seconds to allow it to fully come up: " + downOAMParentHostname, LOG_TYPE_DEBUG);
//delay then get new IP Address
sleep(30);
string currentIPAddr = oam.getEC2InstanceIpAddress(downOAMParentHostname);
if (currentIPAddr == "stopped" || currentIPAddr == "terminated")
{
log.writeLog(__LINE__, "Instance failed to start (no ip-address), retry: " + downOAMParentHostname, LOG_TYPE_DEBUG);
}
else
{
// update the Columnstore.xml with the new IP Address
string cmd = "sed -i s/" + downOAMParentIPAddress + "/" + currentIPAddr + "/g " + MCSSYSCONFDIR + "/columnstore/Columnstore.xml";
system(cmd.c_str());
// get parent hotsname and IP address in case it changed
downOAMParentIPAddress = currentIPAddr;
amazonParentRestart = true;
break;
}
}
else
{
log.writeLog(__LINE__, "Instance failed to start, retry: " + downOAMParentHostname, LOG_TYPE_DEBUG);
sleep(5);
}
}
if ( retry >= retryCount )
{
log.writeLog(__LINE__, "Instance failed to start, restart a new instance: " + downOAMParentHostname, LOG_TYPE_DEBUG);
currentIPAddr = "terminated";
}
}
if ( currentIPAddr != "terminated")
{
log.writeLog(__LINE__, "Instance rebooting, monitor", LOG_TYPE_DEBUG);
//clear and go monitor again
failover = false;
amazonParentRestart = true;
}
else
log.writeLog(__LINE__, "Instance terminated, do standby-active failover", LOG_TYPE_DEBUG);
}
//storage config
string DBRootStorageType;
try
{
oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
}
catch (...) {}
log.writeLog(__LINE__, "OAMParentModuleChange: DBRootStorageType = " + DBRootStorageType, LOG_TYPE_DEBUG);
if ( DBRootStorageType == "internal" && failover && DataRedundancyConfig == "n")
{
log.writeLog(__LINE__, "DBRoot Storage configured for internal, don't do standby-active failover", LOG_TYPE_DEBUG);
//clear and go monitor again
failover = false;
}
}
}
log.writeLog(__LINE__, " ", LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "*** OAMParentModule outage, OAM Parent Module change-over started ***", LOG_TYPE_DEBUG);
gdownActiveOAMModule = downOAMParentName;
// update Columnstore.xml entries
string localIPaddr;
string newStandbyModule = downOAMParentName;
string standbyIPaddr = downOAMParentIPAddress;
try
{
pthread_mutex_lock(&THREAD_LOCK);
Config* sysConfig4 = Config::makeConfig();
// get my IP address
ModuleConfig moduleconfig;
oam.getSystemConfig(config.moduleName(), moduleconfig);
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
localIPaddr = (*pt1).IPAddr;
sysConfig4->setConfig("ProcMgr", "IPAddr", localIPaddr);
sysConfig4->setConfig("ProcMgr_Alarm", "IPAddr", localIPaddr);
sysConfig4->setConfig("ProcStatusControl", "IPAddr", localIPaddr);
sysConfig4->setConfig("DBRM_Controller", "IPAddr", localIPaddr);
// update Parent OAM Module name to current module name
sysConfig4->setConfig("SystemConfig", "ParentOAMModuleName", config.moduleName());
// clear Standby OAM Module
sysConfig4->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
sysConfig4->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);
//update Calpont Config table
try
{
sysConfig4->write();
}
catch (...)
{
log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_FAILURE;
}
pthread_mutex_unlock(&THREAD_LOCK);
//clear run standby flag;
runStandby = false;
int retryCount = 0;
//sleep, give time for message thread to startup
while (!MsgThreadActive && retryCount < 10)
{
log.writeLog(__LINE__, "Waiting for Message Thread...", LOG_TYPE_DEBUG);
sleep(5);
++retryCount;
}
//run save.brm script
//Nope turns out this has to be done first...
processManager.saveBRM(false);
try
{
oam.autoMovePmDbroot(downOAMParentName);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
}
//distribute config file
distributeConfigFile("system");
//re-read config info again
Configuration config;
oam.setHotStandbyPM(standbyIPaddr);
log.writeLog(__LINE__, "Columnstore.xml Standby OAM updated : " + newStandbyModule + ":" + standbyIPaddr, LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "Columnstore.xml entries update to local IP address of " + localIPaddr, LOG_TYPE_DEBUG);
}
catch (exception& ex)
{
pthread_mutex_unlock(&THREAD_LOCK);
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
return API_FAILURE;
}
catch (...)
{
pthread_mutex_unlock(&THREAD_LOCK);
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
return API_FAILURE;
}
if ( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM )
{
//set DDL/DMLproc IPs to local module
setPMProcIPs(config.moduleName());
try
{
oam.setSystemConfig("PrimaryUMModuleName", config.moduleName());
}
catch (...) {}
}
//send message to local Process Monitor for OAM Parent Activation
ByteStream msg;
ByteStream::byte requestID = OAMPARENTACTIVE;
msg << requestID;
while (true)
{
int returnStatus = sendMsgProcMon( config.moduleName(), msg, requestID );
log.writeLog(__LINE__, "sent OAM Parent Activate message to local Process-Monitor, status: " + oam.itoa(returnStatus), LOG_TYPE_DEBUG);
if ( returnStatus == oam::API_SUCCESS)
break;
}
//set Process Manager state, will make sure process-monitor status control is working
while (true)
{
try
{
ProcessStatus procstat;
oam.getProcessStatus("ProcessManager", config.moduleName(), procstat);
int ret = setProcessState(config.moduleName(), "ProcessManager", oam::ACTIVE, 0);
if ( ret == oam::API_SUCCESS )
{
oam.getProcessStatus("ProcessManager", config.moduleName(), procstat);
if ( procstat.ProcessOpState == oam::ACTIVE )
break;
}
}
catch (...)
{}
sleep(1);
}
// set alarm
ALARMManager aManager;
aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, SET);
//set down Active module to disable state
disableModule(downOAMParentName, false);
//do it here to get current processes active faster to process queries faster
processManager.setProcessStates(downOAMParentName, oam::AUTO_OFFLINE);
//set OTHER down modules to disable state
vector<string>::iterator pt1 = downModuleList.begin();
for ( ; pt1 != downModuleList.end() ; pt1++)
{
// Don't do this again for downOAMParentName we just did it 3 lines ago
if (*pt1 != downOAMParentName)
{
disableModule(*pt1, false);
processManager.setProcessStates(*pt1, oam::AUTO_OFFLINE);
}
}
//distribute config file
distributeConfigFile("system");
//restart local module WHY??
processManager.stopModule(config.moduleName(), oam::FORCEFUL, true);
string localModule = config.moduleName();
pthread_t startmodulethread;
int status = pthread_create (&startmodulethread, NULL, (void* (*)(void*)) &startModuleThread, &localModule);
if ( status != 0 )
log.writeLog(__LINE__, "startModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
if (status == 0)
{
pthread_join(startmodulethread, NULL);
status = startsystemthreadStatus;
}
reinitProcessType("cpimport");
// waiting until dml are ACTIVE
int retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
//restart/reinit processes to force their release of the controller node port
if ( ( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM) &&
( moduleNameList.size() <= 0 && config.moduleType() == "pm") )
{
// Do Nothing
}
else
{
//send message to start new Standby Process-Manager, if needed
newStandbyModule = getStandbyModule();
if ( !newStandbyModule.empty() && newStandbyModule != downOAMParentName
&& newStandbyModule != "NONE")
{
// get standby IP address and update entries
setStandbyModule(newStandbyModule);
}
//send message to each child process to start any COLD_STANDBY processes
SystemModuleTypeConfig systemmoduletypeconfig;
try
{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
for ( unsigned int i = 0; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++)
{
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus((*pt).DeviceName, opState, degraded);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getModuleStatus on module " + (*pt).DeviceName + ": Caught unknown exception!", LOG_TYPE_ERROR);
}
if (opState != oam::MAN_DISABLED)
{
if (opState != oam::AUTO_DISABLED)
{
if ((*pt).DeviceName != downOAMParentName )
{
if ((*pt).DeviceName != config.moduleName() )
{
// processManager.setModuleState((*pt).DeviceName, oam::AUTO_INIT);
pthread_t startmodulethread;
string moduleName = (*pt).DeviceName;
int status = pthread_create (&startmodulethread, NULL, (void* (*)(void*)) &startModuleThread, &moduleName);
if ( status != 0 )
log.writeLog(__LINE__, "startModuleThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
sleep(1);
}
}
}
}
}
}
}
if ( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM )
{
//change master MySQL Replication setup
log.writeLog(__LINE__, "Setup this node as MySQL Replication Master", LOG_TYPE_DEBUG);
oam::DeviceNetworkList devicenetworklist;
processManager.setMySQLReplication(devicenetworklist, config.moduleName());
}
processManager.restartProcessType("DBRMControllerNode");
processManager.reinitProcesses();
// waiting until dml are ACTIVE
retry = 0;
while (retry < 30)
{
ProcessStatus DMLprocessstatus;
try
{
oam.getProcessStatus("DMLProc", config.moduleName(), DMLprocessstatus);
}
catch (exception& ex)
{}
catch (...)
{}
if (DMLprocessstatus.ProcessOpState == oam::BUSY_INIT)
log.writeLog(__LINE__, "Waiting for DMLProc to finish rollback", LOG_TYPE_DEBUG);
if (DMLprocessstatus.ProcessOpState == oam::ACTIVE)
break;
if (DMLprocessstatus.ProcessOpState == oam::FAILED)
break;
// wait some more
sleep(2);
++retry;
}
// clear alarm
aManager.sendAlarmReport(config.moduleName().c_str(), MODULE_SWITCH_ACTIVE, CLEAR);
//set status to ACTIVE while failover is in progress
processManager.setSystemState(oam::ACTIVE);
log.writeLog(__LINE__, "*** Exiting OAMParentModuleChange function ***", LOG_TYPE_DEBUG);
return API_SUCCESS;
}
/******************************************************************************************
* @brief sendStatusUpdate
*
* purpose: Send Status Update to Process Monitor
*
*
******************************************************************************************/
// Forwards a status-update message to the "ProcStatusControl" queue and waits
// up to 10 seconds for an acknowledgement. Any failure on that path (bad
// status, no reply, or an exception from the queue client) is reported to the
// caller as std::runtime_error("timeout").
// When a standby address is configured, the same message is then sent to
// "ProcStatusControlStandby" without waiting for a reply; failures there are
// ignored.
//   obs               - message to forward
//   returnRequestType - not used by this function
void ProcessManager::sendStatusUpdate(ByteStream obs, ByteStream::byte returnRequestType)
{
    try
    {
        MessageQueueClient primary("ProcStatusControl");
        ByteStream reply;

        primary.write(obs);

        // wait 10 seconds for ACK from Process Monitor
        struct timespec ackTimeout = { 10, 0 };
        reply = primary.read(&ackTimeout);

        const bool gotReply = (reply.length() > 0);
        bool acknowledged = false;

        if (gotReply)
        {
            ByteStream::byte ackStatus;
            reply >> ackStatus;
            acknowledged = (ackStatus == oam::API_SUCCESS);
        }

        // the connection is closed on every path before the outcome is reported
        primary.shutdown();

        if (!acknowledged)
            throw std::runtime_error(gotReply ? "error" : "timeout");
    }
    catch (...)
    {
        // every failure above is surfaced with the same message
        throw std::runtime_error("timeout");
    }

    Configuration config;
    Config* sysConfig5 = Config::makeConfig();

    // mirror the update to the standby only when one is configured
    if ( sysConfig5->getConfig("ProcStatusControlStandby", "IPAddr") != oam::UnassignedIpAddr )
    {
        try
        {
            MessageQueueClient standby("ProcStatusControlStandby");
            ByteStream unusedReply;

            standby.write(obs);
            standby.shutdown();
        }
        catch (...)
        {}
    }
}
/******************************************************************************************
* @brief getStandbyModule
*
* purpose: find an available hot-standby module based on Process-Manager status, if one exists
*
*
******************************************************************************************/
std::string ProcessManager::getStandbyModule()
{
Oam oam;
SystemProcessStatus systemprocessstatus;
ProcessStatus processstatus;
string backupStandbyModule = "NONE";
string newStandbyModule = "NONE";
//check if gluster, if so then find PMs that have copies of DBROOT #1
string pmList = "";
try
{
oam.getProcessStatus(systemprocessstatus);
for ( unsigned int i = 0 ; i < systemprocessstatus.processstatus.size(); i++)
{
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
systemprocessstatus.processstatus[i].ProcessOpState == oam::STANDBY )
//already have a hot-standby
return "";
}
}
catch (exception& ex)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + string(ex.what()), LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
if (DataRedundancyConfig == "y")
{
try
{
string errmsg;
oam.glusterctl(oam::GLUSTER_WHOHAS, "1", pmList, errmsg);
boost::char_separator<char> sep(" ");
boost::tokenizer< boost::char_separator<char> > tokens(pmList, sep);
for ( boost::tokenizer< boost::char_separator<char> >::iterator it = tokens.begin();
it != tokens.end();
++it)
{
string pm = "pm" + *it;
// skip if current module
if ( pm == config.moduleName() )
continue;
int opState;
bool degraded;
try
{
oam.getModuleStatus(pm, opState, degraded);
}
catch (...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
{
continue;
}
else
return pm;
}
}
catch (...)
{}
return "NONE";
}
//not gluster, check by status
try
{
for ( unsigned int i = 0 ; i < systemprocessstatus.processstatus.size(); i++)
{
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
systemprocessstatus.processstatus[i].ProcessOpState == oam::STANDBY )
//already have a hot-standby
return "";
if ( ( backupStandbyModule != "NONE" ) ||
( newStandbyModule != "NONE" ) )
continue;
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
systemprocessstatus.processstatus[i].ProcessOpState == oam::COLD_STANDBY )
{
// Found a ProcessManager in a COLD_STANDBY state
newStandbyModule = systemprocessstatus.processstatus[i].Module;
continue;
}
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
systemprocessstatus.processstatus[i].ProcessOpState == oam::MAN_OFFLINE &&
backupStandbyModule == "NONE" &&
newStandbyModule == "NONE" )
{
// Found a ProcessManager in a MAN_OFFLINE state, use if no COLD_STANDBY is found
// and module is not disabled
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(systemprocessstatus.processstatus[i].Module, opState, degraded);
}
catch (...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
{
continue;
}
else
backupStandbyModule = systemprocessstatus.processstatus[i].Module;
}
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
if ( newStandbyModule != "NONE" )
return newStandbyModule;
return backupStandbyModule;
}
/******************************************************************************************
* @brief setStandbyModule
*
* purpose: set Standby Module info in Columnstore.xml
*
*
******************************************************************************************/
// Records newStandbyModule as the standby OAM module in Columnstore.xml
// (SystemConfig/StandbyOAMModuleName and ProcStatusControlStandby/IPAddr),
// distributes the config file and, when `send` is true, asks the module to
// start its ProcessManager in GRACEFUL_STANDBY mode.
//
//   newStandbyModule - module name; an empty name is a no-op that returns true
//   send             - start the hot-standby ProcessManager after the update
//
// Returns true on success, false when the module has no host entry or when
// all 5 attempts failed. Exceptions raised while looking up the module or
// preparing the config (before write()) propagate to the caller; THREAD_LOCK
// is released first.
bool ProcessManager::setStandbyModule(std::string newStandbyModule, bool send)
{
    Oam oam;

    if ( newStandbyModule.empty() )
        return true;

    pthread_mutex_lock(&THREAD_LOCK);
    // tracks ownership of THREAD_LOCK so it is released exactly once
    bool lockHeld = true;

    for (int i = 0 ; i < 5; i++)
    {
        // A previous attempt may have released the lock after a successful
        // write() and then failed in a later step; take it again before
        // touching the configuration.
        if (!lockHeld)
        {
            pthread_mutex_lock(&THREAD_LOCK);
            lockHeld = true;
        }

        try
        {
            // get standby IP address and update entries
            ModuleConfig moduleconfig;
            oam.getSystemConfig(newStandbyModule, moduleconfig);

            if ( moduleconfig.hostConfigList.empty() )
            {
                log.writeLog(__LINE__, "setStandbyModule: no host configured for module " + newStandbyModule, LOG_TYPE_ERROR);
                pthread_mutex_unlock(&THREAD_LOCK);
                return false;
            }

            HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
            string standbyIPaddr = (*pt1).IPAddr;

            Configuration config;
            Config* sysConfig6 = Config::makeConfig();

            sysConfig6->setConfig("SystemConfig", "StandbyOAMModuleName", newStandbyModule);
            sysConfig6->setConfig("ProcStatusControlStandby", "IPAddr", standbyIPaddr);

            try
            {
                sysConfig6->write();
                pthread_mutex_unlock(&THREAD_LOCK);
                lockHeld = false;

                oam.setHotStandbyPM(standbyIPaddr);

                //distribute config file
                distributeConfigFile("system");

                log.writeLog(__LINE__, "Columnstore.xml Standby OAM updated to : " + newStandbyModule + ":" + standbyIPaddr, LOG_TYPE_DEBUG);

                if (send)
                {
                    log.writeLog(__LINE__, "Send Message for new Hot-Standby ProcessManager to module = " + newStandbyModule, LOG_TYPE_DEBUG);
                    int retStatus = startProcess(newStandbyModule, "ProcessManager", oam::GRACEFUL_STANDBY);
                    log.writeLog(__LINE__, "Hot-Standby ProcessManager ACK received from Process-Monitor, return status = " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
                }

                return true;
            }
            catch (exception& ex)
            {
                string error = ex.what();
                log.writeLog(__LINE__, "setStandbyModule: EXCEPTION ERROR on sysConfig->write(): " + error, LOG_TYPE_ERROR);
            }
            catch (...)
            {
                log.writeLog(__LINE__, "setStandbyModule :EXCEPTION ERROR on sysConfig->write(): Caught unknown exception!", LOG_TYPE_ERROR);
            }
        }
        catch (...)
        {
            // failure before write(): keep propagating it to the caller, but
            // never leave THREAD_LOCK locked behind us
            if (lockHeld)
                pthread_mutex_unlock(&THREAD_LOCK);

            throw;
        }
    }

    log.writeLog(__LINE__, "setStandbyModule: failed to set standby module", LOG_TYPE_ERROR);

    if (lockHeld)
        pthread_mutex_unlock(&THREAD_LOCK);

    return false;
}
/******************************************************************************************
* @brief clearStandbyModule
*
* purpose: clear Standby Module info in Columnstore.xml
*
*
******************************************************************************************/
// Resets the standby OAM module entries in Columnstore.xml
// (SystemConfig/StandbyOAMModuleName and ProcStatusControlStandby/IPAddr) to
// their unassigned values, clears the hot-standby PM and distributes the
// config file.
//
// Makes up to 5 attempts, one second apart. Returns true on success, false
// when every attempt failed. Exceptions raised while preparing the config
// (before write()) propagate to the caller; THREAD_LOCK is released first.
bool ProcessManager::clearStandbyModule()
{
    Oam oam;

    pthread_mutex_lock(&THREAD_LOCK);
    // tracks ownership of THREAD_LOCK so it is released exactly once
    bool lockHeld = true;

    try
    {
        Configuration config;

        for (int i = 0 ; i < 5; i++)
        {
            // A previous attempt may have released the lock after a successful
            // write() and then failed in a later step; take it again before
            // touching the configuration.
            if (!lockHeld)
            {
                pthread_mutex_lock(&THREAD_LOCK);
                lockHeld = true;
            }

            Config* sysConfig7 = Config::makeConfig();

            sysConfig7->setConfig("SystemConfig", "StandbyOAMModuleName", oam::UnassignedName);
            sysConfig7->setConfig("ProcStatusControlStandby", "IPAddr", oam::UnassignedIpAddr);

            try
            {
                sysConfig7->write();
                pthread_mutex_unlock(&THREAD_LOCK);
                lockHeld = false;

                oam.setHotStandbyPM(" ");

                log.writeLog(__LINE__, "Clear Columnstore.xml Standby OAM", LOG_TYPE_DEBUG);

                //distribute config file
                distributeConfigFile("system");

                return true;
            }
            catch (exception& ex)
            {
                string error = ex.what();
                log.writeLog(__LINE__, "clearStandbyModule: EXCEPTION ERROR on sysConfig->write(): " + error, LOG_TYPE_ERROR);
            }
            catch (...)
            {
                log.writeLog(__LINE__, "clearStandbyModule :EXCEPTION ERROR on sysConfig->write(): Caught unknown exception!", LOG_TYPE_ERROR);
            }

            sleep(1);
        }
    }
    catch (...)
    {
        // failure outside the write() step: keep propagating it to the caller,
        // but never leave THREAD_LOCK locked behind us
        if (lockHeld)
            pthread_mutex_unlock(&THREAD_LOCK);

        throw;
    }

    log.writeLog(__LINE__, "clearStandbyModule: failed to clear standby module", LOG_TYPE_ERROR);

    if (lockHeld)
        pthread_mutex_unlock(&THREAD_LOCK);

    return false;
}
/******************************************************************************************
* @brief setEnableState
*
* purpose: set Enable State info in Columnstore.xml
*
*
******************************************************************************************/
int ProcessManager::setEnableState(std::string target, std::string state)
{
Oam oam;
ModuleConfig moduleconfig;
pthread_mutex_lock(&THREAD_LOCK);
for (int i = 0 ; i < 5; i++)
{
try
{
oam.getSystemConfig(target, moduleconfig);
moduleconfig.DisableState = state;
try
{
oam.setSystemConfig(target, moduleconfig);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "setEnableState: EXCEPTION ERROR on setSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "setEnableState: EXCEPTION ERROR on setSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "setEnableState: EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "setEnableState: EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
sleep(1);
}
log.writeLog(__LINE__, "setEnableState: failed to set enable state", LOG_TYPE_ERROR);
pthread_mutex_unlock(&THREAD_LOCK);
return API_SUCCESS;
}
/******************************************************************************************
* @brief stopProcessTypes
*
* purpose: stop by process type
*
*
******************************************************************************************/
void ProcessManager::stopProcessTypes(bool manualFlag)
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
// skip if single server install, meaning only 1 worker node
try
{
Config* sysConfig = Config::makeConfig();
if ( sysConfig->getConfig("DBRM_Controller", "NumWorkers") == "1" )
return;
}
catch (...)
{
return;
}
log.writeLog(__LINE__, "stopProcessTypes Called");
//front-end first
processManager.stopProcessType("mysqld", manualFlag);
processManager.stopProcessType("DMLProc", manualFlag);
processManager.stopProcessType("DDLProc", manualFlag);
processManager.stopProcessType("ExeMgr", manualFlag);
//back-end
processManager.stopProcessType("WriteEngineServer", manualFlag);
processManager.stopProcessType("PrimProc", manualFlag);
//dbrm
processManager.stopProcessType("DBRMControllerNode", manualFlag);
processManager.stopProcessType("DBRMWorkerNode", manualFlag);
processManager.stopProcessType("StorageManager", manualFlag);
log.writeLog(__LINE__, "stopProcessTypes Completed");
}
/******************************************************************************************
* @brief unmountDBRoot
*
* purpose: unmount a dbroot
*
*
******************************************************************************************/
int ProcessManager::unmountDBRoot(std::string dbrootID)
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
//get pm assigned to that dbroot
int pmID;
oam.getDbrootPmConfig(atoi(dbrootID.c_str()), pmID);
string moduleName = "pm" + oam.itoa(pmID);
log.writeLog(__LINE__, "send unmountDBRoot to pm: " + dbrootID + "/" + moduleName, LOG_TYPE_DEBUG );
ByteStream msg;
msg << (ByteStream::byte) PROCUNMOUNT;
msg << dbrootID;
return sendMsgProcMon( moduleName, msg, PROCUNMOUNT );
}
/******************************************************************************************
* @brief mountDBRoot
*
* purpose: mount a dbroot
*
*
******************************************************************************************/
int ProcessManager::mountDBRoot(std::string dbrootID)
{
ProcessLog log;
Configuration config;
ProcessManager processManager(config, log);
Oam oam;
if (DataRedundancyConfig == "y")
return oam::API_SUCCESS;
//get pm assigned to that dbroot
int pmID;
oam.getDbrootPmConfig(atoi(dbrootID.c_str()), pmID);
string moduleName = "pm" + oam.itoa(pmID);
log.writeLog(__LINE__, "send mountDBRoot to pm: " + dbrootID + "/" + moduleName, LOG_TYPE_DEBUG );
//send msg to ProcMon if not local module
if ( config.moduleName() == moduleName )
{
string tmpMount = tmpLogDir + "/mount.log";
string cmd = "export LC_ALL=C;mount /var/lib/columnstore/data" + dbrootID + " > " + tmpMount;
system(cmd.c_str());
if ( !rootUser)
{
cmd = "chown -R " + USER + ":" + USER + " /var/lib/columnstore/data" + dbrootID + " > /dev/null";
system(cmd.c_str());
}
ifstream in(tmpMount.c_str());
in.seekg(0, std::ios::end);
int size = in.tellg();
if ( size != 0 )
{
if (!oam.checkLogStatus(tmpMount, "already"))
{
log.writeLog(__LINE__, "mount failed, dbroot: " + dbrootID);
return API_FAILURE;
}
}
}
else
{
ByteStream msg;
msg << (ByteStream::byte) PROCMOUNT;
msg << dbrootID;
return sendMsgProcMon( moduleName, msg, PROCMOUNT );
}
return oam::API_SUCCESS;
}
/******************************************************************************************
* @brief flushInodeCache
*
* purpose: flush cache
*
*
******************************************************************************************/
// Asks the kernel to drop its caches by writing "3\n" to
// /proc/sys/vm/drop_caches (per the kernel's drop_caches documentation, the
// value 3 covers the page cache plus dentries and inodes).
//
// The outcome is only logged at debug level; nothing is returned. On builds
// where __linux__ is not defined the function does nothing.
void ProcessManager::flushInodeCache()
{
#ifdef __linux__
    const int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);

    if (fd < 0)
    {
        log.writeLog(__LINE__, "flushInodeCache failed to open file", LOG_TYPE_DEBUG);
        return;
    }

    // Both bytes must be written for the request to count as accepted.
    if (write(fd, "3\n", 2) == 2)
    {
        log.writeLog(__LINE__, "flushInodeCache successful", LOG_TYPE_DEBUG);
    }
    else
    {
        log.writeLog(__LINE__, "flushInodeCache failed", LOG_TYPE_DEBUG);
    }

    close(fd);
#endif
}
/******************************************************************************************
* @brief setMySQLReplication
*
* purpose: set up or disable MySQL replication on the master and slave modules
*
*
******************************************************************************************/
// Sets up (enable == true) or disables (enable == false) MySQL replication by
// sending requests through sendMsgProcMon(): optionally MASTERDIST to the
// master, then MASTERREP/DISABLEREP to the master, then SLAVEREP/DISABLEREP to
// each slave module.
//
// @param devicenetworklist  modules to act on; when empty, all modules from
//                           the system module configuration are used
// @param masterModule       master module name; when it equals
//                           oam::UnassignedName it is resolved from the
//                           PrimaryUMModuleName setting, falling back to "um1"
//                           ("pm1" on an INSTALL_COMBINE_DM_UM_PM install)
// @param distributeDB       when true, MASTERDIST request(s) are sent to the
//                           master before the replication setup
// @param password           inserted into each MASTERDIST message
// @param enable             true: MASTERREP/SLAVEREP; false: DISABLEREP
// @param addModule          when true, the "skip disabled modules" check is
//                           bypassed
// @return oam::API_SUCCESS on completion (also when MySQLRep is "n" and enable
//         is true, in which case nothing is sent); API_FAILURE as soon as any
//         request fails or masterLogFile/masterLogPos is unassigned when needed.
//
// NOTE(review): masterLogFile, masterLogPos and PMwithUM are not declared in
// this function; presumably they are set elsewhere (e.g. from the MASTERREP
// reply) - confirm before relying on their values here.
int ProcessManager::setMySQLReplication(oam::DeviceNetworkList devicenetworklist, std::string masterModule, bool distributeDB, std::string password, bool enable, bool addModule)
{
Oam oam;
string MySQLRep;
// Read the MySQLRep setting; a failed read is treated as "n".
try
{
oam.getSystemConfig("MySQLRep", MySQLRep);
}
catch (...)
{
MySQLRep = "n";
}
// Enabling is a no-op (reported as success) when replication is not configured.
// Disabling proceeds regardless of the MySQLRep value.
if ( MySQLRep == "n" && enable )
{
log.writeLog(__LINE__, "setMySQLReplication: MySQLRep not set, exiting", LOG_TYPE_DEBUG);
return oam::API_SUCCESS;
}
log.writeLog(__LINE__, "Setup MySQL Replication", LOG_TYPE_DEBUG);
//get master info
if ( masterModule == oam::UnassignedName)
{
try
{
oam.getSystemConfig("PrimaryUMModuleName", masterModule);
}
catch (...)
{
masterModule = oam::UnassignedName;
}
if ( masterModule == oam::UnassignedName )
{
// use default setting
masterModule = "um1";
if ( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM )
masterModule = "pm1";
}
}
//send distribute DB
if ( distributeDB )
{
if ( devicenetworklist.size() == 0 )
{
//dist to all slaves
// A single MASTERDIST request with the target "all" is sent to the master.
ByteStream msg;
ByteStream::byte requestID = oam::MASTERDIST;
msg << requestID;
msg << password;
msg << "all";
log.writeLog(__LINE__, "Distribute Master DB, master module=" + masterModule, LOG_TYPE_DEBUG);
int returnStatus = sendMsgProcMon( masterModule, msg, requestID, 60 );
if ( returnStatus != API_SUCCESS)
{
// NOTE(review): the same message text is logged for MASTERDIST and MASTERREP failures.
log.writeLog(__LINE__, "setMySQLReplication: ERROR: Error getting MySQL Replication Master Information", LOG_TYPE_ERROR);
return API_FAILURE;
}
}
else
{
// One MASTERDIST request per listed module, each sent to the master and
// carrying the slave's module name.
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string remoteModuleName = (*listPT).DeviceName;
//skip master
if ( remoteModuleName == masterModule )
continue;
if ( !addModule )
{
// skip disabled modules
// opState starts as ACTIVE, so a module whose status lookup throws is
// treated as enabled.
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(remoteModuleName, opState, degraded);
}
catch (...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
}
// don't do PMs unless PMwithUM flag is set
if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM )
{
string moduleType = remoteModuleName.substr(0, MAX_MODULE_TYPE_SIZE);
if ( moduleType == "pm" && PMwithUM == "n" )
continue;
}
ByteStream msg;
ByteStream::byte requestID = oam::MASTERDIST;
msg << requestID;
msg << password;
msg << remoteModuleName;
log.writeLog(__LINE__, "Distribute Master DB, master module=" + masterModule, LOG_TYPE_DEBUG);
int returnStatus = sendMsgProcMon( masterModule, msg, requestID, 60 );
if ( returnStatus != API_SUCCESS)
{
log.writeLog(__LINE__, "setMySQLReplication: ERROR: Error getting MySQL Replication Master Information", LOG_TYPE_ERROR);
return API_FAILURE;
}
}
}
}
//send setup master
// The master receives MASTERREP when enabling and DISABLEREP when disabling.
ByteStream msg;
ByteStream::byte requestID = oam::MASTERREP;
if ( !enable )
{
requestID = oam::DISABLEREP;
log.writeLog(__LINE__, "Disable MySQL Replication, master module=" + masterModule, LOG_TYPE_DEBUG);
}
else
log.writeLog(__LINE__, "Setup MySQL Replication, master module=" + masterModule, LOG_TYPE_DEBUG);
msg << requestID;
int returnStatus = sendMsgProcMon( masterModule, msg, requestID, 60 );
if ( returnStatus != API_SUCCESS)
{
log.writeLog(__LINE__, "setMySQLReplication: ERROR: Error getting MySQL Replication Master Information", LOG_TYPE_ERROR);
return API_FAILURE;
}
//
// send msg to setup slave
//
// check if a list was provided, if not, do all modules
if ( devicenetworklist.size() == 0 )
{
log.writeLog(__LINE__, "Setup MySQL Replication on all modules", LOG_TYPE_DEBUG);
SystemModuleTypeConfig systemmoduletypeconfig;
// A failure to read the module configuration is swallowed; the loop below
// then iterates over whatever the structure contains.
try
{
oam.getSystemConfig(systemmoduletypeconfig);
}
catch (exception& ex)
{}
for ( unsigned int i = 0; i < systemmoduletypeconfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemmoduletypeconfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
// This moduleType is not read below; an inner variable of the same name
// shadows it inside the PMwithUM check.
string moduleType = systemmoduletypeconfig.moduletypeconfig[i].ModuleType;
DeviceNetworkList::iterator pt = systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemmoduletypeconfig.moduletypeconfig[i].ModuleNetworkList.end(); pt++ )
{
string remoteModuleName = (*pt).DeviceName;
//skip master
if ( remoteModuleName == masterModule )
continue;
if ( !addModule )
{
// skip disabled modules
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(remoteModuleName, opState, degraded);
}
catch (...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
}
// don't do PMs unless PMwithUM flag is set
if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM )
{
string moduleType = remoteModuleName.substr(0, MAX_MODULE_TYPE_SIZE);
if ( moduleType == "pm" && PMwithUM == "n" )
continue;
}
ByteStream msg1;
// This requestID shadows the function-level requestID used for the master.
ByteStream::byte requestID = oam::SLAVEREP;
if ( !enable )
{
requestID = oam::DISABLEREP;
log.writeLog(__LINE__, "Disable MySQL Replication, slave module=" + remoteModuleName, LOG_TYPE_DEBUG);
}
else
log.writeLog(__LINE__, "Setup MySQL Replication, slave module=" + remoteModuleName, LOG_TYPE_DEBUG);
msg1 << requestID;
// Only a SLAVEREP message carries the master log file and position; both
// must be assigned or the whole operation fails (without a log entry).
if ( enable )
{
if ( masterLogFile == oam::UnassignedName ||
masterLogPos == oam::UnassignedName )
return API_FAILURE;
msg1 << masterLogFile;
msg1 << masterLogPos;
}
returnStatus = sendMsgProcMon( remoteModuleName, msg1, requestID, 60 );
if ( returnStatus != API_SUCCESS)
{
log.writeLog(__LINE__, "setMySQLReplication: ERROR: Error setting MySQL Replication Slave", LOG_TYPE_ERROR);
return API_FAILURE;
}
}
}
}
else
{
// Explicit module list.
// NOTE(review): unlike the all-modules branch above, this branch does not
// apply the PMwithUM filter - confirm whether that is intended.
DeviceNetworkList::iterator listPT = devicenetworklist.begin();
for ( ; listPT != devicenetworklist.end() ; listPT++)
{
string remoteModuleName = (*listPT).DeviceName;
//skip master
if ( remoteModuleName == masterModule )
continue;
if ( !addModule )
{
// skip disabled modules
int opState = oam::ACTIVE;
bool degraded;
try
{
oam.getModuleStatus(remoteModuleName, opState, degraded);
}
catch (...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
}
log.writeLog(__LINE__, "Setup Slave MySQL Replication on " + remoteModuleName, LOG_TYPE_DEBUG);
ByteStream msg1;
// This requestID shadows the function-level requestID used for the master.
ByteStream::byte requestID = oam::SLAVEREP;
if ( !enable )
{
requestID = oam::DISABLEREP;
log.writeLog(__LINE__, "Disable MySQL Replication, slave module=" + remoteModuleName, LOG_TYPE_DEBUG);
}
else
log.writeLog(__LINE__, "Setup MySQL Replication, slave module=" + remoteModuleName, LOG_TYPE_DEBUG);
msg1 << requestID;
// NOTE(review): this check runs even when enable is false, so a DISABLEREP
// request for a listed module fails if the master log info is unassigned;
// the all-modules branch only checks when enable is true. It also makes the
// identical check inside the if ( enable ) block below unreachable - confirm
// which behavior is intended.
if ( masterLogFile == oam::UnassignedName ||
masterLogPos == oam::UnassignedName )
{
log.writeLog(__LINE__, "setMySQLReplication: ERROR: Unassigned masterLogFile or masterLogPos", LOG_TYPE_ERROR);
return API_FAILURE;
}
if ( enable )
{
if ( masterLogFile == oam::UnassignedName ||
masterLogPos == oam::UnassignedName )
return API_FAILURE;
msg1 << masterLogFile;
msg1 << masterLogPos;
}
returnStatus = sendMsgProcMon( remoteModuleName, msg1, requestID, 60 );
if ( returnStatus != API_SUCCESS)
{
log.writeLog(__LINE__, "setMySQLReplication: ERROR: Error setting MySQL Replication Slave", LOG_TYPE_ERROR);
return API_FAILURE;
}
}
}
return oam::API_SUCCESS;
}
/******************************************************************************************
* @brief glusterAssign
*
* purpose: Gluster assign dbroot to a module
*
******************************************************************************************/
int ProcessManager::glusterAssign(std::string moduleName, std::string dbroot)
{
ByteStream msg;
ByteStream::byte requestID = PROCGLUSTERASSIGN;
msg << requestID;
msg << dbroot;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
int retry = 0;
// Try this for a minute because in failover the node returning to service may not be listening yet
while(returnStatus != API_SUCCESS && retry < 60)
{
log.writeLog(__LINE__, "glusterAssign retrying...", LOG_TYPE_DEBUG);
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
sleep(1);
++retry;
}
if ( returnStatus == API_SUCCESS)
{
//log the success event
log.writeLog(__LINE__, "glusterAssign Success: " + moduleName + "/" + dbroot, LOG_TYPE_DEBUG);
}
else
{
//log the error event
log.writeLog(__LINE__, "glusterAssign FAILED: " + moduleName + "/" + dbroot, LOG_TYPE_ERROR);
}
return returnStatus;
}
/******************************************************************************************
* @brief glusterUnassign
*
* purpose: Gluster unassign dbroot from a module
*
******************************************************************************************/
int ProcessManager::glusterUnassign(std::string moduleName, std::string dbroot)
{
ByteStream msg;
ByteStream::byte requestID = PROCGLUSTERUNASSIGN;
msg << requestID;
msg << dbroot;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
int retry = 0;
// Try this for a minute because in failover the node returning to service may not be listening yet
while(returnStatus != API_SUCCESS && retry < 60)
{
log.writeLog(__LINE__, "glusterUnassign retrying...", LOG_TYPE_DEBUG);
returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
sleep(1);
++retry;
}
if ( returnStatus == API_SUCCESS)
{
//log the success event
log.writeLog(__LINE__, "glusterUnassign Success: " + moduleName + "/" + dbroot, LOG_TYPE_DEBUG);
}
else
{
//log the error event
log.writeLog(__LINE__, "glusterUnassign FAILED: " + moduleName + "/" + dbroot, LOG_TYPE_ERROR);
}
return returnStatus;
}
/******************************************************************************************
* @brief syncFsAll
*
* purpose: Sync filesystem for backup snapshots on suspenddatabasewrites
*
******************************************************************************************/
int ProcessManager::syncFsAll(std::string moduleName)
{
ByteStream msg;
ByteStream::byte requestID = SYNCFSALL;
msg << requestID;
int returnStatus = sendMsgProcMon( moduleName, msg, requestID, 30 );
if ( returnStatus == API_SUCCESS)
{
//log the success event
log.writeLog(__LINE__, "syncFsALL Success: " + moduleName, LOG_TYPE_DEBUG);
}
else
{
//log the error event
log.writeLog(__LINE__, "syncFsALL FAILED: " + moduleName, LOG_TYPE_ERROR);
}
return returnStatus;
}
} //end of namespace
// vim:ts=4 sw=4: