1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-04-18 21:44:02 +03:00
mariadb-columnstore-engine/procmon/processmonitor.cpp
2020-11-17 15:03:10 +03:00

6249 lines
210 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
Copyright (C) 2016 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/***************************************************************************
* $Id: processmonitor.cpp 2044 2013-08-07 19:47:37Z dhill $
*
***************************************************************************/
#include <boost/scoped_ptr.hpp>
#include <boost/filesystem.hpp>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/random_generator.hpp>
#include <boost/uuid/uuid_io.hpp>
#include "columnstoreversion.h"
#include "mcsconfig.h"
#include "IDBDataFile.h"
#include "IDBPolicy.h"
#include "processmonitor.h"
#include "installdir.h"
#include "cacheutils.h"
#include "ddlcleanuputil.h"
using namespace cacheutils;
using namespace std;
using namespace oam;
using namespace messageqcpp;
using namespace alarmmanager;
using namespace logging;
using namespace config;
using namespace idbdatafile;
namespace bf = boost::filesystem;
extern string systemOAM;
extern string dm_server;
extern bool runStandby;
extern bool processInitComplete;
extern int fmoduleNumber;
extern string cloud;
extern string DataRedundancyConfig;
extern bool rootUser;
extern string USER;
extern bool HDFS;
extern string PMwithUM;
extern bool startProcMon;
extern string tmpLogDir;
extern string SUDO;
//std::string gOAMParentModuleName;
// Module-role flags with file-wide visibility. MonitorConfig's constructor
// assigns both from the tuple returned by oam.getModuleInfo()
// (element 4 -> parent flag, element 7 -> standby flag).
bool gOAMParentModuleFlag;
bool gOAMStandbyModuleFlag;
// Argument bundles passed (by pointer) to the sendAlarmThread and
// sendProcessThread functions declared in namespace processmonitor.
typedef boost::tuple<std::string, ALARMS, int> sendAlarmInfo_t;
typedef boost::tuple<std::string, int, pid_t> sendProcessInfo_t;
// NOTE(review): presumably serializes alarm sending; its users are not
// visible in this part of the file -- confirm.
pthread_mutex_t ALARM_LOCK;
// Held by MonitorConfig::buildList while it searches and modifies the
// monitored process list.
pthread_mutex_t LIST_LOCK;
// NOTE(review): presumably serializes process-status updates; its users are
// not visible in this part of the file -- confirm.
pthread_mutex_t PROCESS_LOCK;
namespace processmonitor
{
void* sendAlarmThread (sendAlarmInfo_t* t);
void* sendProcessThread (sendProcessInfo_t* t);
using namespace oam;
/******************************************************************************************
* @brief MonitorConfig
*
* purpose: MonitorConfig constructor
*
******************************************************************************************/
MonitorConfig::MonitorConfig()
{
    Oam oam;

    // Read the local module description from OAM and fan it out into the
    // member fields and the two file-scope role flags. A failure is only
    // reported on stdout; the fields then keep whatever values they had.
    try
    {
        const oamModuleInfo_t moduleInfo = oam.getModuleInfo();

        // identity of this module
        flocalModuleName = boost::get<0>(moduleInfo);
        flocalModuleType = boost::get<1>(moduleInfo);
        flocalModuleID = boost::get<2>(moduleInfo);

        // OAM parent information
        fOAMParentModuleName = boost::get<3>(moduleInfo);
        fOAMParentModuleFlag = boost::get<4>(moduleInfo);

        fserverInstallType = boost::get<5>(moduleInfo);

        // OAM standby information
        fOAMStandbyModuleName = boost::get<6>(moduleInfo);
        fOAMStandbyModuleFlag = boost::get<7>(moduleInfo);

        // mirror the role flags into the globals
        gOAMStandbyModuleFlag = boost::get<7>(moduleInfo);
        gOAMParentModuleFlag = boost::get<4>(moduleInfo);
    }
    catch (exception& e)
    {
        cout << endl << "ProcMon Construct Error reading getModuleInfo = " << e.what() << endl;
    }

    // Software version and release come from the build-time constants.
    fsoftwareVersion = columnstore_version;
    fsoftwareRelease = columnstore_release;
}
/******************************************************************************************
* @brief MonitorConfig destructor
*
* purpose: MonitorConfig destructor
*
******************************************************************************************/
MonitorConfig::~MonitorConfig()
{
    // No explicit cleanup is performed here.
}
/******************************************************************************************
* @brief MonitorLog Constructor
*
* purpose: Constructor. Currently a no-op; writeLog() creates its MessageLog on each call.
*
******************************************************************************************/
MonitorLog::MonitorLog()
{
    // Intentionally empty: no setup is done at construction time.
}
/******************************************************************************************
* @brief MonitorLog Destructor
*
* purpose: Destructor. Currently a no-op.
*
******************************************************************************************/
MonitorLog::~MonitorLog()
{
    // Intentionally empty: no teardown is done at destruction time.
}
/******************************************************************************************
* @brief writeLog for string
*
* purpose: write string message to the log file
*
******************************************************************************************/
void MonitorLog::writeLog(const int lineNumber, const string logContent, const LOG_TYPE logType)
{
    // Write a string message to the system log at the requested severity.
    //
    // lineNumber : caller's source line (normally __LINE__); attached to the
    //              message for LOG_TYPE_ERROR entries only.
    // logContent : text to log.
    // logType    : severity selector; values not listed in the switch below
    //              produce no log entry.
    LoggingID lid(18);
    MessageLog ml(lid);
    Message msg;
    Message::Args args;
    args.add(logContent);

    // Error entries also carry the caller's line number. These arguments have
    // to be appended BEFORE msg.format(args) runs; the previous code added
    // them after formatting, so they never reached the logged message.
    // NOTE(review): assumes Message::format() renders the arguments at call
    // time and that the default message template has room for them -- confirm.
    if (logType == LOG_TYPE_ERROR)
    {
        args.add("line:");
        args.add(lineNumber);
    }

    msg.format(args);

    switch (logType)
    {
        case LOG_TYPE_DEBUG:
            ml.logDebugMessage(msg);
            break;

        case LOG_TYPE_INFO:
            ml.logInfoMessage(msg);
            break;

        case LOG_TYPE_WARNING:
            ml.logWarningMessage(msg);
            break;

        case LOG_TYPE_ERROR:
            ml.logErrorMessage(msg);
            break;

        case LOG_TYPE_CRITICAL:
            ml.logCriticalMessage(msg);
            break;
    }

    return;
}
/******************************************************************************************
* @brief writeLog for integer
*
* purpose: write integer information to the log file
*
******************************************************************************************/
void MonitorLog::writeLog(const int lineNumber, const int logContent, const LOG_TYPE logType)
{
    // Write an integer value to the system log at the requested severity.
    //
    // lineNumber : caller's source line (normally __LINE__); attached to the
    //              message for LOG_TYPE_ERROR entries only.
    // logContent : integer value to log.
    // logType    : severity selector; values not listed in the switch below
    //              produce no log entry.
    LoggingID lid(18);
    MessageLog ml(lid);
    Message msg;
    Message::Args args;
    args.add(logContent);

    // Error entries also carry the caller's line number. These arguments have
    // to be appended BEFORE msg.format(args) runs; the previous code added
    // them after formatting, so they never reached the logged message.
    // NOTE(review): assumes Message::format() renders the arguments at call
    // time and that the default message template has room for them -- confirm.
    if (logType == LOG_TYPE_ERROR)
    {
        args.add("line:");
        args.add(lineNumber);
    }

    msg.format(args);

    switch (logType)
    {
        case LOG_TYPE_DEBUG:
            ml.logDebugMessage(msg);
            break;

        case LOG_TYPE_INFO:
            ml.logInfoMessage(msg);
            break;

        case LOG_TYPE_WARNING:
            ml.logWarningMessage(msg);
            break;

        case LOG_TYPE_ERROR:
            ml.logErrorMessage(msg);
            break;

        case LOG_TYPE_CRITICAL:
            ml.logCriticalMessage(msg);
            break;
    }

    return;
}
/******************************************************************************************
* @brief ProcessMonitor Constructor
*
* purpose: ProcessMonitor Constructor
*
******************************************************************************************/
ProcessMonitor::ProcessMonitor(MonitorConfig& aconfig, MonitorLog& alog)
    : config(aconfig)
    , log(alog)
{
    // Only stores the supplied configuration and logger; nothing is logged
    // or started at construction time.
}
/******************************************************************************************
* @brief ProcessMonitor Default Destructor
*
* purpose: ProcessMonitor Default Destructor
*
******************************************************************************************/
ProcessMonitor::~ProcessMonitor()
{
    // No explicit cleanup is performed here.
}
/******************************************************************************************
* @brief statusListPtr
*
* purpose: return the process status list
*
******************************************************************************************/
//processStatusList* ProcessMonitor::statusListPtr()
//{
// return &fstatusListPtr;
//}
/******************************************************************************************
* @brief buildList
*
* purpose: Build a list of processes the monitor started
*
******************************************************************************************/
void MonitorConfig::buildList(string ProcessModuleType, string processName, string ProcessLocation,
                              string arg_list[MAXARGUMENTS], uint16_t launchID, pid_t processID,
                              uint16_t state, uint16_t BootLaunch, string RunType,
                              string DepProcessName[MAXDEPENDANCY], string DepModuleName[MAXDEPENDANCY],
                              string LogFile)
{
    // Add a process to the monitored list, or refresh the entry that already
    // carries the same process name.
    //
    // - "mysqld" is never tracked: the call returns immediately.
    // - Existing entry + empty ProcessLocation: only the state is updated.
    // - Existing entry + non-empty ProcessLocation: pid, timestamp, state,
    //   launch id, boot-launch, run type, log file and all argument slots
    //   are overwritten.
    // - New entry: inserted in front of the first entry whose launchID is
    //   greater than the new one, otherwise appended at the end.
    //
    // The list is accessed under LIST_LOCK.

    // Not referenced below; only the disabled list-dump diagnostics used them.
    MonitorLog log;
    Oam oam;

    if (processName == "mysqld")
        return;

    // Might need to add a similar do-nothing clause for StorageManager?
    pthread_mutex_lock(&LIST_LOCK);

    // current time in seconds
    const time_t now = time(NULL);

    processList* monitored = monitoredListPtr();

    // look for an entry with the same process name
    processList::iterator entry = monitored->begin();

    while (entry != monitored->end() && !(entry->ProcessName == processName))
        ++entry;

    if (entry != monitored->end())
    {
        // already tracked: the state is refreshed in every case
        entry->state = state;

        if (!ProcessLocation.empty())
        {
            // full refresh of the launch-related fields
            entry->processID = processID;
            entry->currentTime = now;
            entry->launchID = launchID;
            entry->BootLaunch = BootLaunch;
            entry->RunType = RunType;
            entry->LogFile = LogFile;

            for (unsigned int slot = 0; slot < MAXARGUMENTS; slot++)
                entry->ProcessArgs[slot] = arg_list[slot];
        }
    }
    else
    {
        // not tracked yet: assemble a new record
        processInfo record;
        record.ProcessModuleType = ProcessModuleType;
        record.ProcessName = processName;
        record.ProcessLocation = ProcessLocation;

        // each of the three arrays is copied up to its first empty slot
        for (unsigned int slot = 0; slot < MAXARGUMENTS && arg_list[slot].length() != 0; slot++)
            record.ProcessArgs[slot] = arg_list[slot];

        record.launchID = launchID;
        record.currentTime = now;
        record.processID = processID;
        record.state = state;
        record.BootLaunch = BootLaunch;
        record.RunType = RunType;
        record.LogFile = LogFile;
        record.dieCounter = 0;

        for (unsigned int slot = 0; slot < MAXDEPENDANCY && DepProcessName[slot].length() != 0; slot++)
            record.DepProcessName[slot] = DepProcessName[slot];

        for (unsigned int slot = 0; slot < MAXDEPENDANCY && DepModuleName[slot].length() != 0; slot++)
            record.DepModuleName[slot] = DepModuleName[slot];

        // insertion point: first entry with a larger launchID, or end()
        // (which also covers the empty-list case)
        processList::iterator position = monitored->begin();

        while (position != monitored->end() && !(position->launchID > launchID))
            ++position;

        fmonitoredListPtr.insert(position, record);
    }

    pthread_mutex_unlock(&LIST_LOCK);
}
/******************************************************************************************
* @brief monitoredListPtr
*
* purpose: return the process list
*
******************************************************************************************/
processList* MonitorConfig::monitoredListPtr()
{
    // Hand out the address of the member list; callers read and modify
    // the list through this pointer.
    processList* const listAddress = &fmonitoredListPtr;
    return listAddress;
}
/******************************************************************************************
* @brief processMessage
*
* purpose: receive and process message
*
******************************************************************************************/
void ProcessMonitor::processMessage(messageqcpp::ByteStream msg, messageqcpp::IOSocket mq)
{
Oam oam;
ByteStream ackMsg;
MonitorConfig currentConfig;
ByteStream::byte messageType;
ByteStream::byte requestID;
ByteStream::byte actIndicator;
ByteStream::byte manualFlag;
string processName;
msg >> messageType;
switch (messageType)
{
case REQUEST:
{
msg >> requestID;
msg >> actIndicator;
if (!processInitComplete)
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) requestID;
ackMsg << (ByteStream::byte) oam::API_FAILURE;
mq.write(ackMsg);
break;
}
switch (requestID)
{
case STOP:
{
msg >> processName;
msg >> manualFlag;
log.writeLog(__LINE__, "MSG RECEIVED: Stop process request on " + processName);
int requestStatus = API_SUCCESS;
// check for mysqld
if ( processName == "mysqld" )
{
try
{
oam.actionMysqlCalpont(MYSQL_STOP);
}
catch (...)
{}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) STOP;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "STOP: ACK back to ProcMgr, return status = " + oam.itoa((int) API_SUCCESS));
break;
}
processList::iterator listPtr;
processList* aPtr = config.monitoredListPtr();
listPtr = aPtr->begin();
for (; listPtr != aPtr->end(); ++listPtr)
{
if ((*listPtr).ProcessName == processName)
{
// update local process state
if ( manualFlag )
{
(*listPtr).state = oam::MAN_OFFLINE;
(*listPtr).dieCounter = 0;
}
else
(*listPtr).state = oam::AUTO_OFFLINE;
//stop the process first
if (stopProcess((*listPtr).processID, (*listPtr).ProcessName, (*listPtr).ProcessLocation, actIndicator, manualFlag))
requestStatus = API_FAILURE;
else
(*listPtr).processID = 0;
break;
}
}
if (listPtr == aPtr->end())
{
log.writeLog(__LINE__, "ERROR: No such process: " + processName);
requestStatus = API_FAILURE;
}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) STOP;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "STOP: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case START:
{
msg >> processName;
msg >> manualFlag;
log.writeLog(__LINE__, "MSG RECEIVED: Start process request on: " + processName);
// check for mysqld
if ( processName == "mysqld" )
{
try
{
oam.actionMysqlCalpont(MYSQL_START);
}
catch (...)
{}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) START;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "START: ACK back to ProcMgr, return status = " + oam.itoa((int) API_SUCCESS));
break;
}
ProcessConfig processconfig;
ProcessStatus processstatus;
try
{
//Get the process information
Oam oam;
oam.getProcessConfig(processName, config.moduleName(), processconfig);
oam.getProcessStatus(processName, config.moduleName(), processstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: " + error, LOG_TYPE_ERROR );
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: Caught unknown exception!", LOG_TYPE_ERROR );
}
int requestStatus = API_SUCCESS;
//check the process current status & start the requested process
if (processstatus.ProcessOpState != oam::ACTIVE)
{
int initType = oam::STANDBY_INIT;
if ( actIndicator == oam::GRACEFUL_STANDBY)
{
//this module running Parent OAM Standby
runStandby = true;
log.writeLog(__LINE__, "ProcMon Running Hot-Standby");
// delete any old active alarm log file
unlink ("/var/log/mariadb/columnstore/activeAlarms");
}
//Check for SIMPLEX runtype processes
initType = checkSpecialProcessState( processconfig.ProcessName, processconfig.RunType, processconfig.ModuleType );
if ( initType == oam::COLD_STANDBY)
{
//there is a mate active, skip
config.buildList(processconfig.ModuleType,
processconfig.ProcessName,
processconfig.ProcessLocation,
processconfig.ProcessArgs,
processconfig.LaunchID,
0,
oam::COLD_STANDBY,
processconfig.BootLaunch,
processconfig.RunType,
processconfig.DepProcessName,
processconfig.DepModuleName,
processconfig.LogFile);
requestStatus = API_SUCCESS;
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) START;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
//sleep(1);
log.writeLog(__LINE__, "START: process left STANDBY " + processName);
log.writeLog(__LINE__, "START: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
pid_t processID = startProcess(processconfig.ModuleType,
processconfig.ProcessName,
processconfig.ProcessLocation,
processconfig.ProcessArgs,
processconfig.LaunchID,
processconfig.BootLaunch,
processconfig.RunType,
processconfig.DepProcessName,
processconfig.DepModuleName,
processconfig.LogFile,
initType,
actIndicator);
// StorageManager doesn't send the "I'm online" msg to Proc*.
// Just mark it active for now. TODO: make it use the ping fcn in IDB* instead.
if (processconfig.ProcessName == "StorageManager")
oam.setProcessStatus("StorageManager", boost::get<0>(oam.getModuleInfo()),
oam::ACTIVE, processID);
if ( processID > oam::API_MAX )
processID = oam::API_SUCCESS;
requestStatus = processID;
}
else
log.writeLog(__LINE__, "START: process already active " + processName);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) START;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "START: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case RESTART:
{
msg >> processName;
msg >> manualFlag;
log.writeLog(__LINE__, "MSG RECEIVED: Restart process request on " + processName);
int requestStatus = API_SUCCESS;
// check for mysqld restart
if ( processName == "mysqld" )
{
try
{
oam.actionMysqlCalpont(MYSQL_RESTART);
}
catch (...)
{}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) RESTART;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "RESTART: ACK back to ProcMgr, return status = " + oam.itoa((int) API_SUCCESS));
break;
}
processList::iterator listPtr;
processList* aPtr = config.monitoredListPtr();
listPtr = aPtr->begin();
for (; listPtr != aPtr->end(); ++listPtr)
{
if ((*listPtr).ProcessName == processName)
{
// update local process state
if ( manualFlag )
{
(*listPtr).state = oam::MAN_OFFLINE;
(*listPtr).dieCounter = 0;
}
else
(*listPtr).state = oam::AUTO_OFFLINE;
//stop the process first
if (stopProcess((*listPtr).processID, (*listPtr).ProcessName, (*listPtr).ProcessLocation, actIndicator, manualFlag))
requestStatus = API_FAILURE;
else
{
// sleep(1);
(*listPtr).processID = 0;
//Check for SIMPLEX runtype processes
int initType = checkSpecialProcessState( (*listPtr).ProcessName, (*listPtr).RunType, (*listPtr).ProcessModuleType );
if ( initType == oam::COLD_STANDBY )
{
//there is a mate active, skip
(*listPtr).state = oam::COLD_STANDBY;
requestStatus = API_SUCCESS;
//sleep(1);
break;
}
//start the process again
pid_t processID = startProcess((*listPtr).ProcessModuleType,
(*listPtr).ProcessName,
(*listPtr).ProcessLocation,
(*listPtr).ProcessArgs,
(*listPtr).launchID,
(*listPtr).BootLaunch,
(*listPtr).RunType,
(*listPtr).DepProcessName,
(*listPtr).DepModuleName,
(*listPtr).LogFile,
initType);
// StorageManager doesn't send the "I'm online" msg to Proc*.
// Just mark it active for now. TODO: make it use the ping fcn in IDB* instead.
if (listPtr->ProcessName == "StorageManager")
oam.setProcessStatus("StorageManager", boost::get<0>(oam.getModuleInfo()),
oam::ACTIVE, listPtr->processID);
if ( processID > oam::API_MAX )
processID = oam::API_SUCCESS;
requestStatus = processID;
}
break;
}
}
if (listPtr == aPtr->end())
{
log.writeLog(__LINE__, "ERROR: No such process: " + processName, LOG_TYPE_ERROR );
requestStatus = API_FAILURE;
}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) RESTART;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "RESTART: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case PROCREINITPROCESS:
{
msg >> processName;
msg >> manualFlag;
log.writeLog(__LINE__, "MSG RECEIVED: Re-Init process request on: " + processName);
if ( processName == "cpimport" )
{
system("pkill -sighup cpimport");
for (int i=0; i < 10; i++)
{
//get pid
char buf[512];
FILE *cmd_pipe = popen("pidof -s cpimport", "r");
fgets(buf, 512, cmd_pipe);
pid_t pid = strtoul(buf, NULL, 10);
pclose( cmd_pipe );
if (pid)
sleep(2);
else
break;
}
// kill other processes
system("pkill -9 cpimport.bin");
}
else
{
processList::iterator listPtr;
processList* aPtr = config.monitoredListPtr();
listPtr = aPtr->begin();
for (; listPtr != aPtr->end(); ++listPtr)
{
if ((*listPtr).ProcessName == processName)
{
if ( (*listPtr).processID <= 1 )
{
log.writeLog(__LINE__, "ERROR: process not active", LOG_TYPE_DEBUG );
break;
}
reinitProcess((*listPtr).processID, (*listPtr).ProcessName, actIndicator);
break;
}
}
if (listPtr == aPtr->end())
{
log.writeLog(__LINE__, "ERROR: No such process: " + processName, LOG_TYPE_ERROR );
}
}
log.writeLog(__LINE__, "PROCREINITPROCESS: completed, no ack to ProcMgr");
break;
}
case STOPALL:
{
msg >> manualFlag;
log.writeLog(__LINE__, "MSG RECEIVED: Stop All process request...");
if ( actIndicator == STATUS_UPDATE )
{
//check and send notification
MonitorConfig config;
if ( config.moduleType() == "um" )
oam.sendDeviceNotification(config.moduleName(), START_UM_DOWN);
else if ( gOAMParentModuleFlag )
oam.sendDeviceNotification(config.moduleName(), START_PM_MASTER_DOWN);
else if (gOAMStandbyModuleFlag)
oam.sendDeviceNotification(config.moduleName(), START_PM_STANDBY_DOWN);
else
oam.sendDeviceNotification(config.moduleName(), START_PM_COLD_DOWN);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) STOPALL;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "STOPALL: ACK back to ProcMgr, STATUS_UPDATE only performed");
break;
}
//get local module run-type
string runType = oam::LOADSHARE; //default
try
{
ModuleTypeConfig moduletypeconfig;
oam.getSystemConfig(config.moduleType(), moduletypeconfig);
runType = moduletypeconfig.RunType;
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
//Loop reversely through the process list, stop all processes
processList* aPtr = config.monitoredListPtr();
processList::reverse_iterator rPtr;
uint16_t rtnCode;
int requestStatus = API_SUCCESS;
for (rPtr = aPtr->rbegin(); rPtr != aPtr->rend(); ++rPtr)
{
if ( (*rPtr).BootLaunch == INIT_LAUNCH)
//skip
continue;
if ( (*rPtr).BootLaunch == BOOT_LAUNCH &&
gOAMParentModuleFlag )
if ( actIndicator != INSTALL )
//skip
continue;
// update local process state here so monitor thread doesn't jump on it
if ( manualFlag )
{
(*rPtr).state = oam::MAN_OFFLINE;
(*rPtr).dieCounter = 0;
}
else
(*rPtr).state = oam::AUTO_OFFLINE;
rtnCode = stopProcess((*rPtr).processID, (*rPtr).ProcessName, (*rPtr).ProcessLocation, actIndicator, manualFlag);
if (rtnCode)
// error in stopping a process
requestStatus = API_FAILURE;
else
(*rPtr).processID = 0;
}
//reset BRM locks and clearShm
if ( requestStatus == oam::API_SUCCESS )
{
string logdir("/var/log/mariadb/columnstore");
if (access(logdir.c_str(), W_OK) != 0) logdir = tmpLogDir;
string cmd = "reset_locks > " + logdir + "/reset_locks.log1 2>&1";
system(cmd.c_str());
log.writeLog(__LINE__, "BRM reset_locks script run", LOG_TYPE_DEBUG);
if ( !gOAMParentModuleFlag )
{
cmd = "clearShm -c > /dev/null 2>&1";
rtnCode = system(cmd.c_str());
if (WEXITSTATUS(rtnCode) != 1)
{
log.writeLog(__LINE__, "Successfully ran DBRM clearShm", LOG_TYPE_DEBUG);
}
else
log.writeLog(__LINE__, "Error running DBRM clearShm", LOG_TYPE_ERROR);
}
//stop the mysqld daemon
try
{
oam.actionMysqlCalpont(MYSQL_STOP);
log.writeLog(__LINE__, "Stop MySQL Process", LOG_TYPE_DEBUG);
}
catch (...)
{}
//send down notification
oam.sendDeviceNotification(config.moduleName(), MODULE_DOWN);
//setModule status to offline
if ( manualFlag )
{
try
{
oam.setModuleStatus(config.moduleName(), oam::MAN_OFFLINE);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
else
{
try
{
oam.setModuleStatus(config.moduleName(), oam::AUTO_OFFLINE);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
else
{
//setModule status to failed
try
{
oam.setModuleStatus(config.moduleName(), oam::FAILED);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
if ( config.moduleType() == "pm" )
{
//go unmount disk NOT assigned to this pm
unmountExtraDBroots();
}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) STOPALL;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "STOPALL: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case STARTALL:
{
msg >> manualFlag;
int requestStatus = oam::API_SUCCESS;
log.writeLog(__LINE__, "MSG RECEIVED: Start All process request...");
//start the mysqld daemon
try
{
oam.actionMysqlCalpont(MYSQL_START);
}
catch (...)
{
// mysqld didn't start, return with error
// mysql didn't start, return with error
log.writeLog(__LINE__, "STARTALL: MySQL failed to start, start-module failure", LOG_TYPE_CRITICAL);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) STARTALL;
ackMsg << (ByteStream::byte) oam::API_FAILURE;
mq.write(ackMsg);
try
{
oam.setProcessStatus("mysqld", config.moduleName(), oam::FAILED, 0);
}
catch (...)
{}
log.writeLog(__LINE__, "STARTALL: ACK back to ProcMgr, return status = " + oam.itoa((int) oam::API_FAILURE));
break;
}
if ( config.moduleType() == "pm" )
{
//setup DBRoot mounts
createDataDirs(cloud);
int ret = checkDataMount();
if (ret != oam::API_SUCCESS)
{
int ret_status = oam::API_FAILURE;
log.writeLog(__LINE__, "checkDataMount error, startmodule failed", LOG_TYPE_CRITICAL);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) STARTALL;
ackMsg << (ByteStream::byte) ret_status;
mq.write(ackMsg);
log.writeLog(__LINE__, "STARTALL: ACK back to ProcMgr, return status = " + oam.itoa((int) oam::API_FAILURE));
break;
}
}
//Loop through all Process belong to this module
processList::iterator listPtr;
processList* aPtr = config.monitoredListPtr();
listPtr = aPtr->begin();
requestStatus = API_SUCCESS;
//launched any processes controlled by ProcMon that aren't active
for (; listPtr != aPtr->end(); ++listPtr)
{
if ( (*listPtr).BootLaunch != BOOT_LAUNCH)
continue;
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(config.moduleName(), opState, degraded);
if (opState == oam::FAILED)
{
requestStatus = oam::API_FAILURE;
break;
}
//check the process current status & start the requested process
if ((*listPtr).state == oam::MAN_OFFLINE ||
(*listPtr).state == oam::AUTO_OFFLINE ||
(*listPtr).state == oam::COLD_STANDBY ||
(*listPtr).state == oam::INITIAL)
{
//Check for SIMPLEX runtype processes
int initType = checkSpecialProcessState( (*listPtr).ProcessName, (*listPtr).RunType, (*listPtr).ProcessModuleType );
if ( initType == oam::COLD_STANDBY )
{
//there is a mate active, skip
(*listPtr).state = oam::COLD_STANDBY;
requestStatus = API_SUCCESS;
//sleep(1);
continue;
}
pid_t processID = startProcess((*listPtr).ProcessModuleType,
(*listPtr).ProcessName,
(*listPtr).ProcessLocation,
(*listPtr).ProcessArgs,
(*listPtr).launchID,
(*listPtr).BootLaunch,
(*listPtr).RunType,
(*listPtr).DepProcessName,
(*listPtr).DepModuleName,
(*listPtr).LogFile,
initType);
// StorageManager doesn't send the "I'm online" msg to Proc*.
// Just mark it active for now. TODO: make it use the ping fcn in IDB* instead.
if (listPtr->ProcessName == "StorageManager")
oam.setProcessStatus("StorageManager", boost::get<0>(oam.getModuleInfo()),
oam::ACTIVE, processID);
if ( processID > oam::API_MAX )
{
processID = oam::API_SUCCESS;
}
requestStatus = processID;
if ( requestStatus != oam::API_SUCCESS )
{
// error in launching a process
break;
}
// sleep(1);
}
}
if ( requestStatus == oam::API_SUCCESS )
{
//launched any processes controlled by ProcMgr
for (listPtr = aPtr->begin(); listPtr != aPtr->end(); ++listPtr)
{
if ((*listPtr).BootLaunch != MGR_LAUNCH)
continue;
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(config.moduleName(), opState, degraded);
if (opState == oam::FAILED)
{
requestStatus = oam::API_FAILURE;
break;
}
//check the process current status & start the requested process
if ((*listPtr).state == oam::MAN_OFFLINE ||
(*listPtr).state == oam::AUTO_OFFLINE ||
(*listPtr).state == oam::COLD_STANDBY ||
(*listPtr).state == oam::INITIAL)
{
//Check for SIMPLEX runtype processes
int initType = checkSpecialProcessState( (*listPtr).ProcessName, (*listPtr).RunType, (*listPtr).ProcessModuleType );
if ( initType == oam::COLD_STANDBY )
{
//there is a mate active, skip
(*listPtr).state = oam::COLD_STANDBY;
requestStatus = API_SUCCESS;
//sleep(1);
continue;
}
pid_t processID = startProcess((*listPtr).ProcessModuleType,
(*listPtr).ProcessName,
(*listPtr).ProcessLocation,
(*listPtr).ProcessArgs,
(*listPtr).launchID,
(*listPtr).BootLaunch,
(*listPtr).RunType,
(*listPtr).DepProcessName,
(*listPtr).DepModuleName,
(*listPtr).LogFile,
initType);
// StorageManager doesn't send the "I'm online" msg to Proc*.
// Just mark it active for now. TODO: make it use the ping fcn in IDB* instead.
if (listPtr->ProcessName == "StorageManager")
oam.setProcessStatus("StorageManager", boost::get<0>(oam.getModuleInfo()),
oam::ACTIVE, processID);
if ( processID > oam::API_MAX )
processID = oam::API_SUCCESS;
requestStatus = processID;
if ( requestStatus != oam::API_SUCCESS )
{
// error in launching a process
if ( requestStatus == oam::API_FAILURE &&
(*listPtr).RunType == SIMPLEX)
checkModuleFailover((*listPtr).ProcessName);
else
break;
}
else
{
//run startup test script to perform basic DB sanity testing
if ( gOAMParentModuleFlag
&& (*listPtr).ProcessName == "DBRMWorkerNode"
&& opState != oam::MAN_INIT )
{
if ( runStartupTest() != oam::API_SUCCESS )
{
requestStatus = oam::API_FAILURE_DB_ERROR;
break;
}
}
}
// sleep(2);
}
else
{
// if DBRMWorkerNode and ACTIVE, run runStartupTest
if ( gOAMParentModuleFlag
&& (*listPtr).ProcessName == "DBRMWorkerNode"
&& (*listPtr).state == oam::ACTIVE
&& opState != oam::MAN_INIT )
{
if ( runStartupTest() != oam::API_SUCCESS )
{
requestStatus = oam::API_FAILURE_DB_ERROR;
break;
}
}
}
}
}
if ( requestStatus == oam::API_SUCCESS )
{
//check and send noitification
MonitorConfig config;
if ( config.moduleType() == "um" )
oam.sendDeviceNotification(config.moduleName(), UM_ACTIVE);
else if ( gOAMParentModuleFlag )
oam.sendDeviceNotification(config.moduleName(), PM_MASTER_ACTIVE);
else if (gOAMStandbyModuleFlag)
oam.sendDeviceNotification(config.moduleName(), PM_STANDBY_ACTIVE);
else
oam.sendDeviceNotification(config.moduleName(), PM_COLD_ACTIVE);
}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) STARTALL;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "STARTALL: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case SHUTDOWNMODULE:
{
msg >> manualFlag;
log.writeLog(__LINE__, "MSG RECEIVED: Shutdown Module request...");
//Loop reversely thorugh the process list
/* processList* aPtr = config.monitoredListPtr();
processList::reverse_iterator rPtr;
uint16_t rtnCode;
for (rPtr = aPtr->rbegin(); rPtr != aPtr->rend(); ++rPtr)
{
// don't shut yourself or ProcessManager down"
if ((*rPtr).ProcessName == "ProcessMonitor" || (*rPtr).ProcessName == "ProcessManager")
continue;
// update local process state
if ( manualFlag )
(*rPtr).state = oam::MAN_OFFLINE;
else
(*rPtr).state = oam::AUTO_OFFLINE;
rtnCode = stopProcess((*rPtr).processID, (*rPtr).ProcessName, (*rPtr).ProcessLocation, actIndicator, manualFlag);
if (rtnCode)
log.writeLog(__LINE__, "Process cannot be stopped:" + (*rPtr).ProcessName, LOG_TYPE_DEBUG);
else
(*rPtr).processID = 0;
}
//send down notification
oam.sendDeviceNotification(config.moduleName(), MODULE_DOWN);
//stop the mysqld daemon and then columnstore
try {
oam.actionMysqlCalpont(MYSQL_STOP);
}
catch(...)
{}
*/
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) SHUTDOWNMODULE;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "SHUTDOWNMODULE: ACK back to ProcMgr, return status = " + oam.itoa((int) API_SUCCESS));
//sleep to give time for process-manager to finish up
sleep(5);
string cmd = "columnstore stop > /dev/null 2>&1";
system(cmd.c_str());
exit (0);
break;
}
default:
break;
} //end of switch
break;
}
case PROCUPDATELOG:
{
string action;
string level;
msg >> action;
msg >> level;
log.writeLog(__LINE__, "MSG RECEIVED: " + action + " logging at level " + level);
uint16_t rtnCode;
int requestStatus = API_SUCCESS;
rtnCode = updateLog(action, level);
if (rtnCode)
// error in updating log
requestStatus = API_FAILURE;
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCUPDATELOG;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCUPDATELOG: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case PROCGETCONFIGLOG:
{
log.writeLog(__LINE__, "MSG RECEIVED: Get Module Log Configuration data");
int16_t requestStatus = getConfigLog();
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCGETCONFIGLOG;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCGETCONFIGLOG: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case CHECKPOWERON:
{
log.writeLog(__LINE__, "MSG RECEIVED: Check Power-On Test results log file");
checkPowerOnResults();
break;
}
case PROCUPDATECONFIG:
{
log.writeLog(__LINE__, "MSG RECEIVED: Update Process Configuration");
int16_t requestStatus = updateConfig();
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCUPDATECONFIG;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCUPDATECONFIG: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case PROCBUILDSYSTEMTABLES:
{
log.writeLog(__LINE__, "MSG RECEIVED: Check and Build System Tables");
int16_t requestStatus = buildSystemTables();
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCBUILDSYSTEMTABLES;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCBUILDSYSTEMTABLES: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
break;
}
case LOCALHEARTBEAT:
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) LOCALHEARTBEAT;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
break;
}
case CONFIGURE:
{
log.writeLog(__LINE__, "MSG RECEIVED: Configure Module");
string configureModuleName;
msg >> configureModuleName;
int requestStatus = API_SUCCESS;
configureModule(configureModuleName);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) CONFIGURE;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "CONFIGURE: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
//set startmodule flag
startProcMon = true;
break;
}
case RECONFIGURE:
{
log.writeLog(__LINE__, "MSG RECEIVED: Reconfigure Module");
string reconfigureModuleName;
msg >> reconfigureModuleName;
uint16_t rtnCode;
int requestStatus = API_SUCCESS;
//validate that I should be receiving this message
if ( config.moduleType() != "um" &&
config.moduleType() != "pm" )
requestStatus = oam::API_FAILURE;
else
{
if ( config.moduleType() == "um" &&
reconfigureModuleName.find("pm") == string::npos )
requestStatus = oam::API_FAILURE;
else
{
if ( config.moduleType() == "pm" &&
reconfigureModuleName.find("um") == string::npos )
requestStatus = oam::API_FAILURE;
else
{
rtnCode = reconfigureModule(reconfigureModuleName);
if (rtnCode)
// error in updating log
requestStatus = rtnCode;
}
}
}
// install mysqld rpms if being reconfigured as a um
if ( reconfigureModuleName.find("um") != string::npos )
{
string cmd = "post-mysqld-install >> " + tmpLogDir + "/rpminstall";
system(cmd.c_str());
cmd = "post-mysql-install >> " + tmpLogDir + "/rpminstall";
system(cmd.c_str());
int ret = system("systemctl cat mariadb.service > /dev/null 2>&1");
if (!ret)
{
cmd = "systemctl start mariadb.service > " + tmpLogDir + "/mysqldstart";
system(cmd.c_str());
}
else
{
cmd = "/usr/bin/mysqld_safe & > " + tmpLogDir + "/mysqldstart";
system(cmd.c_str());
}
string tmpFile = tmpLogDir + "/mysqldstart";
ifstream file (tmpFile.c_str());
if (!file)
{
requestStatus = oam::API_FAILURE;
log.writeLog(__LINE__, "RECONFIGURE: mysqld failed to start", LOG_TYPE_ERROR);
}
else
{
char line[200];
string buf;
int count = 0;
while (file.getline(line, 200))
{
buf = line;
if ( buf.find("OK", 0) != string::npos )
count++;
}
file.close();
if (count == 0)
{
requestStatus = oam::API_FAILURE;
log.writeLog(__LINE__, "RECONFIGURE: mysqld failed to start", LOG_TYPE_ERROR);
}
else
log.writeLog(__LINE__, "RECONFIGURE: install started mysqld");
}
}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) RECONFIGURE;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "RECONFIGURE: ACK back to ProcMgr, return status = " + oam.itoa((int) requestStatus));
//now exit so Process Monitor can restart and reinitialzation as the New module type
if ( requestStatus != oam::API_FAILURE )
{
log.writeLog(__LINE__, "RECONFIGURE: ProcMon exiting so it can restart as new module type", LOG_TYPE_DEBUG);
// sleep(1);
exit(1);
}
break;
}
case GETSOFTWAREINFO:
{
log.writeLog(__LINE__, "MSG RECEIVED: Get Calpont Software Info");
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) GETSOFTWAREINFO;
ackMsg << config.SoftwareVersion() + config.SoftwareRelease();
mq.write(ackMsg);
log.writeLog(__LINE__, "GETSOFTWAREINFO: ACK back to ProcMgr with " + config.SoftwareVersion() + config.SoftwareRelease());
break;
}
case UPDATEPARENTNFS:
{
string IPAddress;
msg >> IPAddress;
log.writeLog(__LINE__, "MSG RECEIVED: Update fstab file with new Parent OAM IP of " + IPAddress);
int requestStatus = API_SUCCESS;
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) UPDATEPARENTNFS;
ackMsg << (ByteStream::byte) requestStatus;
mq.write(ackMsg);
log.writeLog(__LINE__, "UPDATEPARENTNFS: ACK back to ProcMgr");
break;
}
case OAMPARENTACTIVE:
{
log.writeLog(__LINE__, "MSG RECEIVED: OAM Parent Activate");
runStandby = false;
log.writeLog(__LINE__, "Running Active", LOG_TYPE_INFO);
//give time for Status Control thread to start reading incoming messages
sleep(3);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) OAMPARENTACTIVE;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "OAMPARENTACTIVE: ACK back to ProcMgr");
MonitorConfig config;
break;
}
case UPDATECONFIGFILE:
{
log.writeLog(__LINE__, "MSG RECEIVED: Update Calpont Config file");
(void)updateConfigFile(msg);
log.writeLog(__LINE__, "UPDATECONFIGFILE: Completed");
MonitorConfig config;
break;
}
case GETPARENTOAMMODULE:
{
log.writeLog(__LINE__, "MSG RECEIVED: Get Parent OAM Module");
MonitorConfig config;
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) GETPARENTOAMMODULE;
ackMsg << config.OAMParentName();
mq.write(ackMsg);
log.writeLog(__LINE__, "GETPARENTOAMMODULE: ACK back with " + config.OAMParentName());
break;
}
case OAMPARENTCOLD:
{
log.writeLog(__LINE__, "MSG RECEIVED: OAM Parent Standby ");
runStandby = true;
// delete any old active alarm log file
unlink ("/var/log/mariadb/columnstore/activeAlarms");
log.writeLog(__LINE__, "Running Standby", LOG_TYPE_INFO);
//give time for Status Control thread to start reading incoming messages
sleep(3);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) OAMPARENTCOLD;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "OAMPARENTCOLD: ACK back to ProcMgr");
MonitorConfig config;
break;
}
case PROCUNMOUNT:
{
string dbrootID;
msg >> dbrootID;
log.writeLog(__LINE__, "MSG RECEIVED: Unmount DBRoot: " + dbrootID);
//Flush the cache
cacheutils::flushPrimProcCache();
cacheutils::dropPrimProcFdCache();
flushInodeCache();
int return_status = API_SUCCESS;
if (DataRedundancyConfig == "n")
{
int retry = 1;
string tmpUmount = tmpLogDir + "/umount.log";
for ( ; retry < 5 ; retry++)
{
string cmd = "export LC_ALL=C;" + SUDO + "umount /var/lib/columnstore/data" + dbrootID + " > " + tmpUmount + " 2>&1";
system(cmd.c_str());
return_status = API_SUCCESS;
if (!oam.checkLogStatus(tmpUmount, "busy"))
break;
cmd = "lsof /var/lib/columnstore/data" + dbrootID + " >> " + tmpUmount + " 2>&1";
system(cmd.c_str());
cmd = "fuser -muvf /var/lib/columnstore/data" + dbrootID + " >> " + tmpUmount + " 2>&1";
system(cmd.c_str());
sleep(2);
//Flush the cache
cacheutils::flushPrimProcCache();
cacheutils::dropPrimProcFdCache();
flushInodeCache();
}
if ( retry >= 5 )
{
log.writeLog(__LINE__, "unmount failed, device busy, DBRoot: " + dbrootID, LOG_TYPE_ERROR);
return_status = API_FAILURE;
string cmd = "mv -f " + tmpUmount + " " + tmpUmount + "failed";
system(cmd.c_str());
}
}
else
{
try
{
int ret = glusterUnassign(dbrootID);
if ( ret != 0 )
log.writeLog(__LINE__, "Error unassigning gluster dbroot# " + dbrootID, LOG_TYPE_ERROR);
else
log.writeLog(__LINE__, "Gluster unassign gluster dbroot# " + dbrootID);
}
catch (...)
{
log.writeLog(__LINE__, "Exception unassigning gluster dbroot# " + dbrootID, LOG_TYPE_ERROR);
}
}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCUNMOUNT;
ackMsg << (ByteStream::byte) return_status;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCUNMOUNT: ACK back to ProcMgr, status: " + oam.itoa(return_status));
break;
}
case PROCMOUNT:
{
string dbrootID;
msg >> dbrootID;
log.writeLog(__LINE__, "MSG RECEIVED: Mount DBRoot: " + dbrootID);;
int return_status = API_SUCCESS;
if (DataRedundancyConfig == "n")
{
string tmpMount = tmpLogDir + "/mount.log";
string cmd = SUDO + "export LC_ALL=C;" + SUDO + "mount /var/lib/columnstore/data" + dbrootID + " > " + tmpMount + "2>&1";
system(cmd.c_str());
if ( !rootUser )
{
cmd = SUDO + "chown -R " + USER + ":" + USER + " /var/lib/columnstore/data" + dbrootID + " > /dev/null 2>&1";
system(cmd.c_str());
}
return_status = API_SUCCESS;
ifstream in(tmpMount.c_str());
in.seekg(0, std::ios::end);
int size = in.tellg();
if ( size != 0 )
{
if (!oam.checkLogStatus(tmpMount, "already"))
{
log.writeLog(__LINE__, "mount failed, DBRoot: " + dbrootID, LOG_TYPE_ERROR);
return_status = API_FAILURE;
string cmd = "mv -f " + tmpMount + " " + tmpMount + "failed";
system(cmd.c_str());
}
}
}
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCMOUNT;
ackMsg << (ByteStream::byte) return_status;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCMOUNT: ACK back to ProcMgr, status: " + oam.itoa(return_status));
break;
}
case PROCFSTABUPDATE:
{
string entry;
msg >> entry;
//check if entry already exist in /etc/fstab
string cmd = "grep " + entry + " /etc/fstab /dev/null 2>&1";
int status = system(cmd.c_str());
if (WEXITSTATUS(status) != 0 )
{
//chmod before update, used on amazon ami EBS. not other systems
system("sudo chmod 666 /etc/fstab");
cmd = "echo " + entry + " >> /etc/fstab";
system(cmd.c_str());
log.writeLog(__LINE__, "Add line entry to /etc/fstab : " + entry);
}
//check if entry already exist in ../local/etc/pm1/fstab
cmd = "grep " + entry + "/var/lib/columnstore/local/etc/pm1/fstab /dev/null 2>&1";
status = system(cmd.c_str());
if (WEXITSTATUS(status) != 0 )
{
cmd = "echo " + entry + " >> /var/lib/columnstore/local/etc/pm1/fstab";
system(cmd.c_str());
log.writeLog(__LINE__, "Add line entry to /var/lib/columnstore/local/etc/pm1/fstab : " + entry);
}
//mkdir on entry directory
string::size_type pos = entry.find(" ", 0);
string::size_type pos1 = entry.find(" ", pos + 1);
string directory = entry.substr(pos + 1, pos1 - pos);
cmd = "mkdir " + directory;
system(cmd.c_str());
log.writeLog(__LINE__, "create directory: " + directory);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCFSTABUPDATE;
ackMsg << (ByteStream::byte) API_SUCCESS;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCFSTABUPDATE: ACK back to ProcMgr");
break;
}
case MASTERREP:
{
log.writeLog(__LINE__, "MSG RECEIVED: Run Master Replication script ");
string masterLogFile = oam::UnassignedName;
string masterLogPos = oam::UnassignedName;
if ( ( (PMwithUM == "n") && (config.moduleType() == "pm") ) &&
( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM) )
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) MASTERREP;
ackMsg << (ByteStream::byte) oam::API_FAILURE;
ackMsg << masterLogFile;
ackMsg << masterLogPos;
mq.write(ackMsg);
log.writeLog(__LINE__, "MASTERREP: Error PM invalid msg - ACK back to ProcMgr return status = " + oam.itoa((int) oam::API_FAILURE));
break;
}
//change local my.cnf file
int ret;
int retry;
for ( retry = 0 ; retry < 3 ; retry++ )
{
ret = changeMyCnf("master");
if ( ret == oam::API_FAILURE )
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) MASTERREP;
ackMsg << (ByteStream::byte) ret;
ackMsg << masterLogFile;
ackMsg << masterLogPos;
mq.write(ackMsg);
log.writeLog(__LINE__, "MASTERREP: Error in changeMyCnf - ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
// run Master Rep script
ret = runMasterRep(masterLogFile, masterLogPos);
if ( ret == API_FAILURE )
{
log.writeLog(__LINE__, "MASTERREP: runMasterRep failure, retry", LOG_TYPE_ERROR);
sleep(5);
continue;
}
else
break;
}
if ( retry >= 3 )
log.writeLog(__LINE__, "MASTERREP: runMasterRep failure", LOG_TYPE_CRITICAL);
else
runDisableRep(); //disable slave on new master
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) MASTERREP;
ackMsg << (ByteStream::byte) ret;
ackMsg << masterLogFile;
ackMsg << masterLogPos;
mq.write(ackMsg);
log.writeLog(__LINE__, "MASTERREP: ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
case SLAVEREP:
{
log.writeLog(__LINE__, "MSG RECEIVED: Run Slave Replication script ");
string masterLogFile;
msg >> masterLogFile;
string masterLogPos;
msg >> masterLogPos;
if ( ( (PMwithUM == "n") && (config.moduleType() == "pm") ) &&
( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM) )
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) SLAVEREP;
ackMsg << (ByteStream::byte) oam::API_FAILURE;
mq.write(ackMsg);
log.writeLog(__LINE__, "SLAVEREP: Error PM invalid msg - ACK back to ProcMgr return status = " + oam.itoa((int) oam::API_FAILURE));
break;
}
//change local my.cnf file
int ret = changeMyCnf("slave");
if ( ret == oam::API_FAILURE )
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) SLAVEREP;
ackMsg << (ByteStream::byte) ret;
mq.write(ackMsg);
log.writeLog(__LINE__, "SLAVEREP: Error in changeMyCnf - ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
// run Slave Rep script
ret = runSlaveRep(masterLogFile, masterLogPos);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) SLAVEREP;
ackMsg << (ByteStream::byte) ret;
mq.write(ackMsg);
log.writeLog(__LINE__, "SLAVEREP: ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
case MASTERDIST:
{
log.writeLog(__LINE__, "MSG RECEIVED: Run Master DB Distribute command ");
string password;
msg >> password;
string module;
msg >> module;
if ( ( (PMwithUM == "n") && (config.moduleType() == "pm") ) &&
( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM) )
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) MASTERDIST;
ackMsg << (ByteStream::byte) oam::API_FAILURE;
mq.write(ackMsg);
log.writeLog(__LINE__, "MASTERDIST: runMasterRep - ACK back to ProcMgr return status = " + oam.itoa((int) oam::API_FAILURE));
}
if ( password == oam::UnassignedName )
password = "ssh";
// run Master Dist
int ret = runMasterDist(password, module);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) MASTERDIST;
ackMsg << (ByteStream::byte) ret;
mq.write(ackMsg);
log.writeLog(__LINE__, "MASTERDIST: runMasterRep - ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
case DISABLEREP:
{
log.writeLog(__LINE__, "MSG RECEIVED: Disable MySQL Replication ");
//change local my.cnf file
int ret = changeMyCnf("disable");
// run Disable rep
ret = runDisableRep();
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) DISABLEREP;
ackMsg << (ByteStream::byte) ret;
mq.write(ackMsg);
log.writeLog(__LINE__, "DISABLEREP: ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
case PROCGLUSTERASSIGN:
{
string dbrootID;
msg >> dbrootID;
log.writeLog(__LINE__, "MSG RECEIVED: Gluster Assign DBRoot: " + dbrootID);
// run Master Dist
int ret = glusterAssign(dbrootID);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCGLUSTERASSIGN;
ackMsg << (ByteStream::byte) ret;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCGLUSTERASSIGN: ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
case PROCGLUSTERUNASSIGN:
{
string dbrootID;
msg >> dbrootID;
log.writeLog(__LINE__, "MSG RECEIVED: Gluster Unassign DBRoot: " + dbrootID);
// run Master Dist
int ret = glusterUnassign(dbrootID);
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) PROCGLUSTERUNASSIGN;
ackMsg << (ByteStream::byte) ret;
mq.write(ackMsg);
log.writeLog(__LINE__, "PROCGLUSTERUNASSIGN: ACK back to ProcMgr return status = " + oam.itoa((int) ret));
break;
}
case SYNCFSALL:
{
log.writeLog(__LINE__, "MSG RECEIVED: SYNC FileSystem...");
int requestStatus = API_SUCCESS;
requestStatus = syncFS();
if (requestStatus == API_SUCCESS)
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) SYNCFSALL;
ackMsg << (ByteStream::byte) API_SUCCESS;
}
else
{
ackMsg << (ByteStream::byte) ACK;
ackMsg << (ByteStream::byte) SYNCFSALL;
ackMsg << (ByteStream::byte) API_FAILURE;
}
mq.write(ackMsg);
log.writeLog(__LINE__, "SYNCFSALL: ACK back to ProcMgr, return status = " + oam.itoa((int) API_SUCCESS));
break;
}
default:
break;
} //end of switch
return;
}
/******************************************************************************************
* @brief stopProcess
*
* purpose: stop a process
*
******************************************************************************************/
int ProcessMonitor::stopProcess(pid_t processID, std::string processName, std::string processLocation, int actionIndicator, bool manualFlag)
{
int status;
MonitorLog log;
Oam oam;
log.writeLog(__LINE__, "STOPPING Process: " + processName, LOG_TYPE_DEBUG);
sendAlarm(processName, PROCESS_INIT_FAILURE, CLEAR);
if ( manualFlag )
{
// Mark the process offline
updateProcessInfo(processName, oam::MAN_OFFLINE, 0);
// Set the alarm
sendAlarm(processName, PROCESS_DOWN_MANUAL, SET);
}
else
{
// Mark the process offline
updateProcessInfo(processName, oam::AUTO_OFFLINE, 0);
// Set the alarm
sendAlarm(processName, PROCESS_DOWN_AUTO, SET);
}
// bypass if pid = 0
if ( processID == 0 )
status = API_SUCCESS;
else
{
// XXXPAT: StorageManager shouldn't be killed with KILL, or there's a chance of data corruption.
// once we minimize that chance, we could allow KILL to be sent.
if (actionIndicator == GRACEFUL || processName == "StorageManager")
{
status = kill(processID, SIGTERM);
}
else
{
status = kill(processID, SIGKILL);
}
// processID not found, set as success
if ( errno == ESRCH)
status = API_SUCCESS;
if ( status != API_SUCCESS)
{
status = errno;
log.writeLog(__LINE__, "Failure to stop Process: " + processName + ", error = " + oam.itoa(errno), LOG_TYPE_ERROR );
}
}
// sending sigkill to StorageManager right after sigterm would not allow it to
// exit gracefully. This will wait until StorageManager goes down to prevent
// weirdness that I suspect will happen if we combine a slow connection with a restart
// command.
if (processName == "StorageManager" && processID != 0)
{
while (status == API_SUCCESS)
{
sleep(1);
ostringstream os;
os << "Waiting for StorageManager to exit gracefully... pid is " << processID;
log.writeLog(__LINE__, os.str(), LOG_TYPE_DEBUG);
status = kill(processID, SIGTERM);
break;
}
return API_SUCCESS;
}
//now do a pkill on process just to make sure all is clean
string::size_type pos = processLocation.find("bin/", 0);
string procName = processLocation.substr(pos + 4, 15) + "\\*";
string cmd = "pkill -9 " + procName;
system(cmd.c_str());
log.writeLog(__LINE__, "Pkill Process just to make sure: " + procName, LOG_TYPE_DEBUG);
return status;
}
/******************************************************************************************
* @brief startProcess
*
* purpose: Start a process
*
******************************************************************************************/
pid_t ProcessMonitor::startProcess(string processModuleType, string processName, string processLocation,
string arg_list[MAXARGUMENTS], uint16_t launchID, uint16_t BootLaunch,
string RunType, string DepProcessName[MAXDEPENDANCY],
string DepModuleName[MAXDEPENDANCY], string LogFile, uint16_t initType, uint16_t actIndicator)
{
// Compiler complains about non-initialiased variable here.
pid_t newProcessID = 0;
char* argList[MAXARGUMENTS];
unsigned int i = 0;
MonitorLog log;
MonitorConfig currentConfig;
unsigned int numAugs = 0;
Oam oam;
SystemProcessStatus systemprocessstatus;
ProcessStatus processstatus;
Config *cs_config = Config::makeConfig();
string DBRootStorageType = cs_config->getConfig("Installation", "DBRootStorageType");
log.writeLog(__LINE__, "STARTING Process: " + processName, LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "Process location: " + processLocation, LOG_TYPE_DEBUG);
//check process location
if (access(processLocation.c_str(), X_OK) != 0)
{
log.writeLog(__LINE__, "Process location: " + processLocation + " not found", LOG_TYPE_ERROR);
//record the process information into processList
config.buildList(processModuleType, processName, processLocation, arg_list,
launchID, newProcessID, FAILED, BootLaunch, RunType,
DepProcessName, DepModuleName, LogFile);
//Update Process Status: Mark Process INIT state
updateProcessInfo(processName, FAILED, newProcessID);
return oam::API_FAILURE;
}
//check process dependency
if (DepProcessName[i].length() != 0)
{
for (int i = 0; i < MAXDEPENDANCY; i++)
{
//Get dependent process status
if (DepProcessName[i].length() == 0)
{
break;
}
//check for System wild card on Module Name
if ( DepModuleName[i] == "*" )
{
//check for all accurrances of this Module Name
try
{
oam.getProcessStatus(systemprocessstatus);
for ( unsigned int j = 0 ; j < systemprocessstatus.processstatus.size(); j++)
{
if ( systemprocessstatus.processstatus[j].ProcessName == DepProcessName[i] )
{
log.writeLog(__LINE__, "Dependent process of " + DepProcessName[i] + "/" + systemprocessstatus.processstatus[j].Module + " is " + oam.itoa(systemprocessstatus.processstatus[j].ProcessOpState), LOG_TYPE_DEBUG);
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(systemprocessstatus.processstatus[j].Module, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED ||
opState == oam::AUTO_OFFLINE)
continue;
if (systemprocessstatus.processstatus[j].ProcessOpState != oam::ACTIVE )
{
log.writeLog(__LINE__, "Dependent Process is not in correct state, Failed Restoral", LOG_TYPE_DEBUG);
return oam::API_MINOR_FAILURE;
}
}
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
}
else
{
//check for a Module Type wildcard
if ( DepModuleName[i].find("*") != string::npos)
{
string moduleName = DepModuleName[i].substr(0, MAX_MODULE_TYPE_SIZE);
try
{
oam.getProcessStatus(systemprocessstatus);
for ( unsigned int j = 0 ; j < systemprocessstatus.processstatus.size(); j++)
{
if ( systemprocessstatus.processstatus[j].ProcessName == DepProcessName[i]
&& systemprocessstatus.processstatus[j].Module.find(moduleName, 0) != string::npos)
{
log.writeLog(__LINE__, "Dependent process of " + DepProcessName[i] + "/" + systemprocessstatus.processstatus[j].Module + " is " + oam.itoa(systemprocessstatus.processstatus[j].ProcessOpState), LOG_TYPE_DEBUG);
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(systemprocessstatus.processstatus[j].Module, opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED ||
opState == oam::AUTO_OFFLINE)
continue;
if (systemprocessstatus.processstatus[j].ProcessOpState != oam::ACTIVE )
{
log.writeLog(__LINE__, "Dependent Process is not in correct state, Failed Restoral", LOG_TYPE_DEBUG);
return oam::API_MINOR_FAILURE;
}
}
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
}
else
{
//check for a Current Module wildcard
if ( DepModuleName[i] == "@")
{
int state = oam::ACTIVE;;
try
{
ProcessStatus procstat;
oam.getProcessStatus(DepProcessName[i],
config.moduleName(),
procstat);
state = procstat.ProcessOpState;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
log.writeLog(__LINE__, "Dependent process of " + DepProcessName[i] + "/" + config.moduleName() + " is " + oam.itoa(state), LOG_TYPE_DEBUG);
if (state == oam::FAILED)
{
log.writeLog(__LINE__, "Dependent Process is FAILED state, Hard Failed Restoral", LOG_TYPE_DEBUG);
//record the process information into processList
config.buildList(processModuleType, processName, processLocation, arg_list,
launchID, newProcessID, FAILED, BootLaunch, RunType,
DepProcessName, DepModuleName, LogFile);
//Update Process Status: Mark Process INIT state
updateProcessInfo(processName, FAILED, newProcessID);
return oam::API_FAILURE;
}
if (state != oam::ACTIVE)
{
log.writeLog(__LINE__, "Dependent Process is not in correct state, Failed Restoral", LOG_TYPE_DEBUG);
return oam::API_MINOR_FAILURE;
}
}
else
{
// specific module name and process dependency
int state = oam::ACTIVE;
try
{
ProcessStatus procstat;
oam.getProcessStatus(DepProcessName[i],
DepModuleName[i],
procstat);
state = procstat.ProcessOpState;
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
return oam::API_MINOR_FAILURE;
}
log.writeLog(__LINE__, "Dependent process of " + DepProcessName[i] + "/" + DepModuleName[i] + " is " + oam.itoa(state), LOG_TYPE_DEBUG);
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(DepModuleName[i], opState, degraded);
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED ||
opState == oam::AUTO_OFFLINE)
continue;
if (state != oam::ACTIVE)
{
log.writeLog(__LINE__, "Dependent Process is not in correct state, Failed Restoral", LOG_TYPE_DEBUG);
return oam::API_MINOR_FAILURE;
}
}
}
}
}//end of FOR
}
for (i = 0; i < MAXARGUMENTS - 1; i++)
{
if (arg_list[i].length() == 0)
break;
//check if workernode argument, if so setup argument #2 as the slave ID for this module
string::size_type pos = arg_list[i].find("DBRM_Worker", 0);
if (pos != string::npos)
{
try
{
int slavenodeID = oam.getLocalDBRMID(config.moduleName());
arg_list[i] = "DBRM_Worker" + oam.itoa(slavenodeID);
log.writeLog(__LINE__, "getLocalDBRMID Worker Node ID = " + oam.itoa(slavenodeID), LOG_TYPE_DEBUG);
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getLocalDBRMID: no DBRM for module", LOG_TYPE_ERROR);
//record the process information into processList
config.buildList(processModuleType, processName, processLocation, arg_list,
launchID, newProcessID, FAILED, BootLaunch, RunType,
DepProcessName, DepModuleName, LogFile);
//Update Process Status: Mark Process INIT state
updateProcessInfo(processName, FAILED, newProcessID);
return oam::API_FAILURE;
}
}
argList[i] = new char[arg_list[i].length() + 1];
strcpy(argList[i], arg_list[i].c_str()) ;
// log.writeLog(__LINE__, "Arg list ");
// log.writeLog(__LINE__, argList[i]);
numAugs++;
}
argList[i] = NULL;
//run load-brm script before brm processes started
if ( actIndicator != oam::GRACEFUL)
{
if ( ( gOAMParentModuleFlag && processName == "DBRMControllerNode") ||
( !gOAMParentModuleFlag && processName == "DBRMWorkerNode") )
{
string DBRMDir;
// string tempDBRMDir = startup::StartUp::installDir() + "/data/dbrm";
// get DBRMroot config setting
string DBRMroot;
oam.getSystemConfig("DBRMRoot", DBRMroot);
string::size_type pos = DBRMroot.find("/BRM_saves", 0);
if (pos != string::npos)
//get directory path
DBRMDir = DBRMroot.substr(0, pos);
else
{
log.writeLog(__LINE__, "Error: /BRM_saves not found in DBRMRoot config setting", LOG_TYPE_CRITICAL);
//record the process information into processList
config.buildList(processModuleType, processName, processLocation, arg_list,
launchID, newProcessID, FAILED, BootLaunch, RunType,
DepProcessName, DepModuleName, LogFile);
//Update Process Status: Mark Process INIT state
updateProcessInfo(processName, FAILED, newProcessID);
return oam::API_FAILURE;
}
//create dbrm directory, just to make sure its there
string cmd = "mkdir -p " + DBRMDir + " > /dev/null 2>&1";
system(cmd.c_str());
// if Non Parent OAM Module, get the dbmr data from Parent OAM Module
if ( !gOAMParentModuleFlag && !HDFS)
{
//create temp dbrm directory
// string cmd = "mkdir " + tempDBRMDir + " > /dev/null 2>&1";
// system(cmd.c_str());
//setup softlink for editem on the 'um' or shared-nothing non active pm
/* if( config.moduleType() == "um" ||
(config.moduleType() == "pm") ) {
cmd = "mv -f " + DBRMDir + " /root/ > /dev/null 2>&1";
system(cmd.c_str());
cmd = "ln -s " + tempDBRMDir + " " + DBRMDir + " > /dev/null 2>&1";
system(cmd.c_str());
}
*/
//change DBRMDir to temp DBRMDir
// DBRMDir = tempDBRMDir;
// remove all files for temp directory
// cmd = "rm -f " + DBRMDir + "/*";
// system(cmd.c_str());
// go request files from parent OAM module
if ( getDBRMdata(&DBRMDir) != oam::API_SUCCESS )
{
log.writeLog(__LINE__, "Error: getDBRMdata failed", LOG_TYPE_ERROR);
sendAlarm("DBRM", DBRM_LOAD_DATA_ERROR, SET);
return oam::API_MINOR_FAILURE;
}
// DBRMDir might have changed, so need to change DBRMroot
bf::path tmp(DBRMroot);
tmp = tmp.filename();
DBRMroot = (bf::path(DBRMDir) / tmp).string();
sendAlarm("DBRM", DBRM_LOAD_DATA_ERROR, CLEAR);
// change DBRMroot to temp DBRMDir path
// DBRMroot = tempDBRMDir + "/BRM_saves";
}
//
// run the 'load_brm' script first if files exist
//
string loadScript = "load_brm";
string fileName = DBRMroot + "_current";
ssize_t fileSize = IDBPolicy::size(fileName.c_str());
boost::scoped_ptr<IDBDataFile> oldFile(IDBDataFile::open(
IDBPolicy::getType(fileName.c_str(),
IDBPolicy::WRITEENG),
fileName.c_str(), "r", 0));
if (oldFile && fileSize > 0)
{
char line[200] = {0};
oldFile->pread(line, 0, fileSize - 1); // skip the \n
line[fileSize] = '\0'; // not necessary, but be sure.
// MCOL-1558 - the _current file is now relative to DBRMRoot
string dbrmFile;
if (line[0] == '/') // handle absolute paths (saved by an old version)
dbrmFile = line;
else
dbrmFile = DBRMroot.substr(0, DBRMroot.find_last_of('/') + 1) + line;
// if ( !gOAMParentModuleFlag ) {
// string::size_type pos = dbrmFile.find("/BRM_saves",0);
// if (pos != string::npos)
// dbrmFile = tempDBRMDir + dbrmFile.substr(pos,80);;
// }
string logdir("/var/log/mariadb/columnstore");
if (access(logdir.c_str(), W_OK) != 0) logdir = tmpLogDir;
string cmd = "reset_locks > " + logdir + "/reset_locks.log1 2>&1";
system(cmd.c_str());
log.writeLog(__LINE__, "BRM reset_locks script run", LOG_TYPE_DEBUG);
cmd = "clearShm -c > /dev/null 2>&1";
system(cmd.c_str());
log.writeLog(__LINE__, "Clear Shared Memory script run", LOG_TYPE_DEBUG);
cmd = loadScript + " " + dbrmFile + " > " + logdir + "/load_brm.log1 2>&1";
log.writeLog(__LINE__, loadScript + " cmd = " + cmd, LOG_TYPE_DEBUG);
system(cmd.c_str());
cmd = logdir + "/load_brm.log1";
if (oam.checkLogStatus(cmd, "OK"))
log.writeLog(__LINE__, "Successfully return from " + loadScript, LOG_TYPE_DEBUG);
else
{
log.writeLog(__LINE__, "Error return DBRM " + loadScript, LOG_TYPE_ERROR);
sendAlarm("DBRM", DBRM_LOAD_DATA_ERROR, SET);
//record the process information into processList
config.buildList(processModuleType, processName, processLocation, arg_list,
launchID, 0, FAILED, BootLaunch, RunType,
DepProcessName, DepModuleName, LogFile);
//Update Process Status: Mark Process FAILED state
updateProcessInfo(processName, FAILED, 0);
return oam::API_FAILURE;
}
// now delete the dbrm data from local disk
if ( !gOAMParentModuleFlag && !HDFS && DataRedundancyConfig == "n")
{
IDBFileSystem &fs = IDBPolicy::getFs(DBRMDir);
fs.remove(DBRMDir.c_str());
log.writeLog(__LINE__, "removed downloaded DBRM files at " + DBRMDir, LOG_TYPE_DEBUG);
#if 0
string cmd = "rm -f " + DBRMDir + "/*";
system(cmd.c_str());
log.writeLog(__LINE__, "removed DBRM file with command: " + cmd, LOG_TYPE_DEBUG);
#endif
}
}
else
log.writeLog(__LINE__, "No DBRM files exist, must be a initial startup", LOG_TYPE_DEBUG);
}
sendAlarm("DBRM", DBRM_LOAD_DATA_ERROR, CLEAR);
}
//do a pkill on process just to make sure there is no rouge version running
string::size_type pos = processLocation.find("bin/", 0);
string procName = processLocation.substr(pos + 4, 15) + "\\*";
string cmd = "pkill -9 " + procName;
system(cmd.c_str());
log.writeLog(__LINE__, "Pkill Process just to make sure: " + procName, LOG_TYPE_DEBUG);
//Update Process Status: Mark Process INIT state
updateProcessInfo(processName, initType, 0);
//sleep, give time for INIT state to be update, prevent race condition with ACTIVE
sleep(1);
//check and setup for logfile
time_t now;
now = time(NULL);
struct tm tm;
localtime_r(&now, &tm);
char timestamp[200];
strftime (timestamp, 200, "%m:%d:%y-%H:%M:%S", &tm);
string logdir("/var/log/mariadb/columnstore");
if (access(logdir.c_str(), W_OK) != 0) logdir = tmpLogDir;
string outFileName = logdir + "/" + processName + ".out";
string errFileName = logdir + "/" + processName + ".err";
string saveoutFileName = outFileName + "." + timestamp + ".log1";
string saveerrFileName = errFileName + "." + timestamp + ".log1";
if ( LogFile == "off" )
{
string cmd = "mv " + outFileName + " " + saveoutFileName + " > /dev/null 2>&1";
system(cmd.c_str());
cmd = "mv " + errFileName + " " + saveerrFileName + " > /dev/null 2>&1";
system(cmd.c_str());
}
else
{
string cmd = "mv " + outFileName + " " + saveoutFileName + " > /dev/null 2>&1";
system(cmd.c_str());
cmd = "mv " + errFileName + " " + saveerrFileName + " > /dev/null 2>&1";
system(cmd.c_str());
}
//fork and exec new process
newProcessID = fork();
if (newProcessID != 0)
{
//
// parent processing
//
if ( newProcessID == -1)
{
log.writeLog(__LINE__, "New Process ID = -1, failed StartProcess", LOG_TYPE_DEBUG);
return oam::API_MINOR_FAILURE;
}
//FYI - NEEDS TO STAY HERE TO HAVE newProcessID
//record the process information into processList
config.buildList(processModuleType, processName, processLocation, arg_list,
launchID, newProcessID, initType, BootLaunch, RunType,
DepProcessName, DepModuleName, LogFile);
//Update Process Status: Update PID
updateProcessInfo(processName, PID_UPDATE, newProcessID);
log.writeLog(__LINE__, processName + " PID is " + oam.itoa(newProcessID), LOG_TYPE_DEBUG);
sendAlarm(processName, PROCESS_DOWN_MANUAL, CLEAR);
sendAlarm(processName, PROCESS_DOWN_AUTO, CLEAR);
sendAlarm(processName, PROCESS_INIT_FAILURE, CLEAR);
//give time to get status updates from process before starting next process
if ( processName == "DBRMWorkerNode" || processName == "ExeMgr" || processName == "DDLProc")
sleep(3);
else
{
if ( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM )
{
if ( processName == "PrimProc" || processName == "WriteEngineServer")
sleep(3);
}
else
{
if ( (PMwithUM == "y") && processName == "PrimProc" )
sleep(3);
}
}
for (i = 0; i < numAugs; i++)
{
if (strlen(argList[i]) == 0)
break;
delete [] argList[i];
}
}
else
{
//
//child processing
//
//Close all files opened by parent process
for (int i = 0; i < sysconf(_SC_OPEN_MAX); i++)
{
close(i);
}
{
int fd;
fd = open("/dev/null", O_RDONLY);
if (fd != 0)
{
dup2(fd, 0);
close(fd);
}
if ( LogFile == "off" )
{
fd = open("/dev/null", O_WRONLY); //Should be fd 1
if ( fd != 1 )
{
dup2(fd, 1);
close(fd);
}
fd = open("/dev/null", O_WRONLY); //Should be fd 2
if ( fd != 2 )
{
dup2(fd, 2);
close(fd);
}
}
else
{
// open STDOUT & STDERR to log file
// log.writeLog(__LINE__, "STDOUT to " + outFileName, LOG_TYPE_DEBUG);
fd = open(outFileName.c_str(), O_CREAT | O_WRONLY, 0644);
if (fd != 1)
{
dup2(fd, 1);
close(fd);
}
// log.writeLog(__LINE__, "STDERR to " + errFileName, LOG_TYPE_DEBUG);
fd = open(errFileName.c_str(), O_CREAT | O_WRONLY, 0644);
if (fd != 2)
{
dup2(fd, 2);
close(fd);
}
}
}
//give time to get INIT status updated in shared memory
sleep(1);
execv(processLocation.c_str(), argList);
//record the process information into processList
config.buildList(processModuleType, processName, processLocation, arg_list,
launchID, newProcessID, FAILED, BootLaunch, RunType,
DepProcessName, DepModuleName, LogFile);
//Update Process Status: Mark Process INIT state
updateProcessInfo(processName, FAILED, newProcessID);
return (oam::API_FAILURE);
}
return newProcessID;
}
/******************************************************************************************
* @brief reinitProcess
*
* purpose: re-Init a process
*
******************************************************************************************/
int ProcessMonitor::reinitProcess(pid_t processID, std::string processName, int actionIndicator)
{
MonitorLog log;
log.writeLog(__LINE__, "REINITTING Process: " + processName, LOG_TYPE_DEBUG);
kill(processID, SIGHUP);
return API_SUCCESS;
}
/******************************************************************************************
* @brief stopAllProcess
*
* purpose: Stop all processes started by this monitor
*
* actionIndicator: GRACEFUL sends SIGTERM, any other value sends SIGKILL
*
* return: the kill() result when it equals API_SUCCESS, otherwise errno
******************************************************************************************/
int stopAllProcess(int actionIndicator)
{
    // The signal is sent with pid 0, as in kill(0, sig).
    const int signalToSend = (actionIndicator == GRACEFUL) ? SIGTERM : SIGKILL;
    const int killResult = kill(0, signalToSend);

    if (killResult == API_SUCCESS)
        return killResult;

    // kill() failed: hand the errno value back to the caller
    return errno;
}
/******************************************************************************************
* @brief sendMessage
*
* purpose: send message to the monitored process or the process manager
*
* note: placeholder - nothing is sent, both arguments are ignored and 0 is
*       always returned
******************************************************************************************/
int ProcessMonitor::sendMessage(const string& toWho, const string& message)
{
    return 0;
}
/******************************************************************************************
* @brief checkHeartBeat
*
* purpose: check child process heart beat
*
* note: placeholder - no check is performed, processName is ignored and 0 is
*       always returned
******************************************************************************************/
int ProcessMonitor::checkHeartBeat(const string processName)
{
    return 0;
}
/******************************************************************************************
* @brief sendAlarm
*
* purpose: send a trap and log the process information
*
* The alarm is reported from a separate thread running sendAlarmThread, so
* this call returns without waiting for the report to be sent.
*
* alarmItem: device/process name the alarm is raised on
* alarmID: alarm identifier
* action: SET or CLEAR
******************************************************************************************/
void ProcessMonitor::sendAlarm(string alarmItem, ALARMS alarmID, int action)
{
    MonitorLog log;
    Oam oam;

    // Heap copy of the arguments; sendAlarmThread deletes it when it is done.
    sendAlarmInfo_t* t1 = new sendAlarmInfo_t;
    *t1 = boost::make_tuple(alarmItem, alarmID, action);

    pthread_t SendAlarmThread;
    int status = pthread_create (&SendAlarmThread, NULL, (void* (*)(void*)) &sendAlarmThread, t1);

    if ( status != 0 )
    {
        log.writeLog(__LINE__, "SendAlarmThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
        // The thread never started, so nobody else will free the tuple.
        delete t1;
        return;
    }

    // The thread is never joined; detach it so its resources are released
    // when it exits instead of accumulating with every alarm.
    pthread_detach(SendAlarmThread);
}
/******************************************************************************************
* @brief sendAlarmThread
*
* purpose: send a trap and log the process information
*
* t: heap-allocated (alarmItem, alarmID, action) tuple; deleted here
*
* Runs under ALARM_LOCK and always finishes through pthread_exit(0).
******************************************************************************************/
void* sendAlarmThread(sendAlarmInfo_t* t)
{
    MonitorLog log;
    Oam oam;
    ALARMManager alarmMgr;

    pthread_mutex_lock(&ALARM_LOCK);

    // unpack the request
    string device = boost::get<0>(*t);
    ALARMS id = boost::get<1>(*t);
    int requestedAction = boost::get<2>(*t);

    const bool idIsValid = !( id < 1 || id > oam::MAX_ALARM_ID );

    if ( idIsValid )
    {
        // anything that is not SET is logged as CLEAR
        const char* logPrefix = ( requestedAction == SET ) ? "Send SET Alarm ID "
                                : "Send CLEAR Alarm ID ";
        log.writeLog(__LINE__, logPrefix + oam.itoa(id) + " on device " + device, LOG_TYPE_DEBUG);

        try
        {
            alarmMgr.sendAlarmReport(device.c_str(), id, requestedAction);
        }
        catch (...)
        {
            log.writeLog(__LINE__, "EXCEPTION ERROR on sendAlarmReport", LOG_TYPE_ERROR );
        }
    }
    else
    {
        log.writeLog(__LINE__, "sendAlarmThread error: Invalid alarm ID", LOG_TYPE_DEBUG);
    }

    // common exit path: free the request, release the lock, end the thread
    delete t;
    pthread_mutex_unlock(&ALARM_LOCK);
    pthread_exit(0);
}
/******************************************************************************************
* @brief updateProcessInfo
*
* purpose: Send msg to update process state and status change time on disk
*
* processName: process whose status is updated
* state: new state; MAN_OFFLINE and AUTO_OFFLINE are sent from a separate
*        thread, every other state is sent before this call returns
* PID: process ID reported with the state
*
* return: always true
******************************************************************************************/
bool ProcessMonitor::updateProcessInfo(std::string processName, int state, pid_t PID)
{
    MonitorLog log;
    Oam oam;

    log.writeLog(__LINE__, "StatusUpdate of Process " + processName + " State = " + oam.itoa(state) + " PID = " + oam.itoa(PID), LOG_TYPE_DEBUG);

    // Heap copy of the arguments; sendProcessThread deletes it when it is done.
    sendProcessInfo_t* t1 = new sendProcessInfo_t;
    *t1 = boost::make_tuple(processName, state, PID);

    // if state is offline, use thread for faster results
    if ( state == oam::MAN_OFFLINE || state == oam::AUTO_OFFLINE )
    {
        pthread_t SendProcessThread;
        int status = pthread_create (&SendProcessThread, NULL, (void* (*)(void*)) &sendProcessThread, t1);

        if ( status == 0 )
        {
            // The thread is never joined; detach it so its resources are
            // released when it returns.
            pthread_detach(SendProcessThread);
            return true;
        }

        log.writeLog(__LINE__, "SendProcessThread: pthread_create failed, return status = " + oam.itoa(status), LOG_TYPE_ERROR);
        // No thread was started: fall through and send the update from this
        // thread so the state change is not lost and the tuple is freed.
    }

    sendProcessThread(t1);
    return true;
}
/******************************************************************************************
* @brief sendProcessThread
*
* purpose: Send msg to update process state and status change time on disk
*
* t: heap-allocated (processName, state, PID) tuple; deleted here
*
* return: always NULL; exceptions from setProcessStatus are logged, not
*         propagated
******************************************************************************************/
void* sendProcessThread(sendProcessInfo_t* t)
{
    MonitorLog log;
    MonitorConfig config;
    Oam oam;

    // pthread_mutex_lock(&PROCESS_LOCK);

    // copy the request out, then release it - nothing below needs the tuple
    string name = boost::get<0>(*t);
    int newState = boost::get<1>(*t);
    pid_t pid = boost::get<2>(*t);
    delete t;

    try
    {
        oam.setProcessStatus(name, config.moduleName(), newState, pid);
    }
    catch (exception& ex)
    {
        log.writeLog(__LINE__, "EXCEPTION ERROR on setProcessStatus: " + string(ex.what()), LOG_TYPE_ERROR);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "EXCEPTION ERROR on setProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR );
    }

    // pthread_mutex_unlock(&PROCESS_LOCK);

    return NULL;
}
/******************************************************************************************
* @brief updateLog
*
* purpose: Enable/Disable Logging configuration within the syslog.conf file
*
* action: LOG_ENABLE / LOG_DISABLE
* level: all, critical, error, warning, info, debug, data
******************************************************************************************/
int ProcessMonitor::updateLog(std::string action, std::string level)
{
MonitorLog log;
Oam oam;
struct flock fl;
int fd;
string fileName;
oam.getSystemConfig("SystemLogConfigFile", fileName);
if (fileName == oam::UnassignedName )
{
// unassigned
log.writeLog(__LINE__, "ERROR: syslog file not configured ", LOG_TYPE_ERROR );
return -1;
}
string::size_type pos = fileName.find("syslog-ng", 0);
if (pos != string::npos)
{
// not supported
log.writeLog(__LINE__, "ERROR: config file not support, " + fileName, LOG_TYPE_ERROR );
return -1;
}
bool syslog7 = false;
pos = fileName.find("49", 0);
if (pos != string::npos)
{
syslog7 = true;
}
vector <string> lines;
if ( level == "data" )
return 0;
ifstream oldFile (fileName.c_str());
if (!oldFile )
{
// no file found
log.writeLog(__LINE__, "ERROR: syslog file not found at " + fileName, LOG_TYPE_ERROR );
return -1;
}
//if non-root, change file permissions so we can update it
if ( !rootUser)
{
string cmd = "chmod 666 " + fileName + " > /dev/null 2>&1";
system(cmd.c_str());
}
char line[200];
string buf;
bool restart = true;
if ( action == oam::ENABLEDSTATE )
{
// check if enabling all log levels
if ( level == "all")
{
vector <int> fileIDs;
while (oldFile.getline(line, 200))
{
buf = line;
for ( int i = 0;; i++)
{
string localLogFile = oam::LogFile[i];
if (syslog7)
localLogFile = oam::LogFile7[i];
if ( localLogFile == "" )
{
// end of list
break;
}
string logFile = localLogFile;
pos = buf.find(logFile, 0);
if (pos != string::npos)
{
// log file already there, save fileID
fileIDs.push_back(i);
break;
}
}
// output to temp file
lines.push_back(buf);
} //end of while
// check which fileIDs aren't in the syslog.conf
bool update = false;
for ( int i = 0;; i++)
{
bool found = false;
string localLogFile = oam::LogFile[i];
if (syslog7)
localLogFile = oam::LogFile7[i];
if ( localLogFile == "" )
{
// end of list
break;
}
vector<int>::iterator p = fileIDs.begin();
while ( p != fileIDs.end() )
{
if ( i == *p )
{
//already there
found = true;
break;
}
p++;
}
if (!found)
{
lines.push_back(localLogFile);
log.writeLog(__LINE__, "Add in syslog.conf log file " + localLogFile, LOG_TYPE_DEBUG);
update = true;
}
}
if (!update)
{
log.writeLog(__LINE__, "Log level file's already in syslog.conf", LOG_TYPE_DEBUG);
restart = false;
}
}
else
{
// enable a specific level
// get log file for level
for ( int i = 0;; i++)
{
if ( oam::LogLevel[i] == "" )
{
// end of list
log.writeLog(__LINE__, "ERROR: log level file not found for level " + level, LOG_TYPE_ERROR );
oldFile.close();
return -1;
}
if ( level == oam::LogLevel[i] )
{
// match found
string localLogFile = oam::LogFile[i];
if (syslog7)
localLogFile = oam::LogFile7[i];
string logFile = localLogFile;
while (oldFile.getline(line, 200))
{
buf = line;
string::size_type pos = buf.find(logFile, 0);
if (pos != string::npos)
{
log.writeLog(__LINE__, "Log level file already in syslog.conf", LOG_TYPE_DEBUG);
restart = false;
}
// output to temp file
lines.push_back(buf);
}
// file not found, add at the bottom of syslog.conf
lines.push_back(logFile);
break;
}
}
}
}
else
{
// DISABLE LOG
// check if disabling all log levels
if ( level == "all")
{
bool update = false;
while (oldFile.getline(line, 200))
{
buf = line;
bool found = false;
for ( int i = 0;; i++)
{
string localLogFile = oam::LogFile[i];
if (syslog7)
localLogFile = oam::LogFile7[i];
if ( localLogFile == "" )
{
// end of list
break;
}
string logFile = localLogFile;
pos = buf.find(logFile, 0);
if (pos != string::npos)
{
// log file found
found = true;
update = true;
break;
}
}
if (!found)
// output to temp file
lines.push_back(buf);
} //end of while
if (!update)
{
log.writeLog(__LINE__, "No Log level file's in syslog.conf", LOG_TYPE_DEBUG);
restart = false;
}
}
else
{
// disable a specific level
// get log file for level
for ( int i = 0;; i++)
{
if ( oam::LogLevel[i] == "" )
{
// end of list
log.writeLog(__LINE__, "ERROR: log level file not found for level " + level, LOG_TYPE_ERROR );
oldFile.close();
return -1;
}
if ( level == oam::LogLevel[i] )
{
// match found
string localLogFile = oam::LogFile[i];
if (syslog7)
localLogFile = oam::LogFile7[i];
string logFile = localLogFile;
bool found = false;
while (oldFile.getline(line, 200))
{
buf = line;
string::size_type pos = buf.find(logFile, 0);
if (pos != string::npos)
{
// file found, don't push into new file
log.writeLog(__LINE__, "Log level file to DISABLE found in syslog.conf", LOG_TYPE_DEBUG);
found = true;
}
else
{
// no match, push into new temp file
lines.push_back(buf);
}
}
if (found)
break;
else
{
log.writeLog(__LINE__, "Log level file not in syslog.conf", LOG_TYPE_DEBUG);
restart = false;
}
}
}
}
}
oldFile.close();
//
// go write out new file if required
//
if (restart)
{
unlink (fileName.c_str());
ofstream newFile (fileName.c_str());
memset(&fl, 0, sizeof(fl));
fl.l_type = F_RDLCK; // read lock
fl.l_whence = SEEK_SET;
fl.l_start = 0;
fl.l_len = 0; //lock whole file
// create new file
if ((fd = open(fileName.c_str(), O_RDWR | O_CREAT, 0644)) >= 0)
{
// lock file
if (fcntl(fd, F_SETLKW, &fl) != 0)
{
ostringstream oss;
oss << "ProcessMonitor::updateLog: error locking file " <<
fileName <<
": " <<
strerror(errno) <<
", proceding anyway.";
cerr << oss.str() << endl;
}
copy(lines.begin(), lines.end(), ostream_iterator<string>(newFile, "\n"));
newFile.close();
fl.l_type = F_UNLCK; //unlock
fcntl(fd, F_SETLK, &fl);
close(fd);
oam.syslogAction("restart");
}
else
{
ostringstream oss;
oss << "ProcessMonitor::updateLog: error opening file " <<
fileName <<
": " <<
strerror(errno);
throw runtime_error(oss.str());
}
}
//update file priviledges
changeModLog();
return 0;
}
/******************************************************************************************
* @brief changeModLog
*
* purpose: Updates the file mods so files can be read/write
* from external modules
*
******************************************************************************************/
void ProcessMonitor::changeModLog()
{
for ( int i = 0;; i++)
{
if ( oam::LogFile[i].empty() )
//end of list
break;
string logFile = oam::LogFile[i];
string::size_type pos = logFile.find('/', 0);
logFile = logFile.substr(pos, 200);
string cmd = "chmod 755 " + logFile + " > /dev/null 2>&1";
system(cmd.c_str());
}
return;
}
/******************************************************************************************
* @brief getConfigLog
*
* purpose: Get Logging configuration within the syslog.conf file
*
******************************************************************************************/
int ProcessMonitor::getConfigLog()
{
MonitorLog log;
Oam oam;
string fileName;
oam.getSystemConfig("SystemLogConfigFile", fileName);
if (fileName == oam::UnassignedName )
{
// unassigned
log.writeLog(__LINE__, "ERROR: syslog file not configured ", LOG_TYPE_ERROR );
return API_FAILURE;
}
string::size_type pos = fileName.find("syslog-ng", 0);
if (pos != string::npos)
{
// not supported
log.writeLog(__LINE__, "ERROR: config file not support, " + fileName, LOG_TYPE_ERROR );
return API_FAILURE;
}
ifstream oldFile (fileName.c_str());
if (!oldFile)
{
// no file found
log.writeLog(__LINE__, "ERROR: syslog file not found at " + fileName, LOG_TYPE_ERROR );
return API_FILE_OPEN_ERROR;
}
int configData = 0;
char line[200];
string buf;
while (oldFile.getline(line, 200))
{
buf = line;
for ( int i = 0;; i++)
{
if ( oam::LogFile[i] == "" )
{
// end of list
break;
}
string logFile = oam::LogFile[i];
logFile = logFile.substr(14, 80);
string::size_type pos = buf.find(logFile, 0);
if (pos != string::npos)
{
// match found
switch (i + 1)
{
case 1:
configData = configData | LEVEL_CRITICAL;
break;
case 2:
configData = configData | LEVEL_ERROR;
break;
case 3:
configData = configData | LEVEL_WARNING;
break;
case 4:
configData = configData | LEVEL_INFO;
break;
case 5:
configData = configData | LEVEL_DEBUG;
break;
default:
break;
} //end of switch
}
}
} //end of while
oldFile.close();
// adjust data to be different from API RETURN CODES
configData = configData + API_MAX;
return configData;
}
/******************************************************************************************
* @brief checkPowerOnResults
*
* purpose: Read Power-On Test results log file and report issues via Alarms
*
* Each line is searched for "name:<name>|", then "level:<level>|", then an
* optional "info:<text>". Lines missing the name or level field are skipped.
* A level containing "3" raises POWERON_TEST_SEVERE, one containing "2"
* raises POWERON_TEST_WARNING, one containing "1" is only logged.
******************************************************************************************/
void ProcessMonitor::checkPowerOnResults()
{
    string POWERON_TEST_RESULTS_FILE = "/var/lib/columnstore/st_status";
    MonitorLog log;

    ifstream oldFile (POWERON_TEST_RESULTS_FILE.c_str());

    if (!oldFile)
    {
        // no file found
        log.writeLog(__LINE__, "ERROR: Power-On Test results file not found at " + POWERON_TEST_RESULTS_FILE, LOG_TYPE_ERROR );
        return;
    }

    // Read with std::getline into a string: a fixed-size char buffer ends
    // the loop at the first over-long line and drops every later result.
    string buf;

    while (getline(oldFile, buf))
    {
        string name;
        string level;
        string info = "";

        // extract name key word; skip the line when it is missing or unterminated
        string::size_type pos = buf.find("name:", 0);

        if (pos == string::npos)
            continue;

        string::size_type pos1 = buf.find("|", pos);

        if (pos1 == string::npos)
            continue;

        name = buf.substr(pos + 5, pos1 - (pos + 5));

        // extract level key word, searched for after the name field
        pos = buf.find("level:", pos1);

        if (pos == string::npos)
            continue;

        pos1 = buf.find("|", pos);

        if (pos1 == string::npos)
            continue;

        level = buf.substr(pos + 6, pos1 - (pos + 6));

        // extract info key word, if any exist (at most 200 characters)
        pos = buf.find("info:", pos1);

        if (pos != string::npos)
            info = buf.substr(pos + 5, 200);

        // log findings
        {
            LoggingID lid(18);
            MessageLog ml(lid);
            Message msg;
            Message::Args args;
            args.add("Power-On self test: name = ");
            args.add(name);
            args.add(", level = ");
            args.add(level);
            args.add(", info = ");
            args.add(info);
            msg.format(args);
            ml.logDebugMessage(msg);
        }

        // Issue alarm based on level; the highest matching level wins
        if (level.find("3", 0) != string::npos)
        {
            // Severe Warning found, Issue alarm
            sendAlarm(name, POWERON_TEST_SEVERE, SET);

            //Log this event
            LoggingID lid(18);
            MessageLog ml(lid);
            Message msg;
            Message::Args args;
            args.add("Power-On self test Severe Warning on ");
            args.add(name);
            args.add(": ");
            args.add(info);
            msg.format(args);
            ml.logWarningMessage(msg);
        }
        else if (level.find("2", 0) != string::npos)
        {
            // Warning found, Issue alarm
            sendAlarm(name, POWERON_TEST_WARNING, SET);

            //Log this event
            LoggingID lid(18);
            MessageLog ml(lid);
            Message msg;
            Message::Args args;
            args.add("Power-On self test Warning on ");
            args.add(name);
            args.add(": ");
            args.add(info);
            msg.format(args);
            ml.logWarningMessage(msg);
        }
        else if (level.find("1", 0) != string::npos)
        {
            // Info found, Log this event
            LoggingID lid(18);
            MessageLog ml(lid);
            Message msg;
            Message::Args args;
            args.add("Power-On self test Info on ");
            args.add(name);
            args.add(": ");
            args.add(info);
            msg.format(args);
            ml.logInfoMessage(msg);
        }
    } //end of while

    oldFile.close();
}
/******************************************************************************************
* @brief updateConfig
*
* purpose: Update Config data from disk
*
******************************************************************************************/
int ProcessMonitor::updateConfig()
{
//ProcMon log file
MonitorLog log;
MonitorConfig currentConfig;
// ProcessMonitor aMonitor(config, log);
Oam oam;
//Read ProcessConfig file to get process list belong to this process monitor
SystemProcessConfig systemprocessconfig;
try
{
oam.getProcessConfig(systemprocessconfig);
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: " + error, LOG_TYPE_ERROR );
}
catch (...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessConfig: Caught unknown exception!", LOG_TYPE_ERROR );
}
//Update a map for application launch ID for this Process-Monitor
string OAMParentModuleType = currentConfig.OAMParentName().substr(0, MAX_MODULE_TYPE_SIZE);
string systemModuleType = config.moduleName().substr(0, MAX_MODULE_TYPE_SIZE);
for ( unsigned int i = 0 ; i < systemprocessconfig.processconfig.size(); i++)
{
if (systemprocessconfig.processconfig[i].ModuleType == systemModuleType
|| ( systemprocessconfig.processconfig[i].ModuleType == "um" &&
systemModuleType == "pm" && PMwithUM == "y" )
|| systemprocessconfig.processconfig[i].ModuleType == "ChildExtOAMModule"
|| ( systemprocessconfig.processconfig[i].ModuleType == "ChildOAMModule" )
|| (systemprocessconfig.processconfig[i].ModuleType == "ParentOAMModule" &&
systemModuleType == OAMParentModuleType) )
{
if ( systemprocessconfig.processconfig[i].ModuleType == "um" &&
systemModuleType == "pm" && PMwithUM == "y" &&
systemprocessconfig.processconfig[i].ProcessName == "DMLProc" )
continue;
if ( systemprocessconfig.processconfig[i].ModuleType == "um" &&
systemModuleType == "pm" && PMwithUM == "y" &&
systemprocessconfig.processconfig[i].ProcessName == "DDLProc" )
continue;
ProcessStatus processstatus;
try
{
//Get the process information
oam.getProcessStatus(systemprocessconfig.processconfig[i].ProcessName, config.moduleName(), processstatus);
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR );
return API_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR );
return API_FAILURE;
}
config.buildList(systemprocessconfig.processconfig[i].ModuleType,
systemprocessconfig.processconfig[i].ProcessName,
systemprocessconfig.processconfig[i].ProcessLocation,
systemprocessconfig.processconfig[i].ProcessArgs,
systemprocessconfig.processconfig[i].LaunchID,
processstatus.ProcessID,
processstatus.ProcessOpState,
systemprocessconfig.processconfig[i].BootLaunch,
systemprocessconfig.processconfig[i].RunType,
systemprocessconfig.processconfig[i].DepProcessName,
systemprocessconfig.processconfig[i].DepModuleName,
systemprocessconfig.processconfig[i].LogFile);
}
}
return API_SUCCESS;
}
/******************************************************************************************
* @brief buildSystemTables
*
* purpose: Check for and build System Tables if not there
* Only will be run from 'pm1'
*
******************************************************************************************/
int ProcessMonitor::buildSystemTables()
{
Oam oam;
string DBdir;
oam.getSystemConfig("DBRoot1", DBdir);
string fileName = DBdir + "/000.dir";
//check if postConfigure or dbbuilder is already running
string cmd = "ps aux | grep postConfigure | grep -v grep";
int rtnCode = system(cmd.c_str());
if (WEXITSTATUS(rtnCode) == 0)
return API_ALREADY_IN_PROGRESS;
cmd = "ps aux | grep dbbuilder | grep -v grep";
rtnCode = system(cmd.c_str());
if (WEXITSTATUS(rtnCode) == 0)
return API_ALREADY_IN_PROGRESS;
if (!IDBPolicy::exists(fileName.c_str()))
{
string logdir("/var/log/mariadb/columnstore");
if (access(logdir.c_str(), W_OK) != 0) logdir = tmpLogDir;
string cmd = "dbbuilder 7 > " + logdir + "/dbbuilder.log &";
system(cmd.c_str());
log.writeLog(__LINE__, "buildSystemTables: dbbuilder 7 Successfully Launched", LOG_TYPE_DEBUG);
return API_SUCCESS;
}
log.writeLog(__LINE__, "buildSystemTables: System Tables Already Exist", LOG_TYPE_DEBUG );
return API_FILE_ALREADY_EXIST;
}
/******************************************************************************************
* @brief reconfigureModule
*
* purpose: reconfigure Module functionality
* Edit the moduleFile file with new Module Name
*
******************************************************************************************/
int ProcessMonitor::reconfigureModule(std::string reconfigureModuleName)
{
Oam oam;
//create custom files
string dir = "/var/lib/columnstore/local/etc/" + reconfigureModuleName;
string cmd = "mkdir " + dir + " > /dev/null 2>&1";
system(cmd.c_str());
cmd = "rm -f " + dir + "/* > /dev/null 2>&1";
system(cmd.c_str());
if ( reconfigureModuleName.find("um") != string::npos)
{
cmd = "cp /var/lib/columnstore/local/etc/um1/* " + dir + "/.";
system(cmd.c_str());
}
else
{
cmd = "cp /var/lib/columnstore/local/etc/pm1/* " + dir + "/.";
system(cmd.c_str());
}
//update module file
string fileName = "/var/lib/columnstore/local/module";
unlink (fileName.c_str());
ofstream newFile (fileName.c_str());
cmd = "echo " + reconfigureModuleName + " > " + fileName;
system(cmd.c_str());
newFile.close();
return API_SUCCESS;
}
/******************************************************************************************
* @brief configureModule
*
* purpose: configure Module functionality
* Edit the moduleFile file with new Module Name
*
******************************************************************************************/
int ProcessMonitor::configureModule(std::string configureModuleName)
{
Oam oam;
//update module file
string fileName = "/var/lib/columnstore/local/module";
unlink (fileName.c_str());
ofstream newFile (fileName.c_str());
string cmd = "echo " + configureModuleName + " > " + fileName;
system(cmd.c_str());
newFile.close();
return API_SUCCESS;
}
/******************************************************************************************
* @brief checkSpecialProcessState
*
* purpose: Check if a SIMPLEX runtype Process has mates already up
*
******************************************************************************************/
int ProcessMonitor::checkSpecialProcessState( std::string processName, std::string runType, string processModuleType )
{
MonitorLog log;
MonitorConfig config;
Oam oam;
SystemProcessStatus systemprocessstatus;
ProcessStatus processstatus;
int retStatus = oam::MAN_INIT;
if ( runType == SIMPLEX || runType == ACTIVE_STANDBY)
{
log.writeLog(__LINE__, "checkSpecialProcessState on : " + processName, LOG_TYPE_DEBUG);
if ( runType == SIMPLEX && PMwithUM == "y" && processModuleType == "um" && gOAMParentModuleFlag)
retStatus = oam::COLD_STANDBY;
else if ( runType == SIMPLEX && gOAMParentModuleFlag )
retStatus = oam::MAN_INIT;
else if ( runType == ACTIVE_STANDBY && processModuleType == "ParentOAMModule" &&
( gOAMParentModuleFlag || !runStandby ) )
retStatus = oam::MAN_INIT;
else if ( runType == ACTIVE_STANDBY && processModuleType == "ParentOAMModule" && config.OAMStandbyParentFlag() )
retStatus = oam::STANDBY;
else if ( runType == ACTIVE_STANDBY && processModuleType == "ParentOAMModule" )
retStatus = oam::COLD_STANDBY;
else if ( runType == SIMPLEX && processModuleType == "ParentOAMModule" && !gOAMParentModuleFlag)
retStatus = oam::COLD_STANDBY;
else
{
//simplex on a non um1 or non-parent-pm
if ( processName == "DMLProc" || processName == "DDLProc" )
{
string PrimaryUMModuleName;
try
{
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
}
catch (...) {}
if ( PrimaryUMModuleName != config.moduleName() )
{
retStatus = oam::COLD_STANDBY;
}
}
if ( retStatus != oam::COLD_STANDBY )
{
try
{
oam.getProcessStatus(systemprocessstatus);
for ( unsigned int i = 0 ; i < systemprocessstatus.processstatus.size(); i++)
{
if ( systemprocessstatus.processstatus[i].ProcessName == processName &&
systemprocessstatus.processstatus[i].Module != config.moduleName() )
{
if ( systemprocessstatus.processstatus[i].ProcessOpState == ACTIVE ||
systemprocessstatus.processstatus[i].ProcessOpState == MAN_INIT ||
systemprocessstatus.processstatus[i].ProcessOpState == AUTO_INIT ||
// systemprocessstatus.processstatus[i].ProcessOpState == MAN_OFFLINE ||
// systemprocessstatus.processstatus[i].ProcessOpState == INITIAL ||
systemprocessstatus.processstatus[i].ProcessOpState == BUSY_INIT)
{
// found a ACTIVE or going ACTIVE mate
if ( runType == oam::SIMPLEX )
// SIMPLEX
retStatus = oam::COLD_STANDBY;
else
{
// ACTIVE_STANDBY
for ( unsigned int j = 0 ; j < systemprocessstatus.processstatus.size(); j++)
{
if ( systemprocessstatus.processstatus[j].ProcessName == processName &&
systemprocessstatus.processstatus[j].Module != config.moduleName() )
{
if ( systemprocessstatus.processstatus[j].ProcessOpState == STANDBY ||
systemprocessstatus.processstatus[j].ProcessOpState == STANDBY_INIT)
// FOUND ACTIVE AND STANDBY
retStatus = oam::COLD_STANDBY;
}
}
// FOUND AN ACTIVE, BUT NO STANDBY
log.writeLog(__LINE__, "checkSpecialProcessState, set STANDBY on " + processName, LOG_TYPE_DEBUG);
retStatus = oam::STANDBY;
}
}
}
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
}
if ( retStatus == oam::COLD_STANDBY || retStatus == oam::STANDBY )
{
updateProcessInfo(processName, retStatus, 0);
sendAlarm(processName, PROCESS_DOWN_MANUAL, CLEAR);
sendAlarm(processName, PROCESS_DOWN_AUTO, CLEAR);
sendAlarm(processName, PROCESS_INIT_FAILURE, CLEAR);
}
log.writeLog(__LINE__, "checkSpecialProcessState status return : " + oam.itoa(retStatus), LOG_TYPE_DEBUG);
return retStatus;
}
/******************************************************************************************
* @brief checkMateModuleState
*
* purpose: Check if Mate Module is Active
*
******************************************************************************************/
int ProcessMonitor::checkMateModuleState()
{
MonitorLog log;
// MonitorConfig config;
// ProcessMonitor aMonitor(config, log);
Oam oam;
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus, false);
for ( unsigned int i = 0 ; i < systemstatus.systemmodulestatus.modulestatus.size(); i++)
{
string moduleName = systemstatus.systemmodulestatus.modulestatus[i].Module;
string moduleType = moduleName.substr(0, MAX_MODULE_TYPE_SIZE);
int moduleID = atoi(moduleName.substr(MAX_MODULE_TYPE_SIZE, MAX_MODULE_ID_SIZE).c_str());
if ( moduleType == config.moduleType() && moduleID != config.moduleID() )
if ( systemstatus.systemmodulestatus.modulestatus[i].ModuleOpState == oam::ACTIVE )
return API_SUCCESS;
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
return API_FAILURE;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
return API_FAILURE;
}
return API_FAILURE;
}
/******************************************************************************************
* @brief createDataDirs
*
* purpose: Create the Calpont Data directories
*
*
******************************************************************************************/
int ProcessMonitor::createDataDirs(std::string cloud)
{
MonitorLog log;
Oam oam;
if ( config.moduleType() == "um" &&
( cloud == "amazon-ec2" || cloud == "amazon-vpc") )
{
string UMStorageType;
try
{
oam.getSystemConfig( "UMStorageType", UMStorageType);
}
catch (...) {}
if (UMStorageType == "external")
{
if (!amazonVolumeCheck())
{
return API_FAILURE;
}
}
}
if ( config.moduleType() == "pm" )
{
DBRootConfigList dbrootConfigList;
string DBRootStorageType;
try
{
oam.getSystemConfig( "DBRootStorageType", DBRootStorageType);
}
catch (...) {}
try
{
systemStorageInfo_t t;
t = oam.getStorageConfig();
if ( boost::get<1>(t) == 0 )
{
log.writeLog(__LINE__, "No dbroots are configured in Columnstore.xml file at proc mon startup time", LOG_TYPE_WARNING);
return API_INVALID_PARAMETER;
}
DeviceDBRootList moduledbrootlist = boost::get<2>(t);
DeviceDBRootList::iterator pt = moduledbrootlist.begin();
for ( ; pt != moduledbrootlist.end() ; pt++)
{
int moduleID = (*pt).DeviceID;
DBRootConfigList::iterator pt1 = (*pt).dbrootConfigList.begin();
for ( ; pt1 != (*pt).dbrootConfigList.end() ; pt1++)
{
int id = *pt1;
string DBRootName = "/var/lib/columnstore/data" + oam.itoa(id);
string cmd = "mkdir " + DBRootName + " > /dev/null 2>&1";
int rtnCode = system(cmd.c_str());
if (WEXITSTATUS(rtnCode) == 0)
log.writeLog(__LINE__, "Successful created directory " + DBRootName, LOG_TYPE_DEBUG);
cmd = "chmod 755 " + DBRootName + " > /dev/null 2>&1";
system(cmd.c_str());
if ( id == 1 )
{
cmd = "mkdir -p /var/lib/columnstore/data1/systemFiles/dbrm > /dev/null 2>&1";
system(cmd.c_str());
}
if ( (cloud == "amazon-ec2" || cloud == "amazon-vpc") &&
DBRootStorageType == "external" &&
config.moduleID() == moduleID)
{
if (!amazonVolumeCheck(id))
{
return API_FAILURE;
}
}
}
}
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getStorageConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getStorageConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
return API_SUCCESS;
}
/******************************************************************************************
* @brief processRestarted
*
* purpose: Process Restarted, inform Process Mgr
*
*
******************************************************************************************/
int ProcessMonitor::processRestarted( std::string processName, bool manual)
{
MonitorLog log;
// MonitorConfig config;
// ProcessMonitor aMonitor(config, log);
Oam oam;
ByteStream msg;
log.writeLog(__LINE__, "Inform Process Mgr that process was restarted: " + processName, LOG_TYPE_DEBUG);
int returnStatus = API_FAILURE;
msg << (ByteStream::byte) PROCESSRESTART;
msg << config.moduleName();
msg << processName;
msg << (ByteStream::byte) manual;
try
{
MessageQueueClient mqRequest("ProcMgr");
mqRequest.write(msg);
mqRequest.shutdown();
returnStatus = API_SUCCESS;
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: Caught unknown exception!", LOG_TYPE_ERROR);
}
return returnStatus;
}
/******************************************************************************************
* @brief getDBRMdata
*
* purpose: get DBRM Data from Process Mgr
*
*
******************************************************************************************/
int ProcessMonitor::getDBRMdata(string *path)
{
MonitorLog log;
Oam oam;
ByteStream msg;
int returnStatus = API_FAILURE;
msg << (ByteStream::byte) GETDBRMDATA;
msg << config.moduleName();
try
{
MessageQueueClient mqRequest("ProcMgr");
mqRequest.write(msg);
ByteStream receivedMSG;
struct timespec ts = { 30, 0 };
//read message type
try
{
receivedMSG = mqRequest.read(&ts);
}
catch (SocketClosed& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: " + error, LOG_TYPE_ERROR);
return returnStatus;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: Caught unknown exception!", LOG_TYPE_ERROR);
return returnStatus;
}
if (receivedMSG.length() > 0)
{
string type;
receivedMSG >> type;
log.writeLog(__LINE__, type, LOG_TYPE_DEBUG);
if ( type == "initial" )
{
log.writeLog(__LINE__, "initial system, no dbrm files to send", LOG_TYPE_DEBUG);
returnStatus = API_SUCCESS;
}
else
{
// files coming, read number of files
try
{
receivedMSG = mqRequest.read(&ts);
}
catch (SocketClosed& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: " + error, LOG_TYPE_ERROR);
return returnStatus;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: Caught unknown exception!", LOG_TYPE_ERROR);
return returnStatus;
}
if (receivedMSG.length() > 0)
{
ByteStream::byte numFiles;
receivedMSG >> numFiles;
log.writeLog(__LINE__, oam.itoa(numFiles), LOG_TYPE_DEBUG);
bool journalFile = false;
boost::uuids::uuid u = boost::uuids::random_generator()();
bf::path pTmp = bf::path(*path) / boost::uuids::to_string(u);
if (config::Config::makeConfig()->getConfig("Installation", "DBRootStorageType") != "storagemanager")
bf::create_directories(pTmp);
*path = pTmp.string();
log.writeLog(__LINE__, "Downloading DBRM files to " + *path, LOG_TYPE_DEBUG);
for ( int i = 0 ; i < numFiles ; i ++ )
{
string fileName;
//read file name
try
{
receivedMSG = mqRequest.read(&ts);
}
catch (SocketClosed& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: " + error, LOG_TYPE_ERROR);
return returnStatus;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: Caught unknown exception!", LOG_TYPE_ERROR);
return returnStatus;
}
if (receivedMSG.length() > 0)
{
receivedMSG >> fileName;
//check for journal file coming across
string::size_type pos = fileName.find("journal", 0);
if (pos != string::npos)
journalFile = true;
//change file name location to temp file local
// string::size_type pos1 = fileName.find("/dbrm",0);
// pos = fileName.find("data1",0);
// if (pos != string::npos)
// {
// string temp = fileName.substr(0,pos);
// string temp1 = temp + "data" + fileName.substr(pos1,80);
// fileName = temp1;
// }
bf::path pFilename(fileName);
pFilename = pTmp / pFilename.filename();
const char *cFilename = pFilename.string().c_str();
boost::scoped_ptr<IDBDataFile> out(IDBDataFile::open(
IDBPolicy::getType(cFilename,
IDBPolicy::WRITEENG),
cFilename, "w", 0));
// read file data
try
{
receivedMSG = mqRequest.read(&ts);
}
catch (SocketClosed& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: " + error, LOG_TYPE_ERROR);
return returnStatus;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: Caught unknown exception!", LOG_TYPE_ERROR);
return returnStatus;
}
if (receivedMSG.length() > 0)
{
out->write(receivedMSG.buf(), receivedMSG.length());
log.writeLog(__LINE__, fileName, LOG_TYPE_DEBUG);
log.writeLog(__LINE__, oam.itoa(receivedMSG.length()), LOG_TYPE_DEBUG);
}
else
log.writeLog(__LINE__, "ProcMgr Msg timeout on module", LOG_TYPE_ERROR);
}
else
log.writeLog(__LINE__, "ProcMgr Msg timeout on module", LOG_TYPE_ERROR);
}
//create journal file if none come across
if ( !journalFile)
{
bf::path pJournalFilename(pTmp / "BRM_saves_journal");
IDBDataFile *idbJournalFile = IDBDataFile::open(IDBPolicy::getType(pJournalFilename.string().c_str(),
IDBPolicy::WRITEENG), pJournalFilename.string().c_str(), "w", 0);
delete idbJournalFile;
//string cmd = "touch " + startup::StartUp::installDir() + "/data1/systemFiles/dbrm/BRM_saves_journal";
//system(cmd.c_str());
}
returnStatus = oam::API_SUCCESS;
}
else
log.writeLog(__LINE__, "ProcMgr Msg timeout on module", LOG_TYPE_ERROR);
}
}
else
log.writeLog(__LINE__, "ProcMgr Msg timeout on module", LOG_TYPE_ERROR);
mqRequest.shutdown();
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: Caught unknown exception!", LOG_TYPE_ERROR);
}
return returnStatus;
}
/******************************************************************************************
* @brief runStartupTest
*
* purpose: Runs DB sanity test
*
*
******************************************************************************************/
int ProcessMonitor::runStartupTest()
{
//ProcMon log file
MonitorLog log;
// MonitorConfig config;
// ProcessMonitor aMonitor(config, log);
Oam oam;
//skip if module is DISABLED
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(config.moduleName(), opState, degraded);
if (geteuid() != 0 || opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
return oam::API_SUCCESS;
//run startup test script
string logdir("/var/log/mariadb/columnstore");
if (access(logdir.c_str(), W_OK) != 0) logdir = tmpLogDir;
string cmd = "startupTests.sh > " + logdir + "/startupTests.log1 2>&1";
system(cmd.c_str());
cmd = logdir + "/startupTests.log1";
bool fail = false;
if (oam.checkLogStatus(cmd, "OK"))
{
log.writeLog(__LINE__, "startupTests passed", LOG_TYPE_DEBUG);
}
else
{
log.writeLog(__LINE__, "startupTests failed", LOG_TYPE_CRITICAL);
fail = true;
}
if (!fail)
{
log.writeLog(__LINE__, "runStartupTest passed", LOG_TYPE_DEBUG);
//Clear the alarm
sendAlarm(config.moduleName().c_str(), STARTUP_DIAGNOTICS_FAILURE, CLEAR);
return oam::API_SUCCESS;
}
else
{
log.writeLog(__LINE__, "ERROR: runStartupTest failed", LOG_TYPE_CRITICAL);
//Set the alarm
sendAlarm(config.moduleName().c_str(), STARTUP_DIAGNOTICS_FAILURE, SET);
//setModule status to failed
try
{
oam.setModuleStatus(config.moduleName(), oam::FAILED);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on setModuleStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
return oam::API_FAILURE;
}
}
/******************************************************************************************
* @brief updateConfigFile
*
* purpose: Update local Calpont Config File
*
******************************************************************************************/
int ProcessMonitor::updateConfigFile(messageqcpp::ByteStream msg)
{
    // Passes the received payload to writeConfigFile() on the Config object
    // returned by Config::makeConfig(). Always returns API_SUCCESS;
    // exceptions from the write propagate to the caller.
    Config::makeConfig()->writeConfigFile(msg);
    return oam::API_SUCCESS;
}
/******************************************************************************************
* @brief sendMsgProcMon1
*
* purpose: Sends a Msg to ProcMon
*
******************************************************************************************/
std::string ProcessMonitor::sendMsgProcMon1( std::string module, ByteStream msg, int requestID )
{
string msgPort = module + "_ProcessMonitor";
string returnStatus = "FAILED";
// do a ping test to determine a quick failure
Config* sysConfig = Config::makeConfig();
string IPAddr = sysConfig->getConfig(msgPort, "IPAddr");
string cmdLine = "ping ";
string cmdOption = " -c 1 -w 5 >> /dev/null 2>&1";
string cmd = cmdLine + IPAddr + cmdOption;
if ( system(cmd.c_str()) != 0 )
{
//ping failure
log.writeLog(__LINE__, "sendMsgProcMon ping failure", LOG_TYPE_ERROR);
return returnStatus;
}
try
{
MessageQueueClient mqRequest(msgPort);
mqRequest.write(msg);
// wait 30 seconds for response
ByteStream::byte returnACK;
ByteStream::byte returnRequestID;
string requestStatus;
ByteStream receivedMSG;
struct timespec ts = { 30, 0 };
try
{
receivedMSG = mqRequest.read(&ts);
}
catch (SocketClosed& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: " + error, LOG_TYPE_ERROR);
return returnStatus;
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on mqRequest.read: Caught unknown exception!", LOG_TYPE_ERROR);
return returnStatus;
}
if (receivedMSG.length() > 0)
{
receivedMSG >> returnACK;
receivedMSG >> returnRequestID;
receivedMSG >> requestStatus;
if ( returnACK == oam::ACK && returnRequestID == requestID)
{
// ACK for this request
returnStatus = requestStatus;
}
else
log.writeLog(__LINE__, "sendMsgProcMon: message mismatch ", LOG_TYPE_ERROR);
}
else
log.writeLog(__LINE__, "sendMsgProcMon: ProcMon Msg timeout on module " + module, LOG_TYPE_ERROR);
mqRequest.shutdown();
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on MessageQueueClient: Caught unknown exception!", LOG_TYPE_ERROR);
}
return returnStatus;
}
/******************************************************************************************
* @brief checkModuleFailover
*
* purpose: check if module failover is needed due to a process outage
*
******************************************************************************************/
void ProcessMonitor::checkModuleFailover( std::string processName)
{
Oam oam;
//force failover on certain processes
if ( processName == "DDLProc" ||
processName == "DMLProc" ) {
log.writeLog(__LINE__, "checkModuleFailover: process failover, process outage of " + processName, LOG_TYPE_CRITICAL);
try
{
SystemProcessStatus systemprocessstatus;
oam.getProcessStatus(systemprocessstatus);
for ( unsigned int i = 0 ; i < systemprocessstatus.processstatus.size(); i++)
{
if ( systemprocessstatus.processstatus[i].ProcessName == processName &&
systemprocessstatus.processstatus[i].Module != config.moduleName() )
{
//make sure it matches module type
string procModuleType = systemprocessstatus.processstatus[i].Module.substr(0, MAX_MODULE_TYPE_SIZE);
if ( config.moduleType() != procModuleType )
continue;
if ( systemprocessstatus.processstatus[i].ProcessOpState == oam::COLD_STANDBY ||
systemprocessstatus.processstatus[i].ProcessOpState == oam::AUTO_OFFLINE ||
systemprocessstatus.processstatus[i].ProcessOpState == oam::FAILED )
{
// found a AVAILABLE mate, start it
log.writeLog(__LINE__, "Change UM Master to module " + systemprocessstatus.processstatus[i].Module, LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "Stop local UM module " + config.moduleName(), LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "Disable Local will Enable UM module " + systemprocessstatus.processstatus[i].Module, LOG_TYPE_DEBUG);
oam::DeviceNetworkConfig devicenetworkconfig;
oam::DeviceNetworkList devicenetworklist;
devicenetworkconfig.DeviceName = config.moduleName();
devicenetworklist.push_back(devicenetworkconfig);
try
{
oam.stopModule(devicenetworklist, oam::FORCEFUL, oam::ACK_YES);
log.writeLog(__LINE__, "success stopModule on module " + config.moduleName(), LOG_TYPE_DEBUG);
try
{
oam.disableModule(devicenetworklist);
log.writeLog(__LINE__, "success disableModule on module " + config.moduleName(), LOG_TYPE_DEBUG);
}
catch (exception& e)
{
log.writeLog(__LINE__, "failed disableModule on module " + config.moduleName(), LOG_TYPE_ERROR);
}
}
catch (exception& e)
{
log.writeLog(__LINE__, "failed stopModule on module " + config.moduleName(), LOG_TYPE_ERROR);
}
break;
}
}
}
}
catch (exception& ex)
{
// string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getProcessStatus: Caught unknown exception!", LOG_TYPE_ERROR);
}
}
}
/******************************************************************************************
* @brief changeMyCnf
*
* purpose: Change my.cnf
*
******************************************************************************************/
int ProcessMonitor::changeMyCnf(std::string type)
{
Oam oam;
log.writeLog(__LINE__, "changeMyCnf function called for " + type, LOG_TYPE_DEBUG);
string mycnfFile = string(MCSMYCNFDIR) + "/columnstore.cnf";
ifstream file (mycnfFile.c_str());
if (!file)
{
log.writeLog(__LINE__, "changeMyCnf - my.cnf file not found: " + mycnfFile, LOG_TYPE_CRITICAL);
return oam::API_FAILURE;
}
//get server-id based on ExeMgrx setup
string serverID = "0";
string localModuleName = config.moduleName();
for ( int id = 1 ; ; id++ )
{
string Section = "ExeMgr" + oam.itoa(id);
string moduleName;
try
{
Config* sysConfig = Config::makeConfig();
moduleName = sysConfig->getConfig(Section, "Module");
if ( moduleName == localModuleName )
{
serverID = oam.itoa(id);
break;
}
}
catch (...) {}
}
if ( serverID == "0" )
{
log.writeLog(__LINE__, "changeMyCnf: ExeMgr for local module doesn't exist", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
// set server-id and other options in my.cnf
vector <string> lines;
char line[200];
string buf;
while (file.getline(line, 200))
{
buf = line;
string::size_type pos = buf.find("server-id", 0);
if ( pos != string::npos )
{
buf = "server-id = " + serverID;
string command = "SET GLOBAL server_id=" + serverID + ";";
int ret = runMariaDBCommandLine(command);
if (ret != 0)
{
log.writeLog(__LINE__, "changeMyCnf: runMariaDBCommandLine Error", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
pos = buf.find("log_bin", 0);
if ( pos != string::npos )
{
buf = "log_bin";
}
// set local query flag if on pm
if ( (PMwithUM == "y") && config.moduleType() == "pm" )
{
pos = buf.find("columnstore_local_query", 0);
if ( pos != string::npos )
{
buf = "columnstore_local_query=1";
string command = "SET GLOBAL " + buf + ";";
int ret = runMariaDBCommandLine(command);
if (ret != 0)
{
log.writeLog(__LINE__, "changeMyCnf: runMariaDBCommandLine Error", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
}
else
{
// disable, if needed
pos = buf.find("columnstore_local_query", 0);
if ( pos != string::npos )
{
buf = "columnstore_local_query=0";
string command = "SET GLOBAL " + buf + ";";
int ret = runMariaDBCommandLine(command);
if (ret != 0)
{
log.writeLog(__LINE__, "changeMyCnf: runMariaDBCommandLine Error", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
}
//output to temp file
lines.push_back(buf);
}
file.close();
unlink (mycnfFile.c_str());
ofstream newFile (mycnfFile.c_str());
//create new file
int fd = open(mycnfFile.c_str(), O_RDWR | O_CREAT, 0664);
copy(lines.begin(), lines.end(), ostream_iterator<string>(newFile, "\n"));
newFile.close();
close(fd);
// set owner and permission
string cmd = "chmod 664 " + mycnfFile + " >/dev/null 2>&1";
system(cmd.c_str());
cmd = "chown " + USER + ":" + USER + " " + mycnfFile + " >/dev/null 2>&1";
system(cmd.c_str());
// restart mysql
try {
oam.actionMysqlCalpont(MYSQL_RESTART);
sleep(5); // give after mysql restart
}
catch(...)
{}
log.writeLog(__LINE__, "changeMyCnf function successfully completed", LOG_TYPE_DEBUG);
return oam::API_SUCCESS;
}
/******************************************************************************************
* @brief runMariaDBCommandLine
*
* purpose: run MariaDB Command Line script
*
******************************************************************************************/
int ProcessMonitor::runMariaDBCommandLine(std::string command)
{
Oam oam;
log.writeLog(__LINE__, "runMariaDBCommandLine function called: cmd = " + command, LOG_TYPE_DEBUG);
// mysql port number
string MySQLPort;
try
{
oam.getSystemConfig("MySQLPort", MySQLPort);
}
catch (...)
{
MySQLPort = "3306";
}
if ( MySQLPort.empty() )
MySQLPort = "3306";
string cmd = "mariadb-command-line.sh --command='" + command + "' --port=" + MySQLPort + " --tmpdir=" + tmpLogDir + " > " + tmpLogDir + "/mariadb-command-line.sh.log 2>&1";
log.writeLog(__LINE__, "cmd = " + cmd, LOG_TYPE_DEBUG);
system(cmd.c_str());
string logFile = tmpLogDir + "/mariadb-command-line.sh.log";
if (oam.checkLogStatus(logFile, "ERROR 1045") )
{
log.writeLog(__LINE__, "mariadb-command-line.sh: MySQL Password Error, check .my.cnf", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
else
{
if (oam.checkLogStatus(logFile, "OK"))
{
log.writeLog(__LINE__, "mariadb-command-line.sh: Successful return", LOG_TYPE_DEBUG);
return oam::API_SUCCESS;
}
else
{
log.writeLog(__LINE__, "mariadb-command-line.sh: Error return, check log " + tmpLogDir + "/mariadb-command-line.sh.log", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
return oam::API_FAILURE;
}
/******************************************************************************************
* @brief runMasterRep
*
* purpose: run Master Replication script
*
******************************************************************************************/
int ProcessMonitor::runMasterRep(std::string& masterLogFile, std::string& masterLogPos)
{
Oam oam;
SystemModuleTypeConfig systemModuleTypeConfig;
try
{
oam.getSystemConfig(systemModuleTypeConfig);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
// mysql port number
string MySQLPort;
try
{
oam.getSystemConfig("MySQLPort", MySQLPort);
}
catch (...)
{
MySQLPort = "3306";
}
if ( MySQLPort.empty() )
MySQLPort = "3306";
// create user for each module by ip address
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
string moduleName = (*pt).DeviceName;
//skip if module is not ACTIVE
// int opState = oam::ACTIVE;
// bool degraded;
// oam.getModuleStatus(moduleName, opState, degraded);
// if (opState != oam::ACTIVE)
// continue;
bool passwordError = false;
string moduleType = systemModuleTypeConfig.moduletypeconfig[i].ModuleType;
if ( ( (PMwithUM == "n") && (moduleType == "pm") ) &&
( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM) )
continue;
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
{
string ipAddr = (*pt1).IPAddr;
string logFile = tmpLogDir + "/master-rep-columnstore-" + moduleName + ".log";
string cmd = "master-rep-columnstore.sh --hostIP=" + ipAddr + " --port=" + MySQLPort + " --tmpdir=" + tmpLogDir + " > " + logFile + " 2>&1";
log.writeLog(__LINE__, "cmd = " + cmd, LOG_TYPE_DEBUG);
system(cmd.c_str());
if (oam.checkLogStatus(logFile, "ERROR 1045") )
{
if ( passwordError )
{
log.writeLog(__LINE__, "master-rep-columnstore.sh: MySQL Password Error", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
log.writeLog(__LINE__, "master-rep-columnstore.sh: Missing Password error, go check for a password and retry", LOG_TYPE_DEBUG);
passwordError = true;
break;
}
else
{
if (oam.checkLogStatus(logFile, "OK"))
log.writeLog(__LINE__, "master-rep-columnstore.sh: Successful return for node " + moduleName, LOG_TYPE_DEBUG);
else
{
log.writeLog(__LINE__, "master-rep-columnstore.sh: Error return, check log " + logFile, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
}
}
}
// go parse out the MASTER_LOG_FILE and MASTER_LOG_POS
// this is what the output will look like
//
// SHOW MASTER STATUS
// File Position Binlog_Do_DB Binlog_Ignore_DB
// mysql-bin.000006 2921
//
// in log - show-master-status.log
string masterLog = tmpLogDir + "/show-master-status.log";
ifstream file (masterLog.c_str());
if (!file)
{
log.writeLog(__LINE__, "runMasterRep - show master status log file doesn't exist - " + masterLog, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
else
{
char line[200];
string buf;
while (file.getline(line, 200))
{
buf = line;
string::size_type pos = buf.find("000", 0);
if ( pos != string::npos )
{
pos = 0;
string::size_type pos1 = buf.find("\t", pos);
if ( pos1 != string::npos )
{
string masterlogfile = buf.substr(pos, pos1 - pos);
//strip trailing spaces
string::size_type lead = masterlogfile.find_first_of(" ");
masterLogFile = masterlogfile.substr( 0, lead);
string masterlogpos = buf.substr(pos1, 80);
//strip off leading tab masterlogpos
lead = masterlogpos.find_first_not_of("\t");
masterlogpos = masterlogpos.substr( lead, masterlogpos.length() - lead);
//string trailing spaces
lead = masterlogpos.find_first_of(" ");
masterLogPos = masterlogpos.substr( 0, lead);
log.writeLog(__LINE__, "runMasterRep: masterlogfile=" + masterLogFile + ", masterlogpos=" + masterLogPos, LOG_TYPE_DEBUG);
file.close();
return oam::API_SUCCESS;
}
}
}
file.close();
}
log.writeLog(__LINE__, "runMasterRep - 'mysql-bin not found in log file - " + masterLog, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
/******************************************************************************************
* @brief runSlaveRep
*
* purpose: run Slave Replication script
*
******************************************************************************************/
int ProcessMonitor::runSlaveRep(std::string& masterLogFile, std::string& masterLogPos)
{
Oam oam;
// get master replicaion module IP Address
string PrimaryUMModuleName;
oam.getSystemConfig("PrimaryUMModuleName", PrimaryUMModuleName);
string masterIPAddress;
try
{
ModuleConfig moduleconfig;
oam.getSystemConfig(PrimaryUMModuleName, moduleconfig);
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
masterIPAddress = (*pt1).IPAddr;
}
catch (...)
{}
// mysql port number
string MySQLPort;
try
{
oam.getSystemConfig("MySQLPort", MySQLPort);
}
catch (...)
{
MySQLPort = "3306";
}
if ( MySQLPort.empty() )
MySQLPort = "3306";
bool passwordError = false;
while (true)
{
string logFile = tmpLogDir + "/slave-rep-columnstore.log";
string cmd = "slave-rep-columnstore.sh --masteripaddr=" + masterIPAddress + " --masterlogfile=" + masterLogFile + " --masterlogpos=" + masterLogPos + " --port=" + MySQLPort + " --tmpdir=" + tmpLogDir + " > " + logFile + " 2>&1";
log.writeLog(__LINE__, "cmd = " + cmd, LOG_TYPE_DEBUG);
system(cmd.c_str());
if (oam.checkLogStatus(logFile, "ERROR 1045") )
{
if ( passwordError )
{
log.writeLog(__LINE__, "slave-rep-columnstore.sh: MySQL Password Error", LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
log.writeLog(__LINE__, "slave-rep-columnstore.sh: Missing Password error, go check for a password and retry", LOG_TYPE_DEBUG);
passwordError = true;
}
else
{
if (oam.checkLogStatus(logFile, "OK"))
{
log.writeLog(__LINE__, "slave-rep-columnstore.sh: Successful return", LOG_TYPE_DEBUG);
return oam::API_SUCCESS;
}
else
{
log.writeLog(__LINE__, "slave-rep-columnstore.sh: Error return, check log " + logFile, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
}
return oam::API_FAILURE;
}
/******************************************************************************************
* @brief runDisableRep
*
* purpose: run Disable Replication script
*
******************************************************************************************/
int ProcessMonitor::runDisableRep()
{
Oam oam;
// mysql port number
string MySQLPort;
try
{
oam.getSystemConfig("MySQLPort", MySQLPort);
}
catch (...)
{
MySQLPort = "3306";
}
if ( MySQLPort.empty() )
MySQLPort = "3306";
string logFile = tmpLogDir + "/disable-rep-columnstore.log";
string cmd = "disable-rep-columnstore.sh --tmpdir=" + tmpLogDir + " > " + logFile + " 2>&1";
log.writeLog(__LINE__, "cmd = " + cmd, LOG_TYPE_DEBUG);
system(cmd.c_str());
if (oam.checkLogStatus(logFile, "OK"))
{
log.writeLog(__LINE__, "disable-rep-columnstore.sh: Successful return", LOG_TYPE_DEBUG);
return oam::API_SUCCESS;
}
else
{
if (oam.checkLogStatus(cmd, "ERROR 1045") )
{
log.writeLog(__LINE__, "disable-rep-columnstore.sh: Missing Password error, return success", LOG_TYPE_DEBUG);
return oam::API_SUCCESS;
}
log.writeLog(__LINE__, "disable-rep-columnstore.sh: Error return, check log " + logFile, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
return oam::API_FAILURE;
}
/******************************************************************************************
* @brief runMasterDist
*
* purpose: run Master DB Distribution
*
******************************************************************************************/
int ProcessMonitor::runMasterDist(std::string& password, std::string& slaveModule)
{
Oam oam;
SystemModuleTypeConfig systemModuleTypeConfig;
try
{
oam.getSystemConfig(systemModuleTypeConfig);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
int slave = 0;
if ( slaveModule == "all" )
{
// Distrubuted MySQL Front-end DB to Slave Modules
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0)
continue;
string moduleType = systemModuleTypeConfig.moduletypeconfig[i].ModuleType;
if ( ( (PMwithUM == "n") && (moduleType == "pm") ) &&
( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM) )
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
string moduleName = (*pt).DeviceName;
//skip if local master mode
if ( moduleName == config.moduleName() )
continue;
slave++;
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
{
string ipAddr = (*pt1).IPAddr;
string logFile = tmpLogDir + "/master-dist_" + moduleName + ".log";
string cmd = "rsync.sh " + ipAddr + " " + password + " 1 > " + logFile;
log.writeLog(__LINE__, "cmd = " + cmd, LOG_TYPE_DEBUG);
system(cmd.c_str());
if (!oam.checkLogStatus(logFile, "FAILED"))
{
log.writeLog(__LINE__, "runMasterDist: Success rsync to module: " + moduleName, LOG_TYPE_DEBUG);
break;
}
else
{
log.writeLog(__LINE__, "runMasterDist: Failure rsync to module: " + moduleName, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
}
}
}
else
{
// don't do PMs unless PMwithUM flag is set
string moduleType = slaveModule.substr(0, MAX_MODULE_TYPE_SIZE);
if ( (moduleType == "um") ||
( (PMwithUM == "y") && (moduleType == "pm") ) ||
( config.ServerInstallType() == oam::INSTALL_COMBINE_DM_UM_PM) )
{
slave++;
// get slave IP address
ModuleConfig moduleconfig;
oam.getSystemConfig(slaveModule, moduleconfig);
HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();
string ipAddr = (*pt1).IPAddr;
string logFile = tmpLogDir + "/master-dist_" + slaveModule + ".log";
string cmd = "rsync.sh " + ipAddr + " " + password + " 1 > " + logFile;
system(cmd.c_str());
if (!oam.checkLogStatus(logFile, "FAILED"))
log.writeLog(__LINE__, "runMasterDist: Success rsync to module: " + slaveModule, LOG_TYPE_DEBUG);
else
{
log.writeLog(__LINE__, "runMasterDist: Failure rsync to module: " + slaveModule, LOG_TYPE_ERROR);
return oam::API_FAILURE;
}
}
}
if (slave == 0 )
log.writeLog(__LINE__, "runMasterDist: No configured slave nodes", LOG_TYPE_DEBUG);
return oam::API_SUCCESS;
}
/******************************************************************************************
* @brief amazonIPCheck
*
* purpose: check and setups Amazon EC2 IP Addresses
*
******************************************************************************************/
bool ProcessMonitor::amazonIPCheck()
{
MonitorLog log;
Oam oam;
// delete description file so it will create a new one
string tmpLog = tmpLogDir + "/describeInstance.log";
unlink(tmpLog.c_str());
//
// Get Module Info
//
SystemModuleTypeConfig systemModuleTypeConfig;
try
{
oam.getSystemConfig(systemModuleTypeConfig);
}
catch (exception& ex)
{
string error = ex.what();
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: " + error, LOG_TYPE_ERROR);
}
catch (...)
{
// log.writeLog(__LINE__, "EXCEPTION ERROR on getSystemConfig: Caught unknown exception!", LOG_TYPE_ERROR);
}
//get Elastic IP Address count
int AmazonElasticIPCount = 0;
try
{
oam.getSystemConfig("AmazonElasticIPCount", AmazonElasticIPCount);
}
catch (...)
{
AmazonElasticIPCount = 0;
}
ModuleTypeConfig moduletypeconfig;
//get module/instance IDs
for ( unsigned int i = 0 ; i < systemModuleTypeConfig.moduletypeconfig.size(); i++)
{
int moduleCount = systemModuleTypeConfig.moduletypeconfig[i].ModuleCount;
if ( moduleCount == 0 )
// skip of no modules configured
continue;
DeviceNetworkList::iterator pt = systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin();
for ( ; pt != systemModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; pt++)
{
DeviceNetworkConfig devicenetworkconfig;
HostConfig hostconfig;
devicenetworkconfig.DeviceName = (*pt).DeviceName;
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++ )
{
hostconfig.IPAddr = (*pt1).IPAddr;
hostconfig.HostName = (*pt1).HostName;
devicenetworkconfig.hostConfigList.push_back(hostconfig);
}
moduletypeconfig.ModuleNetworkList.push_back(devicenetworkconfig);
}
}
// now loop and wait for 5 minutes for all configured Instances to be running
// like after a reboot
bool startFail = false;
for ( int time = 0 ; time < 30 ; time++ )
{
startFail = false;
DeviceNetworkList::iterator pt = moduletypeconfig.ModuleNetworkList.begin();
for ( ; pt != moduletypeconfig.ModuleNetworkList.end() ; pt++)
{
string moduleName = (*pt).DeviceName;
// get all ips if parent oam
// get just parent and local if not parent oam
MonitorConfig currentConfig;
if ( config.moduleName() == currentConfig.OAMParentName() ||
moduleName == config.moduleName() ||
moduleName == currentConfig.OAMParentName() )
{
HostConfigList::iterator pt1 = (*pt).hostConfigList.begin();
for ( ; pt1 != (*pt).hostConfigList.end() ; pt1++)
{
string IPAddr = (*pt1).IPAddr;
string instanceID = (*pt1).HostName;
log.writeLog(__LINE__, "getEC2InstanceIpAddress called to get status for Module '" + moduleName + "' / Instance " + instanceID, LOG_TYPE_DEBUG);
string currentIPAddr = oam.getEC2InstanceIpAddress(instanceID);
if (currentIPAddr == "stopped")
{
log.writeLog(__LINE__, "Module '" + moduleName + "' / Instance '" + instanceID + "' not running", LOG_TYPE_WARNING);
startFail = true;
}
else
{
if (currentIPAddr == "terminated")
{
log.writeLog(__LINE__, "Module '" + moduleName + "' / Instance '" + instanceID + "' has no Private IP Address Assigned, system failed to start", LOG_TYPE_CRITICAL);
startFail = true;
break;
}
else
{
if ( currentIPAddr != IPAddr )
{
log.writeLog(__LINE__, "Module is Running: '" + moduleName + "' / Instance '" + instanceID + "' current IP being reconfigured in Columnstore.xml. old = " + IPAddr + ", new = " + currentIPAddr, LOG_TYPE_DEBUG);
// update the Columnstore.xml with the new IP Address
string cmd = "sed -i s/" + IPAddr + "/" + currentIPAddr + "/g " + MCSSYSCONFDIR + "/columnstore/Columnstore.xml";
system(cmd.c_str());
}
else
log.writeLog(__LINE__, "Module is Running: '" + moduleName + "' / Instance '" + instanceID + "' current IP didn't change.", LOG_TYPE_DEBUG);
}
}
//set Elastic IP Address, if configured
if (AmazonElasticIPCount > 0)
{
bool found = false;
int id = 1;
for ( ; id < AmazonElasticIPCount + 1 ; id++ )
{
string AmazonElasticModule = "AmazonElasticModule" + oam.itoa(id);
string ELmoduleName;
string AmazonElasticIPAddr = "AmazonElasticIPAddr" + oam.itoa(id);
string ELIPaddress;
try
{
oam.getSystemConfig(AmazonElasticModule, ELmoduleName);
oam.getSystemConfig(AmazonElasticIPAddr, ELIPaddress);
}
catch (...) {}
if ( ELmoduleName == moduleName )
{
found = true;
try
{
oam.assignElasticIP(instanceID, ELIPaddress);
log.writeLog(__LINE__, "Assign Elastic IP Address : '" + moduleName + "' / '" + ELIPaddress, LOG_TYPE_DEBUG);
}
catch (...)
{
log.writeLog(__LINE__, "Assign Elastic IP Address failed : '" + moduleName + "' / '" + ELIPaddress, LOG_TYPE_ERROR);
break;
}
break;
}
if (found)
break;
}
}
}
}
}
//continue when no instances are stopped
if (!startFail)
break;
sleep(10);
}
//check if an instance is stopped, exit out...
if (startFail)
{
log.writeLog(__LINE__, "A configured Instance isn't running. Check warning log", LOG_TYPE_CRITICAL);
}
log.writeLog(__LINE__, "amazonIPCheck function successfully completed", LOG_TYPE_DEBUG);
return true;
}
/******************************************************************************************
* @brief	amazonVolumeCheck
*
* purpose:	checks and sets up Amazon EC2 Volume mounts
*
* @param dbrootID  DBRoot number; selects the PMVolumeName<N> / PMVolumeDeviceName<N>
*                  configuration entries and the /var/lib/columnstore/data<N> mount point
*
* @return true when the volume is already attached, or was "available" and has been
*         attached (mount and chown are then issued, their results are not checked);
*         false when no volume is assigned, the volume is in any other state, or the
*         attach fails
*
******************************************************************************************/
bool ProcessMonitor::amazonVolumeCheck(int dbrootID)
{
    MonitorLog log;
    Oam oam;

    log.writeLog(__LINE__, "amazonVolumeCheck function called for DBRoot" + oam.itoa(dbrootID), LOG_TYPE_DEBUG);

    string volumeNameID = "PMVolumeName" + oam.itoa(dbrootID);
    string volumeName = oam::UnassignedName;
    string deviceNameID = "PMVolumeDeviceName" + oam.itoa(dbrootID);
    string deviceName = oam::UnassignedName;

    try
    {
        oam.getSystemConfig( volumeNameID, volumeName);
        oam.getSystemConfig( deviceNameID, deviceName);
    }
    catch (...)
    {
        // keep the "unassigned" defaults; handled just below
    }

    if ( volumeName.empty() || volumeName == oam::UnassignedName )
    {
        log.writeLog(__LINE__, "amazonVolumeCheck function exiting, no volume assigned to DBRoot " + oam.itoa(dbrootID), LOG_TYPE_WARNING);
        return false;
    }

    string status = oam.getEC2VolumeStatus(volumeName);
    log.writeLog(__LINE__, "amazonVolumeCheck: getEC2VolumeStatus: " + status, LOG_TYPE_DEBUG);

    if ( status == "attached" )
    {
        log.writeLog(__LINE__, "amazonVolumeCheck function successfully completed, volume attached: " + volumeName, LOG_TYPE_DEBUG);
        return true;
    }

    if ( status != "available" )
    {
        log.writeLog(__LINE__, "amazonVolumeCheck function failed, volume not attached and not available: " + volumeName, LOG_TYPE_WARNING);
        return false;
    }

    //get Module HostName / InstanceName
    string instanceName;

    try
    {
        ModuleConfig moduleconfig;
        oam.getSystemConfig(config.moduleName(), moduleconfig);
        HostConfigList::iterator pt1 = moduleconfig.hostConfigList.begin();

        // Dereferencing begin() of an empty list is undefined behaviour and is not
        // something the catch below could intercept; leave instanceName empty instead.
        if ( pt1 != moduleconfig.hostConfigList.end() )
            instanceName = (*pt1).HostName;
    }
    catch (...)
    {
        // instanceName stays empty
    }

    if ( !oam.attachEC2Volume(volumeName, deviceName, instanceName) )
    {
        log.writeLog(__LINE__, "amazonVolumeCheck function failed, volume failed to attached: " + volumeName, LOG_TYPE_CRITICAL);
        return false;
    }

    log.writeLog(__LINE__, "amazonVolumeCheck function , volume to attached: " + volumeName, LOG_TYPE_DEBUG);

    string cmd = SUDO + "mount /var/lib/columnstore/data" + oam.itoa(dbrootID) + " > /dev/null";
    system(cmd.c_str());

    log.writeLog(__LINE__, "amazonVolumeCheck function , volume to mounted: " + volumeName, LOG_TYPE_DEBUG);

    cmd = SUDO + "chown -R " + USER + ":" + USER + " /var/lib/columnstore/data" + oam.itoa(dbrootID);
    system(cmd.c_str());

    return true;
}
/******************************************************************************************
* @brief	unmountExtraDBroots
*
* purpose:	unmount Extra DBroots which were left mounted during a move
*
*           Nothing is done for "hdfs" or "storagemanager" storage, nor for "internal"
*           storage without data redundancy. Otherwise every DBRoot that the storage
*           configuration assigns to another module is unmounted here: with umount when
*           DataRedundancyConfig is "n", through glusterUnassign() otherwise.
*
******************************************************************************************/
void ProcessMonitor::unmountExtraDBroots()
{
    MonitorLog log;
    Oam oam;

    string DBRootStorageType = "internal";

    try
    {
        oam.getSystemConfig("DBRootStorageType", DBRootStorageType);

        if ( DBRootStorageType == "hdfs" || DBRootStorageType == "storagemanager" ||
                ( DBRootStorageType == "internal" && DataRedundancyConfig == "n") )
            return;
    }
    catch (...) {}

    try
    {
        systemStorageInfo_t t;
        t = oam.getStorageConfig();

        // no DBRoots configured at all
        if ( boost::get<1>(t) == 0 )
            return;

        DeviceDBRootList moduledbrootlist = boost::get<2>(t);

        //Flush the cache
        cacheutils::flushPrimProcCache();
        cacheutils::dropPrimProcFdCache();
        flushInodeCache();

        for ( DeviceDBRootList::iterator pt = moduledbrootlist.begin() ; pt != moduledbrootlist.end() ; ++pt )
        {
            int moduleID = (*pt).DeviceID;

            // DBRoots assigned to this module stay mounted
            if ( config.moduleID() == moduleID )
                continue;

            DBRootConfigList::iterator pt1 = (*pt).dbrootConfigList.begin();

            for ( ; pt1 != (*pt).dbrootConfigList.end() ; ++pt1 )
            {
                int id = *pt1;

                if ( DataRedundancyConfig == "n" )
                {
                    string cmd = SUDO + "umount /var/lib/columnstore/data" + oam.itoa(id) + " > /dev/null 2>&1";
                    system(cmd.c_str());
                    continue;
                }

                try
                {
                    int ret = glusterUnassign(oam.itoa(id));

                    if ( ret != 0 )
                        log.writeLog(__LINE__, "Error unassigning gluster dbroot# " + oam.itoa(id), LOG_TYPE_ERROR);
                    else
                        log.writeLog(__LINE__, "Gluster unassign gluster dbroot# " + oam.itoa(id));
                }
                catch (...)
                {
                    log.writeLog(__LINE__, "Exception unassigning gluster dbroot# " + oam.itoa(id), LOG_TYPE_ERROR);
                }
            }
        }
    }
    catch (...)
    {
        // still best effort, but no longer silent
        log.writeLog(__LINE__, "unmountExtraDBroots: exception while unmounting extra DBRoots", LOG_TYPE_ERROR);
    }

    log.writeLog(__LINE__, "unmountExtraDBroots finished ", LOG_TYPE_DEBUG);

    return;
}
/******************************************************************************************
* @brief	checkDataMount
*
* purpose:	Check Data Mounts
*
*           Collects the DBRoots assigned to this module, assigns them through gluster
*           when DataRedundancyConfig is "y", (re)mounts externally stored DBRoots and
*           verifies each DBRoot is writable by creating the file "OAMdbrootCheck" in it.
*
* @return API_INVALID_PARAMETER when no DBRoot is configured for this module,
*         API_FAILURE when a mount or the test write fails,
*         API_SUCCESS otherwise (always for "storagemanager" once DBRoots exist)
*
******************************************************************************************/
int ProcessMonitor::checkDataMount()
{
    MonitorLog log;
    Oam oam;

    //check/update the pmMount files
    string DBRootStorageType = "internal";
    vector <string> dbrootList;

    // the storage configuration is read with up to 10 attempts, one second apart
    for ( int retry = 0 ; retry < 10 ; retry++)
    {
        // never carry entries over from an attempt that was aborted by an exception
        dbrootList.clear();

        try
        {
            systemStorageInfo_t t;
            t = oam.getStorageConfig();

            if ( boost::get<1>(t) == 0 )
            {
                log.writeLog(__LINE__, "getStorageConfig return: No dbroots are configured in Columnstore.xml file", LOG_TYPE_WARNING);
                return API_INVALID_PARAMETER;
            }

            DeviceDBRootList moduledbrootlist = boost::get<2>(t);

            for ( DeviceDBRootList::iterator pt = moduledbrootlist.begin() ; pt != moduledbrootlist.end() ; ++pt )
            {
                int moduleID = (*pt).DeviceID;

                DBRootConfigList::iterator pt1 = (*pt).dbrootConfigList.begin();

                for ( ; pt1 != (*pt).dbrootConfigList.end() ; ++pt1 )
                {
                    // keep only the DBRoots assigned to this module
                    if (config.moduleID() == moduleID)
                        dbrootList.push_back(oam.itoa(*pt1));
                }
            }

            break;
        }
        catch (exception& ex)
        {
            string error = ex.what();
            log.writeLog(__LINE__, "EXCEPTION ERROR on getStorageConfig: " + error, LOG_TYPE_ERROR);
            sleep (1);
        }
        catch (...)
        {
            log.writeLog(__LINE__, "EXCEPTION ERROR on getStorageConfig: Caught unknown exception!", LOG_TYPE_ERROR);
            sleep (1);
        }
    }

    try
    {
        oam.getSystemConfig("DBRootStorageType", DBRootStorageType);
    }
    catch (...)
    {
        // keep the "internal" default
    }

    //assign DBRoot if gluster
    if (DataRedundancyConfig == "y")
    {
        for ( vector<string>::iterator p = dbrootList.begin() ; p != dbrootList.end() ; ++p )
        {
            string dbrootID = *p;

            try
            {
                int ret = glusterAssign(dbrootID);

                if ( ret != 0 )
                    log.writeLog(__LINE__, "Error assigning gluster dbroot# " + dbrootID, LOG_TYPE_ERROR);
                else
                    log.writeLog(__LINE__, "Gluster assign gluster dbroot# " + dbrootID);
            }
            catch (...)
            {
                log.writeLog(__LINE__, "Exception assigning gluster dbroot# " + dbrootID, LOG_TYPE_ERROR);
            }
        }
    }

    if ( dbrootList.size() == 0 )
    {
        log.writeLog(__LINE__, "No dbroots are configured in Columnstore.xml file", LOG_TYPE_WARNING);
        return API_INVALID_PARAMETER;
    }

    if ( DBRootStorageType == "hdfs" ||
            (DBRootStorageType == "internal" && DataRedundancyConfig == "n") )
    {
        //create OAM-Test-Flag
        for ( vector<string>::iterator p = dbrootList.begin() ; p != dbrootList.end() ; ++p )
        {
            string dbroot = "/var/lib/columnstore/data" + *p;
            string fileName = dbroot + "/OAMdbrootCheck";
            ofstream fout(fileName.c_str());

            if (!fout)
            {
                log.writeLog(__LINE__, "ERROR: Failed test write to DBRoot: " + dbroot, LOG_TYPE_ERROR);
                return API_FAILURE;
            }
        }

        return API_SUCCESS;
    }
    else if (DBRootStorageType == "storagemanager")
    {
        /* StorageManager isn't running yet.  Can't check for writability here. */
        return API_SUCCESS;
    }

    //go unmount disk NOT assigned to this pm
    unmountExtraDBroots();

    //Flush the cache
    cacheutils::flushPrimProcCache();
    cacheutils::dropPrimProcFdCache();
    flushInodeCache();

    //external or gluster
    for ( vector<string>::iterator p = dbrootList.begin() ; p != dbrootList.end() ; ++p )
    {
        string dbroot = "/var/lib/columnstore/data" + *p;
        string fileName = dbroot + "/OAMdbrootCheck";

        if ( DataRedundancyConfig == "n" )
        {
            //remove any local check flag for starters
            string cmd = SUDO + "umount " + dbroot + " > " + tmpLogDir + "/umount.log 2>&1";
            system(cmd.c_str());

            unlink(fileName.c_str());

            // check if already/still mounted, skip if so
            // The mount point is a space-delimited field in /proc/mounts; matching it with
            // its surrounding spaces keeps ".../data1" from matching a mounted ".../data10".
            cmd = "grep ' " + dbroot + " ' /proc/mounts > /dev/null 2>&1";

            int status = system(cmd.c_str());

            if (WEXITSTATUS(status) != 0 )
            {
                // not mounted, mount
                string mountLog = tmpLogDir + "/mount.log";
                // LC_ALL=C so the "already" check below sees untranslated mount output
                cmd = "export LC_ALL=C;" + SUDO + "mount " + dbroot + " > " + mountLog + " 2>&1";
                system(cmd.c_str());

                ifstream in(mountLog.c_str());

                in.seekg(0, std::ios::end);
                int size = in.tellg();

                // any mount output other than an "already" (mounted) message is an error
                if ( size != 0 && !oam.checkLogStatus(mountLog, "already") )
                {
                    log.writeLog(__LINE__, "checkDataMount: mount failed, DBRoot: " + dbroot, LOG_TYPE_ERROR);

                    try
                    {
                        oam.setDbrootStatus(*p, oam::AUTO_OFFLINE);
                    }
                    catch (exception&)
                    {}

                    return API_FAILURE;
                }
            }

            if ( !rootUser)
            {
                cmd = SUDO + "chown -R " + USER + ":" + USER + " " + dbroot + " > /dev/null 2>&1";
                system(cmd.c_str());
            }

            log.writeLog(__LINE__, "checkDataMount: successfull mount " + dbroot, LOG_TYPE_DEBUG);
        }

        //create OAM-Test-Flag check rw mount
        ofstream fout(fileName.c_str());

        if (!fout)
        {
            log.writeLog(__LINE__, "ERROR: Failed test write to DBRoot: " + dbroot, LOG_TYPE_ERROR);

            try
            {
                oam.setDbrootStatus(*p, oam::AUTO_OFFLINE);
            }
            catch (exception&)
            {}

            return API_FAILURE;
        }

        try
        {
            oam.setDbrootStatus(*p, oam::ACTIVE);
        }
        catch (exception&)
        {}
    }

    return API_SUCCESS;
}
/******************************************************************************************
* @brief	calTotalUmMemory
*
* purpose:	Calculate TotalUmMemory
*
*           Reads the amount of physical memory with sysinfo() and stores a tiered value
*           ("256M" ... "16G") as HashJoin/TotalUmMemory in the system configuration.
*           Failures are logged; nothing is returned to the caller.
*
******************************************************************************************/
void ProcessMonitor::calTotalUmMemory()
{
    MonitorLog log;

    struct sysinfo myinfo;

    // sysinfo() is a C call: it reports failure through its return value, not by throwing
    if ( sysinfo(&myinfo) != 0 )
    {
        log.writeLog(__LINE__, "calTotalUmMemory: sysinfo failed, TotalUmMemory not changed", LOG_TYPE_ERROR);
        return;
    }

    //get memory stats
    // totalram is expressed in units of mem_unit bytes; guard against a zero unit
    unsigned long long memUnit = (myinfo.mem_unit != 0) ? myinfo.mem_unit : 1;
    long long total = static_cast<long long>(myinfo.totalram * memUnit / 1024 / 1000);

    // pick the TotalUmMemory tier for the amount of memory found
    string value;

    if ( total <= 2000 )
        value = "256M";
    else if ( total <= 4000 )
        value = "512M";
    else if ( total <= 8000 )
        value = "1G";
    else if ( total <= 16000 )
        value = "2G";
    else if ( total <= 32000 )
        value = "4G";
    else if ( total <= 64000 )
        value = "8G";
    else
        value = "16G";

    try
    {
        Config* sysConfig = Config::makeConfig();
        sysConfig->setConfig("HashJoin", "TotalUmMemory", value);

        //update Calpont Config table
        try
        {
            sysConfig->write();
        }
        catch (...)
        {
            log.writeLog(__LINE__, "ERROR: sysConfig->write", LOG_TYPE_ERROR);
            return;
        }

        log.writeLog(__LINE__, "set TotalUmMemory to " + value, LOG_TYPE_DEBUG);
    }
    catch (...)
    {
        log.writeLog(__LINE__, "Failed to set TotalUmMemory to "  + value, LOG_TYPE_ERROR);
    }

    return;
}
/******************************************************************************************
* @brief	flushInodeCache
*
* purpose:	flush cache
*
*           On Linux, writes "3\n" to /proc/sys/vm/drop_caches (per the kernel
*           documentation this frees the page cache plus dentries and inodes) and logs
*           the outcome. On other platforms this is a no-op.
*
******************************************************************************************/
void ProcessMonitor::flushInodeCache()
{
#ifdef __linux__
    // fd lives inside the #ifdef so non-Linux builds carry no unused local
    int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);

    if (fd < 0)
    {
        log.writeLog(__LINE__, "flushInodeCache failed to open file", LOG_TYPE_DEBUG);
        return;
    }

    if (write(fd, "3\n", 2) == 2)
    {
        log.writeLog(__LINE__, "flushInodeCache successful", LOG_TYPE_DEBUG);
    }
    else
    {
        log.writeLog(__LINE__, "flushInodeCache failed", LOG_TYPE_DEBUG);
    }

    close(fd);
#endif
}
/******************************************************************************************
* @brief glusterAssign
*
* purpose: Gluster Assign DBroot on local module
*
*
******************************************************************************************/
int ProcessMonitor::glusterAssign(std::string dbrootID)
{
Oam oam;
Config* sysConfig = Config::makeConfig();
string command;
std::string errmsg = "";
log.writeLog(__LINE__, "glusterAssign called : " + dbrootID, LOG_TYPE_DEBUG);
string pmid = oam.itoa(config.moduleID());
string dataDupIPaddr = "ModuleIPAddr" + pmid + "-1-3";
string moduleIPAddr = sysConfig->getConfig("DataRedundancyConfig", dataDupIPaddr);
if (moduleIPAddr.empty() || moduleIPAddr == oam::UnassignedIpAddr)
{
moduleIPAddr = sysConfig->getConfig("SystemModuleConfig", dataDupIPaddr);
}
string tmpLog = tmpLogDir + "/glusterAssign.log";
command = SUDO + "mount -tglusterfs -odirect-io-mode=enable " + moduleIPAddr + ":/dbroot" + dbrootID + " /var/lib/columnstore/data" + dbrootID + " > " + tmpLog + " 2>&1";
int ret = system(command.c_str());
if ( WEXITSTATUS(ret) != 0 )
{
//log.writeLog(__LINE__, "glusterAssign mount failure: dbroot: " + dbrootID + " error: " + oam.itoa(WEXITSTATUS(ret)), LOG_TYPE_ERROR);
ifstream in(tmpLog.c_str());
in.seekg(0, std::ios::end);
int size = in.tellg();
if ( size != 0 )
{
if (!oam.checkLogStatus(tmpLog, "already"))
{
log.writeLog(__LINE__, "glusterAssign failed.", LOG_TYPE_ERROR);
string cmd = "mv -f " + tmpLog + " " + tmpLog + "failed";
system(cmd.c_str());
return oam::API_FAILURE;
}
}
}
return oam::API_SUCCESS;
}
/******************************************************************************************
* @brief glusterAssign
*
* purpose: Gluster Assign DBroot on local module
*
*
******************************************************************************************/
int ProcessMonitor::glusterUnassign(std::string dbrootID)
{
Oam oam;
string command;
std::string errmsg = "";
log.writeLog(__LINE__, "glusterUnassign called: " + dbrootID, LOG_TYPE_DEBUG);
string tmpLog = tmpLogDir + "/glusterUnassign.log";
command = SUDO + "umount -f /var/lib/columnstore/data" + dbrootID + " > " + tmpLog + " 2>&1";
int ret = system(command.c_str());
if ( WEXITSTATUS(ret) != 0 )
{
//log.writeLog(__LINE__, "glusterUnassign mount failure: dbroot: " + dbrootID + " error: " + oam.itoa(WEXITSTATUS(ret)), LOG_TYPE_ERROR);
ifstream in(tmpLog.c_str());
in.seekg(0, std::ios::end);
int size = in.tellg();
if ( size != 0 )
{
if (!oam.checkLogStatus(tmpLog, "not mounted"))
{
log.writeLog(__LINE__, "glusterUnassign failed.", LOG_TYPE_ERROR);
string cmd = "mv -f " + tmpLog + " " + tmpLog + "failed";
system(cmd.c_str());
return oam::API_FAILURE;
}
}
}
return oam::API_SUCCESS;
}
int ProcessMonitor::syncFS()
{
Oam oam;
string DBRMroot;
oam.getSystemConfig("DBRMRoot", DBRMroot);
string currentFileName = DBRMroot + "_current";
IDBFileSystem &fs = IDBPolicy::getFs(currentFileName.c_str());
bool success = fs.filesystemSync();
if (!success)
return oam::API_FAILURE;
return oam::API_SUCCESS;
}
} //end of namespace
// vim:ts=4 sw=4: