1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-04 04:42:30 +03:00
Files
mariadb-columnstore-engine/oamapps/hardwareMonitor/hardwareMonitor.cpp
2017-10-26 17:18:17 +01:00

413 lines
12 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/***************************************************************************
* $Id: hardwareMonitor.cpp 34 2006-09-29 21:13:54Z dhill $
*
* Author: David Hill
***************************************************************************/
#include "hardwareMonitor.h"
using namespace std;
using namespace oam;
using namespace alarmmanager;
using namespace logging;
/************************************************************************************************************
* @brief main function
*
* purpose: Get current hardware status and report alarms
*
* Parses file generated by the ipmitool
*
* pattern = what it is | value | units | status | value 1 | value 2 | value 3 | value 4 | value 5 | value 6
* data(0) = what it is
* data(1) = value
* data(2) = units
* data(3) = status
* data(4)-data(9) = barrier values
* data(4) - low non-recoverable, i.e. fatal
* data(5) - low critical
* data(6) - low warning
* data(7) - high warning
* data(8) - high critical
* data(9) - high non-recoverable, i.e. fatal
*
************************************************************************************************************/
int main (int argc, char** argv)
{
string data[10];
string SensorName;
float SensorValue;
string Units;
string SensorStatus;
float lowFatal;
float lowCritical;
float lowWarning;
float highWarning;
float highCritical;
float highFatal;
char* p;
// check for IPMI_SUPPORT FLAG passed in
if (argc > 1)
IPMI_SUPPORT = atoi(argv[1]);
// loop forever reading the hardware status
while (true)
{
if ( IPMI_SUPPORT == 0)
{
int returnCode = system("ipmitool sensor list > /tmp/harwareMonitor.txt");
if (returnCode)
{
// System error, Log this event
LoggingID lid;
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Error running ipmitool sensor list!!!");
msg.format(args);
ml.logWarningMessage(msg);
sleep(300);
continue;
}
}
// parse output file
ifstream File ("/tmp/harwareMonitor.txt");
if (!File)
{
// System error, Log this event
LoggingID lid;
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Error opening /tmp/harwareMonitor.txt!!!");
msg.format(args);
ml.logWarningMessage(msg);
sleep(300);
continue;
}
char line[200];
while (File.getline(line, 200))
{
// parse the line
int f = 0;
p = strtok(line, "|");
while (p)
{
data[f] = p;
data[f] = StripWhitespace(data[f]);
p = strtok (NULL, "|");
f++;
}
if ( f == 0 )
// nothing on this line, skip
continue;
SensorName = data[0];
SensorValue = atof(data[1].c_str());
Units = data[2];
SensorStatus = data[3];
lowFatal = atof(data[4].c_str());
lowCritical = atof(data[5].c_str());
lowWarning = atof(data[6].c_str());
highWarning = atof(data[7].c_str());
highCritical = atof(data[8].c_str());
highFatal = atof(data[9].c_str());
// check status and issue apporiate alarm if needed
if ( (SensorStatus != "ok") && (SensorStatus != "nr") && (SensorStatus != "na") )
{
// Status error, check for warning or critical levels
if ( SensorValue >= highFatal )
{
// issue critical alarm and send message to shutdown Server
sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue);
sendMsgShutdownServer();
}
else if ( (SensorValue < highFatal) && (SensorValue >= highCritical) )
// issue major alarm
sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue);
else if ( (SensorValue < highCritical ) && (SensorValue >= highWarning) )
// issue minor alarm
sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue);
else if ( (SensorValue <= lowWarning) && (SensorValue > lowCritical) )
// issue minor alarm
sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue);
else if ( (SensorValue <= lowCritical) && (SensorValue > lowFatal) )
// issue major alarm
sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue);
else if ( SensorValue <= lowFatal )
{
// issue critical alarm and send message to shutdown Server
sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue);
sendMsgShutdownServer();
}
else
// check if there are any active alarms that needs to be cleared
checkAlarm(SensorName);
}
else
// check if there are any active alarms that needs to be cleared
checkAlarm(SensorName);
} //end of parsing file while
File.close();
// sleep for 1 minute
sleep(60);
} //end of forever while loop
}
/******************************************************************************************
* @brief sendAlarm
*
* purpose: send a trap and log the process information
*
******************************************************************************************/
void sendAlarm(string alarmItem, ALARMS alarmID, int action, float sensorValue)
{
Oam oam;
//Log this event
LoggingID lid;
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add(alarmItem);
args.add(", sensor value out-of-range: ");
args.add(sensorValue);
// get current server name
string serverName;
oamServerInfo_t st;
try
{
st = oam.getServerInfo();
serverName = boost::get<0>(st);
}
catch (...)
{
serverName = "Unknown Server";
}
// check if there is an active alarm above the reporting theshold
// that needs to be cleared
checkAlarm(alarmItem, alarmID);
// check if Alarm is already active, don't resend
if ( !( oam.checkActiveAlarm(alarmID, serverName, alarmItem)) )
{
ALARMManager alarmMgr;
// send alarm
alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action);
args.add(", Alarm set: ");
args.add(alarmID);
}
// output log
msg.format(args);
ml.logWarningMessage(msg);
return;
}
/******************************************************************************************
* @brief checkAlarm
*
* purpose: check to see if an alarm(s) is set on device and clear if so
*
******************************************************************************************/
void checkAlarm(string alarmItem, ALARMS alarmID)
{
Oam oam;
// get current server name
string serverName;
oamServerInfo_t st;
try
{
st = oam.getServerInfo();
serverName = boost::get<0>(st);
}
catch (...)
{
serverName = "Unknown Server";
}
switch (alarmID)
{
case ALARM_NONE: // clear all alarms set if any found
if ( oam.checkActiveAlarm(HARDWARE_HIGH, serverName, alarmItem) )
// alarm set, clear it
clearAlarm(alarmItem, HARDWARE_HIGH);
if ( oam.checkActiveAlarm(HARDWARE_MED, serverName, alarmItem) )
// alarm set, clear it
clearAlarm(alarmItem, HARDWARE_MED);
if ( oam.checkActiveAlarm(HARDWARE_LOW, serverName, alarmItem) )
// alarm set, clear it
clearAlarm(alarmItem, HARDWARE_LOW);
break;
case HARDWARE_LOW: // clear high and medium alarms set if any found
if ( oam.checkActiveAlarm(HARDWARE_HIGH, serverName, alarmItem) )
// alarm set, clear it
clearAlarm(alarmItem, HARDWARE_HIGH);
if ( oam.checkActiveAlarm(HARDWARE_MED, serverName, alarmItem) )
// alarm set, clear it
clearAlarm(alarmItem, HARDWARE_MED);
break;
case HARDWARE_MED: // clear high alarms set if any found
if ( oam.checkActiveAlarm(HARDWARE_HIGH, serverName, alarmItem) )
// alarm set, clear it
clearAlarm(alarmItem, HARDWARE_HIGH);
break;
default: // none to clear
break;
} // end of switch
return;
}
/******************************************************************************************
* @brief clearAlarm
*
* purpose: clear Alarm that was previously set
*
******************************************************************************************/
void clearAlarm(string alarmItem, ALARMS alarmID)
{
ALARMManager alarmMgr;
alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, CLEAR);
//Log this event
LoggingID lid;
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add(alarmItem);
args.add(" alarm #");
args.add(alarmID);
args.add("cleared");
msg.format(args);
ml.logWarningMessage(msg);
}
/******************************************************************************************
* @brief sendMsgShutdownServer
*
* purpose: send a Message to Shutdown server
*
******************************************************************************************/
void sendMsgShutdownServer()
{
Oam oam;
//Log this event
LoggingID lid;
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Fatal Hardware Alarm detected, Server being shutdown");
msg.format(args);
ml.logCriticalMessage(msg);
string serverName;
oamServerInfo_t st;
try
{
st = oam.getServerInfo();
serverName = boost::get<0>(st);
}
catch (...)
{
// o well, let's take out own action
if ( IPMI_SUPPORT == 0)
system("init 0");
}
try
{
oam.shutdownServer(serverName, FORCEFUL, ACK_NO);
}
catch (exception& e)
{
// o well, let's take out own action
if ( IPMI_SUPPORT == 0)
system("init 0");
}
}
/******************************************************************************************
* @brief StripWhitespace
*
* purpose: strip off whitespaces from a string
*
******************************************************************************************/
string StripWhitespace(string value)
{
for (;;)
{
string::size_type pos = value.find (' ', 0);
if (pos == string::npos)
// no more found
break;
// strip leading
if (pos == 0)
{
value = value.substr (pos + 1, 10000);
}
else
{
// strip trailing
value = value.substr (0, pos);
}
}
return value;
}