1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-10 01:22:48 +03:00
Files
mariadb-columnstore-engine/oamapps/serverMonitor/hardwareMonitor.cpp
David Hill 4e6e5647ef MCOL-520
2018-09-21 10:40:05 -05:00

270 lines
8.5 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
* Copyright (C) 2016 MariaDB Corporation.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/***************************************************************************
* $Id: hardwareMonitor.cpp 34 2006-09-29 21:13:54Z dhill $
*
* Author: David Hill
***************************************************************************/
#include "serverMonitor.h"

#include <cstdlib>
#include <fstream>
#include <limits>
#include <string>
using namespace std;
using namespace oam;
using namespace alarmmanager;
using namespace logging;
using namespace servermonitor;
//using namespace procheartbeat;
extern string tmpDir;
/************************************************************************************************************
* @brief hardwareMonitor function
*
* purpose: Monitor Hardware and report problems
*
* Parses file generated by the ipmitool
*
* pattern = what it is | value | units | status | value 1 | value 2 | value 3 | value 4 | value 5 | value 6
* data(0) = what it is
* data(1) = value
* data(2) = units
* data(3) = status
* data(4)-data(9) = barrier values
* data(4) - low non-recoverable, i.e. fatal
* data(5) - low critical
* data(6) - low warning
* data(7) - high warning
* data(8) - high critical
* data(9) - high non-recoverable, i.e. fatal
*
************************************************************************************************************/
void hardwareMonitor(int IPMI_SUPPORT)
{
ServerMonitor serverMonitor;
string data[10];
string SensorName;
float SensorValue;
string Units;
string SensorStatus;
float lowFatal;
float lowCritical;
float lowWarning;
float highWarning;
float highCritical;
float highFatal;
char* p;
if ( IPMI_SUPPORT == 0)
{
string tmpharwareMonitor = tmpDir + "/harwareMonitor.txt";
string cmd = "ipmitool sensor list > > " + tmpharwareMonitor;
int returnCode = system(cmd.c_str());
if (returnCode)
{
// System error, Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Error running ipmitool sensor list!!!");
msg.format(args);
ml.logWarningMessage(msg);
while (TRUE)
sleep(10000);
}
}
else
{
while (TRUE)
sleep(10000);
}
// register for Heartbeat monitoring
/* try {
ProcHeartbeat procheartbeat;
procheartbeat.registerHeartbeat(HW_HEARTBEAT_ID);
}
catch (exception& ex)
{
string error = ex.what();
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("EXCEPTION ERROR on registerHeartbeat: ");
args.add(error);
msg.format(args);
ml.logErrorMessage(msg);
}
catch(...)
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
msg.format(args);
ml.logErrorMessage(msg);
}
*/
// loop forever reading the hardware status
while (TRUE)
{
// parse output file
ifstream File (tmpharwareMonitor);
if (!File)
{
// System error, Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Error opening harwareMonitor.txt!!!");
msg.format(args);
ml.logWarningMessage(msg);
sleep(300);
continue;
}
char line[200];
while (File.getline(line, 200))
{
// parse the line
int f = 0;
p = strtok(line, "|");
while (p)
{
data[f] = p;
data[f] = serverMonitor.StripWhitespace(data[f]);
p = strtok (NULL, "|");
f++;
}
if ( f == 0 )
// nothing on this line, skip
continue;
SensorName = data[0];
SensorValue = atof(data[1].c_str());
Units = data[2];
SensorStatus = data[3];
lowFatal = atof(data[4].c_str());
lowCritical = atof(data[5].c_str());
lowWarning = atof(data[6].c_str());
highWarning = atof(data[7].c_str());
highCritical = atof(data[8].c_str());
highFatal = atof(data[9].c_str());
// check status and issue apporiate alarm if needed
if ( (SensorStatus != "ok") && (SensorStatus != "nr") && (SensorStatus != "na") )
{
// Status error, check for warning or critical levels
if ( SensorValue >= highFatal )
{
// issue critical alarm and send message to shutdown Server
serverMonitor.sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue);
serverMonitor.sendMsgShutdownServer();
}
else if ( (SensorValue < highFatal) && (SensorValue >= highCritical) )
// issue major alarm
serverMonitor.sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue);
else if ( (SensorValue < highCritical ) && (SensorValue >= highWarning) )
// issue minor alarm
serverMonitor.sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue);
else if ( (SensorValue <= lowWarning) && (SensorValue > lowCritical) )
// issue minor alarm
serverMonitor.sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue);
else if ( (SensorValue <= lowCritical) && (SensorValue > lowFatal) )
// issue major alarm
serverMonitor.sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue);
else if ( SensorValue <= lowFatal )
{
// issue critical alarm and send message to shutdown Server
serverMonitor.sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue);
serverMonitor.sendMsgShutdownServer();
}
else
// check if there are any active alarms that needs to be cleared
serverMonitor.checkAlarm(SensorName);
}
else
// check if there are any active alarms that needs to be cleared
serverMonitor.checkAlarm(SensorName);
} //end of parsing file while
File.close();
// send heartbeat message
/* try {
ProcHeartbeat procheartbeat;
procheartbeat.sendHeartbeat(HW_HEARTBEAT_ID);
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Sent Heartbeat Msg");
msg.format(args);
ml.logDebugMessage(msg);
}
catch (exception& ex)
{
string error = ex.what();
if ( error.find("Disabled") == string::npos ) {
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("EXCEPTION ERROR on sendHeartbeat: ");
args.add(error);
msg.format(args);
ml.logErrorMessage(msg);
}
}
catch(...)
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
msg.format(args);
ml.logErrorMessage(msg);
}
*/
// sleep
sleep(MONITOR_PERIOD);
} //end of forever while loop
}