1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-08 14:22:09 +03:00
Files
mariadb-columnstore-engine/oamapps/serverMonitor/diskMonitor.cpp

757 lines
26 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
Copyright (C) 2016 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/***************************************************************************
* $Id: diskMonitor.cpp 34 2006-09-29 21:13:54Z dhill $
*
* Author: Zhixuan Zhu
***************************************************************************/
#include "serverMonitor.h"
#include "installdir.h"
using namespace std;
using namespace oam;
using namespace alarmmanager;
using namespace logging;
using namespace servermonitor;
using namespace config;
//using namespace procheartbeat;
// File-scope list of disk usage entries. diskMonitor() appends one SMSystemDisk
// entry per checked local file system / DBRoot during a disk-space pass and
// clears the list when its pass counter wraps around.
SystemDiskList sdl;
// One monitored DBRoot as tracked by diskMonitor().
struct DBrootData_struct
{
    // Directory configured for the DBRoot
    std::string dbrootDir;
    // Set to true once loss of access to dbrootDir has been reported
    bool downFlag;
};

// Short alias used throughout this file
typedef struct DBrootData_struct DBrootData;

// Collection of the DBRoots being monitored
typedef std::vector<DBrootData> DBrootList;
/*****************************************************************************************
* @brief diskMonitor Thread
*
* purpose: Get current Local and External disk usage and report alarms
*
*****************************************************************************************/
void diskMonitor()
{
ServerMonitor serverMonitor;
Oam oam;
SystemConfig systemConfig;
ModuleTypeConfig moduleTypeConfig;
typedef std::vector<std::string> LocalFileSystems;
LocalFileSystems lfs;
struct statvfs buf;
// set defaults
int localDiskCritical = 90,
localDiskMajor = 80,
localDiskMinor = 70,
ExternalDiskCritical = 90,
ExternalDiskMajor = 80,
ExternalDiskMinor = 70;
// get module types
string moduleType;
int moduleID = -1;
string moduleName;
oamModuleInfo_t t;
try
{
t = oam.getModuleInfo();
moduleType = boost::get<1>(t);
moduleID = boost::get<2>(t);
moduleName = boost::get<0>(t);
}
catch (exception& e) {}
bool Externalflag = false;
string cloud = oam::UnassignedName;
try
{
oam.getSystemConfig( "Cloud", cloud);
}
catch (...)
{
cloud = oam::UnassignedName;
}
//get Gluster Config setting
string DataRedundancyConfig = "n";
try
{
oam.getSystemConfig( "DataRedundancyConfig", DataRedundancyConfig);
}
catch (...)
{
DataRedundancyConfig = "n";
}
int diskSpaceCheck = 0;
while (true)
{
//check for external disk
DBrootList dbrootList;
if (moduleType == "pm")
{
systemStorageInfo_t t;
t = oam.getStorageConfig();
if ( boost::get<0>(t) == "external")
Externalflag = true;
// get dbroot list and storage type from config file
DBRootConfigList dbrootConfigList;
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
DBRootConfigList::iterator pt = dbrootConfigList.begin();
for ( ; pt != dbrootConfigList.end() ; pt++)
{
int dbrootID = *pt;
string dbroot = "DBRoot" + oam.itoa(dbrootID);
string dbootdir;
try
{
oam.getSystemConfig(dbroot, dbootdir);
}
catch (...) {}
if ( dbootdir.empty() || dbootdir == "" )
continue;
DBrootData dbrootData;
dbrootData.dbrootDir = dbootdir;
dbrootData.downFlag = false;
dbrootList.push_back(dbrootData);
}
}
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus);
}
catch (exception& ex)
{}
if (systemstatus.SystemOpState != oam::ACTIVE )
{
sleep(5);
continue;
}
// Get Local/External Disk Mount points to monitor and associated thresholds
try
{
oam.getSystemConfig (moduleTypeConfig);
localDiskCritical = moduleTypeConfig.ModuleDiskCriticalThreshold;
localDiskMajor = moduleTypeConfig.ModuleDiskMajorThreshold;
localDiskMinor = moduleTypeConfig.ModuleDiskMinorThreshold;
DiskMonitorFileSystems::iterator p = moduleTypeConfig.FileSystems.begin();
for ( ; p != moduleTypeConfig.FileSystems.end() ; p++)
{
string fs = *p;
lfs.push_back(fs);
if (DISK_DEBUG)
{
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Config File System to monitor =");
args.add(fs);
msg.format(args);
ml.logDebugMessage(msg);
}
}
}
catch (...)
{
sleep(5);
continue;
}
// get External info
try
{
oam.getSystemConfig(systemConfig);
}
catch (...)
{
sleep(5);
continue;
}
if (Externalflag)
{
// get External info
try
{
ExternalDiskCritical = systemConfig.ExternalCriticalThreshold;
ExternalDiskMajor = systemConfig.ExternalMajorThreshold;
ExternalDiskMinor = systemConfig.ExternalMinorThreshold;
}
catch (...)
{
sleep(5);
continue;
}
}
if ( diskSpaceCheck == 0 )
{
//check for local file systems
LocalFileSystems::iterator p = lfs.begin();
while (p != lfs.end())
{
string deviceName = *p;
++p;
string fileName;
// check local
if ( deviceName == "/")
{
fileName = deviceName + startup::StartUp::installDir();
}
else
{
fileName = deviceName + "/000.dir";
}
uint64_t totalBlocks;
uint64_t usedBlocks;
if (!statvfs(fileName.c_str(), &buf))
{
uint64_t blksize, blocks, freeblks, free;
blksize = buf.f_bsize;
blocks = buf.f_blocks;
freeblks = buf.f_bavail;
totalBlocks = blocks * blksize;
free = freeblks * blksize;
usedBlocks = totalBlocks - free;
}
else
continue;
int64_t diskUsage = 0;
if ( totalBlocks == 0 )
{
diskUsage = 0;
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Total Disk Usage is set to 0");
msg.format(args);
ml.logWarningMessage(msg);
}
else
diskUsage = (usedBlocks / (totalBlocks / 100)) + 1;
SMSystemDisk sd;
sd.deviceName = deviceName;
sd.usedPercent = diskUsage;
sd.totalBlocks = totalBlocks;
sd.usedBlocks = usedBlocks;
sdl.push_back(sd);
if (DISK_DEBUG)
cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl;
if (diskUsage >= localDiskCritical && localDiskCritical > 0 )
{
//adjust if over 100%
if ( diskUsage > 100 )
diskUsage = 100;
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, (int) diskUsage) )
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Disk above Critical Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= localDiskMajor && localDiskMajor > 0 )
{
if (serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, (int) diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Disk above Major Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= localDiskMinor && localDiskMinor > 0 )
{
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, (int) diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Disk above Minor Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else
serverMonitor.checkDiskAlarm(deviceName);
}
//check for external file systems/devices
if (Externalflag ||
(!Externalflag && DataRedundancyConfig == "y" && moduleType == "pm") )
{
try
{
DBRootConfigList dbrootConfigList;
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
DBRootConfigList::iterator pt = dbrootConfigList.begin();
for ( ; pt != dbrootConfigList.end() ; pt++)
{
int dbroot = *pt;
string deviceName = systemConfig.DBRoot[dbroot - 1];
string fileName = deviceName + "/000.dir";
if (DISK_DEBUG)
{
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("DBRoots monitoring");
args.add(dbroot);
args.add(" ,file system =" );
args.add(fileName);
msg.format(args);
ml.logDebugMessage(msg);
}
uint64_t totalBlocks;
uint64_t usedBlocks;
if (!statvfs(fileName.c_str(), &buf))
{
uint64_t blksize, blocks, freeblks, free;
blksize = buf.f_bsize;
blocks = buf.f_blocks;
freeblks = buf.f_bavail;
totalBlocks = blocks * blksize;
free = freeblks * blksize;
usedBlocks = totalBlocks - free;
}
else
{
SMSystemDisk sd;
sd.deviceName = deviceName;
sd.usedPercent = 0;
sd.totalBlocks = 0;
sd.usedBlocks = 0;
sdl.push_back(sd);
continue;
}
int diskUsage = 0;
if ( totalBlocks == 0 )
{
diskUsage = 0;
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Total Disk Usage is set to 0");
msg.format(args);
ml.logWarningMessage(msg);
}
else
diskUsage = (usedBlocks / (totalBlocks / 100)) + 1;
SMSystemDisk sd;
sd.deviceName = deviceName;
sd.usedPercent = diskUsage;
sd.totalBlocks = totalBlocks;
sd.usedBlocks = usedBlocks;
sdl.push_back(sd);
if (DISK_DEBUG)
cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl;
if (diskUsage >= ExternalDiskCritical && ExternalDiskCritical > 0 )
{
//adjust if over 100%
if ( diskUsage > 100 )
diskUsage = 100;
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Disk usage for");
args.add(deviceName);
args.add(" above Critical Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= ExternalDiskMajor && ExternalDiskMajor > 0 )
{
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Disk usage for");
args.add(deviceName);
args.add(" above Major Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= ExternalDiskMinor && ExternalDiskMinor > 0 )
{
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Disk usage for");
args.add(deviceName);
args.add(" above Minor Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else
serverMonitor.checkDiskAlarm(deviceName);
}
}
catch (exception& e)
{
cout << endl << "**** getPmDbrootConfig Failed : " << e.what() << endl;
}
}
}
//check OAM dbroot test flag to validate dbroot exist if on pm
if ( moduleName.find("pm") != string::npos )
{
//check OAM dbroot test flag to validate dbroot exist
if ( dbrootList.size() != 0 )
{
DBrootList::iterator p = dbrootList.begin();
while ( p != dbrootList.end() )
{
//get dbroot directory
string dbrootDir = (*p).dbrootDir;
string dbrootName;
string dbrootID;
//get dbroot name
string::size_type pos = dbrootDir.rfind("/", 80);
if (pos != string::npos)
dbrootName = dbrootDir.substr(pos + 1, 80);
//get ID
dbrootID = dbrootName.substr(4, 80);
string fileName = dbrootDir + "/OAMdbrootCheck";
// retry in case we hit the remount window
for ( int retry = 0 ; ; retry++ )
{
bool fail = false;
//first test, check if OAMdbrootCheck exists
ifstream file (fileName.c_str());
if (!file)
fail = true;
else
fail = false;
if (fail)
{
//double check system status before reporting any error BUG 5078
SystemStatus systemstatus;
try
{
oam.getSystemStatus(systemstatus);
}
catch (exception& ex)
{}
if (systemstatus.SystemOpState != oam::ACTIVE )
{
break;
}
if ( retry < 10 )
{
sleep(3);
continue;
}
else
{
if ( !(*p).downFlag )
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("dbroot monitoring: Lost access to ");
args.add(dbrootDir);
msg.format(args);
ml.logWarningMessage(msg);
oam.sendDeviceNotification(dbrootName, DBROOT_DOWN, moduleName);
(*p).downFlag = true;
try
{
oam.setDbrootStatus(dbrootID, oam::AUTO_OFFLINE);
}
catch (exception& ex)
{}
break;
}
}
}
else
{
if ( (*p).downFlag )
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("dbroot monitoring: Access back to ");
args.add(dbrootDir);
msg.format(args);
ml.logInfoMessage(msg);
oam.sendDeviceNotification(dbrootName, DBROOT_UP, moduleName);
(*p).downFlag = false;
try
{
oam.setDbrootStatus(dbrootID, oam::ACTIVE);
}
catch (exception& ex)
{}
}
file.close();
break;
}
}
p++;
}
}
}
//do Gluster status check, if configured
if ( DataRedundancyConfig == "y")
{
bool pass = true;
string errmsg = "unknown";
try
{
string arg1 = "";
string arg2 = "";
int ret = oam.glusterctl(oam::GLUSTER_STATUS, arg1, arg2, errmsg);
if ( ret != 0 )
{
cerr << "FAILURE: Status check error: " + errmsg << endl;
pass = false;
}
}
catch (exception& e)
{
cerr << endl << "**** glusterctl API exception: " << e.what() << endl;
cerr << "FAILURE: Status check error" << endl;
pass = false;
}
catch (...)
{
cerr << endl << "**** glusterctl API exception: UNKNOWN" << endl;
cerr << "FAILURE: Status check error" << endl;
pass = false;
}
if ( !pass )
{
// issue log and alarm
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Gluster Status check failure error msg: ");
args.add(errmsg);
msg.format(args);
ml.logWarningMessage(msg);
serverMonitor.sendResourceAlarm(errmsg, GLUSTER_DISK_FAILURE, SET, 0);
}
}
// sleep 30 seconds
sleep(MONITOR_PERIOD / 3);
//check disk space every 10 minutes
diskSpaceCheck++;
if ( diskSpaceCheck >= 20 )
{
diskSpaceCheck = 0;
lfs.clear();
sdl.clear();
}
} // end of while loop
}
/******************************************************************************************
* @brief	checkDiskAlarm
*
* purpose:	Clear disk usage alarms that are active for 'alarmItem' on this
*			server. 'alarmID' selects which severities are examined:
*
*			  ALARM_NONE      - high, medium and low
*			  DISK_USAGE_LOW  - high and medium
*			  DISK_USAGE_MED  - high only
*			  anything else   - nothing
*
******************************************************************************************/
void ServerMonitor::checkDiskAlarm(string alarmItem, ALARMS alarmID)
{
    Oam oam;
    ServerMonitor serverMonitor;

    // Resolve the name of the server we are running on; fall back to a
    // placeholder when the module info cannot be read.
    string serverName;
    oamModuleInfo_t moduleInfo;

    try
    {
        moduleInfo = oam.getModuleInfo();
        serverName = boost::get<0>(moduleInfo);
    }
    catch (...)
    {
        serverName = "Unknown Server";
    }

    // Decide up front which severities this call is allowed to clear.
    const bool clearAll = (alarmID == ALARM_NONE);
    const bool examineHigh = clearAll || (alarmID == DISK_USAGE_LOW) || (alarmID == DISK_USAGE_MED);
    const bool examineMed = clearAll || (alarmID == DISK_USAGE_LOW);
    const bool examineLow = clearAll;

    // Severities are always processed from high to low; an alarm is cleared
    // only when it is currently active.
    if ( examineHigh && serverMonitor.checkActiveAlarm(DISK_USAGE_HIGH, serverName, alarmItem) )
    {
        clearAlarm(alarmItem, DISK_USAGE_HIGH);
    }

    if ( examineMed && serverMonitor.checkActiveAlarm(DISK_USAGE_MED, serverName, alarmItem) )
    {
        clearAlarm(alarmItem, DISK_USAGE_MED);
    }

    if ( examineLow && serverMonitor.checkActiveAlarm(DISK_USAGE_LOW, serverName, alarmItem) )
    {
        clearAlarm(alarmItem, DISK_USAGE_LOW);
    }
}