1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-12-12 11:01:17 +03:00
Files
mariadb-columnstore-engine/oamapps/serverMonitor/diskMonitor.cpp
Andrew Hutchings be83194c31 MCOL-2273 Improve disk usage reporting
Use the amount of available space rather than the amount of free space.
The two numbers are usually different because the free blocks may be
unusable at the time.
2019-05-02 10:32:49 +01:00

677 lines
18 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
Copyright (C) 2016 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/***************************************************************************
* $Id: diskMonitor.cpp 34 2006-09-29 21:13:54Z dhill $
*
* Author: Zhixuan Zhu
***************************************************************************/
#include "serverMonitor.h"
#include "installdir.h"
using namespace std;
using namespace oam;
using namespace alarmmanager;
using namespace logging;
using namespace servermonitor;
using namespace config;
//using namespace procheartbeat;
// Disk usage samples gathered by diskMonitor(): one SMSystemDisk entry is
// appended per checked file system and the list is emptied when the disk
// check cycle restarts.
// NOTE(review): no locking around this list is visible in this file - confirm
// how other readers of sdl synchronize with the monitor thread.
SystemDiskList sdl;
// Bookkeeping record for one DBRoot that diskMonitor() probes for access.
struct DBrootData_struct
{
    // Directory configured for the DBRoot.
    std::string dbrootDir;
    // True once loss of access has been reported; reset when access returns.
    bool downFlag;
};

typedef DBrootData_struct DBrootData;

// List of the DBRoots probed during one monitoring pass.
typedef std::vector<DBrootData> DBrootList;
/*****************************************************************************************
* @brief diskMonitor Thread
*
* purpose: Get current Local and External disk usage and report alarms
*
*****************************************************************************************/
void diskMonitor()
{
ServerMonitor serverMonitor;
Oam oam;
SystemConfig systemConfig;
ModuleTypeConfig moduleTypeConfig;
typedef std::vector<std::string> LocalFileSystems;
LocalFileSystems lfs;
struct statvfs buf;
// set defaults
int localDiskCritical = 90,
localDiskMajor = 80,
localDiskMinor = 70,
ExternalDiskCritical = 90,
ExternalDiskMajor = 80,
ExternalDiskMinor = 70;
// get module types
string moduleType;
int moduleID=-1;
string moduleName;
oamModuleInfo_t t;
try {
t = oam.getModuleInfo();
moduleType = boost::get<1>(t);
moduleID = boost::get<2>(t);
moduleName = boost::get<0>(t);
}
catch (exception& e) {}
bool Externalflag = false;
string cloud = oam::UnassignedName;
try {
oam.getSystemConfig( "Cloud", cloud);
}
catch(...) {
cloud = oam::UnassignedName;
}
//get Gluster Config setting
string DataRedundancyConfig = "n";
try {
oam.getSystemConfig( "DataRedundancyConfig", DataRedundancyConfig);
}
catch(...)
{
DataRedundancyConfig = "n";
}
int diskSpaceCheck = 0;
while(true)
{
//check for external disk
DBrootList dbrootList;
if (moduleType == "pm") {
systemStorageInfo_t t;
t = oam.getStorageConfig();
if ( boost::get<0>(t) == "external")
Externalflag = true;
// get dbroot list and storage type from config file
DBRootConfigList dbrootConfigList;
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
DBRootConfigList::iterator pt = dbrootConfigList.begin();
for( ; pt != dbrootConfigList.end() ; pt++)
{
int dbrootID = *pt;
string dbroot = "DBRoot" + oam.itoa(dbrootID);
string dbootdir;
try{
oam.getSystemConfig(dbroot, dbootdir);
}
catch(...) {}
if ( dbootdir.empty() || dbootdir == "" )
continue;
DBrootData dbrootData;
dbrootData.dbrootDir = dbootdir;
dbrootData.downFlag = false;
dbrootList.push_back(dbrootData);
}
}
SystemStatus systemstatus;
try {
oam.getSystemStatus(systemstatus);
}
catch (exception& ex)
{}
if (systemstatus.SystemOpState != oam::ACTIVE ) {
sleep(5);
continue;
}
// Get Local/External Disk Mount points to monitor and associated thresholds
try {
oam.getSystemConfig (moduleTypeConfig);
localDiskCritical = moduleTypeConfig.ModuleDiskCriticalThreshold;
localDiskMajor = moduleTypeConfig.ModuleDiskMajorThreshold;
localDiskMinor = moduleTypeConfig.ModuleDiskMinorThreshold;
DiskMonitorFileSystems::iterator p = moduleTypeConfig.FileSystems.begin();
for( ; p != moduleTypeConfig.FileSystems.end() ; p++)
{
string fs = *p;
lfs.push_back(fs);
if (DISK_DEBUG) {
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Config File System to monitor =");
args.add(fs);
msg.format(args);
ml.logDebugMessage(msg);
}
}
} catch (...)
{
sleep(5);
continue;
}
// get External info
try
{
oam.getSystemConfig(systemConfig);
} catch (...)
{
sleep(5);
continue;
}
if (Externalflag) {
// get External info
try
{
ExternalDiskCritical = systemConfig.ExternalCriticalThreshold;
ExternalDiskMajor = systemConfig.ExternalMajorThreshold;
ExternalDiskMinor = systemConfig.ExternalMinorThreshold;
} catch (...)
{
sleep(5);
continue;
}
}
if ( diskSpaceCheck == 0 )
{
//check for local file systems
LocalFileSystems::iterator p = lfs.begin();
while(p != lfs.end())
{
string deviceName = *p;
++p;
string fileName;
// check local
if ( deviceName == "/") {
fileName = deviceName + startup::StartUp::installDir();
}
else
{
fileName = deviceName + "/000.dir";
}
uint64_t totalBlocks;
uint64_t usedBlocks;
if (!statvfs(fileName.c_str(), &buf)) {
uint64_t blksize, blocks, freeblks, free;
blksize = buf.f_bsize;
blocks = buf.f_blocks;
freeblks = buf.f_bavail;
totalBlocks = blocks * blksize;
free = freeblks * blksize;
usedBlocks = totalBlocks - free;
}
else
continue;
int64_t diskUsage = 0;
if ( totalBlocks == 0 ) {
diskUsage = 0;
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Total Disk Usage is set to 0");
msg.format(args);
ml.logWarningMessage(msg);
}
else
diskUsage = (usedBlocks / (totalBlocks / 100)) + 1;
SMSystemDisk sd;
sd.deviceName = deviceName;
sd.usedPercent = diskUsage;
sd.totalBlocks = totalBlocks;
sd.usedBlocks = usedBlocks;
sdl.push_back(sd);
if (DISK_DEBUG)
cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl;
if (diskUsage >= localDiskCritical && localDiskCritical > 0 ) {
//adjust if over 100%
if ( diskUsage > 100 )
diskUsage = 100;
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, (int) diskUsage) )
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Disk above Critical Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= localDiskMajor && localDiskMajor > 0 ) {
if (serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, (int) diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Disk above Major Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= localDiskMinor && localDiskMinor > 0 ) {
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, (int) diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Disk above Minor Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else
serverMonitor.checkDiskAlarm(deviceName);
}
//check for external file systems/devices
if (Externalflag ||
(!Externalflag && DataRedundancyConfig == "y" && moduleType == "pm") ){
try
{
DBRootConfigList dbrootConfigList;
oam.getPmDbrootConfig(moduleID, dbrootConfigList);
DBRootConfigList::iterator pt = dbrootConfigList.begin();
for( ; pt != dbrootConfigList.end() ; pt++)
{
int dbroot = *pt;
string deviceName = systemConfig.DBRoot[dbroot-1];
string fileName = deviceName + "/000.dir";
if (DISK_DEBUG) {
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("DBRoots monitoring");
args.add(dbroot);
args.add(" ,file system =" );
args.add(fileName);
msg.format(args);
ml.logDebugMessage(msg);
}
uint64_t totalBlocks;
uint64_t usedBlocks;
if (!statvfs(fileName.c_str(), &buf)) {
uint64_t blksize, blocks, freeblks, free;
blksize = buf.f_bsize;
blocks = buf.f_blocks;
freeblks = buf.f_bavail;
totalBlocks = blocks * blksize;
free = freeblks * blksize;
usedBlocks = totalBlocks - free;
}
else
{
SMSystemDisk sd;
sd.deviceName = deviceName;
sd.usedPercent = 0;
sd.totalBlocks = 0;
sd.usedBlocks = 0;
sdl.push_back(sd);
continue;
}
int diskUsage = 0;
if ( totalBlocks == 0 ) {
diskUsage = 0;
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Total Disk Usage is set to 0");
msg.format(args);
ml.logWarningMessage(msg);
}
else
diskUsage = (usedBlocks / (totalBlocks / 100)) + 1;
SMSystemDisk sd;
sd.deviceName = deviceName;
sd.usedPercent = diskUsage;
sd.totalBlocks = totalBlocks;
sd.usedBlocks = usedBlocks;
sdl.push_back(sd);
if (DISK_DEBUG)
cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl;
if (diskUsage >= ExternalDiskCritical && ExternalDiskCritical > 0 ) {
//adjust if over 100%
if ( diskUsage > 100 )
diskUsage = 100;
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Disk usage for");
args.add(deviceName);
args.add(" above Critical Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= ExternalDiskMajor && ExternalDiskMajor > 0 ) {
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Disk usage for");
args.add(deviceName);
args.add(" above Major Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else if (diskUsage >= ExternalDiskMinor && ExternalDiskMinor > 0 ) {
if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, diskUsage))
{
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Disk usage for");
args.add(deviceName);
args.add(" above Minor Disk threshold with a percentage of ");
args.add((int) diskUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
else
serverMonitor.checkDiskAlarm(deviceName);
}
}
catch (exception& e)
{
cout << endl << "**** getPmDbrootConfig Failed : " << e.what() << endl;
}
}
}
//check OAM dbroot test flag to validate dbroot exist if on pm
if ( moduleName.find("pm") != string::npos ) {
//check OAM dbroot test flag to validate dbroot exist
if ( dbrootList.size() != 0 ) {
DBrootList::iterator p = dbrootList.begin();
while ( p != dbrootList.end() )
{
//get dbroot directory
string dbrootDir = (*p).dbrootDir;
string dbrootName;
string dbrootID;
//get dbroot name
string::size_type pos = dbrootDir.rfind("/",80);
if (pos != string::npos)
dbrootName = dbrootDir.substr(pos+1,80);
//get ID
dbrootID = dbrootName.substr(4,80);
string fileName = dbrootDir + "/OAMdbrootCheck";
// retry in case we hit the remount window
for ( int retry = 0 ; ; retry++ )
{
bool fail = false;
//first test, check if OAMdbrootCheck exists
ifstream file (fileName.c_str());
if (!file)
fail = true;
else
fail = false;
if (fail) {
//double check system status before reporting any error BUG 5078
SystemStatus systemstatus;
try {
oam.getSystemStatus(systemstatus);
}
catch (exception& ex)
{}
if (systemstatus.SystemOpState != oam::ACTIVE ) {
break;
}
if ( retry < 10 ) {
sleep(3);
continue;
}
else
{
if ( !(*p).downFlag ) {
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("dbroot monitoring: Lost access to ");
args.add(dbrootDir);
msg.format(args);
ml.logWarningMessage(msg);
oam.sendDeviceNotification(dbrootName, DBROOT_DOWN, moduleName);
(*p).downFlag = true;
try{
oam.setDbrootStatus(dbrootID, oam::AUTO_OFFLINE);
}
catch (exception& ex)
{}
break;
}
}
}
else
{
if ( (*p).downFlag ) {
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("dbroot monitoring: Access back to ");
args.add(dbrootDir);
msg.format(args);
ml.logInfoMessage(msg);
oam.sendDeviceNotification(dbrootName, DBROOT_UP, moduleName);
(*p).downFlag = false;
try{
oam.setDbrootStatus(dbrootID, oam::ACTIVE);
}
catch (exception& ex)
{}
}
file.close();
break;
}
}
p++;
}
}
}
//do Gluster status check, if configured
if ( DataRedundancyConfig == "y")
{
bool pass = true;
string errmsg = "unknown";
try {
string arg1 = "";
string arg2 = "";
int ret = oam.glusterctl(oam::GLUSTER_STATUS, arg1, arg2, errmsg);
if ( ret != 0 )
{
cerr << "FAILURE: Status check error: " + errmsg << endl;
pass = false;
}
}
catch (exception& e)
{
cerr << endl << "**** glusterctl API exception: " << e.what() << endl;
cerr << "FAILURE: Status check error" << endl;
pass = false;
}
catch (...)
{
cerr << endl << "**** glusterctl API exception: UNKNOWN" << endl;
cerr << "FAILURE: Status check error" << endl;
pass = false;
}
if ( !pass )
{ // issue log and alarm
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Gluster Status check failure error msg: ");
args.add(errmsg);
msg.format(args);
ml.logWarningMessage(msg);
serverMonitor.sendResourceAlarm(errmsg, GLUSTER_DISK_FAILURE, SET, 0);
}
}
// sleep 30 seconds
sleep(MONITOR_PERIOD/3);
//check disk space every 10 minutes
diskSpaceCheck++;
if ( diskSpaceCheck >= 20 ) {
diskSpaceCheck = 0;
lfs.clear();
sdl.clear();
}
} // end of while loop
}
/******************************************************************************************
* @brief checkDiskAlarm
*
* purpose: Clear disk usage alarms that are active for alarmItem on this server.
*
* Which alarms are examined depends on alarmID:
*   ALARM_NONE      - high, medium and low
*   DISK_USAGE_LOW  - high and medium
*   DISK_USAGE_MED  - high
*   anything else   - none
*
******************************************************************************************/
void ServerMonitor::checkDiskAlarm(string alarmItem, ALARMS alarmID)
{
    Oam oam;
    ServerMonitor serverMonitor;

    // resolve the name of the server the alarms were raised on
    string serverName;

    try
    {
        oamModuleInfo_t moduleInfo = oam.getModuleInfo();
        serverName = boost::get<0>(moduleInfo);
    }
    catch (...)
    {
        serverName = "Unknown Server";
    }

    // Alarms ordered from most to least severe; a prefix of this table is
    // examined, its length chosen by the requested alarm level.
    const ALARMS bySeverity[] = { DISK_USAGE_HIGH, DISK_USAGE_MED, DISK_USAGE_LOW };
    int examineCount = 0;

    switch (alarmID)
    {
        case ALARM_NONE:
            examineCount = 3;
            break;

        case DISK_USAGE_LOW:
            examineCount = 2;
            break;

        case DISK_USAGE_MED:
            examineCount = 1;
            break;

        default:	// none to clear
            break;
    }

    for (int idx = 0; idx < examineCount; ++idx)
    {
        // only clear an alarm that is currently set
        if ( serverMonitor.checkActiveAlarm(bySeverity[idx], serverName, alarmItem) )
            clearAlarm(alarmItem, bySeverity[idx]);
    }
}