/* Copyright (C) 2014 InfiniDB, Inc. Copyright (C) 2016 MariaDB Corporaton This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /*************************************************************************** * $Id: diskMonitor.cpp 34 2006-09-29 21:13:54Z dhill $ * * Author: Zhixuan Zhu ***************************************************************************/ #include "serverMonitor.h" using namespace std; using namespace oam; using namespace alarmmanager; using namespace logging; using namespace servermonitor; using namespace config; //using namespace procheartbeat; SystemDiskList sdl; typedef struct DBrootData_struct { std::string dbrootDir; bool downFlag; } DBrootData; typedef std::vector DBrootList; /***************************************************************************************** * @brief diskMonitor Thread * * purpose: Get current Local and External disk usage and report alarms * *****************************************************************************************/ void diskMonitor() { ServerMonitor serverMonitor; Oam oam; SystemConfig systemConfig; ModuleTypeConfig moduleTypeConfig; typedef std::vector LocalFileSystems; LocalFileSystems lfs; struct statvfs buf; // set defaults int localDiskCritical = 90, localDiskMajor = 80, localDiskMinor = 70, ExternalDiskCritical = 90, ExternalDiskMajor = 80, ExternalDiskMinor = 70; // get module types string moduleType; int moduleID=-1; string moduleName; oamModuleInfo_t t; try { t = oam.getModuleInfo(); moduleType = boost::get<1>(t); moduleID = boost::get<2>(t); moduleName = boost::get<0>(t); } catch (exception& e) {} bool Externalflag = false; string cloud = oam::UnassignedName; try { oam.getSystemConfig( "Cloud", cloud); } catch(...) { cloud = oam::UnassignedName; } //get Gluster Config setting string DataRedundancyConfig = "n"; try { oam.getSystemConfig( "DataRedundancyConfig", DataRedundancyConfig); } catch(...) { DataRedundancyConfig = "n"; } int diskSpaceCheck = 0; while(true) { //check for external disk DBrootList dbrootList; if (moduleType == "pm") { systemStorageInfo_t t; t = oam.getStorageConfig(); if ( boost::get<0>(t) == "external") Externalflag = true; // get dbroot list and storage type from config file DBRootConfigList dbrootConfigList; oam.getPmDbrootConfig(moduleID, dbrootConfigList); DBRootConfigList::iterator pt = dbrootConfigList.begin(); for( ; pt != dbrootConfigList.end() ; pt++) { int dbrootID = *pt; string dbroot = "DBRoot" + oam.itoa(dbrootID); string dbootdir; try{ oam.getSystemConfig(dbroot, dbootdir); } catch(...) {} if ( dbootdir.empty() || dbootdir == "" ) continue; DBrootData dbrootData; dbrootData.dbrootDir = dbootdir; dbrootData.downFlag = false; dbrootList.push_back(dbrootData); } } SystemStatus systemstatus; try { oam.getSystemStatus(systemstatus); } catch (exception& ex) {} if (systemstatus.SystemOpState != oam::ACTIVE ) { sleep(5); continue; } // Get Local/External Disk Mount points to monitor and associated thresholds try { oam.getSystemConfig (moduleTypeConfig); localDiskCritical = moduleTypeConfig.ModuleDiskCriticalThreshold; localDiskMajor = moduleTypeConfig.ModuleDiskMajorThreshold; localDiskMinor = moduleTypeConfig.ModuleDiskMinorThreshold; DiskMonitorFileSystems::iterator p = moduleTypeConfig.FileSystems.begin(); for( ; p != moduleTypeConfig.FileSystems.end() ; p++) { string fs = *p; lfs.push_back(fs); if (DISK_DEBUG) { //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Config File System to monitor ="); args.add(fs); msg.format(args); ml.logDebugMessage(msg); } } } catch (...) { sleep(5); continue; } // get External info try { oam.getSystemConfig(systemConfig); } catch (...) { sleep(5); continue; } if (Externalflag) { // get External info try { ExternalDiskCritical = systemConfig.ExternalCriticalThreshold; ExternalDiskMajor = systemConfig.ExternalMajorThreshold; ExternalDiskMinor = systemConfig.ExternalMinorThreshold; } catch (...) { sleep(5); continue; } } if ( diskSpaceCheck == 0 ) { //check for local file systems LocalFileSystems::iterator p = lfs.begin(); while(p != lfs.end()) { string deviceName = *p; ++p; string fileName; // check local if ( deviceName == "/") { fileName = deviceName + "usr/local/mariadb/columnstore/"; } else { fileName = deviceName + "/000.dir"; } uint64_t totalBlocks; uint64_t usedBlocks; if (!statvfs(fileName.c_str(), &buf)) { uint64_t blksize, blocks, freeblks, free; blksize = buf.f_bsize; blocks = buf.f_blocks; freeblks = buf.f_bfree; totalBlocks = blocks * blksize; free = freeblks * blksize; usedBlocks = totalBlocks - free; } else continue; int64_t diskUsage = 0; if ( totalBlocks == 0 ) { diskUsage = 0; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Total Disk Usage is set to 0"); msg.format(args); ml.logWarningMessage(msg); } else diskUsage = (usedBlocks / (totalBlocks / 100)) + 1; SMSystemDisk sd; sd.deviceName = deviceName; sd.usedPercent = diskUsage; sd.totalBlocks = totalBlocks; sd.usedBlocks = usedBlocks; sdl.push_back(sd); if (DISK_DEBUG) cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl; if (diskUsage >= localDiskCritical && localDiskCritical > 0 ) { //adjust if over 100% if ( diskUsage > 100 ) diskUsage = 100; if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, (int) diskUsage) ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Critical Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= localDiskMajor && localDiskMajor > 0 ) { if (serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, (int) diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Major Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= localDiskMinor && localDiskMinor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, (int) diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Minor Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else serverMonitor.checkDiskAlarm(deviceName); } //check for external file systems/devices if (Externalflag || (!Externalflag && DataRedundancyConfig == "y" && moduleType == "pm") ){ try { DBRootConfigList dbrootConfigList; oam.getPmDbrootConfig(moduleID, dbrootConfigList); DBRootConfigList::iterator pt = dbrootConfigList.begin(); for( ; pt != dbrootConfigList.end() ; pt++) { int dbroot = *pt; string deviceName = systemConfig.DBRoot[dbroot-1]; string fileName = deviceName + "/000.dir"; if (DISK_DEBUG) { //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("DBRoots monitoring"); args.add(dbroot); args.add(" ,file system =" ); args.add(fileName); msg.format(args); ml.logDebugMessage(msg); } uint64_t totalBlocks; uint64_t usedBlocks; if (!statvfs(fileName.c_str(), &buf)) { uint64_t blksize, blocks, freeblks, free; blksize = buf.f_bsize; blocks = buf.f_blocks; freeblks = buf.f_bfree; totalBlocks = blocks * blksize; free = freeblks * blksize; usedBlocks = totalBlocks - free; } else { SMSystemDisk sd; sd.deviceName = deviceName; sd.usedPercent = 0; sd.totalBlocks = 0; sd.usedBlocks = 0; sdl.push_back(sd); continue; } int diskUsage = 0; if ( totalBlocks == 0 ) { diskUsage = 0; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Total Disk Usage is set to 0"); msg.format(args); ml.logWarningMessage(msg); } else diskUsage = (usedBlocks / (totalBlocks / 100)) + 1; SMSystemDisk sd; sd.deviceName = deviceName; sd.usedPercent = diskUsage; sd.totalBlocks = totalBlocks; sd.usedBlocks = usedBlocks; sdl.push_back(sd); if (DISK_DEBUG) cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl; if (diskUsage >= ExternalDiskCritical && ExternalDiskCritical > 0 ) { //adjust if over 100% if ( diskUsage > 100 ) diskUsage = 100; if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Critical Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= ExternalDiskMajor && ExternalDiskMajor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Major Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= ExternalDiskMinor && ExternalDiskMinor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Minor Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else serverMonitor.checkDiskAlarm(deviceName); } } catch (exception& e) { cout << endl << "**** getPmDbrootConfig Failed : " << e.what() << endl; } } } //check OAM dbroot test flag to validate dbroot exist if on pm if ( moduleName.find("pm") != string::npos ) { //check OAM dbroot test flag to validate dbroot exist if ( dbrootList.size() != 0 ) { DBrootList::iterator p = dbrootList.begin(); while ( p != dbrootList.end() ) { //get dbroot directory string dbrootDir = (*p).dbrootDir; string dbrootName; string dbrootID; //get dbroot name string::size_type pos = dbrootDir.rfind("/",80); if (pos != string::npos) dbrootName = dbrootDir.substr(pos+1,80); //get ID dbrootID = dbrootName.substr(4,80); string fileName = dbrootDir + "/OAMdbrootCheck"; // retry in case we hit the remount window for ( int retry = 0 ; ; retry++ ) { bool fail = false; //first test, check if OAMdbrootCheck exists ifstream file (fileName.c_str()); if (!file) fail = true; else fail = false; if (fail) { //double check system status before reporting any error BUG 5078 SystemStatus systemstatus; try { oam.getSystemStatus(systemstatus); } catch (exception& ex) {} if (systemstatus.SystemOpState != oam::ACTIVE ) { break; } if ( retry < 10 ) { sleep(3); continue; } else { if ( !(*p).downFlag ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Lost access to "); args.add(dbrootDir); msg.format(args); ml.logWarningMessage(msg); oam.sendDeviceNotification(dbrootName, DBROOT_DOWN, moduleName); (*p).downFlag = true; try{ oam.setDbrootStatus(dbrootID, oam::AUTO_OFFLINE); } catch (exception& ex) {} break; } } } else { if ( (*p).downFlag ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Access back to "); args.add(dbrootDir); msg.format(args); ml.logInfoMessage(msg); oam.sendDeviceNotification(dbrootName, DBROOT_UP, moduleName); (*p).downFlag = false; try{ oam.setDbrootStatus(dbrootID, oam::ACTIVE); } catch (exception& ex) {} } file.close(); break; } } p++; } } } //do Gluster status check, if configured if ( DataRedundancyConfig == "y") { bool pass = true; string errmsg = "unknown"; try { string arg1 = ""; string arg2 = ""; int ret = oam.glusterctl(oam::GLUSTER_STATUS, arg1, arg2, errmsg); if ( ret != 0 ) { cerr << "FAILURE: Status check error: " + errmsg << endl; pass = false; } } catch (exception& e) { cerr << endl << "**** glusterctl API exception: " << e.what() << endl; cerr << "FAILURE: Status check error" << endl; pass = false; } catch (...) { cerr << endl << "**** glusterctl API exception: UNKNOWN" << endl; cerr << "FAILURE: Status check error" << endl; pass = false; } if ( !pass ) { // issue log and alarm LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Gluster Status check failure error msg: "); args.add(errmsg); msg.format(args); ml.logWarningMessage(msg); serverMonitor.sendResourceAlarm(errmsg, GLUSTER_DISK_FAILURE, SET, 0); } } // sleep 30 seconds sleep(MONITOR_PERIOD/3); //check disk space every 10 minutes diskSpaceCheck++; if ( diskSpaceCheck >= 20 ) { diskSpaceCheck = 0; lfs.clear(); sdl.clear(); } } // end of while loop } /****************************************************************************************** * @brief checkDiskAlarm * * purpose: check to see if an alarm(s) is set on Disk and clear if so * ******************************************************************************************/ void ServerMonitor::checkDiskAlarm(string alarmItem, ALARMS alarmID) { Oam oam; ServerMonitor serverMonitor; // get current server name string serverName; oamModuleInfo_t st; try { st = oam.getModuleInfo(); serverName = boost::get<0>(st); } catch (...) { serverName = "Unknown Server"; } switch (alarmID) { case ALARM_NONE: // clear all alarms set if any found if ( serverMonitor.checkActiveAlarm(DISK_USAGE_HIGH, serverName, alarmItem) ) // alarm set, clear it clearAlarm(alarmItem, DISK_USAGE_HIGH); if ( serverMonitor.checkActiveAlarm(DISK_USAGE_MED, serverName, alarmItem) ) // alarm set, clear it clearAlarm(alarmItem, DISK_USAGE_MED); if ( serverMonitor.checkActiveAlarm(DISK_USAGE_LOW, serverName, alarmItem) ) // alarm set, clear it clearAlarm(alarmItem, DISK_USAGE_LOW); break; case DISK_USAGE_LOW: // clear high and medium alarms set if any found if ( serverMonitor.checkActiveAlarm(DISK_USAGE_HIGH, serverName, alarmItem) ) // alarm set, clear it clearAlarm(alarmItem, DISK_USAGE_HIGH); if ( serverMonitor.checkActiveAlarm(DISK_USAGE_MED, serverName, alarmItem) ) // alarm set, clear it clearAlarm(alarmItem, DISK_USAGE_MED); break; case DISK_USAGE_MED: // clear high alarms set if any found if ( serverMonitor.checkActiveAlarm(DISK_USAGE_HIGH, serverName, alarmItem) ) // alarm set, clear it clearAlarm(alarmItem, DISK_USAGE_HIGH); break; default: // none to clear break; } // end of switch return; }