/* Copyright (C) 2014 InfiniDB, Inc.
   Copyright (C) 2016 MariaDB Corporaton

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; version 2 of
   the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   MA 02110-1301, USA. */

/***************************************************************************
 * $Id: cpuMonitor.cpp 34 2006-09-29 21:13:54Z dhill $
 *
 *   Author: Zhixuan Zhu
 ***************************************************************************/

#include "serverMonitor.h"

using namespace std;
using namespace oam;
using namespace alarmmanager;
using namespace logging;
using namespace servermonitor;
//using namespace procheartbeat;

// Most recent system-wide CPU usage in percent; written by getCPUdata().
float currentCpuUsage;
// Per-process CPU samples; rebuilt by every getCPUdata() call.
ProcessCPUList pcl;
// Locked by cpuMonitor() around each sampling/alarm pass.
pthread_mutex_t CPU_LOCK;

/**
 * constants define
 */
const std::string FE_MOUNT_DIR = "/var/log/mariadb/columnstore/"; // FE mount dir
const int MONITOR_FREQ = 5;         // monitor frequency in sec
const int LOG_FREQ = 900;           // log frequency in sec
const int RESOURCE_DEBUG = false;

// Number of samples collected per logging period (size of the usage table).
static const int USAGE_SLOTS = LOG_FREQ / MONITOR_FREQ;
static unsigned int usage[USAGE_SLOTS];
static int usageCount = 0;

extern string tmpDir;

/*****************************************************************************************
* @brief	cpuMonitor Thread
*
* purpose:	Get current CPU usage, average over 5 readings and report alarms
*
* Loops forever. Each pass re-reads the CPU thresholds from the system
* configuration, samples CPU usage through ServerMonitor::getCPUdata(),
* sets or clears the system-wide CPU alarms based on the rolling average,
* and logs every sampled process whose usage is above a threshold.
*
* Note: the rolling average always divides by the full window size, so it
* is diluted by the zero-initialised slots until the window has filled.
*
*****************************************************************************************/
void cpuMonitor()
{
    ServerMonitor serverMonitor;

    // register for Heartbeat monitoring
    /*
        try
        {
            ProcHeartbeat procheartbeat;
            procheartbeat.registerHeartbeat(CPU_HEARTBEAT_ID);
        }
        catch (exception& ex)
        {
            string error = ex.what();
            LoggingID lid(SERVER_MONITOR_LOG_ID);
            MessageLog ml(lid);
            Message msg;
            Message::Args args;
            args.add("EXCEPTION ERROR on registerHeartbeat: ");
            args.add(error);
            msg.format(args);
            ml.logErrorMessage(msg);
        }
        catch(...)
        {
            LoggingID lid(SERVER_MONITOR_LOG_ID);
            MessageLog ml(lid);
            Message msg;
            Message::Args args;
            args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
            msg.format(args);
            ml.logErrorMessage(msg);
        }
    */

    // const so that cpuPeriod is a regular fixed-size array, not a
    // compiler-extension variable-length array
    const int periodCount = 5;
    float cpuPeriod[periodCount];
    int periodCounter = 0;
    float averageCpuUsage = 0;
    currentCpuUsage = 0;

    // set defaults
    unsigned int cpuCritical = 0,
                 cpuMajor = 0,
                 cpuMinor = 0,
                 cpuMinorClear = 0;

    // initial cpu Period table
    for (int i = 0; i < periodCount; i++)
    {
        cpuPeriod[i] = 0;
    }

    while (true)
    {
        // Get CPU usage water mark from server configuration and compare
        ModuleTypeConfig moduleTypeConfig;
        Oam oam;

        try
        {
            oam.getSystemConfig(moduleTypeConfig);
            cpuCritical = moduleTypeConfig.ModuleCPUCriticalThreshold;
            cpuMajor = moduleTypeConfig.ModuleCPUMajorThreshold;
            cpuMinor = moduleTypeConfig.ModuleCPUMinorThreshold;
            cpuMinorClear = moduleTypeConfig.ModuleCPUMinorClearThreshold;
        }
        catch (...)
        {
            // configuration not readable right now: back off and retry
            sleep(5);
            continue;
        }

        if (RESOURCE_DEBUG)
            cout << "critical water: " << moduleTypeConfig.ModuleCPUCriticalThreshold << endl;

        pthread_mutex_lock(&CPU_LOCK);

        //
        // get Process and System CPU usage
        //
        serverMonitor.getCPUdata();

        // store and get average
        cpuPeriod[periodCounter] = currentCpuUsage;
        averageCpuUsage = 0;

        for (int i = 0; i < periodCount; i++)
        {
            averageCpuUsage += cpuPeriod[i];
        }

        averageCpuUsage = averageCpuUsage / periodCount;

//		serverMonitor.logCPUactive(averageCpuUsage);
        if (CPU_DEBUG)
        {
            cout << "Current CPU Usage: " << currentCpuUsage << endl;
            cout << "Average CPU Usage: " << averageCpuUsage << endl;
        }

        // A threshold of 0 disables the corresponding alarm level.
        if (averageCpuUsage >= cpuCritical && cpuCritical > 0 )
        {
            serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_HIGH, SET, static_cast<int>(averageCpuUsage));
        }
        else if (averageCpuUsage >= cpuMajor && cpuMajor > 0 )
        {
            serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_MED, SET, static_cast<int>(averageCpuUsage));
        }
        else if (averageCpuUsage >= cpuMinor && cpuMinor > 0 )
        {
            serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_LOW, SET, static_cast<int>(averageCpuUsage));
        }
        else if (averageCpuUsage >= cpuMinorClear && cpuMinorClear > 0 )
        {
            // between the minor-clear and minor thresholds: drop the
            // high/medium alarms only
            serverMonitor.checkCPUAlarm("CPU", CPU_USAGE_LOW);

            //Log this event
            LoggingID lid(SERVER_MONITOR_LOG_ID);
            MessageLog ml(lid);
            Message msg;
            Message::Args args;
            args.add("Current CPU usage = ");
            args.add(static_cast<int>(currentCpuUsage));
            args.add(", Average CPU usage = ");
            args.add(static_cast<int>(averageCpuUsage));
            msg.format(args);
            ml.logInfoMessage(msg);
        }
        else
        {
            serverMonitor.checkCPUAlarm("CPU");
        }

        //
        // check CPU usage by process
        //
        for (ProcessCPUList::iterator p = pcl.begin(); p != pcl.end(); ++p)
        {
            string processName = (*p).processName;
            double cpuUsage = (*p).usedPercent;

            if (CPU_DEBUG)
            {
                cout << "Process Name : " << processName << endl;
                cout << "CPU Usage: " << cpuUsage << endl;
            }

            // check if a Calpont Process, if so alarm is over thresholds
            // if not, just log if over thresholds
            if (cpuUsage >= cpuCritical && cpuCritical > 0)
            {
                /*
                    try
                    {
                        t = oam.getMyProcessStatus(processID);
                        processName = boost::get<1>(t);

                        serverMonitor.sendResourceAlarm(processName, CPU_USAGE_HIGH, SET, (int) cpuUsage);
                    }
                    catch (...)
                    {
                */
                LoggingID lid(SERVER_MONITOR_LOG_ID);
                MessageLog ml(lid);
                Message msg;
                Message::Args args;
                args.add("Process");
                args.add(processName);
                args.add(" above Critical CPU threshold with a percentage of ");
                args.add(static_cast<int>(cpuUsage));
                msg.format(args);
                ml.logInfoMessage(msg);
//				}
            }
            else if (cpuUsage >= cpuMajor && cpuMajor > 0)
            {
                /*
                    try
                    {
                        t = oam.getMyProcessStatus(processID);
                        processName = boost::get<1>(t);

                        serverMonitor.sendResourceAlarm(processName, CPU_USAGE_MED, SET, (int) cpuUsage);
                    }
                    catch (...)
                    {
                */
                LoggingID lid(SERVER_MONITOR_LOG_ID);
                MessageLog ml(lid);
                Message msg;
                Message::Args args;
                args.add("Process");
                args.add(processName);
                args.add(" above Major CPU threshold with a percentage of ");
                args.add(static_cast<int>(cpuUsage));
                msg.format(args);
                ml.logInfoMessage(msg);
//				}
            }
            else if (cpuUsage >= cpuMinor && cpuMinor > 0)
            {
                /*
                    try
                    {
                        t = oam.getMyProcessStatus(processID);
                        processName = boost::get<1>(t);

                        serverMonitor.sendResourceAlarm(processName, CPU_USAGE_LOW, SET, (int) cpuUsage);
                    }
                    catch (...)
                    {
                */
                LoggingID lid(SERVER_MONITOR_LOG_ID);
                MessageLog ml(lid);
                Message msg;
                Message::Args args;
                args.add("Process");
                args.add(processName);
                args.add(" above Minor CPU threshold with a percentage of ");
                args.add(static_cast<int>(cpuUsage));
                msg.format(args);
                ml.logInfoMessage(msg);
//				}
            }

            /*
                else if (cpuUsage >= cpuMinorClear)
                {
                    try
                    {
                        t = oam.getMyProcessStatus(processID);
                        processName = boost::get<1>(t);

                        serverMonitor.checkCPUAlarm(processName, CPU_USAGE_LOW);
                    }
                    catch (...)
                    {}
                }
                else
                    serverMonitor.checkCPUAlarm(processName);
            */
        }

        // send heartbeat message
        /*
            try
            {
                ProcHeartbeat procheartbeat;
                procheartbeat.sendHeartbeat(CPU_HEARTBEAT_ID);

                LoggingID lid(SERVER_MONITOR_LOG_ID);
                MessageLog ml(lid);
                Message msg;
                Message::Args args;
                args.add("Sent Heartbeat Msg");
                msg.format(args);
                ml.logInfoMessage(msg);
            }
            catch (exception& ex)
            {
                string error = ex.what();

                if ( error.find("Disabled") == string::npos )
                {
                    LoggingID lid(SERVER_MONITOR_LOG_ID);
                    MessageLog ml(lid);
                    Message msg;
                    Message::Args args;
                    args.add("EXCEPTION ERROR on sendHeartbeat: ");
                    args.add(error);
                    msg.format(args);
                    ml.logErrorMessage(msg);
                }
            }
            catch(...)
            {
                LoggingID lid(SERVER_MONITOR_LOG_ID);
                MessageLog ml(lid);
                Message msg;
                Message::Args args;
                args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
                msg.format(args);
                ml.logErrorMessage(msg);
            }
        */

        pthread_mutex_unlock(&CPU_LOCK);

        // sleep for MONITOR_PERIOD * 10 seconds.
        // NOTE(review): the original comment said "10 minutes"; MONITOR_PERIOD
        // is declared outside this file, so the real duration needs confirming.
        sleep(MONITOR_PERIOD * 10);

        // advance the rolling-average slot, wrapping at the window size
        ++periodCounter;

        if ( periodCounter >= periodCount )
            periodCounter = 0;
    } // end of while loop
}

/******************************************************************************************
* @brief	checkCPUAlarm
*
* purpose:	check to see if an alarm(s) is set on CPU and clear if so
*
* @param alarmItem  name of the alarmed item (e.g. "CPU")
* @param alarmID    current usage level: ALARM_NONE clears all three CPU
*                   alarms, CPU_USAGE_LOW clears high and medium,
*                   CPU_USAGE_MED clears high only, anything else clears none
*
******************************************************************************************/
void ServerMonitor::checkCPUAlarm(string alarmItem, ALARMS alarmID)
{
    Oam oam;
    ServerMonitor serverMonitor;

    // get current server name
    string serverName;
    oamModuleInfo_t st;

    try
    {
        st = oam.getModuleInfo();
        serverName = boost::get<0>(st);
    }
    catch (...)
    {
        serverName = "Unknown Server";
    }

    switch (alarmID)
    {
        case ALARM_NONE: 	// clear all alarms set if any found
            if ( serverMonitor.checkActiveAlarm(CPU_USAGE_HIGH, serverName, alarmItem) )
                //  alarm set, clear it
                clearAlarm(alarmItem, CPU_USAGE_HIGH);

            if ( serverMonitor.checkActiveAlarm(CPU_USAGE_MED, serverName, alarmItem) )
                //  alarm set, clear it
                clearAlarm(alarmItem, CPU_USAGE_MED);

            if ( serverMonitor.checkActiveAlarm(CPU_USAGE_LOW, serverName, alarmItem) )
                //  alarm set, clear it
                clearAlarm(alarmItem, CPU_USAGE_LOW);

            break;

        case CPU_USAGE_LOW: 	// clear high and medium alarms set if any found
            if ( serverMonitor.checkActiveAlarm(CPU_USAGE_HIGH, serverName, alarmItem) )
                //  alarm set, clear it
                clearAlarm(alarmItem, CPU_USAGE_HIGH);

            if ( serverMonitor.checkActiveAlarm(CPU_USAGE_MED, serverName, alarmItem) )
                //  alarm set, clear it
                clearAlarm(alarmItem, CPU_USAGE_MED);

            break;

        case CPU_USAGE_MED: 	// clear high alarms set if any found
            if ( serverMonitor.checkActiveAlarm(CPU_USAGE_HIGH, serverName, alarmItem) )
                //  alarm set, clear it
                clearAlarm(alarmItem, CPU_USAGE_HIGH);

            break;

        default:			// none to clear
            break;
    } // end of switch
}

/*****************************************************************************************
* @brief	logCPUactive
*
* purpose:	Log Peak and Average CPU usage
*
* Appends one sample to the binary scratch file FE_MOUNT_DIR + "cpu.log".
* File layout: an int sample counter followed by the unsigned int samples.
* Once LOG_FREQ / MONITOR_FREQ samples have been collected, the samples are
* loaded into the usage table, summarised through logCPUstat(), and the
* scratch file is removed so that the next call starts a new period.
*
* @param cpuUsage  CPU usage sample to record
*
*****************************************************************************************/
void ServerMonitor::logCPUactive (unsigned int cpuUsage)
{
    ServerMonitor serverMonitor;

    // determine the active log file name
    string usageLogFileName = FE_MOUNT_DIR;
    usageLogFileName = usageLogFileName + "cpu.log";

    if (RESOURCE_DEBUG)
        cout << usageLogFileName << endl;

    fstream usageLogFile;
    usageLogFile.open (usageLogFileName.c_str(), ios::in | ios::out);

    if (usageLogFile.fail())
    {
        // in|out mode does not create a missing file: create it, then reopen
        ofstream file (usageLogFileName.c_str());
        file.close();

        // drop the failure state left behind by the first open attempt
        usageLogFile.clear();
        usageLogFile.open(usageLogFileName.c_str(), ios::in | ios::out);

        if (!usageLogFile)
        {
            cout << "--error" << endl;
            // nothing can be read or written; do not touch the counter
            return;
        }
    }

    // get the counter
    usageLogFile.seekg(0, ios::beg);
    usageLogFile.read (reinterpret_cast<char*>(&usageCount), sizeof (int));

    if (!usageLogFile)
    {
        // Empty or truncated file: this is the start of a new period, so any
        // count still held in memory from the previous period is stale.
        usageCount = 0;
        usageLogFile.clear();
    }

    // new iteration
    if (usageCount == 0)
    {
        usageLogFile.seekp(0, ios::beg);
        usageLogFile.write (reinterpret_cast<char*>(&usageCount), sizeof (int));
    }

    usageCount ++;

    // append new usage data to the end
    usageLogFile.seekp (0, ios::end);
    usageLogFile.write (reinterpret_cast<char*>(&cpuUsage), sizeof (int));

    if (RESOURCE_DEBUG)
        cout << "usage: " << usageCount << endl;

    // calculate peak and average if it's time to log usage data
    if (usageCount >= USAGE_SLOTS)
    {
        // skip the counter header, then load as many samples as the table holds
        usageLogFile.seekg (sizeof (int), ios::beg);
        usageLogFile.read (reinterpret_cast<char*>(usage), sizeof (usage));

        const int samplesRead =
            static_cast<int>(usageLogFile.gcount() / static_cast<streamsize>(sizeof (unsigned int)));

        if (!usageLogFile)
            usageLogFile.clear();

        // never report more samples than were actually loaded into the table
        const int sampleCount = (usageCount < samplesRead) ? usageCount : samplesRead;

        if (RESOURCE_DEBUG)
        {
            for (int i = 0; i < sampleCount; i++)
            {
                cout << usage [i] << endl;
            }
        }

        serverMonitor.logCPUstat(sampleCount);

        // delete the file
        usageLogFile.close();
        unlink (usageLogFileName.c_str());

        // the period is complete; start counting from scratch
        usageCount = 0;
    }
    // else, update usageCount
    else
    {
        usageLogFile.seekp(0, ios::beg);
        usageLogFile.write (reinterpret_cast<char*>(&usageCount), sizeof (int));
        usageLogFile.close();
    }
}

/*****************************************************************************************
* @brief	logCPUstat
*
* purpose:	Log CPU stat using system API
*
* Computes the peak and the average of the first usageCount entries of the
* usage table and appends "<peak> <average>" as one line to
* FE_MOUNT_DIR + "cpustat.log".
*
* @param usageCount  number of valid samples in the usage table; values
*                    outside [0, LOG_FREQ / MONITOR_FREQ] are clamped
*
*****************************************************************************************/
void ServerMonitor::logCPUstat (int usageCount)
{
    unsigned int max = 0;
    unsigned int sum = 0;
    float average = 0.0;

    // keep the index inside the usage table whatever the caller passes
    int sampleCount = usageCount;

    if (sampleCount < 0)
        sampleCount = 0;
    else if (sampleCount > USAGE_SLOTS)
        sampleCount = USAGE_SLOTS;

    for (int i = 0; i < sampleCount; i++)
    {
        if (usage[i] > max)
            max = usage[i];

        sum += usage[i];
    }

    // divide in floating point so the fractional part is not truncated
    if ( sampleCount > 0 )
        average = static_cast<float>(sum) / sampleCount;

    // Call system log api to store stats.
    // for now, write on local for testing purpose.
    string statFileName = FE_MOUNT_DIR;
    statFileName = statFileName + "cpustat.log";
    ofstream file (statFileName.c_str(), ios::app);
    file << max << "	" << average << endl;
    file.close();
}

/*****************************************************************************************
* @brief	getCPUdata
*
* purpose:	Sample per-process and system-wide CPU usage using top
*
* Rebuilds the global process list pcl from one "top" snapshot and stores
* the system-wide usage, derived from the idle percentage of several "top"
* iterations, in the global currentCpuUsage.
*
*****************************************************************************************/
void ServerMonitor::getCPUdata()
{
    pcl.clear();

    string tmpProcessCpu = tmpDir + "/processCpu";

    // columns 9 and 12 of top's process table, last 5 of the first 12 lines
    string cmd = "top -b -n1 | head -12 | awk '{print $9,$12}' | tail -5 > " + tmpProcessCpu;
    system(cmd.c_str());

    ifstream oldFile1 (tmpProcessCpu.c_str());

    // read top 5 users
    char line[400];

    while (oldFile1.getline(line, 400))
    {
        string buf = line;
        string::size_type pos = buf.find (' ', 0);

        // each line is "<cpu percent> <command>"; skip anything else
        if (pos != string::npos)
        {
            processCPU pc;
            pc.processName = buf.substr(pos + 1, 80);
            pc.usedPercent = atol(buf.substr(0, pos).c_str());
            pcl.push_back(pc);
        }
    }

    oldFile1.close();

    //
    // get and check Total CPU usage
    //
    // NOTE(review): this reuses the "/processCpu" temp file name from above;
    // it works because that file has already been read and closed.
    string tmpsystemCpu = tmpDir + "/processCpu";

    cmd = "top -b -n 6 -d 1 | grep '%Cpu' | awk '{print $8}' > " + tmpsystemCpu;
    system(cmd.c_str());

    ifstream oldFile (tmpsystemCpu.c_str());

    float systemIdle = 0;
    // skip first line in file, and average the next 5 entries which contains idle times
    oldFile.getline(line, 400);

    int count = 0;

    while (oldFile.getline(line, 400))
    {
        string buf = line;

        if (buf.find("id,") == string::npos)
        {
            // normal case: the 8th field is the idle percentage itself
            systemIdle = systemIdle + atol(buf.c_str());
        }
        else
        {
            // The 8th field is the "id," label rather than a number.
            // Presumably top printed the idle value glued to the previous
            // field (e.g. "ni,100.0 id,"); the original code counts this
            // case as 100% idle.
            systemIdle = systemIdle + 100;
        }

        count++;
    }

    oldFile.close();

    if ( count == 0 )
        currentCpuUsage = 0;
    else
        currentCpuUsage = 100 - (systemIdle / count);
}