1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-10 01:22:48 +03:00
Files
mariadb-columnstore-engine/oamapps/serverMonitor/memoryMonitor.cpp
2016-05-25 14:08:04 -05:00

487 lines
14 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/***************************************************************************
* $Id: memoryMonitor.cpp 34 2006-09-29 21:13:54Z dhill $
*
* Author: Zhixuan Zhu
***************************************************************************/
#include "cgroupconfigurator.h"
#include "serverMonitor.h"
using namespace std;
using namespace oam;
using namespace snmpmanager;
using namespace logging;
using namespace servermonitor;
//using namespace procheartbeat;
// Total memory as last returned by CGroupConfigurator::getTotalMemory().
// Written by memoryMonitor() on every cycle and read by
// ServerMonitor::outputProcMemory() to compute per-process usage.
// NOTE(review): unit is presumably bytes — confirm against CGroupConfigurator.
unsigned long totalMem;
// List of the top memory-consuming processes; cleared and rebuilt on every
// call to ServerMonitor::outputProcMemory().
ProcessMemoryList pml;
// Swap state shared between memoryMonitor() and ServerMonitor::checkSwapAction():
//   0 = initial value (memoryMonitor() has not yet seen swap below the Major threshold)
//   1 = set by checkSwapAction() after it logged the first over-threshold event
//   2 = set by memoryMonitor() when total swap is 0 or usage is below the Major threshold
int swapFlag = 0;
// Held by memoryMonitor() around its call to outputProcMemory(), which rewrites 'pml'.
// NOTE(review): no pthread_mutex_init()/static initializer is visible in this file —
// confirm it is initialized elsewhere before first use.
pthread_mutex_t MEMORY_LOCK;
/*****************************************************************************************
* @brief memoryMonitor Thread
*
* purpose: Get current Memory and Swap usage, report alarms
*
*****************************************************************************************/
void memoryMonitor()
{
ServerMonitor serverMonitor;
int swapUsagePercent = 0;
// set defaults
int memoryCritical = 90,
memoryMajor = 0,
memoryMinor = 0,
swapCritical = 90,
swapMajor = 80,
swapMinor = 70;
int day = 0;
//set monitoring period to 60 seconds
int monitorPeriod = MONITOR_PERIOD;
utils::CGroupConfigurator cg;
while(true)
{
// Get MEMORY usage water mark from server configuration and compare
ModuleTypeConfig moduleTypeConfig;
Oam oam;
try {
oam.getSystemConfig (moduleTypeConfig);
memoryCritical = moduleTypeConfig.ModuleMemCriticalThreshold;
memoryMajor = moduleTypeConfig.ModuleMemMajorThreshold;
memoryMinor = moduleTypeConfig.ModuleMemMinorThreshold;
swapCritical = moduleTypeConfig.ModuleSwapCriticalThreshold;
swapMajor = moduleTypeConfig.ModuleSwapMajorThreshold;
swapMinor = moduleTypeConfig.ModuleSwapMinorThreshold;
} catch (...)
{
sleep(5);
continue;
}
//get memory stats
totalMem = cg.getTotalMemory();
uint64_t freeMem = cg.getFreeMemory();
uint64_t usedMem = totalMem - freeMem;
//get swap stats
uint64_t totalSwap = cg.getTotalSwapSpace();
uint64_t usedSwap = cg.getSwapInUse();
if ( totalSwap == 0 ) {
swapUsagePercent = 0;
swapFlag = 2;
//get current day, log warning only once a day
time_t now;
now = time(NULL);
struct tm tm;
localtime_r(&now, &tm);
if ( day != tm.tm_mday) {
day = tm.tm_mday;
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Total Swap space is set to 0");
msg.format(args);
ml.logWarningMessage(msg);
}
}
else
swapUsagePercent = usedSwap / (totalSwap / 100);
int memoryUsagePercent;
if ( totalMem == 0 ) {
memoryUsagePercent = 0;
//Log this event
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Total Memory space is set to 0");
msg.format(args);
ml.logWarningMessage(msg);
}
else
memoryUsagePercent = (usedMem / (totalMem / 100)) + 1;
// check for Memory alarms
if (memoryUsagePercent >= memoryCritical && memoryCritical > 0 ) {
if ( monitorPeriod == MONITOR_PERIOD ) {
//first time called, log
//adjust if over 100%
if ( memoryUsagePercent > 100 )
memoryUsagePercent = 100;
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Memory above Critical Memory threshold with a percentage of ");
args.add((int) memoryUsagePercent);
args.add(" ; Swap ");
args.add((int) swapUsagePercent);
msg.format(args);
ml.logInfoMessage(msg);
serverMonitor.sendResourceAlarm("Local-Memory", MEMORY_USAGE_HIGH, SET, memoryUsagePercent);
pthread_mutex_lock(&MEMORY_LOCK);
serverMonitor.outputProcMemory(true);
pthread_mutex_unlock(&MEMORY_LOCK);
}
// change to 1 second for quick swap space monitoring
monitorPeriod = 1;
}
else if (memoryUsagePercent >= memoryMajor && memoryMajor > 0 ) {
monitorPeriod = MONITOR_PERIOD;
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Memory above Major Memory threshold with a percentage of ");
args.add((int) memoryUsagePercent);
msg.format(args);
ml.logInfoMessage(msg);
serverMonitor.sendResourceAlarm("Local-Memory", MEMORY_USAGE_MED, SET, memoryUsagePercent);
}
else if (memoryUsagePercent >= memoryMinor && memoryMinor > 0 ) {
monitorPeriod = MONITOR_PERIOD;
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Local Memory above Minor Memory threshold with a percentage of ");
args.add((int) memoryUsagePercent);
msg.format(args);
ml.logInfoMessage(msg);
serverMonitor.sendResourceAlarm("Local-Memory", MEMORY_USAGE_LOW, SET, memoryUsagePercent);
}
else {
monitorPeriod = MONITOR_PERIOD;
serverMonitor.checkMemoryAlarm("Local-Memory");
}
// check for Swap alarms
if (swapUsagePercent >= swapCritical && swapCritical > 0 ) {
//adjust if over 100%
if ( swapUsagePercent > 100 )
swapUsagePercent = 100;
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Swap above Critical Memory threshold with a percentage of ");
args.add((int) swapUsagePercent);
msg.format(args);
ml.logInfoMessage(msg);
serverMonitor.sendResourceAlarm("Swap", SWAP_USAGE_HIGH, SET, swapUsagePercent);
serverMonitor.checkSwapAction();
}
else if (swapUsagePercent >= swapMajor && swapMajor > 0 ) {
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Swap above Major Memory threshold with a percentage of ");
args.add((int) swapUsagePercent);
msg.format(args);
ml.logInfoMessage(msg);
serverMonitor.sendResourceAlarm("Swap", SWAP_USAGE_MED, SET, swapUsagePercent);
serverMonitor.checkSwapAction();
}
else if (swapUsagePercent >= swapMinor && swapMinor > 0 ) {
swapFlag = 2;
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Swap above Minor Memory threshold with a percentage of ");
args.add((int) swapUsagePercent);
msg.format(args);
ml.logInfoMessage(msg);
serverMonitor.sendResourceAlarm("Swap", SWAP_USAGE_LOW, SET, swapUsagePercent);
}
else
{
swapFlag = 2;
serverMonitor.checkSwapAlarm("Swap");
}
// sleep, 1 minute
sleep(monitorPeriod);
} // end of while loop
}
/******************************************************************************************
* @brief	checkMemoryAlarm
*
* purpose:	Clear active MEMORY alarms that are more severe than 'alarmID'
*
* alarmItem  component name the alarms were raised against
* alarmID    ALARM_NONE       -> clear HIGH, MED and LOW
*            MEMORY_USAGE_LOW -> clear HIGH and MED
*            MEMORY_USAGE_MED -> clear HIGH
*            anything else    -> nothing is cleared
*
******************************************************************************************/
void ServerMonitor::checkMemoryAlarm(string alarmItem, ALARMS alarmID)
{
    Oam oam;
    // Not referenced below; retained because its constructor is not visible here.
    ServerMonitor serverMonitor;

    // Resolve the local module name; alarms are keyed by it.
    string moduleName;
    oamModuleInfo_t moduleInfo;

    try
    {
        moduleInfo = oam.getModuleInfo();
        moduleName = boost::get<0>(moduleInfo);
    }
    catch (...)
    {
        moduleName = "Unknown Server";
    }

    // Work out which severities have to be cleared for the requested level.
    const bool wantAll = (alarmID == ALARM_NONE);
    const bool wantHigh = wantAll || alarmID == MEMORY_USAGE_LOW || alarmID == MEMORY_USAGE_MED;
    const bool wantMed = wantAll || alarmID == MEMORY_USAGE_LOW;
    const bool wantLow = wantAll;

    // Always processed in HIGH, MED, LOW order; an alarm is cleared only if active.
    if ( wantHigh && oam.checkActiveAlarm(MEMORY_USAGE_HIGH, moduleName, alarmItem) )
        clearAlarm(alarmItem, MEMORY_USAGE_HIGH);

    if ( wantMed && oam.checkActiveAlarm(MEMORY_USAGE_MED, moduleName, alarmItem) )
        clearAlarm(alarmItem, MEMORY_USAGE_MED);

    if ( wantLow && oam.checkActiveAlarm(MEMORY_USAGE_LOW, moduleName, alarmItem) )
        clearAlarm(alarmItem, MEMORY_USAGE_LOW);
}
/******************************************************************************************
* @brief	checkSwapAlarm
*
* purpose:	Clear active SWAP alarms that are more severe than 'alarmID'
*
* alarmItem  component name the alarms were raised against
* alarmID    ALARM_NONE     -> clear HIGH, MED and LOW
*            SWAP_USAGE_LOW -> clear HIGH and MED
*            SWAP_USAGE_MED -> clear HIGH
*            anything else  -> nothing is cleared
*
******************************************************************************************/
void ServerMonitor::checkSwapAlarm(string alarmItem, ALARMS alarmID)
{
    Oam oam;
    // Not referenced below; retained because its constructor is not visible here.
    ServerMonitor serverMonitor;

    // Resolve the local module name; alarms are keyed by it.
    string moduleName;
    oamModuleInfo_t moduleInfo;

    try
    {
        moduleInfo = oam.getModuleInfo();
        moduleName = boost::get<0>(moduleInfo);
    }
    catch (...)
    {
        moduleName = "Unknown Server";
    }

    // Work out which severities have to be cleared for the requested level.
    const bool wantAll = (alarmID == ALARM_NONE);
    const bool wantHigh = wantAll || alarmID == SWAP_USAGE_LOW || alarmID == SWAP_USAGE_MED;
    const bool wantMed = wantAll || alarmID == SWAP_USAGE_LOW;
    const bool wantLow = wantAll;

    // Always processed in HIGH, MED, LOW order; an alarm is cleared only if active.
    if ( wantHigh && oam.checkActiveAlarm(SWAP_USAGE_HIGH, moduleName, alarmItem) )
        clearAlarm(alarmItem, SWAP_USAGE_HIGH);

    if ( wantMed && oam.checkActiveAlarm(SWAP_USAGE_MED, moduleName, alarmItem) )
        clearAlarm(alarmItem, SWAP_USAGE_MED);

    if ( wantLow && oam.checkActiveAlarm(SWAP_USAGE_LOW, moduleName, alarmItem) )
        clearAlarm(alarmItem, SWAP_USAGE_LOW);
}
/******************************************************************************************
* @brief checkSwapAction
*
* purpose: check if any system action needs tyo be taken
*
******************************************************************************************/
void ServerMonitor::checkSwapAction()
{
Oam oam;
if ( swapFlag == 0 ) {
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Swap Space usage over Major threashold, startSystem failure");
msg.format(args);
ml.logCriticalMessage(msg);
swapFlag = 1;
sleep(5);
return;
}
string swapAction = "restartSystem";
try {
oam.getSystemConfig ("SwapAction", swapAction);
} catch (...)
{}
if (swapAction == "none")
return;
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Swap Space usage over Major threashold, perform OAM command ");
args.add( swapAction);
msg.format(args);
ml.logCriticalMessage(msg);
GRACEFUL_FLAG gracefulTemp = GRACEFUL;
ACK_FLAG ackTemp = ACK_YES;
if ( swapAction == "stopSystem") {
try
{
oam.stopSystem(gracefulTemp, ackTemp);
}
catch (exception& e)
{
}
}
else if ( swapAction == "restartSystem") {
try
{
oam.restartSystem(gracefulTemp, ackTemp);
}
catch (exception& e)
{
}
}
}
/******************************************************************************************
* @brief outputProcMemory
*
* purpose: output Top memory users
*
******************************************************************************************/
void ServerMonitor::outputProcMemory(bool log)
{
//
// get top 5 Memory users by process
//
system("ps -e -orss=1,args= | sort -b -k1,1n |tail -n 5 | awk '{print $1,$2}' > /tmp/columnstore_tmp_files/processMem");
ifstream oldFile ("/tmp/columnstore_tmp_files/processMem");
string process;
long long memory;
int memoryUsage;
pml.clear();
char line[400];
while (oldFile.getline(line, 400))
{
string buf = line;
string::size_type pos = buf.find (' ',0);
if (pos != string::npos) {
memory = atol(buf.substr(0,pos-1).c_str());
memoryUsage = (memory * 1024 * 1000 / totalMem) + 1 ;
process = buf.substr(pos+1,80);
//cleanup process name
pos = process.rfind ('/');
if (pos != string::npos)
process=process.substr(pos+1,80);
else
{
pos = process.find ('[',0);
if (pos != string::npos)
process=process.substr(pos+1,80);
pos = process.find (']',0);
if (pos != string::npos)
process=process.substr(0,pos);
}
processMemory pm;
pm.processName = process;
pm.usedBlocks = memory;
pm.usedPercent = memoryUsage;
pml.push_back(pm);
if (log) {
LoggingID lid(SERVER_MONITOR_LOG_ID);
MessageLog ml(lid);
Message msg;
Message::Args args;
args.add("Memory Usage for Process: ");
args.add(process);
args.add(" : Memory Used ");
args.add((int) memory);
args.add(" : % Used ");
args.add(memoryUsage);
msg.format(args);
ml.logInfoMessage(msg);
}
}
}
oldFile.close();
}