From 23dad9de021f78b63c0153e9e5e3ec30097a7497 Mon Sep 17 00:00:00 2001 From: david hill Date: Wed, 25 May 2016 13:03:11 -0500 Subject: [PATCH 01/10] fix engine create issue --- dbcon/mysql/install_calpont_mysql.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/dbcon/mysql/install_calpont_mysql.sh b/dbcon/mysql/install_calpont_mysql.sh index acf8cefee..44b3a6d9e 100755 --- a/dbcon/mysql/install_calpont_mysql.sh +++ b/dbcon/mysql/install_calpont_mysql.sh @@ -29,7 +29,6 @@ df=$installdir/mysql/my.cnf $installdir/mysql/bin/mysql --defaults-file=$df --force --user=root $pwprompt mysql 2>/tmp/mysql_install.log <>/tmp/mysql_install.log < Date: Wed, 25 May 2016 14:08:04 -0500 Subject: [PATCH 02/10] change tmp storage name --- oam/etc/Calpont.xml | 2 +- oam/etc/Calpont.xml.singleserver | 2 +- oamapps/serverMonitor/cpuMonitor.cpp | 8 ++++---- oamapps/serverMonitor/memoryMonitor.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/oam/etc/Calpont.xml b/oam/etc/Calpont.xml index 2088fd747..b1d9958ad 100755 --- a/oam/etc/Calpont.xml +++ b/oam/etc/Calpont.xml @@ -251,7 +251,7 @@ 0.0.0.0 128M 10 - /tmp/infinidb_tmp_files + /tmp/columnstore_tmp_files $INSTALLDIR 10 120 diff --git a/oam/etc/Calpont.xml.singleserver b/oam/etc/Calpont.xml.singleserver index 82261c80a..65895bf22 100644 --- a/oam/etc/Calpont.xml.singleserver +++ b/oam/etc/Calpont.xml.singleserver @@ -236,7 +236,7 @@ 0.0.0.0 128M 10 - /tmp/infinidb_tmp_files + /tmp/columnstore_tmp_files $INSTALLDIR 10 3 diff --git a/oamapps/serverMonitor/cpuMonitor.cpp b/oamapps/serverMonitor/cpuMonitor.cpp index cab683b9c..95d17e19e 100644 --- a/oamapps/serverMonitor/cpuMonitor.cpp +++ b/oamapps/serverMonitor/cpuMonitor.cpp @@ -479,9 +479,9 @@ void ServerMonitor::getCPUdata() { pcl.clear(); - system("top -b -n1 | head -12 | awk '{print $9,$12}' | tail -5 > /tmp/infinidb_tmp_files/processCpu"); + system("top -b -n1 | head -12 | awk '{print $9,$12}' | tail -5 > /tmp/columnstore_tmp_files/processCpu"); - ifstream oldFile1 ("/tmp/infinidb_tmp_files/processCpu"); + ifstream oldFile1 ("/tmp/columnstore_tmp_files/processCpu"); // read top 5 users int i = 0; @@ -503,9 +503,9 @@ void ServerMonitor::getCPUdata() // // get and check Total CPU usage // - system("top -b -n 6 -d 1 | awk '{print $5}' | grep %id > /tmp/infinidb_tmp_files/systemCpu"); + system("top -b -n 6 -d 1 | awk '{print $5}' | grep %id > /tmp/columnstore_tmp_files/systemCpu"); - ifstream oldFile ("/tmp/infinidb_tmp_files/systemCpu"); + ifstream oldFile ("/tmp/columnstore_tmp_files/systemCpu"); float systemIdle = 0; // skip first line in file, and average the next 5 entries which contains idle times diff --git a/oamapps/serverMonitor/memoryMonitor.cpp b/oamapps/serverMonitor/memoryMonitor.cpp index 95877d03e..5b5d8d31f 100644 --- a/oamapps/serverMonitor/memoryMonitor.cpp +++ b/oamapps/serverMonitor/memoryMonitor.cpp @@ -427,9 +427,9 @@ void ServerMonitor::outputProcMemory(bool log) // get top 5 Memory users by process // - system("ps -e -orss=1,args= | sort -b -k1,1n |tail -n 5 | awk '{print $1,$2}' > /tmp/infinidb_tmp_files/processMem"); + system("ps -e -orss=1,args= | sort -b -k1,1n |tail -n 5 | awk '{print $1,$2}' > /tmp/columnstore_tmp_files/processMem"); - ifstream oldFile ("/tmp/infinidb_tmp_files/processMem"); + ifstream oldFile ("/tmp/columnstore_tmp_files/processMem"); string process; long long memory; From a2264bea7007ce2de83fe00eaba4c42733071d8d Mon Sep 17 00:00:00 2001 From: david hill Date: Wed, 25 May 2016 14:51:34 -0500 Subject: [PATCH 03/10] add a linefeed before system being started --- oamapps/mcsadmin/mcsadmin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oamapps/mcsadmin/mcsadmin.cpp b/oamapps/mcsadmin/mcsadmin.cpp index 1b67b11ee..e5c546f39 100644 --- a/oamapps/mcsadmin/mcsadmin.cpp +++ b/oamapps/mcsadmin/mcsadmin.cpp @@ -2327,7 +2327,7 @@ int processCommand(string* arguments) else { //just kick off local server - cout << " System being started, please wait..."; + cout << endl << " System being started, please wait..."; cout.flush(); cmd = startup::StartUp::installDir() + "/bin/columnstore restart > /tmp/start.log 2>&1"; int rtnCode = system(cmd.c_str()); @@ -2349,7 +2349,7 @@ int processCommand(string* arguments) try { - cout << " System being started, please wait..."; + cout << endl << " System being started, please wait..."; cout.flush(); oam.startSystem(ackTemp); if ( waitForActive() ) From f9775e7ce6487bdc589529f2581f9a987fe7875f Mon Sep 17 00:00:00 2001 From: david hill Date: Wed, 25 May 2016 15:08:00 -0500 Subject: [PATCH 04/10] remove amazon start when module is stopped --- oam/etc/Calpont.xml | 2 +- procmgr/main.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/oam/etc/Calpont.xml b/oam/etc/Calpont.xml index b1d9958ad..34b82efaf 100755 --- a/oam/etc/Calpont.xml +++ b/oam/etc/Calpont.xml @@ -231,7 +231,7 @@ unassigned 1 3 - 30 // 2.5 minutes + 6 // 2.5 minutes diff --git a/procmgr/main.cpp b/procmgr/main.cpp index c4364cde7..2d8511145 100644 --- a/procmgr/main.cpp +++ b/procmgr/main.cpp @@ -1383,12 +1383,12 @@ void pingDeviceThread() //restart module processes int retry = 0; - int ModuleProcMonWaitCount = 30; + int ModuleProcMonWaitCount = 6; try{ oam.getSystemConfig("ModuleProcMonWaitCount", ModuleProcMonWaitCount); } catch(...) { - ModuleProcMonWaitCount = 30; + ModuleProcMonWaitCount = 6; } for ( ; retry < ModuleProcMonWaitCount ; retry ++ ) @@ -1702,7 +1702,7 @@ void pingDeviceThread() // return values = 'ip address' for running or rebooting, stopped or terminated string currentIPAddr = oam.getEC2InstanceIpAddress(hostName); - if (currentIPAddr == "stopped") +/* if (currentIPAddr == "stopped") { // start instance log.writeLog(__LINE__, "Instance in stopped state, try starting it: " + hostName, LOG_TYPE_DEBUG); @@ -1754,7 +1754,7 @@ void pingDeviceThread() currentIPAddr = "terminated"; } } - +*/ if ( currentIPAddr == "terminated") { //check if down module was Standby OAM, if so find another one From da11ab24a662858a94e961ee8ea2044bfab956f4 Mon Sep 17 00:00:00 2001 From: david hill Date: Wed, 25 May 2016 15:41:37 -0500 Subject: [PATCH 05/10] fix amazon getgroup command --- oam/cloud/IDBInstanceCmds.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/oam/cloud/IDBInstanceCmds.sh b/oam/cloud/IDBInstanceCmds.sh index 0436bb137..47a75295a 100644 --- a/oam/cloud/IDBInstanceCmds.sh +++ b/oam/cloud/IDBInstanceCmds.sh @@ -347,6 +347,9 @@ getGroup() { describeInstance fi group=`grep -B1 -A6 -m 1 $instance $describeInstanceFile | grep -m 1 GROUP | awk '{gsub(/^[ \t]+|[ \t]+$/,"");print $2}'` + if [ "$group" == "" ]; then + group=`grep -B1 -A4 -m 1 $instance $describeInstanceFile | grep -m 1 INSTANCE | awk '{gsub(/^[ \t]+|[ \t]+$/,"");print $21}'` + fi fi echo $group From c6f89d33361e2f1b57bfe3931f27dc0bff700c5a Mon Sep 17 00:00:00 2001 From: david hill Date: Thu, 26 May 2016 09:14:14 -0500 Subject: [PATCH 06/10] get localhostname --- oamapps/postConfigure/postConfigure.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/oamapps/postConfigure/postConfigure.cpp b/oamapps/postConfigure/postConfigure.cpp index a0aaed24c..0c3000506 100644 --- a/oamapps/postConfigure/postConfigure.cpp +++ b/oamapps/postConfigure/postConfigure.cpp @@ -1721,7 +1721,17 @@ int main(int argc, char *argv[]) } } else + { + if ( moduleHostName == oam::UnassignedName && + newModuleName == "pm1" ) + { + char hostname[128]; + gethostname(hostname, sizeof hostname); + moduleHostName = hostname; + } + prompt = "Enter Nic Interface #" + oam.itoa(nicID) + " Host Name (" + moduleHostName + ") > "; + } if ( prompt != "" ) { From d6388afd6422313d22d6dab6dd220fb5955b7f0e Mon Sep 17 00:00:00 2001 From: david hill Date: Thu, 26 May 2016 09:21:46 -0500 Subject: [PATCH 07/10] get localhostname - add check for nic = 1 --- oamapps/postConfigure/postConfigure.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oamapps/postConfigure/postConfigure.cpp b/oamapps/postConfigure/postConfigure.cpp index 0c3000506..121a2cd09 100644 --- a/oamapps/postConfigure/postConfigure.cpp +++ b/oamapps/postConfigure/postConfigure.cpp @@ -1651,7 +1651,7 @@ int main(int argc, char *argv[]) newModuleHostName = moduleHostName; if (amazonInstall) { if ( moduleHostName == oam::UnassignedName && - newModuleName == "pm1" ) + newModuleName == "pm1" && nicID == 1) { //get local instance name (pm1) string localInstance = oam.getEC2LocalInstance(); @@ -1723,7 +1723,7 @@ int main(int argc, char *argv[]) else { if ( moduleHostName == oam::UnassignedName && - newModuleName == "pm1" ) + newModuleName == "pm1" && nicID == 1) { char hostname[128]; gethostname(hostname, sizeof hostname); From fbb2b64d5c1e1557bf363515af99320d6d489f24 Mon Sep 17 00:00:00 2001 From: david hill Date: Thu, 26 May 2016 13:38:45 -0500 Subject: [PATCH 08/10] tweak for amazon pm1 terminate not restarting --- procmgr/main.cpp | 62 +++++------------------------------------------- 1 file changed, 6 insertions(+), 56 deletions(-) diff --git a/procmgr/main.cpp b/procmgr/main.cpp index 2d8511145..c07576dae 100644 --- a/procmgr/main.cpp +++ b/procmgr/main.cpp @@ -1609,9 +1609,12 @@ void pingDeviceThread() break; case oam::DOWN: - // if disabled or initial state, skip - if (opState == oam::AUTO_DISABLED || - opState == oam::INITIAL) + // if initial state, skip + if (opState == oam::INITIAL) + break; + + // if disabled and not amazon, skip + if (opState == oam::AUTO_DISABLED && !amazon) break; log.writeLog(__LINE__, "module failed to respond to pings: " + moduleName, LOG_TYPE_WARNING); @@ -1702,59 +1705,6 @@ void pingDeviceThread() // return values = 'ip address' for running or rebooting, stopped or terminated string currentIPAddr = oam.getEC2InstanceIpAddress(hostName); -/* if (currentIPAddr == "stopped") - { // start instance - log.writeLog(__LINE__, "Instance in stopped state, try starting it: " + hostName, LOG_TYPE_DEBUG); - - int retryCount = 6; // 1 minutes - if( moduleName.find("pm") == 0 ) - { - if ( PMInstanceType == "m2.4xlarge" ) - retryCount = 15; // 2.5 minutes - } - else - { - if( moduleName.find("um") == 0 ) - if ( UMInstanceType == "m2.4xlarge" ) - retryCount = 15; // 2.5 minutes - } - - int retry = 0; - for ( ; retry < retryCount ; retry++ ) - { - if ( oam.startEC2Instance(hostName) ) - { - log.writeLog(__LINE__, "Instance started, sleep for 30 seconds to allow it to fully come up: " + hostName, LOG_TYPE_DEBUG); - - //delay then get new IP Address - sleep(30); - string currentIPAddr = oam.getEC2InstanceIpAddress(hostName); - if (currentIPAddr == "stopped" || currentIPAddr == "terminated") { - log.writeLog(__LINE__, "Instance failed to start (no ip-address), retry: " + hostName, LOG_TYPE_DEBUG); - } - else - { - // update the Calpont.xml with the new IP Address - string cmd = "sed -i s/" + ipAddr + "/" + currentIPAddr + "/g " + startup::StartUp::installDir() + "/etc/Calpont.xml"; - system(cmd.c_str()); - break; - } - } - else - { - log.writeLog(__LINE__, "Instance failed to start, retry: " + hostName, LOG_TYPE_DEBUG); - - sleep(10); - } - } - - if ( retry >= retryCount ) - { - log.writeLog(__LINE__, "Instance failed to start, restart a new instance: " + hostName, LOG_TYPE_DEBUG); - currentIPAddr = "terminated"; - } - } -*/ if ( currentIPAddr == "terminated") { //check if down module was Standby OAM, if so find another one From 873111ce3ff9c17fc7a0b6600b2ca4db948e8fe4 Mon Sep 17 00:00:00 2001 From: david hill Date: Thu, 26 May 2016 15:52:41 -0500 Subject: [PATCH 09/10] change rsa key checkin --- oam/oamcpp/liboamcpp.cpp | 2 +- procmgr/processmanager.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/oam/oamcpp/liboamcpp.cpp b/oam/oamcpp/liboamcpp.cpp index ed406345f..7f15c6c68 100644 --- a/oam/oamcpp/liboamcpp.cpp +++ b/oam/oamcpp/liboamcpp.cpp @@ -4657,7 +4657,7 @@ namespace oam { buf = line; - string::size_type pos = buf.find("Offending RSA key",0); + string::size_type pos = buf.find("Offending",0); if (pos != string::npos) { // line ID pos = buf.find(":",0); diff --git a/procmgr/processmanager.cpp b/procmgr/processmanager.cpp index 78715db2e..97f83fd43 100644 --- a/procmgr/processmanager.cpp +++ b/procmgr/processmanager.cpp @@ -4602,8 +4602,8 @@ int ProcessManager::addModule(oam::DeviceNetworkList devicenetworklist, std::str system(cmd.c_str()); if (!oam.checkLogStatus("/tmp/login_test.log", "README")) { //check for RSA KEY ISSUE and fix - if (oam.checkLogStatus("/tmp/login_test.log", "Offending RSA key")) { - log.writeLog(__LINE__, "addModule - login failed, RSA key issue, try fixing: " + moduleName, LOG_TYPE_DEBUG); + if (oam.checkLogStatus("/tmp/login_test.log", "Offending")) { + log.writeLog(__LINE__, "addModule - login failed, Offending key issue, try fixing: " + moduleName, LOG_TYPE_DEBUG); string file = "/tmp/login_test.log"; oam.fixRSAkey(file); } From aaf65f9943a864d2f76ffc3071dc9c05c64cd8e7 Mon Sep 17 00:00:00 2001 From: david hill Date: Fri, 27 May 2016 09:21:23 -0500 Subject: [PATCH 10/10] add check for confirm arg in restartprocess --- oamapps/mcsadmin/mcsadmin.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/oamapps/mcsadmin/mcsadmin.cpp b/oamapps/mcsadmin/mcsadmin.cpp index e5c546f39..4511c876a 100644 --- a/oamapps/mcsadmin/mcsadmin.cpp +++ b/oamapps/mcsadmin/mcsadmin.cpp @@ -6835,9 +6835,13 @@ int ProcessSupportCommand(int CommandID, std::string arguments[]) getFlags(arguments, gracefulTemp, ackTemp, suspendAnswer, bNeedsConfirm); - // confirm request - if (confirmPrompt("This command restarts the processing of an application on a Module within the MariaDB Columnstore System")) - break; + if (arguments[3] != "y") + { + // confirm request + if (confirmPrompt("This command restarts the processing of an application on a Module within the MariaDB Columnstore System")) + break; + } + try {