1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-259 replace system call to columnstore status with a function call to alleviate a race condition. Also added exception logging in a few places that will be helpful.

This commit is contained in:
David Hall
2016-09-02 09:26:44 -05:00
parent 75ecb950f9
commit 7e0723a8bc
4 changed files with 281 additions and 187 deletions

View File

@ -1368,10 +1368,7 @@ namespace oam
void Oam::getSystemStatus(SystemStatus& systemstatus, bool systemStatusOnly)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
exceptionControl("getSystemStatus", API_FAILURE);
checkSystemRunning("getSystemStatus");
#ifdef _MSC_VER
// TODO: Remove when we create OAM for Windows
@ -1414,15 +1411,15 @@ namespace oam
catch (exception& e)
{
processor.shutdown();
// string error = e.what();
// writeLog("getSystemStatus: write exception: " + error, LOG_TYPE_ERROR);
exceptionControl("getSystemStatus", API_FAILURE);
string error = e.what();
writeLog("getSystemStatus: write exception: " + error, LOG_TYPE_ERROR);
exceptionControl("getSystemStatus write", API_FAILURE);
}
catch(...)
{
processor.shutdown();
// writeLog("getSystemStatus: write exception: unknown", LOG_TYPE_ERROR);
exceptionControl("getSystemStatus", API_FAILURE);
writeLog("getSystemStatus: write exception: unknown", LOG_TYPE_ERROR);
exceptionControl("getSystemStatus write", API_FAILURE);
}
// wait 30 seconds for ACK from Process Monitor
@ -1434,14 +1431,14 @@ namespace oam
{
processor.shutdown();
string error = e.what();
// writeLog("getSystemStatus: read exception: " + error, LOG_TYPE_ERROR);
exceptionControl("getSystemStatus", API_FAILURE);
writeLog("getSystemStatus: read exception: " + error, LOG_TYPE_ERROR);
exceptionControl("getSystemStatus read", API_FAILURE);
}
catch(...)
{
processor.shutdown();
// writeLog("getSystemStatus: read exception: unknown", LOG_TYPE_ERROR);
exceptionControl("getSystemStatus", API_FAILURE);
writeLog("getSystemStatus: read exception: unknown", LOG_TYPE_ERROR);
exceptionControl("getSystemStatus read", API_FAILURE);
}
if (ibs.length() > 0)
@ -1521,13 +1518,24 @@ namespace oam
processor.shutdown();
return;
}
else
{
writeLog("getSystemStatus: ProcStatusControl returns 0 length", LOG_TYPE_ERROR);
}
// timeout ocurred, shutdown connection
processor.shutdown();
// writeLog("getSystemStatus: read 0 length", LOG_TYPE_ERROR);
exceptionControl("getSystemStatus", API_FAILURE);
writeLog("getSystemStatus: read 0 length", LOG_TYPE_ERROR);
exceptionControl("getSystemStatus read 0", API_FAILURE);
}
catch (exception& e)
{
string error = e.what();
writeLog("getSystemStatus: final exception: " + error, LOG_TYPE_ERROR);
}
catch(...)
{}
{
writeLog("getSystemStatus: final exception: unknown", LOG_TYPE_ERROR);
}
}
exceptionControl("getSystemStatus:MessageQueueClient-Error", API_FAILURE);
@ -1572,6 +1580,7 @@ namespace oam
ModuleConfig moduleconfig;
std::vector<int> NICstates;
degraded = false;
state = oam::UNEQUIP;
try
{
@ -1595,8 +1604,19 @@ namespace oam
getNICStatus((*pt1).HostName, state);
NICstates.push_back(state);
}
catch (...)
{}
catch (exception& e)
{
Oam oam;
ostringstream os;
os << "Oam::getModuleStatus exception while getNICStatus " << (*pt1).HostName << " " << e.what();
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
}
catch (...) {
Oam oam;
ostringstream os;
os << "Oam::getModuleStatus exception while getNICStatus " << (*pt1).HostName;
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
}
}
vector<int>::iterator pt = NICstates.begin();
@ -1609,16 +1629,37 @@ namespace oam
}
return;
}
catch (...)
{}
catch (exception& e)
{
Oam oam;
ostringstream os;
os << "Oam::getModuleStatus exception while getSystemConfig " << name << " " << e.what();
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
}
catch (...) {
Oam oam;
ostringstream os;
os << "Oam::getModuleStatus exception while getSystemConfig " << name;
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
}
}
}
catch(...)
{}
}
catch (exception& e)
{
Oam oam;
ostringstream os;
os << "Oam::getModuleStatus exception while getSystemStatus " << e.what();
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
}
catch (...) {
Oam oam;
ostringstream os;
os << "Oam::getModuleStatus exception while getSystemStatus";
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
}
// no match found
state = oam::UNEQUIP;
exceptionControl("getModuleStatus", API_INVALID_PARAMETER);
}
@ -1794,8 +1835,12 @@ namespace oam
}
}
}
catch (exception&)
catch (exception& e)
{
Oam oam;
ostringstream os;
os << "Oam::getNICStatus exception while getSystemStatus for " << name << " " << e.what();
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
exceptionControl("getNICStatus", API_FAILURE);
}
@ -2091,10 +2136,7 @@ namespace oam
void Oam::getProcessStatus(SystemProcessStatus& systemprocessstatus, string port)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
exceptionControl("getProcessStatus", API_FAILURE);
checkSystemRunning("getProcessStatus");
ProcessStatus processstatus;
systemprocessstatus.processstatus.clear();
@ -2194,10 +2236,7 @@ namespace oam
return;
#endif
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
exceptionControl("getProcessStatus", API_FAILURE);
checkSystemRunning("getProcessStatus");
for ( int i = 0 ; i < 5 ; i ++)
{
@ -2292,10 +2331,7 @@ namespace oam
void Oam::setProcessStatus(const std::string process, const std::string module, const int state, pid_t PID)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
exceptionControl("setProcessStatus", API_FAILURE);
checkSystemRunning("setProcessStatus");
//send and wait for ack and resend if not received
//retry 5 time max
@ -2812,10 +2848,7 @@ namespace oam
exceptionControl("getMyProcessStatus", API_FAILURE);
}
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
exceptionControl("getMyProcessStatus", API_FAILURE);
checkSystemRunning("getMyProcessStatus");
for ( int i = 0 ; i < 5 ; i ++)
{
@ -4652,20 +4685,31 @@ namespace oam
{
ifstream file (fileName.c_str());
char line[400];
if (!file.is_open())
{
ostringstream os;
os << "checkLogStatus error while opening file " << fileName << " " << strerror(errno);
writeLog(os.str(), LOG_TYPE_ERROR );
}
string buf;
while (file.getline(line, 400))
while (getline(file, buf))
{
buf = line;
string::size_type pos = buf.find(phrase,0);
if (pos != string::npos)
//found phrase
return true;
}
if (file.bad())
{
ostringstream os;
os << "checkLogStatus error while reading file " << fileName << " " << strerror(errno);
writeLog(os.str(), LOG_TYPE_ERROR );
}
file.close();
ostringstream os;
os << "checkLogStatus failed " << fileName << " expected \"" << phrase.c_str() << "\" found \"" << buf.c_str() << "\"";
writeLog(os.str(), LOG_TYPE_ERROR );
return false;
}
@ -4821,10 +4865,7 @@ namespace oam
********************************************************************/
bool Oam::switchParentOAMModule(std::string moduleName, GRACEFUL_FLAG gracefulflag)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
exceptionControl("switchParentOAMModule", API_FAILURE);
checkSystemRunning("switchParentOAMModule");
int returnStatus;
// We assume that moduleName is a valid pm
@ -4837,7 +4878,7 @@ namespace oam
string cmdLine = "ping ";
string cmdOption = " -w 1 >> /dev/null";
cmd = cmdLine + IPAddr + cmdOption;
string cmd = cmdLine + IPAddr + cmdOption;
if ( system(cmd.c_str()) != 0 ) {
//ping failure
try{
@ -6267,10 +6308,7 @@ namespace oam
exceptionControl("sysConfig->write", API_FAILURE);
}
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
return;
checkSystemRunning("addDbroot");
//get updated Columnstore.xml distributed
distributeConfigFile("system");
@ -8734,9 +8772,7 @@ namespace oam
GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag, const std::string argument1,
const std::string argument2, int timeout)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
if (!checkSystemRunning(""))
return API_CONN_REFUSED;
int returnStatus = API_SUCCESS; //default
@ -8837,9 +8873,7 @@ namespace oam
int Oam::sendMsgToProcMgr2(messageqcpp::ByteStream::byte requestType, DeviceNetworkList devicenetworklist,
GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag, const std::string password, const std::string mysqlpw)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
if (!checkSystemRunning(""))
return API_CONN_REFUSED;
int returnStatus = API_TIMEOUT; //default
@ -8953,9 +8987,7 @@ namespace oam
int Oam::sendMsgToProcMgr3(messageqcpp::ByteStream::byte requestType, AlarmList& alarmlist, const std::string date)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
if (!checkSystemRunning(""))
return API_CONN_REFUSED;
int returnStatus = API_SUCCESS; //default
@ -9056,9 +9088,7 @@ namespace oam
GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag,
const std::string argument1, const std::string argument2, int timeout)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
if (!checkSystemRunning(""))
return API_CONN_REFUSED;
int returnStatus = API_STILL_WORKING;
@ -9244,9 +9274,7 @@ namespace oam
void Oam::sendStatusUpdate(ByteStream obs, ByteStream::byte returnRequestType)
{
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
system(cmd.c_str());
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
if (!checkSystemRunning(""))
return;
for ( int i = 0 ; i < 5 ; i ++)
@ -9622,6 +9650,33 @@ namespace oam
return returnStatus;
}
bool Oam::checkSystemRunning(const char* function)
{
struct stat st;
if (stat("/var/lock/subsys/columnstore", &st) == 0)
{
return true;
}
if (geteuid() != 0)
{
// not root user
// The stat above may fail for non-root because of permissions
// This is a non-optimal solution
string cmd = "pgrep ProcMon";
if (system(cmd.c_str()) != 0)
{
return true;
}
}
ostringstream os;
os << function << " system is not running: " << strerror(errno);
writeLog(os.str(), LOG_TYPE_ERROR );
if (strlen(function))
{
throw runtime_error(os.str());
}
return false;
}
} //namespace oam

View File

@ -2482,6 +2482,8 @@ namespace oam
*/
void sendStatusUpdate(messageqcpp::ByteStream obs, messageqcpp::ByteStream::byte returnRequestType);
bool checkSystemRunning(const char* function);
std::string CalpontConfigFile;
std::string AlarmConfigFile;
std::string ProcessConfigFile;

View File

@ -120,7 +120,19 @@ void OamCache::checkReload()
try {
oam.getModuleStatus(string("pm") + num, state, degraded);
}
catch (...) {break;}
catch (std::exception& e)
{
ostringstream os;
os << "OamCache::checkReload exception while getModuleStatus pm" << num << " " << e.what();
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
break;
}
catch (...) {
ostringstream os;
os << "OamCache::checkReload exception while getModuleStatus pm" << num;
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
break;
}
if (state == oam::ACTIVE || state == oam::DEGRADED) {
pmToConnectionMap[*it] = i++;
@ -134,10 +146,22 @@ void OamCache::checkReload()
{
ostringstream os;
os << "OamCache::checkReload shows state for pm" << num << " as " << oamState[state];
oam.writeLog(os.str(), logging::LOG_TYPE_WARNING);
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
}
}
catch (...) { /* doesn't get added to the connection map */ }
catch (std::exception& e)
{
ostringstream os;
os << "OamCache::checkReload final exception while getModuleStatus " << e.what();
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
break;
}
catch (...) {
ostringstream os;
os << "OamCache::checkReload final exception while getModuleStatus";
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
break;
}
}
#else
moduleIds.push_back(*it);

View File

@ -437,7 +437,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
size_t mlread = 0;
if (readToMagic(msecs, isTimeOut, stats) == false) //indicates a timeout or EOF
{
logIoError("InetStreamSocket::read: timeout during readToMagic", 0);
return SBS(new ByteStream(0));
}
//FIXME: This seems like a lot of work to read 4 bytes...
while (mlread < sizeof(msglen))
@ -458,6 +461,7 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
{
if (isTimeOut)
*isTimeOut = true;
logIoError("InetStreamSocket::read: timeout during first poll", 0);
return SBS(new ByteStream(0));
}
}
@ -470,7 +474,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
if (t == 0)
{
if (timeout == NULL)
{
logIoError("InetStreamSocket::read: timeout during first read", 0);
return SBS(new ByteStream(0)); // don't return an incomplete message
}
else
throw SocketClosed("InetStreamSocket::read: Remote is closed");
}
@ -518,7 +525,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
if (err == 0) // timeout
{
if (isTimeOut)
{
logIoError("InetStreamSocket::read: timeout during second poll", 0);
*isTimeOut = true;
}
if (stats)
stats->dataRecvd(nread);
return SBS(new ByteStream(0));
@ -538,8 +548,11 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
if (timeout == NULL)
return SBS(new ByteStream(0)); // don't return an incomplete message
else
{
logIoError("InetStreamSocket::read: timeout during second read", 0);
throw SocketClosed("InetStreamSocket::read: Remote is closed");
}
}
if (t < 0) {
ostringstream oss;
#ifdef _MSC_VER