You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
MCOL-259 replace system call to columnstore status with a function call to alleviate a race condition. Also added exception logging in a few places that will be helpful.
This commit is contained in:
@ -1368,10 +1368,7 @@ namespace oam
|
||||
|
||||
void Oam::getSystemStatus(SystemStatus& systemstatus, bool systemStatusOnly)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
exceptionControl("getSystemStatus", API_FAILURE);
|
||||
checkSystemRunning("getSystemStatus");
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// TODO: Remove when we create OAM for Windows
|
||||
@ -1414,15 +1411,15 @@ namespace oam
|
||||
catch (exception& e)
|
||||
{
|
||||
processor.shutdown();
|
||||
// string error = e.what();
|
||||
// writeLog("getSystemStatus: write exception: " + error, LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus", API_FAILURE);
|
||||
string error = e.what();
|
||||
writeLog("getSystemStatus: write exception: " + error, LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus write", API_FAILURE);
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
processor.shutdown();
|
||||
// writeLog("getSystemStatus: write exception: unknown", LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus", API_FAILURE);
|
||||
writeLog("getSystemStatus: write exception: unknown", LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus write", API_FAILURE);
|
||||
}
|
||||
|
||||
// wait 30 seconds for ACK from Process Monitor
|
||||
@ -1434,14 +1431,14 @@ namespace oam
|
||||
{
|
||||
processor.shutdown();
|
||||
string error = e.what();
|
||||
// writeLog("getSystemStatus: read exception: " + error, LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus", API_FAILURE);
|
||||
writeLog("getSystemStatus: read exception: " + error, LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus read", API_FAILURE);
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
processor.shutdown();
|
||||
// writeLog("getSystemStatus: read exception: unknown", LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus", API_FAILURE);
|
||||
writeLog("getSystemStatus: read exception: unknown", LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus read", API_FAILURE);
|
||||
}
|
||||
|
||||
if (ibs.length() > 0)
|
||||
@ -1521,13 +1518,24 @@ namespace oam
|
||||
processor.shutdown();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
writeLog("getSystemStatus: ProcStatusControl returns 0 length", LOG_TYPE_ERROR);
|
||||
}
|
||||
// timeout occurred, shutdown connection
|
||||
processor.shutdown();
|
||||
// writeLog("getSystemStatus: read 0 length", LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus", API_FAILURE);
|
||||
writeLog("getSystemStatus: read 0 length", LOG_TYPE_ERROR);
|
||||
exceptionControl("getSystemStatus read 0", API_FAILURE);
|
||||
}
|
||||
catch (exception& e)
|
||||
{
|
||||
string error = e.what();
|
||||
writeLog("getSystemStatus: final exception: " + error, LOG_TYPE_ERROR);
|
||||
}
|
||||
catch(...)
|
||||
{}
|
||||
{
|
||||
writeLog("getSystemStatus: final exception: unknown", LOG_TYPE_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
exceptionControl("getSystemStatus:MessageQueueClient-Error", API_FAILURE);
|
||||
@ -1572,6 +1580,7 @@ namespace oam
|
||||
ModuleConfig moduleconfig;
|
||||
std::vector<int> NICstates;
|
||||
degraded = false;
|
||||
state = oam::UNEQUIP;
|
||||
|
||||
try
|
||||
{
|
||||
@ -1595,8 +1604,19 @@ namespace oam
|
||||
getNICStatus((*pt1).HostName, state);
|
||||
NICstates.push_back(state);
|
||||
}
|
||||
catch (...)
|
||||
{}
|
||||
catch (exception& e)
|
||||
{
|
||||
Oam oam;
|
||||
ostringstream os;
|
||||
os << "Oam::getModuleStatus exception while getNICStatus " << (*pt1).HostName << " " << e.what();
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
}
|
||||
catch (...) {
|
||||
Oam oam;
|
||||
ostringstream os;
|
||||
os << "Oam::getModuleStatus exception while getNICStatus " << (*pt1).HostName;
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
vector<int>::iterator pt = NICstates.begin();
|
||||
@ -1609,16 +1629,37 @@ namespace oam
|
||||
}
|
||||
return;
|
||||
}
|
||||
catch (...)
|
||||
{}
|
||||
catch (exception& e)
|
||||
{
|
||||
Oam oam;
|
||||
ostringstream os;
|
||||
os << "Oam::getModuleStatus exception while getSystemConfig " << name << " " << e.what();
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
}
|
||||
catch (...) {
|
||||
Oam oam;
|
||||
ostringstream os;
|
||||
os << "Oam::getModuleStatus exception while getSystemConfig " << name;
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch(...)
|
||||
{}
|
||||
}
|
||||
catch (exception& e)
|
||||
{
|
||||
Oam oam;
|
||||
ostringstream os;
|
||||
os << "Oam::getModuleStatus exception while getSystemStatus " << e.what();
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
}
|
||||
catch (...) {
|
||||
Oam oam;
|
||||
ostringstream os;
|
||||
os << "Oam::getModuleStatus exception while getSystemStatus";
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
}
|
||||
|
||||
// no match found
|
||||
state = oam::UNEQUIP;
|
||||
exceptionControl("getModuleStatus", API_INVALID_PARAMETER);
|
||||
}
|
||||
|
||||
@ -1794,8 +1835,12 @@ namespace oam
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (exception&)
|
||||
catch (exception& e)
|
||||
{
|
||||
Oam oam;
|
||||
ostringstream os;
|
||||
os << "Oam::getNICStatus exception while getSystemStatus for " << name << " " << e.what();
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
exceptionControl("getNICStatus", API_FAILURE);
|
||||
}
|
||||
|
||||
@ -2091,10 +2136,7 @@ namespace oam
|
||||
|
||||
void Oam::getProcessStatus(SystemProcessStatus& systemprocessstatus, string port)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
exceptionControl("getProcessStatus", API_FAILURE);
|
||||
checkSystemRunning("getProcessStatus");
|
||||
|
||||
ProcessStatus processstatus;
|
||||
systemprocessstatus.processstatus.clear();
|
||||
@ -2194,10 +2236,7 @@ namespace oam
|
||||
return;
|
||||
#endif
|
||||
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
exceptionControl("getProcessStatus", API_FAILURE);
|
||||
checkSystemRunning("getProcessStatus");
|
||||
|
||||
for ( int i = 0 ; i < 5 ; i ++)
|
||||
{
|
||||
@ -2292,10 +2331,7 @@ namespace oam
|
||||
|
||||
void Oam::setProcessStatus(const std::string process, const std::string module, const int state, pid_t PID)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
exceptionControl("setProcessStatus", API_FAILURE);
|
||||
checkSystemRunning("setProcessStatus");
|
||||
|
||||
//send and wait for ack and resend if not received
|
||||
//retry 5 times max
|
||||
@ -2812,10 +2848,7 @@ namespace oam
|
||||
exceptionControl("getMyProcessStatus", API_FAILURE);
|
||||
}
|
||||
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
exceptionControl("getMyProcessStatus", API_FAILURE);
|
||||
checkSystemRunning("getMyProcessStatus");
|
||||
|
||||
for ( int i = 0 ; i < 5 ; i ++)
|
||||
{
|
||||
@ -4652,20 +4685,31 @@ namespace oam
|
||||
{
|
||||
ifstream file (fileName.c_str());
|
||||
|
||||
char line[400];
|
||||
if (!file.is_open())
|
||||
{
|
||||
ostringstream os;
|
||||
os << "checkLogStatus error while opening file " << fileName << " " << strerror(errno);
|
||||
writeLog(os.str(), LOG_TYPE_ERROR );
|
||||
}
|
||||
string buf;
|
||||
|
||||
while (file.getline(line, 400))
|
||||
while (getline(file, buf))
|
||||
{
|
||||
buf = line;
|
||||
|
||||
string::size_type pos = buf.find(phrase,0);
|
||||
if (pos != string::npos)
|
||||
//found phrase
|
||||
return true;
|
||||
}
|
||||
if (file.bad())
|
||||
{
|
||||
ostringstream os;
|
||||
os << "checkLogStatus error while reading file " << fileName << " " << strerror(errno);
|
||||
writeLog(os.str(), LOG_TYPE_ERROR );
|
||||
}
|
||||
file.close();
|
||||
|
||||
ostringstream os;
|
||||
os << "checkLogStatus failed " << fileName << " expected \"" << phrase.c_str() << "\" found \"" << buf.c_str() << "\"";
|
||||
writeLog(os.str(), LOG_TYPE_ERROR );
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -4821,10 +4865,7 @@ namespace oam
|
||||
********************************************************************/
|
||||
bool Oam::switchParentOAMModule(std::string moduleName, GRACEFUL_FLAG gracefulflag)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
exceptionControl("switchParentOAMModule", API_FAILURE);
|
||||
checkSystemRunning("switchParentOAMModule");
|
||||
|
||||
int returnStatus;
|
||||
// We assume that moduleName is a valid pm
|
||||
@ -4837,7 +4878,7 @@ namespace oam
|
||||
|
||||
string cmdLine = "ping ";
|
||||
string cmdOption = " -w 1 >> /dev/null";
|
||||
cmd = cmdLine + IPAddr + cmdOption;
|
||||
string cmd = cmdLine + IPAddr + cmdOption;
|
||||
if ( system(cmd.c_str()) != 0 ) {
|
||||
//ping failure
|
||||
try{
|
||||
@ -6267,10 +6308,7 @@ namespace oam
|
||||
exceptionControl("sysConfig->write", API_FAILURE);
|
||||
}
|
||||
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
return;
|
||||
checkSystemRunning("addDbroot");
|
||||
|
||||
//get updated Columnstore.xml distributed
|
||||
distributeConfigFile("system");
|
||||
@ -8734,9 +8772,7 @@ namespace oam
|
||||
GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag, const std::string argument1,
|
||||
const std::string argument2, int timeout)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
if (!checkSystemRunning(""))
|
||||
return API_CONN_REFUSED;
|
||||
|
||||
int returnStatus = API_SUCCESS; //default
|
||||
@ -8837,9 +8873,7 @@ namespace oam
|
||||
int Oam::sendMsgToProcMgr2(messageqcpp::ByteStream::byte requestType, DeviceNetworkList devicenetworklist,
|
||||
GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag, const std::string password, const std::string mysqlpw)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
if (!checkSystemRunning(""))
|
||||
return API_CONN_REFUSED;
|
||||
|
||||
int returnStatus = API_TIMEOUT; //default
|
||||
@ -8953,9 +8987,7 @@ namespace oam
|
||||
|
||||
int Oam::sendMsgToProcMgr3(messageqcpp::ByteStream::byte requestType, AlarmList& alarmlist, const std::string date)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
if (!checkSystemRunning(""))
|
||||
return API_CONN_REFUSED;
|
||||
|
||||
int returnStatus = API_SUCCESS; //default
|
||||
@ -9056,9 +9088,7 @@ namespace oam
|
||||
GRACEFUL_FLAG gracefulflag, ACK_FLAG ackflag,
|
||||
const std::string argument1, const std::string argument2, int timeout)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
if (!checkSystemRunning(""))
|
||||
return API_CONN_REFUSED;
|
||||
|
||||
int returnStatus = API_STILL_WORKING;
|
||||
@ -9244,9 +9274,7 @@ namespace oam
|
||||
|
||||
void Oam::sendStatusUpdate(ByteStream obs, ByteStream::byte returnRequestType)
|
||||
{
|
||||
string cmd = startup::StartUp::installDir() + "/bin/columnstore status > /tmp/status.log";
|
||||
system(cmd.c_str());
|
||||
if (!checkLogStatus("/tmp/status.log", "MariaDB Columnstore is running") )
|
||||
if (!checkSystemRunning(""))
|
||||
return;
|
||||
|
||||
for ( int i = 0 ; i < 5 ; i ++)
|
||||
@ -9622,6 +9650,33 @@ namespace oam
|
||||
return returnStatus;
|
||||
}
|
||||
|
||||
bool Oam::checkSystemRunning(const char* function)
|
||||
{
|
||||
struct stat st;
|
||||
if (stat("/var/lock/subsys/columnstore", &st) == 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (geteuid() != 0)
|
||||
{
|
||||
// not root user
|
||||
// The stat above may fail for non-root because of permissions
|
||||
// This is a non-optimal solution
|
||||
string cmd = "pgrep ProcMon";
|
||||
if (system(cmd.c_str()) != 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
ostringstream os;
|
||||
os << function << " system is not running: " << strerror(errno);
|
||||
writeLog(os.str(), LOG_TYPE_ERROR );
|
||||
if (strlen(function))
|
||||
{
|
||||
throw runtime_error(os.str());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
} //namespace oam
|
||||
|
||||
|
||||
|
@ -2482,6 +2482,8 @@ namespace oam
|
||||
*/
|
||||
void sendStatusUpdate(messageqcpp::ByteStream obs, messageqcpp::ByteStream::byte returnRequestType);
|
||||
|
||||
bool checkSystemRunning(const char* function);
|
||||
|
||||
std::string CalpontConfigFile;
|
||||
std::string AlarmConfigFile;
|
||||
std::string ProcessConfigFile;
|
||||
|
@ -120,7 +120,19 @@ void OamCache::checkReload()
|
||||
try {
|
||||
oam.getModuleStatus(string("pm") + num, state, degraded);
|
||||
}
|
||||
catch (...) {break;}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
ostringstream os;
|
||||
os << "OamCache::checkReload exception while getModuleStatus pm" << num << " " << e.what();
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
break;
|
||||
}
|
||||
catch (...) {
|
||||
ostringstream os;
|
||||
os << "OamCache::checkReload exception while getModuleStatus pm" << num;
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
break;
|
||||
}
|
||||
|
||||
if (state == oam::ACTIVE || state == oam::DEGRADED) {
|
||||
pmToConnectionMap[*it] = i++;
|
||||
@ -134,10 +146,22 @@ void OamCache::checkReload()
|
||||
{
|
||||
ostringstream os;
|
||||
os << "OamCache::checkReload shows state for pm" << num << " as " << oamState[state];
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_WARNING);
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
}
|
||||
}
|
||||
catch (...) { /* doesn't get added to the connection map */ }
|
||||
catch (std::exception& e)
|
||||
{
|
||||
ostringstream os;
|
||||
os << "OamCache::checkReload final exception while getModuleStatus " << e.what();
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
break;
|
||||
}
|
||||
catch (...) {
|
||||
ostringstream os;
|
||||
os << "OamCache::checkReload final exception while getModuleStatus";
|
||||
oam.writeLog(os.str(), logging::LOG_TYPE_ERROR);
|
||||
break;
|
||||
}
|
||||
}
|
||||
#else
|
||||
moduleIds.push_back(*it);
|
||||
|
@ -437,7 +437,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
|
||||
size_t mlread = 0;
|
||||
|
||||
if (readToMagic(msecs, isTimeOut, stats) == false) //indicates a timeout or EOF
|
||||
{
|
||||
logIoError("InetStreamSocket::read: timeout during readToMagic", 0);
|
||||
return SBS(new ByteStream(0));
|
||||
}
|
||||
|
||||
//FIXME: This seems like a lot of work to read 4 bytes...
|
||||
while (mlread < sizeof(msglen))
|
||||
@ -458,6 +461,7 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
|
||||
{
|
||||
if (isTimeOut)
|
||||
*isTimeOut = true;
|
||||
logIoError("InetStreamSocket::read: timeout during first poll", 0);
|
||||
return SBS(new ByteStream(0));
|
||||
}
|
||||
}
|
||||
@ -470,7 +474,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
|
||||
if (t == 0)
|
||||
{
|
||||
if (timeout == NULL)
|
||||
{
|
||||
logIoError("InetStreamSocket::read: timeout during first read", 0);
|
||||
return SBS(new ByteStream(0)); // don't return an incomplete message
|
||||
}
|
||||
else
|
||||
throw SocketClosed("InetStreamSocket::read: Remote is closed");
|
||||
}
|
||||
@ -518,7 +525,10 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
|
||||
if (err == 0) // timeout
|
||||
{
|
||||
if (isTimeOut)
|
||||
{
|
||||
logIoError("InetStreamSocket::read: timeout during second poll", 0);
|
||||
*isTimeOut = true;
|
||||
}
|
||||
if (stats)
|
||||
stats->dataRecvd(nread);
|
||||
return SBS(new ByteStream(0));
|
||||
@ -538,8 +548,11 @@ const SBS InetStreamSocket::read(const struct ::timespec* timeout, bool* isTimeO
|
||||
if (timeout == NULL)
|
||||
return SBS(new ByteStream(0)); // don't return an incomplete message
|
||||
else
|
||||
{
|
||||
logIoError("InetStreamSocket::read: timeout during second read", 0);
|
||||
throw SocketClosed("InetStreamSocket::read: Remote is closed");
|
||||
}
|
||||
}
|
||||
if (t < 0) {
|
||||
ostringstream oss;
|
||||
#ifdef _MSC_VER
|
||||
|
Reference in New Issue
Block a user