You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-08-05 16:15:50 +03:00
MCOL-1137 - fixed master/slave setup after failover
This commit is contained in:
@@ -56,7 +56,7 @@ checkForError
|
||||
#
|
||||
# Run reset slave command
|
||||
#
|
||||
echo "Run start slave command" >>/tmp/disable-rep-status.log
|
||||
echo "Run reset slave command" >>/tmp/disable-rep-status.log
|
||||
cat >/tmp/idb_disable-rep.sql <<EOD
|
||||
reset slave;
|
||||
EOD
|
||||
|
130
procmgr/main.cpp
130
procmgr/main.cpp
@@ -1645,7 +1645,7 @@ void pingDeviceThread()
|
||||
DeviceNetworkConfig devicenetworkconfig;
|
||||
devicenetworkconfig.DeviceName = moduleName;
|
||||
devicenetworklist.push_back(devicenetworkconfig);
|
||||
processManager.setMySQLReplication(devicenetworklist);
|
||||
processManager.setMySQLReplication(devicenetworklist, oam::UnassignedName, false, true);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -1833,71 +1833,73 @@ void pingDeviceThread()
|
||||
if (LANOUTAGEACTIVE)
|
||||
break;
|
||||
|
||||
//Log failure, issue alarm, set moduleOpState
|
||||
Configuration config;
|
||||
log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL);
|
||||
// if not disabled and amazon, skip
|
||||
if (opState != oam::AUTO_DISABLED )
|
||||
{
|
||||
//Log failure, issue alarm, set moduleOpState
|
||||
Configuration config;
|
||||
log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL);
|
||||
|
||||
//set query system state not ready
|
||||
BRM::DBRM dbrm;
|
||||
dbrm.setSystemQueryReady(false);
|
||||
|
||||
processManager.setQuerySystemState(false);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
processManager.reinitProcessType("cpimport");
|
||||
|
||||
// halt the dbrm
|
||||
oam.dbrmctl("halt");
|
||||
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
//string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1";
|
||||
//system(cmd.c_str());
|
||||
|
||||
//send notification
|
||||
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
|
||||
|
||||
//Issue an alarm
|
||||
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
|
||||
|
||||
//set query system state not ready
|
||||
BRM::DBRM dbrm;
|
||||
dbrm.setSystemQueryReady(false);
|
||||
//mark all processes running on module auto-offline
|
||||
processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
|
||||
|
||||
//set module to disable state
|
||||
processManager.disableModule(moduleName, false);
|
||||
|
||||
processManager.setQuerySystemState(false);
|
||||
//call dbrm control
|
||||
oam.dbrmctl("reload");
|
||||
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
processManager.reinitProcessType("cpimport");
|
||||
|
||||
// halt the dbrm
|
||||
oam.dbrmctl("halt");
|
||||
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
|
||||
|
||||
processManager.setSystemState(oam::BUSY_INIT);
|
||||
|
||||
//string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1";
|
||||
//system(cmd.c_str());
|
||||
|
||||
//send notification
|
||||
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
|
||||
|
||||
//Issue an alarm
|
||||
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
|
||||
|
||||
//mark all processes running on module auto-offline
|
||||
processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
|
||||
|
||||
//set module to disable state
|
||||
processManager.disableModule(moduleName, false);
|
||||
|
||||
//call dbrm control
|
||||
oam.dbrmctl("reload");
|
||||
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
|
||||
|
||||
// if pm, move dbroots to other pms
|
||||
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
|
||||
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
|
||||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
|
||||
try {
|
||||
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
|
||||
oam.autoMovePmDbroot(moduleName);
|
||||
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
|
||||
//distribute config file
|
||||
processManager.distributeConfigFile("system");
|
||||
}
|
||||
catch (exception& ex)
|
||||
{
|
||||
string error = ex.what();
|
||||
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
||||
// if pm, move dbroots to other pms
|
||||
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
|
||||
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
|
||||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
|
||||
try {
|
||||
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
|
||||
oam.autoMovePmDbroot(moduleName);
|
||||
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
|
||||
//distribute config file
|
||||
processManager.distributeConfigFile("system");
|
||||
}
|
||||
catch (exception& ex)
|
||||
{
|
||||
string error = ex.what();
|
||||
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// if Cloud Instance
|
||||
// state = running, then instance is rebooting, monitor for recovery
|
||||
// state = stopped, then try starting, if fail, remove/addmodule to launch new instance
|
||||
// state = terminate or nothing, remove/addmodule to launch new instance
|
||||
// state = terminate, remove/addmodule to launch new instance
|
||||
if ( amazon ) {
|
||||
if ( moduleName.find("um") == 0 )
|
||||
{
|
||||
@@ -2104,7 +2106,9 @@ void pingDeviceThread()
|
||||
}
|
||||
}
|
||||
|
||||
if ( moduleName.find("pm") == 0 )
|
||||
if ( ( moduleName.find("pm") == 0 ) &&
|
||||
( opState != oam::AUTO_DISABLED ) )
|
||||
|
||||
{
|
||||
// resume the dbrm
|
||||
oam.dbrmctl("resume");
|
||||
@@ -2165,6 +2169,10 @@ void pingDeviceThread()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if disabled and amazon, break out
|
||||
if ( (opState == oam::AUTO_DISABLED ) && amazon )
|
||||
break;
|
||||
|
||||
//start SIMPLEX runtype processes on a SIMPLEX runtype module
|
||||
string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
||||
|
@@ -9715,8 +9715,11 @@ std::string ProcessManager::getStandbyModule()
|
||||
|
||||
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
|
||||
systemprocessstatus.processstatus[i].ProcessOpState == oam::COLD_STANDBY )
|
||||
{
|
||||
// Found a ProcessManager in a COLD_STANDBY state
|
||||
newStandbyModule = systemprocessstatus.processstatus[i].Module;
|
||||
break;
|
||||
}
|
||||
|
||||
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
|
||||
systemprocessstatus.processstatus[i].ProcessOpState == oam::MAN_OFFLINE &&
|
||||
@@ -10346,6 +10349,18 @@ int ProcessManager::setMySQLReplication(oam::DeviceNetworkList devicenetworklist
|
||||
if ( remoteModuleName == masterModule )
|
||||
continue;
|
||||
|
||||
// skip disabled modules
|
||||
int opState = oam::ACTIVE;
|
||||
bool degraded;
|
||||
try {
|
||||
oam.getModuleStatus(remoteModuleName, opState, degraded);
|
||||
}
|
||||
catch(...)
|
||||
{}
|
||||
|
||||
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
|
||||
continue;
|
||||
|
||||
// don't do PMs unless PMwithUM flag is set
|
||||
if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM ) {
|
||||
string moduleType = remoteModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
||||
@@ -10423,6 +10438,18 @@ int ProcessManager::setMySQLReplication(oam::DeviceNetworkList devicenetworklist
|
||||
if ( remoteModuleName == masterModule )
|
||||
continue;
|
||||
|
||||
// skip disabled modules
|
||||
int opState = oam::ACTIVE;
|
||||
bool degraded;
|
||||
try {
|
||||
oam.getModuleStatus(remoteModuleName, opState, degraded);
|
||||
}
|
||||
catch(...)
|
||||
{}
|
||||
|
||||
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
|
||||
continue;
|
||||
|
||||
// don't do PMs unless PMwithUM flag is set
|
||||
if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM ) {
|
||||
string moduleType = remoteModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
|
||||
@@ -10471,7 +10498,19 @@ int ProcessManager::setMySQLReplication(oam::DeviceNetworkList devicenetworklist
|
||||
if ( remoteModuleName == masterModule )
|
||||
continue;
|
||||
|
||||
ByteStream msg1;
|
||||
// skip disabled modules
|
||||
int opState = oam::ACTIVE;
|
||||
bool degraded;
|
||||
try {
|
||||
oam.getModuleStatus(remoteModuleName, opState, degraded);
|
||||
}
|
||||
catch(...)
|
||||
{}
|
||||
|
||||
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
|
||||
continue;
|
||||
|
||||
ByteStream msg1;
|
||||
ByteStream::byte requestID = oam::SLAVEREP;
|
||||
if ( !enable ) {
|
||||
requestID = oam::DISABLEREP;
|
||||
|
@@ -4950,6 +4950,16 @@ int ProcessMonitor::runMasterRep(std::string& masterLogFile, std::string& master
|
||||
{
|
||||
string moduleName = (*pt).DeviceName;
|
||||
|
||||
//skip if local module or module is not ACTIVE
|
||||
if ( moduleName == config.moduleName() )
|
||||
continue;
|
||||
|
||||
int opState = oam::ACTIVE;
|
||||
bool degraded;
|
||||
oam.getModuleStatus(moduleName, opState, degraded);
|
||||
if (opState != oam::ACTIVE)
|
||||
continue;
|
||||
|
||||
bool passwordError = false;
|
||||
|
||||
string moduleType = systemModuleTypeConfig.moduletypeconfig[i].ModuleType;
|
||||
|
Reference in New Issue
Block a user