1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-05 16:15:50 +03:00

MCOL-1137 - fixed master/slave setup after failover

This commit is contained in:
david hill
2018-01-18 17:14:14 -06:00
parent 12e4960a04
commit c56555e9fe
4 changed files with 120 additions and 63 deletions

View File

@@ -56,7 +56,7 @@ checkForError
# #
# Run reset slave command # Run reset slave command
# #
echo "Run start slave command" >>/tmp/disable-rep-status.log echo "Run reset slave command" >>/tmp/disable-rep-status.log
cat >/tmp/idb_disable-rep.sql <<EOD cat >/tmp/idb_disable-rep.sql <<EOD
reset slave; reset slave;
EOD EOD

View File

@@ -1645,7 +1645,7 @@ void pingDeviceThread()
DeviceNetworkConfig devicenetworkconfig; DeviceNetworkConfig devicenetworkconfig;
devicenetworkconfig.DeviceName = moduleName; devicenetworkconfig.DeviceName = moduleName;
devicenetworklist.push_back(devicenetworkconfig); devicenetworklist.push_back(devicenetworkconfig);
processManager.setMySQLReplication(devicenetworklist); processManager.setMySQLReplication(devicenetworklist, oam::UnassignedName, false, true);
} }
} }
else else
@@ -1833,71 +1833,73 @@ void pingDeviceThread()
if (LANOUTAGEACTIVE) if (LANOUTAGEACTIVE)
break; break;
//Log failure, issue alarm, set moduleOpState // if not disabled and amazon, skip
Configuration config; if (opState != oam::AUTO_DISABLED )
log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL); {
//Log failure, issue alarm, set moduleOpState
Configuration config;
log.writeLog(__LINE__, "module is down: " + moduleName, LOG_TYPE_CRITICAL);
//set query system state not ready
BRM::DBRM dbrm;
dbrm.setSystemQueryReady(false);
processManager.setQuerySystemState(false);
processManager.setSystemState(oam::BUSY_INIT);
processManager.reinitProcessType("cpimport");
// halt the dbrm
oam.dbrmctl("halt");
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG);
processManager.setSystemState(oam::BUSY_INIT);
//string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1";
//system(cmd.c_str());
//send notification
oam.sendDeviceNotification(moduleName, MODULE_DOWN);
//Issue an alarm
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET);
//set query system state not ready //mark all processes running on module auto-offline
BRM::DBRM dbrm; processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
dbrm.setSystemQueryReady(false);
//set module to disable state
processManager.disableModule(moduleName, false);
processManager.setQuerySystemState(false); //call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
processManager.setSystemState(oam::BUSY_INIT); // if pm, move dbroots to other pms
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
processManager.reinitProcessType("cpimport"); ( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
// halt the dbrm try {
oam.dbrmctl("halt"); log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
log.writeLog(__LINE__, "'dbrmctl halt' done", LOG_TYPE_DEBUG); oam.autoMovePmDbroot(moduleName);
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
processManager.setSystemState(oam::BUSY_INIT); //distribute config file
processManager.distributeConfigFile("system");
//string cmd = "/etc/init.d/glusterd restart > /dev/null 2>&1"; }
//system(cmd.c_str()); catch (exception& ex)
{
//send notification string error = ex.what();
oam.sendDeviceNotification(moduleName, MODULE_DOWN); log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
}
//Issue an alarm catch(...)
aManager.sendAlarmReport(moduleName.c_str(), MODULE_DOWN_AUTO, SET); {
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
//mark all processes running on module auto-offline }
processManager.setProcessStates(moduleName, oam::AUTO_OFFLINE);
//set module to disable state
processManager.disableModule(moduleName, false);
//call dbrm control
oam.dbrmctl("reload");
log.writeLog(__LINE__, "'dbrmctl reload' done", LOG_TYPE_DEBUG);
// if pm, move dbroots to other pms
if ( ( moduleName.find("pm") == 0 && !amazon && ( DBRootStorageType != "internal") ) ||
( moduleName.find("pm") == 0 && amazon && downActiveOAMModule ) ||
( moduleName.find("pm") == 0 && amazon && AmazonPMFailover == "y") ) {
try {
log.writeLog(__LINE__, "Call autoMovePmDbroot", LOG_TYPE_DEBUG);
oam.autoMovePmDbroot(moduleName);
log.writeLog(__LINE__, "autoMovePmDbroot success", LOG_TYPE_DEBUG);
//distribute config file
processManager.distributeConfigFile("system");
}
catch (exception& ex)
{
string error = ex.what();
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: " + error, LOG_TYPE_DEBUG);
}
catch(...)
{
log.writeLog(__LINE__, "EXCEPTION ERROR on autoMovePmDbroot: Caught unknown exception!", LOG_TYPE_ERROR);
} }
} }
// if Cloud Instance // if Cloud Instance
// state = running, then instance is rebooting, monitor for recovery // state = terminate, remove/addmodule to launch new instance
// state = stopped, then try starting, if fail, remove/addmodule to launch new instance
// state = terminate or nothing, remove/addmodule to launch new instance
if ( amazon ) { if ( amazon ) {
if ( moduleName.find("um") == 0 ) if ( moduleName.find("um") == 0 )
{ {
@@ -2104,7 +2106,9 @@ void pingDeviceThread()
} }
} }
if ( moduleName.find("pm") == 0 ) if ( ( moduleName.find("pm") == 0 ) &&
( opState != oam::AUTO_DISABLED ) )
{ {
// resume the dbrm // resume the dbrm
oam.dbrmctl("resume"); oam.dbrmctl("resume");
@@ -2165,6 +2169,10 @@ void pingDeviceThread()
} }
} }
} }
// if disabled and amazon, break out
if ( (opState == oam::AUTO_DISABLED ) && amazon )
break;
//start SIMPLEX runtype processes on a SIMPLEX runtype module //start SIMPLEX runtype processes on a SIMPLEX runtype module
string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE); string moduletype = moduleName.substr(0,MAX_MODULE_TYPE_SIZE);

View File

@@ -9715,8 +9715,11 @@ std::string ProcessManager::getStandbyModule()
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" && if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
systemprocessstatus.processstatus[i].ProcessOpState == oam::COLD_STANDBY ) systemprocessstatus.processstatus[i].ProcessOpState == oam::COLD_STANDBY )
{
// Found a ProcessManager in a COLD_STANDBY state // Found a ProcessManager in a COLD_STANDBY state
newStandbyModule = systemprocessstatus.processstatus[i].Module; newStandbyModule = systemprocessstatus.processstatus[i].Module;
break;
}
if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" && if ( systemprocessstatus.processstatus[i].ProcessName == "ProcessManager" &&
systemprocessstatus.processstatus[i].ProcessOpState == oam::MAN_OFFLINE && systemprocessstatus.processstatus[i].ProcessOpState == oam::MAN_OFFLINE &&
@@ -10346,6 +10349,18 @@ int ProcessManager::setMySQLReplication(oam::DeviceNetworkList devicenetworklist
if ( remoteModuleName == masterModule ) if ( remoteModuleName == masterModule )
continue; continue;
// skip disabled modules
int opState = oam::ACTIVE;
bool degraded;
try {
oam.getModuleStatus(remoteModuleName, opState, degraded);
}
catch(...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
// don't do PMs unless PMwithUM flag is set // don't do PMs unless PMwithUM flag is set
if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM ) { if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM ) {
string moduleType = remoteModuleName.substr(0,MAX_MODULE_TYPE_SIZE); string moduleType = remoteModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
@@ -10423,6 +10438,18 @@ int ProcessManager::setMySQLReplication(oam::DeviceNetworkList devicenetworklist
if ( remoteModuleName == masterModule ) if ( remoteModuleName == masterModule )
continue; continue;
// skip disabled modules
int opState = oam::ACTIVE;
bool degraded;
try {
oam.getModuleStatus(remoteModuleName, opState, degraded);
}
catch(...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
// don't do PMs unless PMwithUM flag is set // don't do PMs unless PMwithUM flag is set
if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM ) { if ( config.ServerInstallType() != oam::INSTALL_COMBINE_DM_UM_PM ) {
string moduleType = remoteModuleName.substr(0,MAX_MODULE_TYPE_SIZE); string moduleType = remoteModuleName.substr(0,MAX_MODULE_TYPE_SIZE);
@@ -10471,7 +10498,19 @@ int ProcessManager::setMySQLReplication(oam::DeviceNetworkList devicenetworklist
if ( remoteModuleName == masterModule ) if ( remoteModuleName == masterModule )
continue; continue;
ByteStream msg1; // skip disabled modules
int opState = oam::ACTIVE;
bool degraded;
try {
oam.getModuleStatus(remoteModuleName, opState, degraded);
}
catch(...)
{}
if (opState == oam::MAN_DISABLED || opState == oam::AUTO_DISABLED)
continue;
ByteStream msg1;
ByteStream::byte requestID = oam::SLAVEREP; ByteStream::byte requestID = oam::SLAVEREP;
if ( !enable ) { if ( !enable ) {
requestID = oam::DISABLEREP; requestID = oam::DISABLEREP;

View File

@@ -4950,6 +4950,16 @@ int ProcessMonitor::runMasterRep(std::string& masterLogFile, std::string& master
{ {
string moduleName = (*pt).DeviceName; string moduleName = (*pt).DeviceName;
//skip if local module or module is not ACTIVE
if ( moduleName == config.moduleName() )
continue;
int opState = oam::ACTIVE;
bool degraded;
oam.getModuleStatus(moduleName, opState, degraded);
if (opState != oam::ACTIVE)
continue;
bool passwordError = false; bool passwordError = false;
string moduleType = systemModuleTypeConfig.moduletypeconfig[i].ModuleType; string moduleType = systemModuleTypeConfig.moduletypeconfig[i].ModuleType;