1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-01 03:47:19 +03:00

ndb - bug#26457

master failure during master take over


ndb/src/kernel/blocks/ERROR_codes.txt:
  new error code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Make sure to clear the NF_LCP_TAKE_OVER node-fail step of the previously failed node if the master fails during master take-over
ndb/test/include/NdbRestarter.hpp:
  Add support for querying next master and node group
    (for multi node failure testing)
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
ndb/test/src/NdbRestarter.cpp:
  Add support for querying next master and node group
    (for multi node failure testing)
This commit is contained in:
unknown
2007-02-17 23:52:17 +01:00
parent 0e39133ad7
commit 778b4aad59
6 changed files with 139 additions and 2 deletions

View File

@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4014 Next DBTUP 4014
Next DBLQH 5043 Next DBLQH 5043
Next DBDICT 6007 Next DBDICT 6007
Next DBDIH 7178 Next DBDIH 7181
Next DBTC 8039 Next DBTC 8039
Next CMVMI 9000 Next CMVMI 9000
Next BACKUP 10022 Next BACKUP 10022
@ -71,6 +71,8 @@ Delay GCP_SAVEREQ by 10 secs
7177: Delay copying of sysfileData in execCOPY_GCIREQ 7177: Delay copying of sysfileData in execCOPY_GCIREQ
7180: Crash master during master-take-over in execMASTER_LCPCONF
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
----------------------------------------------------------------- -----------------------------------------------------------------

View File

@ -4612,6 +4612,8 @@ void
Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){ Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
jam(); jam();
Uint32 oldNode = c_lcpMasterTakeOverState.failedNodeId;
c_lcpMasterTakeOverState.minTableId = ~0; c_lcpMasterTakeOverState.minTableId = ~0;
c_lcpMasterTakeOverState.minFragId = ~0; c_lcpMasterTakeOverState.minFragId = ~0;
c_lcpMasterTakeOverState.failedNodeId = nodeId; c_lcpMasterTakeOverState.failedNodeId = nodeId;
@ -4630,7 +4632,20 @@ Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
/** /**
* Node failure during master take over... * Node failure during master take over...
*/ */
ndbout_c("Nodefail during master take over"); ndbout_c("Nodefail during master take over (old: %d)", oldNode);
}
NodeRecordPtr nodePtr;
nodePtr.i = oldNode;
if (oldNode > 0 && oldNode < MAX_NDB_NODES)
{
jam();
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
if (nodePtr.p->m_nodefailSteps.get(NF_LCP_TAKE_OVER))
{
jam();
checkLocalNodefailComplete(signal, oldNode, NF_LCP_TAKE_OVER);
}
} }
setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER); setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER);
@ -5646,6 +5661,14 @@ void Dbdih::execMASTER_LCPREQ(Signal* signal)
jamEntry(); jamEntry();
const BlockReference newMasterBlockref = req->masterRef; const BlockReference newMasterBlockref = req->masterRef;
if (newMasterBlockref != cmasterdihref)
{
jam();
ndbout_c("resending GSN_MASTER_LCPREQ");
sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
signal->getLength(), 50);
return;
}
Uint32 failedNodeId = req->failedNodeId; Uint32 failedNodeId = req->failedNodeId;
/** /**
@ -5946,6 +5969,8 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal)
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord); ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
nodePtr.p->lcpStateAtTakeOver = lcpState; nodePtr.p->lcpStateAtTakeOver = lcpState;
CRASH_INSERTION(7180);
#ifdef VM_TRACE #ifdef VM_TRACE
ndbout_c("MASTER_LCPCONF"); ndbout_c("MASTER_LCPCONF");
printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0); printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0);

View File

@ -62,6 +62,8 @@ public:
int dumpStateAllNodes(int * _args, int _num_args); int dumpStateAllNodes(int * _args, int _num_args);
int getMasterNodeId(); int getMasterNodeId();
int getNextMasterNodeId(int nodeId);
int getNodeGroup(int nodeId);
int getRandomNodeSameNodeGroup(int nodeId, int randomNumber); int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber); int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
int getRandomNotMasterNodeId(int randomNumber); int getRandomNotMasterNodeId(int randomNumber);

View File

@ -1045,6 +1045,45 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK; return NDBT_OK;
} }
/**
 * Testcase for bug#26457: master failure during master take over.
 *
 * For each loop the current master and the node reported by
 * getNextMasterNodeId() for it are looked up.  Error insert 7180 is armed
 * in that "next" node and the master is then restarted, after which the
 * test waits for the cluster to be started again.
 *
 * If master and next belong to the same node group, the next node is
 * restarted instead and the selection is retried without consuming a loop
 * (presumably so that the two nodes taken down by the test never share a
 * node group - confirm against the cluster's failure rules).
 *
 * @param ctx   test context, supplies the number of loops
 * @param step  test step (unused)
 * @return NDBT_OK on success or when fewer than 4 db nodes are available,
 *         NDBT_FAILED as soon as any restarter operation reports an error
 */
int
runBug26457(NDBT_Context* ctx, NDBT_Step* step)
{
  NdbRestarter res;
  if (res.getNumDbNodes() < 4)
    return NDBT_OK;

  int loops = ctx->getNumLoops();
  while (loops != 0)
  {
    int master = res.getMasterNodeId();
    int next = res.getNextMasterNodeId(master);

    ndbout_c("master: %d next: %d", master, next);

    // getNextMasterNodeId() returns -1 when the lookup fails; without this
    // guard both node group lookups below would yield -1, compare equal and
    // send the test into an endless retry
    if (next < 0)
      return NDBT_FAILED;

    if (res.getNodeGroup(master) == res.getNodeGroup(next))
    {
      // Same node group: restart "next" and pick a new pair.  The loop
      // counter is deliberately left untouched, this round did not test
      // anything yet
      if (res.restartOneDbNode(next, false, false, true))
        return NDBT_FAILED;

      if (res.waitClusterStarted())
        return NDBT_FAILED;
      continue;
    }

    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 2 };

    if (res.dumpStateOneNode(next, val2, 2))
      return NDBT_FAILED;

    // 7180: crash master during master-take-over in execMASTER_LCPCONF
    if (res.insertErrorInNode(next, 7180))
      return NDBT_FAILED;

    if (res.restartOneDbNode(master, false, false, true))
      return NDBT_FAILED;

    if (res.waitClusterStarted())
      return NDBT_FAILED;

    loops--;
  }

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart); NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", TESTCASE("NoLoad",
@ -1367,6 +1406,9 @@ TESTCASE("Bug25364", ""){
TESTCASE("Bug25554", ""){ TESTCASE("Bug25554", ""){
INITIALIZER(runBug25554); INITIALIZER(runBug25554);
} }
// Regression test for bug#26457 (master failure during master take over);
// runBug26457 is run as the initializer of this testcase.
TESTCASE("Bug26457", ""){
INITIALIZER(runBug26457);
}
NDBT_TESTSUITE_END(testNodeRestart); NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){

View File

@ -477,6 +477,10 @@ max-time: 1000
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug25554 T1 args: -n Bug25554 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug26457 T1
# OLD FLEX # OLD FLEX
max-time: 500 max-time: 500
cmd: flexBench cmd: flexBench

View File

@ -128,6 +128,68 @@ NdbRestarter::getMasterNodeId(){
return node; return node;
} }
/**
 * Look up the node group of a db node.
 *
 * @param nodeId  node id to search for in ndbNodes
 * @return the node_group of the matching entry, or -1 if the restarter is
 *         not connected, getStatus() fails, or no entry has that node id
 */
int
NdbRestarter::getNodeGroup(int nodeId){
  // Both preconditions must hold; getStatus() is only called when connected
  if (!isConnected() || getStatus() != 0)
    return -1;

  // Advance to the first entry carrying the requested node id
  size_t pos = 0;
  while (pos < ndbNodes.size() && ndbNodes[pos].node_id != nodeId)
    pos++;

  if (pos < ndbNodes.size())
    return ndbNodes[pos].node_group;

  return -1;
}
/**
 * Find the node that follows nodeId in dynamic id order.
 *
 * The result is the node id of the entry in ndbNodes with the smallest
 * dynamic_id that is larger than the dynamic_id of nodeId.  If no entry has
 * a larger dynamic_id, the search value stays at nodeId's own dynamic_id,
 * so the first entry with that dynamic_id is returned.
 *
 * @param nodeId  node whose successor is wanted; must be present in
 *                ndbNodes (asserted)
 * @return node id of the successor as described above; -1 if the restarter
 *         is not connected, getStatus() fails or nodeId is unknown; the
 *         result of getMasterNodeId() if the selected dynamic id is ~0 or
 *         no entry carries it
 */
int
NdbRestarter::getNextMasterNodeId(int nodeId){
  // getStatus() is only called when connected
  if (!isConnected() || getStatus() != 0)
    return -1;

  // Locate the entry describing nodeId
  size_t pos = 0;
  while (pos < ndbNodes.size() && ndbNodes[pos].node_id != nodeId)
    pos++;

  assert(pos < ndbNodes.size());
  if (pos == ndbNodes.size())
    return -1;

  // Single pass: keep the smallest dynamic id that is strictly larger than
  // our own; stays at our own dynamic id when there is none
  int ownDynId = ndbNodes[pos].dynamic_id;
  int nextDynId = ownDynId;
  bool haveLarger = false;
  for (size_t n = 0; n < ndbNodes.size(); n++)
  {
    if (ndbNodes[n].dynamic_id > ownDynId &&
        (!haveLarger || ndbNodes[n].dynamic_id < nextDynId))
    {
      nextDynId = ndbNodes[n].dynamic_id;
      haveLarger = true;
    }
  }

  if (nextDynId != ~0)
  {
    // Map the selected dynamic id back to a node id
    for (size_t n = 0; n < ndbNodes.size(); n++)
    {
      if (ndbNodes[n].dynamic_id == nextDynId)
        return ndbNodes[n].node_id;
    }
  }

  return getMasterNodeId();
}
int int
NdbRestarter::getRandomNotMasterNodeId(int rand){ NdbRestarter::getRandomNotMasterNodeId(int rand){
int master = getMasterNodeId(); int master = getMasterNodeId();