From d9fa993e07f6f0827aa2d217b675a9baa6b2b8e1 Mon Sep 17 00:00:00 2001 From: "jonas@perch.ndb.mysql.com" <> Date: Wed, 3 Jan 2007 06:17:34 +0100 Subject: [PATCH] ndb - bug#25364 on master node failure during qmgr-commitreq make sure to remove all committed failed nodes from failed/prepfailed arrays --- .../kernel/signaldata/DumpStateOrd.hpp | 1 + ndb/src/kernel/blocks/ERROR_codes.txt | 3 + ndb/src/kernel/blocks/qmgr/Qmgr.hpp | 4 ++ ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 72 +++++++++++++++---- ndb/test/ndbapi/testNodeRestart.cpp | 43 +++++++++++ ndb/test/run-test/daily-basic-tests.txt | 4 ++ 6 files changed, 114 insertions(+), 13 deletions(-) diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp index a2993ad5d03..04f94aaba58 100644 --- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -68,6 +68,7 @@ public: // 100-105 TUP and ACC // 200-240 UTIL // 300-305 TRIX + QmgrErr935 = 935, NdbfsDumpFileStat = 400, NdbfsDumpAllFiles = 401, NdbfsDumpOpenFiles = 402, diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt index 16f5da8a553..0bcc99a6334 100644 --- a/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/ndb/src/kernel/blocks/ERROR_codes.txt @@ -21,6 +21,9 @@ Crash president when he starts to run in ArbitState 1-9. 910: Crash new president after node crash +935 : Crash master on node failure (delayed) + and skip sending GSN_COMMIT_FAILREQ to specified node + ERROR CODES FOR TESTING NODE FAILURE, GLOBAL CHECKPOINT HANDLING: ----------------------------------------------------------------- diff --git a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp index e728ea81a7d..0c4bdc5d3c1 100644 --- a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp +++ b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp @@ -426,6 +426,10 @@ private: StopReq c_stopReq; bool check_multi_node_shutdown(Signal* signal); + +#ifdef ERROR_INSERT + Uint32 c_error_insert_extra; +#endif }; #endif diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index cc981f37987..66ee7549b9d 100644 --- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -3110,6 +3110,18 @@ Qmgr::sendCommitFailReq(Signal* signal) for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { jam(); ptrAss(nodePtr, nodeRec); + +#ifdef ERROR_INSERT + if (ERROR_INSERTED(935) && nodePtr.i == c_error_insert_extra) + { + ndbout_c("skipping node %d", c_error_insert_extra); + CLEAR_ERROR_INSERT_VALUE; + signal->theData[0] = 9999; + sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1); + continue; + } +#endif + if (nodePtr.p->phase == ZRUNNING) { jam(); nodePtr.p->sendCommitFailReqStatus = Q_ACTIVE; @@ -3180,6 +3192,33 @@ void Qmgr::execPREP_FAILREF(Signal* signal) return; }//Qmgr::execPREP_FAILREF() +static +Uint32 +clear_nodes(Uint32 dstcnt, Uint16 dst[], Uint32 srccnt, const Uint16 src[]) +{ + if (srccnt == 0) + return dstcnt; + + Uint32 pos = 0; + for (Uint32 i = 0; i 0) { - jam(); - guard0 = cnoFailedNodes - 1; - arrGuard(guard0 + cnoCommitFailedNodes, MAX_NDB_NODES); - for (Tj = 0; Tj <= guard0; Tj++) { - jam(); - cfailedNodes[Tj] = cfailedNodes[Tj + cnoCommitFailedNodes]; - }//for - }//if - }//if + + /** + * Remove committed nodes from failed/prepared + */ + cnoFailedNodes = clear_nodes(cnoFailedNodes, + cfailedNodes, + cnoCommitFailedNodes, + ccommitFailedNodes); + cnoPrepFailedNodes = clear_nodes(cnoPrepFailedNodes, + cprepFailedNodes, + cnoCommitFailedNodes, + ccommitFailedNodes); cnoCommitFailedNodes = 0; }//if /**----------------------------------------------------------------------- @@ -4658,6 +4696,14 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal) default: ; }//switch + +#ifdef ERROR_INSERT + if (signal->theData[0] == 935 && signal->getLength() == 2) + { + SET_ERROR_INSERT_VALUE(935); + c_error_insert_extra = signal->theData[1]; + } +#endif }//Qmgr::execDUMP_STATE_ORD() void Qmgr::execSET_VAR_REQ(Signal* signal) diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index 082013f07cc..c0c5cc5163a 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -955,6 +955,46 @@ int runBug24717(NDBT_Context* ctx, NDBT_Step* step){ return NDBT_OK; } +int runBug25364(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + NdbRestarter restarter; + Ndb* pNdb = GETNDB(step); + int loops = ctx->getNumLoops(); + + if (restarter.getNumDbNodes() < 4) + return NDBT_OK; + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + + for (; loops; loops --) + { + int master = restarter.getMasterNodeId(); + int victim = restarter.getRandomNodeOtherNodeGroup(master, rand()); + int second = restarter.getRandomNodeSameNodeGroup(victim, rand()); + + int dump[] = { 935, victim } ; + if (restarter.dumpStateOneNode(master, dump, 2)) + return NDBT_FAILED; + + if (restarter.dumpStateOneNode(master, val2, 2)) + return NDBT_FAILED; + + if (restarter.restartOneDbNode(second, false, true, true)) + return NDBT_FAILED; + + int nodes[2] = { master, second }; + if (restarter.waitNodesNoStart(nodes, 2)) + return NDBT_FAILED; + + restarter.startNodes(nodes, 2); + + if (restarter.waitNodesStarted(nodes, 2)) + return NDBT_FAILED; + } + + return NDBT_OK; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", @@ -1271,6 +1311,9 @@ TESTCASE("Bug20185", TESTCASE("Bug24717", ""){ INITIALIZER(runBug24717); } +TESTCASE("Bug25364", ""){ + INITIALIZER(runBug25364); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index a1443970388..41070275935 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -469,6 +469,10 @@ max-time: 1000 cmd: testNodeRestart args: -n Bug24717 T1 +max-time: 1000 +cmd: testNodeRestart +args: -n Bug25364 T1 + # OLD FLEX max-time: 500 cmd: flexBench