diff --git a/storage/ndb/include/kernel/GlobalSignalNumbers.h b/storage/ndb/include/kernel/GlobalSignalNumbers.h index 49f937ba221..a2a5adeed9e 100644 --- a/storage/ndb/include/kernel/GlobalSignalNumbers.h +++ b/storage/ndb/include/kernel/GlobalSignalNumbers.h @@ -183,7 +183,7 @@ extern const GlobalSignalNumber NO_OF_SIGNAL_NAMES; #define GSN_CNTR_START_REP 119 /* 120 not unused */ #define GSN_ROUTE_ORD 121 -/* 122 unused */ +#define GSN_NODE_VERSION_REP 122 /* 123 unused */ /* 124 unused */ #define GSN_CHECK_LCP_STOP 125 diff --git a/storage/ndb/include/kernel/NodeInfo.hpp b/storage/ndb/include/kernel/NodeInfo.hpp index fffd94b5258..75b2654d699 100644 --- a/storage/ndb/include/kernel/NodeInfo.hpp +++ b/storage/ndb/include/kernel/NodeInfo.hpp @@ -90,4 +90,14 @@ operator<<(NdbOut& ndbout, const NodeInfo & info){ return ndbout; } +struct NodeVersionInfo +{ + STATIC_CONST( DataLength = 6 ); + struct + { + Uint32 m_min_version; + Uint32 m_max_version; + } m_type [3]; // Indexed as NodeInfo::Type +}; + #endif diff --git a/storage/ndb/include/kernel/signaldata/ApiRegSignalData.hpp b/storage/ndb/include/kernel/signaldata/ApiRegSignalData.hpp index 84dca8fb260..4a8adddc4d5 100644 --- a/storage/ndb/include/kernel/signaldata/ApiRegSignalData.hpp +++ b/storage/ndb/include/kernel/signaldata/ApiRegSignalData.hpp @@ -80,12 +80,13 @@ class ApiRegConf { friend class ClusterMgr; public: - STATIC_CONST( SignalLength = 3 + NodeState::DataLength ); + STATIC_CONST( SignalLength = 4 + NodeState::DataLength ); private: Uint32 qmgrRef; Uint32 version; // Version of NDB node Uint32 apiHeartbeatFrequency; + Uint32 minDbVersion; NodeState nodeState; }; diff --git a/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp index 1f89d9f09ea..0ce1b640bb0 100644 --- a/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -68,6 +68,7 @@ public: // 100-105 TUP and ACC // 200-240 
UTIL // 300-305 TRIX + QmgrErr935 = 935, NdbfsDumpFileStat = 400, NdbfsDumpAllFiles = 401, NdbfsDumpOpenFiles = 402, diff --git a/storage/ndb/include/ndb_version.h.in b/storage/ndb/include/ndb_version.h.in index 9eb609e3830..0bbb12ed223 100644 --- a/storage/ndb/include/ndb_version.h.in +++ b/storage/ndb/include/ndb_version.h.in @@ -72,5 +72,7 @@ char ndb_version_string_buf[NDB_VERSION_STRING_BUF_SZ]; #define NDBD_QMGR_SINGLEUSER_VERSION_5 MAKE_VERSION(5,0,25) +#define NDBD_NODE_VERSION_REP MAKE_VERSION(6,1,1) + #endif diff --git a/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp b/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp index 25a491422ef..74a090994b1 100644 --- a/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp +++ b/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp @@ -637,5 +637,6 @@ const GsnName SignalNames [] = { ,{ GSN_DICT_COMMIT_REQ, "DICT_COMMIT_REQ"} ,{ GSN_ROUTE_ORD, "ROUTE_ORD" } + ,{ GSN_NODE_VERSION_REP, "NODE_VERSION_REP" } }; const unsigned short NO_OF_SIGNAL_NAMES = sizeof(SignalNames)/sizeof(GsnName); diff --git a/storage/ndb/src/kernel/blocks/ERROR_codes.txt b/storage/ndb/src/kernel/blocks/ERROR_codes.txt index 83aa1183772..2fc28c8ac07 100644 --- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt @@ -23,6 +23,8 @@ Crash president when he starts to run in ArbitState 1-9. 
934 : Crash president in ALLOC_NODE_ID_REQ +935 : Crash master on node failure (delayed) + and skip sending GSN_COMMIT_FAILREQ to specified node ERROR CODES FOR TESTING NODE FAILURE, GLOBAL CHECKPOINT HANDLING: ----------------------------------------------------------------- diff --git a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index beb7e0ceb7b..6d2be7dde39 100644 --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -10050,9 +10050,20 @@ void Dbdict::execSUB_START_REQ(Signal* signal) } OpSubEventPtr subbPtr; Uint32 errCode = 0; + + DictLockPtr loopPtr; + if (c_dictLockQueue.first(loopPtr) && + loopPtr.p->lt->lockType == DictLockReq::NodeRestartLock) + { + jam(); + errCode = 1405; + goto busy; + } + if (!c_opSubEvent.seize(subbPtr)) { errCode = SubStartRef::Busy; busy: + jam(); SubStartRef * ref = (SubStartRef *)signal->getDataPtrSend(); { // fix @@ -10151,6 +10162,7 @@ void Dbdict::execSUB_START_REF(Signal* signal) SubStartRef* ref = (SubStartRef*) signal->getDataPtrSend(); ref->senderRef = reference(); ref->senderData = subbPtr.p->m_senderData; + ref->errorCode = err; sendSignal(subbPtr.p->m_senderRef, GSN_SUB_START_REF, signal, SubStartRef::SignalLength2, JBB); c_opSubEvent.release(subbPtr); @@ -10213,6 +10225,7 @@ void Dbdict::execSUB_START_CONF(Signal* signal) #ifdef EVENT_PH3_DEBUG ndbout_c("DBDICT(Coordinator) got GSN_SUB_START_CONF = (%d)", subbPtr.i); #endif + subbPtr.p->m_sub_start_conf = *conf; subbPtr.p->m_reqTracker.reportConf(c_counterMgr, refToNode(senderRef)); completeSubStartReq(signal,subbPtr.i,0); } @@ -10252,6 +10265,9 @@ void Dbdict::completeSubStartReq(Signal* signal, #ifdef EVENT_DEBUG ndbout_c("SUB_START_CONF"); #endif + + SubStartConf* conf = (SubStartConf*)signal->getDataPtrSend(); + * conf = subbPtr.p->m_sub_start_conf; sendSignal(subbPtr.p->m_senderRef, GSN_SUB_START_CONF, signal, SubStartConf::SignalLength, JBB); 
c_opSubEvent.release(subbPtr); @@ -10373,6 +10389,7 @@ void Dbdict::execSUB_STOP_REF(Signal* signal) SubStopRef* ref = (SubStopRef*) signal->getDataPtrSend(); ref->senderRef = reference(); ref->senderData = subbPtr.p->m_senderData; + ref->errorCode = err; sendSignal(subbPtr.p->m_senderRef, GSN_SUB_STOP_REF, signal, SubStopRef::SignalLength, JBB); c_opSubEvent.release(subbPtr); @@ -10425,6 +10442,7 @@ void Dbdict::execSUB_STOP_CONF(Signal* signal) * Coordinator */ ndbrequire(refToBlock(senderRef) == DBDICT); + subbPtr.p->m_sub_stop_conf = *conf; subbPtr.p->m_reqTracker.reportConf(c_counterMgr, refToNode(senderRef)); completeSubStopReq(signal,subbPtr.i,0); } @@ -10465,6 +10483,8 @@ void Dbdict::completeSubStopReq(Signal* signal, #ifdef EVENT_DEBUG ndbout_c("SUB_STOP_CONF"); #endif + SubStopConf* conf = (SubStopConf*)signal->getDataPtrSend(); + * conf = subbPtr.p->m_sub_stop_conf; sendSignal(subbPtr.p->m_senderRef, GSN_SUB_STOP_CONF, signal, SubStopConf::SignalLength, JBB); c_opSubEvent.release(subbPtr); @@ -10713,6 +10733,7 @@ Dbdict::execSUB_REMOVE_REF(Signal* signal) SubRemoveRef* ref = (SubRemoveRef*) signal->getDataPtrSend(); ref->senderRef = reference(); ref->senderData = subbPtr.p->m_senderData; + ref->errorCode = err; sendSignal(subbPtr.p->m_senderRef, GSN_SUB_REMOVE_REF, signal, SubRemoveRef::SignalLength, JBB); } diff --git a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp index 718d53d8b96..414b3dabb52 100644 --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp @@ -52,6 +52,7 @@ #include #include #include +#include #include "SchemaFile.hpp" #include #include @@ -1632,6 +1633,10 @@ private: Uint32 m_senderRef; Uint32 m_senderData; Uint32 m_errorCode; + union { + SubStartConf m_sub_start_conf; + SubStopConf m_sub_stop_conf; + }; RequestTracker m_reqTracker; }; typedef Ptr OpSubEventPtr; diff --git a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 
b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index 37eb54028a6..3436a609fe7 100644 --- a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -637,6 +637,7 @@ private: void execTCGETOPSIZECONF(Signal *); void execTC_CLOPSIZECONF(Signal *); + int handle_invalid_lcp_no(const class LcpFragRep*, ReplicaRecordPtr); void execLCP_FRAG_REP(Signal *); void execLCP_COMPLETE_REP(Signal *); void execSTART_LCP_REQ(Signal *); diff --git a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 50c7c5472ba..2e68addb1d7 100644 --- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -3741,7 +3741,6 @@ void Dbdih::endTakeOver(Uint32 takeOverPtrI) takeOverPtr.i = takeOverPtrI; ptrCheckGuard(takeOverPtr, MAX_NDB_NODES, takeOverRecord); - releaseTakeOver(takeOverPtrI); if ((takeOverPtr.p->toMasterStatus != TakeOverRecord::IDLE) && (takeOverPtr.p->toMasterStatus != TakeOverRecord::TO_WAIT_START_TAKE_OVER)) { jam(); @@ -3755,6 +3754,7 @@ void Dbdih::endTakeOver(Uint32 takeOverPtrI) }//if setAllowNodeStart(takeOverPtr.p->toStartingNode, true); initTakeOver(takeOverPtr); + releaseTakeOver(takeOverPtrI); }//Dbdih::endTakeOver() void Dbdih::releaseTakeOver(Uint32 takeOverPtrI) @@ -4046,6 +4046,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal) Uint32 newMasterId = nodeFail->masterNodeId; const Uint32 noOfFailedNodes = nodeFail->noOfNodes; + if (ERROR_INSERTED(7179)) + { + CLEAR_ERROR_INSERT_VALUE; + } + /*-------------------------------------------------------------------------*/ // The first step is to convert from a bit mask to an array of failed nodes. 
/*-------------------------------------------------------------------------*/ @@ -4909,6 +4914,7 @@ void Dbdih::handleTakeOverNewMaster(Signal* signal, Uint32 takeOverPtrI) break; } ndbrequire(ok); + endTakeOver(takeOverPtr.i); }//if }//Dbdih::handleTakeOverNewMaster() @@ -10256,12 +10262,42 @@ void Dbdih::execLCP_FRAG_REP(Signal* signal) Uint32 fragId = lcpReport->fragId; jamEntry(); + + if (ERROR_INSERTED(7178) && nodeId != getOwnNodeId()) + { + jam(); + Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups); + Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups); + if (owng == nodeg) + { + jam(); + ndbout_c("throwing away LCP_FRAG_REP from (and killing) %d", nodeId); + SET_ERROR_INSERT_VALUE(7179); + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, nodeId), + GSN_NDB_TAMPER, signal, 1, JBA); + return; + } + } + if (ERROR_INSERTED(7179) && nodeId != getOwnNodeId()) + { + jam(); + Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups); + Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups); + if (owng == nodeg) + { + jam(); + ndbout_c("throwing away LCP_FRAG_REP from %d", nodeId); + return; + } + } + CRASH_INSERTION2(7025, isMaster()); CRASH_INSERTION2(7016, !isMaster()); - + bool fromTimeQueue = (signal->senderBlockRef() == reference()); - + TabRecordPtr tabPtr; tabPtr.i = tableId; ptrCheckGuard(tabPtr, ctabFileSize, tabRecord); @@ -10463,6 +10499,37 @@ void Dbdih::findReplica(ReplicaRecordPtr& replicaPtr, ndbrequire(false); }//Dbdih::findReplica() + +int +Dbdih::handle_invalid_lcp_no(const LcpFragRep* rep, + ReplicaRecordPtr replicaPtr) +{ + ndbrequire(!isMaster()); + Uint32 lcpNo = rep->lcpNo; + Uint32 lcpId = rep->lcpId; + /* lcpNo indexes lcpId[]/lcpStatus[] below and is the scan loop's stop value */ + ndbrequire(lcpNo < MAX_LCP_STORED); + + warningEvent("Detected previous node failure of %d during lcp", + rep->nodeId); + replicaPtr.p->nextLcp = lcpNo; + replicaPtr.p->lcpId[lcpNo] = 0; +
replicaPtr.p->lcpStatus[lcpNo] = ZINVALID; + + for (Uint32 i = nextLcpNo(lcpNo); i != lcpNo; i = nextLcpNo(i)) /* visit every other slot; starting at lcpNo made this loop a no-op (assumes nextLcpNo() cycles through the stored slots) */ + { + jam(); + if (replicaPtr.p->lcpStatus[i] == ZVALID && + replicaPtr.p->lcpId[i] >= lcpId) + { + ndbout_c("i: %d lcpId: %d", i, replicaPtr.p->lcpId[i]); + ndbrequire(false); + } + } + + return 0; +} + /** * Return true if table is all fragment replicas have been checkpointed * to disk (in all LQHs) @@ -10491,9 +10558,12 @@ Dbdih::reportLcpCompletion(const LcpFragRep* lcpReport) ndbrequire(replicaPtr.p->lcpOngoingFlag == true); if(lcpNo != replicaPtr.p->nextLcp){ - ndbout_c("lcpNo = %d replicaPtr.p->nextLcp = %d", - lcpNo, replicaPtr.p->nextLcp); - ndbrequire(false); + if (handle_invalid_lcp_no(lcpReport, replicaPtr)) + { + ndbout_c("lcpNo = %d replicaPtr.p->nextLcp = %d", + lcpNo, replicaPtr.p->nextLcp); + ndbrequire(false); + } } ndbrequire(lcpNo == replicaPtr.p->nextLcp); ndbrequire(lcpNo < MAX_LCP_STORED); diff --git a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp index de080237668..0fdce8b5166 100644 --- a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp +++ b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp @@ -446,6 +446,15 @@ private: StopReq c_stopReq; bool check_multi_node_shutdown(Signal* signal); + +#ifdef ERROR_INSERT + Uint32 c_error_insert_extra; +#endif + + void recompute_version_info(Uint32 type); + void recompute_version_info(Uint32 type, Uint32 version); + void execNODE_VERSION_REP(Signal* signal); + void sendApiVersionRep(Signal* signal, NodeRecPtr nodePtr); }; #endif diff --git a/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp b/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp index 8ec5e681045..aac9db03625 100644 --- a/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp +++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp @@ -37,6 +37,13 @@ void Qmgr::initData() setHbApiDelay(hbDBAPI); c_connectedNodes.set(getOwnNodeId()); c_stopReq.senderRef = 0; + + /** + * Check sanity for NodeVersion + */ +
ndbrequire((Uint32)NodeInfo::DB == 0); + ndbrequire((Uint32)NodeInfo::API == 1); + ndbrequire((Uint32)NodeInfo::MGM == 2); }//Qmgr::initData() void Qmgr::initRecords() @@ -107,6 +114,7 @@ Qmgr::Qmgr(Block_context& ctx) addRecSignal(GSN_DIH_RESTARTREF, &Qmgr::execDIH_RESTARTREF); addRecSignal(GSN_DIH_RESTARTCONF, &Qmgr::execDIH_RESTARTCONF); + addRecSignal(GSN_NODE_VERSION_REP, &Qmgr::execNODE_VERSION_REP); initData(); }//Qmgr::Qmgr() diff --git a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index e725e5cb6a6..89b1d18f22c 100644 --- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -260,6 +260,9 @@ void Qmgr::execSTTOR(Signal* signal) case 1: initData(signal); startphase1(signal); + recompute_version_info(NodeInfo::DB); + recompute_version_info(NodeInfo::API); + recompute_version_info(NodeInfo::MGM); return; case 7: cactivateApiCheck = 1; @@ -765,6 +768,7 @@ void Qmgr::execCM_REGREQ(Signal* signal) */ UintR TdynId = ++c_maxDynamicId; setNodeInfo(addNodePtr.i).m_version = startingVersion; + recompute_version_info(NodeInfo::DB, startingVersion); addNodePtr.p->ndynamicId = TdynId; /** @@ -1503,7 +1507,8 @@ void Qmgr::execCM_NODEINFOCONF(Signal* signal) replyNodePtr.p->ndynamicId = dynamicId; replyNodePtr.p->blockRef = signal->getSendersBlockRef(); setNodeInfo(replyNodePtr.i).m_version = version; - + recompute_version_info(NodeInfo::DB, version); + if(!c_start.m_nodes.done()){ jam(); return; @@ -1602,6 +1607,7 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){ } sendCmAckAdd(signal, nodePtr.i, CmAdd::Prepare); + sendApiVersionRep(signal, nodePtr); /* President have prepared us */ CmNodeInfoConf * conf = (CmNodeInfoConf*)signal->getDataPtrSend(); @@ -1613,6 +1619,29 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){ DEBUG_START(GSN_CM_NODEINFOCONF, refToNode(nodePtr.p->blockRef), ""); } +void 
+Qmgr::sendApiVersionRep(Signal* signal, NodeRecPtr nodePtr) +{ + if (getNodeInfo(nodePtr.i).m_version >= NDBD_NODE_VERSION_REP) + { + jam(); + Uint32 ref = calcQmgrBlockRef(nodePtr.i); + for(Uint32 i = 1; itheData[0] = i; + signal->theData[1] = version; + sendSignal(ref, GSN_NODE_VERSION_REP, signal, 2, JBB); + } + } + } +} + void Qmgr::sendCmAckAdd(Signal * signal, Uint32 nodeId, CmAdd::RequestType type){ @@ -2401,7 +2430,9 @@ void Qmgr::sendApiFailReq(Signal* signal, Uint16 failedNodeNo) * SECONDS. *-------------------------------------------------------------------------*/ setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0; - + setNodeInfo(failedNodePtr.i).m_version = 0; + recompute_version_info(getNodeInfo(failedNodePtr.i).m_type); + CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0]; closeCom->xxxBlockRef = reference(); @@ -2707,7 +2738,6 @@ void Qmgr::execAPI_REGREQ(Signal* signal) } setNodeInfo(apiNodePtr.i).m_version = version; - setNodeInfo(apiNodePtr.i).m_heartbeat_cnt= 0; ApiRegConf * const apiRegConf = (ApiRegConf *)&signal->theData[0]; @@ -2728,8 +2758,9 @@ void Qmgr::execAPI_REGREQ(Signal* signal) apiRegConf->nodeState.dynamicId = -dynamicId; } } + NodeVersionInfo info = getNodeVersionInfo(); + apiRegConf->minDbVersion = info.m_type[NodeInfo::DB].m_min_version; apiRegConf->nodeState.m_connected_nodes.assign(c_connectedNodes); - sendSignal(ref, GSN_API_REGCONF, signal, ApiRegConf::SignalLength, JBB); if (apiNodePtr.p->phase == ZAPI_INACTIVE && @@ -2748,6 +2779,33 @@ void Qmgr::execAPI_REGREQ(Signal* signal) signal->theData[0] = apiNodePtr.i; sendSignal(CMVMI_REF, GSN_ENABLE_COMORD, signal, 1, JBA); + recompute_version_info(type, version); + + signal->theData[0] = apiNodePtr.i; /* NODE_VERSION_REP payload {nodeId, version}: filled in before branching so the per-node else path sends it too, not only the receiver-group path */ + signal->theData[1] = version; + if (info.m_type[NodeInfo::DB].m_min_version >= NDBD_NODE_VERSION_REP) + { + jam(); + NodeReceiverGroup rg(QMGR, c_clusterNodes); + rg.m_nodes.clear(getOwnNodeId()); + sendSignal(rg, GSN_NODE_VERSION_REP, signal, 2, JBB); + }
+ else + { + Uint32 i = 0; + while((i = c_clusterNodes.find(i + 1)) != NdbNodeBitmask::NotFound) + { + jam(); + if (i == getOwnNodeId()) + continue; + if (getNodeInfo(i).m_version >= NDBD_NODE_VERSION_REP) + { + jam(); + sendSignal(calcQmgrBlockRef(i), GSN_NODE_VERSION_REP, signal, 2,JBB); + } + } + } + signal->theData[0] = apiNodePtr.i; EXECUTE_DIRECT(NDBCNTR, GSN_API_START_REP, signal, 1); } @@ -2783,6 +2841,76 @@ Qmgr::execAPI_VERSION_REQ(Signal * signal) { ApiVersionConf::SignalLength, JBB); } +void +Qmgr::execNODE_VERSION_REP(Signal* signal) +{ + jamEntry(); + Uint32 nodeId = signal->theData[0]; + Uint32 version = signal->theData[1]; + + if (nodeId < MAX_NODES) + { + jam(); + Uint32 type = getNodeInfo(nodeId).m_type; + setNodeInfo(nodeId).m_version = version; + recompute_version_info(type, version); + } +} + +void +Qmgr::recompute_version_info(Uint32 type, Uint32 version) +{ + NodeVersionInfo& info = setNodeVersionInfo(); + switch(type){ + case NodeInfo::DB: + case NodeInfo::API: + case NodeInfo::MGM: + break; + default: + return; + } + + if (info.m_type[type].m_min_version == 0 || + version < info.m_type[type].m_min_version) + info.m_type[type].m_min_version = version; + if (version > info.m_type[type].m_max_version) + info.m_type[type].m_max_version = version; +} + +void +Qmgr::recompute_version_info(Uint32 type) +{ + switch(type){ + case NodeInfo::DB: + case NodeInfo::API: + case NodeInfo::MGM: + break; + default: + return; + } + + Uint32 min = ~0, max = 0; + Uint32 cnt = type == NodeInfo::DB ? MAX_NDB_NODES : MAX_NODES; + for (Uint32 i = 1; i max) + max = version; + } + } + } + + NodeVersionInfo& info = setNodeVersionInfo(); + info.m_type[type].m_min_version = min == ~(Uint32)0 ? 
0 : min; + info.m_type[type].m_max_version = max; +} #if 0 bool @@ -2922,6 +3050,17 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, systemErrorLab(signal, __LINE__); return; }//if + + if (getNodeState().startLevel < NodeState::SL_STARTED) + { + jam(); + CRASH_INSERTION(932); + char buf[100]; + BaseString::snprintf(buf, 100, "Node failure during restart"); + progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED, buf); + ndbrequire(false); + } + TnoFailedNodes = cnoFailedNodes; failReport(signal, failedNodePtr.i, (UintR)ZTRUE, aFailCause); if (cpresident == getOwnNodeId()) { @@ -3008,6 +3147,16 @@ void Qmgr::execPREP_FAILREQ(Signal* signal) return; }//if + if (getNodeState().startLevel < NodeState::SL_STARTED) + { + jam(); + CRASH_INSERTION(932); + char buf[100]; + BaseString::snprintf(buf, 100, "Node failure during restart"); + progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED, buf); + ndbrequire(false); + } + guard0 = cnoPrepFailedNodes - 1; arrGuard(guard0, MAX_NDB_NODES); for (Tindex = 0; Tindex <= guard0; Tindex++) { @@ -3185,6 +3334,18 @@ Qmgr::sendCommitFailReq(Signal* signal) for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { jam(); ptrAss(nodePtr, nodeRec); + +#ifdef ERROR_INSERT + if (ERROR_INSERTED(935) && nodePtr.i == c_error_insert_extra) + { + ndbout_c("skipping node %d", c_error_insert_extra); + CLEAR_ERROR_INSERT_VALUE; + signal->theData[0] = 9999; + sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1); + continue; + } +#endif + if (nodePtr.p->phase == ZRUNNING) { jam(); nodePtr.p->sendCommitFailReqStatus = Q_ACTIVE; @@ -3255,6 +3416,33 @@ void Qmgr::execPREP_FAILREF(Signal* signal) return; }//Qmgr::execPREP_FAILREF() +static +Uint32 +clear_nodes(Uint32 dstcnt, Uint16 dst[], Uint32 srccnt, const Uint16 src[]) +{ + if (srccnt == 0) + return dstcnt; + + Uint32 pos = 0; + for (Uint32 i = 0; i 0) { - jam(); - guard0 = cnoFailedNodes - 1; - arrGuard(guard0 + cnoCommitFailedNodes, MAX_NDB_NODES); - for (Tj = 0; Tj <= guard0; 
Tj++) { - jam(); - cfailedNodes[Tj] = cfailedNodes[Tj + cnoCommitFailedNodes]; - }//for - }//if - }//if + + /** + * Remove committed nodes from failed/prepared + */ + cnoFailedNodes = clear_nodes(cnoFailedNodes, + cfailedNodes, + cnoCommitFailedNodes, + ccommitFailedNodes); + cnoPrepFailedNodes = clear_nodes(cnoPrepFailedNodes, + cprepFailedNodes, + cnoCommitFailedNodes, + ccommitFailedNodes); cnoCommitFailedNodes = 0; }//if /**----------------------------------------------------------------------- @@ -4733,6 +4920,14 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal) default: ; }//switch + +#ifdef ERROR_INSERT + if (signal->theData[0] == 935 && signal->getLength() == 2) + { + SET_ERROR_INSERT_VALUE(935); + c_error_insert_extra = signal->theData[1]; + } +#endif }//Qmgr::execDUMP_STATE_ORD() void Qmgr::execSET_VAR_REQ(Signal* signal) diff --git a/storage/ndb/src/kernel/blocks/suma/Suma.cpp b/storage/ndb/src/kernel/blocks/suma/Suma.cpp index 92efca36a35..6f45cfb1975 100644 --- a/storage/ndb/src/kernel/blocks/suma/Suma.cpp +++ b/storage/ndb/src/kernel/blocks/suma/Suma.cpp @@ -230,7 +230,6 @@ Suma::execREAD_CONFIG_REQ(Signal* signal) c_startup.m_wait_handover= false; c_failedApiNodes.clear(); - c_startup.m_restart_server_node_id = 0; // Server for my NR ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtrSend(); conf->senderRef = reference(); @@ -261,6 +260,14 @@ Suma::execSTTOR(Signal* signal) { if(startphase == 5) { + if (ERROR_INSERTED(13029)) /* Hold startphase 5 */ + { + sendSignalWithDelay(SUMA_REF, GSN_STTOR, signal, + 30, signal->getLength()); + DBUG_VOID_RETURN; + } + + c_startup.m_restart_server_node_id = 0; getNodeGroupMembers(signal); if (typeOfStart == NodeState::ST_NODE_RESTART || typeOfStart == NodeState::ST_INITIAL_NODE_RESTART) @@ -373,6 +380,8 @@ Suma::execSUMA_START_ME_REF(Signal* signal) infoEvent("Suma: node %d refused %d", c_startup.m_restart_server_node_id, ref->errorCode); + + c_startup.m_restart_server_node_id++; send_start_me_req(signal); } @@ 
-887,6 +896,22 @@ Suma::execDUMP_STATE_ORD(Signal* signal){ ptr->m_buffer_head.m_page_id); } } + + if (tCase == 8006) + { + SET_ERROR_INSERT_VALUE(13029); + } + + if (tCase == 8007) + { + c_startup.m_restart_server_node_id = MAX_NDB_NODES + 1; + SET_ERROR_INSERT_VALUE(13029); + } + + if (tCase == 8008) + { + CLEAR_ERROR_INSERT_VALUE; + } } /************************************************************* @@ -1092,14 +1117,14 @@ Suma::execSUB_CREATE_REQ(Signal* signal) } } else { if (c_startup.m_restart_server_node_id && - refToNode(subRef) != c_startup.m_restart_server_node_id) + subRef != calcSumaBlockRef(c_startup.m_restart_server_node_id)) { /** * only allow "restart_server" Suma's to come through * for restart purposes */ jam(); - sendSubStartRef(signal, 1405); + sendSubCreateRef(signal, 1415); DBUG_VOID_RETURN; } // Check that id/key is unique @@ -2232,14 +2257,17 @@ Suma::execSUB_START_REQ(Signal* signal){ key.m_subscriptionKey = req->subscriptionKey; if (c_startup.m_restart_server_node_id && - refToNode(senderRef) != c_startup.m_restart_server_node_id) + senderRef != calcSumaBlockRef(c_startup.m_restart_server_node_id)) { /** * only allow "restart_server" Suma's to come through * for restart purposes */ jam(); - sendSubStartRef(signal, 1405); + Uint32 err = c_startup.m_restart_server_node_id != RNIL ? 1405 : + SubStartRef::NF_FakeErrorREF; + + sendSubStartRef(signal, err); DBUG_VOID_RETURN; } @@ -2454,6 +2482,21 @@ Suma::execSUB_STOP_REQ(Signal* signal){ DBUG_VOID_RETURN; } + if (c_startup.m_restart_server_node_id && + senderRef != calcSumaBlockRef(c_startup.m_restart_server_node_id)) + { + /** + * only allow "restart_server" Suma's to come through + * for restart purposes + */ + jam(); + Uint32 err = c_startup.m_restart_server_node_id != RNIL ? 
1405 : + SubStopRef::NF_FakeErrorREF; + + sendSubStopRef(signal, err); + DBUG_VOID_RETURN; + } + if(!c_subscriptions.find(subPtr, key)){ jam(); DBUG_PRINT("error", ("not found")); @@ -2461,18 +2504,6 @@ Suma::execSUB_STOP_REQ(Signal* signal){ DBUG_VOID_RETURN; } - if (c_startup.m_restart_server_node_id && - refToNode(senderRef) != c_startup.m_restart_server_node_id) - { - /** - * only allow "restart_server" Suma's to come through - * for restart purposes - */ - jam(); - sendSubStopRef(signal, 1405); - DBUG_VOID_RETURN; - } - if (subPtr.p->m_state == Subscription::LOCKED) { jam(); DBUG_PRINT("error", ("locked")); diff --git a/storage/ndb/src/kernel/blocks/suma/SumaInit.cpp b/storage/ndb/src/kernel/blocks/suma/SumaInit.cpp index a9b9727cf99..c6311058035 100644 --- a/storage/ndb/src/kernel/blocks/suma/SumaInit.cpp +++ b/storage/ndb/src/kernel/blocks/suma/SumaInit.cpp @@ -122,6 +122,8 @@ Suma::Suma(Block_context& ctx) : addRecSignal(GSN_SUB_GCP_COMPLETE_REP, &Suma::execSUB_GCP_COMPLETE_REP); + + c_startup.m_restart_server_node_id = RNIL; // Server for my NR } Suma::~Suma() diff --git a/storage/ndb/src/kernel/vm/GlobalData.hpp b/storage/ndb/src/kernel/vm/GlobalData.hpp index 2761edb0571..fa0ad996c01 100644 --- a/storage/ndb/src/kernel/vm/GlobalData.hpp +++ b/storage/ndb/src/kernel/vm/GlobalData.hpp @@ -36,6 +36,7 @@ enum restartStates {initial_state, struct GlobalData { Uint32 m_restart_seq; // + NodeVersionInfo m_versionInfo; NodeInfo m_nodeInfo[MAX_NODES]; Signal VMSignals[1]; // Owned by FastScheduler:: diff --git a/storage/ndb/src/kernel/vm/SimulatedBlock.hpp b/storage/ndb/src/kernel/vm/SimulatedBlock.hpp index 46fe03de98e..01897825b2e 100644 --- a/storage/ndb/src/kernel/vm/SimulatedBlock.hpp +++ b/storage/ndb/src/kernel/vm/SimulatedBlock.hpp @@ -403,6 +403,9 @@ protected: const NodeInfo & getNodeInfo(NodeId nodeId) const; NodeInfo & setNodeInfo(NodeId); + const NodeVersionInfo& getNodeVersionInfo() const; + NodeVersionInfo& setNodeVersionInfo(); + 
/********************** * Xfrm stuff */ @@ -709,6 +712,18 @@ SimulatedBlock::getNodeInfo(NodeId nodeId) const { return globalData.m_nodeInfo[nodeId]; } +inline +const NodeVersionInfo & +SimulatedBlock::getNodeVersionInfo() const { + return globalData.m_versionInfo; +} + +inline +NodeVersionInfo & +SimulatedBlock::setNodeVersionInfo() { + return globalData.m_versionInfo; +} + inline void SimulatedBlock::EXECUTE_DIRECT(Uint32 block, diff --git a/storage/ndb/test/ndbapi/testNodeRestart.cpp b/storage/ndb/test/ndbapi/testNodeRestart.cpp index d346a4f0057..01e138830d6 100644 --- a/storage/ndb/test/ndbapi/testNodeRestart.cpp +++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp @@ -932,6 +932,81 @@ retry: return NDBT_OK; } +int runBug24717(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int records = ctx->getNumRecords(); + NdbRestarter restarter; + Ndb* pNdb = GETNDB(step); + + HugoTransactions hugoTrans(*ctx->getTab()); + + int dump[] = { 9002, 0 } ; + Uint32 ownNode = refToNode(pNdb->getReference()); + dump[1] = ownNode; + + for (; loops; loops --) + { + int nodeId = restarter.getRandomNotMasterNodeId(rand()); + restarter.restartOneDbNode(nodeId, false, true, true); + restarter.waitNodesNoStart(&nodeId, 1); + + if (restarter.dumpStateOneNode(nodeId, dump, 2)) + return NDBT_FAILED; + + restarter.startNodes(&nodeId, 1); + + for (Uint32 i = 0; i < 100; i++) + { + hugoTrans.pkReadRecords(pNdb, 100, 1, NdbOperation::LM_CommittedRead); + } + + restarter.waitClusterStarted(); + } + + return NDBT_OK; +} + +int runBug25364(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + NdbRestarter restarter; + Ndb* pNdb = GETNDB(step); + int loops = ctx->getNumLoops(); + + if (restarter.getNumDbNodes() < 4) + return NDBT_OK; + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + + for (; loops; loops --) + { + int master = restarter.getMasterNodeId(); + int victim = restarter.getRandomNodeOtherNodeGroup(master, 
rand()); + int second = restarter.getRandomNodeSameNodeGroup(victim, rand()); + + int dump[] = { 935, victim } ; + if (restarter.dumpStateOneNode(master, dump, 2)) + return NDBT_FAILED; + + if (restarter.dumpStateOneNode(master, val2, 2)) + return NDBT_FAILED; + + if (restarter.restartOneDbNode(second, false, true, true)) + return NDBT_FAILED; + + int nodes[2] = { master, second }; + if (restarter.waitNodesNoStart(nodes, 2)) + return NDBT_FAILED; + + restarter.startNodes(nodes, 2); + + if (restarter.waitNodesStarted(nodes, 2)) + return NDBT_FAILED; + } + + return NDBT_OK; +} + int runBug21271(NDBT_Context* ctx, NDBT_Step* step){ int result = NDBT_OK; @@ -996,40 +1071,111 @@ runBug24543(NDBT_Context* ctx, NDBT_Step* step){ } return NDBT_OK; } -int runBug24717(NDBT_Context* ctx, NDBT_Step* step){ + +int runBug25468(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; int loops = ctx->getNumLoops(); int records = ctx->getNumRecords(); NdbRestarter restarter; - Ndb* pNdb = GETNDB(step); - HugoTransactions hugoTrans(*ctx->getTab()); - - int dump[] = { 9000, 0 } ; - Uint32 ownNode = refToNode(pNdb->getReference()); - dump[1] = ownNode; - - for (; loops; loops --) + for (int i = 0; igetNumLoops(); + int records = ctx->getNumRecords(); + NdbRestarter restarter; + + if (restarter.getNumDbNodes() < 4) + return NDBT_OK; + + for (int i = 0; i