From fd6d45f0b0de5a8f0767d96d8f12cbca649f454c Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 16 Mar 2006 11:21:18 +0100 Subject: [PATCH 01/15] Fix bug in mysql-test-run.pl in ^C signal handler. mysql-test/lib/mtr_timer.pl: Fix bug where ^C would trigger cleanup handler in both parent and timeout child processes, causing duplicated messages and potential conflicts. --- mysql-test/lib/mtr_timer.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mysql-test/lib/mtr_timer.pl b/mysql-test/lib/mtr_timer.pl index 709cebd6407..a85ab8c6122 100644 --- a/mysql-test/lib/mtr_timer.pl +++ b/mysql-test/lib/mtr_timer.pl @@ -78,6 +78,12 @@ sub mtr_timer_start($$$) { { # Child, redirect output and exec # FIXME do we need to redirect streams? + + # Don't do the ^C cleanup in the timeout child processes! + # There is actually a race here, if we get ^C after fork(), but before + # clearing the signal handler. + $SIG{INT}= 'DEFAULT'; + $0= "mtr_timer(timers,$name,$duration)"; sleep($duration); exit(0); From 6ac6b08c41beb47cb38aa2d19a86157997f5cda0 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 17 Mar 2006 10:09:35 +0100 Subject: [PATCH 02/15] ndb - bug#18298 8 repeated nr with table wo/ logging cause crash Dont create crashed replica for temporary tables ndb/src/kernel/blocks/dbdih/Dbdih.hpp: Dont create crashed replica for temporary tables ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: Dont create crashed replica for temporary tables --- ndb/src/kernel/blocks/dbdih/Dbdih.hpp | 3 ++- ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index 0c107e35603..f74c0f36c4d 100644 --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -1038,7 +1038,8 @@ private: void prepareReplicas(FragmentstorePtr regFragptr); void removeNodeFromStored(Uint32 nodeId, FragmentstorePtr regFragptr, - ReplicaRecordPtr replicaPtr); + ReplicaRecordPtr replicaPtr, + bool temporary); void removeOldStoredReplica(FragmentstorePtr regFragptr, ReplicaRecordPtr replicaPtr); void removeStoredReplica(FragmentstorePtr regFragptr, diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 776e59ea495..fab428aadef 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -5212,6 +5212,7 @@ void Dbdih::removeNodeFromTable(Signal* signal, //const Uint32 lcpId = SYSFILE->latestLCP_ID; const bool lcpOngoingFlag = (tabPtr.p->tabLcpStatus== TabRecord::TLS_ACTIVE); + const bool temporary = !tabPtr.p->storedTable; FragmentstorePtr fragPtr; for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){ @@ -5232,7 +5233,7 @@ void Dbdih::removeNodeFromTable(Signal* signal, jam(); found = true; noOfRemovedReplicas++; - removeNodeFromStored(nodeId, fragPtr, replicaPtr); + removeNodeFromStored(nodeId, fragPtr, replicaPtr, temporary); if(replicaPtr.p->lcpOngoingFlag){ jam(); /** @@ -12051,9 +12052,18 @@ void Dbdih::removeDeadNode(NodeRecordPtr removeNodePtr) /*---------------------------------------------------------------*/ void Dbdih::removeNodeFromStored(Uint32 nodeId, FragmentstorePtr fragPtr, - ReplicaRecordPtr replicatePtr) + ReplicaRecordPtr replicatePtr, + bool temporary) { - newCrashedReplica(nodeId, replicatePtr); + if (!temporary) + { + jam(); + newCrashedReplica(nodeId, replicatePtr); + } + else + { + jam(); + } removeStoredReplica(fragPtr, replicatePtr); 
linkOldStoredReplica(fragPtr, replicatePtr); ndbrequire(fragPtr.p->storedReplicas != RNIL); From 367442f754a97d87077c99bc5805b41da5ac7119 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 17 Mar 2006 10:25:29 +0100 Subject: [PATCH 03/15] BUG#18283 When InnoDB returns error 'lock table full', MySQL can write to binlog too much. When InnoDB has to rollback a transaction because the lock table has filled up, it also needs to inform the upper layer that the transaction was rolled back so that the cached transaction is not written to the binary log. sql/ha_innodb.cc: When InnoDB rolls back a transaction in HA_ERR_LOCK_TABLE_FULL, it needs to inform the upper layer to rollback the transaction also. --- sql/ha_innodb.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index d24587e23ea..8455bbaf4d0 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -332,6 +332,13 @@ convert_error_code_to_mysql( return(HA_ERR_NO_SAVEPOINT); } else if (error == (int) DB_LOCK_TABLE_FULL) { + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + ha_rollback(thd); + } return(HA_ERR_LOCK_TABLE_FULL); } else { From 3bfaf33392901b90d420e37450164d7a0db8e3ed Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 17 Mar 2006 10:55:02 +0100 Subject: [PATCH 04/15] ndb - bug#16772 dont't allow node to join cluster until all nodes has completed failure handling ndb/src/kernel/blocks/qmgr/QmgrMain.cpp: When getting CM_ADD for node that I haven't completed failure handling for do _not_ just override. But instead set state...and send CM_ACK_ADD on execCONNECT_REP (much...later) ndb/test/ndbapi/testNodeRestart.cpp: testcase for bug#16772 ndb/test/run-test/daily-basic-tests.txt: Run test in basic suite --- ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 103 ++++++++++++++++++++---- ndb/test/ndbapi/testNodeRestart.cpp | 50 ++++++++++++ ndb/test/run-test/daily-basic-tests.txt | 4 + 3 files changed, 142 insertions(+), 15 deletions(-) diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index 6095895e7c2..70084e6b171 100644 --- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -257,6 +257,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout) void Qmgr::execCONNECT_REP(Signal* signal) { + jamEntry(); const Uint32 nodeId = signal->theData[0]; c_connectedNodes.set(nodeId); NodeRecPtr nodePtr; @@ -264,9 +265,13 @@ void Qmgr::execCONNECT_REP(Signal* signal) ptrCheckGuard(nodePtr, MAX_NODES, nodeRec); switch(nodePtr.p->phase){ case ZSTARTING: - jam(); - break; case ZRUNNING: + jam(); + if(!c_start.m_nodes.isWaitingFor(nodeId)){ + jam(); + return; + } + break; case ZPREPARE_FAIL: case ZFAIL_CLOSING: jam(); @@ -277,21 +282,28 @@ void Qmgr::execCONNECT_REP(Signal* signal) case ZAPI_INACTIVE: return; } - - if(!c_start.m_nodes.isWaitingFor(nodeId)){ - jam(); - return; - } - + switch(c_start.m_gsn){ case GSN_CM_REGREQ: jam(); sendCmRegReq(signal, nodeId); return; - case GSN_CM_NODEINFOREQ:{ + case GSN_CM_NODEINFOREQ: jam(); sendCmNodeInfoReq(signal, nodeId, nodePtr.p); return; + case GSN_CM_ADD:{ + jam(); + + ndbrequire(getOwnNodeId() != cpresident); + c_start.m_nodes.clearWaitingFor(nodeId); + c_start.m_gsn = RNIL; + + NodeRecPtr addNodePtr; + addNodePtr.i = nodeId; + ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec); + cmAddPrepare(signal, addNodePtr, nodePtr.p); + return; } default: return; @@ -924,15 +936,27 @@ 
Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){ return; case ZFAIL_CLOSING: jam(); -#ifdef VM_TRACE - ndbout_c("Enabling communication to CM_ADD node state=%d", - nodePtr.p->phase); -#endif + +#if 1 + warningEvent("Recieved request to incorperate node %u, " + "while error handling has not yet completed", + nodePtr.i); + + ndbrequire(getOwnNodeId() != cpresident); + ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD); + c_start.m_nodes.clearWaitingFor(); + c_start.m_nodes.setWaitingFor(nodePtr.i); + c_start.m_gsn = GSN_CM_ADD; +#else + warningEvent("Enabling communication to CM_ADD node %u state=%d", + nodePtr.i, + nodePtr.p->phase); nodePtr.p->phase = ZSTARTING; nodePtr.p->failState = NORMAL; signal->theData[0] = 0; signal->theData[1] = nodePtr.i; sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA); +#endif return; case ZSTARTING: break; @@ -1766,11 +1790,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal) jamEntry(); failedNodePtr.i = signal->theData[0]; + + if (ERROR_INSERTED(930)) + { + CLEAR_ERROR_INSERT_VALUE; + infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i); + return; + } + ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){ failedNodePtr.p->failState = NORMAL; } else { jam(); + + char buf[100]; + BaseString::snprintf(buf, 100, + "Received NDB_FAILCONF for node %u with state: %d %d", + failedNodePtr.i, + failedNodePtr.p->phase, + failedNodePtr.p->failState); + progError(__LINE__, 0, buf); systemErrorLab(signal, __LINE__); }//if if (cpresident == getOwnNodeId()) { @@ -2077,10 +2117,42 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.i == getOwnNodeId()) { jam(); - systemErrorLab(signal, __LINE__); + + const char * msg = 0; + switch(aFailCause){ + case FailRep::ZOWN_FAILURE: + msg = "Own failure"; + break; + case FailRep::ZOTHER_NODE_WHEN_WE_START: + case FailRep::ZOTHERNODE_FAILED_DURING_START: + msg = "Other node died during start"; + break; + case FailRep::ZIN_PREP_FAIL_REQ: + msg = "Prep fail"; + break; + case FailRep::ZSTART_IN_REGREQ: + msg = "Start timeout"; + break; + case FailRep::ZHEARTBEAT_FAILURE: + msg = "Hearbeat failure"; + break; + case FailRep::ZLINK_FAILURE: + msg = "Connection failure"; + break; + } + + char buf[100]; + BaseString::snprintf(buf, 100, + "We(%u) have been declared dead by %u reason: %s(%u)", + getOwnNodeId(), + refToNode(signal->getSendersBlockRef()), + aFailCause, + msg ? 
msg : ""); + + progError(__LINE__, 0, buf); return; }//if - + myNodePtr.i = getOwnNodeId(); ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec); if (myNodePtr.p->phase != ZRUNNING) { @@ -2791,6 +2863,7 @@ void Qmgr::failReport(Signal* signal, cfailureNr = cprepareFailureNr; ctoFailureNr = 0; ctoStatus = Q_ACTIVE; + c_start.reset(); // Don't take over nodes being started if (cnoCommitFailedNodes > 0) { jam(); /**----------------------------------------------------------------- diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index a741e6233d9..eebd631af94 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -535,6 +535,52 @@ err: return NDBT_FAILED; } +int +runBug16772(NDBT_Context* ctx, NDBT_Step* step){ + + NdbRestarter restarter; + if (restarter.getNumDbNodes() < 2) + { + ctx->stopTest(); + return NDBT_OK; + } + + int aliveNodeId = restarter.getRandomNotMasterNodeId(rand()); + int deadNodeId = aliveNodeId; + while (deadNodeId == aliveNodeId) + deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes()); + + if (restarter.insertErrorInNode(aliveNodeId, 930)) + return NDBT_FAILED; + + if (restarter.restartOneDbNode(deadNodeId, + /** initial */ false, + /** nostart */ true, + /** abort */ true)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&deadNodeId, 1)) + return NDBT_FAILED; + + if (restarter.startNodes(&deadNodeId, 1)) + return NDBT_FAILED; + + // It should now be hanging since we throw away NDB_FAILCONF + int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10); + // So this should fail...i.e it should not reach startphase 3 + + // Now send a NDB_FAILCONF for deadNo + int dump[] = { 7020, 323, 252, 0 }; + dump[3] = deadNodeId; + if (restarter.dumpStateOneNode(aliveNodeId, dump, 4)) + return NDBT_FAILED; + + if (restarter.waitNodesStarted(&deadNodeId, 1)) + return NDBT_FAILED; + + return ret ? 
NDBT_OK : NDBT_FAILED; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", @@ -820,6 +866,10 @@ TESTCASE("Bug15685", STEP(runBug15685); FINALIZER(runClearTable); } +TESTCASE("Bug16772", + "Test bug with restarting before NF handling is complete"){ + STEP(runBug16772); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 6378b4a06d3..169daae6d7f 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -446,6 +446,10 @@ max-time: 500 cmd: testNodeRestart args: -n Bug15685 T1 +max-time: 500 +cmd: testNodeRestart +args: -n Bug16772 T1 + # OLD FLEX max-time: 500 cmd: flexBench From 37230a2a8867a2cc6066dac51ddc775688cb1cba Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 20 Mar 2006 11:29:58 +0100 Subject: [PATCH 05/15] ndb - wl2610 Activly abort transactions (that's affected) during NF This removes a lot of bugs that can occur otherwise is using high value for TransactionDeadLockTimout ndb/include/kernel/signaldata/TcContinueB.hpp: New continueb for active transaction abort on nf ndb/src/kernel/blocks/dbtc/Dbtc.hpp: Add bitmask of participating nodes to transaction record Add bitmask of node fail steps, so that NF_CompleteRep is not sent until all steps has completed ndb/src/kernel/blocks/dbtc/DbtcMain.cpp: Active transaction baortion --- ndb/include/kernel/signaldata/TcContinueB.hpp | 3 +- ndb/src/kernel/blocks/dbtc/Dbtc.hpp | 18 +- ndb/src/kernel/blocks/dbtc/DbtcMain.cpp | 192 +++++++++++++----- 3 files changed, 164 insertions(+), 49 deletions(-) diff --git a/ndb/include/kernel/signaldata/TcContinueB.hpp b/ndb/include/kernel/signaldata/TcContinueB.hpp index 85213791b2a..b87b982e49b 100644 --- a/ndb/include/kernel/signaldata/TcContinueB.hpp +++ b/ndb/include/kernel/signaldata/TcContinueB.hpp @@ -44,7 +44,8 @@ private: CHECK_WAIT_DROP_TAB_FAILED_LQH = 16, TRIGGER_PENDING = 17, - DelayTCKEYCONF = 18 + DelayTCKEYCONF = 18, + ZNF_CHECK_TRANSACTIONS = 19 }; }; diff --git a/ndb/src/kernel/blocks/dbtc/Dbtc.hpp b/ndb/src/kernel/blocks/dbtc/Dbtc.hpp index 61afef30b43..23c5a7d08eb 100644 --- a/ndb/src/kernel/blocks/dbtc/Dbtc.hpp +++ b/ndb/src/kernel/blocks/dbtc/Dbtc.hpp @@ -636,6 +636,7 @@ public: ConnectionState apiConnectstate; UintR transid[2]; UintR firstTcConnect; + NdbNodeBitmask m_transaction_nodes; //--------------------------------------------------- // Second 16 byte cache line. Hot variables. @@ -941,6 +942,17 @@ public: UintR noOfWordsTCINDXCONF; UintR packedWordsTCINDXCONF[30]; BlockReference hostLqhBlockRef; + + enum NodeFailBits + { + NF_TAKEOVER = 0x1, + NF_CHECK_SCAN = 0x2, + NF_CHECK_TRANSACTION = 0x4, + NF_CHECK_DROP_TAB = 0x8, + NF_NODE_FAIL_BITS = 0xF // All bits... 
+ }; + Uint32 m_nf_bits; + NdbNodeBitmask m_lqh_trans_conf; }; /* p2c: size = 128 bytes */ typedef Ptr HostRecordPtr; @@ -1578,7 +1590,7 @@ private: void wrongSchemaVersionErrorLab(Signal* signal); void noFreeConnectionErrorLab(Signal* signal); void tckeyreq050Lab(Signal* signal); - void timeOutFoundLab(Signal* signal, UintR anAdd); + void timeOutFoundLab(Signal* signal, UintR anAdd, Uint32 errCode); void completeTransAtTakeOverLab(Signal* signal, UintR TtakeOverInd); void completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd); void completeTransAtTakeOverDoOne(Signal* signal, UintR TtakeOverInd); @@ -1600,6 +1612,9 @@ private: void checkScanFragList(Signal*, Uint32 failedNodeId, ScanRecord * scanP, LocalDLList::Head&); + void nodeFailCheckTransactions(Signal*,Uint32 transPtrI,Uint32 failedNodeId); + void checkNodeFailComplete(Signal* signal, Uint32 failedNodeId, Uint32 bit); + // Initialisation void initData(); void initRecords(); @@ -1626,6 +1641,7 @@ private: HostRecord *hostRecord; HostRecordPtr hostptr; UintR chostFilesize; + NdbNodeBitmask c_alive_nodes; GcpRecord *gcpRecord; GcpRecordPtr gcpPtr; diff --git a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp index d9d1f01b213..4750a8c388a 100644 --- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp +++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp @@ -262,6 +262,10 @@ void Dbtc::execCONTINUEB(Signal* signal) jam(); checkScanActiveInFailedLqh(signal, Tdata0, Tdata1); return; + case TcContinueB::ZNF_CHECK_TRANSACTIONS: + jam(); + nodeFailCheckTransactions(signal, Tdata0, Tdata1); + return; case TcContinueB::CHECK_WAIT_DROP_TAB_FAILED_LQH: jam(); checkWaitDropTabFailedLqh(signal, Tdata0, Tdata1); @@ -301,6 +305,7 @@ void Dbtc::execINCL_NODEREQ(Signal* signal) hostptr.p->hostStatus = HS_ALIVE; hostptr.p->takeOverStatus = TOS_IDLE; signal->theData[0] = cownref; + c_alive_nodes.set(hostptr.i); sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB); } @@ -487,6 +492,7 @@ Dbtc::checkWaitDropTabFailedLqh(Signal* signal, Uint32 nodeId, Uint32 tableId) * Finished */ jam(); + checkNodeFailComplete(signal, nodeId, HostRecord::NF_CHECK_DROP_TAB); return; } @@ -859,6 +865,7 @@ void Dbtc::execREAD_NODESCONF(Signal* signal) jam(); con_lineNodes++; hostptr.p->hostStatus = HS_ALIVE; + c_alive_nodes.set(i); }//if }//if }//for @@ -2314,6 +2321,7 @@ void Dbtc::initApiConnectRec(Signal* signal, regApiPtr->commitAckMarker = RNIL; regApiPtr->buddyPtr = RNIL; regApiPtr->currSavePointId = 0; + regApiPtr->m_transaction_nodes.clear(); // Trigger data releaseFiredTriggerData(®ApiPtr->theFiredTriggers), // Index data @@ -2921,6 +2929,10 @@ void Dbtc::tckeyreq050Lab(Signal* signal) signal->theData[0] = TdihConnectptr; signal->theData[1] = Ttableref; signal->theData[2] = TdistrHashValue; + signal->theData[3] = 0; + signal->theData[4] = 0; + signal->theData[5] = 0; + signal->theData[6] = 0; /*-------------------------------------------------------------*/ /* FOR EFFICIENCY REASONS WE AVOID THE SIGNAL SENDING HERE AND */ @@ -3098,6 +3110,7 @@ void Dbtc::sendlqhkeyreq(Signal* signal, TcConnectRecord * const regTcPtr = tcConnectptr.p; ApiConnectRecord * const regApiPtr = apiConnectptr.p; CacheRecord * const regCachePtr = cachePtr.p; + UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6; #ifdef ERROR_INSERT if (ERROR_INSERTED(8002)) { systemErrorLab(signal); @@ -3135,6 +3148,9 @@ void Dbtc::sendlqhkeyreq(Signal* signal, LqhKeyReq::setScanTakeOverFlag(tslrAttrLen, regCachePtr->scanTakeOverInd); Tdata10 = 0; + sig0 = regCachePtr->opSimple; + 
sig1 = regTcPtr->operation; + bool simpleRead = (sig1 == ZREAD && sig0 == ZTRUE); LqhKeyReq::setKeyLen(Tdata10, regCachePtr->keylen); LqhKeyReq::setLastReplicaNo(Tdata10, regTcPtr->lastReplicaNo); LqhKeyReq::setLockType(Tdata10, regCachePtr->opLock); @@ -3144,8 +3160,8 @@ void Dbtc::sendlqhkeyreq(Signal* signal, LqhKeyReq::setApplicationAddressFlag(Tdata10, 1); LqhKeyReq::setDirtyFlag(Tdata10, regTcPtr->dirtyOp); LqhKeyReq::setInterpretedFlag(Tdata10, regCachePtr->opExec); - LqhKeyReq::setSimpleFlag(Tdata10, regCachePtr->opSimple); - LqhKeyReq::setOperation(Tdata10, regTcPtr->operation); + LqhKeyReq::setSimpleFlag(Tdata10, sig0); + LqhKeyReq::setOperation(Tdata10, sig1); /* ----------------------------------------------------------------------- * Sequential Number of first LQH = 0, bit 22-23 * IF ATTRIBUTE INFORMATION IS SENT IN TCKEYREQ, @@ -3158,18 +3174,16 @@ void Dbtc::sendlqhkeyreq(Signal* signal, * ----------------------------------------------------------------------- */ //LqhKeyReq::setAPIVersion(Tdata10, regCachePtr->apiVersionNo); Uint32 commitAckMarker = regTcPtr->commitAckMarker; + const Uint32 noOfLqhs = regTcPtr->noOfNodes; if(commitAckMarker != RNIL){ jam(); - LqhKeyReq::setMarkerFlag(Tdata10, 1); - CommitAckMarker * tmp; - tmp = m_commitAckMarkerHash.getPtr(commitAckMarker); + CommitAckMarker * tmp = m_commitAckMarkerHash.getPtr(commitAckMarker); /** * Populate LQH array */ - const Uint32 noOfLqhs = regTcPtr->noOfNodes; tmp->noOfLqhs = noOfLqhs; for(Uint32 i = 0; ilqhNodeId[i] = regTcPtr->tcNodedata[i]; @@ -3180,7 +3194,6 @@ void Dbtc::sendlqhkeyreq(Signal* signal, /* NO READ LENGTH SENT FROM TC. SEQUENTIAL NUMBER IS 1 AND IT */ /* IS SENT TO A PRIMARY NODE. */ /* ************************************************************> */ - UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6; LqhKeyReq * const lqhKeyReq = (LqhKeyReq *)signal->getDataPtrSend(); @@ -3204,6 +3217,14 @@ void Dbtc::sendlqhkeyreq(Signal* signal, sig5 = regTcPtr->clientData; sig6 = regCachePtr->scanInfo; + if (! 
simpleRead) + { + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[0]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[1]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[2]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[3]); + } + lqhKeyReq->tableSchemaVersion = sig0; lqhKeyReq->fragmentData = sig1; lqhKeyReq->transId1 = sig2; @@ -4587,6 +4608,7 @@ void Dbtc::copyApi(Signal* signal) UintR TgcpPointer = regTmpApiPtr->gcpPointer; UintR TgcpFilesize = cgcpFilesize; UintR TcommitAckMarker = regTmpApiPtr->commitAckMarker; + NdbNodeBitmask Tnodes = regTmpApiPtr->m_transaction_nodes; GcpRecord *localGcpRecord = gcpRecord; regApiPtr->ndbapiBlockref = regTmpApiPtr->ndbapiBlockref; @@ -4597,6 +4619,7 @@ void Dbtc::copyApi(Signal* signal) regApiPtr->transid[1] = Ttransid2; regApiPtr->lqhkeyconfrec = Tlqhkeyconfrec; regApiPtr->commitAckMarker = TcommitAckMarker; + regApiPtr->m_transaction_nodes = Tnodes; gcpPtr.i = TgcpPointer; ptrCheckGuard(gcpPtr, TgcpFilesize, localGcpRecord); @@ -4607,6 +4630,7 @@ void Dbtc::copyApi(Signal* signal) regTmpApiPtr->commitAckMarker = RNIL; regTmpApiPtr->firstTcConnect = RNIL; regTmpApiPtr->lastTcConnect = RNIL; + regTmpApiPtr->m_transaction_nodes.clear(); releaseAllSeizedIndexOperations(regTmpApiPtr); }//Dbtc::copyApi() @@ -4865,7 +4889,7 @@ void Dbtc::releaseTransResources(Signal* signal) TcConnectRecordPtr localTcConnectptr; UintR TtcConnectFilesize = ctcConnectFilesize; TcConnectRecord *localTcConnectRecord = tcConnectRecord; - + apiConnectptr.p->m_transaction_nodes.clear(); localTcConnectptr.i = apiConnectptr.p->firstTcConnect; do { jam(); @@ -5269,7 +5293,8 @@ void Dbtc::execTC_COMMITREQ(Signal* signal) break; case CS_ABORTING: jam(); - errorCode = ZABORTINPROGRESS; + errorCode = regApiPtr->returncode ? + regApiPtr->returncode : ZABORTINPROGRESS; break; case CS_START_SCAN: jam(); @@ -5808,9 +5833,9 @@ void Dbtc::abort010Lab(Signal* signal) if (transP->firstTcConnect == RNIL) { jam(); - /*-----------------------------------------------------------------------*/ - /* WE HAVE NO PARTICIPANTS IN THE TRANSACTION. */ - /*-----------------------------------------------------------------------*/ + /*--------------------------------------------------------------------*/ + /* WE HAVE NO PARTICIPANTS IN THE TRANSACTION. 
*/ + /*--------------------------------------------------------------------*/ releaseAbortResources(signal); return; }//if @@ -6087,10 +6112,12 @@ void Dbtc::timeOutLoopStartLab(Signal* signal, Uint32 api_con_ptr) if (api_timer != 0) { time_out_value= time_out_param + (api_con_ptr & mask_value); time_passed= tc_timer - api_timer; - if (time_passed > time_out_value) { + if (time_passed > time_out_value) + { jam(); - timeOutFoundLab(signal, api_con_ptr); - return; + timeOutFoundLab(signal, api_con_ptr, ZTIME_OUT_ERROR); + api_con_ptr++; + break; } } } @@ -6110,10 +6137,8 @@ void Dbtc::timeOutLoopStartLab(Signal* signal, Uint32 api_con_ptr) return; }//Dbtc::timeOutLoopStartLab() -void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) +void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr, Uint32 errCode) { - sendContinueTimeOutControl(signal, TapiConPtr + 1); - apiConnectptr.i = TapiConPtr; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); /*------------------------------------------------------------------*/ @@ -6126,7 +6151,8 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) << "Time-out in state = " << apiConnectptr.p->apiConnectstate << " apiConnectptr.i = " << apiConnectptr.i << " - exec: " << apiConnectptr.p->m_exec_flag - << " - place: " << c_apiConTimer_line[apiConnectptr.i]); + << " - place: " << c_apiConTimer_line[apiConnectptr.i] + << " code: " << errCode); switch (apiConnectptr.p->apiConnectstate) { case CS_STARTED: if(apiConnectptr.p->lqhkeyreqrec == apiConnectptr.p->lqhkeyconfrec){ @@ -6143,7 +6169,7 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) }//if } apiConnectptr.p->returnsignal = RS_TCROLLBACKREP; - apiConnectptr.p->returncode = ZTIME_OUT_ERROR; + apiConnectptr.p->returncode = errCode; abort010Lab(signal); return; case CS_RECEIVING: @@ -6156,7 +6182,7 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) /* START ABORTING THE TRANSACTION. ALSO START CHECKING THE */ /* REMAINING TRANSACTIONS. */ /*------------------------------------------------------------------*/ - terrorCode = ZTIME_OUT_ERROR; + terrorCode = errCode; abortErrorLab(signal); return; case CS_COMMITTING: @@ -6820,6 +6846,8 @@ void Dbtc::execNODE_FAILREP(Signal* signal) /* FAILED. */ /*------------------------------------------------------------*/ hostptr.p->hostStatus = HS_DEAD; + hostptr.p->m_nf_bits = HostRecord::NF_NODE_FAIL_BITS; + c_alive_nodes.clear(hostptr.i); if (hostptr.p->takeOverStatus == TOS_COMPLETED) { jam(); @@ -6832,14 +6860,7 @@ void Dbtc::execNODE_FAILREP(Signal* signal) /* REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */ /* USED THEM IS COMPLETED. */ /*------------------------------------------------------------*/ - { - NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; - nfRep->blockNo = DBTC; - nfRep->nodeId = cownNodeid; - nfRep->failedNodeId = hostptr.i; - } - sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, - NFCompleteRep::SignalLength, JBB); + hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER; } else { ndbrequire(hostptr.p->takeOverStatus == TOS_IDLE); hostptr.p->takeOverStatus = TOS_NODE_FAILED; @@ -6892,16 +6913,9 @@ void Dbtc::execNODE_FAILREP(Signal* signal) /* MASTER IT MIGHT START A NEW TAKE OVER EVEN AFTER THE */ /* CRASHED NODE HAVE ALREADY RECOVERED. 
*/ /*------------------------------------------------------------*/ - for(tmpHostptr.i = 1; tmpHostptr.i < MAX_NDB_NODES;tmpHostptr.i++) { - jam(); - ptrAss(tmpHostptr, hostRecord); - if (tmpHostptr.p->hostStatus == HS_ALIVE) { - jam(); - tblockref = calcTcBlockRef(tmpHostptr.i); - signal->theData[0] = hostptr.i; - sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB); - }//if - }//for + NodeReceiverGroup rg(DBTC, c_alive_nodes); + signal->theData[0] = hostptr.i; + sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB); }//if }//if }//for @@ -6939,10 +6953,30 @@ void Dbtc::execNODE_FAILREP(Signal* signal) /*------------------------------------------------------------*/ checkScanActiveInFailedLqh(signal, 0, hostptr.i); checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid + nodeFailCheckTransactions(signal, 0, hostptr.i); }//for }//Dbtc::execNODE_FAILREP() +void +Dbtc::checkNodeFailComplete(Signal* signal, + Uint32 failedNodeId, + Uint32 bit) +{ + hostptr.i = failedNodeId; + ptrCheckGuard(hostptr, chostFilesize, hostRecord); + hostptr.p->m_nf_bits &= ~bit; + if (hostptr.p->m_nf_bits == 0) + { + NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; + nfRep->blockNo = DBTC; + nfRep->nodeId = cownNodeid; + nfRep->failedNodeId = hostptr.i; + sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, + NFCompleteRep::SignalLength, JBB); + } +} + void Dbtc::checkScanActiveInFailedLqh(Signal* signal, Uint32 scanPtrI, Uint32 failedNodeId){ @@ -6984,8 +7018,44 @@ void Dbtc::checkScanActiveInFailedLqh(Signal* signal, sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); return; }//for + + checkNodeFailComplete(signal, failedNodeId, HostRecord::NF_CHECK_SCAN); } +void +Dbtc::nodeFailCheckTransactions(Signal* signal, + Uint32 transPtrI, + Uint32 failedNodeId) +{ + jam(); + Ptr transPtr; + for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++) + { + ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord); + if (transPtr.p->m_transaction_nodes.get(failedNodeId)) + { + jam(); + // Force timeout regardless of state + Uint32 save = c_appl_timeout_value; + c_appl_timeout_value = 1; + setApiConTimer(transPtr.i, 0, __LINE__); + timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT); + c_appl_timeout_value = save; + } + + // Send CONTINUEB to continue later + signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS; + signal->theData[1] = transPtr.i + 1; // Check next + signal->theData[2] = failedNodeId; + sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); + return; + } + + checkNodeFailComplete(signal, failedNodeId, + HostRecord::NF_CHECK_TRANSACTION); +} + + void Dbtc::checkScanFragList(Signal* signal, Uint32 failedNodeId, @@ -7025,14 +7095,7 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* signal) /* USED THEM IS COMPLETED. 
*/ /*------------------------------------------------------------*/ hostptr.p->takeOverStatus = TOS_COMPLETED; - { - NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; - nfRep->blockNo = DBTC; - nfRep->nodeId = cownNodeid; - nfRep->failedNodeId = hostptr.i; - } - sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, - NFCompleteRep::SignalLength, JBB); + checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER); break; case TOS_COMPLETED: jam(); @@ -7979,6 +8042,7 @@ void Dbtc::initApiConnectFail(Signal* signal) apiConnectptr.p->ndbapiBlockref = 0; apiConnectptr.p->ndbapiConnect = 0; apiConnectptr.p->buddyPtr = RNIL; + apiConnectptr.p->m_transaction_nodes.clear(); setApiConTimer(apiConnectptr.i, 0, __LINE__); switch(ttransStatus){ case LqhTransConf::Committed: @@ -9756,6 +9820,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = tiacTmp - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9783,6 +9848,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = (2 * tiacTmp) - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9810,6 +9876,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = (3 * tiacTmp) - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9877,6 +9944,7 @@ void Dbtc::inithost(Signal* signal) hostptr.p->noOfPackedWordsLqh = 0; hostptr.p->hostLqhBlockRef = calcLqhBlockRef(hostptr.i); }//for + c_alive_nodes.clear(); }//Dbtc::inithost() void Dbtc::initialiseRecordsLab(Signal* signal, UintR Tdata0, @@ -10126,6 +10194,7 @@ void Dbtc::releaseAbortResources(Signal* signal) }//while apiConnectptr.p->firstTcConnect = RNIL; apiConnectptr.p->lastTcConnect = RNIL; + apiConnectptr.p->m_transaction_nodes.clear(); // MASV let state be CS_ABORTING until all // signals in the "air" have been received. 
Reset to CS_CONNECTED @@ -10199,6 +10268,7 @@ void Dbtc::releaseApiCon(Signal* signal, UintR TapiConnectPtr) cfirstfreeApiConnect = TlocalApiConnectptr.i; setApiConTimer(TlocalApiConnectptr.i, 0, __LINE__); TlocalApiConnectptr.p->apiConnectstate = CS_DISCONNECTED; + ndbassert(TlocalApiConnectptr.p->m_transaction_nodes.isclear()); ndbassert(TlocalApiConnectptr.p->apiScanRec == RNIL); TlocalApiConnectptr.p->ndbapiBlockref = 0; }//Dbtc::releaseApiCon() @@ -10734,6 +10804,34 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal) c_theIndexOperationPool.getSize(), c_theIndexOperationPool.getNoOfFree()); } + + if (dumpState->args[0] == 2514) + { + if (signal->getLength() == 2) + { + dumpState->args[0] = DumpStateOrd::TcDumpOneApiConnectRec; + execDUMP_STATE_ORD(signal); + } + + NodeReceiverGroup rg(CMVMI, c_alive_nodes); + dumpState->args[0] = 15; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB); + + signal->theData[0] = 2515; + sendSignalWithDelay(cownref, GSN_DUMP_STATE_ORD, signal, 1000, 1); + return; + } + + if (dumpState->args[0] == 2515) + { + NdbNodeBitmask mask = c_alive_nodes; + mask.clear(getOwnNodeId()); + NodeReceiverGroup rg(NDBCNTR, mask); + + sendSignal(rg, GSN_SYSTEM_ERROR, signal, 1, JBB); + sendSignalWithDelay(cownref, GSN_SYSTEM_ERROR, signal, 300, 1); + return; + } }//Dbtc::execDUMP_STATE_ORD() void Dbtc::execSET_VAR_REQ(Signal* signal) From 51a093f18762d299899c7c9e5cb0a2a639631720 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 20 Mar 2006 14:49:46 +0100 Subject: [PATCH 06/15] ndb - bug#18352 Use variable waitfor_response_timeout (depending on TransactionDeadLockTimeout) When getting 4012, set NeedAbort and ReleaseOnClose ndb/src/ndbapi/NdbConnection.cpp: Use variable for WAITFOR_RESPONSE_TIMEOUT ndb/src/ndbapi/Ndbif.cpp: Use variable timeout for waitfor, when receiving 4012, set NeedAbort and ReleaseOnClose ndb/src/ndbapi/TransporterFacade.cpp: Init wait_for_response_timoue as max TRANSACTION_DEADLOCK_TIMEOUT ndb/src/ndbapi/TransporterFacade.hpp: Init wait_for_response_timoue as max TRANSACTION_DEADLOCK_TIMEOUT ndb/test/ndbapi/testTimeout.cpp: Add testcase for 4012 ndb/test/run-test/daily-basic-tests.txt: Add testcase for 4012 --- ndb/src/ndbapi/NdbConnection.cpp | 4 +- ndb/src/ndbapi/Ndbif.cpp | 12 +-- ndb/src/ndbapi/TransporterFacade.cpp | 14 ++++ ndb/src/ndbapi/TransporterFacade.hpp | 1 + ndb/test/ndbapi/testTimeout.cpp | 101 ++++++++++++++++++++++++ ndb/test/run-test/daily-basic-tests.txt | 4 + 6 files changed, 129 insertions(+), 7 deletions(-) diff --git a/ndb/src/ndbapi/NdbConnection.cpp b/ndb/src/ndbapi/NdbConnection.cpp index c9e26f8ccaf..9cd7d6ed42e 100644 --- a/ndb/src/ndbapi/NdbConnection.cpp +++ b/ndb/src/ndbapi/NdbConnection.cpp @@ -450,12 +450,12 @@ NdbConnection::executeNoBlobs(ExecType aTypeOfExec, //------------------------------------------------------------------------ Ndb* tNdb = theNdb; + Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout; m_waitForReply = false; executeAsynchPrepare(aTypeOfExec, NULL, NULL, abortOption); if (m_waitForReply){ while (1) { - int noOfComp = tNdb->sendPollNdb((3 * WAITFOR_RESPONSE_TIMEOUT), - 1, forceSend); + int noOfComp = tNdb->sendPollNdb(3 * timeout, 1, forceSend); if (noOfComp == 0) { /** * This timeout situation can occur if NDB crashes. 
diff --git a/ndb/src/ndbapi/Ndbif.cpp b/ndb/src/ndbapi/Ndbif.cpp index 3ebba7e1c4a..d753117aa9a 100644 --- a/ndb/src/ndbapi/Ndbif.cpp +++ b/ndb/src/ndbapi/Ndbif.cpp @@ -954,23 +954,25 @@ Ndb::pollCompleted(NdbConnection** aCopyArray) void Ndb::check_send_timeout() { + Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout; NDB_TICKS current_time = NdbTick_CurrentMillisecond(); if (current_time - the_last_check_time > 1000) { the_last_check_time = current_time; Uint32 no_of_sent = theNoOfSentTransactions; for (Uint32 i = 0; i < no_of_sent; i++) { NdbConnection* a_con = theSentTransactionsArray[i]; - if ((current_time - a_con->theStartTransTime) > - WAITFOR_RESPONSE_TIMEOUT) { + if ((current_time - a_con->theStartTransTime) > timeout) + { #ifdef VM_TRACE a_con->printState(); Uint32 t1 = a_con->theTransactionId; Uint32 t2 = a_con->theTransactionId >> 32; - ndbout_c("[%.8x %.8x]", t1, t2); - abort(); + ndbout_c("4012 [%.8x %.8x]", t1, t2); + //abort(); #endif + a_con->theReleaseOnClose = true; a_con->setOperationErrorCodeAbort(4012); - a_con->theCommitStatus = NdbConnection::Aborted; + a_con->theCommitStatus = NdbConnection::NeedAbort; a_con->theCompletionStatus = NdbConnection::CompletedFailure; a_con->handleExecuteCompletion(); remove_sent_list(i); diff --git a/ndb/src/ndbapi/TransporterFacade.cpp b/ndb/src/ndbapi/TransporterFacade.cpp index b6fb2d6cded..5e9147304eb 100644 --- a/ndb/src/ndbapi/TransporterFacade.cpp +++ b/ndb/src/ndbapi/TransporterFacade.cpp @@ -567,6 +567,20 @@ TransporterFacade::init(Uint32 nodeId, const ndb_mgm_configuration* props) } #endif + Uint32 timeout = 120000; + iter.first(); + for (iter.first(); iter.valid(); iter.next()) + { + Uint32 tmp1 = 0, tmp2 = 0; + iter.get(CFG_DB_TRANSACTION_CHECK_INTERVAL, &tmp1); + iter.get(CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, &tmp2); + tmp1 += tmp2; + if (tmp1 > timeout) + timeout = tmp1; + } + m_waitfor_timeout = timeout; + ndbout_c("Using waitfor: %d", timeout); + if (!theTransporterRegistry->start_service(m_socket_server)){ ndbout_c("Unable to start theTransporterRegistry->start_service"); DBUG_RETURN(false); diff --git a/ndb/src/ndbapi/TransporterFacade.hpp b/ndb/src/ndbapi/TransporterFacade.hpp index 99edea846c1..1e7377a3b4d 100644 --- a/ndb/src/ndbapi/TransporterFacade.hpp +++ b/ndb/src/ndbapi/TransporterFacade.hpp @@ -172,6 +172,7 @@ private: */ public: STATIC_CONST( MAX_NO_THREADS = 4711 ); + Uint32 m_waitfor_timeout; // in milli seconds... 
private: struct ThreadData { diff --git a/ndb/test/ndbapi/testTimeout.cpp b/ndb/test/ndbapi/testTimeout.cpp index 71c11b25859..25392698642 100644 --- a/ndb/test/ndbapi/testTimeout.cpp +++ b/ndb/test/ndbapi/testTimeout.cpp @@ -24,6 +24,7 @@ #define TIMEOUT (Uint32)3000 Uint32 g_org_timeout = 3000; +Uint32 g_org_deadlock = 3000; int setTransactionTimeout(NDBT_Context* ctx, NDBT_Step* step){ @@ -59,6 +60,60 @@ resetTransactionTimeout(NDBT_Context* ctx, NDBT_Step* step){ return NDBT_OK; } +int +setDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + int timeout = ctx->getProperty("TransactionDeadlockTimeout", TIMEOUT); + + NdbConfig conf(GETNDB(step)->getNodeId()+1); + unsigned int nodeId = conf.getMasterNodeId(); + if (!conf.getProperty(nodeId, + NODE_TYPE_DB, + CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, + &g_org_deadlock)) + return NDBT_FAILED; + + g_err << "Setting timeout: " << timeout << endl; + int val[] = { DumpStateOrd::TcSetTransactionTimeout, timeout }; + if(restarter.dumpStateAllNodes(val, 2) != 0){ + return NDBT_FAILED; + } + + return NDBT_OK; +} + +int +getDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + + Uint32 val = 0; + NdbConfig conf(GETNDB(step)->getNodeId()+1); + unsigned int nodeId = conf.getMasterNodeId(); + if (!conf.getProperty(nodeId, + NODE_TYPE_DB, + CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, + &val)) + return NDBT_FAILED; + + if (val < 120000) + val = 120000; + ctx->setProperty("TransactionDeadlockTimeout", 4*val); + + return NDBT_OK; +} + +int +resetDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + + int val[] = { DumpStateOrd::TcSetTransactionTimeout, g_org_deadlock }; + if(restarter.dumpStateAllNodes(val, 2) != 0){ + return NDBT_FAILED; + } + + return NDBT_OK; +} + int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){ @@ -374,6 +429,43 @@ int runBuddyTransNoTimeout(NDBT_Context* ctx, NDBT_Step* step){ return result; } +int +runError4012(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int stepNo = step->getStepNo(); + + int timeout = ctx->getProperty("TransactionDeadlockTimeout", TIMEOUT); + + HugoOperations hugoOps(*ctx->getTab()); + Ndb* pNdb = GETNDB(step); + + do{ + // Commit transaction + CHECK(hugoOps.startTransaction(pNdb) == 0); + CHECK(hugoOps.pkUpdateRecord(pNdb, 0) == 0); + int ret = hugoOps.execute_NoCommit(pNdb); + if (ret == 0) + { + int sleep = timeout; + ndbout << "Sleeping for " << sleep << " milliseconds" << endl; + NdbSleep_MilliSleep(sleep); + + // Expect that transaction has NOT timed-out + CHECK(hugoOps.execute_Commit(pNdb) == 0); + } + else + { + CHECK(ret == 4012); + } + } while(false); + + hugoOps.closeTransaction(pNdb); + + return result; +} + + NDBT_TESTSUITE(testTimeout); TESTCASE("DontTimeoutTransaction", "Test that the transaction does not timeout "\ @@ -465,6 +557,15 @@ TESTCASE("BuddyTransNoTimeout5", FINALIZER(resetTransactionTimeout); FINALIZER(runClearTable); } +TESTCASE("Error4012", ""){ + TC_PROPERTY("TransactionDeadlockTimeout", 120000); + INITIALIZER(runLoadTable); + INITIALIZER(getDeadlockTimeout); + INITIALIZER(setDeadlockTimeout); + STEPS(runError4012, 2); + FINALIZER(runClearTable); +} + NDBT_TESTSUITE_END(testTimeout); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 169daae6d7f..70518f7881d 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -236,6 +236,10 
@@ max-time: 500 cmd: testTimeout args: -n TimeoutRandTransaction T1 +max-time: 600 +cmd: testTimeout +args: -n Error4012 T1 + # SCAN TESTS # max-time: 500 From d230d0e1e6c7aa92bd6afabee378746d9d46c340 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 20 Mar 2006 14:53:29 +0100 Subject: [PATCH 07/15] ndb - wl2610, bug#18352 Remove useless and tricky state fiddleing in TC to syncronize NF_CompleteRep as code is already present in DIH aswell Keep broadcast of TAKEOVER_TCCONF for online upgrade ndb/src/kernel/blocks/dblqh/DblqhMain.cpp: Add clever dump for showing active operations ndb/src/kernel/blocks/dbtc/Dbtc.hpp: Remove useless and tricky state fiddleing in TC to syncronize NF_CompleteRep as code is already present in DIH aswell Keep broadcast of TAKEOVER_TCCONF for online upgrade ndb/src/kernel/blocks/dbtc/DbtcMain.cpp: Remove useless and tricky state fiddleing in TC to syncronize NF_CompleteRep as code is already present in DIH aswell Keep broadcast of TAKEOVER_TCCONF for online upgrade --- ndb/src/kernel/blocks/dblqh/DblqhMain.cpp | 166 +++++++++++++++++++ ndb/src/kernel/blocks/dbtc/Dbtc.hpp | 9 -- ndb/src/kernel/blocks/dbtc/DbtcMain.cpp | 186 +++++----------------- 3 files changed, 208 insertions(+), 153 deletions(-) diff --git a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index ff7e3c32924..0aeeaccd55e 100644 --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -18448,6 +18448,172 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal) c_error_insert_table_id = dumpState->args[1]; SET_ERROR_INSERT_VALUE(5042); } + + TcConnectionrec *regTcConnectionrec = tcConnectionrec; + Uint32 ttcConnectrecFileSize = ctcConnectrecFileSize; + Uint32 arg = dumpState->args[0]; + if(arg == 2306) + { + for(Uint32 i = 0; i<1024; i++) + { + TcConnectionrecPtr tcRec; + tcRec.i = ctransidHash[i]; + while(tcRec.i != RNIL) + { + ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec); + ndbout << "TcConnectionrec " << tcRec.i; + signal->theData[0] = 2307; + signal->theData[1] = tcRec.i; + execDUMP_STATE_ORD(signal); + tcRec.i = tcRec.p->nextHashRec; + } + } + } + + if(arg == 2307 || arg == 2308) + { + TcConnectionrecPtr tcRec; + tcRec.i = signal->theData[1]; + ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec); + + ndbout << " transactionState = " << tcRec.p->transactionState<theData[1]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); hostptr.p->hostStatus = HS_ALIVE; - hostptr.p->takeOverStatus = TOS_IDLE; signal->theData[0] = cownref; c_alive_nodes.set(hostptr.i); sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB); @@ -856,8 +855,6 @@ void Dbtc::execREAD_NODESCONF(Signal* signal) hostptr.i = i; ptrCheckGuard(hostptr, chostFilesize, hostRecord); - hostptr.p->takeOverStatus = TOS_IDLE; - if (NodeBitmask::get(readNodes->inactiveNodes, i)) { jam(); hostptr.p->hostStatus = HS_DEAD; @@ -6826,21 +6823,27 @@ void Dbtc::execNODE_FAILREP(Signal* signal) const Uint32 tnewMasterId = nodeFail->masterNodeId; arrGuard(tnoOfNodes, MAX_NDB_NODES); + Uint32 i; int index = 0; - for (unsigned i = 1; i< MAX_NDB_NODES; i++) { - if(NodeBitmask::get(nodeFail->theNodes, i)){ + for (i = 1; i< MAX_NDB_NODES; i++) + { + if(NodeBitmask::get(nodeFail->theNodes, i)) + { cdata[index] = i; index++; }//if }//for + cmasterNodeId = tnewMasterId; + tcNodeFailptr.i = 0; ptrAss(tcNodeFailptr, tcFailRecord); - Uint32 tindex; - for (tindex = 0; tindex < tnoOfNodes; tindex++) { + for (i = 0; i < tnoOfNodes; i++) + { jam(); - hostptr.i = 
cdata[tindex]; + hostptr.i = cdata[i]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); + /*------------------------------------------------------------*/ /* SET STATUS OF THE FAILED NODE TO DEAD SINCE IT HAS */ /* FAILED. */ @@ -6849,30 +6852,15 @@ void Dbtc::execNODE_FAILREP(Signal* signal) hostptr.p->m_nf_bits = HostRecord::NF_NODE_FAIL_BITS; c_alive_nodes.clear(hostptr.i); - if (hostptr.p->takeOverStatus == TOS_COMPLETED) { - jam(); - /*------------------------------------------------------------*/ - /* A VERY UNUSUAL SITUATION. THE TAKE OVER WAS COMPLETED*/ - /* EVEN BEFORE WE HEARD ABOUT THE NODE FAILURE REPORT. */ - /* HOWEVER UNUSUAL THIS SITUATION IS POSSIBLE. */ - /*------------------------------------------------------------*/ - /* RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE */ - /* REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */ - /* USED THEM IS COMPLETED. */ - /*------------------------------------------------------------*/ - hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER; - } else { - ndbrequire(hostptr.p->takeOverStatus == TOS_IDLE); - hostptr.p->takeOverStatus = TOS_NODE_FAILED; - }//if - - if (tcNodeFailptr.p->failStatus == FS_LISTENING) { + if (tcNodeFailptr.p->failStatus == FS_LISTENING) + { jam(); /*------------------------------------------------------------*/ /* THE CURRENT TAKE OVER CAN BE AFFECTED BY THIS NODE */ /* FAILURE. */ /*------------------------------------------------------------*/ - if (hostptr.p->lqhTransStatus == LTS_ACTIVE) { + if (hostptr.p->lqhTransStatus == LTS_ACTIVE) + { jam(); /*------------------------------------------------------------*/ /* WE WERE WAITING FOR THE FAILED NODE IN THE TAKE OVER */ @@ -6884,78 +6872,25 @@ void Dbtc::execNODE_FAILREP(Signal* signal) }//if }//if - }//for - - const bool masterFailed = (cmasterNodeId != tnewMasterId); - cmasterNodeId = tnewMasterId; - - if(getOwnNodeId() == cmasterNodeId && masterFailed){ - /** - * Master has failed and I'm the new master - */ - jam(); - - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { + if (getOwnNodeId() != tnewMasterId) + { jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus != HS_ALIVE) { - jam(); - if (hostptr.p->takeOverStatus == TOS_COMPLETED) { - jam(); - /*------------------------------------------------------------*/ - /* SEND TAKE OVER CONFIRMATION TO ALL ALIVE NODES IF */ - /* TAKE OVER IS COMPLETED. THIS IS PERFORMED TO ENSURE */ - /* THAT ALL NODES AGREE ON THE IDLE STATE OF THE TAKE */ - /* OVER. THIS MIGHT BE MISSED IN AN ERROR SITUATION IF */ - /* MASTER FAILS AFTER SENDING CONFIRMATION TO NEW */ - /* MASTER BUT FAILING BEFORE SENDING TO ANOTHER NODE */ - /* WHICH WAS NOT MASTER. IF THIS NODE LATER BECOMES */ - /* MASTER IT MIGHT START A NEW TAKE OVER EVEN AFTER THE */ - /* CRASHED NODE HAVE ALREADY RECOVERED. 
*/ - /*------------------------------------------------------------*/ - NodeReceiverGroup rg(DBTC, c_alive_nodes); - signal->theData[0] = hostptr.i; - sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB); - }//if - }//if - }//for - } - - if(getOwnNodeId() == cmasterNodeId){ - jam(); - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { + /** + * Only master does takeover currently + */ + hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER; + } + else + { jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus != HS_ALIVE) { - jam(); - if (hostptr.p->takeOverStatus == TOS_NODE_FAILED) { - jam(); - /*------------------------------------------------------------*/ - /* CONCLUDE ALL ACTIVITIES THE FAILED TC DID CONTROL */ - /* SINCE WE ARE THE MASTER. THIS COULD HAVE BEEN STARTED*/ - /* BY A PREVIOUS MASTER BUT HAVE NOT BEEN CONCLUDED YET.*/ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_ACTIVE; - signal->theData[0] = hostptr.i; - sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); - }//if - }//if - }//for - }//if - for (tindex = 0; tindex < tnoOfNodes; tindex++) { - jam(); - hostptr.i = cdata[tindex]; - ptrCheckGuard(hostptr, chostFilesize, hostRecord); - /*------------------------------------------------------------*/ - /* LOOP THROUGH AND ABORT ALL SCANS THAT WHERE */ - /* CONTROLLED BY THIS TC AND ACTIVE IN THE FAILED */ - /* NODE'S LQH */ - /*------------------------------------------------------------*/ + signal->theData[0] = hostptr.i; + sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); + } + checkScanActiveInFailedLqh(signal, 0, hostptr.i); checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid nodeFailCheckTransactions(signal, 0, hostptr.i); - }//for - + } }//Dbtc::execNODE_FAILREP() void @@ -7071,47 +7006,17 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* signal) tfailedNodeId = signal->theData[0]; hostptr.i = tfailedNodeId; ptrCheckGuard(hostptr, chostFilesize, hostRecord); - switch (hostptr.p->takeOverStatus) { - case TOS_IDLE: + + ndbout_c("received execTAKE_OVERTCCONF(%d) from %x (%x)", + tfailedNodeId, signal->getSendersBlockRef(), reference()); + if (signal->getSendersBlockRef() != reference()) + { jam(); - /*------------------------------------------------------------*/ - /* THIS MESSAGE ARRIVED EVEN BEFORE THE NODE_FAILREP */ - /* MESSAGE. THIS IS POSSIBLE IN EXTREME SITUATIONS. */ - /* WE SET THE STATE TO TAKE_OVER_COMPLETED AND WAIT */ - /* FOR THE NODE_FAILREP MESSAGE. */ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_COMPLETED; - break; - case TOS_NODE_FAILED: - case TOS_ACTIVE: - jam(); - /*------------------------------------------------------------*/ - /* WE ARE NOT MASTER AND THE TAKE OVER IS ACTIVE OR WE */ - /* ARE MASTER AND THE TAKE OVER IS ACTIVE. IN BOTH */ - /* WE SET THE STATE TO TAKE_OVER_COMPLETED. */ - /*------------------------------------------------------------*/ - /* RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE */ - /* REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */ - /* USED THEM IS COMPLETED. */ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_COMPLETED; - checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER); - break; - case TOS_COMPLETED: - jam(); - /*------------------------------------------------------------*/ - /* WE HAVE ALREADY RECEIVED THE CONF SIGNAL. 
IT IS MOST */ - /* LIKELY SENT FROM A NEW MASTER WHICH WASN'T SURE IF */ - /* THIS NODE HEARD THE CONF SIGNAL FROM THE OLD MASTER. */ - /* WE SIMPLY IGNORE THE MESSAGE. */ - /*------------------------------------------------------------*/ - /*empty*/; - break; - default: - jam(); - systemErrorLab(signal); return; - }//switch + } + + + checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER); }//Dbtc::execTAKE_OVERTCCONF() void Dbtc::execTAKE_OVERTCREQ(Signal* signal) @@ -7351,16 +7256,10 @@ void Dbtc::completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd) /* TO REPORT THE COMPLETION OF THE TAKE OVER TO ALL */ /* NODES THAT ARE ALIVE. */ /*------------------------------------------------------------*/ - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { - jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus == HS_ALIVE) { - jam(); - tblockref = calcTcBlockRef(hostptr.i); - signal->theData[0] = tcNodeFailptr.p->takeOverNode; - sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB); - }//if - }//for + NodeReceiverGroup rg(DBTC, c_alive_nodes); + signal->theData[0] = tcNodeFailptr.p->takeOverNode; + sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB); + if (tcNodeFailptr.p->queueIndex > 0) { jam(); /*------------------------------------------------------------*/ @@ -9937,7 +9836,6 @@ void Dbtc::inithost(Signal* signal) ptrAss(hostptr, hostRecord); hostptr.p->hostStatus = HS_DEAD; hostptr.p->inPackedList = false; - hostptr.p->takeOverStatus = TOS_NOT_DEFINED; hostptr.p->lqhTransStatus = LTS_IDLE; hostptr.p->noOfWordsTCKEYCONF = 0; hostptr.p->noOfWordsTCINDXCONF = 0; From ad6dcfb1277b3b0a8692c3bfd802ba48cc3fe537 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 20 Mar 2006 14:55:14 +0100 Subject: [PATCH 08/15] ndb - bug#18352 remove debug prinout --- ndb/src/ndbapi/TransporterFacade.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ndb/src/ndbapi/TransporterFacade.cpp b/ndb/src/ndbapi/TransporterFacade.cpp index 5e9147304eb..30d0eec1e4a 100644 --- a/ndb/src/ndbapi/TransporterFacade.cpp +++ b/ndb/src/ndbapi/TransporterFacade.cpp @@ -579,7 +579,6 @@ TransporterFacade::init(Uint32 nodeId, const ndb_mgm_configuration* props) timeout = tmp1; } m_waitfor_timeout = timeout; - ndbout_c("Using waitfor: %d", timeout); if (!theTransporterRegistry->start_service(m_socket_server)){ ndbout_c("Unable to start theTransporterRegistry->start_service"); From 8ed36cb667b675244f55072cefa15fb65ec89ee7 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 21 Mar 2006 14:47:10 +0100 Subject: [PATCH 09/15] ndb - bug#18385 Partial system restart, can not try to start with higher GCI that own even if knowing about a higher number ndb/include/kernel/signaldata/DumpStateOrd.hpp: Add new dump for setting time between gcp ndb/include/kernel/signaldata/StartPerm.hpp: Move error codes into StartPerm + Add new error code ndb/src/kernel/blocks/ERROR_codes.txt: Add new error insert ndb/src/kernel/blocks/dbdih/Dbdih.hpp: Move error codes into StartPerm + Add new error code ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: Fix so that we don't try to restart to a too new GCI when doing a partial start Add new error code when this node later tries to join ndb/test/include/NdbRestarter.hpp: Add new method for selecting random node ndb/test/ndbapi/testSystemRestart.cpp: Add new testcase for bug#18385 ndb/test/run-test/daily-basic-tests.txt: Run test in daily-basic ndb/test/src/NdbRestarter.cpp: Add new method for selecting random node --- .../kernel/signaldata/DumpStateOrd.hpp | 1 + 
ndb/include/kernel/signaldata/StartPerm.hpp | 6 ++ ndb/src/kernel/blocks/ERROR_codes.txt | 2 + ndb/src/kernel/blocks/dbdih/Dbdih.hpp | 1 - ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 99 ++++++++++++++----- ndb/test/include/NdbRestarter.hpp | 1 + ndb/test/ndbapi/testSystemRestart.cpp | 53 ++++++++++ ndb/test/run-test/daily-basic-tests.txt | 4 + ndb/test/src/NdbRestarter.cpp | 33 +++++++ 9 files changed, 177 insertions(+), 23 deletions(-) diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp index 4dd22cf5092..2c824670cef 100644 --- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -127,6 +127,7 @@ public: DihMinTimeBetweenLCP = 7017, DihMaxTimeBetweenLCP = 7018, EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP + DihSetTimeBetweenGcp = 7090, DihStartLcpImmediately = 7099, // 8000 Suma // 12000 Tux diff --git a/ndb/include/kernel/signaldata/StartPerm.hpp b/ndb/include/kernel/signaldata/StartPerm.hpp index 38be72835a3..63e01ed3868 100644 --- a/ndb/include/kernel/signaldata/StartPerm.hpp +++ b/ndb/include/kernel/signaldata/StartPerm.hpp @@ -64,5 +64,11 @@ private: Uint32 startingNodeId; Uint32 errorCode; + + enum ErrorCode + { + ZNODE_ALREADY_STARTING_ERROR = 305, + InitialStartRequired = 320 + }; }; #endif diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt index 62481837c14..e5576450846 100644 --- a/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/ndb/src/kernel/blocks/ERROR_codes.txt @@ -303,6 +303,8 @@ Test Crashes in handling node restarts 7131: Crash when receiving START_COPYREQ in master node 7132: Crash when receiving START_COPYCONF in starting node +7170: Crash when receiving START_PERMREF (InitialStartRequired) + DICT: 6000 Crash during NR when receiving DICTSTARTREQ 6001 Crash during NR when receiving SCHEMA_INFO diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index f74c0f36c4d..78acf1ffd19 100644 --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -81,7 +81,6 @@ #define ZWRONG_FAILURE_NUMBER_ERROR 302 #define ZWRONG_START_NODE_ERROR 303 #define ZNO_REPLICA_FOUND_ERROR 304 -#define ZNODE_ALREADY_STARTING_ERROR 305 #define ZNODE_START_DISALLOWED_ERROR 309 // -------------------------------------- diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index fab428aadef..eb4ae61a3e4 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -1420,6 +1420,33 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref) return; } + NodeRecordPtr nodePtr; + Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()]; + for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) + { + jam(); + ptrAss(nodePtr, nodeRecord); + if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) + { + jam(); + /** + * Since we're starting(is master) and there + * there are other nodes with higher GCI... + * there gci's must be invalidated... 
+ * and they _must_ do an initial start + * indicate this by setting lastCompletedGCI = 0 + */ + SYSFILE->lastCompletedGCI[nodePtr.i] = 0; + ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE); + warningEvent("Making filesystem for node %d unusable", + nodePtr.i); + } + } + /** + * This set which GCI we will try to restart to + */ + SYSFILE->newestRestorableGCI = gci; + ndbrequire(isMaster()); copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file! }//Dbdih::ndbStartReqLab() @@ -1557,7 +1584,7 @@ void Dbdih::execSTART_PERMREF(Signal* signal) { jamEntry(); Uint32 errorCode = signal->theData[1]; - if (errorCode == ZNODE_ALREADY_STARTING_ERROR) { + if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) { jam(); /*-----------------------------------------------------------------------*/ // The master was busy adding another node. We will wait for a second and @@ -1567,6 +1594,20 @@ void Dbdih::execSTART_PERMREF(Signal* signal) sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1); return; }//if + + if (errorCode == StartPermRef::InitialStartRequired) + { + CRASH_INSERTION(7170); + char buf[255]; + BaseString::snprintf(buf, sizeof(buf), + "Cluster requires this node to be started " + " with --initial as partial start has been performed" + " and this filesystem is unusable"); + progError(__LINE__, + ERR_SR_RESTARTCONFLICT, + buf); + ndbrequire(false); + } /*------------------------------------------------------------------------*/ // Some node process in another node involving our node was still active. We // will recover from this by crashing here. @@ -1657,7 +1698,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal) (c_nodeStartMaster.wait != ZFALSE)) { jam(); signal->theData[0] = nodeId; - signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR; + signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR; sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB); return; }//if @@ -1667,6 +1708,16 @@ void Dbdih::execSTART_PERMREQ(Signal* signal) ndbrequire(false); }//if + if (SYSFILE->lastCompletedGCI[nodeId] == 0 && + typeStart != NodeState::ST_INITIAL_NODE_RESTART) + { + jam(); + signal->theData[0] = nodeId; + signal->theData[1] = StartPermRef::InitialStartRequired; + sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB); + return; + } + /*---------------------------------------------------------------------- * WE START THE INCLUSION PROCEDURE * ---------------------------------------------------------------------*/ @@ -3515,24 +3566,12 @@ void Dbdih::closingGcpLab(Signal* signal, FileRecordPtr filePtr) /* ------------------------------------------------------------------------- */ void Dbdih::selectMasterCandidateAndSend(Signal* signal) { - Uint32 gci = 0; - Uint32 masterCandidateId = 0; - NodeRecordPtr nodePtr; - for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { - jam(); - ptrAss(nodePtr, nodeRecord); - if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) { - jam(); - masterCandidateId = nodePtr.i; - gci = SYSFILE->lastCompletedGCI[nodePtr.i]; - }//if - }//for - ndbrequire(masterCandidateId != 0); setNodeGroups(); - signal->theData[0] = masterCandidateId; - signal->theData[1] = gci; + signal->theData[0] = getOwnNodeId(); + signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()]; sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB); - + + NodeRecordPtr nodePtr; Uint32 node_groups[MAX_NDB_NODES]; memset(node_groups, 0, sizeof(node_groups)); for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { @@ -3550,10 +3589,10 @@ void 
Dbdih::selectMasterCandidateAndSend(Signal* signal) if(count != 0 && count != cnoReplicas){ char buf[255]; BaseString::snprintf(buf, sizeof(buf), - "Illegal configuration change." - " Initial start needs to be performed " - " when changing no of replicas (%d != %d)", - node_groups[nodePtr.i], cnoReplicas); + "Illegal configuration change." + " Initial start needs to be performed " + " when changing no of replicas (%d != %d)", + node_groups[nodePtr.i], cnoReplicas); progError(__LINE__, ERR_INVALID_CONFIG, buf); @@ -13359,6 +13398,22 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) c_lcpState.ctimer += (1 << c_lcpState.clcpDelay); return; } + + if (dumpState->args[0] == DumpStateOrd::DihSetTimeBetweenGcp) + { + if (signal->getLength() == 1) + { + const ndb_mgm_configuration_iterator * p = + theConfiguration.getOwnConfigIterator(); + ndbrequire(p != 0); + ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay); + } + else + { + cgcpDelay = signal->theData[1]; + } + ndbout_c("Setting time between gcp : %d", cgcpDelay); + } }//Dbdih::execDUMP_STATE_ORD() void diff --git a/ndb/test/include/NdbRestarter.hpp b/ndb/test/include/NdbRestarter.hpp index 19a88b4f8ad..3ec92ae786e 100644 --- a/ndb/test/include/NdbRestarter.hpp +++ b/ndb/test/include/NdbRestarter.hpp @@ -62,6 +62,7 @@ public: int dumpStateAllNodes(int * _args, int _num_args); int getMasterNodeId(); + int getRandomNodeSameNodeGroup(int nodeId, int randomNumber); int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber); int getRandomNotMasterNodeId(int randomNumber); diff --git a/ndb/test/ndbapi/testSystemRestart.cpp b/ndb/test/ndbapi/testSystemRestart.cpp index 35016896495..30f7aca9b06 100644 --- a/ndb/test/ndbapi/testSystemRestart.cpp +++ b/ndb/test/ndbapi/testSystemRestart.cpp @@ -1051,6 +1051,52 @@ int runSystemRestart9(NDBT_Context* ctx, NDBT_Step* step){ return result; } +int runBug18385(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + const Uint32 nodeCount = restarter.getNumDbNodes(); + if(nodeCount < 2){ + g_info << "Bug18385 - Needs atleast 2 nodes to test" << endl; + return NDBT_OK; + } + + int node1 = restarter.getDbNodeId(rand() % nodeCount); + int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand()); + + if (node1 == -1 || node2 == -1) + return NDBT_OK; + + int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 300 }; + + int result = NDBT_OK; + do { + CHECK(restarter.dumpStateAllNodes(dump, 2) == 0); + CHECK(restarter.restartOneDbNode(node1, false, true, false) == 0); + NdbSleep_SecSleep(3); + CHECK(restarter.restartAll(false, true, false) == 0); + + Uint32 cnt = 0; + int nodes[128]; + for(Uint32 i = 0; i Date: Tue, 21 Mar 2006 15:13:41 +0100 Subject: [PATCH 10/15] ndb - bug#18118 timeslice DUMP(7015) ndb/include/kernel/signaldata/DumpStateOrd.hpp: doc... 
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: timeslice DUMP(7015) --- .../kernel/signaldata/DumpStateOrd.hpp | 3 + ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 126 ++++++++++-------- 2 files changed, 77 insertions(+), 52 deletions(-) diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp index 2c824670cef..b42b930711c 100644 --- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -126,6 +126,9 @@ public: DihAllAllowNodeStart = 7016, DihMinTimeBetweenLCP = 7017, DihMaxTimeBetweenLCP = 7018, + // 7019 + // 7020 + // 7021 EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP DihSetTimeBetweenGcp = 7090, DihStartLcpImmediately = 7099, diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index eb4ae61a3e4..a8633af2529 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -5983,6 +5983,7 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) execDUMP_STATE_ORD(signal); signal->theData[0] = 7015; + signal->theData[1] = 0; execDUMP_STATE_ORD(signal); c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__); @@ -13036,7 +13037,8 @@ void Dbdih::execDUMP_STATE_ORD(Signal* signal) { DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0]; - if (dumpState->args[0] == DumpStateOrd::DihDumpNodeRestartInfo) { + Uint32 arg = dumpState->args[0]; + if (arg == DumpStateOrd::DihDumpNodeRestartInfo) { infoEvent("c_nodeStartMaster.blockLcp = %d, c_nodeStartMaster.blockGcp = %d, c_nodeStartMaster.wait = %d", c_nodeStartMaster.blockLcp, c_nodeStartMaster.blockGcp, c_nodeStartMaster.wait); infoEvent("cstartGcpNow = %d, cgcpStatus = %d", @@ -13046,7 +13048,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent("cgcpOrderBlocked = %d, cgcpStartCounter = %d", cgcpOrderBlocked, cgcpStartCounter); }//if - if (dumpState->args[0] == DumpStateOrd::DihDumpNodeStatusInfo) { + if (arg == DumpStateOrd::DihDumpNodeStatusInfo) { NodeRecordPtr localNodePtr; infoEvent("Printing nodeStatus of all nodes"); for (localNodePtr.i = 1; localNodePtr.i < MAX_NDB_NODES; localNodePtr.i++) { @@ -13058,7 +13060,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) }//for }//if - if (dumpState->args[0] == DumpStateOrd::DihPrintFragmentation){ + if (arg == DumpStateOrd::DihPrintFragmentation){ infoEvent("Printing fragmentation of all tables --"); for(Uint32 i = 0; iargs[0] == 7019 && signal->getLength() == 2) + if(arg == 7019 && signal->getLength() == 2) { char buf2[8+1]; NodeRecordPtr nodePtr; @@ -13251,7 +13253,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) nodePtr.p->m_nodefailSteps.getText(buf2)); } - if(dumpState->args[0] == 7020 && signal->getLength() > 3) + if(arg == 7020 && signal->getLength() > 3) { Uint32 gsn= signal->theData[1]; Uint32 block= signal->theData[2]; @@ -13275,7 +13277,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) gsn, getBlockName(block, "UNKNOWN"), length, buf); } - if(dumpState->args[0] == DumpStateOrd::DihDumpLCPState){ + if(arg == DumpStateOrd::DihDumpLCPState){ infoEvent("-- Node %d LCP STATE --", getOwnNodeId()); infoEvent("lcpStatus = %d (update place = %d) ", c_lcpState.lcpStatus, c_lcpState.lcpStatusUpdatedPlace); @@ -13291,7 +13293,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent("-- Node %d LCP STATE --", getOwnNodeId()); } - if(dumpState->args[0] == DumpStateOrd::DihDumpLCPMasterTakeOver){ + if(arg == DumpStateOrd::DihDumpLCPMasterTakeOver){ infoEvent("-- Node %d LCP MASTER TAKE OVER STATE 
--", getOwnNodeId()); infoEvent ("c_lcpMasterTakeOverState.state = %d updatePlace = %d failedNodeId = %d", @@ -13306,52 +13308,25 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId()); } - if (signal->theData[0] == 7015){ - for(Uint32 i = 0; itabStatus != TabRecord::TS_ACTIVE) - continue; - - infoEvent - ("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d", - tabPtr.i, - tabPtr.p->tabCopyStatus, - tabPtr.p->tabUpdateState, - tabPtr.p->tabLcpStatus); + if (signal->theData[0] == 7015) + { + if (signal->getLength() == 1) + { + signal->theData[1] = 0; + } - FragmentstorePtr fragPtr; - for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) { - jam(); - getFragstore(tabPtr.p, fid, fragPtr); - - char buf[100], buf2[100]; - BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ", - fid, fragPtr.p->noLcpReplicas); - - Uint32 num=0; - ReplicaRecordPtr replicaPtr; - replicaPtr.i = fragPtr.p->storedReplicas; - do { - ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord); - BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)", - buf, num, - replicaPtr.p->procNode, - replicaPtr.p->lcpIdStarted, - replicaPtr.p->lcpOngoingFlag ? "Ongoing" : "Idle"); - BaseString::snprintf(buf, sizeof(buf), "%s", buf2); - - num++; - replicaPtr.i = replicaPtr.p->nextReplica; - } while (replicaPtr.i != RNIL); - infoEvent(buf); - } + Uint32 tableId = signal->theData[1]; + if (tableId < ctabFileSize) + { + signal->theData[0] = 7021; + execDUMP_STATE_ORD(signal); + signal->theData[0] = 7015; + signal->theData[1] = tableId + 1; + sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 2, JBB); } } - if(dumpState->args[0] == DumpStateOrd::EnableUndoDelayDataWrite){ + if(arg == DumpStateOrd::EnableUndoDelayDataWrite){ ndbout << "Dbdih:: delay write of datapages for table = " << dumpState->args[1]<< endl; // Send this dump to ACC and TUP @@ -13381,7 +13356,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) return; } - if(dumpState->args[0] == 7098){ + if(arg == 7098){ if(signal->length() == 3){ jam(); infoEvent("startLcpRoundLoopLab(tabel=%d, fragment=%d)", @@ -13394,12 +13369,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) } } - if(dumpState->args[0] == DumpStateOrd::DihStartLcpImmediately){ + if(arg == DumpStateOrd::DihStartLcpImmediately){ c_lcpState.ctimer += (1 << c_lcpState.clcpDelay); return; } - if (dumpState->args[0] == DumpStateOrd::DihSetTimeBetweenGcp) + if (arg == DumpStateOrd::DihSetTimeBetweenGcp) { if (signal->getLength() == 1) { @@ -13414,6 +13389,53 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) } ndbout_c("Setting time between gcp : %d", cgcpDelay); } + + if (arg == 7021 && signal->getLength() == 2) + { + TabRecordPtr tabPtr; + tabPtr.i = signal->theData[1]; + if (tabPtr.i >= ctabFileSize) + return; + + ptrCheckGuard(tabPtr, ctabFileSize, tabRecord); + + if(tabPtr.p->tabStatus != TabRecord::TS_ACTIVE) + return; + + infoEvent + ("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d", + tabPtr.i, + tabPtr.p->tabCopyStatus, + tabPtr.p->tabUpdateState, + tabPtr.p->tabLcpStatus); + + FragmentstorePtr fragPtr; + for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) { + jam(); + getFragstore(tabPtr.p, fid, fragPtr); + + char buf[100], buf2[100]; + BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ", + fid, fragPtr.p->noLcpReplicas); + + Uint32 num=0; + ReplicaRecordPtr replicaPtr; + replicaPtr.i = fragPtr.p->storedReplicas; + do { + ptrCheckGuard(replicaPtr, creplicaFileSize, 
replicaRecord); + BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)", + buf, num, + replicaPtr.p->procNode, + replicaPtr.p->lcpIdStarted, + replicaPtr.p->lcpOngoingFlag ? "Ongoing" : "Idle"); + BaseString::snprintf(buf, sizeof(buf), "%s", buf2); + + num++; + replicaPtr.i = replicaPtr.p->nextReplica; + } while (replicaPtr.i != RNIL); + infoEvent(buf); + } + } }//Dbdih::execDUMP_STATE_ORD() void From 19340f2242443ec54101d7fd518be47211ed0f15 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 22 Mar 2006 11:44:31 +0100 Subject: [PATCH 11/15] ndb - bug#18414 Fix timeout during ABORT when ZABORT_TIMEOUT_BREAK is outstanding ndb/src/kernel/blocks/ERROR_codes.txt: New error code ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: remove dumping of LCP info during NF ndb/src/kernel/blocks/dbtc/DbtcMain.cpp: Fix timeout during ABORT when ZABORT_TIMEOUT_BREAK is outstanding ndb/test/ndbapi/testNodeRestart.cpp: Add testcase for bug18414 ndb/test/ndbapi/testTimeout.cpp: Fix error code checking ndb/test/run-test/daily-basic-tests.txt: Add testcase for bug18414 --- ndb/src/kernel/blocks/ERROR_codes.txt | 2 + ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 4 -- ndb/src/kernel/blocks/dbtc/DbtcMain.cpp | 52 +++++++++++++--- ndb/test/ndbapi/testNodeRestart.cpp | 73 +++++++++++++++++++++++ ndb/test/ndbapi/testTimeout.cpp | 7 ++- ndb/test/run-test/daily-basic-tests.txt | 4 ++ 6 files changed, 128 insertions(+), 14 deletions(-) diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt index e5576450846..b4c5d1b1d7e 100644 --- a/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/ndb/src/kernel/blocks/ERROR_codes.txt @@ -226,6 +226,8 @@ Delay execution of COMPLETECONF signal 2 seconds to generate time-out. 8045: (ABORTCONF only as part of take-over) Delay execution of ABORTCONF signal 2 seconds to generate time-out. +8050: Send ZABORT_TIMEOUT_BREAK delayed + ERROR CODES FOR TESTING TIME-OUT HANDLING IN DBTC ------------------------------------------------- diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index a8633af2529..de35ce5c275 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -5982,10 +5982,6 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) signal->theData[0] = 7012; execDUMP_STATE_ORD(signal); - signal->theData[0] = 7015; - signal->theData[1] = 0; - execDUMP_STATE_ORD(signal); - c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__); checkLocalNodefailComplete(signal, failedNodePtr.i, NF_LCP_TAKE_OVER); diff --git a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp index ff9b279592c..4ca13bf433b 100644 --- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp +++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp @@ -6386,6 +6386,7 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) return; } + bool found = false; OperationState tmp[16]; Uint32 TloopCount = 0; @@ -6393,7 +6394,31 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) jam(); if (tcConnectptr.i == RNIL) { jam(); - if (Tcheck == 0) { + +#ifdef VM_TRACE + ndbout_c("found: %d Tcheck: %d apiConnectptr.p->counter: %d", + found, Tcheck, apiConnectptr.p->counter); +#endif + if (found || apiConnectptr.p->counter) + { + jam(); + /** + * We sent atleast one ABORT/ABORTED + * or ZABORT_TIMEOUT_BREAK is in job buffer + * wait for reception... 
+ */ + return; + } + + if (Tcheck == 1) + { + jam(); + releaseAbortResources(signal); + return; + } + + if (Tcheck == 0) + { jam(); /*------------------------------------------------------------------ * All nodes had already reported ABORTED for all tcConnect records. @@ -6402,9 +6427,11 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) *------------------------------------------------------------------*/ char buf[96]; buf[0] = 0; char buf2[96]; - BaseString::snprintf(buf, sizeof(buf), "TC %d: %d ops:", - __LINE__, apiConnectptr.i); - for(Uint32 i = 0; icounter); + for(Uint32 i = 0; itheData[0] = TcContinueB::ZABORT_TIMEOUT_BREAK; signal->theData[1] = tcConnectptr.i; signal->theData[2] = apiConnectptr.i; - sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); + if (ERROR_INSERTED(8050)) + { + ndbout_c("sending ZABORT_TIMEOUT_BREAK delayed (%d %d)", + Tcheck, apiConnectptr.p->counter); + sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 2000, 3); + } + else + { + sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); + } return; }//if ptrCheckGuard(tcConnectptr, ctcConnectFilesize, tcConnectRecord); @@ -6450,7 +6488,7 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) jam(); if (tcConnectptr.p->tcNodedata[Ti] != 0) { TloopCount += 31; - Tcheck = 1; + found = true; hostptr.i = tcConnectptr.p->tcNodedata[Ti]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); if (hostptr.p->hostStatus == HS_ALIVE) { @@ -7007,8 +7045,6 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* signal) hostptr.i = tfailedNodeId; ptrCheckGuard(hostptr, chostFilesize, hostRecord); - ndbout_c("received execTAKE_OVERTCCONF(%d) from %x (%x)", - tfailedNodeId, signal->getSendersBlockRef(), reference()); if (signal->getSendersBlockRef() != reference()) { jam(); diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index eebd631af94..cc2998ff73a 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -581,6 +581,73 @@ runBug16772(NDBT_Context* ctx, NDBT_Step* step){ return ret ? 
NDBT_OK : NDBT_FAILED; } +int +runBug18414(NDBT_Context* ctx, NDBT_Step* step){ + + NdbRestarter restarter; + if (restarter.getNumDbNodes() < 2) + { + ctx->stopTest(); + return NDBT_OK; + } + + Ndb* pNdb = GETNDB(step); + HugoOperations hugoOps(*ctx->getTab()); + HugoTransactions hugoTrans(*ctx->getTab()); + int loop = 0; + do + { + if(hugoOps.startTransaction(pNdb) != 0) + goto err; + + if(hugoOps.pkUpdateRecord(pNdb, 0, 128, rand()) != 0) + goto err; + + if(hugoOps.execute_NoCommit(pNdb) != 0) + goto err; + + int node1 = hugoOps.getTransaction()->getConnectedNodeId(); + int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand()); + + if (node1 == -1 || node2 == -1) + break; + + if (loop & 1) + { + if (restarter.insertErrorInNode(node1, 8050)) + goto err; + } + + if (restarter.insertErrorInNode(node2, 5003)) + goto err; + + int res= hugoOps.execute_Rollback(pNdb); + + if (restarter.waitNodesNoStart(&node2, 1) != 0) + goto err; + + if (restarter.insertErrorInAllNodes(0)) + goto err; + + if (restarter.startNodes(&node2, 1) != 0) + goto err; + + if (restarter.waitClusterStarted() != 0) + goto err; + + if (hugoTrans.scanUpdateRecords(pNdb, 128) != 0) + goto err; + + hugoOps.closeTransaction(pNdb); + + } while(++loop < 5); + + return NDBT_OK; + +err: + hugoOps.closeTransaction(pNdb); + return NDBT_FAILED; +} NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", @@ -870,6 +937,12 @@ TESTCASE("Bug16772", "Test bug with restarting before NF handling is complete"){ STEP(runBug16772); } +TESTCASE("Bug18414", + "Test bug with NF during NR"){ + INITIALIZER(runLoadTable); + STEP(runBug18414); + FINALIZER(runClearTable); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/ndbapi/testTimeout.cpp b/ndb/test/ndbapi/testTimeout.cpp index 25392698642..957fcd1d1e7 100644 --- a/ndb/test/ndbapi/testTimeout.cpp +++ b/ndb/test/ndbapi/testTimeout.cpp @@ -173,8 +173,11 @@ int runTimeoutTrans(NDBT_Context* ctx, NDBT_Step* step){ NdbSleep_MilliSleep(sleep); // Expect that transaction has timed-out - CHECK(hugoOps.execute_Commit(pNdb) == 237); - + int ret = hugoOps.execute_Commit(pNdb); + CHECK(ret != 0); + NdbError err = pNdb->getNdbError(ret); + CHECK(err.classification == NdbError::TimeoutExpired); + } while(false); hugoOps.closeTransaction(pNdb); diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 0533d585a41..b11e4479a57 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -458,6 +458,10 @@ max-time: 500 cmd: testSystemRestart args: -n Bug18385 T1 +max-time: 500 +cmd: testNodeRestart +args: -n Bug18414 T1 + # OLD FLEX max-time: 500 cmd: flexBench From ad911e8575e84fb336143b5463711ba8dfc7690b Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 22 Mar 2006 12:11:51 +0100 Subject: [PATCH 12/15] ndb - minor update to ndb-autotest.sh and config files ndb/test/run-test/conf-daily-devel-ndbmaster.txt: Add SendBufferMemory to remove rare overruns ndb/test/run-test/conf-dl145a.txt: Add SendBufferMemory to remove rare overruns ndb/test/run-test/conf-ndbmaster.txt: Add SendBufferMemory to remove rare overruns ndb/test/run-test/conf-shark.txt: Add SendBufferMemory to remove rare overruns ndb/test/run-test/ndb-autotest.sh: Add support for conf per host --- ndb/test/run-test/conf-daily-devel-ndbmaster.txt | 3 +++ .../run-test/{conf-daily-basic-dl145a.txt => conf-dl145a.txt} | 3 +++ .../{conf-daily-basic-ndbmaster.txt => conf-ndbmaster.txt} | 3 +++ 
.../run-test/{conf-daily-basic-shark.txt => conf-shark.txt} | 3 +++ ndb/test/run-test/ndb-autotest.sh | 3 +++ 5 files changed, 15 insertions(+) rename ndb/test/run-test/{conf-daily-basic-dl145a.txt => conf-dl145a.txt} (91%) rename ndb/test/run-test/{conf-daily-basic-ndbmaster.txt => conf-ndbmaster.txt} (91%) rename ndb/test/run-test/{conf-daily-basic-shark.txt => conf-shark.txt} (91%) diff --git a/ndb/test/run-test/conf-daily-devel-ndbmaster.txt b/ndb/test/run-test/conf-daily-devel-ndbmaster.txt index 8b340e6a39d..51c171a6357 100644 --- a/ndb/test/run-test/conf-daily-devel-ndbmaster.txt +++ b/ndb/test/run-test/conf-daily-devel-ndbmaster.txt @@ -17,3 +17,6 @@ FileSystemPath: /space/autotest/run PortNumber: 16000 ArbitrationRank: 1 DataDir: . + +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/conf-daily-basic-dl145a.txt b/ndb/test/run-test/conf-dl145a.txt similarity index 91% rename from ndb/test/run-test/conf-daily-basic-dl145a.txt rename to ndb/test/run-test/conf-dl145a.txt index d8cf8d34d82..d0a240f09d1 100644 --- a/ndb/test/run-test/conf-daily-basic-dl145a.txt +++ b/ndb/test/run-test/conf-dl145a.txt @@ -17,3 +17,6 @@ FileSystemPath: /home/ndbdev/autotest/run PortNumber: 14000 ArbitrationRank: 1 DataDir: . + +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/conf-daily-basic-ndbmaster.txt b/ndb/test/run-test/conf-ndbmaster.txt similarity index 91% rename from ndb/test/run-test/conf-daily-basic-ndbmaster.txt rename to ndb/test/run-test/conf-ndbmaster.txt index bcd809593f3..89b41850ec0 100644 --- a/ndb/test/run-test/conf-daily-basic-ndbmaster.txt +++ b/ndb/test/run-test/conf-ndbmaster.txt @@ -17,3 +17,6 @@ FileSystemPath: /space/autotest/run PortNumber: 14000 ArbitrationRank: 1 DataDir: . + +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/conf-daily-basic-shark.txt b/ndb/test/run-test/conf-shark.txt similarity index 91% rename from ndb/test/run-test/conf-daily-basic-shark.txt rename to ndb/test/run-test/conf-shark.txt index 6d1f8b64f44..d66d0280d8a 100644 --- a/ndb/test/run-test/conf-daily-basic-shark.txt +++ b/ndb/test/run-test/conf-shark.txt @@ -17,3 +17,6 @@ FileSystemPath: /space/autotest/run PortNumber: 14000 ArbitrationRank: 1 DataDir: . 
+ +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/ndb-autotest.sh b/ndb/test/run-test/ndb-autotest.sh index 4228d2354d3..459f0cd6233 100755 --- a/ndb/test/run-test/ndb-autotest.sh +++ b/ndb/test/run-test/ndb-autotest.sh @@ -299,9 +299,12 @@ choose_conf(){ elif [ -f $test_dir/conf-$1.txt ] then echo "$test_dir/conf-$1.txt" + elif [ -f $test_dir/conf-$HOST.txt ] + echo "$test_dir/conf-$HOST.txt" else echo "Unable to find conf file looked for" 1>&2 echo "$test_dir/conf-$1-$HOST.txt and" 1>&2 + echo "$test_dir/conf-$HOST.txt" 1>&2 echo "$test_dir/conf-$1.txt" 1>&2 exit fi From 4fb98ee6b87a63374381788e2c70bc17e61bd455 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 22 Mar 2006 12:18:07 +0100 Subject: [PATCH 13/15] ndb - some more ndb-autotest updates (previously uncommitted...but in use) ndb/test/run-test/ndb-autotest.sh: More autotest updates --- ndb/test/run-test/ndb-autotest.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ndb/test/run-test/ndb-autotest.sh b/ndb/test/run-test/ndb-autotest.sh index 459f0cd6233..544897a2aa2 100755 --- a/ndb/test/run-test/ndb-autotest.sh +++ b/ndb/test/run-test/ndb-autotest.sh @@ -13,7 +13,7 @@ save_args=$* VERSION="ndb-autotest.sh version 1.04" DATE=`date '+%Y-%m-%d'` -HOST=`hostname` +HOST=`hostname -s` export DATE HOST set -e @@ -35,6 +35,7 @@ report=yes clone=5.0-ndb RUN="daily-basic daily-devel" conf=autotest.conf +LOCK=$HOME/.autotest-lock ############################ # Read command line entries# @@ -66,7 +67,7 @@ done if [ -f $conf ] then - . ./$conf + . $conf else echo "Can't find config file: $conf" exit @@ -105,7 +106,6 @@ fi # Setup the clone source location # #################################### -LOCK=$HOME/.autotest-lock src_clone=$src_clone_base-$clone ####################################### @@ -389,7 +389,8 @@ do awk '{for(i=1;i<='$count';i++)print $i;}'` echo $run_hosts >> /tmp/filter_hosts.$$ - choose $conf $run_hosts > d.tmp + choose $conf $run_hosts > d.tmp.$$ + sed -e s,CHOOSE_dir,"$install_dir",g < d.tmp.$$ > d.tmp $mkconfig d.tmp fi From e74b313c115b6eec1e96a33e16d117f33c788ce8 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 22 Mar 2006 13:38:03 +0100 Subject: [PATCH 14/15] ndb - autotest Update makefile for removed files ndb/test/run-test/Makefile.am: Update makefile for removed files --- ndb/test/run-test/Makefile.am | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ndb/test/run-test/Makefile.am b/ndb/test/run-test/Makefile.am index cf08542ae97..8aced6e91b3 100644 --- a/ndb/test/run-test/Makefile.am +++ b/ndb/test/run-test/Makefile.am @@ -7,11 +7,10 @@ include $(top_srcdir)/ndb/config/type_mgmapiclient.mk.am test_PROGRAMS = atrt test_DATA=daily-basic-tests.txt daily-devel-tests.txt \ - conf-daily-basic-ndbmaster.txt \ - conf-daily-basic-shark.txt \ - conf-daily-devel-ndbmaster.txt \ - conf-daily-sql-ndbmaster.txt \ - conf-daily-basic-dl145a.txt + conf-ndbmaster.txt \ + conf-shark.txt \ + conf-dl145a.txt + test_SCRIPTS=atrt-analyze-result.sh atrt-gather-result.sh atrt-setup.sh \ atrt-clear-result.sh make-config.sh make-index.sh make-html-reports.sh From 2279f08af421311fb7b22474942dc7fe2cfd3bc6 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 22 Mar 2006 15:06:44 +0100 Subject: [PATCH 15/15] ndb - Add per partition info (optionally to ndb_desc) ndb/tools/desc.cpp: Add per partition info (optionally to ndb_desc) --- ndb/tools/desc.cpp | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/ndb/tools/desc.cpp 
b/ndb/tools/desc.cpp index aac47c9042c..e5371b9b458 100644 --- a/ndb/tools/desc.cpp +++ b/ndb/tools/desc.cpp @@ -23,6 +23,7 @@ NDB_STD_OPTS_VARS; static const char* _dbname = "TEST_DB"; static int _unqualified = 0; +static int _partinfo = 0; static struct my_option my_long_options[] = { NDB_STD_OPTS("ndb_desc"), @@ -32,6 +33,9 @@ static struct my_option my_long_options[] = { "unqualified", 'u', "Use unqualified table names", (gptr*) &_unqualified, (gptr*) &_unqualified, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 }, + { "extra-partition-info", 'p', "Print more info per partition", + (gptr*) &_partinfo, (gptr*) &_partinfo, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} }; static void usage() @@ -52,6 +56,8 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)), "d:t:O,/tmp/ndb_desc.trace"); } +static void print_part_info(Ndb* pNdb, NDBT_Table* pTab); + int main(int argc, char** argv){ NDB_INIT(argv[0]); const char *load_default_groups[]= { "mysql_cluster",0 }; @@ -106,7 +112,11 @@ int main(int argc, char** argv){ ndbout << (*pIdx) << endl; } + ndbout << endl; + + if (_partinfo) + print_part_info(pMyNdb, pTab); } else ndbout << argv[i] << ": " << dict->getNdbError() << endl; @@ -115,3 +125,70 @@ int main(int argc, char** argv){ delete pMyNdb; return NDBT_ProgramExit(NDBT_OK); } + +struct InfoInfo +{ + const char * m_title; + NdbRecAttr* m_rec_attr; + const NdbDictionary::Column* m_column; +}; + + +static +void print_part_info(Ndb* pNdb, NDBT_Table* pTab) +{ + InfoInfo g_part_info[] = { + { "Partition", 0, NdbDictionary::Column::FRAGMENT }, + { "Row count", 0, NdbDictionary::Column::ROW_COUNT }, + { "Commit count", 0, NdbDictionary::Column::COMMIT_COUNT }, + { 0, 0, 0 } + }; + + ndbout << "-- Per partition info -- " << endl; + + NdbConnection* pTrans = pNdb->startTransaction(); + if (pTrans == 0) + return; + + do + { + NdbScanOperation* pOp= pTrans->getNdbScanOperation(pTab->getName()); + if (pOp == NULL) + break; + + NdbResultSet* rs= pOp->readTuples(NdbOperation::LM_CommittedRead); + if (rs == 0) + break; + + if (pOp->interpret_exit_last_row() != 0) + break; + + Uint32 i = 0; + for(i = 0; g_part_info[i].m_title != 0; i++) + { + if ((g_part_info[i].m_rec_attr = pOp->getValue(g_part_info[i].m_column)) == 0) + break; + } + + if (g_part_info[i].m_title != 0) + break; + + if (pTrans->execute(NoCommit) != 0) + break; + + for (i = 0; g_part_info[i].m_title != 0; i++) + ndbout << g_part_info[i].m_title << "\t"; + ndbout << endl; + + while(rs->nextResult() == 0) + { + for(i = 0; g_part_info[i].m_title != 0; i++) + { + ndbout << *g_part_info[i].m_rec_attr << "\t"; + } + ndbout << endl; + } + } while(0); + + pTrans->close(); +}
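
For reference, the per-partition output added to ndb_desc in the last patch is obtained by scanning the table's pseudo-columns (FRAGMENT, ROW_COUNT, COMMIT_COUNT) with a committed read plus interpret_exit_last_row(), so exactly one pseudo-row per fragment is returned. Below is a minimal sketch of the same technique, reduced to row count per fragment; it assumes an already-connected Ndb object and an existing table name, and the includes, function name and error handling shown here are illustrative assumptions, not part of the patch. With the patch applied, ndb_desc itself prints the equivalent information when the new -p / --extra-partition-info switch is given (e.g. ndb_desc -d TEST_DB -p <table>).

/*
 * Sketch only: same pseudo-column scan as print_part_info() in
 * ndb/tools/desc.cpp, trimmed to FRAGMENT and ROW_COUNT.
 */
#include <NdbApi.hpp>   // NDB API umbrella header, as used by the ndb tools (assumed)
#include <NdbOut.hpp>   // ndbout / endl, as used in desc.cpp (assumed)

static int
print_rows_per_fragment(Ndb* pNdb, const char* tabName)
{
  NdbConnection* pTrans = pNdb->startTransaction();
  if (pTrans == 0)
    return -1;

  int ret = -1;
  do
  {
    NdbScanOperation* pOp = pTrans->getNdbScanOperation(tabName);
    if (pOp == 0)
      break;

    // Committed read: no locks are taken while walking the fragments
    NdbResultSet* rs = pOp->readTuples(NdbOperation::LM_CommittedRead);
    if (rs == 0)
      break;

    // Ask each fragment to return only its last (pseudo) row
    if (pOp->interpret_exit_last_row() != 0)
      break;

    NdbRecAttr* frag = pOp->getValue(NdbDictionary::Column::FRAGMENT);
    NdbRecAttr* rows = pOp->getValue(NdbDictionary::Column::ROW_COUNT);
    if (frag == 0 || rows == 0)
      break;

    if (pTrans->execute(NoCommit) != 0)
      break;

    // One result row per fragment
    while (rs->nextResult() == 0)
      ndbout << "Partition " << *frag << ": " << *rows << " rows" << endl;

    ret = 0;
  } while (0);

  pTrans->close();
  return ret;
}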