From 746fc2f45712814a89cefca9e2729047502ea736 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 23 May 2006 18:16:26 +1000 Subject: [PATCH 1/8] BUG#13985 Cluster: ndb_mgm "status" command can return incorrect data node status partial fix for this bug. more info on what the other half of the fix involves is in the bug report. ndb/src/mgmclient/CommandInterpreter.cpp: partial fix for bug13985 hold a mutex around printing out events hold the mutex also around printing out put of 'status' commands. this means we don't get 1 started 2 started 2 starting output. we'll instead get the event before/after the entire status output. Due to the nature of the event arriving before status being updated, we'll pretty much always see started AFTER starting. --- ndb/src/mgmclient/CommandInterpreter.cpp | 28 ++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/ndb/src/mgmclient/CommandInterpreter.cpp b/ndb/src/mgmclient/CommandInterpreter.cpp index 14cd3fba42b..cb85cd8f3e6 100644 --- a/ndb/src/mgmclient/CommandInterpreter.cpp +++ b/ndb/src/mgmclient/CommandInterpreter.cpp @@ -172,8 +172,15 @@ private: bool rep_connected; #endif struct NdbThread* m_event_thread; + NdbMutex *m_print_mutex; }; +struct event_thread_param { + NdbMgmHandle *m; + NdbMutex **p; +}; + +NdbMutex* print_mutex; /* * Facade object for CommandInterpreter @@ -409,6 +416,7 @@ CommandInterpreter::CommandInterpreter(const char *_host,int verbose) m_connected= false; m_event_thread= 0; try_reconnect = 0; + m_print_mutex= NdbMutex_Create(); #ifdef HAVE_GLOBAL_REPLICATION rep_host = NULL; m_repserver = NULL; @@ -422,6 +430,7 @@ CommandInterpreter::CommandInterpreter(const char *_host,int verbose) CommandInterpreter::~CommandInterpreter() { disconnect(); + NdbMutex_Destroy(m_print_mutex); ndb_mgm_destroy_handle(&m_mgmsrv); ndb_mgm_destroy_handle(&m_mgmsrv2); } @@ -461,11 +470,13 @@ CommandInterpreter::printError() static int do_event_thread; static void* -event_thread_run(void* m) +event_thread_run(void* p) { DBUG_ENTER("event_thread_run"); - NdbMgmHandle handle= *(NdbMgmHandle*)m; + struct event_thread_param param= *(struct event_thread_param*)p; + NdbMgmHandle handle= *(param.m); + NdbMutex* printmutex= *(param.p); int filter[] = { 15, NDB_MGM_EVENT_CATEGORY_BACKUP, 1, NDB_MGM_EVENT_CATEGORY_STARTUP, @@ -483,7 +494,11 @@ event_thread_run(void* m) { const char ping_token[]= ""; if (memcmp(ping_token,tmp,sizeof(ping_token)-1)) - ndbout << tmp; + if(tmp && strlen(tmp)) + { + Guard g(printmutex); + ndbout << tmp; + } } } while(do_event_thread); NDB_CLOSE_SOCKET(fd); @@ -516,8 +531,11 @@ CommandInterpreter::connect() assert(m_event_thread == 0); assert(do_event_thread == 0); do_event_thread= 0; + struct event_thread_param p; + p.m= &m_mgmsrv2; + p.p= &m_print_mutex; m_event_thread = NdbThread_Create(event_thread_run, - (void**)&m_mgmsrv2, + (void**)&p, 32768, "CommandInterpreted_event_thread", NDB_THREAD_PRIO_LOW); @@ -607,6 +625,7 @@ CommandInterpreter::execute(const char *_line, int _try_reconnect, int result= execute_impl(_line); if (error) *error= m_error; + return result; } @@ -920,6 +939,7 @@ CommandInterpreter::executeForAll(const char * cmd, ExecuteFunction fun, ndbout_c("Trying to start all nodes of system."); ndbout_c("Use ALL STATUS to see the system start-up phases."); } else { + Guard g(m_print_mutex); struct ndb_mgm_cluster_state *cl= ndb_mgm_get_status(m_mgmsrv); if(cl == 0){ ndbout_c("Unable get status from management server"); From 3cea3705fe3bc40ad099057905d5932227638311 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 3 Jul 2006 15:37:57 +1000 Subject: [PATCH 2/8] BUG#13985 ndb_mgm "status" command can return incorrect data node status Second half of the fix for this bug. This patch forces a heartbeat to be sent and will wait (a little while) for replies. This way we can get > all status X starting Y started X started > which is okay as the new status comes after the old status, always. There is the slimmest of opportunities to get output like above where only half the cluster appears started. This is about the best we can do with a command line interactive program. ndb/src/mgmsrv/MgmtSrvr.cpp: Add updateStatus method to MgmtSrvr. Used to force an update of node status for the nodes. ndb/src/mgmsrv/MgmtSrvr.hpp: add prototype for updateStatus(NodeBitmask) method ndb/src/mgmsrv/Services.cpp: When status is queried, force an update of the status in the mgm server. (i.e. send heartbeats) ndb/src/ndbapi/ClusterMgr.cpp: new DEBUG_REG define for debugging registration and HB code. Add ClusterMgr::forceHB(NodeBitmask) which sends a HB signal to each node in the bitmask and then waits for a REGCONF from them. Will only wait for a total of 1 second, not blocking an end client for too long. On receipt of HB, clear the nodeId in the waiting for bitmask and signal any waiting threads. ndb/src/ndbapi/ClusterMgr.hpp: Add ::forceHB(NodeBitmask) and associated variables --- ndb/src/mgmsrv/MgmtSrvr.cpp | 6 +++ ndb/src/mgmsrv/MgmtSrvr.hpp | 2 + ndb/src/mgmsrv/Services.cpp | 3 ++ ndb/src/ndbapi/ClusterMgr.cpp | 71 +++++++++++++++++++++++++++++++++-- ndb/src/ndbapi/ClusterMgr.hpp | 10 ++++- 5 files changed, 86 insertions(+), 6 deletions(-) diff --git a/ndb/src/mgmsrv/MgmtSrvr.cpp b/ndb/src/mgmsrv/MgmtSrvr.cpp index b713a3908ab..f3e75f1afe4 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.cpp +++ b/ndb/src/mgmsrv/MgmtSrvr.cpp @@ -1412,6 +1412,12 @@ MgmtSrvr::exitSingleUser(int * stopCount, bool abort) #include +void +MgmtSrvr::updateStatus(NodeBitmask nodes) +{ + theFacade->theClusterMgr->forceHB(nodes); +} + int MgmtSrvr::status(int nodeId, ndb_mgm_node_status * _status, diff --git a/ndb/src/mgmsrv/MgmtSrvr.hpp b/ndb/src/mgmsrv/MgmtSrvr.hpp index 007494a277d..1e59f32b76f 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.hpp +++ b/ndb/src/mgmsrv/MgmtSrvr.hpp @@ -487,6 +487,8 @@ public: void get_connected_nodes(NodeBitmask &connected_nodes) const; SocketServer *get_socket_server() { return m_socket_server; } + void updateStatus(NodeBitmask nodes); + //************************************************************************** private: //************************************************************************** diff --git a/ndb/src/mgmsrv/Services.cpp b/ndb/src/mgmsrv/Services.cpp index d28e0eba5b3..1df7d9be7b4 100644 --- a/ndb/src/mgmsrv/Services.cpp +++ b/ndb/src/mgmsrv/Services.cpp @@ -951,6 +951,9 @@ printNodeStatus(OutputStream *output, MgmtSrvr &mgmsrv, enum ndb_mgm_node_type type) { NodeId nodeId = 0; + NodeBitmask hbnodes; + mgmsrv.get_connected_nodes(hbnodes); + mgmsrv.updateStatus(hbnodes); while(mgmsrv.getNextNodeId(&nodeId, type)) { enum ndb_mgm_node_status status; Uint32 startPhase = 0, diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp index fbff57d3168..120ab76f7ca 100644 --- a/ndb/src/ndbapi/ClusterMgr.cpp +++ b/ndb/src/ndbapi/ClusterMgr.cpp @@ -39,6 +39,8 @@ int global_flag_send_heartbeat_now= 0; +//#define DEBUG_REG + // Just a C wrapper for threadMain extern "C" void* @@ -67,6 +69,8 @@ ClusterMgr::ClusterMgr(TransporterFacade & _facade): DBUG_ENTER("ClusterMgr::ClusterMgr"); ndbSetOwnVersion(); clusterMgrThreadMutex = NdbMutex_Create(); + waitForHBMutex= NdbMutex_Create(); + waitForHBCond= NdbCondition_Create(); noOfAliveNodes= 0; noOfConnectedNodes= 0; theClusterMgrThread= 0; @@ -77,7 +81,9 @@ ClusterMgr::ClusterMgr(TransporterFacade & _facade): ClusterMgr::~ClusterMgr() { DBUG_ENTER("ClusterMgr::~ClusterMgr"); - doStop(); + doStop(); + NdbCondition_Destroy(waitForHBCond); + NdbMutex_Destroy(waitForHBMutex); NdbMutex_Destroy(clusterMgrThreadMutex); DBUG_VOID_RETURN; } @@ -163,6 +169,49 @@ ClusterMgr::doStop( ){ DBUG_VOID_RETURN; } +void +ClusterMgr::forceHB(NodeBitmask waitFor) +{ + theFacade.lock_mutex(); + global_flag_send_heartbeat_now= 1; + + waitForHBFromNodes= waitFor; +#ifdef DEBUG_REG + char buf[128]; + ndbout << "Waiting for HB from " << waitForHBFromNodes.getText(buf) << endl; +#endif + NdbApiSignal signal(numberToRef(API_CLUSTERMGR, theFacade.ownId())); + + signal.theVerId_signalNumber = GSN_API_REGREQ; + signal.theReceiversBlockNumber = QMGR; + signal.theTrace = 0; + signal.theLength = ApiRegReq::SignalLength; + + ApiRegReq * req = CAST_PTR(ApiRegReq, signal.getDataPtrSend()); + req->ref = numberToRef(API_CLUSTERMGR, theFacade.ownId()); + req->version = NDB_VERSION; + + int nodeId= 0; + for(int i=0; + NodeBitmask::NotFound!=(nodeId= waitForHBFromNodes.find(i)); + i= nodeId+1) + { +#ifdef DEBUG_REG + ndbout << "FORCE HB to " << nodeId << endl; +#endif + theFacade.sendSignalUnCond(&signal, nodeId); + } + + theFacade.unlock_mutex(); + + NdbMutex_Lock(waitForHBMutex); + NdbCondition_WaitTimeout(waitForHBCond, waitForHBMutex, 1000); + NdbMutex_Unlock(waitForHBMutex); +#ifdef DEBUG_REG + ndbout << "Still waiting for HB from " << waitForHBFromNodes.getText(buf) << endl; +#endif +} + void ClusterMgr::threadMain( ){ NdbApiSignal signal(numberToRef(API_CLUSTERMGR, theFacade.ownId())); @@ -226,7 +275,7 @@ ClusterMgr::threadMain( ){ if (theNode.m_info.m_type == NodeInfo::REP) { signal.theReceiversBlockNumber = API_CLUSTERMGR; } -#if 0 +#ifdef DEBUG_REG ndbout_c("ClusterMgr: Sending API_REGREQ to node %d", (int)nodeId); #endif theFacade.sendSignalUnCond(&signal, nodeId); @@ -278,7 +327,7 @@ ClusterMgr::execAPI_REGREQ(const Uint32 * theData){ const ApiRegReq * const apiRegReq = (ApiRegReq *)&theData[0]; const NodeId nodeId = refToNode(apiRegReq->ref); -#if 0 +#ifdef DEBUG_REG ndbout_c("ClusterMgr: Recd API_REGREQ from node %d", nodeId); #endif @@ -319,7 +368,7 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ const ApiRegConf * const apiRegConf = (ApiRegConf *)&theData[0]; const NodeId nodeId = refToNode(apiRegConf->qmgrRef); -#if 0 +#ifdef DEBUG_REG ndbout_c("ClusterMgr: Recd API_REGCONF from node %d", nodeId); #endif @@ -351,6 +400,13 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ if (node.m_info.m_type != NodeInfo::REP) { node.hbFrequency = (apiRegConf->apiHeartbeatFrequency * 10) - 50; } + waitForHBFromNodes.clear(nodeId); + if(waitForHBFromNodes.isclear()) + { + NdbMutex_Lock(waitForHBMutex); + NdbCondition_Signal(waitForHBCond); + NdbMutex_Unlock(waitForHBMutex); + } } void @@ -379,6 +435,13 @@ ClusterMgr::execAPI_REGREF(const Uint32 * theData){ default: break; } + waitForHBFromNodes.clear(nodeId); + if(waitForHBFromNodes.isclear()) + { + NdbMutex_Lock(waitForHBMutex); + NdbCondition_Signal(waitForHBCond); + NdbMutex_Unlock(waitForHBMutex); + } } void diff --git a/ndb/src/ndbapi/ClusterMgr.hpp b/ndb/src/ndbapi/ClusterMgr.hpp index 1a1e622a889..6eda98c4773 100644 --- a/ndb/src/ndbapi/ClusterMgr.hpp +++ b/ndb/src/ndbapi/ClusterMgr.hpp @@ -49,7 +49,9 @@ public: void doStop(); void startThread(); - + + void forceHB(NodeBitmask waitFor); + private: void threadMain(); @@ -85,7 +87,11 @@ private: Uint32 noOfConnectedNodes; Node theNodes[MAX_NODES]; NdbThread* theClusterMgrThread; - + + NodeBitmask waitForHBFromNodes; // used in forcing HBs + NdbMutex* waitForHBMutex; + NdbCondition* waitForHBCond; + /** * Used for controlling start/stop of the thread */ From bd015e57dbbb8429eb40d59a875610ee5c38ae93 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 7 Jul 2006 17:39:11 +1000 Subject: [PATCH 3/8] BUG#13985 ndb_mgm "status" command can return incorrect data node status better support parallel show commands, hold mutex when touching waitForHBFromNodes ndb/src/ndbapi/ClusterMgr.cpp: correctly serialize ::forceHB calls and hold mutex for whole time updating waitForHBFromNodes --- ndb/src/ndbapi/ClusterMgr.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp index 120ab76f7ca..db169304acd 100644 --- a/ndb/src/ndbapi/ClusterMgr.cpp +++ b/ndb/src/ndbapi/ClusterMgr.cpp @@ -172,6 +172,7 @@ ClusterMgr::doStop( ){ void ClusterMgr::forceHB(NodeBitmask waitFor) { + NdbMutex_Lock(waitForHBMutex); theFacade.lock_mutex(); global_flag_send_heartbeat_now= 1; @@ -204,7 +205,6 @@ ClusterMgr::forceHB(NodeBitmask waitFor) theFacade.unlock_mutex(); - NdbMutex_Lock(waitForHBMutex); NdbCondition_WaitTimeout(waitForHBCond, waitForHBMutex, 1000); NdbMutex_Unlock(waitForHBMutex); #ifdef DEBUG_REG @@ -400,13 +400,14 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ if (node.m_info.m_type != NodeInfo::REP) { node.hbFrequency = (apiRegConf->apiHeartbeatFrequency * 10) - 50; } + + NdbMutex_Lock(waitForHBMutex); waitForHBFromNodes.clear(nodeId); + if(waitForHBFromNodes.isclear()) - { - NdbMutex_Lock(waitForHBMutex); NdbCondition_Signal(waitForHBCond); - NdbMutex_Unlock(waitForHBMutex); - } + + NdbMutex_Unlock(waitForHBMutex); } void From 5a8919f290ab569e323d623b4da0714ed2d361da Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 7 Jul 2006 18:39:38 +1000 Subject: [PATCH 4/8] BUG#13985 Cluster: ndb_mgm "status" command can return incorrect data node status use existing transporter mutex ndb/src/ndbapi/ClusterMgr.cpp: just use the transporter facade mutex. parallel 'show' will get woken up at the same time ndb/src/ndbapi/ClusterMgr.hpp: remove wait for heartbeat mutex --- ndb/src/ndbapi/ClusterMgr.cpp | 25 +++++++++++-------------- ndb/src/ndbapi/ClusterMgr.hpp | 1 - 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp index db169304acd..772cbaf99b5 100644 --- a/ndb/src/ndbapi/ClusterMgr.cpp +++ b/ndb/src/ndbapi/ClusterMgr.cpp @@ -69,7 +69,6 @@ ClusterMgr::ClusterMgr(TransporterFacade & _facade): DBUG_ENTER("ClusterMgr::ClusterMgr"); ndbSetOwnVersion(); clusterMgrThreadMutex = NdbMutex_Create(); - waitForHBMutex= NdbMutex_Create(); waitForHBCond= NdbCondition_Create(); noOfAliveNodes= 0; noOfConnectedNodes= 0; @@ -83,7 +82,6 @@ ClusterMgr::~ClusterMgr() DBUG_ENTER("ClusterMgr::~ClusterMgr"); doStop(); NdbCondition_Destroy(waitForHBCond); - NdbMutex_Destroy(waitForHBMutex); NdbMutex_Destroy(clusterMgrThreadMutex); DBUG_VOID_RETURN; } @@ -172,8 +170,15 @@ ClusterMgr::doStop( ){ void ClusterMgr::forceHB(NodeBitmask waitFor) { - NdbMutex_Lock(waitForHBMutex); theFacade.lock_mutex(); + + if(!waitForHBFromNodes.isclear()) + { + NdbCondition_WaitTimeout(waitForHBCond, theFacade.theMutexPtr, 1000); + theFacade.unlock_mutex(); + return; + } + global_flag_send_heartbeat_now= 1; waitForHBFromNodes= waitFor; @@ -203,10 +208,8 @@ ClusterMgr::forceHB(NodeBitmask waitFor) theFacade.sendSignalUnCond(&signal, nodeId); } + NdbCondition_WaitTimeout(waitForHBCond, theFacade.theMutexPtr, 1000); theFacade.unlock_mutex(); - - NdbCondition_WaitTimeout(waitForHBCond, waitForHBMutex, 1000); - NdbMutex_Unlock(waitForHBMutex); #ifdef DEBUG_REG ndbout << "Still waiting for HB from " << waitForHBFromNodes.getText(buf) << endl; #endif @@ -401,13 +404,10 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ node.hbFrequency = (apiRegConf->apiHeartbeatFrequency * 10) - 50; } - NdbMutex_Lock(waitForHBMutex); waitForHBFromNodes.clear(nodeId); if(waitForHBFromNodes.isclear()) - NdbCondition_Signal(waitForHBCond); - - NdbMutex_Unlock(waitForHBMutex); + NdbCondition_Broadcast(waitForHBCond); } void @@ -436,13 +436,10 @@ ClusterMgr::execAPI_REGREF(const Uint32 * theData){ default: break; } + waitForHBFromNodes.clear(nodeId); if(waitForHBFromNodes.isclear()) - { - NdbMutex_Lock(waitForHBMutex); NdbCondition_Signal(waitForHBCond); - NdbMutex_Unlock(waitForHBMutex); - } } void diff --git a/ndb/src/ndbapi/ClusterMgr.hpp b/ndb/src/ndbapi/ClusterMgr.hpp index 6eda98c4773..3284fd8aa8a 100644 --- a/ndb/src/ndbapi/ClusterMgr.hpp +++ b/ndb/src/ndbapi/ClusterMgr.hpp @@ -89,7 +89,6 @@ private: NdbThread* theClusterMgrThread; NodeBitmask waitForHBFromNodes; // used in forcing HBs - NdbMutex* waitForHBMutex; NdbCondition* waitForHBCond; /** From a610467f71781644f9cb0461765033e9ea3d3b66 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 7 Jul 2006 20:10:15 +1000 Subject: [PATCH 5/8] BUG#13985: Cluster: ndb_mgm "status" command can return incorrect data node status - only force HB to data nodes - flag for if we broadcast condition on receipt of HB ndb/src/mgmsrv/MgmtSrvr.cpp: Add get_connected_ndb_nodes to check status for connected data nodes only ndb/src/mgmsrv/MgmtSrvr.hpp: add prototype for get_connected_ndb_nodes ndb/src/mgmsrv/Services.cpp: only force HB to NDBD nodes ndb/src/ndbapi/ClusterMgr.cpp: flag to control if we send the condition ndb/src/ndbapi/ClusterMgr.hpp: flag for if we broadcast condition on receipt of hb --- ndb/src/mgmsrv/MgmtSrvr.cpp | 19 +++++++++++++++++++ ndb/src/mgmsrv/MgmtSrvr.hpp | 1 + ndb/src/mgmsrv/Services.cpp | 2 +- ndb/src/ndbapi/ClusterMgr.cpp | 19 ++++++++++++++----- ndb/src/ndbapi/ClusterMgr.hpp | 1 + 5 files changed, 36 insertions(+), 6 deletions(-) diff --git a/ndb/src/mgmsrv/MgmtSrvr.cpp b/ndb/src/mgmsrv/MgmtSrvr.cpp index f3e75f1afe4..793192dd487 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.cpp +++ b/ndb/src/mgmsrv/MgmtSrvr.cpp @@ -1941,6 +1941,25 @@ MgmtSrvr::get_connected_nodes(NodeBitmask &connected_nodes) const } } +void +MgmtSrvr::get_connected_ndb_nodes(NodeBitmask &connected_nodes) const +{ + NodeBitmask ndb_nodes; + if (theFacade && theFacade->theClusterMgr) + { + for(Uint32 i = 0; i < MAX_NODES; i++) + { + if (getNodeType(i) == NDB_MGM_NODE_TYPE_NDB) + { + ndb_nodes.set(i); + const ClusterMgr::Node &node= theFacade->theClusterMgr->getNodeInfo(i); + connected_nodes.bitOR(node.m_state.m_connected_nodes); + } + } + } + connected_nodes.bitAND(ndb_nodes); +} + bool MgmtSrvr::alloc_node_id(NodeId * nodeId, enum ndb_mgm_node_type type, diff --git a/ndb/src/mgmsrv/MgmtSrvr.hpp b/ndb/src/mgmsrv/MgmtSrvr.hpp index 1e59f32b76f..5bacf640a18 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.hpp +++ b/ndb/src/mgmsrv/MgmtSrvr.hpp @@ -485,6 +485,7 @@ public: const char *get_connect_address(Uint32 node_id); void get_connected_nodes(NodeBitmask &connected_nodes) const; + void get_connected_ndb_nodes(NodeBitmask &connected_nodes) const; SocketServer *get_socket_server() { return m_socket_server; } void updateStatus(NodeBitmask nodes); diff --git a/ndb/src/mgmsrv/Services.cpp b/ndb/src/mgmsrv/Services.cpp index 1df7d9be7b4..ae3433fe95b 100644 --- a/ndb/src/mgmsrv/Services.cpp +++ b/ndb/src/mgmsrv/Services.cpp @@ -952,7 +952,7 @@ printNodeStatus(OutputStream *output, enum ndb_mgm_node_type type) { NodeId nodeId = 0; NodeBitmask hbnodes; - mgmsrv.get_connected_nodes(hbnodes); + mgmsrv.get_connected_ndb_nodes(hbnodes); mgmsrv.updateStatus(hbnodes); while(mgmsrv.getNextNodeId(&nodeId, type)) { enum ndb_mgm_node_status status; diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp index 772cbaf99b5..28f65eebde8 100644 --- a/ndb/src/ndbapi/ClusterMgr.cpp +++ b/ndb/src/ndbapi/ClusterMgr.cpp @@ -70,6 +70,7 @@ ClusterMgr::ClusterMgr(TransporterFacade & _facade): ndbSetOwnVersion(); clusterMgrThreadMutex = NdbMutex_Create(); waitForHBCond= NdbCondition_Create(); + waitingForHB= false; noOfAliveNodes= 0; noOfConnectedNodes= 0; theClusterMgrThread= 0; @@ -172,7 +173,7 @@ ClusterMgr::forceHB(NodeBitmask waitFor) { theFacade.lock_mutex(); - if(!waitForHBFromNodes.isclear()) + if(waitingForHB) { NdbCondition_WaitTimeout(waitForHBCond, theFacade.theMutexPtr, 1000); theFacade.unlock_mutex(); @@ -180,6 +181,7 @@ ClusterMgr::forceHB(NodeBitmask waitFor) } global_flag_send_heartbeat_now= 1; + waitingForHB= true; waitForHBFromNodes= waitFor; #ifdef DEBUG_REG @@ -209,10 +211,11 @@ ClusterMgr::forceHB(NodeBitmask waitFor) } NdbCondition_WaitTimeout(waitForHBCond, theFacade.theMutexPtr, 1000); - theFacade.unlock_mutex(); + waitingForHB= false; #ifdef DEBUG_REG ndbout << "Still waiting for HB from " << waitForHBFromNodes.getText(buf) << endl; #endif + theFacade.unlock_mutex(); } void @@ -404,10 +407,16 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ node.hbFrequency = (apiRegConf->apiHeartbeatFrequency * 10) - 50; } - waitForHBFromNodes.clear(nodeId); + if(waitingForHB) + { + waitForHBFromNodes.clear(nodeId); - if(waitForHBFromNodes.isclear()) - NdbCondition_Broadcast(waitForHBCond); + if(waitForHBFromNodes.isclear()) + { + waitingForHB= false; + NdbCondition_Broadcast(waitForHBCond); + } + } } void diff --git a/ndb/src/ndbapi/ClusterMgr.hpp b/ndb/src/ndbapi/ClusterMgr.hpp index 3284fd8aa8a..b9863821b4f 100644 --- a/ndb/src/ndbapi/ClusterMgr.hpp +++ b/ndb/src/ndbapi/ClusterMgr.hpp @@ -90,6 +90,7 @@ private: NodeBitmask waitForHBFromNodes; // used in forcing HBs NdbCondition* waitForHBCond; + bool waitingForHB; /** * Used for controlling start/stop of the thread From 7fcb36e2afffeb63aa1be9a476a4c781413eeb70 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Aug 2006 15:03:55 +0800 Subject: [PATCH 6/8] BUG#13985 fixups after review by jonas ndb/src/mgmclient/CommandInterpreter.cpp: Guard the print mutex when running SHOW ndb/src/mgmsrv/MgmtSrvr.cpp: replace global_flag_send_heartbeat_now with forceHB()/updateStatus() don't use bitmask as parameter to forceHB to reflect reality of what the function does. remove get_connected_ndb_nodes() as it is no longer used ndb/src/mgmsrv/MgmtSrvr.hpp: remove unused get_connected_ndb_nodes() update updateStatus prototype ndb/src/mgmsrv/Services.cpp: use new prototype for updateStatus() - doesn't accept NodeBitmask ndb/src/ndbapi/ClusterMgr.cpp: remove global_flag_send_heartbeat_now, replace with forceHB. compute bitmask of nodes to send HB to in forceHB ndb/src/ndbapi/ClusterMgr.hpp: update prototype for forceHB, don't give the illusion that NodeBitmask means much. --- ndb/src/mgmclient/CommandInterpreter.cpp | 1 + ndb/src/mgmsrv/MgmtSrvr.cpp | 28 ++++------------------- ndb/src/mgmsrv/MgmtSrvr.hpp | 3 +-- ndb/src/mgmsrv/Services.cpp | 4 +--- ndb/src/ndbapi/ClusterMgr.cpp | 29 ++++++++++++++++-------- ndb/src/ndbapi/ClusterMgr.hpp | 2 +- 6 files changed, 27 insertions(+), 40 deletions(-) diff --git a/ndb/src/mgmclient/CommandInterpreter.cpp b/ndb/src/mgmclient/CommandInterpreter.cpp index 103c252ca04..d4f66a56da9 100644 --- a/ndb/src/mgmclient/CommandInterpreter.cpp +++ b/ndb/src/mgmclient/CommandInterpreter.cpp @@ -705,6 +705,7 @@ CommandInterpreter::execute_impl(const char *_line) DBUG_RETURN(true); if (strcasecmp(firstToken, "SHOW") == 0) { + Guard g(m_print_mutex); executeShow(allAfterFirstToken); DBUG_RETURN(true); } diff --git a/ndb/src/mgmsrv/MgmtSrvr.cpp b/ndb/src/mgmsrv/MgmtSrvr.cpp index d514f0da1a4..5fabb84adb7 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.cpp +++ b/ndb/src/mgmsrv/MgmtSrvr.cpp @@ -77,7 +77,6 @@ }\ } -extern int global_flag_send_heartbeat_now; extern int g_no_nodeid_checks; extern my_bool opt_core; @@ -1456,9 +1455,9 @@ MgmtSrvr::exitSingleUser(int * stopCount, bool abort) #include void -MgmtSrvr::updateStatus(NodeBitmask nodes) +MgmtSrvr::updateStatus() { - theFacade->theClusterMgr->forceHB(nodes); + theFacade->theClusterMgr->forceHB(); } int @@ -1985,25 +1984,6 @@ MgmtSrvr::get_connected_nodes(NodeBitmask &connected_nodes) const } } -void -MgmtSrvr::get_connected_ndb_nodes(NodeBitmask &connected_nodes) const -{ - NodeBitmask ndb_nodes; - if (theFacade && theFacade->theClusterMgr) - { - for(Uint32 i = 0; i < MAX_NODES; i++) - { - if (getNodeType(i) == NDB_MGM_NODE_TYPE_NDB) - { - ndb_nodes.set(i); - const ClusterMgr::Node &node= theFacade->theClusterMgr->getNodeInfo(i); - connected_nodes.bitOR(node.m_state.m_connected_nodes); - } - } - } - connected_nodes.bitAND(ndb_nodes); -} - bool MgmtSrvr::alloc_node_id(NodeId * nodeId, enum ndb_mgm_node_type type, @@ -2178,7 +2158,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, if (found_matching_type && !found_free_node) { // we have a temporary error which might be due to that // we have got the latest connect status from db-nodes. Force update. - global_flag_send_heartbeat_now= 1; + updateStatus(); } BaseString type_string, type_c_string; @@ -2532,7 +2512,7 @@ MgmtSrvr::Allocated_resources::~Allocated_resources() if (!m_reserved_nodes.isclear()) { m_mgmsrv.m_reserved_nodes.bitANDC(m_reserved_nodes); // node has been reserved, force update signal to ndb nodes - global_flag_send_heartbeat_now= 1; + m_mgmsrv.updateStatus(); char tmp_str[128]; m_mgmsrv.m_reserved_nodes.getText(tmp_str); diff --git a/ndb/src/mgmsrv/MgmtSrvr.hpp b/ndb/src/mgmsrv/MgmtSrvr.hpp index ab71fe6f4dc..17debb19f50 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.hpp +++ b/ndb/src/mgmsrv/MgmtSrvr.hpp @@ -488,10 +488,9 @@ public: const char *get_connect_address(Uint32 node_id); void get_connected_nodes(NodeBitmask &connected_nodes) const; - void get_connected_ndb_nodes(NodeBitmask &connected_nodes) const; SocketServer *get_socket_server() { return m_socket_server; } - void updateStatus(NodeBitmask nodes); + void updateStatus(); //************************************************************************** private: diff --git a/ndb/src/mgmsrv/Services.cpp b/ndb/src/mgmsrv/Services.cpp index 653f36ecc6d..7f5b0e29442 100644 --- a/ndb/src/mgmsrv/Services.cpp +++ b/ndb/src/mgmsrv/Services.cpp @@ -982,9 +982,7 @@ printNodeStatus(OutputStream *output, MgmtSrvr &mgmsrv, enum ndb_mgm_node_type type) { NodeId nodeId = 0; - NodeBitmask hbnodes; - mgmsrv.get_connected_ndb_nodes(hbnodes); - mgmsrv.updateStatus(hbnodes); + mgmsrv.updateStatus(); while(mgmsrv.getNextNodeId(&nodeId, type)) { enum ndb_mgm_node_status status; Uint32 startPhase = 0, diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp index 28f65eebde8..4b3c409e9d4 100644 --- a/ndb/src/ndbapi/ClusterMgr.cpp +++ b/ndb/src/ndbapi/ClusterMgr.cpp @@ -37,8 +37,6 @@ #include #include -int global_flag_send_heartbeat_now= 0; - //#define DEBUG_REG // Just a C wrapper for threadMain @@ -169,7 +167,7 @@ ClusterMgr::doStop( ){ } void -ClusterMgr::forceHB(NodeBitmask waitFor) +ClusterMgr::forceHB() { theFacade.lock_mutex(); @@ -180,10 +178,25 @@ ClusterMgr::forceHB(NodeBitmask waitFor) return; } - global_flag_send_heartbeat_now= 1; waitingForHB= true; - waitForHBFromNodes= waitFor; + NodeBitmask ndb_nodes; + ndb_nodes.clear(); + waitForHBFromNodes.clear(); + for(Uint32 i = 0; i < MAX_NODES; i++) + { + if(!theNodes[i].defined) + continue; + if(theNodes[i].m_info.m_type == NodeInfo::DB) + { + ndb_nodes.set(i); + const ClusterMgr::Node &node= getNodeInfo(i); + waitForHBFromNodes.bitOR(node.m_state.m_connected_nodes); + } + ndbout << endl; + } + waitForHBFromNodes.bitAND(ndb_nodes); + #ifdef DEBUG_REG char buf[128]; ndbout << "Waiting for HB from " << waitForHBFromNodes.getText(buf) << endl; @@ -239,9 +252,6 @@ ClusterMgr::threadMain( ){ /** * Start of Secure area for use of Transporter */ - int send_heartbeat_now= global_flag_send_heartbeat_now; - global_flag_send_heartbeat_now= 0; - theFacade.lock_mutex(); for (int i = 1; i < MAX_NODES; i++){ /** @@ -264,8 +274,7 @@ ClusterMgr::threadMain( ){ } theNode.hbCounter += timeSlept; - if (theNode.hbCounter >= theNode.hbFrequency || - send_heartbeat_now) { + if (theNode.hbCounter >= theNode.hbFrequency) { /** * It is now time to send a new Heartbeat */ diff --git a/ndb/src/ndbapi/ClusterMgr.hpp b/ndb/src/ndbapi/ClusterMgr.hpp index b9863821b4f..d2bcc52f7e8 100644 --- a/ndb/src/ndbapi/ClusterMgr.hpp +++ b/ndb/src/ndbapi/ClusterMgr.hpp @@ -50,7 +50,7 @@ public: void doStop(); void startThread(); - void forceHB(NodeBitmask waitFor); + void forceHB(); private: void threadMain(); From 1bd553d41becd0df5fc9935af196ff500f1fdb25 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Aug 2006 15:39:32 +0800 Subject: [PATCH 7/8] remove undeeded printing of newline. ndb/src/ndbapi/ClusterMgr.cpp: remove extra 'ndbout << endl' --- ndb/src/ndbapi/ClusterMgr.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp index 4b3c409e9d4..475561af225 100644 --- a/ndb/src/ndbapi/ClusterMgr.cpp +++ b/ndb/src/ndbapi/ClusterMgr.cpp @@ -193,7 +193,6 @@ ClusterMgr::forceHB() const ClusterMgr::Node &node= getNodeInfo(i); waitForHBFromNodes.bitOR(node.m_state.m_connected_nodes); } - ndbout << endl; } waitForHBFromNodes.bitAND(ndb_nodes); From 5cafb623b7f57031805ce399ab1bafdd8c861c4b Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 15 Aug 2006 11:09:38 +0800 Subject: [PATCH 8/8] BUG#20823 testBackup FailMaster failing few cases not handled properly (NF occurs). ndb/src/kernel/blocks/backup/Backup.cpp: Don't write fragment info if we haven't retreived any fragment info yet (FailMaster test 2) Go directly onto closing files if the tabPtr is RNIL (as we're in recovery) --- ndb/src/kernel/blocks/backup/Backup.cpp | 64 ++++++++++++++++--------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/ndb/src/kernel/blocks/backup/Backup.cpp b/ndb/src/kernel/blocks/backup/Backup.cpp index 43c1de5e2b3..10318e5f52d 100644 --- a/ndb/src/kernel/blocks/backup/Backup.cpp +++ b/ndb/src/kernel/blocks/backup/Backup.cpp @@ -274,36 +274,48 @@ Backup::execCONTINUEB(Signal* signal) BackupRecordPtr ptr; c_backupPool.getPtr(ptr, ptr_I); - TablePtr tabPtr; - ptr.p->tables.getPtr(tabPtr, tabPtr_I); - FragmentPtr fragPtr; - tabPtr.p->fragments.getPtr(fragPtr, fragPtr_I); - BackupFilePtr filePtr; - ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr); - - const Uint32 sz = sizeof(BackupFormat::CtlFile::FragmentInfo) >> 2; - Uint32 * dst; - if (!filePtr.p->operation.dataBuffer.getWritePtr(&dst, sz)) + if (tabPtr_I == RNIL) { - sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, 100, 4); + closeFiles(signal, ptr); return; } + jam(); + TablePtr tabPtr; + ptr.p->tables.getPtr(tabPtr, tabPtr_I); + jam(); + if(tabPtr.p->fragments.getSize()) + { + FragmentPtr fragPtr; + tabPtr.p->fragments.getPtr(fragPtr, fragPtr_I); - BackupFormat::CtlFile::FragmentInfo * fragInfo = - (BackupFormat::CtlFile::FragmentInfo*)dst; - fragInfo->SectionType = htonl(BackupFormat::FRAGMENT_INFO); - fragInfo->SectionLength = htonl(sz); - fragInfo->TableId = htonl(fragPtr.p->tableId); - fragInfo->FragmentNo = htonl(fragPtr_I); - fragInfo->NoOfRecordsLow = htonl(fragPtr.p->noOfRecords & 0xFFFFFFFF); - fragInfo->NoOfRecordsHigh = htonl(fragPtr.p->noOfRecords >> 32); - fragInfo->FilePosLow = htonl(0 & 0xFFFFFFFF); - fragInfo->FilePosHigh = htonl(0 >> 32); + BackupFilePtr filePtr; + ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr); - filePtr.p->operation.dataBuffer.updateWritePtr(sz); + const Uint32 sz = sizeof(BackupFormat::CtlFile::FragmentInfo) >> 2; + Uint32 * dst; + if (!filePtr.p->operation.dataBuffer.getWritePtr(&dst, sz)) + { + sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, 100, 4); + return; + } + + BackupFormat::CtlFile::FragmentInfo * fragInfo = + (BackupFormat::CtlFile::FragmentInfo*)dst; + fragInfo->SectionType = htonl(BackupFormat::FRAGMENT_INFO); + fragInfo->SectionLength = htonl(sz); + fragInfo->TableId = htonl(fragPtr.p->tableId); + fragInfo->FragmentNo = htonl(fragPtr_I); + fragInfo->NoOfRecordsLow = htonl(fragPtr.p->noOfRecords & 0xFFFFFFFF); + fragInfo->NoOfRecordsHigh = htonl(fragPtr.p->noOfRecords >> 32); + fragInfo->FilePosLow = htonl(0 & 0xFFFFFFFF); + fragInfo->FilePosHigh = htonl(0 >> 32); + + filePtr.p->operation.dataBuffer.updateWritePtr(sz); + + fragPtr_I++; + } - fragPtr_I++; if (fragPtr_I == tabPtr.p->fragments.getSize()) { signal->theData[0] = tabPtr.p->tableId; @@ -4243,6 +4255,12 @@ Backup::execSTOP_BACKUP_REQ(Signal* signal) TablePtr tabPtr; ptr.p->tables.first(tabPtr); + if (tabPtr.i == RNIL) + { + closeFiles(signal, ptr); + return; + } + signal->theData[0] = BackupContinueB::BACKUP_FRAGMENT_INFO; signal->theData[1] = ptr.i; signal->theData[2] = tabPtr.i;