mirror of
https://github.com/MariaDB/server.git
synced 2025-09-02 09:41:40 +03:00
Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart
- signals were sometimes sent too early when setting up subscriptions ndb/include/kernel/signaldata/DumpStateOrd.hpp: added dump for active subscriptions in cmvmi ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp: added dump for active subscriptions in cmvmi ndb/src/mgmsrv/MgmtSrvr.cpp: fixed bug where signals were sent before api reg conf arrived, causing signals to be thrown away and subsequent hangs in the management server; also added a retry if a node is connected but has not yet received api reg conf ndb/src/ndbapi/ClusterMgr.cpp: added status variable m_api_reg_conf in cluster manager to be able to correctly determine if a node is sendable ndb/src/ndbapi/ClusterMgr.hpp: added status variable m_api_reg_conf in cluster manager to be able to correctly determine if a node is sendable ndb/src/ndbapi/SignalSender.cpp: added assert to check that the node is sendable when a signal is sent ndb/src/ndbapi/SignalSender.hpp: made method const
This commit is contained in:
@@ -107,6 +107,10 @@ public:
|
|||||||
CmvmiDumpLongSignalMemory = 2601,
|
CmvmiDumpLongSignalMemory = 2601,
|
||||||
CmvmiSetRestartOnErrorInsert = 2602,
|
CmvmiSetRestartOnErrorInsert = 2602,
|
||||||
CmvmiTestLongSigWithDelay = 2603,
|
CmvmiTestLongSigWithDelay = 2603,
|
||||||
|
CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
|
||||||
|
to be able to debug if events
|
||||||
|
for some reason does not end up
|
||||||
|
in clusterlog */
|
||||||
// 7000 DIH
|
// 7000 DIH
|
||||||
// 7001 DIH
|
// 7001 DIH
|
||||||
// 7002 DIH
|
// 7002 DIH
|
||||||
|
@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal)
|
|||||||
case TimeToWaitAlive:
|
case TimeToWaitAlive:
|
||||||
|
|
||||||
// QMGR
|
// QMGR
|
||||||
case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt också
|
case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
|
||||||
case HeartbeatIntervalDbApi:
|
case HeartbeatIntervalDbApi:
|
||||||
case ArbitTimeout:
|
case ArbitTimeout:
|
||||||
sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
|
sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
|
||||||
@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
|
||||||
|
{
|
||||||
|
SubscriberPtr ptr;
|
||||||
|
subscribers.first(ptr);
|
||||||
|
g_eventLogger.info("List subscriptions:");
|
||||||
|
while(ptr.i != RNIL)
|
||||||
|
{
|
||||||
|
g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
|
||||||
|
ptr.i, refToNode(ptr.p->blockRef), ptr.p->blockRef);
|
||||||
|
for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
|
||||||
|
{
|
||||||
|
Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
|
||||||
|
g_eventLogger.info("Category %u Level %u", i, level);
|
||||||
|
}
|
||||||
|
subscribers.next(ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
|
if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
|
||||||
infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
|
infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
|
||||||
g_sectionSegmentPool.getSize(),
|
g_sectionSegmentPool.getSize(),
|
||||||
|
@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond)
|
|||||||
return WRONG_PROCESS_TYPE;
|
return WRONG_PROCESS_TYPE;
|
||||||
// Check if we have contact with it
|
// Check if we have contact with it
|
||||||
if(unCond){
|
if(unCond){
|
||||||
if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected)
|
if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
else if (theFacade->get_node_alive(nodeId) == true)
|
else if (theFacade->get_node_alive(nodeId) == true)
|
||||||
@@ -1562,32 +1562,85 @@ MgmtSrvr::status(int nodeId,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
MgmtSrvr::setEventReportingLevelImpl(int nodeId,
|
MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg,
|
||||||
const EventSubscribeReq& ll)
|
const EventSubscribeReq& ll)
|
||||||
{
|
{
|
||||||
SignalSender ss(theFacade);
|
SignalSender ss(theFacade);
|
||||||
ss.lock();
|
NdbNodeBitmask nodes;
|
||||||
|
int retries = 30;
|
||||||
SimpleSignal ssig;
|
|
||||||
EventSubscribeReq * dst =
|
|
||||||
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
|
|
||||||
ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
|
|
||||||
EventSubscribeReq::SignalLength);
|
|
||||||
*dst = ll;
|
|
||||||
|
|
||||||
NodeBitmask nodes;
|
|
||||||
nodes.clear();
|
nodes.clear();
|
||||||
Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId;
|
while (1)
|
||||||
for(; (Uint32) nodeId <= max; nodeId++)
|
|
||||||
{
|
{
|
||||||
if (nodeTypes[nodeId] != NODE_TYPE_DB)
|
Uint32 nodeId, max;
|
||||||
continue;
|
ss.lock();
|
||||||
if (okToSendTo(nodeId, true))
|
SimpleSignal ssig;
|
||||||
continue;
|
EventSubscribeReq * dst =
|
||||||
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
|
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
|
||||||
|
ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
|
||||||
|
EventSubscribeReq::SignalLength);
|
||||||
|
*dst = ll;
|
||||||
|
|
||||||
|
if (nodeId_arg == 0)
|
||||||
{
|
{
|
||||||
nodes.set(nodeId);
|
// all nodes
|
||||||
|
nodeId = 1;
|
||||||
|
max = MAX_NDB_NODES;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// only one node
|
||||||
|
max = nodeId = nodeId_arg;
|
||||||
|
}
|
||||||
|
// first make sure nodes are sendable
|
||||||
|
for(; nodeId <= max; nodeId++)
|
||||||
|
{
|
||||||
|
if (nodeTypes[nodeId] != NODE_TYPE_DB)
|
||||||
|
continue;
|
||||||
|
if (okToSendTo(nodeId, true))
|
||||||
|
{
|
||||||
|
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
|
||||||
|
{
|
||||||
|
// node not connected we can safely skip this one
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// api_reg_conf not recevied yet, need to retry
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (nodeId <= max)
|
||||||
|
{
|
||||||
|
if (--retries)
|
||||||
|
{
|
||||||
|
ss.unlock();
|
||||||
|
NdbSleep_MilliSleep(100);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return SEND_OR_RECEIVE_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nodeId_arg == 0)
|
||||||
|
{
|
||||||
|
// all nodes
|
||||||
|
nodeId = 1;
|
||||||
|
max = MAX_NDB_NODES;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// only one node
|
||||||
|
max = nodeId = nodeId_arg;
|
||||||
|
}
|
||||||
|
// now send to all sendable nodes nodes
|
||||||
|
// note, lock is held, so states have not changed
|
||||||
|
for(; (Uint32) nodeId <= max; nodeId++)
|
||||||
|
{
|
||||||
|
if (nodeTypes[nodeId] != NODE_TYPE_DB)
|
||||||
|
continue;
|
||||||
|
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
|
||||||
|
continue; // node is not connected, skip
|
||||||
|
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
|
||||||
|
nodes.set(nodeId);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nodes.isclear())
|
if (nodes.isclear())
|
||||||
@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
|
|||||||
int error = 0;
|
int error = 0;
|
||||||
while (!nodes.isclear())
|
while (!nodes.isclear())
|
||||||
{
|
{
|
||||||
|
Uint32 nodeId;
|
||||||
SimpleSignal *signal = ss.waitFor();
|
SimpleSignal *signal = ss.waitFor();
|
||||||
int gsn = signal->readSignalNumber();
|
int gsn = signal->readSignalNumber();
|
||||||
nodeId = refToNode(signal->header.theSendersBlockRef);
|
nodeId = refToNode(signal->header.theSendersBlockRef);
|
||||||
|
@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
|
|||||||
ClusterMgr::Node::Node()
|
ClusterMgr::Node::Node()
|
||||||
: m_state(NodeState::SL_NOTHING) {
|
: m_state(NodeState::SL_NOTHING) {
|
||||||
compatible = nfCompleteRep = true;
|
compatible = nfCompleteRep = true;
|
||||||
connected = defined = m_alive = false;
|
connected = defined = m_alive = m_api_reg_conf = false;
|
||||||
m_state.m_connected_nodes.clear();
|
m_state.m_connected_nodes.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
|
|||||||
node.m_info.m_version);
|
node.m_info.m_version);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
node.m_api_reg_conf = true;
|
||||||
|
|
||||||
node.m_state = apiRegConf->nodeState;
|
node.m_state = apiRegConf->nodeState;
|
||||||
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
|
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
|
||||||
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
|
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
|
||||||
@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){
|
|||||||
|
|
||||||
noOfConnectedNodes--;
|
noOfConnectedNodes--;
|
||||||
theNodes[nodeId].connected = false;
|
theNodes[nodeId].connected = false;
|
||||||
|
theNodes[nodeId].m_api_reg_conf = false;
|
||||||
theNodes[nodeId].m_state.m_connected_nodes.clear();
|
theNodes[nodeId].m_state.m_connected_nodes.clear();
|
||||||
|
|
||||||
reportNodeFailed(nodeId, true);
|
reportNodeFailed(nodeId, true);
|
||||||
|
@@ -65,6 +65,7 @@ public:
|
|||||||
bool compatible; // Version is compatible
|
bool compatible; // Version is compatible
|
||||||
bool nfCompleteRep; // NF Complete Rep has arrived
|
bool nfCompleteRep; // NF Complete Rep has arrived
|
||||||
bool m_alive; // Node is alive
|
bool m_alive; // Node is alive
|
||||||
|
bool m_api_reg_conf;// API_REGCONF has arrived
|
||||||
|
|
||||||
NodeInfo m_info;
|
NodeInfo m_info;
|
||||||
NodeState m_state;
|
NodeState m_state;
|
||||||
|
@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const {
|
|||||||
|
|
||||||
SendStatus
|
SendStatus
|
||||||
SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
|
SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
|
||||||
|
assert(getNodeInfo(nodeId).m_api_reg_conf == true ||
|
||||||
|
s->readSignalNumber() == GSN_API_REGREQ);
|
||||||
return theFacade->theTransporterRegistry->prepareSend(&s->header,
|
return theFacade->theTransporterRegistry->prepareSend(&s->header,
|
||||||
1, // JBB
|
1, // JBB
|
||||||
&s->theData[0],
|
&s->theData[0],
|
||||||
|
@@ -32,7 +32,7 @@ public:
|
|||||||
Uint32 theData[25];
|
Uint32 theData[25];
|
||||||
LinearSectionPtr ptr[3];
|
LinearSectionPtr ptr[3];
|
||||||
|
|
||||||
int readSignalNumber() {return header.theVerId_signalNumber; }
|
int readSignalNumber() const {return header.theVerId_signalNumber; }
|
||||||
Uint32 *getDataPtrSend() { return theData; }
|
Uint32 *getDataPtrSend() { return theData; }
|
||||||
const Uint32 *getDataPtr() const { return theData; }
|
const Uint32 *getDataPtr() const { return theData; }
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user