mirror of
https://github.com/MariaDB/server.git
synced 2025-09-02 09:41:40 +03:00
ndb - bug#27466 node failure (nf) during node restart (nr) can leave cluster in inconsistent state (recommit in 5.1)
Fix race condition between NODE_FAILREP and local INCL_NODEREQ loop. Also retry on ZNODE_START_DISALLOWED_ERROR. storage/ndb/include/kernel/signaldata/StartPerm.hpp: Move error code storage/ndb/src/kernel/blocks/ERROR_codes.txt: new error code storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp: Move error code storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: 1) retry also on ZNODE_START_DISALLOWED_ERROR 2) Change if() else in INCL_NODECONF to for-loop instead 3) (last but not least) fix bug that could cause different blocks within the same node to have different opinions about node status; solution is to check if node is still alive before sending next local INCL_NODEREQ storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp: Add error insert to allow node to die during INCL_NODEREQ storage/ndb/src/kernel/blocks/suma/Suma.cpp: 1) let suma be well behaved (i.e. reply to INCL_NODEREQ) 2) Add dump to print c_connected_nodes/c_subscriber_nodes (8010) storage/ndb/test/ndbapi/testNodeRestart.cpp: new testcase storage/ndb/test/run-test/daily-basic-tests.txt: new testcase
This commit is contained in:
@@ -67,6 +67,7 @@ private:
|
|||||||
enum ErrorCode
|
enum ErrorCode
|
||||||
{
|
{
|
||||||
ZNODE_ALREADY_STARTING_ERROR = 305,
|
ZNODE_ALREADY_STARTING_ERROR = 305,
|
||||||
|
ZNODE_START_DISALLOWED_ERROR = 309,
|
||||||
InitialStartRequired = 320
|
InitialStartRequired = 320
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
@@ -6,7 +6,7 @@ Next DBTUP 4029
|
|||||||
Next DBLQH 5045
|
Next DBLQH 5045
|
||||||
Next DBDICT 6007
|
Next DBDICT 6007
|
||||||
Next DBDIH 7183
|
Next DBDIH 7183
|
||||||
Next DBTC 8039
|
Next DBTC 8040
|
||||||
Next CMVMI 9000
|
Next CMVMI 9000
|
||||||
Next BACKUP 10038
|
Next BACKUP 10038
|
||||||
Next DBUTIL 11002
|
Next DBUTIL 11002
|
||||||
@@ -327,6 +327,8 @@ Test Crashes in handling node restarts
|
|||||||
|
|
||||||
7170: Crash when receiving START_PERMREF (InitialStartRequired)
|
7170: Crash when receiving START_PERMREF (InitialStartRequired)
|
||||||
|
|
||||||
|
8039: DBTC delay INCL_NODECONF and kill starting node
|
||||||
|
|
||||||
7174: Crash starting node before sending DICT_LOCK_REQ
|
7174: Crash starting node before sending DICT_LOCK_REQ
|
||||||
7175: Master sends one fake START_PERMREF (ZNODE_ALREADY_STARTING_ERROR)
|
7175: Master sends one fake START_PERMREF (ZNODE_ALREADY_STARTING_ERROR)
|
||||||
7176: Slave NR pretends master does not support DICT lock (rolling upgrade)
|
7176: Slave NR pretends master does not support DICT lock (rolling upgrade)
|
||||||
|
@@ -74,7 +74,6 @@
|
|||||||
#define ZWRONG_FAILURE_NUMBER_ERROR 302
|
#define ZWRONG_FAILURE_NUMBER_ERROR 302
|
||||||
#define ZWRONG_START_NODE_ERROR 303
|
#define ZWRONG_START_NODE_ERROR 303
|
||||||
#define ZNO_REPLICA_FOUND_ERROR 304
|
#define ZNO_REPLICA_FOUND_ERROR 304
|
||||||
#define ZNODE_START_DISALLOWED_ERROR 309
|
|
||||||
|
|
||||||
// --------------------------------------
|
// --------------------------------------
|
||||||
// Codes from LQH
|
// Codes from LQH
|
||||||
|
@@ -1709,7 +1709,8 @@ void Dbdih::execSTART_PERMREF(Signal* signal)
|
|||||||
{
|
{
|
||||||
jamEntry();
|
jamEntry();
|
||||||
Uint32 errorCode = signal->theData[1];
|
Uint32 errorCode = signal->theData[1];
|
||||||
if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) {
|
if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR ||
|
||||||
|
errorCode == StartPermRef::ZNODE_START_DISALLOWED_ERROR) {
|
||||||
jam();
|
jam();
|
||||||
/*-----------------------------------------------------------------------*/
|
/*-----------------------------------------------------------------------*/
|
||||||
// The master was busy adding another node. We will wait for a second and
|
// The master was busy adding another node. We will wait for a second and
|
||||||
@@ -2056,49 +2057,49 @@ void Dbdih::execINCL_NODECONF(Signal* signal)
|
|||||||
TstartNode_or_blockref = signal->theData[0];
|
TstartNode_or_blockref = signal->theData[0];
|
||||||
TsendNodeId = signal->theData[1];
|
TsendNodeId = signal->theData[1];
|
||||||
|
|
||||||
if (TstartNode_or_blockref == clocallqhblockref) {
|
static Uint32 blocklist[] = {
|
||||||
jam();
|
clocallqhblockref,
|
||||||
/*-----------------------------------------------------------------------*/
|
clocaltcblockref,
|
||||||
// THIS SIGNAL CAME FROM THE LOCAL LQH BLOCK.
|
cdictblockref,
|
||||||
// WE WILL NOW SEND INCLUDE TO THE TC BLOCK.
|
0,
|
||||||
/*-----------------------------------------------------------------------*/
|
0,
|
||||||
signal->theData[0] = reference();
|
0
|
||||||
signal->theData[1] = c_nodeStartSlave.nodeId;
|
};
|
||||||
sendSignal(clocaltcblockref, GSN_INCL_NODEREQ, signal, 2, JBB);
|
blocklist[3] = numberToRef(BACKUP, getOwnNodeId());
|
||||||
return;
|
blocklist[4] = numberToRef(SUMA, getOwnNodeId());
|
||||||
}//if
|
|
||||||
if (TstartNode_or_blockref == clocaltcblockref) {
|
|
||||||
jam();
|
|
||||||
/*----------------------------------------------------------------------*/
|
|
||||||
// THIS SIGNAL CAME FROM THE LOCAL LQH BLOCK.
|
|
||||||
// WE WILL NOW SEND INCLUDE TO THE DICT BLOCK.
|
|
||||||
/*----------------------------------------------------------------------*/
|
|
||||||
signal->theData[0] = reference();
|
|
||||||
signal->theData[1] = c_nodeStartSlave.nodeId;
|
|
||||||
sendSignal(cdictblockref, GSN_INCL_NODEREQ, signal, 2, JBB);
|
|
||||||
return;
|
|
||||||
}//if
|
|
||||||
if (TstartNode_or_blockref == cdictblockref) {
|
|
||||||
jam();
|
|
||||||
/*-----------------------------------------------------------------------*/
|
|
||||||
// THIS SIGNAL CAME FROM THE LOCAL DICT BLOCK. WE WILL NOW SEND CONF TO THE
|
|
||||||
// BACKUP.
|
|
||||||
/*-----------------------------------------------------------------------*/
|
|
||||||
signal->theData[0] = reference();
|
|
||||||
signal->theData[1] = c_nodeStartSlave.nodeId;
|
|
||||||
sendSignal(BACKUP_REF, GSN_INCL_NODEREQ, signal, 2, JBB);
|
|
||||||
|
|
||||||
// Suma will not send response to this for now, later...
|
Uint32 i = 0;
|
||||||
sendSignal(SUMA_REF, GSN_INCL_NODEREQ, signal, 2, JBB);
|
for (Uint32 i = 0; blocklist[i] != 0; i++)
|
||||||
return;
|
{
|
||||||
}//if
|
if (TstartNode_or_blockref == blocklist[i])
|
||||||
if (TstartNode_or_blockref == numberToRef(BACKUP, getOwnNodeId())){
|
{
|
||||||
jam();
|
jam();
|
||||||
signal->theData[0] = c_nodeStartSlave.nodeId;
|
if (getNodeStatus(c_nodeStartSlave.nodeId) == NodeRecord::ALIVE &&
|
||||||
signal->theData[1] = cownNodeId;
|
blocklist[i+1] != 0)
|
||||||
sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
|
{
|
||||||
c_nodeStartSlave.nodeId = 0;
|
/**
|
||||||
return;
|
* Send to next in block list
|
||||||
|
*/
|
||||||
|
jam();
|
||||||
|
signal->theData[0] = reference();
|
||||||
|
signal->theData[1] = c_nodeStartSlave.nodeId;
|
||||||
|
sendSignal(blocklist[i+1], GSN_INCL_NODEREQ, signal, 2, JBB);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* All done, reply to master
|
||||||
|
*/
|
||||||
|
jam();
|
||||||
|
signal->theData[0] = c_nodeStartSlave.nodeId;
|
||||||
|
signal->theData[1] = cownNodeId;
|
||||||
|
sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
|
||||||
|
|
||||||
|
c_nodeStartSlave.nodeId = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ndbrequire(cmasterdihref = reference());
|
ndbrequire(cmasterdihref = reference());
|
||||||
@@ -2217,7 +2218,7 @@ void Dbdih::execSTART_INFOREQ(Signal* signal)
|
|||||||
StartInfoRef *const ref =(StartInfoRef*)&signal->theData[0];
|
StartInfoRef *const ref =(StartInfoRef*)&signal->theData[0];
|
||||||
ref->startingNodeId = startNode;
|
ref->startingNodeId = startNode;
|
||||||
ref->sendingNodeId = cownNodeId;
|
ref->sendingNodeId = cownNodeId;
|
||||||
ref->errorCode = ZNODE_START_DISALLOWED_ERROR;
|
ref->errorCode = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
|
||||||
sendSignal(cmasterdihref, GSN_START_INFOREF, signal,
|
sendSignal(cmasterdihref, GSN_START_INFOREF, signal,
|
||||||
StartInfoRef::SignalLength, JBB);
|
StartInfoRef::SignalLength, JBB);
|
||||||
return;
|
return;
|
||||||
|
@@ -311,6 +311,19 @@ void Dbtc::execINCL_NODEREQ(Signal* signal)
|
|||||||
hostptr.p->hostStatus = HS_ALIVE;
|
hostptr.p->hostStatus = HS_ALIVE;
|
||||||
signal->theData[0] = cownref;
|
signal->theData[0] = cownref;
|
||||||
c_alive_nodes.set(hostptr.i);
|
c_alive_nodes.set(hostptr.i);
|
||||||
|
|
||||||
|
if (ERROR_INSERTED(8039))
|
||||||
|
{
|
||||||
|
CLEAR_ERROR_INSERT_VALUE;
|
||||||
|
Uint32 save = signal->theData[0];
|
||||||
|
signal->theData[0] = 9999;
|
||||||
|
sendSignal(numberToRef(CMVMI, hostptr.i),
|
||||||
|
GSN_NDB_TAMPER, signal, 1, JBB);
|
||||||
|
signal->theData[0] = save;
|
||||||
|
sendSignalWithDelay(tblockref, GSN_INCL_NODECONF, signal, 5000, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB);
|
sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -813,17 +813,14 @@ void
|
|||||||
Suma::execINCL_NODEREQ(Signal* signal){
|
Suma::execINCL_NODEREQ(Signal* signal){
|
||||||
jamEntry();
|
jamEntry();
|
||||||
|
|
||||||
//const Uint32 senderRef = signal->theData[0];
|
const Uint32 senderRef = signal->theData[0];
|
||||||
const Uint32 nodeId = signal->theData[1];
|
const Uint32 nodeId = signal->theData[1];
|
||||||
|
|
||||||
ndbrequire(!c_alive_nodes.get(nodeId));
|
ndbrequire(!c_alive_nodes.get(nodeId));
|
||||||
c_alive_nodes.set(nodeId);
|
c_alive_nodes.set(nodeId);
|
||||||
|
|
||||||
#if 0 // if we include this DIH's got to be prepared, later if needed...
|
|
||||||
signal->theData[0] = reference();
|
signal->theData[0] = reference();
|
||||||
|
|
||||||
sendSignal(senderRef, GSN_INCL_NODECONF, signal, 1, JBB);
|
sendSignal(senderRef, GSN_INCL_NODECONF, signal, 1, JBB);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -953,6 +950,15 @@ Suma::execDUMP_STATE_ORD(Signal* signal){
|
|||||||
CLEAR_ERROR_INSERT_VALUE;
|
CLEAR_ERROR_INSERT_VALUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tCase == 8010)
|
||||||
|
{
|
||||||
|
char buf1[255], buf2[255];
|
||||||
|
c_subscriber_nodes.getText(buf1);
|
||||||
|
c_connected_nodes.getText(buf2);
|
||||||
|
infoEvent("c_subscriber_nodes: %s", buf1);
|
||||||
|
infoEvent("c_connected_nodes: %s", buf2);
|
||||||
|
}
|
||||||
|
|
||||||
if (tCase == 8009)
|
if (tCase == 8009)
|
||||||
{
|
{
|
||||||
if (ERROR_INSERTED(13030))
|
if (ERROR_INSERTED(13030))
|
||||||
|
@@ -1423,6 +1423,56 @@ runBug27283(NDBT_Context* ctx, NDBT_Step* step)
|
|||||||
return NDBT_OK;
|
return NDBT_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
runBug27466(NDBT_Context* ctx, NDBT_Step* step)
|
||||||
|
{
|
||||||
|
int result = NDBT_OK;
|
||||||
|
int loops = ctx->getNumLoops();
|
||||||
|
int records = ctx->getNumRecords();
|
||||||
|
NdbRestarter res;
|
||||||
|
|
||||||
|
if (res.getNumDbNodes() < 2)
|
||||||
|
{
|
||||||
|
return NDBT_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
Uint32 pos = 0;
|
||||||
|
for (Uint32 i = 0; i<loops; i++)
|
||||||
|
{
|
||||||
|
int node1 = res.getDbNodeId(rand() % res.getNumDbNodes());
|
||||||
|
int node2 = node1;
|
||||||
|
while (node1 == node2)
|
||||||
|
{
|
||||||
|
node2 = res.getDbNodeId(rand() % res.getNumDbNodes());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (res.restartOneDbNode(node1, false, true, true))
|
||||||
|
return NDBT_FAILED;
|
||||||
|
|
||||||
|
if (res.waitNodesNoStart(&node1, 1))
|
||||||
|
return NDBT_FAILED;
|
||||||
|
|
||||||
|
int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
|
||||||
|
if (res.dumpStateOneNode(node1, val2, 2))
|
||||||
|
return NDBT_FAILED;
|
||||||
|
|
||||||
|
if (res.insertErrorInNode(node2, 8039))
|
||||||
|
return NDBT_FAILED;
|
||||||
|
|
||||||
|
res.startNodes(&node1, 1);
|
||||||
|
NdbSleep_SecSleep(3);
|
||||||
|
if (res.waitNodesNoStart(&node1, 1))
|
||||||
|
return NDBT_FAILED;
|
||||||
|
NdbSleep_SecSleep(5); // Wait for delayed INCL_NODECONF to arrive
|
||||||
|
|
||||||
|
res.startNodes(&node1, 1);
|
||||||
|
if (res.waitClusterStarted())
|
||||||
|
return NDBT_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
return NDBT_OK;
|
||||||
|
}
|
||||||
|
|
||||||
NDBT_TESTSUITE(testNodeRestart);
|
NDBT_TESTSUITE(testNodeRestart);
|
||||||
TESTCASE("NoLoad",
|
TESTCASE("NoLoad",
|
||||||
"Test that one node at a time can be stopped and then restarted "\
|
"Test that one node at a time can be stopped and then restarted "\
|
||||||
@@ -1774,6 +1824,9 @@ TESTCASE("Bug27003", ""){
|
|||||||
TESTCASE("Bug27283", ""){
|
TESTCASE("Bug27283", ""){
|
||||||
INITIALIZER(runBug27283);
|
INITIALIZER(runBug27283);
|
||||||
}
|
}
|
||||||
|
TESTCASE("Bug27466", ""){
|
||||||
|
INITIALIZER(runBug27466);
|
||||||
|
}
|
||||||
NDBT_TESTSUITE_END(testNodeRestart);
|
NDBT_TESTSUITE_END(testNodeRestart);
|
||||||
|
|
||||||
int main(int argc, const char** argv){
|
int main(int argc, const char** argv){
|
||||||
|
@@ -792,6 +792,10 @@ max-time: 1000
|
|||||||
cmd: testNodeRestart
|
cmd: testNodeRestart
|
||||||
args: -n Bug25468 T1
|
args: -n Bug25468 T1
|
||||||
|
|
||||||
|
max-time: 1000
|
||||||
|
cmd: testNodeRestart
|
||||||
|
args: -n Bug27466 T1
|
||||||
|
|
||||||
max-time: 1000
|
max-time: 1000
|
||||||
cmd: test_event
|
cmd: test_event
|
||||||
args: -l 10 -n Bug27169 T1
|
args: -l 10 -n Bug27169 T1
|
||||||
|
Reference in New Issue
Block a user