
MDEV-34998: master can stop responding after cluster vote to evict a node

After the cluster votes to evict a node that failed a transaction,
the current master can no longer commit.

Error voting for a joiner in the JOINED state was broken because the
group-wide commit cut (an implicit SUCCESS vote) was not taken into
account when processing an error vote request from the JOINED node.
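
To illustrate the reasoning above, here is a minimal, hypothetical C++ sketch of counting the group-wide commit cut as an implicit SUCCESS vote when an error vote arrives from a JOINED node. It is not the Galera implementation; the names (VoteTally, group_commit_cut, joiner_lost_vote) and the exact tallying rule are illustrative assumptions only.

#include <cstdint>
#include <iostream>

// Hypothetical sketch, not the actual Galera code.
// A member whose commit cut has advanced past a seqno has, in effect,
// voted SUCCESS for that write set even without an explicit vote message.
struct VoteTally
{
    int64_t group_commit_cut; // highest seqno known to be committed group-wide
    int     group_size;       // number of voting members

    // Called when a JOINED node reports an apply error for 'seqno'.
    // Returns true if the joiner lost the vote and must leave the cluster.
    bool joiner_lost_vote(int64_t seqno, int explicit_error_votes) const
    {
        int implicit_success_votes = 0;
        if (seqno <= group_commit_cut)
        {
            // Everyone except the erroring node(s) implicitly voted SUCCESS.
            implicit_success_votes = group_size - explicit_error_votes;
        }
        // Ignoring the implicit votes (the pre-fix behaviour) would leave
        // the voting round unresolved and the master unable to commit.
        return implicit_success_votes > explicit_error_votes;
    }
};

int main()
{
    VoteTally tally{/* group_commit_cut */ 390, /* group_size */ 4};
    // Joiner fails to apply seqno 300, which the group has already committed:
    std::cout << std::boolalpha
              << tally.joiner_lost_vote(300, 1) << '\n'; // prints "true"
}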

This commit adds 3 MTR tests to verify that the fix in the Galera
library works as designed.

Requires Galera library commit 91f0090a05e96c3cc353b80d961ede45cefb9279
(Galera library version > 26.4.19).

Signed-off-by: Julius Goryavsky <julius.goryavsky@mariadb.com>
Authored by Alexey Yurchenko on 2024-06-07 00:52:29 +03:00, committed by Julius Goryavsky
parent cb7e39b75b, commit ec5068fe59
12 changed files with 810 additions and 0 deletions


@@ -0,0 +1,109 @@
connection node_4;
connection node_3;
connection node_2;
connection node_1;
connection node_1;
connection node_2;
connection node_3;
connection node_4;
connection node_1;
CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY);
CREATE PROCEDURE p1(IN max INT)
BEGIN
DECLARE i INT;
DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END;
SET i = 0;
WHILE i < max DO
INSERT IGNORE INTO t1 VALUES (DEFAULT);
SET i = i + 1;
END WHILE;
END|
CALL p1(130);
connection node_4;
Shutting down server 4...
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_2;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_3;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
Server 4 left the cluster
connection node_1;
CALL p1(130);
connection node_1;
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
connection node_2;
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
connection node_3;
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
INSERT INTO t2 VALUES (DEFAULT);
CALL p1(130);
connection node_1;
SET GLOBAL debug = "+d,sync.wsrep_sst_donor_after_donation";
Restarting server 4
Wait for server 1 to become a donor
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached";
Server 1 got SST request from server 4
SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_sst_donor_after_donation_continue";
SET GLOBAL debug = "";
SET DEBUG_SYNC='RESET';
Waiting for server 4 to leave the cluster
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_2;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_3;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_4;
Server 4 left the cluster, killing it...
Killed server 4...
Restarting server 4...
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_1;
SELECT count(*) AS expect1_390 FROM t1;
expect1_390
390
SELECT count(*) AS expect1_1 FROM t2;
expect1_1
1
connection node_2;
SELECT count(*) AS expect2_390 FROM t1;
expect2_390
390
SELECT count(*) AS expect2_1 FROM t2;
expect2_1
1
connection node_3;
SELECT count(*) AS expect3_390 FROM t1;
expect3_390
390
SELECT count(*) AS expect3_1 FROM t2;
expect3_1
1
connection node_4;
SELECT count(*) AS expect4_390 FROM t1;
expect4_390
390
SELECT count(*) AS expect4_1 FROM t2;
expect4_1
1
DROP TABLE t1;
DROP TABLE t2;
DROP PROCEDURE p1;
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
CALL mtr.add_suppression("Inconsistency detected: Failed on preordered");
CALL mtr.add_suppression("Failed to apply write set");


@@ -0,0 +1,93 @@
connection node_4;
connection node_3;
connection node_2;
connection node_1;
connection node_1;
connection node_2;
connection node_3;
connection node_4;
connection node_1;
CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY);
CREATE PROCEDURE p1(IN max INT)
BEGIN
DECLARE i INT;
DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END;
SET i = 0;
WHILE i < max DO
INSERT IGNORE INTO t1 VALUES (DEFAULT);
SET i = i + 1;
END WHILE;
END|
CALL p1(130);
connection node_4;
Shutting down server 4...
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
SET GLOBAL debug = "+d,sync.wsrep_donor_state";
connection node_4;
Restarting server 4...
connection node_1;
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached";
Tables on server 1 flushed and locked for SST to server 4
SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state";
SET GLOBAL debug = "";
SET DEBUG_SYNC='RESET';
Wait for the state snapshot to be copied to server 4
SST script unlocked server 1
connection node_1;
CALL p1(130);
connection node_1;
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
connection node_2;
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
connection node_3;
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
INSERT INTO t2 VALUES (DEFAULT);
CALL p1(130);
Waiting for server 4 to leave the cluster
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_2;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_4;
Server 4 left the cluster, killing it...
Killed server 4...
Restarting server 4...
DROP TABLE t2;
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_1;
SELECT count(*) AS expect1_390 FROM t1;
expect1_390
390
connection node_2;
SELECT count(*) AS expect2_390 FROM t1;
expect2_390
390
connection node_3;
SELECT count(*) AS expect3_390 FROM t1;
expect3_390
390
connection node_4;
SELECT count(*) AS expect4_390 FROM t1;
expect4_390
390
DROP TABLE t1;
DROP PROCEDURE p1;
connection node_4;
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
CALL mtr.add_suppression("Inconsistency detected: Inconsistent by consensus");
CALL mtr.add_suppression("Failed to apply write set: gtid:");


@@ -0,0 +1,101 @@
connection node_4;
connection node_3;
connection node_2;
connection node_1;
connection node_1;
connection node_2;
connection node_3;
connection node_4;
connection node_1;
CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY);
CREATE PROCEDURE p1(IN max INT)
BEGIN
DECLARE i INT;
DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END;
SET i = 0;
WHILE i < max DO
INSERT IGNORE INTO t1 VALUES (DEFAULT);
SET i = i + 1;
END WHILE;
END|
CALL p1(130);
connection node_4;
Shutting down server 4...
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
SET GLOBAL debug = "+d,sync.wsrep_donor_state";
connection node_4;
Restarting server 4...
connection node_1;
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached";
Tables on server 1 flushed and locked for SST to server 4
SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state";
SET GLOBAL debug = "";
SET DEBUG_SYNC='RESET';
Wait for the state snapshot to be copied to server 4
SST script unlocked server 1
connection node_1;
CALL p1(130);
connection node_3;
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
INSERT INTO t2 VALUES (DEFAULT);
SET SESSION wsrep_on = OFF;
connection node_1;
CALL p1(130);
Waiting for server 3 to leave the cluster
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_2;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_4;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_3;
Server 3 left the cluster, killing it...
Killed server 3.
Restarting server 3...
Waiting for server 3 to rejoin the cluster
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_3;
sleeping for 20
Waiting ready
Server 3 restarted.
connection node_1;
SET SESSION wsrep_on = ON;
SET SESSION wsrep_sync_wait = 15;
connection node_1;
SELECT count(*) AS expect1_390 FROM t1;
expect1_390
390
connection node_2;
SELECT count(*) AS expect2_390 FROM t1;
expect2_390
390
connection node_3;
SELECT count(*) AS expect3_390 FROM t1;
expect3_390
390
connection node_4;
SELECT count(*) AS expect4_390 FROM t1;
expect4_390
390
DROP TABLE t1;
DROP PROCEDURE p1;
connection node_1;
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
connection node_2;
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
connection node_3;
CALL mtr.add_suppression("Vote 0 \\(success\\) on .* is inconsistent with group");
connection node_4;
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");


@@ -0,0 +1,20 @@
!include ../galera_4nodes.cnf
[mysqld]
wsrep-ignore-apply-errors=0
[mysqld.1]
wsrep_node_name='node_1'
[mysqld.2]
wsrep_node_name='node_2'
[mysqld.3]
wsrep_node_name='node_3'
[mysqld.4]
wsrep_node_name='node_4'
wsrep_sst_donor='node_1'
[ENV]
galera_cluster_size=4


@@ -0,0 +1,158 @@
#
# Test a case where a joiner encounters an error during IST.
# Instead of voting, it should assume the error and bail out.
#
--source include/galera_cluster.inc
--source include/big_test.inc
--source include/have_debug_sync.inc
--let $node_1=node_1
--let $node_2=node_2
--let $node_3=node_3
--let $node_4=node_4
--source ../include/auto_increment_offset_save.inc
# create table t1 and procedure p1 to generate writesets
--connection node_1
CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY);
DELIMITER |;
CREATE PROCEDURE p1(IN max INT)
BEGIN
DECLARE i INT;
DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END;
SET i = 0;
WHILE i < max DO
INSERT IGNORE INTO t1 VALUES (DEFAULT);
SET i = i + 1;
END WHILE;
END|
DELIMITER ;|
CALL p1(130);
--connection node_4
--echo Shutting down server 4...
--let $node_4_server_id= `SELECT @@server_id`
--let $node_4_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_4_server_id.expect
--let $node_4_pid_file= `SELECT @@pid_file`
--source include/shutdown_mysqld.inc
# Wait for node #4 to leave cluster
--let $members = 3
--connection node_1
--source include/wsrep_wait_membership.inc
--connection node_2
--source include/wsrep_wait_membership.inc
--connection node_3
--source include/wsrep_wait_membership.inc
--echo Server 4 left the cluster
# Create some writesets for IST
--connection node_1
CALL p1(130);
# Create a writeset that node 4 won't be able to apply by creating a table
# that won't be present in the replication stream
--connection node_1
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
--connection node_2
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
--connection node_3
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
# This should cause an error during IST
INSERT INTO t2 VALUES (DEFAULT);
# make sure nodes 1,2,3 progress far enough for commit cut update
CALL p1(130);
--connection node_1
# prepare to stop SST donor thread when it receives a request from starting node #4
SET GLOBAL debug = "+d,sync.wsrep_sst_donor_after_donation";
--echo Restarting server 4
# Need to use this form instead of start_mysqld.inc because the latter is blocking
--exec echo "restart:$start_mysqld_params" > $node_4_expect_file_name
--echo Wait for server 1 to become a donor
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached";
--echo Server 1 got SST request from server 4
SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_sst_donor_after_donation_continue";
SET GLOBAL debug = "";
SET DEBUG_SYNC='RESET';
#
# After this point node #4 shall proceed to IST and bail out
#
--echo Waiting for server 4 to leave the cluster
--let $members = 3
--source include/wsrep_wait_membership.inc
--connection node_2
--source include/wsrep_wait_membership.inc
--connection node_3
--source include/wsrep_wait_membership.inc
--connection node_4
--echo Server 4 left the cluster, killing it...
# Kill the connected server
--exec echo "wait" > $node_4_expect_file_name
--let KILL_NODE_PIDFILE = $node_4_pid_file
--perl
my $pid_filename = $ENV{'KILL_NODE_PIDFILE'};
my $mysqld_pid = `cat $pid_filename`;
chomp($mysqld_pid);
system("kill -9 $mysqld_pid");
exit(0);
EOF
--echo Killed server 4...
--source include/wait_until_disconnected.inc
--echo Restarting server 4...
--source include/start_mysqld.inc
--source include/galera_wait_ready.inc
# Confirm node #4 has rejoined
--connection node_1
--let $members = 4
--source include/wsrep_wait_membership.inc
# Confirm that all is good and all nodes have identical data
--connection node_1
SELECT count(*) AS expect1_390 FROM t1;
SELECT count(*) AS expect1_1 FROM t2;
--connection node_2
SELECT count(*) AS expect2_390 FROM t1;
SELECT count(*) AS expect2_1 FROM t2;
--connection node_3
SELECT count(*) AS expect3_390 FROM t1;
SELECT count(*) AS expect3_1 FROM t2;
--connection node_4
SELECT count(*) AS expect4_390 FROM t1;
SELECT count(*) AS expect4_1 FROM t2;
DROP TABLE t1;
DROP TABLE t2;
DROP PROCEDURE p1;
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
CALL mtr.add_suppression("Inconsistency detected: Failed on preordered");
CALL mtr.add_suppression("Failed to apply write set");
--source ../include/auto_increment_offset_restore.inc


@@ -0,0 +1,21 @@
!include ../galera_4nodes.cnf
[mysqld]
wsrep-ignore-apply-errors=0
[mysqld.1]
wsrep_node_name='node_1'
[mysqld.2]
wsrep_node_name='node_2'
[mysqld.3]
wsrep_node_name='node_3'
[mysqld.4]
wsrep_node_name='node_4'
wsrep_sst_donor='node_1'
[ENV]
galera_cluster_size=4
MTR_SST_JOINER_DELAY=20


@@ -0,0 +1,73 @@
#
# Test a case where a vote happens in JOINED state after SST on a writeset
# that should be applied.
#
--source galera_vote_joined_begin.inc
#
# At this point the state snapshot has been copied, node 1 is operational and
# we have about 10 seconds during which everything we do will go into the replication
# queue on node 4, which it will have to apply on top of the snapshot.
#
# Increase replication queue on node_4
--connection node_1
CALL p1(130);
# Create a writeset that node 4 won't be able to apply by creating a table
# that won't be present in the replication stream
--connection node_1
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
--connection node_2
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
--connection node_3
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
# This should cause node #4 to initiate a vote and leave the cluster
INSERT INTO t2 VALUES (DEFAULT);
# make sure nodes 1,2,3 progress far enough for commit cut update
CALL p1(130);
--echo Waiting for server 4 to leave the cluster
--let $members = 3
--source include/wsrep_wait_membership.inc
--connection node_2
--source include/wsrep_wait_membership.inc
--connection node_1
--source include/wsrep_wait_membership.inc
--connection node_4
--echo Server 4 left the cluster, killing it...
# Kill the connected server
--exec echo "wait" > $node_4_expect_file_name
--let KILL_NODE_PIDFILE = $node_4_pid_file
--perl
my $pid_filename = $ENV{'KILL_NODE_PIDFILE'};
my $mysqld_pid = `cat $pid_filename`;
chomp($mysqld_pid);
system("kill -9 $mysqld_pid");
exit(0);
EOF
--echo Killed server 4...
--source include/wait_until_disconnected.inc
--echo Restarting server 4...
--source include/start_mysqld.inc
--source include/galera_wait_ready.inc
DROP TABLE t2;
--source galera_vote_joined_end.inc
--connection node_4
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
CALL mtr.add_suppression("Inconsistency detected: Inconsistent by consensus");
CALL mtr.add_suppression("Failed to apply write set: gtid:");


@@ -0,0 +1,74 @@
# The purpose of this file is to set up node 4 to require an SST which is
# artificially prolonged so that node 4 accumulates a sufficient replication queue.
# The contents of the queue are controlled by the sourcing test files.
--source include/galera_cluster.inc
--source include/big_test.inc
--source include/have_debug_sync.inc
--let $node_1=node_1
--let $node_2=node_2
--let $node_3=node_3
--let $node_4=node_4
--source ../include/auto_increment_offset_save.inc
# create table t1 and procedure p1 to generate writesets
--connection node_1
CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY);
DELIMITER |;
CREATE PROCEDURE p1(IN max INT)
BEGIN
DECLARE i INT;
DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END;
SET i = 0;
WHILE i < max DO
INSERT IGNORE INTO t1 VALUES (DEFAULT);
SET i = i + 1;
END WHILE;
END|
DELIMITER ;|
# 130 events move the commit cut, which is essential for voting
CALL p1(130);
--connection node_4
--echo Shutting down server 4...
--let $node_4_server_id= `SELECT @@server_id`
--let $node_4_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_4_server_id.expect
--let $node_4_pid_file= `SELECT @@pid_file`
--source include/shutdown_mysqld.inc
# force SST
--exec rm -rf $MYSQLTEST_VARDIR/mysqld.4/data/grastate.dat
# Wait for node #4 to leave cluster
--connection node_1
--let $members = 3
--source include/wsrep_wait_membership.inc
# prepare to stop SST donor thread when node is in donor state
SET GLOBAL debug = "+d,sync.wsrep_donor_state";
--connection node_4
--echo Restarting server 4...
# Need to use this form instead of start_mysqld.inc because the latter is blocking
--exec echo "restart:$start_mysqld_params" > $node_4_expect_file_name
# Wait for node #1 to become a donor
--connection node_1
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached";
--echo Tables on server 1 flushed and locked for SST to server 4
SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state";
SET GLOBAL debug = "";
SET DEBUG_SYNC='RESET';
--echo Wait for the state snapshot to be copied to server 4
--source include/galera_wait_ready.inc
--echo SST script unlocked server 1
#
# At this point the state snapshot has been copied, node 1 is operational and
# we have about 20 seconds during which everything we do will go into the replication
# queue on node 4, which it will have to apply on top of the snapshot.
#


@@ -0,0 +1,33 @@
# Confirm node #4 has rejoined
--connection node_1
--let $members = 4
--source include/wsrep_wait_membership.inc
#DROP TABLE IF EXISTS t2;
# Confirm that all is good and all nodes have identical data
--connection node_1
SELECT count(*) AS expect1_390 FROM t1;
#CALL mtr.add_suppression("Replica SQL: Could not execute Delete_rows");
#CALL mtr.add_suppression("Event 3 Delete_rows apply failed: 120, seqno [0-9]*");
--connection node_2
SELECT count(*) AS expect2_390 FROM t1;
#CALL mtr.add_suppression("mysqld: Can't find record in 't1'");
#CALL mtr.add_suppression("Replica SQL: Could not execute Delete_rows");
#CALL mtr.add_suppression("Event 3 Delete_rows apply failed: 120, seqno seqno [0-9]*");
--connection node_3
SELECT count(*) AS expect3_390 FROM t1;
--connection node_4
SELECT count(*) AS expect4_390 FROM t1;
DROP TABLE t1;
DROP PROCEDURE p1;
#CALL mtr.add_suppression("inconsistent with group");
--source ../include/auto_increment_offset_restore.inc


@@ -0,0 +1,21 @@
!include ../galera_4nodes.cnf
[mysqld]
wsrep-ignore-apply-errors=0
[mysqld.1]
wsrep_node_name='node_1'
[mysqld.2]
wsrep_node_name='node_2'
[mysqld.3]
wsrep_node_name='node_3'
[mysqld.4]
wsrep_node_name='node_4'
wsrep_sst_donor='node_1'
[ENV]
galera_cluster_size=4
MTR_SST_JOINER_DELAY=20


@@ -0,0 +1,100 @@
#
# Test a case where a vote happens in JOINED state after SST on a writeset
# that should be skipped, i.e. the JOINED node should continue operating.
#
--source galera_vote_joined_begin.inc
#
# At this point the state snapshot has been copied, node 1 is operational and
# we have about 10 seconds during which everything we do will go into the replication
# queue on node 4, which it will have to apply on top of the snapshot.
#
# Increase replication queue on node_4
--connection node_1
CALL p1(130);
#
# Create a writeset that node 4 won't be able to apply by making node 3
# inconsistent
#
--connection node_3
--let $node_3_server_id= `SELECT @@server_id`
--let $node_3_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_3_server_id.expect
--let $node_3_pid_file= `SELECT @@pid_file`
SET SESSION wsrep_on = OFF;
CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY);
SET SESSION wsrep_on = ON;
# This should cause nodes #1 and #2 to initiate a vote and kick node #3
# out of the cluster; node #4 should recover the vote when it fails to apply
# the event and continue
INSERT INTO t2 VALUES (DEFAULT);
SET SESSION wsrep_on = OFF;
# make sure nodes 1,2 progress far enough for commit cut update
--connection node_1
CALL p1(130);
--let $members = 3
--echo Waiting for server 3 to leave the cluster
--connection node_1
--source include/wsrep_wait_membership.inc
--connection node_2
--source include/wsrep_wait_membership.inc
--connection node_4
# need to wait for extra SST delay on joiner
--sleep $MTR_SST_JOINER_DELAY
--sleep $MTR_SST_JOINER_DELAY
--enable_reconnect
--let $wait_timeout = 60
--source include/wsrep_wait_membership.inc
--connection node_3
--echo Server 3 left the cluster, killing it...
# Kill the connected server
--exec echo "wait" > $node_3_expect_file_name
--let KILL_NODE_PIDFILE = $node_3_pid_file
--perl
my $pid_filename = $ENV{'KILL_NODE_PIDFILE'};
my $mysqld_pid = `cat $pid_filename`;
chomp($mysqld_pid);
system("kill -9 $mysqld_pid");
exit(0);
EOF
--echo Killed server 3.
--source include/wait_until_disconnected.inc
--echo Restarting server 3...
--exec echo "restart:$start_mysqld_params" > $node_3_expect_file_name
--echo Waiting for server 3 to rejoin the cluster
--connection node_1
--let $members = 3
--source include/wsrep_wait_membership.inc
--connection node_3
--echo sleeping for $MTR_SST_JOINER_DELAY
# need to wait for extra SST delay on joiner
--sleep $MTR_SST_JOINER_DELAY
--sleep $MTR_SST_JOINER_DELAY
--echo Waiting ready
--enable_reconnect
--source include/galera_wait_ready.inc
--echo Server 3 restarted.
--source galera_vote_joined_end.inc
--connection node_1
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
--connection node_2
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");
--connection node_3
CALL mtr.add_suppression("Vote 0 \\(success\\) on .* is inconsistent with group");
--connection node_4
CALL mtr.add_suppression("BF applier failed to open_and_lock_tables: 1146");
CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146");


@@ -915,6 +915,13 @@ EOF
fi
fi
# Delay for MTR tests if needed to simulate long SST
if [ ${MTR_SST_JOINER_DELAY:=0} -gt 0 ]
then
wsrep_log_info "Sleeping $MTR_SST_JOINER_DELAY seconds for MTR test"
sleep $MTR_SST_JOINER_DELAY
fi
# Remove special tags from the magic file, and from the output:
coords=$(head -n1 "$MAGIC_FILE")
wsrep_log_info "Galera co-ords from recovery: $coords"