1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-08 11:22:35 +03:00

MDEV-11853: semisync thread can be killed after sync binlog but before ACK in the sync state

Problem:
========
If a primary is shutdown during an active semi-sync connection
during the period when the primary is awaiting an ACK, the primary
hard kills the active communication thread and does not ensure the
transaction was received by a replica. This can lead to an
inconsistent replication state.

Solution:
========
During shutdown, the primary should wait for an ACK or timeout
before hard killing a thread which is awaiting a communication. We
extend the `SHUTDOWN WAIT FOR SLAVES` logic to identify and ignore
any threads waiting for a semi-sync ACK in phase 1. Then, before
stopping the ack receiver thread, the shutdown is delayed until all
waiting semi-sync connections receive an ACK or time out. The
connections are then killed in phase 2.

Notes:
 1) There remains an unresolved corner case that affects this
patch. MDEV-28141: Slave crashes with Packets out of order when
connecting to a shutting down master. Specifically, If a slave is
connecting to a master which is actively shutting down, the slave
can crash with a "Packets out of order" assertion error. To get
around this issue in the MTR tests, the primary will wait a small
amount of time before phase 1 killing threads to let the replicas
safely stop (if applicable).
 2) This patch also fixes MDEV-28114: Semi-sync Master ACK Receiver
Thread Can Error on COM_QUIT

Reviewed By
============
Andrei Elkin <andrei.elkin@mariadb.com>
This commit is contained in:
Brandon Nesterenko
2021-10-20 20:13:45 -06:00
parent 807945f2eb
commit a83c7ab1ea
13 changed files with 1091 additions and 22 deletions

View File

@@ -0,0 +1,517 @@
#############################
# Common setup for all tests
#############################
# Note: Simulated slave delay is hardcoded to 800 milliseconds
# Note: Simulated master shutdown delay is hardcoded to 500 milliseconds
include/rpl_init.inc [topology=1->2, 1->3]
connection server_1;
# Slaves which simulate an error will produce a timeout on the primary
call mtr.add_suppression("Timeout waiting");
call mtr.add_suppression("did not exit");
# Suppress slave errors related to the simulated error
connection server_2;
call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
connection server_3;
call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
connection server_1;
CREATE TABLE t1 (a int);
connection server_2;
connection server_3;
connect server_1_con2, localhost, root,,;
#############################
# Test cases
#############################
#
# Test Case 1) If both replicas simulate a delay that is within the
# allowed timeout, the primary should delay killing the suspended thread
# until an ACK is received (Rpl_semi_sync_master_yes_tx should be 1).
#
connection server_1;
#--
#-- Semi-sync Setup
connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
#-- Enable semi-sync on master
connection server_1;
SET @@GLOBAL.rpl_semi_sync_master_enabled = 1;
set @@global.rpl_semi_sync_master_timeout= 1600;
#-- Wait for master to recognize semi-sync slaves
connection server_1;
#-- Master should have semi-sync enabled with 2 connections
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status ON
show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
#--
#-- Test begins
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1)
show status like 'Rpl_semi_sync_master_yes_tx';
Variable_name Value
Rpl_semi_sync_master_yes_tx 1
show status like 'Rpl_semi_sync_master_no_tx';
Variable_name Value
Rpl_semi_sync_master_no_tx 0
connection server_1_con2;
# Check logs to ensure shutdown was delayed
FOUND 1 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err
# Validate slave data is in correct state
connection server_2;
select count(*)=1 from t1;
count(*)=1
1
connection server_3;
select count(*)=1 from t1;
count(*)=1
1
#
#-- Re-synchronize slaves with master and disable semi-sync
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
connection server_3;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
#
# Test Case 2) If both replicas simulate an error before sending an ACK,
# the primary should delay killing the suspended thread until the
# timeout is reached (Rpl_semi_sync_master_no_tx should be 1).
#
connection server_1;
#--
#-- Semi-sync Setup
connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
#-- Enable semi-sync on master
connection server_1;
SET @@GLOBAL.rpl_semi_sync_master_enabled = 1;
set @@global.rpl_semi_sync_master_timeout= 500;
#-- Wait for master to recognize semi-sync slaves
connection server_1;
#-- Master should have semi-sync enabled with 2 connections
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status ON
show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "+d,mysqld_delay_kill_threads_phase_1";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event";
#--
#-- Test begins
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1)
show status like 'Rpl_semi_sync_master_yes_tx';
Variable_name Value
Rpl_semi_sync_master_yes_tx 0
show status like 'Rpl_semi_sync_master_no_tx';
Variable_name Value
Rpl_semi_sync_master_no_tx 1
connection server_1_con2;
# Check logs to ensure shutdown was delayed
FOUND 2 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err
# Validate slave data is in correct state
connection server_2;
select count(*)=0 from t1;
count(*)=0
1
connection server_3;
select count(*)=0 from t1;
count(*)=0
1
#
#-- Re-synchronize slaves with master and disable semi-sync
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
connection server_3;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
#
# Test Case 3) If one replica simulates a delay within the allowed
# timeout and the other simulates an error before sending an ACK, the
# primary should delay killing the suspended thread until it receives an
# ACK from the delayed slave (Rpl_semi_sync_master_yes_tx should be 1).
#
connection server_1;
#--
#-- Semi-sync Setup
connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
#-- Enable semi-sync on master
connection server_1;
SET @@GLOBAL.rpl_semi_sync_master_enabled = 1;
set @@global.rpl_semi_sync_master_timeout= 1600;
#-- Wait for master to recognize semi-sync slaves
connection server_1;
#-- Master should have semi-sync enabled with 2 connections
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status ON
show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "+d,mysqld_delay_kill_threads_phase_1";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
#--
#-- Test begins
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1)
show status like 'Rpl_semi_sync_master_yes_tx';
Variable_name Value
Rpl_semi_sync_master_yes_tx 1
show status like 'Rpl_semi_sync_master_no_tx';
Variable_name Value
Rpl_semi_sync_master_no_tx 0
connection server_1_con2;
# Check logs to ensure shutdown was delayed
FOUND 3 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err
# Validate slave data is in correct state
connection server_2;
select count(*)=0 from t1;
count(*)=0
1
connection server_3;
select count(*)=1 from t1;
count(*)=1
1
#
#-- Re-synchronize slaves with master and disable semi-sync
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
connection server_3;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
#
# Test Case 4) If a replica errors before sending an ACK, it will cause
# the IO thread to stop and handle the error. During error handling, if
# semi-sync is active, the replica will form a new connection with the
# primary to kill the active connection. However, if the primary is
# shutting down, it may kill the new connection, thereby leaving the
# active semi-sync connection in-tact. The slave should notice this, and
# not issue a `QUIT` command to the primary, which would otherwise be
# sent to kill an active connection. This test case validates that the
# slave does not send a `QUIT` in this case (Rpl_semi_sync_master_yes_tx
# should be 1 because server_3 will send the ACK within a valid timeout).
#
connection server_1;
#--
#-- Semi-sync Setup
connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
#-- Enable semi-sync on master
connection server_1;
SET @@GLOBAL.rpl_semi_sync_master_enabled = 1;
set @@global.rpl_semi_sync_master_timeout= 1600;
#-- Wait for master to recognize semi-sync slaves
connection server_1;
#-- Master should have semi-sync enabled with 2 connections
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status ON
show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "+d,mysqld_delay_kill_threads_phase_1";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event,slave_delay_killing_semisync_connection";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
#--
#-- Test begins
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1)
show status like 'Rpl_semi_sync_master_yes_tx';
Variable_name Value
Rpl_semi_sync_master_yes_tx 1
show status like 'Rpl_semi_sync_master_no_tx';
Variable_name Value
Rpl_semi_sync_master_no_tx 0
connection server_1_con2;
# Check logs to ensure shutdown was delayed
FOUND 4 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err
# Validate slave data is in correct state
connection server_2;
select count(*)=0 from t1;
count(*)=0
1
connection server_3;
select count(*)=1 from t1;
count(*)=1
1
#
#-- Re-synchronize slaves with master and disable semi-sync
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
connection server_3;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status OFF
SELECT COUNT(*)=0 from t1;
COUNT(*)=0
1
#############################
# Cleanup
#############################
connection server_2;
include/stop_slave.inc
include/start_slave.inc
connection server_3;
include/stop_slave.inc
include/start_slave.inc
connection server_1;
drop table t1;
include/rpl_end.inc

View File

@@ -0,0 +1,14 @@
!include ../my.cnf
[mysqld.1]
log_warnings=9
[mysqld.2]
log_warnings=9
[mysqld.3]
log_warnings=9
[ENV]
SERVER_MYPORT_3= @mysqld.3.port
SERVER_MYSOCK_3= @mysqld.3.socket

View File

@@ -0,0 +1,163 @@
#
# Helper file to ensure that a primary waits for all ACKS (or timeout) from its
# replicas before shutting down.
#
# Parameters:
# server_1_dbug (string) Debug setting for primary (server 1)
# server_2_dbug (string) Debug setting to simulate delay or error on
# the first replica (server 2)
# server_3_dbug (string) Debug setting to simulate delay or error on
# the second replica (server 3)
# semisync_timeout (int) Rpl_semi_sync_master_timeout to use
# server_2_expect_row_count (int) The number of rows expected on the first
# replica after the shutdown
# server_3_expect_row_count (int) The number of rows expected on the second
# replica after the shutdown
#
--connection server_1
let $log_error_file= `SELECT @@GLOBAL.log_error`;
--echo #--
--echo #-- Semi-sync Setup
--connection server_1
--save_master_pos
echo #-- Enable semi-sync on slaves
let slave_last= 3;
--let i= 2
while (`SELECT $i <= $slave_last`)
{
--connection server_$i
--sync_with_master
set global rpl_semi_sync_slave_enabled = 1;
source include/stop_slave.inc;
source include/start_slave.inc;
show status like 'Rpl_semi_sync_slave_status';
--inc $i
}
--echo #-- Enable semi-sync on master
--connection server_1
SET @@GLOBAL.rpl_semi_sync_master_enabled = 1;
--eval set @@global.rpl_semi_sync_master_timeout= $semisync_timeout
--echo #-- Wait for master to recognize semi-sync slaves
--connection server_1
let $status_var= Rpl_semi_sync_master_clients;
let $status_var_value= 2;
source include/wait_for_status_var.inc;
--echo #-- Master should have semi-sync enabled with 2 connections
show status like 'Rpl_semi_sync_master_status';
show status like 'Rpl_semi_sync_master_clients';
--echo #-- Prepare servers to simulate delay or error
--connection server_1
--eval SET @@GLOBAL.debug_dbug= $server_1_dbug
--connection server_2
--eval SET @@GLOBAL.debug_dbug= $server_2_dbug
--connection server_3
--eval SET @@GLOBAL.debug_dbug= $server_3_dbug
--echo #--
--echo #-- Test begins
--connection server_1
--echo #-- Begin semi-sync transaction
--send INSERT INTO t1 VALUES (1)
--connection server_1_con2
--echo #-- Wait until master recognizes a connection is awaiting semi-sync ACK
let $status_var= Rpl_semi_sync_master_wait_sessions;
let $status_var_value= 1;
source include/wait_for_status_var.inc;
show status like 'Rpl_semi_sync_master_wait_sessions';
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
wait
EOF
--echo #-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
--echo #-- Begin master shutdown
--send SHUTDOWN WAIT FOR ALL SLAVES
--connection server_1
--reap
--echo #-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1)
show status like 'Rpl_semi_sync_master_yes_tx';
show status like 'Rpl_semi_sync_master_no_tx';
--connection server_1_con2
--reap
--source include/wait_until_disconnected.inc
--echo # Check logs to ensure shutdown was delayed
--let SEARCH_FILE=$log_error_file
--let SEARCH_PATTERN=Delaying shutdown to await semi-sync ACK
--source include/search_pattern_in_file.inc
--echo # Validate slave data is in correct state
--connection server_2
--eval select count(*)=$server_2_expect_row_count from t1
--connection server_3
--eval select count(*)=$server_3_expect_row_count from t1
--echo #
--echo #-- Re-synchronize slaves with master and disable semi-sync
--echo #-- Stop slaves
--connection server_2
--eval SET @@GLOBAL.debug_dbug= "$sav_server_2_dbug"
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0
source include/stop_slave.inc;
--connection server_3
--eval SET @@GLOBAL.debug_dbug= "$sav_server_3_dbug"
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0
source include/stop_slave.inc;
--echo #-- Bring the master back up
--connection server_1_con2
--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
restart
EOF
--enable_reconnect
--source include/wait_until_connected_again.inc
--connection default
--enable_reconnect
--source include/wait_until_connected_again.inc
--connection server_1
--enable_reconnect
--source include/wait_until_connected_again.inc
--eval SET @@GLOBAL.debug_dbug= "$sav_master_dbug"
let $status_var= Rpl_semi_sync_master_clients;
let $status_var_value= 0;
source include/wait_for_status_var.inc;
--eval SET @@GLOBAL.rpl_semi_sync_master_enabled = 0
show status like 'Rpl_semi_sync_master_status';
TRUNCATE TABLE t1;
--save_master_pos
--echo #-- Bring slaves back up
--let i= 2
while (`SELECT $i <= $slave_last`)
{
--connection server_$i
source include/start_slave.inc;
show status like 'Rpl_semi_sync_slave_status';
--sync_with_master
SELECT COUNT(*)=0 from t1;
--inc $i
}

View File

@@ -0,0 +1,214 @@
#
# Purpose:
# This test validates that data is consistent between a primary and replica
# in semi-sync mode when the primary is issued `SHUTDOWN WAIT FOR SLAVES`
# during an active communication. More specifically, the primary should not
# kill the connection until it is sure a replica has received all binlog
# data, i.e. once the primary receives the ACK. If a primary is issued a
# shutdown before receiving an ACK, it should wait until either 1) the ACK is
# received, or 2) the configured timeout (rpl_semi_sync_master_timeout) is
# reached.
#
# Methodology:
# Using a topology consisting of one primary with two replicas, all in
# semi-sync mode, we use DEBUG_DBUG to simulate an error or delay on the
# replicas during an active communication while the primary is issued
# `SHUTDOWN WAIT FOR SLAVES`. We create four test cases to ensure the primary
# will correctly wait for the communication to finish, and use the semi-sync
# status variables Rpl_semi_sync_master_yes_tx and Rpl_semi_sync_master_no_tx
# to ensure the connection was not prematurely killed due to the shutdown.
# Test Case 1) If both replicas simulate a delay that is within the allowed
# timeout, the primary should delay killing the suspended thread
# until an ACK is received (Rpl_semi_sync_master_yes_tx should
# be 1).
# Test Case 2) If both replicas simulate an error before sending an ACK, the
# primary should delay killing the suspended thread until the
# the timeout is reached (Rpl_semi_sync_master_no_tx should be
# 1).
# Test Case 3) If one replica simulates a delay within the allowed timeout
# and the other simulates an error before sending an ACK, the
# primary should delay killing the suspended thread until it
# receives an ACK from the delayed slave
# (Rpl_semi_sync_master_yes_tx should be 1).
# Test Case 4) If a replica errors before sending an ACK, it will cause the
# IO thread to stop and handle the error. During error handling,
# if semi-sync is active, the replica will form a new connection
# with the primary to kill the active connection. However, if
# the primary is shutting down, it may kill the new connection,
# thereby leaving the active semi-sync connection in-tact. The
# slave should notice this, and not issue a `QUIT` command to
# the primary, which would otherwise be sent to kill an active
# connection. This test case validates that the slave does not
# send a `QUIT` in this case (Rpl_semi_sync_master_yes_tx should
# be 1 because server_3 will send the ACK within a valid
# timeout).
#
# References:
# MDEV-11853: semisync thread can be killed after sync binlog but before ACK
# in the sync state
# MDEV-28114: Semi-sync Master ACK Receiver Thread Can Error on COM_QUIT
#
--echo #############################
--echo # Common setup for all tests
--echo #############################
--echo # Note: Simulated slave delay is hardcoded to 800 milliseconds
--echo # Note: Simulated master shutdown delay is hardcoded to 500 milliseconds
--source include/have_debug.inc
--let $rpl_topology=1->2, 1->3
--source include/rpl_init.inc
--connection server_1
--echo # Slaves which simulate an error will produce a timeout on the primary
call mtr.add_suppression("Timeout waiting");
call mtr.add_suppression("did not exit");
--let $sav_master_timeout= `SELECT @@global.rpl_semi_sync_master_timeout`
--let $sav_enabled_master= `SELECT @@GLOBAL.rpl_semi_sync_master_enabled`
--let $sav_master_dbug= `SELECT @@GLOBAL.debug_dbug`
--echo # Suppress slave errors related to the simulated error
--connection server_2
call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
--let $sav_enabled_server_2=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled`
--let $sav_server_2_dbug= `SELECT @@GLOBAL.debug_dbug`
--connection server_3
call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
--let $sav_enabled_server_3=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled`
--let $sav_server_3_dbug= `SELECT @@GLOBAL.debug_dbug`
--connection server_1
CREATE TABLE t1 (a int);
--save_master_pos
--let i= 2
--let slave_last= 3
while (`SELECT $i <= $slave_last`)
{
--connection server_$i
--sync_with_master
--inc $i
}
# Set up the connection used to issue the shutdown
--connect(server_1_con2, localhost, root,,)
--echo #############################
--echo # Test cases
--echo #############################
--echo #
--echo # Test Case 1) If both replicas simulate a delay that is within the
--echo # allowed timeout, the primary should delay killing the suspended thread
--echo # until an ACK is received (Rpl_semi_sync_master_yes_tx should be 1).
--echo #
--let server_1_dbug= ""
--let server_2_dbug= "+d,simulate_delay_semisync_slave_reply"
--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply"
--let semisync_timeout= 1600
--let server_2_expect_row_count= 1
--let server_3_expect_row_count= 1
--source rpl_semi_sync_shutdown_await_ack.inc
--echo #
--echo # Test Case 2) If both replicas simulate an error before sending an ACK,
--echo # the primary should delay killing the suspended thread until the
--echo # timeout is reached (Rpl_semi_sync_master_no_tx should be 1).
--echo #
--let server_1_dbug= "+d,mysqld_delay_kill_threads_phase_1"
--let server_2_dbug= "+d,corrupt_queue_event"
--let server_3_dbug= "+d,corrupt_queue_event"
--let semisync_timeout= 500
--let server_2_expect_row_count= 0
--let server_3_expect_row_count= 0
--source rpl_semi_sync_shutdown_await_ack.inc
--echo #
--echo # Test Case 3) If one replica simulates a delay within the allowed
--echo # timeout and the other simulates an error before sending an ACK, the
--echo # primary should delay killing the suspended thread until it receives an
--echo # ACK from the delayed slave (Rpl_semi_sync_master_yes_tx should be 1).
--echo #
--let server_1_dbug= "+d,mysqld_delay_kill_threads_phase_1"
--let server_2_dbug= "+d,corrupt_queue_event"
--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply"
--let semisync_timeout= 1600
--let server_2_expect_row_count= 0
--let server_3_expect_row_count= 1
--source rpl_semi_sync_shutdown_await_ack.inc
--echo #
--echo # Test Case 4) If a replica errors before sending an ACK, it will cause
--echo # the IO thread to stop and handle the error. During error handling, if
--echo # semi-sync is active, the replica will form a new connection with the
--echo # primary to kill the active connection. However, if the primary is
--echo # shutting down, it may kill the new connection, thereby leaving the
--echo # active semi-sync connection in-tact. The slave should notice this, and
--echo # not issue a `QUIT` command to the primary, which would otherwise be
--echo # sent to kill an active connection. This test case validates that the
--echo # slave does not send a `QUIT` in this case (Rpl_semi_sync_master_yes_tx
--echo # should be 1 because server_3 will send the ACK within a valid timeout).
--echo #
# mysqld_delay_kill_threads_phase1 ensures that server_2 will have enough time
# to start a new connection that has the intent to kill the active semi-sync
# connection
--let server_1_dbug= "+d,mysqld_delay_kill_threads_phase_1"
# slave_delay_killing_semisync_connection ensures that the primary has force
# killed its current connection before it is able to issue `KILL`
--let server_2_dbug= "+d,corrupt_queue_event,slave_delay_killing_semisync_connection"
--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply"
--let semisync_timeout= 1600
--let server_2_expect_row_count= 0
--let server_3_expect_row_count= 1
--source rpl_semi_sync_shutdown_await_ack.inc
--echo #############################
--echo # Cleanup
--echo #############################
--connection server_2
source include/stop_slave.inc;
source include/start_slave.inc;
--disable_query_log
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled = $sav_enabled_server_2
--eval SET @@GLOBAL.debug_dbug= "$sav_server_2_dbug"
--enable_query_log
--connection server_3
source include/stop_slave.inc;
source include/start_slave.inc;
--disable_query_log
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled = $sav_enabled_server_3
--eval SET @@GLOBAL.debug_dbug= "$sav_server_3_dbug"
--enable_query_log
--connection server_1
let $status_var= Rpl_semi_sync_master_clients;
let $status_var_value= 0;
source include/wait_for_status_var.inc;
--disable_query_log
--eval SET @@GLOBAL.rpl_semi_sync_master_timeout= $sav_master_timeout
--eval SET @@GLOBAL.rpl_semi_sync_master_enabled= $sav_enabled_master
--eval SET @@GLOBAL.debug_dbug= "$sav_master_dbug"
--enable_query_log
drop table t1;
--source include/rpl_end.inc