mirror of
https://github.com/MariaDB/server.git
synced 2025-08-07 00:04:31 +03:00
MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
AKA rpl.rpl_parallel, binlog_encryption.rpl_parallel fails in buildbot with timeout in include.

A replication parallel worker thread can deadlock with another connection running SHOW SLAVE STATUS. That is, if the replication worker thread is in do_gco_wait() and is killed, it will already hold the LOCK_parallel_entry, and during error reporting, try to grab the err_lock. SHOW SLAVE STATUS, however, grabs these locks in reverse order. It will initially grab the err_lock, and then try to grab LOCK_parallel_entry. This leads to a deadlock when both threads have grabbed their first lock without the second.

This patch implements the MDEV-31894 proposed fix to optimize the workers_idle() check to compare the last in-use relay log’s queued_count==dequeued_count for idleness. This removes the need for workers_idle() to grab LOCK_parallel_entry, as these values are atomically updated.

Huge thanks to Kristian Nielsen for diagnosing the problem!

Reviewed By:
============
Kristian Nielsen <knielsen@knielsen-hq.org>
Andrei Elkin <andrei.elkin@mariadb.com>
This commit is contained in:
66
mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
Normal file
66
mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
Normal file
@@ -0,0 +1,66 @@
|
||||
include/master-slave.inc
|
||||
[connection master]
|
||||
#
|
||||
# Initialize test data
|
||||
connection master;
|
||||
create table t1 (a int) engine=innodb;
|
||||
insert into t1 values (1);
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
include/sync_with_master_gtid.inc
|
||||
include/stop_slave.inc
|
||||
call mtr.add_suppression("Connection was killed");
|
||||
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
|
||||
set @save_parallel_threads= @@global.slave_parallel_threads;
|
||||
set @save_parallel_mode= @@global.slave_parallel_mode;
|
||||
set @save_transaction_retries= @@global.slave_transaction_retries;
|
||||
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
|
||||
set @@global.slave_parallel_threads= 2;
|
||||
set @@global.slave_parallel_mode= CONSERVATIVE;
|
||||
set @@global.slave_transaction_retries= 0;
|
||||
set @@global.innodb_lock_wait_timeout= 10;
|
||||
# Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
|
||||
BEGIN;
|
||||
select * from t1 where a=1 for update;
|
||||
a
|
||||
1
|
||||
connection master;
|
||||
set @old_dbug= @@session.debug_dbug;
|
||||
set @@session.debug_dbug="+d,binlog_force_commit_id";
|
||||
SET @commit_id= 10000;
|
||||
update t1 set a=2 where a=1;
|
||||
SET @commit_id= 10001;
|
||||
insert into t1 values (3);
|
||||
set @@session.debug_dbug= @old_dbug;
|
||||
connection slave;
|
||||
start slave;
|
||||
# Waiting for first transaction to start (and be held at innodb row lock)..
|
||||
# Waiting for next transaction to start and hold at do_gco_wait()..
|
||||
connection slave1;
|
||||
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
|
||||
show slave status;
|
||||
connection slave;
|
||||
set debug_sync="now wait_for sss_got_err_lock";
|
||||
kill <TID of worker in do_gco_wait>;
|
||||
set debug_sync="now signal sss_continue";
|
||||
connection slave1;
|
||||
# Waiting for SHOW SLAVE STATUS to complete..
|
||||
# ..done
|
||||
connection slave;
|
||||
ROLLBACK;
|
||||
include/wait_for_slave_sql_error.inc [errno=1927]
|
||||
#
|
||||
# Cleanup
|
||||
connection master;
|
||||
drop table t1;
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
set debug_sync= "RESET";
|
||||
set @@global.slave_parallel_threads= @save_parallel_threads;
|
||||
set @@global.slave_parallel_mode= @save_parallel_mode;
|
||||
set @@global.slave_transaction_retries= @save_transaction_retries;
|
||||
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
|
||||
start slave sql_thread;
|
||||
include/sync_with_master_gtid.inc
|
||||
include/rpl_end.inc
|
||||
# End of rpl_deadlock_show_slave_status.test
|
121
mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
Normal file
121
mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
Normal file
@@ -0,0 +1,121 @@
|
||||
#
# Verify that SHOW SLAVE STATUS will not cause deadlocks on the replica.
# A deadlock has been seen in do_gco_wait if the thread is killed, as it will
# hold the LOCK_parallel_entry, and during error reporting, try to grab the
# err_lock. Prior to MDEV-10653, SHOW SLAVE STATUS would grab these locks in
# the reverse order, as calling workers_idle() used to grab LOCK_parallel_entry
# with the err_lock already grabbed (though the MDEV-10653 patch changed the
# workers_idle() implementation to remove the need for locking the
# parallel_entry).
#
# Test outline:
#   1. Replicate a one-row table, then stop the slave and configure two
#      parallel workers with retries disabled.
#   2. Lock the row on the slave so that the first replicated transaction (T1)
#      blocks inside the storage engine, leaving the second one (T2) waiting
#      for T1 to start committing.
#   3. Pause SHOW SLAVE STATUS at the hold_sss_with_err_lock debug point,
#      kill T2's worker, then let SHOW SLAVE STATUS continue.
#   4. SHOW SLAVE STATUS must return, and the slave SQL thread must stop with
#      the expected error instead of hanging.
#
# References:
#   MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
#

--source include/master-slave.inc
--source include/have_innodb.inc
--source include/have_debug.inc
# The test synchronizes with the server through "set debug_sync=...", so skip
# it when the debug sync facility is not available.
--source include/have_debug_sync.inc
--source include/have_binlog_format_row.inc

--echo #
--echo # Initialize test data
--connection master
create table t1 (a int) engine=innodb;
insert into t1 values (1);
--source include/save_master_gtid.inc

--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc

# Both messages are expected in the slave error log once the worker is killed.
call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");

# Remember the current settings; they are restored in the cleanup section.
set @save_parallel_threads= @@global.slave_parallel_threads;
set @save_parallel_mode= @@global.slave_parallel_mode;
set @save_transaction_retries= @@global.slave_transaction_retries;
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;

# Two workers (one per transaction below), and no retries so that a failed
# transaction stops the slave with an error rather than being re-attempted.
set @@global.slave_parallel_threads= 2;
set @@global.slave_parallel_mode= CONSERVATIVE;
set @@global.slave_transaction_retries= 0;
set @@global.innodb_lock_wait_timeout= 10;

--echo # Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
# This transaction stays open on the "slave" connection until the ROLLBACK
# further down.
BEGIN;
select * from t1 where a=1 for update;

--connection master

# binlog_force_commit_id makes the master binlog each transaction with the
# commit id taken from @commit_id, which lets the test place T1 and T2 in
# different group commits.
set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";


# GCO 1
SET @commit_id= 10000;
# T1
update t1 set a=2 where a=1;

# GCO 2
SET @commit_id= 10001;
# T2
insert into t1 values (3);

set @@session.debug_dbug= @old_dbug;

--connection slave
start slave;

--echo # Waiting for first transaction to start (and be held at innodb row lock)..
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(%)' and command LIKE 'Slave_worker';
--source include/wait_condition.inc

--echo # Waiting for next transaction to start and hold at do_gco_wait()..
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc

# Start SHOW SLAVE STATUS asynchronously on a second slave connection. With
# the debug point enabled it is expected to signal sss_got_err_lock and then
# block until sss_continue is signalled (see the debug_sync statements below).
--connection slave1
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
--send show slave status

--connection slave
set debug_sync="now wait_for sss_got_err_lock";

# Kill the worker that is parked in do_gco_wait(). The lookup uses the same
# state/command filter as the wait condition above so that only the parallel
# worker thread can be selected. The thread id varies between runs, so it is
# masked in the recorded result.
--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%' AND COMMAND LIKE 'Slave_worker'`
--replace_result $t2_tid "<TID of worker in do_gco_wait>"
--eval kill $t2_tid
# Make sure the kill has been registered before releasing SHOW SLAVE STATUS.
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE command LIKE 'Killed';
--source include/wait_condition.inc

set debug_sync="now signal sss_continue";

--connection slave1
--echo # Waiting for SHOW SLAVE STATUS to complete..
# The SHOW SLAVE STATUS output is not deterministic, so it is kept out of the
# result file; only its completion matters here.
--disable_result_log
--reap
--enable_result_log
--echo # ..done

--connection slave
# Release the row lock taken at the start of the test.
ROLLBACK;
# 1927 is presumably the "Connection was killed" error suppressed above.
--let $slave_sql_errno= 1927
--source include/wait_for_slave_sql_error.inc


--echo #
--echo # Cleanup
--connection master
drop table t1;
--source include/save_master_gtid.inc

--connection slave
set debug_sync= "RESET";
set @@global.slave_parallel_threads= @save_parallel_threads;
set @@global.slave_parallel_mode= @save_parallel_mode;
set @@global.slave_transaction_retries= @save_transaction_retries;
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
# Only the SQL thread stopped on the error, so only that thread is restarted.
start slave sql_thread;
--source include/sync_with_master_gtid.inc

--source include/rpl_end.inc
--echo # End of rpl_deadlock_show_slave_status.test
|
Reference in New Issue
Block a user