MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave

AKA rpl.rpl_parallel, binlog_encryption.rpl_parallel fails in buildbot with timeout in include

A replication parallel worker thread can deadlock with another connection running SHOW SLAVE STATUS. That is, if the replication worker thread is in do_gco_wait() and is killed, it will already hold the LOCK_parallel_entry, and during error reporting, try to grab the err_lock. SHOW SLAVE STATUS, however, grabs these locks in reverse order. It will initially grab the err_lock, and then try to grab LOCK_parallel_entry. This leads to a deadlock when both threads have grabbed their first lock without the second.

This patch implements the MDEV-31894 proposed fix to optimize the workers_idle() check to compare the last in-use relay log’s queued_count==dequeued_count for idleness. This removes the need for workers_idle() to grab LOCK_parallel_entry, as these values are atomically updated.

Huge thanks to Kristian Nielsen for diagnosing the problem!

Reviewed By:
============
Kristian Nielsen <knielsen@knielsen-hq.org>
Andrei Elkin <andrei.elkin@mariadb.com>
2025-08-07 00:04:31 +03:00 · 2023-11-29 06:53:31 -07:00
parent 5ca63b2b8b
commit 8dad51481b
6 changed files with 203 additions and 19 deletions
--- a/mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
+++ b/mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
@@ -0,0 +1,66 @@
+include/master-slave.inc
+[connection master]
+#
+# Initialize test data
+connection master;
+create table t1 (a int) engine=innodb;
+insert into t1 values (1);
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+call mtr.add_suppression("Connection was killed");
+call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
+set @save_parallel_threads= @@global.slave_parallel_threads;
+set @save_parallel_mode= @@global.slave_parallel_mode;
+set @save_transaction_retries= @@global.slave_transaction_retries;
+set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
+set @@global.slave_parallel_threads= 2;
+set @@global.slave_parallel_mode= CONSERVATIVE;
+set @@global.slave_transaction_retries= 0;
+set @@global.innodb_lock_wait_timeout= 10;
+# Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
+BEGIN;
+select * from t1 where a=1 for update;
+a
+1
+connection master;
+set @old_dbug= @@session.debug_dbug;
+set @@session.debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 10000;
+update t1 set a=2 where a=1;
+SET @commit_id= 10001;
+insert into t1 values (3);
+set @@session.debug_dbug= @old_dbug;
+connection slave;
+start slave;
+# Waiting for first transaction to start (and be held at innodb row lock)..
+# Waiting for next transaction to start and hold at do_gco_wait()..
+connection slave1;
+set @@session.debug_dbug="+d,hold_sss_with_err_lock";
+show slave status;
+connection slave;
+set debug_sync="now wait_for sss_got_err_lock";
+kill <TID of worker in do_gco_wait>;
+set debug_sync="now signal sss_continue";
+connection slave1;
+# Waiting for SHOW SLAVE STATUS to complete..
+# ..done
+connection slave;
+ROLLBACK;
+include/wait_for_slave_sql_error.inc [errno=1927]
+#
+# Cleanup
+connection master;
+drop table t1;
+include/save_master_gtid.inc
+connection slave;
+set debug_sync= "RESET";
+set @@global.slave_parallel_threads= @save_parallel_threads;
+set @@global.slave_parallel_mode= @save_parallel_mode;
+set @@global.slave_transaction_retries= @save_transaction_retries;
+set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
+start slave sql_thread;
+include/sync_with_master_gtid.inc
+include/rpl_end.inc
+# End of rpl_deadlock_show_slave_status.test
--- a/mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
+++ b/mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
@@ -0,0 +1,121 @@
+#
+#   Verify that SHOW SLAVE STATUS will not cause deadlocks on the replica.
+# A deadlock has been seen in do_gco_wait if the thread is killed, as it will
+# hold the LOCK_parallel_entry, and during error reporting, try to grab the
+# err_lock. Prior to MDEV-10653, SHOW SLAVE STATUS would grab these locks in
+# the reverse order, as calling workers_idle() used to grab LOCK_parallel_entry
+# with the err_lock already grabbed (though the MDEV-10653 patch changed the
+# workers_idle() implementation to remove the need for locking the
+# parallel_entry).
+#
+# References:
+#   MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
+#
+
+# Prerequisites. The Debug Sync Facility is required explicitly because the
+# test synchronizes on the sss_got_err_lock / sss_continue debug_sync signals
+# further down; have_debug.inc only checks for a debug build, so without
+# have_debug_sync.inc the test would run (rather than be skipped) on a server
+# where debug_sync is not available.
+--source include/master-slave.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+--source include/have_binlog_format_row.inc
+
+--echo #
+--echo # Initialize test data
+# Create a one-row InnoDB table on the master. Row a=1 is the row that gets
+# locked on the slave later in the test.
+--connection master
+create table t1 (a int) engine=innodb;
+insert into t1 values (1);
+--source include/save_master_gtid.inc
+
+# Bring the slave up to date with the setup, then stop it so that it can be
+# reconfigured and so the master transactions created below are only applied
+# once the slave is started again.
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+
+# The test kills a replication worker on purpose; suppress the error-log
+# messages that this is expected to provoke.
+call mtr.add_suppression("Connection was killed");
+call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
+
+# Remember the global settings changed below; the cleanup section restores
+# them from these user variables.
+set @save_parallel_threads= @@global.slave_parallel_threads;
+set @save_parallel_mode= @@global.slave_parallel_mode;
+set @save_transaction_retries= @@global.slave_transaction_retries;
+set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
+
+# Two workers, one for each of the two master transactions created below.
+# Retries are set to 0 and the InnoDB lock wait timeout to 10; presumably so
+# that a failed transaction is not re-attempted and a blocked row-lock wait is
+# bounded -- TODO confirm against the server variable documentation.
+set @@global.slave_parallel_threads= 2;
+set @@global.slave_parallel_mode= CONSERVATIVE;
+set @@global.slave_transaction_retries= 0;
+set @@global.innodb_lock_wait_timeout= 10;
+
+--echo # Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
+# Still on the slave connection: this transaction is deliberately left open
+# and is only ended by the ROLLBACK near the end of the test.
+BEGIN;
+select * from t1 where a=1 for update;
+
+--connection master
+
+# binlog_force_commit_id is a debug injection point. Presumably it makes the
+# master binlog each following transaction with the commit id taken from
+# @commit_id, so that T1 and T2 end up in different group commits (GCOs)
+# -- TODO confirm against the server code.
+set @old_dbug= @@session.debug_dbug;
+set @@session.debug_dbug="+d,binlog_force_commit_id";
+
+
+# GCO 1
+SET @commit_id= 10000;
+# T1: touches row a=1, which is locked on the slave, so the worker applying
+# it is expected to block on the InnoDB row lock.
+update t1 set a=2 where a=1;
+
+# GCO 2
+SET @commit_id= 10001;
+# T2: the worker applying it is expected to wait in do_gco_wait() while T1
+# is still blocked.
+insert into t1 values (3);
+
+# Switch the debug injection off again for this master session.
+set @@session.debug_dbug= @old_dbug;
+
+# Start the slave again so it begins applying T1 and T2.
+--connection slave
+start slave;
+
+--echo # Waiting for first transaction to start (and be held at innodb row lock)..
+# Poll the processlist until exactly one Slave_worker thread reports the
+# find_row state.
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(%)' and  command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+--echo # Waiting for next transaction to start and hold at do_gco_wait()..
+# Poll the processlist until exactly one Slave_worker thread reports that it
+# is waiting for the prior transaction.
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and  command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+# hold_sss_with_err_lock is a debug injection point. Judging by its name and
+# by the debug_sync signals used below, it makes SHOW SLAVE STATUS signal
+# sss_got_err_lock and then wait for sss_continue while it holds err_lock
+# -- TODO confirm against the server code.
+# --send issues the statement without waiting for its result, so the test can
+# carry on from another connection while this one is held.
+--connection slave1
+set @@session.debug_dbug="+d,hold_sss_with_err_lock";
+--send show slave status
+
+# Wait until SHOW SLAVE STATUS has reached its hold point.
+--connection slave
+set debug_sync="now wait_for sss_got_err_lock";
+
+# Kill the thread that is waiting for the prior transaction. Its id differs
+# from run to run, so it is replaced by a fixed placeholder in the recorded
+# output.
+--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%'`
+--replace_result $t2_tid "<TID of worker in do_gco_wait>"
+--eval kill $t2_tid
+# Do not release SHOW SLAVE STATUS until the kill is visible in the
+# processlist.
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE command LIKE 'Killed';
+--source include/wait_condition.inc
+
+# Let SHOW SLAVE STATUS continue. According to the header comment of this
+# test, this is the point where the two threads could deadlock before the
+# MDEV-10653 fix.
+set debug_sync="now signal sss_continue";
+
+--connection slave1
+--echo # Waiting for SHOW SLAVE STATUS to complete..
+# Only the fact that the statement returns matters; its result set is kept
+# out of the recorded output.
+--disable_result_log
+--reap
+--enable_result_log
+--echo # ..done
+
+# End the transaction that was opened on this connection at the start of the
+# test, which releases its lock on row a=1.
+--connection slave
+ROLLBACK;
+# The slave SQL thread is expected to stop with error 1927; presumably this
+# is the "Connection was killed" error suppressed during setup -- TODO confirm
+# the error number.
+--let $slave_sql_errno= 1927
+--source include/wait_for_slave_sql_error.inc
+
+
+--echo #
+--echo # Cleanup
+--connection master
+drop table t1;
+--source include/save_master_gtid.inc
+
+--connection slave
+# Reset debug_sync and restore the global settings saved during setup.
+set debug_sync= "RESET";
+set @@global.slave_parallel_threads= @save_parallel_threads;
+set @@global.slave_parallel_mode= @save_parallel_mode;
+set @@global.slave_transaction_retries= @save_transaction_retries;
+set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
+# Only the SQL thread is started here; presumably the IO thread is still
+# running because only the SQL thread stopped with the error -- TODO confirm.
+start slave sql_thread;
+# Wait for the slave to reach the master GTID position saved after the
+# DROP TABLE above before ending the test.
+--source include/sync_with_master_gtid.inc
+
+--source include/rpl_end.inc
+--echo # End of rpl_deadlock_show_slave_status.test