1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave

AKA rpl.rpl_parallel, binlog_encryption.rpl_parallel fails in
buildbot with timeout in include

A replication parallel worker thread can deadlock with another
connection running SHOW SLAVE STATUS. That is, if the replication
worker thread is in do_gco_wait() and is killed, it will already
hold the LOCK_parallel_entry, and during error reporting, try to
grab the err_lock. SHOW SLAVE STATUS, however, grabs these locks in
reverse order. It will initially grab the err_lock, and then try to
grab LOCK_parallel_entry. This leads to a deadlock when both threads
have grabbed their first lock without the second.

This patch implements the MDEV-31894 proposed fix to optimize the
workers_idle() check to compare the last in-use relay log’s
queued_count==dequeued_count for idleness. This removes the need for
workers_idle() to grab LOCK_parallel_entry, as these values are
atomically updated.

Huge thanks to Kristian Nielsen for diagnosing the problem!

Reviewed By:
============
Kristian Nielsen <knielsen@knielsen-hq.org>
Andrei Elkin <andrei.elkin@mariadb.com>
This commit is contained in:
Brandon Nesterenko
2023-11-29 06:53:31 -07:00
parent 5ca63b2b8b
commit 8dad51481b
6 changed files with 203 additions and 19 deletions

View File

@@ -0,0 +1,66 @@
include/master-slave.inc
[connection master]
#
# Initialize test data
connection master;
create table t1 (a int) engine=innodb;
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
set @save_parallel_threads= @@global.slave_parallel_threads;
set @save_parallel_mode= @@global.slave_parallel_mode;
set @save_transaction_retries= @@global.slave_transaction_retries;
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
set @@global.slave_parallel_threads= 2;
set @@global.slave_parallel_mode= CONSERVATIVE;
set @@global.slave_transaction_retries= 0;
set @@global.innodb_lock_wait_timeout= 10;
# Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
BEGIN;
select * from t1 where a=1 for update;
a
1
connection master;
set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10000;
update t1 set a=2 where a=1;
SET @commit_id= 10001;
insert into t1 values (3);
set @@session.debug_dbug= @old_dbug;
connection slave;
start slave;
# Waiting for first transaction to start (and be held at innodb row lock)..
# Waiting for next transaction to start and hold at do_gco_wait()..
connection slave1;
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
show slave status;
connection slave;
set debug_sync="now wait_for sss_got_err_lock";
kill <TID of worker in do_gco_wait>;
set debug_sync="now signal sss_continue";
connection slave1;
# Waiting for SHOW SLAVE STATUS to complete..
# ..done
connection slave;
ROLLBACK;
include/wait_for_slave_sql_error.inc [errno=1927]
#
# Cleanup
connection master;
drop table t1;
include/save_master_gtid.inc
connection slave;
set debug_sync= "RESET";
set @@global.slave_parallel_threads= @save_parallel_threads;
set @@global.slave_parallel_mode= @save_parallel_mode;
set @@global.slave_transaction_retries= @save_transaction_retries;
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
start slave sql_thread;
include/sync_with_master_gtid.inc
include/rpl_end.inc
# End of rpl_deadlock_show_slave_status.test

View File

@@ -0,0 +1,121 @@
#
# Verify that SHOW SLAVE STATUS will not cause deadlocks on the replica.
# A deadlock has been seen in do_gco_wait if the thread is killed, as it will
# hold the LOCK_parallel_entry, and during error reporting, try to grab the
# err_lock. Prior to MDEV-10653, SHOW SLAVE STATUS would grab these locks in
# the reverse order, as calling workers_idle() used to grab LOCK_parallel_entry
# with the err_lock already grabbed (though the MDEV-10653 patch changed the
# workers_idle() implementation to remove the need for locking the
# parallel_entry).
#
# References:
# MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
#
# Prerequisites: a master/slave replication pair, InnoDB, a debug build (the
# test sets debug_dbug and debug_sync), and row-based binary logging (the test
# later waits for a worker in the Update_rows_log_event::find_row state).
--source include/master-slave.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_binlog_format_row.inc
--echo #
--echo # Initialize test data
# Create a one-row InnoDB table on the master. The row a=1 is the row that
# gets locked on the slave further down.
--connection master
create table t1 (a int) engine=innodb;
insert into t1 values (1);
--source include/save_master_gtid.inc
--connection slave
# Wait until the slave has applied the initial data, then stop it. The slave
# is restarted only after its settings are changed and the conflicting
# transactions have been generated on the master.
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
# Slave-side configuration (the slave is stopped at this point).
# The test kills a replication worker on purpose, so suppress the messages
# that the kill and the resulting dependent-commit failure are expected to
# produce.
call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
# Remember the current global values. The cleanup section restores them.
set @save_parallel_threads= @@global.slave_parallel_threads;
set @save_parallel_mode= @@global.slave_parallel_mode;
set @save_transaction_retries= @@global.slave_transaction_retries;
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
# Two workers: the test expects one worker per replicated transaction (one
# blocked on the row lock, one waiting on the prior transaction).
set @@global.slave_parallel_threads= 2;
set @@global.slave_parallel_mode= CONSERVATIVE;
# NOTE(review): retries are disabled presumably so that a failed transaction
# stops the SQL thread with an error instead of being retried -- confirm.
set @@global.slave_transaction_retries= 0;
# Bounds how long the replicated UPDATE can wait on the row lock taken below.
set @@global.innodb_lock_wait_timeout= 10;
--echo # Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
# Still on the slave connection: open a transaction and lock row a=1. The
# transaction stays open until the ROLLBACK near the end of the test, so the
# replicated UPDATE of that row (T1 below) has to wait for the lock.
BEGIN;
select * from t1 where a=1 for update;
--connection master
# NOTE(review): binlog_force_commit_id presumably makes the master log each
# transaction with the value of @commit_id as its group commit id, which puts
# T1 and T2 into two separate group commits (GCOs) -- confirm against the
# server source. The previous debug_dbug value is saved and restored below.
set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";
# GCO 1
SET @commit_id= 10000;
# T1: updates the row that is locked on the slave
update t1 set a=2 where a=1;
# GCO 2
SET @commit_id= 10001;
# T2: touches a different row, so it does not conflict with the slave-side lock
insert into t1 values (3);
set @@session.debug_dbug= @old_dbug;
--connection slave
# Restart replication so the two transactions are handed to the workers.
start slave;
--echo # Waiting for first transaction to start (and be held at innodb row lock)..
# Exactly one worker must be inside Update_rows_log_event::find_row, i.e. T1
# is blocked on the row locked by the open transaction on this connection.
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(%)' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
--echo # Waiting for next transaction to start and hold at do_gco_wait()..
# Exactly one worker must report the "Waiting for prior transaction to start
# commit" state. This is the worker that gets killed below.
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
--connection slave1
# NOTE(review): hold_sss_with_err_lock presumably pauses SHOW SLAVE STATUS
# while it holds err_lock, signalling sss_got_err_lock and then waiting for
# sss_continue (inferred from the debug_sync calls below) -- confirm against
# the server source.
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
# Sent asynchronously: the statement is expected to block until the
# sss_continue signal, and its result is collected later with --reap.
--send show slave status
--connection slave
# Do not kill the worker before SHOW SLAVE STATUS has reached its hold point.
set debug_sync="now wait_for sss_got_err_lock";
# Look up the connection id of the waiting worker. The id differs between
# runs, so it is masked in the recorded output.
--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%'`
--replace_result $t2_tid "<TID of worker in do_gco_wait>"
--eval kill $t2_tid
# Wait until the processlist shows the kill before releasing SHOW SLAVE
# STATUS, so the killed worker and SHOW SLAVE STATUS are both in flight.
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE command LIKE 'Killed';
--source include/wait_condition.inc
# Let SHOW SLAVE STATUS continue past its hold point.
set debug_sync="now signal sss_continue";
--connection slave1
--echo # Waiting for SHOW SLAVE STATUS to complete..
# Collect the result of the SHOW SLAVE STATUS sent earlier on this connection.
# This is the actual check: if the statement deadlocked with the killed
# worker, the reap would never return and the test would time out. The row
# itself is not recorded in the result file.
--disable_result_log
--reap
--enable_result_log
--echo # ..done
--connection slave
# Release the row lock taken at the start of the test on this connection.
ROLLBACK;
# The SQL thread is expected to stop with error 1927.
# NOTE(review): 1927 is presumably ER_CONNECTION_KILLED, matching the
# "Connection was killed" suppression added during setup -- confirm.
--let $slave_sql_errno= 1927
--source include/wait_for_slave_sql_error.inc
--echo #
--echo # Cleanup
--connection master
# Drop the test table on the master and record the GTID position the slave
# has to reach before the test ends.
drop table t1;
--source include/save_master_gtid.inc
--connection slave
# Clear the debug_sync signals used above and restore the globals saved
# during setup.
set debug_sync= "RESET";
set @@global.slave_parallel_threads= @save_parallel_threads;
set @@global.slave_parallel_mode= @save_parallel_mode;
set @@global.slave_transaction_retries= @save_transaction_retries;
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
# Only the SQL thread is restarted here; the test waited above for it to
# stop with an error.
# NOTE(review): the IO thread is presumably still running -- confirm.
start slave sql_thread;
--source include/sync_with_master_gtid.inc
--source include/rpl_end.inc
--echo # End of rpl_deadlock_show_slave_status.test