mirror of
https://github.com/MariaDB/server.git
synced 2025-07-08 17:02:21 +03:00
MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
AKA rpl.rpl_parallel, binlog_encryption.rpl_parallel fails in buildbot with timeout in include A replication parallel worker thread can deadlock with another connection running SHOW SLAVE STATUS. That is, if the replication worker thread is in do_gco_wait() and is killed, it will already hold the LOCK_parallel_entry, and during error reporting, try to grab the err_lock. SHOW SLAVE STATUS, however, grabs these locks in reverse order. It will initially grab the err_lock, and then try to grab LOCK_parallel_entry. This leads to a deadlock when both threads have grabbed their first lock without the second. This patch implements the MDEV-31894 proposed fix to optimize the workers_idle() check to compare the last in-use relay log’s queued_count==dequeued_count for idleness. This removes the need for workers_idle() to grab LOCK_parallel_entry, as these values are atomically updated. Huge thanks to Kristian Nielsen for diagnosing the problem! Reviewed By: ============ Kristian Nielsen <knielsen@knielsen-hq.org> Andrei Elkin <andrei.elkin@mariadb.com>
This commit is contained in:
66
mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
Normal file
66
mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
include/master-slave.inc
|
||||||
|
[connection master]
|
||||||
|
#
|
||||||
|
# Initialize test data
|
||||||
|
connection master;
|
||||||
|
create table t1 (a int) engine=innodb;
|
||||||
|
insert into t1 values (1);
|
||||||
|
include/save_master_gtid.inc
|
||||||
|
connection slave;
|
||||||
|
include/sync_with_master_gtid.inc
|
||||||
|
include/stop_slave.inc
|
||||||
|
call mtr.add_suppression("Connection was killed");
|
||||||
|
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
|
||||||
|
set @save_parallel_threads= @@global.slave_parallel_threads;
|
||||||
|
set @save_parallel_mode= @@global.slave_parallel_mode;
|
||||||
|
set @save_transaction_retries= @@global.slave_transaction_retries;
|
||||||
|
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
|
||||||
|
set @@global.slave_parallel_threads= 2;
|
||||||
|
set @@global.slave_parallel_mode= CONSERVATIVE;
|
||||||
|
set @@global.slave_transaction_retries= 0;
|
||||||
|
set @@global.innodb_lock_wait_timeout= 10;
|
||||||
|
# Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
|
||||||
|
BEGIN;
|
||||||
|
select * from t1 where a=1 for update;
|
||||||
|
a
|
||||||
|
1
|
||||||
|
connection master;
|
||||||
|
set @old_dbug= @@session.debug_dbug;
|
||||||
|
set @@session.debug_dbug="+d,binlog_force_commit_id";
|
||||||
|
SET @commit_id= 10000;
|
||||||
|
update t1 set a=2 where a=1;
|
||||||
|
SET @commit_id= 10001;
|
||||||
|
insert into t1 values (3);
|
||||||
|
set @@session.debug_dbug= @old_dbug;
|
||||||
|
connection slave;
|
||||||
|
start slave;
|
||||||
|
# Waiting for first transaction to start (and be held at innodb row lock)..
|
||||||
|
# Waiting for next transaction to start and hold at do_gco_wait()..
|
||||||
|
connection slave1;
|
||||||
|
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
|
||||||
|
show slave status;
|
||||||
|
connection slave;
|
||||||
|
set debug_sync="now wait_for sss_got_err_lock";
|
||||||
|
kill <TID of worker in do_gco_wait>;
|
||||||
|
set debug_sync="now signal sss_continue";
|
||||||
|
connection slave1;
|
||||||
|
# Waiting for SHOW SLAVE STATUS to complete..
|
||||||
|
# ..done
|
||||||
|
connection slave;
|
||||||
|
ROLLBACK;
|
||||||
|
include/wait_for_slave_sql_error.inc [errno=1927]
|
||||||
|
#
|
||||||
|
# Cleanup
|
||||||
|
connection master;
|
||||||
|
drop table t1;
|
||||||
|
include/save_master_gtid.inc
|
||||||
|
connection slave;
|
||||||
|
set debug_sync= "RESET";
|
||||||
|
set @@global.slave_parallel_threads= @save_parallel_threads;
|
||||||
|
set @@global.slave_parallel_mode= @save_parallel_mode;
|
||||||
|
set @@global.slave_transaction_retries= @save_transaction_retries;
|
||||||
|
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
|
||||||
|
start slave sql_thread;
|
||||||
|
include/sync_with_master_gtid.inc
|
||||||
|
include/rpl_end.inc
|
||||||
|
# End of rpl_deadlock_show_slave_status.test
|
121
mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
Normal file
121
mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
#
|
||||||
|
# Verify that SHOW SLAVE STATUS will not cause deadlocks on the replica.
|
||||||
|
# A deadlock has been seen in do_gco_wait if the thread is killed, as it will
|
||||||
|
# hold the LOCK_parallel_entry, and during error reporting, try to grab the
|
||||||
|
# err_lock. Prior to MDEV-10653, SHOW SLAVE STATUS would grab these locks in
|
||||||
|
# the reverse order, as calling workers_idle() used to grab LOCK_parallel_entry
|
||||||
|
# with the err_lock already grabbed (though the MDEV-10653 patch changed the
|
||||||
|
# workles_idle() implementation to remove the need for locking the
|
||||||
|
# parallel_entry).
|
||||||
|
#
|
||||||
|
# References:
|
||||||
|
# MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
|
||||||
|
#
|
||||||
|
|
||||||
|
--source include/master-slave.inc
|
||||||
|
--source include/have_innodb.inc
|
||||||
|
--source include/have_debug.inc
|
||||||
|
--source include/have_binlog_format_row.inc
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # Initialize test data
|
||||||
|
--connection master
|
||||||
|
create table t1 (a int) engine=innodb;
|
||||||
|
insert into t1 values (1);
|
||||||
|
--source include/save_master_gtid.inc
|
||||||
|
|
||||||
|
--connection slave
|
||||||
|
--source include/sync_with_master_gtid.inc
|
||||||
|
--source include/stop_slave.inc
|
||||||
|
|
||||||
|
call mtr.add_suppression("Connection was killed");
|
||||||
|
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
|
||||||
|
|
||||||
|
set @save_parallel_threads= @@global.slave_parallel_threads;
|
||||||
|
set @save_parallel_mode= @@global.slave_parallel_mode;
|
||||||
|
set @save_transaction_retries= @@global.slave_transaction_retries;
|
||||||
|
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
|
||||||
|
|
||||||
|
set @@global.slave_parallel_threads= 2;
|
||||||
|
set @@global.slave_parallel_mode= CONSERVATIVE;
|
||||||
|
set @@global.slave_transaction_retries= 0;
|
||||||
|
set @@global.innodb_lock_wait_timeout= 10;
|
||||||
|
|
||||||
|
--echo # Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
|
||||||
|
BEGIN;
|
||||||
|
select * from t1 where a=1 for update;
|
||||||
|
|
||||||
|
--connection master
|
||||||
|
|
||||||
|
set @old_dbug= @@session.debug_dbug;
|
||||||
|
set @@session.debug_dbug="+d,binlog_force_commit_id";
|
||||||
|
|
||||||
|
|
||||||
|
# GCO 1
|
||||||
|
SET @commit_id= 10000;
|
||||||
|
# T1
|
||||||
|
update t1 set a=2 where a=1;
|
||||||
|
|
||||||
|
# GCO 2
|
||||||
|
SET @commit_id= 10001;
|
||||||
|
# T2
|
||||||
|
insert into t1 values (3);
|
||||||
|
|
||||||
|
set @@session.debug_dbug= @old_dbug;
|
||||||
|
|
||||||
|
--connection slave
|
||||||
|
start slave;
|
||||||
|
|
||||||
|
--echo # Waiting for first transaction to start (and be held at innodb row lock)..
|
||||||
|
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(%)' and command LIKE 'Slave_worker';
|
||||||
|
--source include/wait_condition.inc
|
||||||
|
|
||||||
|
--echo # Waiting for next transaction to start and hold at do_gco_wait()..
|
||||||
|
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
|
||||||
|
--source include/wait_condition.inc
|
||||||
|
|
||||||
|
--connection slave1
|
||||||
|
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
|
||||||
|
--send show slave status
|
||||||
|
|
||||||
|
--connection slave
|
||||||
|
set debug_sync="now wait_for sss_got_err_lock";
|
||||||
|
|
||||||
|
--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%'`
|
||||||
|
--replace_result $t2_tid "<TID of worker in do_gco_wait>"
|
||||||
|
--eval kill $t2_tid
|
||||||
|
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE command LIKE 'Killed';
|
||||||
|
--source include/wait_condition.inc
|
||||||
|
|
||||||
|
set debug_sync="now signal sss_continue";
|
||||||
|
|
||||||
|
--connection slave1
|
||||||
|
--echo # Waiting for SHOW SLAVE STATUS to complete..
|
||||||
|
--disable_result_log
|
||||||
|
--reap
|
||||||
|
--enable_result_log
|
||||||
|
--echo # ..done
|
||||||
|
|
||||||
|
--connection slave
|
||||||
|
ROLLBACK;
|
||||||
|
--let $slave_sql_errno= 1927
|
||||||
|
--source include/wait_for_slave_sql_error.inc
|
||||||
|
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # Cleanup
|
||||||
|
--connection master
|
||||||
|
drop table t1;
|
||||||
|
--source include/save_master_gtid.inc
|
||||||
|
|
||||||
|
--connection slave
|
||||||
|
set debug_sync= "RESET";
|
||||||
|
set @@global.slave_parallel_threads= @save_parallel_threads;
|
||||||
|
set @@global.slave_parallel_mode= @save_parallel_mode;
|
||||||
|
set @@global.slave_transaction_retries= @save_transaction_retries;
|
||||||
|
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
|
||||||
|
start slave sql_thread;
|
||||||
|
--source include/sync_with_master_gtid.inc
|
||||||
|
|
||||||
|
--source include/rpl_end.inc
|
||||||
|
--echo # End of rpl_deadlock_show_slave_status.test
|
@ -2537,23 +2537,10 @@ rpl_parallel::stop_during_until()
|
|||||||
|
|
||||||
|
|
||||||
bool
|
bool
|
||||||
rpl_parallel::workers_idle()
|
rpl_parallel::workers_idle(Relay_log_info *rli)
|
||||||
{
|
{
|
||||||
struct rpl_parallel_entry *e;
|
return rli->last_inuse_relaylog->queued_count ==
|
||||||
uint32 i, max_i;
|
rli->last_inuse_relaylog->dequeued_count;
|
||||||
|
|
||||||
max_i= domain_hash.records;
|
|
||||||
for (i= 0; i < max_i; ++i)
|
|
||||||
{
|
|
||||||
bool active;
|
|
||||||
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
|
|
||||||
mysql_mutex_lock(&e->LOCK_parallel_entry);
|
|
||||||
active= e->current_sub_id > e->last_committed_sub_id;
|
|
||||||
mysql_mutex_unlock(&e->LOCK_parallel_entry);
|
|
||||||
if (active)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return (i == max_i);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -369,9 +369,10 @@ struct rpl_parallel {
|
|||||||
rpl_parallel_entry *find(uint32 domain_id);
|
rpl_parallel_entry *find(uint32 domain_id);
|
||||||
void wait_for_done(THD *thd, Relay_log_info *rli);
|
void wait_for_done(THD *thd, Relay_log_info *rli);
|
||||||
void stop_during_until();
|
void stop_during_until();
|
||||||
bool workers_idle();
|
|
||||||
int wait_for_workers_idle(THD *thd);
|
int wait_for_workers_idle(THD *thd);
|
||||||
int do_event(rpl_group_info *serial_rgi, Log_event *ev, ulonglong event_size);
|
int do_event(rpl_group_info *serial_rgi, Log_event *ev, ulonglong event_size);
|
||||||
|
|
||||||
|
static bool workers_idle(Relay_log_info *rli);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -622,7 +622,7 @@ struct inuse_relaylog {
|
|||||||
rpl_gtid *relay_log_state;
|
rpl_gtid *relay_log_state;
|
||||||
uint32 relay_log_state_count;
|
uint32 relay_log_state_count;
|
||||||
/* Number of events in this relay log queued for worker threads. */
|
/* Number of events in this relay log queued for worker threads. */
|
||||||
int64 queued_count;
|
Atomic_counter<int64> queued_count;
|
||||||
/* Number of events completed by worker threads. */
|
/* Number of events completed by worker threads. */
|
||||||
Atomic_counter<int64> dequeued_count;
|
Atomic_counter<int64> dequeued_count;
|
||||||
/* Set when all events have been read from a relaylog. */
|
/* Set when all events have been read from a relaylog. */
|
||||||
|
11
sql/slave.cc
11
sql/slave.cc
@ -3123,6 +3123,14 @@ static bool send_show_master_info_data(THD *thd, Master_info *mi, bool full,
|
|||||||
mysql_mutex_lock(&mi->err_lock);
|
mysql_mutex_lock(&mi->err_lock);
|
||||||
/* err_lock is to protect mi->rli.last_error() */
|
/* err_lock is to protect mi->rli.last_error() */
|
||||||
mysql_mutex_lock(&mi->rli.err_lock);
|
mysql_mutex_lock(&mi->rli.err_lock);
|
||||||
|
|
||||||
|
DBUG_EXECUTE_IF("hold_sss_with_err_lock", {
|
||||||
|
DBUG_ASSERT(!debug_sync_set_action(
|
||||||
|
thd, STRING_WITH_LEN("now SIGNAL sss_got_err_lock "
|
||||||
|
"WAIT_FOR sss_continue")));
|
||||||
|
DBUG_SET("-d,hold_sss_with_err_lock");
|
||||||
|
});
|
||||||
|
|
||||||
protocol->store(mi->host, &my_charset_bin);
|
protocol->store(mi->host, &my_charset_bin);
|
||||||
protocol->store(mi->user, &my_charset_bin);
|
protocol->store(mi->user, &my_charset_bin);
|
||||||
protocol->store((uint32) mi->port);
|
protocol->store((uint32) mi->port);
|
||||||
@ -3197,7 +3205,8 @@ static bool send_show_master_info_data(THD *thd, Master_info *mi, bool full,
|
|||||||
while the slave is processing ignored events, such as those skipped
|
while the slave is processing ignored events, such as those skipped
|
||||||
due to slave_skip_counter.
|
due to slave_skip_counter.
|
||||||
*/
|
*/
|
||||||
if (mi->using_parallel() && idle && !mi->rli.parallel.workers_idle())
|
if (mi->using_parallel() && idle &&
|
||||||
|
!rpl_parallel::workers_idle(&mi->rli))
|
||||||
idle= false;
|
idle= false;
|
||||||
}
|
}
|
||||||
if (idle)
|
if (idle)
|
||||||
|
Reference in New Issue
Block a user