From d4309d4830c8889286ab586c04dc5545ba9bf0a4 Mon Sep 17 00:00:00 2001
From: Kristian Nielsen
Date: Thu, 15 Jun 2023 21:46:01 +0200
Subject: [PATCH] MDEV-31448: Killing a replica thread awaiting its GCO can hang/crash a parallel replica

Various test cases for the bugs around MDEV-31448.
Test cases due to Brandon Nesterenko, thanks!

Reviewed-by: Andrei Elkin
Signed-off-by: Kristian Nielsen
---
Notes:
    Review changes relative to the original submission:
    - The master session's debug_dbug is restored from @old_dbug once the
      transactions with forced commit ids have been issued. It was saved
      but never restored; the .result files carry the echoed statement.
    - The $t3_tid lookup in mdev-31448_conservative.inc uses the same
      'Slave_worker' filter as the wait_condition that precedes it.
    - The blob ids on the "index" lines below predate these edits.
    NOTE(review): "--let $slave_timeout=1032" (both .inc files) - 1032 looks
    like an error number rather than a timeout value; confirm the intent.
    NOTE(review): mdev-31448_kill_ooo_finish_optimistic.test synchronises
    with "--sleep 1", and its .result hard-codes the restored defaults
    (0, conservative, 50, 10); confirm both are acceptable.

 .../rpl/include/mdev-31448_conservative.inc   |  69 +++++++++
 .../rpl/include/mdev-31448_optimistic.inc     |  95 ++++++++++++
 ...ev-31448_kill_ooo_finish_optimistic.result |  53 +++++++
 .../suite/rpl/r/rpl_parallel_kill.result      | 145 ++++++++++++++++++
 ...mdev-31448_kill_ooo_finish_optimistic.test |  94 ++++++++++++
 mysql-test/suite/rpl/t/rpl_parallel_kill.test |  15 ++
 6 files changed, 471 insertions(+)
 create mode 100644 mysql-test/suite/rpl/include/mdev-31448_conservative.inc
 create mode 100644 mysql-test/suite/rpl/include/mdev-31448_optimistic.inc
 create mode 100644 mysql-test/suite/rpl/r/mdev-31448_kill_ooo_finish_optimistic.result
 create mode 100644 mysql-test/suite/rpl/r/rpl_parallel_kill.result
 create mode 100644 mysql-test/suite/rpl/t/mdev-31448_kill_ooo_finish_optimistic.test
 create mode 100644 mysql-test/suite/rpl/t/rpl_parallel_kill.test

diff --git a/mysql-test/suite/rpl/include/mdev-31448_conservative.inc b/mysql-test/suite/rpl/include/mdev-31448_conservative.inc
new file mode 100644
index 00000000000..9a2884439f6
--- /dev/null
+++ b/mysql-test/suite/rpl/include/mdev-31448_conservative.inc
@@ -0,0 +1,69 @@
+--connection master
+create table t1 (a int) engine=innodb;
+create table t2 (a int) engine=innodb;
+insert into t1 values (1);
+--source include/save_master_gtid.inc
+
+--connection slave
+call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
+
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+set @save.slave_parallel_threads= @@global.slave_parallel_threads;
+set @save.slave_parallel_mode= @@global.slave_parallel_mode;
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= CONSERVATIVE;
+--connection slave1
+BEGIN;
+update t1 set a=2 where a=1;
+
+--connection master
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
+
+# GCO 1
+SET @commit_id= 10000;
+# T1
+update t1 set a=2 where a=1;
+# T2
+insert into t2 values (1);
+
+# GCO 2
+SET @commit_id= 10001;
+# T3
+insert into t1 values (3);
+SET @@SESSION.debug_dbug= @old_dbug;
+
+--connection slave
+--source include/start_slave.inc
+
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(-1)' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+--let $t3_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker'`
+--evalp kill $t3_tid
+
+--connection slave1
+commit;
+
+--connection slave
+--let $slave_timeout=1032
+--source include/wait_for_slave_sql_to_stop.inc
+
+update t1 set a=1 where a=2;
+set @@global.slave_parallel_threads = @save.slave_parallel_threads;
+set @@global.slave_parallel_mode = @save.slave_parallel_mode;
+--source include/start_slave.inc
+
+--echo #
+--echo # Cleanup
+--connection master
+DROP TABLE t1, t2;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
diff --git a/mysql-test/suite/rpl/include/mdev-31448_optimistic.inc b/mysql-test/suite/rpl/include/mdev-31448_optimistic.inc
new file mode 100644
index 00000000000..9b72181d249
--- /dev/null
+++ b/mysql-test/suite/rpl/include/mdev-31448_optimistic.inc
@@ -0,0 +1,95 @@
+--echo # MDEV-31448 OOO finish event group by killed worker
+# The test demonstrates how a killed worker accesses the GCO lists
+# in finish_event_group() out-of-order to fire
+# DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id);
+# in the buggy version.
+
+--echo # Initialize test data
+--connection master
+create table t1 (a int) engine=innodb;
+create table t2 (a int) engine=innodb;
+
+insert into t1 values (1);
+--source include/save_master_gtid.inc
+
+--connection slave
+call mtr.add_suppression("Connection was killed");
+call mtr.add_suppression("Can.t find record");
+
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+set @save.slave_parallel_threads= @@global.slave_parallel_threads;
+set @save.slave_parallel_mode= @@global.slave_parallel_mode;
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+
+--connection slave1
+begin;
+update t1 set a=2 where a=1;
+
+--connection master
+set @old_dbug= @@session.debug_dbug;
+set @@session.debug_dbug="+d,binlog_force_commit_id";
+
+# GCO 1
+set @commit_id= 10000;
+# T1
+update t1 set a=2 where a=1;
+
+if (!$killed_trx_commits)
+{
+set @commit_id= 10001;
+# T2
+set statement skip_parallel_replication=1 for insert into t2 values (1);
+}
+
+if ($killed_trx_commits)
+{
+insert into t2 values (1);
+}
+# GCO 2
+# T3
+drop table t2;
+set @@session.debug_dbug= @old_dbug;
+
+--connection slave
+--source include/start_slave.inc
+
+--echo # wait for T1
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(-1)' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+--echo # wait for T2
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker'`
+--echo # wait for T3
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+--evalp kill $t2_tid
+# Wait for the slave to report error 1927 for the killed T2 before releasing T1
+--let $slave_param=Last_Errno
+--let $slave_param_value=1927
+--source include/wait_for_slave_param.inc
+--connection slave1
+commit;
+
+--connection slave
+--let $slave_timeout=1032
+--source include/wait_for_slave_sql_to_stop.inc
+
+update t1 set a=1 where a=2;
+set @@global.slave_parallel_threads = @save.slave_parallel_threads;
+set @@global.slave_parallel_mode = @save.slave_parallel_mode;
+--source include/start_slave.inc
+
+--echo #
+--echo # Cleanup
+--connection master
+drop table t1;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+
diff --git a/mysql-test/suite/rpl/r/mdev-31448_kill_ooo_finish_optimistic.result b/mysql-test/suite/rpl/r/mdev-31448_kill_ooo_finish_optimistic.result
new file mode 100644
index 00000000000..2753e66fe48
--- /dev/null
+++ b/mysql-test/suite/rpl/r/mdev-31448_kill_ooo_finish_optimistic.result
@@ -0,0 +1,53 @@
+include/master-slave.inc
+[connection master]
+# MDEV-31448 OOO finish event group by killed worker
+# Initialize test data
+connection master;
+call mtr.add_suppression("Slave: Connection was killed");
+call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
+create table t1 (a int) engine=innodb;
+create table t2 (a int) engine=innodb;
+insert into t1 values (1);
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 4;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+set @@global.innodb_lock_wait_timeout= 30;
+set @@global.slave_transaction_retries= 0;
+connection slave1;
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+a
+1
+connection master;
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 10000;
+update t1 set a=2 where a=1;
+set statement skip_parallel_replication=1 for insert into t2 values (1);
+drop table t2;
+SET @@SESSION.debug_dbug= @old_dbug;
+connection slave;
+include/start_slave.inc
+# wait for T1
+# wait for T2
+# wait for T3
+kill T2_TID;
+connection slave1;
+ROLLBACK;
+connection master;
+DROP TABLE t1;
+include/save_master_gtid.inc
+connection slave;
+#
+# Cleanup
+include/stop_slave.inc
+set @@global.slave_parallel_threads= 0;
+set @@global.slave_parallel_mode= conservative;
+set @@global.innodb_lock_wait_timeout= 50;
+set @@global.slave_transaction_retries= 10;
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/r/rpl_parallel_kill.result b/mysql-test/suite/rpl/r/rpl_parallel_kill.result
new file mode 100644
index 00000000000..7e6b065725b
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_parallel_kill.result
@@ -0,0 +1,145 @@
+include/master-slave.inc
+[connection master]
+connection master;
+create table t1 (a int) engine=innodb;
+create table t2 (a int) engine=innodb;
+insert into t1 values (1);
+include/save_master_gtid.inc
+connection slave;
+call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @save.slave_parallel_threads= @@global.slave_parallel_threads;
+set @save.slave_parallel_mode= @@global.slave_parallel_mode;
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= CONSERVATIVE;
+connection slave1;
+BEGIN;
+update t1 set a=2 where a=1;
+connection master;
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 10000;
+update t1 set a=2 where a=1;
+insert into t2 values (1);
+SET @commit_id= 10001;
+insert into t1 values (3);
+SET @@SESSION.debug_dbug= @old_dbug;
+connection slave;
+include/start_slave.inc
+kill $t3_tid;
+connection slave1;
+commit;
+connection slave;
+include/wait_for_slave_sql_to_stop.inc
+update t1 set a=1 where a=2;
+set @@global.slave_parallel_threads = @save.slave_parallel_threads;
+set @@global.slave_parallel_mode = @save.slave_parallel_mode;
+include/start_slave.inc
+#
+# Cleanup
+connection master;
+DROP TABLE t1, t2;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+# MDEV-31448 OOO finish event group by killed worker
+# Initialize test data
+connection master;
+create table t1 (a int) engine=innodb;
+create table t2 (a int) engine=innodb;
+insert into t1 values (1);
+include/save_master_gtid.inc
+connection slave;
+call mtr.add_suppression("Connection was killed");
+call mtr.add_suppression("Can.t find record");
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @save.slave_parallel_threads= @@global.slave_parallel_threads;
+set @save.slave_parallel_mode= @@global.slave_parallel_mode;
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+connection slave1;
+begin;
+update t1 set a=2 where a=1;
+connection master;
+set @old_dbug= @@session.debug_dbug;
+set @@session.debug_dbug="+d,binlog_force_commit_id";
+set @commit_id= 10000;
+update t1 set a=2 where a=1;
+insert into t2 values (1);
+drop table t2;
+set @@session.debug_dbug= @old_dbug;
+connection slave;
+include/start_slave.inc
+# wait for T1
+# wait for T2
+# wait for T3
+kill $t2_tid;
+include/wait_for_slave_param.inc [Last_Errno]
+connection slave1;
+commit;
+connection slave;
+include/wait_for_slave_sql_to_stop.inc
+update t1 set a=1 where a=2;
+set @@global.slave_parallel_threads = @save.slave_parallel_threads;
+set @@global.slave_parallel_mode = @save.slave_parallel_mode;
+include/start_slave.inc
+#
+# Cleanup
+connection master;
+drop table t1;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+# MDEV-31448 OOO finish event group by killed worker
+# Initialize test data
+connection master;
+create table t1 (a int) engine=innodb;
+create table t2 (a int) engine=innodb;
+insert into t1 values (1);
+include/save_master_gtid.inc
+connection slave;
+call mtr.add_suppression("Connection was killed");
+call mtr.add_suppression("Can.t find record");
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+set @save.slave_parallel_threads= @@global.slave_parallel_threads;
+set @save.slave_parallel_mode= @@global.slave_parallel_mode;
+set @@global.slave_parallel_threads= 3;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+connection slave1;
+begin;
+update t1 set a=2 where a=1;
+connection master;
+set @old_dbug= @@session.debug_dbug;
+set @@session.debug_dbug="+d,binlog_force_commit_id";
+set @commit_id= 10000;
+update t1 set a=2 where a=1;
+set @commit_id= 10001;
+set statement skip_parallel_replication=1 for insert into t2 values (1);
+drop table t2;
+set @@session.debug_dbug= @old_dbug;
+connection slave;
+include/start_slave.inc
+# wait for T1
+# wait for T2
+# wait for T3
+kill $t2_tid;
+include/wait_for_slave_param.inc [Last_Errno]
+connection slave1;
+commit;
+connection slave;
+include/wait_for_slave_sql_to_stop.inc
+update t1 set a=1 where a=2;
+set @@global.slave_parallel_threads = @save.slave_parallel_threads;
+set @@global.slave_parallel_mode = @save.slave_parallel_mode;
+include/start_slave.inc
+#
+# Cleanup
+connection master;
+drop table t1;
+include/save_master_gtid.inc
+connection slave;
+include/sync_with_master_gtid.inc
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/mdev-31448_kill_ooo_finish_optimistic.test b/mysql-test/suite/rpl/t/mdev-31448_kill_ooo_finish_optimistic.test
new file mode 100644
index 00000000000..ae15ed64a65
--- /dev/null
+++ b/mysql-test/suite/rpl/t/mdev-31448_kill_ooo_finish_optimistic.test
@@ -0,0 +1,94 @@
+--source include/master-slave.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/have_binlog_format_row.inc
+
+--echo # MDEV-31448 OOO finish event group by killed worker
+# The test demonstrates how a killed worker accesses the GCO lists
+# in finish_event_group() out-of-order to fire
+# DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id);
+# in the buggy version.
+
+--echo # Initialize test data
+--connection master
+call mtr.add_suppression("Slave: Connection was killed");
+call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
+create table t1 (a int) engine=innodb;
+create table t2 (a int) engine=innodb;
+
+insert into t1 values (1);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads`
+--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode`
+--let $save_innodb_lock_wait_timeout= `SELECT @@global.innodb_lock_wait_timeout`
+--let $save_transaction_retries= `SELECT @@global.slave_transaction_retries`
+set @@global.slave_parallel_threads= 4;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+set @@global.innodb_lock_wait_timeout= 30;
+set @@global.slave_transaction_retries= 0;
+
+--connection slave1
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+
+--connection master
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
+
+# GCO 1
+SET @commit_id= 10000;
+# T1
+update t1 set a=2 where a=1;
+# T2
+set statement skip_parallel_replication=1 for insert into t2 values (1);
+
+# GCO 2
+# T3
+drop table t2;
+SET @@SESSION.debug_dbug= @old_dbug;
+
+--connection slave
+--source include/start_slave.inc
+
+--echo # wait for T1
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(-1)' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+--echo # wait for T2
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker'`
+--echo # wait for T3
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
+--source include/wait_condition.inc
+
+--replace_result $t2_tid T2_TID
+--eval kill $t2_tid
+
+--sleep 1
+
+--connection slave1
+# Release the blocked T1
+ROLLBACK;
+
+--connection master
+DROP TABLE t1;
+--source include/save_master_gtid.inc
+
+--connection slave
+--echo #
+--echo # Cleanup
+--source include/stop_slave.inc
+eval set @@global.slave_parallel_threads= $save_slave_parallel_threads;
+eval set @@global.slave_parallel_mode= $save_slave_parallel_mode;
+eval set @@global.innodb_lock_wait_timeout= $save_innodb_lock_wait_timeout;
+eval set @@global.slave_transaction_retries= $save_transaction_retries;
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+--source include/rpl_end.inc
+
diff --git a/mysql-test/suite/rpl/t/rpl_parallel_kill.test b/mysql-test/suite/rpl/t/rpl_parallel_kill.test
new file mode 100644
index 00000000000..563b0aa61e9
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_parallel_kill.test
@@ -0,0 +1,15 @@
+--source include/master-slave.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/have_binlog_format_row.inc
+
+--source include/mdev-31448_conservative.inc
+
+--let $killed_trx_commits=1
+--source include/mdev-31448_optimistic.inc
+--let $killed_trx_commits=0
+--source include/mdev-31448_optimistic.inc
+
+
+
+--source include/rpl_end.inc