diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result index 3f44505e0f6..124b7c9a029 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel.result +++ b/mysql-test/suite/rpl/r/rpl_parallel.result @@ -1537,7 +1537,7 @@ a b 120 0 121 0 include/stop_slave.inc -SET GLOBAL debug_dbug= @old_debug; +SET GLOBAL debug_dbug= @old_dbug; include/start_slave.inc *** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. *** include/stop_slave.inc @@ -1566,7 +1566,88 @@ a b 130 0 131 0 include/stop_slave.inc -SET GLOBAL debug_dbug= @old_debug; +SET GLOBAL debug_dbug= @old_dbug; +include/start_slave.inc +*** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) *** +INSERT INTO t3 VALUES (201,0), (202,0); +include/save_master_gtid.inc +include/sync_with_master_gtid.inc +include/stop_slave.inc +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug= '+d,inject_mdev8031'; +SET @old_dbug= @@SESSION.debug_dbug; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; +SET @commit_id= 10200; +INSERT INTO t3 VALUES (203, 1); +INSERT INTO t3 VALUES (204, 1); +INSERT INTO t3 VALUES (205, 1); +UPDATE t3 SET b=b+1 WHERE a=201; +UPDATE t3 SET b=b+1 WHERE a=201; +UPDATE t3 SET b=b+1 WHERE a=201; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=203; +UPDATE t3 SET b=b+1 WHERE a=203; +UPDATE t3 SET b=b+1 WHERE a=204; +UPDATE t3 SET b=b+1 WHERE a=204; +UPDATE t3 SET b=b+1 WHERE a=204; +UPDATE t3 SET b=b+1 WHERE a=203; +UPDATE t3 SET b=b+1 WHERE a=205; +UPDATE t3 SET b=b+1 WHERE a=205; +SET SESSION debug_dbug=@old_dbug; +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +a b +201 3 +202 4 +203 4 +204 4 +205 3 +include/save_master_gtid.inc +include/start_slave.inc +include/sync_with_master_gtid.inc +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +a b +201 3 +202 4 +203 4 +204 4 +205 3 +include/stop_slave.inc +SET GLOBAL debug_dbug= @old_dbug; +include/start_slave.inc +*** Check getting deadlock killed inside open_binlog() during retry. *** +include/stop_slave.inc +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill'; +SET @old_max= @@GLOBAL.max_relay_log_size; +SET GLOBAL max_relay_log_size= 4096; +SET @old_dbug= @@SESSION.debug_dbug; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; +SET @commit_id= 10210; +Omit long queries that cause relaylog rotations and transaction retries... +SET SESSION debug_dbug=@old_dbug; +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +a b +201 6 +202 8 +203 7 +204 7 +205 5 +include/save_master_gtid.inc +include/start_slave.inc +include/sync_with_master_gtid.inc +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +a b +201 6 +202 8 +203 7 +204 7 +205 5 +include/stop_slave.inc +SET GLOBAL debug_dbug= @old_debg; +SET GLOBAL max_relay_log_size= @old_max; include/start_slave.inc include/stop_slave.inc SET GLOBAL slave_parallel_threads=@old_parallel_threads; diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test index 1ca164fdc97..feac32b1454 100644 --- a/mysql-test/suite/rpl/t/rpl_parallel.test +++ b/mysql-test/suite/rpl/t/rpl_parallel.test @@ -2155,7 +2155,7 @@ SELECT * FROM t3 WHERE a >= 120 ORDER BY a; SELECT * FROM t3 WHERE a >= 120 ORDER BY a; --source include/stop_slave.inc -SET GLOBAL debug_dbug= @old_debug; +SET GLOBAL debug_dbug= @old_dbug; --source include/start_slave.inc @@ -2202,10 +2202,119 @@ SELECT * FROM t3 WHERE a >= 130 ORDER BY a; SELECT * FROM t3 WHERE a >= 130 ORDER BY a; --source include/stop_slave.inc -SET GLOBAL debug_dbug= @old_debug; +SET GLOBAL debug_dbug= @old_dbug; --source include/start_slave.inc +--echo *** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) *** + +--connection server_1 +INSERT INTO t3 VALUES (201,0), (202,0); +--source include/save_master_gtid.inc + +--connection server_2 +--source include/sync_with_master_gtid.inc +--source include/stop_slave.inc +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug= '+d,inject_mdev8031'; + +--connection server_1 +# We artificially create a situation that hopefully resembles the original +# bug which was only seen "in the wild", and only once. +# Setup a fake group commit with lots of conflicts that will lead to deadloc +# kill. The slave DBUG injection causes the slave to be deadlock killed at +# a particular point during the retry, and then later do a small sleep at +# another critical point where the prior transaction then has a chance to +# complete. Finally an extra KILL check catches an unhandled, lingering +# deadlock kill. So rather artificial, but at least it exercises the +# relevant code paths. +SET @old_dbug= @@SESSION.debug_dbug; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; + +SET @commit_id= 10200; +INSERT INTO t3 VALUES (203, 1); +INSERT INTO t3 VALUES (204, 1); +INSERT INTO t3 VALUES (205, 1); +UPDATE t3 SET b=b+1 WHERE a=201; +UPDATE t3 SET b=b+1 WHERE a=201; +UPDATE t3 SET b=b+1 WHERE a=201; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=202; +UPDATE t3 SET b=b+1 WHERE a=203; +UPDATE t3 SET b=b+1 WHERE a=203; +UPDATE t3 SET b=b+1 WHERE a=204; +UPDATE t3 SET b=b+1 WHERE a=204; +UPDATE t3 SET b=b+1 WHERE a=204; +UPDATE t3 SET b=b+1 WHERE a=203; +UPDATE t3 SET b=b+1 WHERE a=205; +UPDATE t3 SET b=b+1 WHERE a=205; +SET SESSION debug_dbug=@old_dbug; + +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc + +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +--source include/stop_slave.inc +SET GLOBAL debug_dbug= @old_dbug; +--source include/start_slave.inc + + +--echo *** Check getting deadlock killed inside open_binlog() during retry. *** + +--connection server_2 +--source include/stop_slave.inc +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill'; +SET @old_max= @@GLOBAL.max_relay_log_size; +SET GLOBAL max_relay_log_size= 4096; + +--connection server_1 +SET @old_dbug= @@SESSION.debug_dbug; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; + +--let $large= `SELECT REPEAT("*", 8192)` +SET @commit_id= 10210; +--echo Omit long queries that cause relaylog rotations and transaction retries... +--disable_query_log +eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */; +eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */; +--enable_query_log +SET SESSION debug_dbug=@old_dbug; + +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc + +SELECT * FROM t3 WHERE a>=200 ORDER BY a; +--source include/stop_slave.inc +SET GLOBAL debug_dbug= @old_debg; +SET GLOBAL max_relay_log_size= @old_max; +--source include/start_slave.inc + + + # Clean up. --connection server_2 --source include/stop_slave.inc diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index b7b1ffbeeff..99ddde95689 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -2,6 +2,7 @@ #include "rpl_parallel.h" #include "slave.h" #include "rpl_mi.h" +#include "sql_parse.h" #include "debug_sync.h" /* @@ -326,7 +327,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, IO_CACHE rlog; LOG_INFO linfo; File fd= (File)-1; - const char *errmsg= NULL; + const char *errmsg; inuse_relaylog *ir= rgi->relay_log; uint64 event_count; uint64 events_to_execute= rgi->retry_event_count; @@ -342,6 +343,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, do_retry: event_count= 0; err= 0; + errmsg= NULL; /* If we already started committing before getting the deadlock (or other @@ -377,7 +379,16 @@ do_retry: */ if(thd->wait_for_commit_ptr) thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Simulate that we get deadlock killed at this exact point. */ + rgi->killed_for_retry= true; + mysql_mutex_lock(&thd->LOCK_thd_data); + thd->killed= KILL_CONNECTION; + mysql_mutex_unlock(&thd->LOCK_thd_data); + }); rgi->cleanup_context(thd, 1); + thd->reset_killed(); + thd->clear_error(); /* If we retry due to a deadlock kill that occured during the commit step, we @@ -418,10 +429,22 @@ do_retry: complete its commit. */ thd->clear_error(); + thd->reset_killed(); if(thd->wait_for_commit_ptr) thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Inject a small sleep to give prior transaction a chance to commit. */ + my_sleep(100000); + }); } + /* + Let us clear any lingering deadlock kill one more time, here after + wait_for_prior_commit() has completed. This should rule out any + possibility of an old deadlock kill lingering on beyond this point. + */ + thd->reset_killed(); + strmake_buf(log_name, ir->name); if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) { @@ -437,6 +460,14 @@ do_retry: err= 1; goto err; } + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Simulate pending KILL caught in read_relay_log_description_event(). */ + if (thd->check_killed()) { + thd->send_kill_message(); + err= 1; + goto err; + } + }); my_b_seek(&rlog, cur_offset); do @@ -459,7 +490,7 @@ do_retry: { errmsg= "slave SQL thread aborted because of I/O error"; err= 1; - goto err; + goto check_retry; } if (rlog.error > 0) { @@ -488,10 +519,25 @@ do_retry: } strmake_buf(log_name ,linfo.log_file_name); + DBUG_EXECUTE_IF("inject_retry_event_group_open_binlog_kill", { + if (retries < 2) + { + /* Simulate that we get deadlock killed during open_binlog(). */ + mysql_reset_thd_for_next_command(thd); + rgi->killed_for_retry= true; + mysql_mutex_lock(&thd->LOCK_thd_data); + thd->killed= KILL_CONNECTION; + mysql_mutex_unlock(&thd->LOCK_thd_data); + thd->send_kill_message(); + fd= (File)-1; + err= 1; + goto check_retry; + } + }); if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) { err= 1; - goto err; + goto check_retry; } /* Loop to try again on the new log file. */ } @@ -534,26 +580,31 @@ do_retry: if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd);); DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100", err= dbug_simulate_tmp_error(rgi, thd);); - if (err) + if (!err) + continue; + +check_retry: + convert_kill_to_deadlock_error(rgi); + if (has_temporary_error(thd)) { - convert_kill_to_deadlock_error(rgi); - if (has_temporary_error(thd)) + ++retries; + if (retries < slave_trans_retries) { - ++retries; - if (retries < slave_trans_retries) + if (fd >= 0) { end_io_cache(&rlog); mysql_file_close(fd, MYF(MY_WME)); fd= (File)-1; - goto do_retry; } - sql_print_error("Slave worker thread retried transaction %lu time(s) " - "in vain, giving up. Consider raising the value of " - "the slave_transaction_retries variable.", - slave_trans_retries); + goto do_retry; } - goto err; + sql_print_error("Slave worker thread retried transaction %lu time(s) " + "in vain, giving up. Consider raising the value of " + "the slave_transaction_retries variable.", + slave_trans_retries); } + goto err; + } while (event_count < events_to_execute); err: