diff --git a/mysql-test/suite/rpl/r/rpl_parallel_ftwrl.result b/mysql-test/suite/rpl/r/rpl_parallel_ftwrl.result new file mode 100644 index 00000000000..b6c71055fe7 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_parallel_ftwrl.result @@ -0,0 +1,105 @@ +include/master-slave.inc +[connection master] +connection slave; +include/stop_slave.inc +SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads; +SET GLOBAL slave_parallel_threads=3; +SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode; +SET GLOBAL slave_parallel_mode=aggressive; +SET @old_dbug= @@GLOBAL.debug_dbug; +CHANGE MASTER TO master_use_gtid=slave_pos; +include/start_slave.inc +*** MDEV-31509: Lost data with FTWRL and STOP SLAVE +connection master; +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=MyISAM; +INSERT INTO t1 VALUES (0,0); +INSERT INTO t2 VALUES (0,0); +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +connection slave; +*** Arrange for T1 to delay before entering GCO wait. +SET GLOBAL debug_dbug="+d,gco_wait_delay_gtid_0_x_99"; +*** Arrange for T2 to wait for FTWRL to start. +SET GLOBAL debug_dbug="+d,hold_worker_on_schedule"; +*** Arrange for T2 to delay wakeup from FTWRL pause. +SET GLOBAL debug_dbug="+d,delay_ftwrl_wait_gtid_0_x_100"; +connection master; +*** Event group T1 +SET SESSION gtid_seq_no=99; +INSERT INTO t1 VALUES (1,1); +connection slave; +*** 1a. Wait for T1 to be queued. +SET debug_sync="now WAIT_FOR gco_wait_paused"; +connection master; +*** Event group T2 +SET SESSION gtid_seq_no=100; +INSERT INTO t2 VALUES (2,2); +connection slave; +*** 1b. Wait for T2 to be queued. +SET debug_sync= "now WAIT_FOR reached_pause"; +connection slave1; +*** 2. Run FTWRL +SET GLOBAL debug_dbug= "+d,pause_for_ftwrl_wait"; +FLUSH TABLES WITH READ LOCK; +connection slave; +SET debug_sync= "now WAIT_FOR pause_ftwrl_waiting"; +*** 3. Wait for T2 to be waiting for FTWRL pause +SET debug_sync= "now SIGNAL continue_worker"; +*** 4. FTWRL completes, UNLOCK TABLES. +SET debug_sync="now SIGNAL pause_ftwrl_cont"; +connection slave1; +UNLOCK TABLES; +connection slave; +*** T2 is now ready to proceed after FTWRL pause, but did not wake up yet. +SET debug_sync="now WAIT_FOR pause_wait_started"; +*** 5. STOP SLAVE is run. +connection slave1; +SET GLOBAL debug_dbug="+d,rpl_parallel_wait_for_done_trigger"; +STOP SLAVE; +connection slave; +SET debug_sync="now WAIT_FOR wait_for_done_waiting"; +*** 5. T2 wakes up after FTWRL pause, reaches wait_for_prior_commit(). +SET debug_sync="now SIGNAL pause_wait_continue"; +*** 6. T1 starts. +SET debug_sync="now SIGNAL gco_wait_cont"; +connection slave1; +connection slave; +include/wait_for_slave_to_stop.inc +connection master; +SELECT * FROM t1 ORDER BY a; +a b +0 0 +1 1 +SELECT * FROM t2 ORDER BY a; +a b +0 0 +2 2 +include/save_master_gtid.inc +connection slave; +include/start_slave.inc +include/sync_with_master_gtid.inc +SELECT @@GLOBAL.gtid_slave_pos; +@@GLOBAL.gtid_slave_pos +0-1-100 +SELECT * FROM t1 ORDER BY a; +a b +0 0 +1 1 +SELECT * FROM t2 ORDER BY a; +a b +0 0 +2 2 +*** Clean up. +connection slave; +include/stop_slave.inc +SET DEBUG_SYNC= "RESET"; +SET GLOBAL slave_parallel_threads= @old_parallel_threads; +SET GLOBAL slave_parallel_mode= @old_parallel_mode; +SET GLOBAL debug_dbug=@old_dbug; +include/start_slave.inc +connection master; +DROP TABLE t1, t2; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_parallel_ftwrl.test b/mysql-test/suite/rpl/t/rpl_parallel_ftwrl.test new file mode 100644 index 00000000000..308fcd0bd1f --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_parallel_ftwrl.test @@ -0,0 +1,143 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/have_binlog_format_mixed.inc +--source include/master-slave.inc + +--connection slave +--source include/stop_slave.inc +SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads; +SET GLOBAL slave_parallel_threads=3; +SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode; +SET GLOBAL slave_parallel_mode=aggressive; +SET @old_dbug= @@GLOBAL.debug_dbug; +CHANGE MASTER TO master_use_gtid=slave_pos; +--source include/start_slave.inc + +--echo *** MDEV-31509: Lost data with FTWRL and STOP SLAVE +# The bug was as follows: +# 1. Event groups T1 and T2 are queued but not started yet. +# 2. FLUSH TABLE WITH READ LOCKS starts, sets rpl_parallel_entry::pause_sub_id +# 3. T2 Sees pause_sub_id, goes to wait for the pause to complete. +# 4. FTWRL completes, UNLOCK TABLES is run. +# 5. STOP SLAVE is run, sets rpl_parallel_entry::stop_sub_id. +# 6. T2 wakes up after FTWRL pause, only now sets +# rpl_parallel_entry::largest_started_sub_id. This is the bug, +# largest_started_sub_id is set too late here. +# 7. T1 starts, it sees stop_sub_idLOCK_parallel_entry); } while (wait_count > entry->count_committing_event_groups); } +} - if (entry->force_abort && rgi->gtid_sub_id > entry->stop_sub_id) + +static bool +do_stop_handling(rpl_group_info *rgi) +{ + bool should_stop= false; + rpl_parallel_entry *entry= rgi->parallel_entry; + + mysql_mutex_assert_owner(&entry->LOCK_parallel_entry); + + if (unlikely(entry->force_abort) && rgi->gtid_sub_id > entry->stop_sub_id) { /* We are stopping (STOP SLAVE), and this event group need not be applied @@ -430,10 +440,26 @@ do_gco_wait(rpl_group_info *rgi, group_commit_orderer *gco, rather than execute, the following events. Once all queued events have been skipped, the STOP SLAVE is complete (for this thread). */ - return true; + should_stop= true; } - else - return false; + + if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id)) + { + rgi->worker_error= 1; + should_stop= true; + } + + if (likely(!should_stop)) + { + /* + Since we did not decide to stop, bump the largest_started_sub_id while + still holding LOCK_parallel_entry. + */ + if (rgi->gtid_sub_id > entry->largest_started_sub_id) + entry->largest_started_sub_id= rgi->gtid_sub_id; + } + + return should_stop; } @@ -480,15 +506,25 @@ do_ftwrl_wait(rpl_group_info *rgi, mysql_cond_wait(&entry->COND_parallel_entry, &entry->LOCK_parallel_entry); } while (sub_id > entry->pause_sub_id); + DBUG_EXECUTE_IF("delay_ftwrl_wait_gtid_0_x_100", { + if (rgi->current_gtid.domain_id == 0 && + rgi->current_gtid.seq_no == 100) { + /* + Simulate delayed wakeup from the mysql_cond_wait(). To do this, we + need to have the LOCK_parallel_entry mutex released during the wait. + */ + mysql_mutex_unlock(&entry->LOCK_parallel_entry); + debug_sync_set_action(thd, + STRING_WITH_LEN("now SIGNAL pause_wait_started WAIT_FOR pause_wait_continue")); + mysql_mutex_lock(&entry->LOCK_parallel_entry); + } + }); /* We do not call EXIT_COND() here, as this will be done later by our caller (since we set *did_enter_cond to true). */ } - if (sub_id > entry->largest_started_sub_id) - entry->largest_started_sub_id= sub_id; - DBUG_RETURN(aborted); } @@ -646,7 +682,17 @@ rpl_pause_for_ftwrl(THD *thd) mysql_mutex_unlock(&rpt->LOCK_rpl_thread); ++e->need_sub_id_signal; if (e->pause_sub_id == (uint64)ULONGLONG_MAX) + { e->pause_sub_id= e->largest_started_sub_id; + DBUG_EXECUTE_IF("pause_for_ftwrl_wait", { + mysql_mutex_unlock(&e->LOCK_parallel_entry); + debug_sync_set_action(thd, + STRING_WITH_LEN("now " + "SIGNAL pause_ftwrl_waiting " + "WAIT_FOR pause_ftwrl_cont")); + mysql_mutex_lock(&e->LOCK_parallel_entry); + }); + } thd->ENTER_COND(&e->COND_parallel_entry, &e->LOCK_parallel_entry, &stage_waiting_for_ftwrl_threads_to_pause, &old_stage); thd->set_time_for_next_stage(); @@ -1284,14 +1330,15 @@ handle_rpl_parallel_thread(void *arg) event_gtid_sub_id= rgi->gtid_sub_id; rgi->thd= thd; - mysql_mutex_lock(&entry->LOCK_parallel_entry); - skip_event_group= do_gco_wait(rgi, gco, &did_enter_cond, &old_stage); + DBUG_EXECUTE_IF("gco_wait_delay_gtid_0_x_99", { + if (rgi->current_gtid.domain_id == 0 && rgi->current_gtid.seq_no == 99) { + debug_sync_set_action(thd, + STRING_WITH_LEN("now SIGNAL gco_wait_paused WAIT_FOR gco_wait_cont")); + } }); - if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id)) - { - skip_event_group= true; - rgi->worker_error= 1; - } + mysql_mutex_lock(&entry->LOCK_parallel_entry); + do_gco_wait(rgi, gco, &did_enter_cond, &old_stage); + skip_event_group= do_stop_handling(rgi); if (likely(!skip_event_group)) skip_event_group= do_ftwrl_wait(rgi, &did_enter_cond, &old_stage);