mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
MDEV-31509: Lost data with FTWRL and STOP SLAVE
The largest_started_sub_id needs to be set under LOCK_parallel_entry together with testing stop_sub_id. However, in-between was the logic for do_ftwrl_wait(), which temporarily releases the mutex. This could lead to inconsistent stopping amongst worker threads and lost data. Fix by moving all the stop-related logic out from unrelated do_gco_wait() and do_ftwrl_wait() and into its own function do_stop_handling(). Reviewed-by: Andrei Elkin <andrei.elkin@mariadb.com> Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
105
mysql-test/suite/rpl/r/rpl_parallel_ftwrl.result
Normal file
105
mysql-test/suite/rpl/r/rpl_parallel_ftwrl.result
Normal file
@@ -0,0 +1,105 @@
|
||||
include/master-slave.inc
|
||||
[connection master]
|
||||
connection slave;
|
||||
include/stop_slave.inc
|
||||
SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads;
|
||||
SET GLOBAL slave_parallel_threads=3;
|
||||
SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
|
||||
SET GLOBAL slave_parallel_mode=aggressive;
|
||||
SET @old_dbug= @@GLOBAL.debug_dbug;
|
||||
CHANGE MASTER TO master_use_gtid=slave_pos;
|
||||
include/start_slave.inc
|
||||
*** MDEV-31509: Lost data with FTWRL and STOP SLAVE
|
||||
connection master;
|
||||
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
|
||||
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
|
||||
CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=MyISAM;
|
||||
INSERT INTO t1 VALUES (0,0);
|
||||
INSERT INTO t2 VALUES (0,0);
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
include/sync_with_master_gtid.inc
|
||||
connection slave;
|
||||
*** Arrange for T1 to delay before entering GCO wait.
|
||||
SET GLOBAL debug_dbug="+d,gco_wait_delay_gtid_0_x_99";
|
||||
*** Arrange for T2 to wait for FTWRL to start.
|
||||
SET GLOBAL debug_dbug="+d,hold_worker_on_schedule";
|
||||
*** Arrange for T2 to delay wakeup from FTWRL pause.
|
||||
SET GLOBAL debug_dbug="+d,delay_ftwrl_wait_gtid_0_x_100";
|
||||
connection master;
|
||||
*** Event group T1
|
||||
SET SESSION gtid_seq_no=99;
|
||||
INSERT INTO t1 VALUES (1,1);
|
||||
connection slave;
|
||||
*** 1a. Wait for T1 to be queued.
|
||||
SET debug_sync="now WAIT_FOR gco_wait_paused";
|
||||
connection master;
|
||||
*** Event group T2
|
||||
SET SESSION gtid_seq_no=100;
|
||||
INSERT INTO t2 VALUES (2,2);
|
||||
connection slave;
|
||||
*** 1b. Wait for T2 to be queued.
|
||||
SET debug_sync= "now WAIT_FOR reached_pause";
|
||||
connection slave1;
|
||||
*** 2. Run FTWRL
|
||||
SET GLOBAL debug_dbug= "+d,pause_for_ftwrl_wait";
|
||||
FLUSH TABLES WITH READ LOCK;
|
||||
connection slave;
|
||||
SET debug_sync= "now WAIT_FOR pause_ftwrl_waiting";
|
||||
*** 3. Wait for T2 to be waiting for FTWRL pause
|
||||
SET debug_sync= "now SIGNAL continue_worker";
|
||||
*** 4. FTWRL completes, UNLOCK TABLES.
|
||||
SET debug_sync="now SIGNAL pause_ftwrl_cont";
|
||||
connection slave1;
|
||||
UNLOCK TABLES;
|
||||
connection slave;
|
||||
*** T2 is now ready to proceed after FTWRL pause, but did not wake up yet.
|
||||
SET debug_sync="now WAIT_FOR pause_wait_started";
|
||||
*** 5. STOP SLAVE is run.
|
||||
connection slave1;
|
||||
SET GLOBAL debug_dbug="+d,rpl_parallel_wait_for_done_trigger";
|
||||
STOP SLAVE;
|
||||
connection slave;
|
||||
SET debug_sync="now WAIT_FOR wait_for_done_waiting";
|
||||
*** 5. T2 wakes up after FTWRL pause, reaches wait_for_prior_commit().
|
||||
SET debug_sync="now SIGNAL pause_wait_continue";
|
||||
*** 6. T1 starts.
|
||||
SET debug_sync="now SIGNAL gco_wait_cont";
|
||||
connection slave1;
|
||||
connection slave;
|
||||
include/wait_for_slave_to_stop.inc
|
||||
connection master;
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
a b
|
||||
0 0
|
||||
1 1
|
||||
SELECT * FROM t2 ORDER BY a;
|
||||
a b
|
||||
0 0
|
||||
2 2
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
include/start_slave.inc
|
||||
include/sync_with_master_gtid.inc
|
||||
SELECT @@GLOBAL.gtid_slave_pos;
|
||||
@@GLOBAL.gtid_slave_pos
|
||||
0-1-100
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
a b
|
||||
0 0
|
||||
1 1
|
||||
SELECT * FROM t2 ORDER BY a;
|
||||
a b
|
||||
0 0
|
||||
2 2
|
||||
*** Clean up.
|
||||
connection slave;
|
||||
include/stop_slave.inc
|
||||
SET DEBUG_SYNC= "RESET";
|
||||
SET GLOBAL slave_parallel_threads= @old_parallel_threads;
|
||||
SET GLOBAL slave_parallel_mode= @old_parallel_mode;
|
||||
SET GLOBAL debug_dbug=@old_dbug;
|
||||
include/start_slave.inc
|
||||
connection master;
|
||||
DROP TABLE t1, t2;
|
||||
include/rpl_end.inc
|
143
mysql-test/suite/rpl/t/rpl_parallel_ftwrl.test
Normal file
143
mysql-test/suite/rpl/t/rpl_parallel_ftwrl.test
Normal file
@@ -0,0 +1,143 @@
|
||||
--source include/have_innodb.inc
|
||||
--source include/have_debug.inc
|
||||
--source include/have_debug_sync.inc
|
||||
--source include/have_binlog_format_mixed.inc
|
||||
--source include/master-slave.inc
|
||||
|
||||
--connection slave
|
||||
--source include/stop_slave.inc
|
||||
SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads;
|
||||
SET GLOBAL slave_parallel_threads=3;
|
||||
SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
|
||||
SET GLOBAL slave_parallel_mode=aggressive;
|
||||
SET @old_dbug= @@GLOBAL.debug_dbug;
|
||||
CHANGE MASTER TO master_use_gtid=slave_pos;
|
||||
--source include/start_slave.inc
|
||||
|
||||
--echo *** MDEV-31509: Lost data with FTWRL and STOP SLAVE
|
||||
# The bug was as follows:
|
||||
# 1. Event groups T1 and T2 are queued but not started yet.
|
||||
# 2. FLUSH TABLE WITH READ LOCKS starts, sets rpl_parallel_entry::pause_sub_id
|
||||
# 3. T2 Sees pause_sub_id, goes to wait for the pause to complete.
|
||||
# 4. FTWRL completes, UNLOCK TABLES is run.
|
||||
# 5. STOP SLAVE is run, sets rpl_parallel_entry::stop_sub_id.
|
||||
# 6. T2 wakes up after FTWRL pause, only now sets
|
||||
# rpl_parallel_entry::largest_started_sub_id. This is the bug,
|
||||
# largest_started_sub_id is set too late here.
|
||||
# 7. T1 starts, it sees stop_sub_id<T1, so T1 is skipped due to STOP SLAVE.
|
||||
# 8. T2 continues, its check for stop_sub_id was before STOP SLAVE. So T2 is
|
||||
# wrongly applied, silently losing transaction T1.
|
||||
|
||||
|
||||
--connection master
|
||||
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
|
||||
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
|
||||
CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=MyISAM;
|
||||
INSERT INTO t1 VALUES (0,0);
|
||||
INSERT INTO t2 VALUES (0,0);
|
||||
--source include/save_master_gtid.inc
|
||||
|
||||
--connection slave
|
||||
--source include/sync_with_master_gtid.inc
|
||||
|
||||
--connection slave
|
||||
--echo *** Arrange for T1 to delay before entering GCO wait.
|
||||
SET GLOBAL debug_dbug="+d,gco_wait_delay_gtid_0_x_99";
|
||||
--echo *** Arrange for T2 to wait for FTWRL to start.
|
||||
SET GLOBAL debug_dbug="+d,hold_worker_on_schedule";
|
||||
--echo *** Arrange for T2 to delay wakeup from FTWRL pause.
|
||||
SET GLOBAL debug_dbug="+d,delay_ftwrl_wait_gtid_0_x_100";
|
||||
|
||||
--connection master
|
||||
--echo *** Event group T1
|
||||
SET SESSION gtid_seq_no=99;
|
||||
INSERT INTO t1 VALUES (1,1);
|
||||
|
||||
--connection slave
|
||||
--echo *** 1a. Wait for T1 to be queued.
|
||||
SET debug_sync="now WAIT_FOR gco_wait_paused";
|
||||
|
||||
--connection master
|
||||
--echo *** Event group T2
|
||||
SET SESSION gtid_seq_no=100;
|
||||
INSERT INTO t2 VALUES (2,2);
|
||||
|
||||
--connection slave
|
||||
--echo *** 1b. Wait for T2 to be queued.
|
||||
SET debug_sync= "now WAIT_FOR reached_pause";
|
||||
|
||||
--connection slave1
|
||||
--echo *** 2. Run FTWRL
|
||||
SET GLOBAL debug_dbug= "+d,pause_for_ftwrl_wait";
|
||||
send FLUSH TABLES WITH READ LOCK;
|
||||
|
||||
--connection slave
|
||||
SET debug_sync= "now WAIT_FOR pause_ftwrl_waiting";
|
||||
|
||||
--echo *** 3. Wait for T2 to be waiting for FTWRL pause
|
||||
SET debug_sync= "now SIGNAL continue_worker";
|
||||
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "%Waiting due to global read lock%" and command="Slave_worker";
|
||||
--source include/wait_condition.inc
|
||||
|
||||
--echo *** 4. FTWRL completes, UNLOCK TABLES.
|
||||
SET debug_sync="now SIGNAL pause_ftwrl_cont";
|
||||
|
||||
--connection slave1
|
||||
reap;
|
||||
UNLOCK TABLES;
|
||||
|
||||
--connection slave
|
||||
--echo *** T2 is now ready to proceed after FTWRL pause, but did not wake up yet.
|
||||
SET debug_sync="now WAIT_FOR pause_wait_started";
|
||||
|
||||
--echo *** 5. STOP SLAVE is run.
|
||||
--connection slave1
|
||||
SET GLOBAL debug_dbug="+d,rpl_parallel_wait_for_done_trigger";
|
||||
send STOP SLAVE;
|
||||
|
||||
--connection slave
|
||||
SET debug_sync="now WAIT_FOR wait_for_done_waiting";
|
||||
|
||||
--echo *** 5. T2 wakes up after FTWRL pause, reaches wait_for_prior_commit().
|
||||
SET debug_sync="now SIGNAL pause_wait_continue";
|
||||
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "%Waiting for prior transaction to commit%" and command="Slave_worker";
|
||||
--source include/wait_condition.inc
|
||||
|
||||
--echo *** 6. T1 starts.
|
||||
SET debug_sync="now SIGNAL gco_wait_cont";
|
||||
|
||||
--connection slave1
|
||||
reap;
|
||||
|
||||
--connection slave
|
||||
--source include/wait_for_slave_to_stop.inc
|
||||
|
||||
--connection master
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
SELECT * FROM t2 ORDER BY a;
|
||||
--source include/save_master_gtid.inc
|
||||
|
||||
--connection slave
|
||||
--source include/start_slave.inc
|
||||
--source include/sync_with_master_gtid.inc
|
||||
|
||||
# The bug here was that T2 was errorneously replicated while T1 was
|
||||
# being skipped due to STOP SLAVE. So the @@gtid_slave_pos was at T2,
|
||||
# but we were missing the data from T1.
|
||||
SELECT @@GLOBAL.gtid_slave_pos;
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
SELECT * FROM t2 ORDER BY a;
|
||||
|
||||
--echo *** Clean up.
|
||||
--connection slave
|
||||
--source include/stop_slave.inc
|
||||
SET DEBUG_SYNC= "RESET";
|
||||
SET GLOBAL slave_parallel_threads= @old_parallel_threads;
|
||||
SET GLOBAL slave_parallel_mode= @old_parallel_mode;
|
||||
SET GLOBAL debug_dbug=@old_dbug;
|
||||
--source include/start_slave.inc
|
||||
|
||||
--connection master
|
||||
DROP TABLE t1, t2;
|
||||
|
||||
--source include/rpl_end.inc
|
@@ -369,7 +369,7 @@ register_wait_for_prior_event_group_commit(rpl_group_info *rgi,
|
||||
Do not start parallel execution of this event group until all prior groups
|
||||
have reached the commit phase that are not safe to run in parallel with.
|
||||
*/
|
||||
static bool
|
||||
static void
|
||||
do_gco_wait(rpl_group_info *rgi, group_commit_orderer *gco,
|
||||
bool *did_enter_cond, PSI_stage_info *old_stage)
|
||||
{
|
||||
@@ -421,8 +421,18 @@ do_gco_wait(rpl_group_info *rgi, group_commit_orderer *gco,
|
||||
&entry->LOCK_parallel_entry);
|
||||
} while (wait_count > entry->count_committing_event_groups);
|
||||
}
|
||||
}
|
||||
|
||||
if (entry->force_abort && rgi->gtid_sub_id > entry->stop_sub_id)
|
||||
|
||||
static bool
|
||||
do_stop_handling(rpl_group_info *rgi)
|
||||
{
|
||||
bool should_stop= false;
|
||||
rpl_parallel_entry *entry= rgi->parallel_entry;
|
||||
|
||||
mysql_mutex_assert_owner(&entry->LOCK_parallel_entry);
|
||||
|
||||
if (unlikely(entry->force_abort) && rgi->gtid_sub_id > entry->stop_sub_id)
|
||||
{
|
||||
/*
|
||||
We are stopping (STOP SLAVE), and this event group need not be applied
|
||||
@@ -430,10 +440,26 @@ do_gco_wait(rpl_group_info *rgi, group_commit_orderer *gco,
|
||||
rather than execute, the following events. Once all queued events have
|
||||
been skipped, the STOP SLAVE is complete (for this thread).
|
||||
*/
|
||||
return true;
|
||||
should_stop= true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
|
||||
if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id))
|
||||
{
|
||||
rgi->worker_error= 1;
|
||||
should_stop= true;
|
||||
}
|
||||
|
||||
if (likely(!should_stop))
|
||||
{
|
||||
/*
|
||||
Since we did not decide to stop, bump the largest_started_sub_id while
|
||||
still holding LOCK_parallel_entry.
|
||||
*/
|
||||
if (rgi->gtid_sub_id > entry->largest_started_sub_id)
|
||||
entry->largest_started_sub_id= rgi->gtid_sub_id;
|
||||
}
|
||||
|
||||
return should_stop;
|
||||
}
|
||||
|
||||
|
||||
@@ -480,15 +506,25 @@ do_ftwrl_wait(rpl_group_info *rgi,
|
||||
mysql_cond_wait(&entry->COND_parallel_entry, &entry->LOCK_parallel_entry);
|
||||
} while (sub_id > entry->pause_sub_id);
|
||||
|
||||
DBUG_EXECUTE_IF("delay_ftwrl_wait_gtid_0_x_100", {
|
||||
if (rgi->current_gtid.domain_id == 0 &&
|
||||
rgi->current_gtid.seq_no == 100) {
|
||||
/*
|
||||
Simulate delayed wakeup from the mysql_cond_wait(). To do this, we
|
||||
need to have the LOCK_parallel_entry mutex released during the wait.
|
||||
*/
|
||||
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
|
||||
debug_sync_set_action(thd,
|
||||
STRING_WITH_LEN("now SIGNAL pause_wait_started WAIT_FOR pause_wait_continue"));
|
||||
mysql_mutex_lock(&entry->LOCK_parallel_entry);
|
||||
}
|
||||
});
|
||||
/*
|
||||
We do not call EXIT_COND() here, as this will be done later by our
|
||||
caller (since we set *did_enter_cond to true).
|
||||
*/
|
||||
}
|
||||
|
||||
if (sub_id > entry->largest_started_sub_id)
|
||||
entry->largest_started_sub_id= sub_id;
|
||||
|
||||
DBUG_RETURN(aborted);
|
||||
}
|
||||
|
||||
@@ -646,7 +682,17 @@ rpl_pause_for_ftwrl(THD *thd)
|
||||
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
||||
++e->need_sub_id_signal;
|
||||
if (e->pause_sub_id == (uint64)ULONGLONG_MAX)
|
||||
{
|
||||
e->pause_sub_id= e->largest_started_sub_id;
|
||||
DBUG_EXECUTE_IF("pause_for_ftwrl_wait", {
|
||||
mysql_mutex_unlock(&e->LOCK_parallel_entry);
|
||||
debug_sync_set_action(thd,
|
||||
STRING_WITH_LEN("now "
|
||||
"SIGNAL pause_ftwrl_waiting "
|
||||
"WAIT_FOR pause_ftwrl_cont"));
|
||||
mysql_mutex_lock(&e->LOCK_parallel_entry);
|
||||
});
|
||||
}
|
||||
thd->ENTER_COND(&e->COND_parallel_entry, &e->LOCK_parallel_entry,
|
||||
&stage_waiting_for_ftwrl_threads_to_pause, &old_stage);
|
||||
thd->set_time_for_next_stage();
|
||||
@@ -1284,14 +1330,15 @@ handle_rpl_parallel_thread(void *arg)
|
||||
event_gtid_sub_id= rgi->gtid_sub_id;
|
||||
rgi->thd= thd;
|
||||
|
||||
mysql_mutex_lock(&entry->LOCK_parallel_entry);
|
||||
skip_event_group= do_gco_wait(rgi, gco, &did_enter_cond, &old_stage);
|
||||
DBUG_EXECUTE_IF("gco_wait_delay_gtid_0_x_99", {
|
||||
if (rgi->current_gtid.domain_id == 0 && rgi->current_gtid.seq_no == 99) {
|
||||
debug_sync_set_action(thd,
|
||||
STRING_WITH_LEN("now SIGNAL gco_wait_paused WAIT_FOR gco_wait_cont"));
|
||||
} });
|
||||
|
||||
if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id))
|
||||
{
|
||||
skip_event_group= true;
|
||||
rgi->worker_error= 1;
|
||||
}
|
||||
mysql_mutex_lock(&entry->LOCK_parallel_entry);
|
||||
do_gco_wait(rgi, gco, &did_enter_cond, &old_stage);
|
||||
skip_event_group= do_stop_handling(rgi);
|
||||
if (likely(!skip_event_group))
|
||||
skip_event_group= do_ftwrl_wait(rgi, &did_enter_cond, &old_stage);
|
||||
|
||||
|
Reference in New Issue
Block a user