mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
MDEV-33500 (part 2): rpl.rpl_parallel_sbm can still fail
The failing test case validates Seconds_Behind_Master for a delayed slave, while STOP SLAVE is executed during a delay. The test fixes initially added to the test (commitb04c857596
) added a table lock to ensure a transaction could not finish before validating the Seconds_Behind_Master field after SLAVE START, but did not address a possibility that the transaction could finish before running the STOP SLAVE command, which invalidates the validations for the rest of the test case. Specifically, this would result in 1) a timeout in “Waiting for table metadata lock” on the replica, which expects the transaction to retry after slave restart and hit a lock conflict on the locked tables (added inb04c857596
), and 2) that Seconds_Behind_Master should have increased, but did not. The failure can be reproduced by synchronizing the slave to the master before the MDEV-32265 echo statement (i.e. before the SLAVE STOP). This patch fixes the test by adding a mechanism to use DEBUG_SYNC to synchronize a MASTER_DELAY, rather than continually increase the duration of the delay each time the test fails on buildbot. This is to ensure that on slow machines, a delay does not pass before the test gets a chance to validate results. Additionally, it decreases overall test time because the test can continue immediately after validation, thereby bypassing the remainder of a full delay for each transaction.
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
# ==== Purpose ====
|
||||
#
|
||||
# If using DEBUG_SYNC to coordinate a slave's SQL DELAY via the DEBUG_DBUG
|
||||
# identifier "sql_delay_by_debug_sync", this helper file will help synchronize
|
||||
# a slave with the master for statements which don't need to be delayed. This
|
||||
# can be helpful, for example, for setup/cleanup statements, if they must be
|
||||
# run in the same lifetime as the statements used for the test.
|
||||
#
|
||||
# The actual synchronization will take place based on the input parameter
|
||||
# slave_sync_method, which can be "gtid", "file_coord", or "none"; and will use
|
||||
# the helper files sync_with_master_gtid.inc or sync_with_master.inc (or none
|
||||
# at all), respectively.
|
||||
#
|
||||
#
|
||||
# ==== Requirements ====
|
||||
#
|
||||
# --source include/have_debug.inc
|
||||
# --source include/have_debug_sync.inc
|
||||
# set @@GLOBAL.debug_dbug= "+d,sql_delay_by_debug_sync";
|
||||
#
|
||||
#
|
||||
# ==== Usage ====
|
||||
#
|
||||
# --let $slave_sync_method= gtid|file_coord|none
|
||||
# [--let $num_event_groups= NUMBER]
|
||||
# --source include/sync_with_master_sql_delay_debug_sync.inc
|
||||
#
|
||||
#
|
||||
# Parameters:
|
||||
# $slave_sync_method
|
||||
# Value can be gtid, file_coord, or none; and will synchronize the slave
|
||||
# with the master via this method (i.e. using sync_with_master_gtid.inc
|
||||
# or sync_with_master.inc, respectively), after synchronizing the SQL
|
||||
# delay
|
||||
#
|
||||
# $num_event_groups
|
||||
# Number of event groups to synchronize the SQL delay for. If unset, will
|
||||
# be default to 1.
|
||||
#
|
||||
|
||||
--let $include_filename= sync_with_master_sql_delay_debug_sync.inc
|
||||
--source include/begin_include_file.inc
|
||||
|
||||
if (!$slave_sync_method)
|
||||
{
|
||||
--die Parameter slave_sync_method must be set
|
||||
}
|
||||
|
||||
if (`select "$slave_sync_method" not like "gtid" and "$slave_sync_method" not like "file_coord" and "$slave_sync_method" not like "none"`)
|
||||
{
|
||||
--die Parameter slave_sync_method must have value "gtid", "file_coord" or "none"
|
||||
}
|
||||
|
||||
if (`select "$slave_sync_method" not like "none" and strcmp("$master_pos", "") = 0`)
|
||||
{
|
||||
--die sync_with_master.inc or sync_with_master_gtid.inc was not called to populate variable master_pos
|
||||
}
|
||||
|
||||
if (!$num_event_groups)
|
||||
{
|
||||
--let $num_event_groups= 1
|
||||
}
|
||||
|
||||
while ($num_event_groups)
|
||||
{
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
--dec $num_event_groups
|
||||
}
|
||||
|
||||
if (`select "$slave_sync_method" LIKE "gtid"`)
|
||||
{
|
||||
--source include/sync_with_master_gtid.inc
|
||||
}
|
||||
|
||||
if (`select "$slave_sync_method" LIKE "file_coord"`)
|
||||
{
|
||||
--source include/sync_with_master.inc
|
||||
}
|
||||
|
||||
--let $include_filename= sync_with_master_sql_delay_debug_sync.inc
|
||||
--source include/end_include_file.inc
|
@@ -5,14 +5,20 @@ include/master-slave.inc
|
||||
#
|
||||
connection slave;
|
||||
include/stop_slave.inc
|
||||
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master";
|
||||
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
|
||||
change master to master_delay=3, master_use_gtid=Slave_Pos;
|
||||
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
|
||||
change master to master_delay=1, master_use_gtid=Slave_Pos;
|
||||
include/start_slave.inc
|
||||
connection master;
|
||||
create table t1 (a int);
|
||||
create table t2 (a int);
|
||||
include/sync_slave_sql_with_master.inc
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
include/sync_with_master_sql_delay_debug_sync.inc
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
#
|
||||
# Pt 1) Ensure SBM is updated immediately upon arrival of the next event
|
||||
connection master;
|
||||
@@ -21,12 +27,23 @@ insert into t1 values (0);
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
# Waiting for transaction to arrive on slave and begin SQL Delay..
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
# Validating SBM is updated on event arrival..
|
||||
# ..done
|
||||
# MDEV-32265. At time of STOP SLAVE, if the SQL Thread is currently
|
||||
# delaying a transaction; then when the reciprocal START SLAVE occurs,
|
||||
# if the event is still to be delayed, SBM should resume accordingly
|
||||
include/stop_slave.inc
|
||||
connection server_2;
|
||||
# Ensure the kill from STOP SLAVE will be received before continuing the
|
||||
# SQL thread
|
||||
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
|
||||
STOP SLAVE;
|
||||
connection slave;
|
||||
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
connection server_2;
|
||||
include/wait_for_slave_to_stop.inc
|
||||
set debug_sync="RESET";
|
||||
# Lock t1 on slave to ensure the event can't finish (and thereby update
|
||||
# Seconds_Behind_Master) so slow running servers don't accidentally
|
||||
# catch up to the master before checking SBM.
|
||||
@@ -34,6 +51,10 @@ connection server_2;
|
||||
LOCK TABLES t1 WRITE;
|
||||
include/start_slave.inc
|
||||
connection slave;
|
||||
# SQL delay has no impact for the rest of the test case, so ignore it
|
||||
include/sync_with_master_sql_delay_debug_sync.inc
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
# Waiting for replica to get blocked by the table lock
|
||||
# Sleeping 1s to increment SBM
|
||||
# Ensuring Seconds_Behind_Master increases after sleeping..
|
||||
@@ -54,6 +75,13 @@ insert into t1 values (2);
|
||||
include/save_master_pos.inc
|
||||
connection slave;
|
||||
# Wait for first transaction to complete SQL delay and begin execution..
|
||||
include/sync_with_master_sql_delay_debug_sync.inc
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
# Wait for second transaction to complete SQL delay..
|
||||
include/sync_with_master_sql_delay_debug_sync.inc
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
# Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
|
||||
# ..and that SBM wasn't calculated using prior committed transactions
|
||||
# ..done
|
||||
@@ -63,6 +91,8 @@ include/wait_for_slave_param.inc [Relay_Master_Log_File]
|
||||
include/wait_for_slave_param.inc [Exec_Master_Log_Pos]
|
||||
# Cleanup
|
||||
include/stop_slave.inc
|
||||
set debug_sync= "RESET";
|
||||
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
|
||||
CHANGE MASTER TO master_delay=0;
|
||||
include/start_slave.inc
|
||||
#
|
||||
|
@@ -3,6 +3,7 @@
|
||||
#
|
||||
--source include/have_log_bin.inc
|
||||
--source include/have_debug.inc
|
||||
--source include/have_debug_sync.inc
|
||||
--source include/master-slave.inc
|
||||
|
||||
--echo #
|
||||
@@ -16,14 +17,23 @@
|
||||
# Seconds_Behind_Master is based on the timestamp of the new transaction,
|
||||
# rather than the last committed transaction.
|
||||
#
|
||||
# Note that the test doesn't actually use the value of MASTER_DELAY, but
|
||||
# rather uses debug_sync to coordinate the end of the delay. This is to ensure
|
||||
# that on slow machines, a delay doesn't pass before the test gets a chance to
|
||||
# validate results. Additionally, it lets us continue testing after validation
|
||||
# so we don't have to wait out a full delay unnecessarily. The debug_sync point
|
||||
# is enabled via sql_delay_by_debug_sync, which will delay transactions based
|
||||
# only on GTID events, so only one synchronization is needed per transaction.
|
||||
#
|
||||
|
||||
--connection slave
|
||||
--source include/stop_slave.inc
|
||||
--let $save_dbug= `SELECT @@GLOBAL.debug_dbug`
|
||||
--let $save_parallel_mode= `SELECT @@GLOBAL.slave_parallel_mode`
|
||||
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master";
|
||||
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
|
||||
--let $master_delay= 3
|
||||
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
|
||||
|
||||
--let $master_delay= 1
|
||||
--eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos
|
||||
--source include/start_slave.inc
|
||||
|
||||
@@ -31,7 +41,11 @@ set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
|
||||
--let insert_ctr= 0
|
||||
create table t1 (a int);
|
||||
create table t2 (a int);
|
||||
--source include/sync_slave_sql_with_master.inc
|
||||
--source include/save_master_gtid.inc
|
||||
--connection slave
|
||||
--let $slave_sync_method= gtid
|
||||
--let $num_event_groups= 2
|
||||
--source include/sync_with_master_sql_delay_debug_sync.inc
|
||||
|
||||
--echo #
|
||||
--echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event
|
||||
@@ -48,8 +62,7 @@ sleep 2;
|
||||
--connection slave
|
||||
|
||||
--echo # Waiting for transaction to arrive on slave and begin SQL Delay..
|
||||
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting until MASTER_DELAY seconds after master executed event';
|
||||
--source include/wait_condition.inc
|
||||
set debug_sync= "now WAIT_FOR at_sql_delay";
|
||||
|
||||
--echo # Validating SBM is updated on event arrival..
|
||||
--let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
|
||||
@@ -66,7 +79,20 @@ if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`)
|
||||
--echo # delaying a transaction; then when the reciprocal START SLAVE occurs,
|
||||
--echo # if the event is still to be delayed, SBM should resume accordingly
|
||||
|
||||
--source include/stop_slave.inc
|
||||
--connection server_2
|
||||
--echo # Ensure the kill from STOP SLAVE will be received before continuing the
|
||||
--echo # SQL thread
|
||||
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
|
||||
--send STOP SLAVE
|
||||
|
||||
--connection slave
|
||||
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
|
||||
set debug_sync= "now SIGNAL continue_sql_thread";
|
||||
|
||||
--connection server_2
|
||||
--reap
|
||||
--source include/wait_for_slave_to_stop.inc
|
||||
set debug_sync="RESET";
|
||||
|
||||
--echo # Lock t1 on slave to ensure the event can't finish (and thereby update
|
||||
--echo # Seconds_Behind_Master) so slow running servers don't accidentally
|
||||
@@ -77,6 +103,11 @@ LOCK TABLES t1 WRITE;
|
||||
--source include/start_slave.inc
|
||||
|
||||
--connection slave
|
||||
|
||||
--echo # SQL delay has no impact for the rest of the test case, so ignore it
|
||||
--let $slave_sync_method= none
|
||||
--source include/sync_with_master_sql_delay_debug_sync.inc
|
||||
|
||||
--echo # Waiting for replica to get blocked by the table lock
|
||||
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock';
|
||||
--source include/wait_condition.inc
|
||||
@@ -120,9 +151,14 @@ sleep 3;
|
||||
|
||||
--connection slave
|
||||
--echo # Wait for first transaction to complete SQL delay and begin execution..
|
||||
--let $slave_sync_method= none
|
||||
--source include/sync_with_master_sql_delay_debug_sync.inc
|
||||
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker';
|
||||
--source include/wait_condition.inc
|
||||
|
||||
--echo # Wait for second transaction to complete SQL delay..
|
||||
--source include/sync_with_master_sql_delay_debug_sync.inc
|
||||
|
||||
--echo # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
|
||||
--let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
|
||||
--let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()`
|
||||
@@ -147,6 +183,8 @@ UNLOCK TABLES;
|
||||
|
||||
--echo # Cleanup
|
||||
--source include/stop_slave.inc
|
||||
set debug_sync= "RESET";
|
||||
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
|
||||
--eval CHANGE MASTER TO master_delay=0
|
||||
--source include/start_slave.inc
|
||||
|
||||
|
19
sql/slave.cc
19
sql/slave.cc
@@ -1088,6 +1088,8 @@ terminate_slave_thread(THD *thd,
|
||||
mysql_mutex_unlock(&thd->LOCK_thd_kill);
|
||||
mysql_mutex_unlock(&thd->LOCK_thd_data);
|
||||
|
||||
DEBUG_SYNC(current_thd, "after_thd_awake_kill");
|
||||
|
||||
/*
|
||||
There is a small chance that slave thread might miss the first
|
||||
alarm. To protect againts it, resend the signal until it reacts
|
||||
@@ -3794,13 +3796,26 @@ sql_delay_event(Log_event *ev, THD *thd, rpl_group_info *rgi)
|
||||
sql_delay, (long)ev->when,
|
||||
rli->mi->clock_diff_with_master,
|
||||
(long)now, (ulonglong)sql_delay_end, (long)nap_time));
|
||||
|
||||
if (sql_delay_end > now)
|
||||
/* if using debug_sync for sql_delay, only delay once per event group */
|
||||
if (DBUG_EVALUATE_IF("sql_delay_by_debug_sync", type == GTID_EVENT,
|
||||
sql_delay_end > now))
|
||||
{
|
||||
DBUG_PRINT("info", ("delaying replication event %lu secs",
|
||||
nap_time));
|
||||
rli->start_sql_delay(sql_delay_end);
|
||||
mysql_mutex_unlock(&rli->data_lock);
|
||||
|
||||
#ifdef ENABLED_DEBUG_SYNC
|
||||
DBUG_EXECUTE_IF("sql_delay_by_debug_sync", {
|
||||
DBUG_ASSERT(!debug_sync_set_action(
|
||||
thd, STRING_WITH_LEN(
|
||||
"now SIGNAL at_sql_delay WAIT_FOR continue_sql_thread")));
|
||||
|
||||
// Skip the actual sleep if using DEBUG_SYNC to coordinate SQL_DELAY
|
||||
DBUG_RETURN(0);
|
||||
};);
|
||||
#endif
|
||||
|
||||
DBUG_RETURN(slave_sleep(thd, nap_time, sql_slave_killed, rgi));
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user