From 68938d2b42474a6ff1fced3b1495ec45de2f5c47 Mon Sep 17 00:00:00 2001 From: Brandon Nesterenko Date: Mon, 9 Sep 2024 08:50:02 -0600 Subject: [PATCH] MDEV-33500 (part 2): rpl.rpl_parallel_sbm can still fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The failing test case validates Seconds_Behind_Master for a delayed slave, while STOP SLAVE is executed during a delay. The test fixes initially added to the test (commit b04c8575967) added a table lock to ensure a transaction could not finish before validating the Seconds_Behind_Master field after SLAVE START, but did not address a possibility that the transaction could finish before running the STOP SLAVE command, which invalidates the validations for the rest of the test case. Specifically, this would result in 1) a timeout in “Waiting for table metadata lock” on the replica, which expects the transaction to retry after slave restart and hit a lock conflict on the locked tables (added in b04c8575967), and 2) that Seconds_Behind_Master should have increased, but did not. The failure can be reproduced by synchronizing the slave to the master before the MDEV-32265 echo statement (i.e. before the SLAVE STOP). This patch fixes the test by adding a mechanism to use DEBUG_SYNC to synchronize a MASTER_DELAY, rather than continually increase the duration of the delay each time the test fails on buildbot. This is to ensure that on slow machines, a delay does not pass before the test gets a chance to validate results. Additionally, it decreases overall test time because the test can continue immediately after validation, thereby bypassing the remainder of a full delay for each transaction. --- .../sync_with_master_sql_delay_debug_sync.inc | 82 +++++++++++++++++++ .../suite/rpl/r/rpl_parallel_sbm.result | 38 ++++++++- mysql-test/suite/rpl/t/rpl_parallel_sbm.test | 50 +++++++++-- sql/slave.cc | 19 ++++- 4 files changed, 177 insertions(+), 12 deletions(-) create mode 100644 mysql-test/suite/rpl/include/sync_with_master_sql_delay_debug_sync.inc diff --git a/mysql-test/suite/rpl/include/sync_with_master_sql_delay_debug_sync.inc b/mysql-test/suite/rpl/include/sync_with_master_sql_delay_debug_sync.inc new file mode 100644 index 00000000000..7f3602d6979 --- /dev/null +++ b/mysql-test/suite/rpl/include/sync_with_master_sql_delay_debug_sync.inc @@ -0,0 +1,82 @@ +# ==== Purpose ==== +# +# If using DEBUG_SYNC to coordinate a slave's SQL DELAY via the DEBUG_DBUG +# identifier "sql_delay_by_debug_sync", this helper file will help synchronize +# a slave with the master for statements which don't need to be delayed. This +# can be helpful, for example, for setup/cleanup statements, if they must be +# run in the same lifetime as the statements used for the test. +# +# The actual synchronization will take place based on the input parameter +# slave_sync_method, which can be "gtid", "file_coord", or "none"; and will use +# the helper files sync_with_master_gtid.inc or sync_with_master.inc (or none +# at all), respectively. +# +# +# ==== Requirements ==== +# +# --source include/have_debug.inc +# --source include/have_debug_sync.inc +# set @@GLOBAL.debug_dbug= "+d,sql_delay_by_debug_sync"; +# +# +# ==== Usage ==== +# +# --let $slave_sync_method= gtid|file_coord|none +# [--let $num_event_groups= NUMBER] +# --source include/sync_with_master_sql_delay_debug_sync.inc +# +# +# Parameters: +# $slave_sync_method +# Value can be gtid, file_coord, or none; and will synchronize the slave +# with the master via this method (i.e. using sync_with_master_gtid.inc +# or sync_with_master.inc, respectively), after synchronizing the SQL +# delay +# +# $num_event_groups +# Number of event groups to synchronize the SQL delay for. If unset, will +# be default to 1. +# + +--let $include_filename= sync_with_master_sql_delay_debug_sync.inc +--source include/begin_include_file.inc + +if (!$slave_sync_method) +{ + --die Parameter slave_sync_method must be set +} + +if (`select "$slave_sync_method" not like "gtid" and "$slave_sync_method" not like "file_coord" and "$slave_sync_method" not like "none"`) +{ + --die Parameter slave_sync_method must have value "gtid", "file_coord" or "none" +} + +if (`select "$slave_sync_method" not like "none" and strcmp("$master_pos", "") = 0`) +{ + --die sync_with_master.inc or sync_with_master_gtid.inc was not called to populate variable master_pos +} + +if (!$num_event_groups) +{ + --let $num_event_groups= 1 +} + +while ($num_event_groups) +{ + set debug_sync= "now WAIT_FOR at_sql_delay"; + set debug_sync= "now SIGNAL continue_sql_thread"; + --dec $num_event_groups +} + +if (`select "$slave_sync_method" LIKE "gtid"`) +{ + --source include/sync_with_master_gtid.inc +} + +if (`select "$slave_sync_method" LIKE "file_coord"`) +{ + --source include/sync_with_master.inc +} + +--let $include_filename= sync_with_master_sql_delay_debug_sync.inc +--source include/end_include_file.inc diff --git a/mysql-test/suite/rpl/r/rpl_parallel_sbm.result b/mysql-test/suite/rpl/r/rpl_parallel_sbm.result index 75012c93f3b..59a6955aa1b 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel_sbm.result +++ b/mysql-test/suite/rpl/r/rpl_parallel_sbm.result @@ -5,14 +5,20 @@ include/master-slave.inc # connection slave; include/stop_slave.inc -set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master"; set @@GLOBAL.slave_parallel_mode= CONSERVATIVE; -change master to master_delay=3, master_use_gtid=Slave_Pos; +set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync"; +change master to master_delay=1, master_use_gtid=Slave_Pos; include/start_slave.inc connection master; create table t1 (a int); create table t2 (a int); -include/sync_slave_sql_with_master.inc +include/save_master_gtid.inc +connection slave; +include/sync_with_master_sql_delay_debug_sync.inc +set debug_sync= "now WAIT_FOR at_sql_delay"; +set debug_sync= "now SIGNAL continue_sql_thread"; +set debug_sync= "now WAIT_FOR at_sql_delay"; +set debug_sync= "now SIGNAL continue_sql_thread"; # # Pt 1) Ensure SBM is updated immediately upon arrival of the next event connection master; @@ -21,12 +27,23 @@ insert into t1 values (0); include/save_master_gtid.inc connection slave; # Waiting for transaction to arrive on slave and begin SQL Delay.. +set debug_sync= "now WAIT_FOR at_sql_delay"; # Validating SBM is updated on event arrival.. # ..done # MDEV-32265. At time of STOP SLAVE, if the SQL Thread is currently # delaying a transaction; then when the reciprocal START SLAVE occurs, # if the event is still to be delayed, SBM should resume accordingly -include/stop_slave.inc +connection server_2; +# Ensure the kill from STOP SLAVE will be received before continuing the +# SQL thread +set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill"; +STOP SLAVE; +connection slave; +set debug_sync= "now WAIT_FOR slave_notified_of_kill"; +set debug_sync= "now SIGNAL continue_sql_thread"; +connection server_2; +include/wait_for_slave_to_stop.inc +set debug_sync="RESET"; # Lock t1 on slave to ensure the event can't finish (and thereby update # Seconds_Behind_Master) so slow running servers don't accidentally # catch up to the master before checking SBM. @@ -34,6 +51,10 @@ connection server_2; LOCK TABLES t1 WRITE; include/start_slave.inc connection slave; +# SQL delay has no impact for the rest of the test case, so ignore it +include/sync_with_master_sql_delay_debug_sync.inc +set debug_sync= "now WAIT_FOR at_sql_delay"; +set debug_sync= "now SIGNAL continue_sql_thread"; # Waiting for replica to get blocked by the table lock # Sleeping 1s to increment SBM # Ensuring Seconds_Behind_Master increases after sleeping.. @@ -54,6 +75,13 @@ insert into t1 values (2); include/save_master_pos.inc connection slave; # Wait for first transaction to complete SQL delay and begin execution.. +include/sync_with_master_sql_delay_debug_sync.inc +set debug_sync= "now WAIT_FOR at_sql_delay"; +set debug_sync= "now SIGNAL continue_sql_thread"; +# Wait for second transaction to complete SQL delay.. +include/sync_with_master_sql_delay_debug_sync.inc +set debug_sync= "now WAIT_FOR at_sql_delay"; +set debug_sync= "now SIGNAL continue_sql_thread"; # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle.. # ..and that SBM wasn't calculated using prior committed transactions # ..done @@ -63,6 +91,8 @@ include/wait_for_slave_param.inc [Relay_Master_Log_File] include/wait_for_slave_param.inc [Exec_Master_Log_Pos] # Cleanup include/stop_slave.inc +set debug_sync= "RESET"; +set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync"; CHANGE MASTER TO master_delay=0; include/start_slave.inc # diff --git a/mysql-test/suite/rpl/t/rpl_parallel_sbm.test b/mysql-test/suite/rpl/t/rpl_parallel_sbm.test index 90753caf143..6941a88f4d0 100644 --- a/mysql-test/suite/rpl/t/rpl_parallel_sbm.test +++ b/mysql-test/suite/rpl/t/rpl_parallel_sbm.test @@ -3,6 +3,7 @@ # --source include/have_log_bin.inc --source include/have_debug.inc +--source include/have_debug_sync.inc --source include/master-slave.inc --echo # @@ -16,14 +17,23 @@ # Seconds_Behind_Master is based on the timestamp of the new transaction, # rather than the last committed transaction. # +# Note that the test doesn't actually use the value of MASTER_DELAY, but +# rather uses debug_sync to coordinate the end of the delay. This is to ensure +# that on slow machines, a delay doesn't pass before the test gets a chance to +# validate results. Additionally, it lets us continue testing after validation +# so we don't have to wait out a full delay unnecessarily. The debug_sync point +# is enabled via sql_delay_by_debug_sync, which will delay transactions based +# only on GTID events, so only one synchronization is needed per transaction. +# --connection slave --source include/stop_slave.inc --let $save_dbug= `SELECT @@GLOBAL.debug_dbug` --let $save_parallel_mode= `SELECT @@GLOBAL.slave_parallel_mode` -set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master"; set @@GLOBAL.slave_parallel_mode= CONSERVATIVE; ---let $master_delay= 3 +set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync"; + +--let $master_delay= 1 --eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos --source include/start_slave.inc @@ -31,7 +41,11 @@ set @@GLOBAL.slave_parallel_mode= CONSERVATIVE; --let insert_ctr= 0 create table t1 (a int); create table t2 (a int); ---source include/sync_slave_sql_with_master.inc +--source include/save_master_gtid.inc +--connection slave +--let $slave_sync_method= gtid +--let $num_event_groups= 2 +--source include/sync_with_master_sql_delay_debug_sync.inc --echo # --echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event @@ -48,8 +62,7 @@ sleep 2; --connection slave --echo # Waiting for transaction to arrive on slave and begin SQL Delay.. ---let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting until MASTER_DELAY seconds after master executed event'; ---source include/wait_condition.inc +set debug_sync= "now WAIT_FOR at_sql_delay"; --echo # Validating SBM is updated on event arrival.. --let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1) @@ -66,7 +79,20 @@ if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`) --echo # delaying a transaction; then when the reciprocal START SLAVE occurs, --echo # if the event is still to be delayed, SBM should resume accordingly ---source include/stop_slave.inc +--connection server_2 +--echo # Ensure the kill from STOP SLAVE will be received before continuing the +--echo # SQL thread +set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill"; +--send STOP SLAVE + +--connection slave +set debug_sync= "now WAIT_FOR slave_notified_of_kill"; +set debug_sync= "now SIGNAL continue_sql_thread"; + +--connection server_2 +--reap +--source include/wait_for_slave_to_stop.inc +set debug_sync="RESET"; --echo # Lock t1 on slave to ensure the event can't finish (and thereby update --echo # Seconds_Behind_Master) so slow running servers don't accidentally @@ -77,6 +103,11 @@ LOCK TABLES t1 WRITE; --source include/start_slave.inc --connection slave + +--echo # SQL delay has no impact for the rest of the test case, so ignore it +--let $slave_sync_method= none +--source include/sync_with_master_sql_delay_debug_sync.inc + --echo # Waiting for replica to get blocked by the table lock --let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock'; --source include/wait_condition.inc @@ -120,9 +151,14 @@ sleep 3; --connection slave --echo # Wait for first transaction to complete SQL delay and begin execution.. +--let $slave_sync_method= none +--source include/sync_with_master_sql_delay_debug_sync.inc --let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker'; --source include/wait_condition.inc +--echo # Wait for second transaction to complete SQL delay.. +--source include/sync_with_master_sql_delay_debug_sync.inc + --echo # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle.. --let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1) --let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()` @@ -147,6 +183,8 @@ UNLOCK TABLES; --echo # Cleanup --source include/stop_slave.inc +set debug_sync= "RESET"; +set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync"; --eval CHANGE MASTER TO master_delay=0 --source include/start_slave.inc diff --git a/sql/slave.cc b/sql/slave.cc index f258f9f3595..1a6a5a6efdb 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -1088,6 +1088,8 @@ terminate_slave_thread(THD *thd, mysql_mutex_unlock(&thd->LOCK_thd_kill); mysql_mutex_unlock(&thd->LOCK_thd_data); + DEBUG_SYNC(current_thd, "after_thd_awake_kill"); + /* There is a small chance that slave thread might miss the first alarm. To protect againts it, resend the signal until it reacts @@ -3794,13 +3796,26 @@ sql_delay_event(Log_event *ev, THD *thd, rpl_group_info *rgi) sql_delay, (long)ev->when, rli->mi->clock_diff_with_master, (long)now, (ulonglong)sql_delay_end, (long)nap_time)); - - if (sql_delay_end > now) + /* if using debug_sync for sql_delay, only delay once per event group */ + if (DBUG_EVALUATE_IF("sql_delay_by_debug_sync", type == GTID_EVENT, + sql_delay_end > now)) { DBUG_PRINT("info", ("delaying replication event %lu secs", nap_time)); rli->start_sql_delay(sql_delay_end); mysql_mutex_unlock(&rli->data_lock); + +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("sql_delay_by_debug_sync", { + DBUG_ASSERT(!debug_sync_set_action( + thd, STRING_WITH_LEN( + "now SIGNAL at_sql_delay WAIT_FOR continue_sql_thread"))); + + // Skip the actual sleep if using DEBUG_SYNC to coordinate SQL_DELAY + DBUG_RETURN(0); + };); +#endif + DBUG_RETURN(slave_sleep(thd, nap_time, sql_slave_killed, rgi)); } }