1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

MDEV-30619: Parallel Slave SQL Thread Can Update Seconds_Behind_Master with Active Workers

MDEV-31749 sporadic assert in MDEV-30619 new test

If the workers of a parallel replica are busy (potentially with long
queues), but the SQL thread has no events left to distribute (so it
goes idle), then the next event that comes from the primary will
update mi->last_master_timestamp with its timestamp, even if the
workers have not yet finished.

This patch changes the parallel replica logic that updates
last_master_timestamp after idling: instead of relying solely on
sql_thread_caught_up (added in MDEV-29639), it now uses that flag
together with the rli queued/dequeued event counters.
That is, if the queued count is equal to the dequeued count, all
queued events have been processed, and the replica is considered
idle once the driver thread has also distributed all events.

Low-level details of the commit include:
- to make a more generalized test for Seconds_Behind_Master on
  the parallel replica, rpl_delayed_parallel_slave_sbm.test
  is renamed to rpl_parallel_sbm.test for this purpose.
- the use of pause_sql_thread_on_next_event was removed
  by the MDEV-30619 fixes. Rather than remove
  pause_sql_thread_on_next_event itself, we adapt it
  to the needs of this test case.
- added test case to cover SBM spike of relay log read and LMT
  update that was fixed by MDEV-29639
- rpl_seconds_behind_master_spike.test is made to use
  the negate_clock_diff_with_master debug eval.

Reviewed By:
============
Andrei Elkin <andrei.elkin@mariadb.com>
This commit is contained in:
Brandon Nesterenko
2023-06-28 10:28:31 -06:00
committed by Andrei
parent 734583b0d7
commit 063f4ac25e
7 changed files with 275 additions and 81 deletions

View File

@@ -0,0 +1,89 @@
include/master-slave.inc
[connection master]
#
# MDEV-29639: Seconds_Behind_Master is incorrect for Delayed, Parallel Replicas
#
connection slave;
include/stop_slave.inc
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master";
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
change master to master_delay=3, master_use_gtid=Slave_Pos;
include/start_slave.inc
connection master;
create table t1 (a int);
create table t2 (a int);
include/sync_slave_sql_with_master.inc
#
# Pt 1) Ensure SBM is updated immediately upon arrival of the next event
# Lock t1 on slave so the first received transaction does not complete/commit
connection slave;
LOCK TABLES t1 WRITE;
connection master;
# Sleep 2 to allow a buffer between events for SBM check
insert into t1 values (0);
include/save_master_gtid.inc
connection slave;
# Waiting for transaction to arrive on slave and begin SQL Delay..
# Validating SBM is updated on event arrival..
# ..done
connection slave;
UNLOCK TABLES;
include/sync_with_master_gtid.inc
#
# Pt 2) If the worker threads have not entered an idle state, ensure
# following events do not update SBM
connection slave;
LOCK TABLES t1 WRITE;
connection master;
# Sleep 2 to allow a buffer between events for SBM check
insert into t1 values (1);
# Sleep 3 to create gap between events
insert into t1 values (2);
include/save_master_pos.inc
connection slave;
# Wait for first transaction to complete SQL delay and begin execution..
# Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
# ..and that SBM wasn't calculated using prior committed transactions
# ..done
connection slave;
UNLOCK TABLES;
include/wait_for_slave_param.inc [Relay_Master_Log_File]
include/wait_for_slave_param.inc [Exec_Master_Log_Pos]
# Cleanup
include/stop_slave.inc
CHANGE MASTER TO master_delay=0;
include/start_slave.inc
#
# MDEV-30619: Parallel Slave SQL Thread Can Update Seconds_Behind_Master with Active Workers
#
connection slave;
# Ensure the replica is fully idle before starting transactions
# Lock t1 on slave so the first received transaction does not complete/commit
LOCK TABLES t1 WRITE;
connection master;
insert into t1 values (3);
include/save_master_gtid.inc
connection slave;
# Waiting for first transaction to begin..
connection master;
# Sleep 2 sec to create a gap between events
INSERT INTO t2 VALUES (1);
include/save_master_gtid.inc
connection slave;
# Waiting for second transaction to begin..
connection slave;
UNLOCK TABLES;
include/sync_with_master_gtid.inc
#
# Cleanup
connection master;
DROP TABLE t1, t2;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @@GLOBAL.debug_dbug= "";
set @@GLOBAL.slave_parallel_mode= "$save_parallel_mode";
include/start_slave.inc
include/rpl_end.inc
# End of rpl_parallel_sbm.test