Move deletion of old GTID rows to slave background thread

This patch changes how old rows in mysql.gtid_slave_pos* tables are deleted. Instead of doing it as part of every replicated transaction in record_gtid(), it is done periodically (every @@gtid_cleanup_batch_size transactions) in the slave background thread. This removes the deletion step from the replication process in SQL or worker threads, which could speed up replication with many small transactions. It also decreases contention on the global mutex LOCK_slave_state. And it simplifies the logic, e.g. when a replicated transaction fails after having deleted old rows. With this patch, the deletion of old GTID rows happens asynchronously and slightly non-deterministically. Thus the number of old rows in mysql.gtid_slave_pos can temporarily exceed @@gtid_cleanup_batch_size. But all old rows will be deleted eventually after sufficiently many new GTIDs have been replicated.
2025-08-08 11:22:35 +03:00 · 2018-10-14 20:41:49 +02:00
parent 24a45d3bd7
commit 34f11b06e6
21 changed files with 684 additions and 316 deletions
--- a/mysql-test/suite/rpl/r/rpl_gtid_mdev4484.result
+++ b/mysql-test/suite/rpl/r/rpl_gtid_mdev4484.result
@@ -16,36 +16,32 @@ INSERT INTO t1 VALUES (1);
 connection slave;
 connection slave;
 include/stop_slave.inc
+SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
+SET GLOBAL gtid_cleanup_batch_size= 2;
 SET @old_dbug= @@GLOBAL.debug_dbug;
 SET GLOBAL debug_dbug="+d,gtid_slave_pos_simulate_failed_delete";
 SET sql_log_bin= 0;
-CALL mtr.add_suppression("Can't find file");
+CALL mtr.add_suppression("<DEBUG> Error deleting old GTID row");
 SET sql_log_bin= 1;
 include/start_slave.inc
 connection master;
-INSERT INTO t1 VALUES (2);
 connection slave;
-include/wait_for_slave_sql_error.inc [errno=1942]
-STOP SLAVE IO_THREAD;
-SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
-ORDER BY domain_id, sub_id DESC LIMIT 1;
-domain_id	server_id	seq_no
-0	1	3
+SELECT COUNT(*), MAX(seq_no) INTO @pre_count, @pre_max_seq_no
+FROM mysql.gtid_slave_pos;
+SELECT IF(@pre_count >= 20, "OK", CONCAT("Error: too few rows seen while errors injected: ", @pre_count));
+IF(@pre_count >= 20, "OK", CONCAT("Error: too few rows seen while errors injected: ", @pre_count))
+OK
 SET GLOBAL debug_dbug= @old_dbug;
-include/start_slave.inc
 connection master;
-INSERT INTO t1 VALUES (3);
 connection slave;
 connection slave;
-SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
-ORDER BY domain_id, sub_id DESC LIMIT 1;
-domain_id	server_id	seq_no
-0	1	4
-SELECT * FROM t1 ORDER BY i;
-i
-1
-2
-3
+SELECT IF(COUNT(*) >= 1, "OK", CONCAT("Error: too few rows seen after errors no longer injected: ", COUNT(*)))
+FROM mysql.gtid_slave_pos
+WHERE seq_no <= @pre_max_seq_no;
+IF(COUNT(*) >= 1, "OK", CONCAT("Error: too few rows seen after errors no longer injected: ", COUNT(*)))
+OK
 connection master;
 DROP TABLE t1;
+connection slave;
+SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;
 include/rpl_end.inc
--- a/mysql-test/suite/rpl/r/rpl_gtid_stop_start.result
+++ b/mysql-test/suite/rpl/r/rpl_gtid_stop_start.result
@@ -171,7 +171,7 @@ include/start_slave.inc
 *** MDEV-4692: mysql.gtid_slave_pos accumulates values for a domain ***
 SELECT domain_id, COUNT(*) FROM mysql.gtid_slave_pos GROUP BY domain_id;
 domain_id	COUNT(*)
-0	2
+0	3
 1	2
 connection server_1;
 INSERT INTO t1 VALUES (11);
@@ -179,7 +179,7 @@ connection server_2;
 FLUSH NO_WRITE_TO_BINLOG TABLES;
 SELECT domain_id, COUNT(*) FROM mysql.gtid_slave_pos GROUP BY domain_id;
 domain_id	COUNT(*)
-0	2
+0	4
 1	2
 include/start_slave.inc
 connection server_1;
@@ -189,8 +189,8 @@ connection server_2;
 FLUSH NO_WRITE_TO_BINLOG TABLES;
 SELECT domain_id, COUNT(*) FROM mysql.gtid_slave_pos GROUP BY domain_id;
 domain_id	COUNT(*)
-0	2
-1	2
+0	3
+1	1
 *** MDEV-4650: show variables; ERROR 1946 (HY000): Failed to load replication slave GTID position ***
 connection server_2;
 SET sql_log_bin=0;
--- a/mysql-test/suite/rpl/r/rpl_parallel_optimistic.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel_optimistic.result
@@ -12,6 +12,8 @@ SET GLOBAL slave_parallel_threads=10;
 CHANGE MASTER TO master_use_gtid=slave_pos;
 SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
 SET GLOBAL slave_parallel_mode='optimistic';
+SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
+SET GLOBAL gtid_cleanup_batch_size= 1000000;
 connection server_1;
 INSERT INTO t1 VALUES(1,1);
 BEGIN;
@@ -131,6 +133,11 @@ c
 204
 205
 206
+SELECT IF(COUNT(*) >= 30, "OK", CONCAT("Error: too few old rows found: ", COUNT(*)))
+FROM mysql.gtid_slave_pos;
+IF(COUNT(*) >= 30, "OK", CONCAT("Error: too few old rows found: ", COUNT(*)))
+OK
+SET GLOBAL gtid_cleanup_batch_size=1;
 *** Test @@skip_parallel_replication. ***
 connection server_2;
 include/stop_slave.inc
@@ -651,9 +658,10 @@ DROP TABLE t1, t2, t3;
 include/save_master_gtid.inc
 connection server_2;
 include/sync_with_master_gtid.inc
-Check that no more than the expected last four GTIDs are in mysql.gtid_slave_pos
-select count(4) <= 4 from mysql.gtid_slave_pos order by domain_id, sub_id;
-count(4) <= 4
+SELECT COUNT(*) <= 5*@@GLOBAL.gtid_cleanup_batch_size
+FROM mysql.gtid_slave_pos;
+COUNT(*) <= 5*@@GLOBAL.gtid_cleanup_batch_size
 1
+SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;
 connection server_1;
 include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_gtid_mdev4484.test
+++ b/mysql-test/suite/rpl/t/rpl_gtid_mdev4484.test
@@ -28,37 +28,79 @@ INSERT INTO t1 VALUES (1);
 # Inject an artificial error deleting entries, and check that the error handling code works.
 --connection slave
 --source include/stop_slave.inc
+SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
+SET GLOBAL gtid_cleanup_batch_size= 2;
 SET @old_dbug= @@GLOBAL.debug_dbug;
 SET GLOBAL debug_dbug="+d,gtid_slave_pos_simulate_failed_delete";
 SET sql_log_bin= 0;
-CALL mtr.add_suppression("Can't find file");
+CALL mtr.add_suppression("<DEBUG> Error deleting old GTID row");
 SET sql_log_bin= 1;
 --source include/start_slave.inc

 --connection master
-INSERT INTO t1 VALUES (2);
+--disable_query_log
+let $i = 20;
+while ($i) {
+  eval INSERT INTO t1 VALUES ($i+10);
+  dec $i;
+}
+--enable_query_log
+--save_master_pos

 --connection slave
--let $slave_sql_errno= 1942
--source include/wait_for_slave_sql_error.inc
-STOP SLAVE IO_THREAD;
-SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
- ORDER BY domain_id, sub_id DESC LIMIT 1;
+--sync_with_master
+
+# Now wait for the slave background thread to try to delete old rows and
+# hit the error injection.
+--let _TEST_MYSQLD_ERROR_LOG=$MYSQLTEST_VARDIR/log/mysqld.2.err
+--perl
+  open F, '<', $ENV{'_TEST_MYSQLD_ERROR_LOG'} or die;
+  outer: while (1) {
+    inner: while (<F>) {
+      last outer if /<DEBUG> Error deleting old GTID row/;
+    }
+    # Easy way to do sub-second sleep without extra modules.
+    select(undef, undef, undef, 0.1);
+  }
+EOF
+
+# Since we injected an error in the cleanup code, the rows should remain in
+# mysql.gtid_slave_pos. Check that we have at least 20 (more robust against
+# non-deterministic cleanup and future changes than checking for exact number).
+SELECT COUNT(*), MAX(seq_no) INTO @pre_count, @pre_max_seq_no
+  FROM mysql.gtid_slave_pos;
+SELECT IF(@pre_count >= 20, "OK", CONCAT("Error: too few rows seen while errors injected: ", @pre_count));
 SET GLOBAL debug_dbug= @old_dbug;
--source include/start_slave.inc

 --connection master
-INSERT INTO t1 VALUES (3);
+--disable_query_log
+let $i = 20;
+while ($i) {
+  eval INSERT INTO t1 VALUES ($i+40);
+  dec $i;
+}
+--enable_query_log
 --sync_slave_with_master

 --connection slave
-SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
- ORDER BY domain_id, sub_id DESC LIMIT 1;
-SELECT * FROM t1 ORDER BY i;
-
+# Now check that 1) rows are being deleted again after removing error
+# injection, and 2) old rows are left that failed their delete while errors
+# were injected (again compensating for non-deterministic deletion).
+# Deletion is async and slightly non-deterministic, so we wait for at
+# least 10 of the 20 new rows to be deleted.
+let $wait_condition=
+  SELECT COUNT(*) <= 20-10
+    FROM mysql.gtid_slave_pos
+   WHERE seq_no > @pre_max_seq_no;
+--source include/wait_condition.inc
+SELECT IF(COUNT(*) >= 1, "OK", CONCAT("Error: too few rows seen after errors no longer injected: ", COUNT(*)))
+  FROM mysql.gtid_slave_pos
+ WHERE seq_no <= @pre_max_seq_no;

 # Clean up
 --connection master
 DROP TABLE t1;
+--connection slave
+SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;

 --source include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_parallel_optimistic.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel_optimistic.test
@@ -21,6 +21,10 @@ SET GLOBAL slave_parallel_threads=10;
 CHANGE MASTER TO master_use_gtid=slave_pos;
 SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
 SET GLOBAL slave_parallel_mode='optimistic';
+# Run the first part of the test with high batch size and see that
+# old rows remain in the table.
+SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
+SET GLOBAL gtid_cleanup_batch_size= 1000000;


 --connection server_1
@@ -108,7 +112,12 @@ SELECT * FROM t3 ORDER BY c;
 SELECT * FROM t1 ORDER BY a;
 SELECT * FROM t2 ORDER BY a;
 SELECT * FROM t3 ORDER BY c;
-#SHOW STATUS LIKE 'Slave_retried_transactions';
+# Check that we have a bunch of old rows left-over - they were not deleted
+# due to high @@gtid_cleanup_batch_size. Then set a low
+# @@gtid_cleanup_batch_size so we can test that rows start being deleted.
+SELECT IF(COUNT(*) >= 30, "OK", CONCAT("Error: too few old rows found: ", COUNT(*)))
+  FROM mysql.gtid_slave_pos;
+SET GLOBAL gtid_cleanup_batch_size=1;


 --echo *** Test @@skip_parallel_replication. ***
@@ -557,25 +566,18 @@ DROP TABLE t1, t2, t3;

 --connection server_2
 --source include/sync_with_master_gtid.inc
-# Check for left-over rows in table mysql.gtid_slave_pos (MDEV-12147).
-#
-# There was a bug when a transaction got a conflict and was rolled back. It
-# might have also handled deletion of some old rows, and these deletions would
-# then also be rolled back. And since the deletes were never re-tried, old no
-# longer needed rows would accumulate in the table without limit.
-# 
-# The earlier part of this test file have plenty of transactions being rolled
-# back. But the last DROP TABLE statement runs on its own and should never
-# conflict, thus at this point the mysql.gtid_slave_pos table should be clean.
-#
-# To support @@gtid_pos_auto_engines, when a row is inserted in the table, it
-# is associated with the engine of the table at insertion time, and it will
-# only be deleted during record_gtid from a table of the same engine. Since we
-# alter the table from MyISAM to InnoDB at the start of this test, we should
-# end up with 4 rows: two left-over from when the table was MyISAM, and two
-# left-over from the InnoDB part.
--echo Check that no more than the expected last four GTIDs are in mysql.gtid_slave_pos
-select count(4) <= 4 from mysql.gtid_slave_pos order by domain_id, sub_id;
+# Check that old rows are deleted from mysql.gtid_slave_pos.
+# Deletion is asynchronous, so use wait_condition.inc.
+# Also, there is a small amount of non-determinism in the deletion of old
+# rows, so it is not guaranteed that there can never be more than
+# @@gtid_cleanup_batch_size rows in the table; so allow a bit of slack
+# here.
+let $wait_condition=
+  SELECT COUNT(*) <= 5*@@GLOBAL.gtid_cleanup_batch_size
+    FROM mysql.gtid_slave_pos;
+--source include/wait_condition.inc
+eval $wait_condition;
+SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;

 --connection server_1
 --source include/rpl_end.inc