MDEV-26: Global transaction ID.

Fix problems related to reconnect. When we need to reconnect (ie. explict stop/start of just the IO thread by user, or automatic reconnect due to loosing network connection with the master), it is a bit complex to correctly resume at the right point without causing duplicate or missing events in the relay log. The previous code had multiple problems in this regard. With this patch, the problem is solved as follows. The IO thread keeps track (in memory) of which GTID was last queued to the relay log. If it needs to reconnect, it resumes at that GTID position. It also counts number of events received within the last, possibly partial, event group, and skips the same number of events after a reconnect, so that events already enqueued before the reconnect are not duplicated. (There is no need to keep any persistent state; whenever we restart slave threads after both of them being stopped (such as after server restart), we erase the relay logs and start over from the last GTID applied by SQL thread. But while the SQL thread is running, this patch is needed to get correct relay log).
2025-08-01 03:47:19 +03:00 · 2013-06-05 14:32:47 +02:00
parent 7ad47ab0e0
commit 5cb486d159
12 changed files with 936 additions and 99 deletions
--- a/mysql-test/include/kill_binlog_dump_threads.inc
+++ b/mysql-test/include/kill_binlog_dump_threads.inc
@ -0,0 +1,62 @@
+# ==== Purpose ====
+#
+# Terminate all binlog dump threads on a master.
+#
+# This is sometimes useful, as normally such dump threads can hang
+# around for some time before they notice that the slave has disconnected.
+#
+# Note that if there are active slave connections, they might try to
+# reconnect as their dump threads are killed, which may not lead to the
+# desired results.
+#
+#
+# ==== Usage ====
+#
+# [--let $kill_timeout= NUMBER]
+# --source include/stop_slavekill_binlog_dump_threads.inc
+#
+# Parameters:
+#   $kill_timeout
+#     Maximum number of seconds to wait for dump threads to disappear.
+
+
+--let $include_filename= kill_binlog_dump_threads.inc
+--source include/begin_include_file.inc
+
+--disable_query_log
+
+let $wait_counter= 300;
+if ($kill_timeout)
+{
+  let $wait_counter= `SELECT $kill_timeout * 10`;
+}
+
+let $success= 0;
+while ($wait_counter)
+{
+    dec $wait_counter;
+    let $_tid= `SELECT id FROM information_schema.processlist WHERE command = 'Binlog Dump' LIMIT 1`;
+    if ($_tid)
+    {
+        eval KILL QUERY $_tid;
+    }
+    if (!$_tid)
+    {
+        let $wait_counter= 0;
+	let $success= 1;
+    }
+    if (!$success)
+    {
+        real_sleep 0.1;
+    }
+}
+if (!$success)
+{
+    SHOW FULL PROCESSLIST;
+    --die Timeout while waiting for binlog dump threads to disappear.
+}
+
+--enable_query_log
+
+--let $include_filename= kill_binlog_dump_threads.inc
+--source include/end_include_file.inc
--- a/mysql-test/suite/rpl/r/rpl_gtid_reconnect.result
+++ b/mysql-test/suite/rpl/r/rpl_gtid_reconnect.result
@ -0,0 +1,169 @@
+include/rpl_init.inc [topology=1->2]
+include/stop_slave.inc
+CHANGE MASTER TO master_use_gtid= current_pos;
+include/start_slave.inc
+CREATE TABLE t1 (a INT);
+FLUSH LOGS;
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (1);
+INSERT INTO t1 VALUES (2);
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (3);
+INSERT INTO t1 VALUES (4);
+INSERT INTO t1 VALUES (5);
+include/stop_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+5
+include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (10);
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+5
+10
+include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+SET gtid_domain_id=10;
+SET gtid_seq_no=50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id=10;
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id=11;
+SET gtid_seq_no=200;
+INSERT INTO t1 VALUES (13);
+START SLAVE UNTIL master_gtid_pos="10-1-50,11-1-200";
+include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+11
+12
+13
+include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (20);
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+11
+12
+13
+20
+include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+include/kill_binlog_dump_threads.inc
+SET gtid_domain_id= 9;
+SET gtid_seq_no= 50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id= 10;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 100;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 200;
+INSERT INTO t1 VALUES (13);
+SET gtid_domain_id= 10;
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+START SLAVE UNTIL master_gtid_pos="9-1-50,10-1-200";
+include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+11
+12
+13
+SET GLOBAL debug_dbug= @old_debug;
+INSERT INTO t1 VALUES (20);
+include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+a
+1
+2
+3
+11
+12
+13
+20
+*** Test when slave IO thread needs to reconnect in the middle of an event group. ***
+include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+include/kill_binlog_dump_threads.inc
+SET GLOBAL debug_dbug="+d,binlog_force_reconnect_after_22_events";
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (1);
+BEGIN;
+INSERT INTO t2 VALUES (10);
+INSERT INTO t2 VALUES (11);
+COMMIT;
+BEGIN;
+INSERT INTO t2 VALUES (20);
+INSERT INTO t2 VALUES (21);
+INSERT INTO t2 VALUES (22);
+INSERT INTO t2 VALUES (23);
+INSERT INTO t2 VALUES (24);
+INSERT INTO t2 VALUES (25);
+INSERT INTO t2 VALUES (26);
+INSERT INTO t2 VALUES (27);
+INSERT INTO t2 VALUES (28);
+INSERT INTO t2 VALUES (29);
+COMMIT;
+include/start_slave.inc
+SELECT * FROM t2 ORDER BY a;
+a
+1
+10
+11
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+SET GLOBAL debug_dbug= @old_debug;
+DROP TABLE t1, t2;
+include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_gtid_reconnect.test
+++ b/mysql-test/suite/rpl/t/rpl_gtid_reconnect.test
@ -0,0 +1,201 @@
+--let $rpl_topology=1->2
+--source include/rpl_init.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+
+
+--connection server_2
+--source include/stop_slave.inc
+CHANGE MASTER TO master_use_gtid= current_pos;
+--source include/start_slave.inc
+
+--connection server_1
+CREATE TABLE t1 (a INT);
+FLUSH LOGS;
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+
+
+# Prepare a string of events and have the slave replicate all of it.
+--connection server_1
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (1);
+INSERT INTO t1 VALUES (2);
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (3);
+INSERT INTO t1 VALUES (4);
+INSERT INTO t1 VALUES (5);
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+SELECT * FROM t1 ORDER BY a;
+
+# Now start the slave again, but force a reconnect. There was a bug that this
+# reconnect would cause duplicate events.
+
+--connection server_1
+# Make sure to get rid of any old binlog dump thread so it does not
+# interfere with our DBUG error injection.
+--source include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (10);
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t1 ORDER BY a;
+
+--source include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+
+# A1 B1 A2 B2 A3 B3, slave reached A1 and B3 and stopped. Slave starts,
+# reconnects at A2. There was a bug that B2 would be duplicated.
+
+SET gtid_domain_id=10;
+SET gtid_seq_no=50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id=10;
+SET gtid_seq_no=100;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id=11;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id=10;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id=11;
+SET gtid_seq_no=200;
+INSERT INTO t1 VALUES (13);
+
+--connection server_2
+START SLAVE UNTIL master_gtid_pos="10-1-50,11-1-200";
+--source include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+
+--connection server_1
+--source include/kill_binlog_dump_threads.inc
+INSERT INTO t1 VALUES (20);
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t1 ORDER BY a;
+
+--source include/stop_slave.inc
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+
+# A1 B1 A2 B2 A3 B3. START SLAVE UNTIL A1,B3, gets reconnect at B2.
+# There was a bug that the UNTIL would be ignored, and A2 would be lost.
+
+--source include/kill_binlog_dump_threads.inc
+SET gtid_domain_id= 9;
+SET gtid_seq_no= 50;
+INSERT INTO t1 VALUES (1);
+SET gtid_domain_id= 10;
+INSERT INTO t1 VALUES (11);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (2);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 100;
+INSERT INTO t1 VALUES (12);
+SET gtid_domain_id= 9;
+INSERT INTO t1 VALUES (3);
+SET gtid_domain_id= 10;
+SET gtid_seq_no= 200;
+INSERT INTO t1 VALUES (13);
+SET gtid_domain_id= 10;
+
+SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
+
+--connection server_2
+START SLAVE UNTIL master_gtid_pos="9-1-50,10-1-200";
+--source include/wait_for_slave_to_stop.inc
+SELECT * FROM t1 ORDER BY a;
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+INSERT INTO t1 VALUES (20);
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+SELECT * FROM t1 ORDER BY a;
+
+
+--echo *** Test when slave IO thread needs to reconnect in the middle of an event group. ***
+--connection server_2
+--source include/stop_slave.inc
+
+TRUNCATE t1;
+RESET MASTER;
+SET GLOBAL gtid_slave_pos= "";
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+TRUNCATE t1;
+RESET MASTER;
+
+--source include/kill_binlog_dump_threads.inc
+SET GLOBAL debug_dbug="+d,binlog_force_reconnect_after_22_events";
+
+# 4 events for FD, fake rotate, gtid list, binlog checkpoint.
+# 2 events for GTID, create table
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
+# 3 events for BEGIN/query/COMMIT
+INSERT INTO t2 VALUES (1);
+# 4 events for BEGIN/query/query/COMMIT
+BEGIN;
+INSERT INTO t2 VALUES (10);
+INSERT INTO t2 VALUES (11);
+COMMIT;
+# So this event group starts after 4+2+4+3=13 events. Or 16 in row-based.
+BEGIN;
+INSERT INTO t2 VALUES (20);
+INSERT INTO t2 VALUES (21);
+INSERT INTO t2 VALUES (22);
+INSERT INTO t2 VALUES (23);
+INSERT INTO t2 VALUES (24);
+INSERT INTO t2 VALUES (25);
+INSERT INTO t2 VALUES (26);
+INSERT INTO t2 VALUES (27);
+INSERT INTO t2 VALUES (28);
+INSERT INTO t2 VALUES (29);
+COMMIT;
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t2 ORDER BY a;
+
+--connection server_1
+SET GLOBAL debug_dbug= @old_debug;
+
+
+# Clean up.
+--connection server_1
+DROP TABLE t1, t2;
+
+--source include/rpl_end.inc