1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-30 16:24:05 +03:00

MDEV-26: Global transaction ID.

Fix problems related to reconnect. When we need to reconnect (i.e. explicit
stop/start of just the IO thread by user, or automatic reconnect due to
losing network connection with the master), it is a bit complex to correctly
resume at the right point without causing duplicate or missing events in the
relay log. The previous code had multiple problems in this regard.

With this patch, the problem is solved as follows. The IO thread keeps track
(in memory) of which GTID was last queued to the relay log. If it needs to
reconnect, it resumes at that GTID position. It also counts the number of events
received within the last, possibly partial, event group, and skips the same
number of events after a reconnect, so that events already enqueued before the
reconnect are not duplicated.

(There is no need to keep any persistent state; whenever we restart the slave
threads after both of them have been stopped (such as after a server restart), we
erase the relay logs and start over from the last GTID applied by the SQL thread.
But while the SQL thread is running, this patch is needed to get a correct relay
log.)
This commit is contained in:
unknown
2013-06-05 14:32:47 +02:00
parent 7ad47ab0e0
commit 5cb486d159
12 changed files with 936 additions and 99 deletions

View File

@ -0,0 +1,62 @@
# ==== Purpose ====
#
# Terminate all binlog dump threads on a master.
#
# This is sometimes useful, as normally such dump threads can hang
# around for some time before they notice that the slave has disconnected.
#
# Note that if there are active slave connections, they might try to
# reconnect as their dump threads are killed, which may not lead to the
# desired results.
#
#
# ==== Usage ====
#
# [--let $kill_timeout= NUMBER]
# --source include/kill_binlog_dump_threads.inc
#
# Parameters:
# $kill_timeout
# Maximum number of seconds to wait for dump threads to disappear.
# Identify this file to the begin/end include-file helpers sourced below.
--let $include_filename= kill_binlog_dump_threads.inc
--source include/begin_include_file.inc
# Keep the polling and KILL statements out of the caller's result log.
--disable_query_log
# Poll budget: 300 iterations with a 0.1 second sleep between them (about
# 30 seconds), unless the caller supplied $kill_timeout (in seconds).
let $wait_counter= 300;
if ($kill_timeout)
{
let $wait_counter= `SELECT $kill_timeout * 10`;
}
# $success becomes 1 once no 'Binlog Dump' thread is left in the processlist.
let $success= 0;
while ($wait_counter)
{
dec $wait_counter;
# Look up one remaining dump thread per iteration; empty when none is left.
let $_tid= `SELECT id FROM information_schema.processlist WHERE command = 'Binlog Dump' LIMIT 1`;
if ($_tid)
{
eval KILL QUERY $_tid;
}
if (!$_tid)
{
# Nothing left to kill: zero the counter to leave the loop and flag success.
let $wait_counter= 0;
let $success= 1;
}
if (!$success)
{
# Pause before polling again so the killed thread has time to go away.
real_sleep 0.1;
}
}
if (!$success)
{
# Budget exhausted while a dump thread was still listed: show the processlist
# for diagnosis and abort the test.
SHOW FULL PROCESSLIST;
--die Timeout while waiting for binlog dump threads to disappear.
}
--enable_query_log
--let $include_filename= kill_binlog_dump_threads.inc
--source include/end_include_file.inc

View File

@ -0,0 +1,169 @@
include/rpl_init.inc [topology=1->2]
include/stop_slave.inc
CHANGE MASTER TO master_use_gtid= current_pos;
include/start_slave.inc
CREATE TABLE t1 (a INT);
FLUSH LOGS;
SET gtid_domain_id=10;
INSERT INTO t1 VALUES (1);
INSERT INTO t1 VALUES (2);
SET gtid_seq_no=100;
INSERT INTO t1 VALUES (3);
INSERT INTO t1 VALUES (4);
INSERT INTO t1 VALUES (5);
include/stop_slave.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
3
4
5
include/kill_binlog_dump_threads.inc
INSERT INTO t1 VALUES (10);
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
include/start_slave.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
3
4
5
10
include/stop_slave.inc
TRUNCATE t1;
RESET MASTER;
SET GLOBAL gtid_slave_pos= "";
SET GLOBAL debug_dbug= @old_debug;
TRUNCATE t1;
RESET MASTER;
SET gtid_domain_id=10;
SET gtid_seq_no=50;
INSERT INTO t1 VALUES (1);
SET gtid_domain_id=11;
INSERT INTO t1 VALUES (11);
SET gtid_domain_id=10;
SET gtid_seq_no=100;
INSERT INTO t1 VALUES (2);
SET gtid_domain_id=11;
INSERT INTO t1 VALUES (12);
SET gtid_domain_id=10;
INSERT INTO t1 VALUES (3);
SET gtid_domain_id=11;
SET gtid_seq_no=200;
INSERT INTO t1 VALUES (13);
START SLAVE UNTIL master_gtid_pos="10-1-50,11-1-200";
include/wait_for_slave_to_stop.inc
SELECT * FROM t1 ORDER BY a;
a
1
11
12
13
include/kill_binlog_dump_threads.inc
INSERT INTO t1 VALUES (20);
SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
include/start_slave.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
3
11
12
13
20
include/stop_slave.inc
TRUNCATE t1;
RESET MASTER;
SET GLOBAL gtid_slave_pos= "";
SET GLOBAL debug_dbug= @old_debug;
TRUNCATE t1;
RESET MASTER;
include/kill_binlog_dump_threads.inc
SET gtid_domain_id= 9;
SET gtid_seq_no= 50;
INSERT INTO t1 VALUES (1);
SET gtid_domain_id= 10;
INSERT INTO t1 VALUES (11);
SET gtid_domain_id= 9;
INSERT INTO t1 VALUES (2);
SET gtid_domain_id= 10;
SET gtid_seq_no= 100;
INSERT INTO t1 VALUES (12);
SET gtid_domain_id= 9;
INSERT INTO t1 VALUES (3);
SET gtid_domain_id= 10;
SET gtid_seq_no= 200;
INSERT INTO t1 VALUES (13);
SET gtid_domain_id= 10;
SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
START SLAVE UNTIL master_gtid_pos="9-1-50,10-1-200";
include/wait_for_slave_to_stop.inc
SELECT * FROM t1 ORDER BY a;
a
1
11
12
13
SET GLOBAL debug_dbug= @old_debug;
INSERT INTO t1 VALUES (20);
include/start_slave.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
3
11
12
13
20
*** Test when slave IO thread needs to reconnect in the middle of an event group. ***
include/stop_slave.inc
TRUNCATE t1;
RESET MASTER;
SET GLOBAL gtid_slave_pos= "";
SET GLOBAL debug_dbug= @old_debug;
TRUNCATE t1;
RESET MASTER;
include/kill_binlog_dump_threads.inc
SET GLOBAL debug_dbug="+d,binlog_force_reconnect_after_22_events";
CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t2 VALUES (1);
BEGIN;
INSERT INTO t2 VALUES (10);
INSERT INTO t2 VALUES (11);
COMMIT;
BEGIN;
INSERT INTO t2 VALUES (20);
INSERT INTO t2 VALUES (21);
INSERT INTO t2 VALUES (22);
INSERT INTO t2 VALUES (23);
INSERT INTO t2 VALUES (24);
INSERT INTO t2 VALUES (25);
INSERT INTO t2 VALUES (26);
INSERT INTO t2 VALUES (27);
INSERT INTO t2 VALUES (28);
INSERT INTO t2 VALUES (29);
COMMIT;
include/start_slave.inc
SELECT * FROM t2 ORDER BY a;
a
1
10
11
20
21
22
23
24
25
26
27
28
29
SET GLOBAL debug_dbug= @old_debug;
DROP TABLE t1, t2;
include/rpl_end.inc

View File

@ -0,0 +1,201 @@
# Two-server topology: server_1 is the master and server_2 replicates from it.
--let $rpl_topology=1->2
--source include/rpl_init.inc
# The last scenario creates an InnoDB table and every scenario sets
# debug_dbug injection keywords, hence these two requirements.
--source include/have_innodb.inc
--source include/have_debug.inc
# Restart the slave with GTID-based positioning (current_pos).
--connection server_2
--source include/stop_slave.inc
CHANGE MASTER TO master_use_gtid= current_pos;
--source include/start_slave.inc
# Create the test table and rotate the master binlog, then wait for the slave
# to catch up before the first scenario starts.
--connection server_1
CREATE TABLE t1 (a INT);
FLUSH LOGS;
--save_master_pos
--connection server_2
--sync_with_master
# Scenario 1: the slave has already applied GTID 10-1-100 and later events,
# is restarted, and is then forced to reconnect at 10-1-100.
# Prepare a string of events and have the slave replicate all of it.
--connection server_1
SET gtid_domain_id=10;
INSERT INTO t1 VALUES (1);
INSERT INTO t1 VALUES (2);
# Pin the next event to GTID 10-1-100, the position named in the DBUG
# injection keyword used below.
SET gtid_seq_no=100;
INSERT INTO t1 VALUES (3);
INSERT INTO t1 VALUES (4);
INSERT INTO t1 VALUES (5);
--save_master_pos
--connection server_2
--sync_with_master
--source include/stop_slave.inc
SELECT * FROM t1 ORDER BY a;
# Now start the slave again, but force a reconnect. There was a bug that this
# reconnect would cause duplicate events.
--connection server_1
# Make sure to get rid of any old binlog dump thread so it does not
# interfere with our DBUG error injection.
--source include/kill_binlog_dump_threads.inc
INSERT INTO t1 VALUES (10);
# NOTE(review): the original DBUG setting is saved in @old_dbug here, but
# every restore in this test reads @old_debug, which is never assigned and is
# therefore NULL. This looks like a typo. Both statements are echoed into the
# recorded .result file, so fixing the name requires re-recording it.
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
--save_master_pos
--connection server_2
--source include/start_slave.inc
--sync_with_master
# Expect rows 1-5 exactly once each, plus the new row 10.
SELECT * FROM t1 ORDER BY a;
# Reset the slave side (data, binlog, GTID position) for the next scenario.
--source include/stop_slave.inc
TRUNCATE t1;
RESET MASTER;
SET GLOBAL gtid_slave_pos= "";
# Reset the master side for the next scenario.
--connection server_1
SET GLOBAL debug_dbug= @old_debug;
TRUNCATE t1;
RESET MASTER;
# Scenario 2: two interleaved GTID domains, A = domain 10 and B = domain 11.
# A1 B1 A2 B2 A3 B3, slave reached A1 and B3 and stopped. Slave starts,
# reconnects at A2. There was a bug that B2 would be duplicated.
SET gtid_domain_id=10;
SET gtid_seq_no=50;
INSERT INTO t1 VALUES (1);
SET gtid_domain_id=11;
INSERT INTO t1 VALUES (11);
SET gtid_domain_id=10;
# A2 gets GTID 10-1-100, the position named in the DBUG injection keyword.
SET gtid_seq_no=100;
INSERT INTO t1 VALUES (2);
SET gtid_domain_id=11;
INSERT INTO t1 VALUES (12);
SET gtid_domain_id=10;
INSERT INTO t1 VALUES (3);
SET gtid_domain_id=11;
SET gtid_seq_no=200;
INSERT INTO t1 VALUES (13);
# Replicate domain 10 only up to A1 (10-1-50) and domain 11 up to B3
# (11-1-200), then let the slave stop by itself.
--connection server_2
START SLAVE UNTIL master_gtid_pos="10-1-50,11-1-200";
--source include/wait_for_slave_to_stop.inc
# Expect row 1 from domain 10 and rows 11, 12, 13 from domain 11.
SELECT * FROM t1 ORDER BY a;
--connection server_1
# Drop stale dump threads before enabling the DBUG injection.
--source include/kill_binlog_dump_threads.inc
INSERT INTO t1 VALUES (20);
SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
--save_master_pos
--connection server_2
--source include/start_slave.inc
--sync_with_master
# Expect every row exactly once: 1, 2, 3, 11, 12, 13, 20.
SELECT * FROM t1 ORDER BY a;
# Reset the slave side (data, binlog, GTID position) for the next scenario.
--source include/stop_slave.inc
TRUNCATE t1;
RESET MASTER;
SET GLOBAL gtid_slave_pos= "";
--connection server_1
# NOTE(review): @old_debug is never assigned in this test; the saved value
# lives in @old_dbug. Looks like a typo. The statement is echoed into the
# recorded .result file, so fixing it requires re-recording that file.
SET GLOBAL debug_dbug= @old_debug;
TRUNCATE t1;
RESET MASTER;
# Scenario 3: two interleaved GTID domains, A = domain 9 and B = domain 10.
# A1 B1 A2 B2 A3 B3. START SLAVE UNTIL A1,B3, gets reconnect at B2.
# There was a bug that the UNTIL would be ignored, and A2 would be lost.
# Drop stale dump threads so they do not interfere with the DBUG injection.
--source include/kill_binlog_dump_threads.inc
SET gtid_domain_id= 9;
SET gtid_seq_no= 50;
INSERT INTO t1 VALUES (1);
SET gtid_domain_id= 10;
INSERT INTO t1 VALUES (11);
SET gtid_domain_id= 9;
INSERT INTO t1 VALUES (2);
SET gtid_domain_id= 10;
# B2 gets GTID 10-1-100, the position named in the DBUG injection keyword.
SET gtid_seq_no= 100;
INSERT INTO t1 VALUES (12);
SET gtid_domain_id= 9;
INSERT INTO t1 VALUES (3);
SET gtid_domain_id= 10;
SET gtid_seq_no= 200;
INSERT INTO t1 VALUES (13);
SET gtid_domain_id= 10;
SET GLOBAL debug_dbug="+d,gtid_force_reconnect_at_10_1_100";
# Replicate domain 9 only up to A1 (9-1-50) and domain 10 up to B3
# (10-1-200), then let the slave stop by itself.
--connection server_2
START SLAVE UNTIL master_gtid_pos="9-1-50,10-1-200";
--source include/wait_for_slave_to_stop.inc
# Expect row 1 from domain 9 and rows 11, 12, 13 from domain 10.
SELECT * FROM t1 ORDER BY a;
--connection server_1
# NOTE(review): @old_debug is never assigned in this test; the saved value
# lives in @old_dbug. Looks like a typo. The statement is echoed into the
# recorded .result file, so fixing it requires re-recording that file.
SET GLOBAL debug_dbug= @old_debug;
INSERT INTO t1 VALUES (20);
--save_master_pos
--connection server_2
--source include/start_slave.inc
# Wait until the slave has applied everything up to the position saved on the
# master above. start_slave.inc alone does not wait for the slave to catch up,
# so without this the SELECT below races against the slave applying rows
# 2, 3 and 20. The earlier scenarios sync at the same point.
--sync_with_master
# Expect every row exactly once: 1, 2, 3, 11, 12, 13, 20.
SELECT * FROM t1 ORDER BY a;
--echo *** Test when slave IO thread needs to reconnect in the middle of an event group. ***
# Reset the slave side (data, binlog, GTID position) left over from the
# previous scenario.
--connection server_2
--source include/stop_slave.inc
TRUNCATE t1;
RESET MASTER;
SET GLOBAL gtid_slave_pos= "";
--connection server_1
# NOTE(review): @old_debug is never assigned in this test; the saved value
# lives in @old_dbug. Looks like a typo. The statement is echoed into the
# recorded .result file, so fixing it requires re-recording that file.
SET GLOBAL debug_dbug= @old_debug;
TRUNCATE t1;
RESET MASTER;
# Drop stale dump threads so they do not interfere with the DBUG injection.
--source include/kill_binlog_dump_threads.inc
# Presumably forces a reconnect once 22 events have been sent, which by the
# event counts below lands inside the ten-insert transaction. TODO confirm
# against the server-side injection point.
SET GLOBAL debug_dbug="+d,binlog_force_reconnect_after_22_events";
# 4 events for FD, fake rotate, gtid list, binlog checkpoint.
# 2 events for GTID, create table
CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
# 3 events for BEGIN/query/COMMIT
INSERT INTO t2 VALUES (1);
# 4 events for BEGIN/query/query/COMMIT
BEGIN;
INSERT INTO t2 VALUES (10);
INSERT INTO t2 VALUES (11);
COMMIT;
# So this event group starts after 4+2+4+3=13 events. Or 16 in row-based.
BEGIN;
INSERT INTO t2 VALUES (20);
INSERT INTO t2 VALUES (21);
INSERT INTO t2 VALUES (22);
INSERT INTO t2 VALUES (23);
INSERT INTO t2 VALUES (24);
INSERT INTO t2 VALUES (25);
INSERT INTO t2 VALUES (26);
INSERT INTO t2 VALUES (27);
INSERT INTO t2 VALUES (28);
INSERT INTO t2 VALUES (29);
COMMIT;
--save_master_pos
--connection server_2
--source include/start_slave.inc
--sync_with_master
# Expect every row exactly once; the PRIMARY KEY on t2 would reject a
# re-applied insert.
SELECT * FROM t2 ORDER BY a;
--connection server_1
# NOTE(review): same @old_debug / @old_dbug mismatch as above.
SET GLOBAL debug_dbug= @old_debug;
# Clean up.
--connection server_1
DROP TABLE t1, t2;
--source include/rpl_end.inc