1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

MDEV-7818: Deadlock occurring with parallel replication and FTWRL

Problem is that FLUSH TABLES WITH READ LOCK first blocks threads from
starting new commits, then waits for running commits to complete. But
in-order parallel replication needs commits to happen in a particular
order, so this can easily deadlock.

To fix this problem, this patch introduces a way to temporarily pause
the parallel replication worker threads. Before starting FTWRL, we let
all worker threads complete in-progress transactions, and then
wait. Then we proceed to take the global read lock. Once the lock is
obtained, we unpause the worker threads. Now commits are blocked from
starting by the global read lock, so the deadlock will no longer occur.
This commit is contained in:
Kristian Nielsen
2015-10-22 11:18:34 +02:00
parent 6d96fab7dd
commit ba02550166
8 changed files with 537 additions and 25 deletions

View File

@@ -1,3 +1,5 @@
--source include/have_debug.inc
--source include/have_innodb.inc
--source include/have_binlog_format_statement.inc
--let $rpl_topology=1->2
--source include/rpl_init.inc
@@ -78,13 +80,144 @@ SET GLOBAL sql_slave_skip_counter= 1;
SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
# Clean up
--echo *** MDEV-7818: Deadlock occurring with parallel replication and FTWRL ***
--connection server_1
CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t2 VALUES (1,0), (2,0), (3,0);
--save_master_pos
--connection server_2
--sync_with_master
--source include/stop_slave.inc
--connection server_1
# Create a group commit with two transactions, will be used to provoke the
# problematic thread interaction with FTWRL on the slave.
SET @old_dbug= @@SESSION.debug_dbug;
SET @commit_id= 4242;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
BEGIN;
UPDATE t2 SET b=b+1 WHERE a=2;
COMMIT;
BEGIN;
INSERT INTO t2 VALUES (4,10);
COMMIT;
SET SESSION debug_dbug= @old_dbug;
INSERT INTO t2 VALUES (5,0);
INSERT INTO t2 VALUES (6,0);
INSERT INTO t2 VALUES (7,0);
INSERT INTO t2 VALUES (8,0);
INSERT INTO t2 VALUES (9,0);
INSERT INTO t2 VALUES (10,0);
INSERT INTO t2 VALUES (11,0);
INSERT INTO t2 VALUES (12,0);
INSERT INTO t2 VALUES (13,0);
INSERT INTO t2 VALUES (14,0);
INSERT INTO t2 VALUES (15,0);
INSERT INTO t2 VALUES (16,0);
INSERT INTO t2 VALUES (17,0);
INSERT INTO t2 VALUES (18,0);
INSERT INTO t2 VALUES (19,0);
--save_master_pos
--connection server_2
--connect (s1, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
# Block one transaction on a row lock.
BEGIN;
SELECT * FROM t2 WHERE a=2 FOR UPDATE;
--connection server_2
# Wait for slave thread of the other transaction to have the commit lock.
--source include/start_slave.inc
--let $wait_condition= SELECT COUNT(*) > 0 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
--source include/wait_condition.inc
--connect (s2, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
send FLUSH TABLES WITH READ LOCK;
# The bug was that at this point we were deadlocked.
# The FTWRL command would wait forever for T2 to commit.
# T2 would wait for T1 to commit first, but T1 is waiting for
# the global read lock to be released.
--connection s1
# Release the lock that blocs T1 from replicating.
COMMIT;
--connection s1
send STOP SLAVE;
--connection s2
reap;
--connection server_1
SELECT * FROM t2 ORDER BY a;
--connection s2
UNLOCK TABLES;
--connection s1
reap;
--connection server_2
--source include/wait_for_slave_to_stop.inc
--source include/start_slave.inc
--sync_with_master
SELECT * FROM t2 ORDER BY a;
--echo *** MDEV-8318: Assertion `!pool->busy' failed in pool_mark_busy(rpl_parallel_thread_pool*) on concurrent FTWRL ***
--connection server_1
LOCK TABLE t2 WRITE;
--connect (m1,localhost,root,,test)
--connection m1
--let $cid=`SELECT CONNECTION_ID()`
send FLUSH TABLES WITH READ LOCK;
--connect (m2,localhost,root,,test)
# We cannot force the race with DEBUG_SYNC, because the race does not
# exist after fixing the bug. At best we could force a debug sync to
# time out, which is effectively just a sleep.
# So just put a small sleep here; it is enough to trigger the bug in
# most run before the bug fix, and the code should work correctly
# however the thread scheduling happens.
--sleep 0.1
send FLUSH TABLES WITH READ LOCK;
--connection server_1
--replace_result $cid CID
eval KILL QUERY $cid;
--connection m1
--error ER_QUERY_INTERRUPTED
reap;
--connection server_1
UNLOCK TABLES;
--connection m2
reap;
UNLOCK TABLES;
# Clean up.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
--source include/start_slave.inc
--connection server_1
DROP TABLE t1;
DROP TABLE t1, t2;
--source include/rpl_end.inc