MDEV-7818: Deadlock occurring with parallel replication and FTWRL

Problem is that FLUSH TABLES WITH READ LOCK first blocks threads from starting new commits, then waits for running commits to complete. But in-order parallel replication needs commits to happen in a particular order, so this can easily deadlock. To fix this problem, this patch introduces a way to temporarily pause the parallel replication worker threads. Before starting FTWRL, we let all worker threads complete in-progress transactions, and then wait. Then we proceed to take the global read lock. Once the lock is obtained, we unpause the worker threads. Now commits are blocked from starting by the global read lock, so the deadlock will no longer occur.
2025-08-07 00:04:31 +03:00 · 2015-10-22 11:18:34 +02:00
parent 6d96fab7dd
commit ba02550166
8 changed files with 537 additions and 25 deletions
--- a/mysql-test/suite/rpl/t/rpl_parallel2.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel2.test
@@ -1,3 +1,5 @@
+--source include/have_debug.inc
+--source include/have_innodb.inc
 --source include/have_binlog_format_statement.inc
 --let $rpl_topology=1->2
 --source include/rpl_init.inc
@@ -78,13 +80,144 @@ SET GLOBAL sql_slave_skip_counter= 1;
 SELECT * FROM t1 WHERE a >= 10 ORDER BY a;


-# Clean up
+--echo *** MDEV-7818: Deadlock occurring with parallel replication and FTWRL ***
+
+--connection server_1
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (1,0), (2,0), (3,0);
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+
+--connection server_1
+# Create a group commit with two transactions, will be used to provoke the
+# problematic thread interaction with FTWRL on the slave.
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @commit_id= 4242;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+BEGIN;
+UPDATE t2 SET b=b+1 WHERE a=2;
+COMMIT;
+
+BEGIN;
+INSERT INTO t2 VALUES (4,10);
+COMMIT;
+
+SET SESSION debug_dbug= @old_dbug;
+
+INSERT INTO t2 VALUES (5,0);
+INSERT INTO t2 VALUES (6,0);
+INSERT INTO t2 VALUES (7,0);
+INSERT INTO t2 VALUES (8,0);
+INSERT INTO t2 VALUES (9,0);
+INSERT INTO t2 VALUES (10,0);
+INSERT INTO t2 VALUES (11,0);
+INSERT INTO t2 VALUES (12,0);
+INSERT INTO t2 VALUES (13,0);
+INSERT INTO t2 VALUES (14,0);
+INSERT INTO t2 VALUES (15,0);
+INSERT INTO t2 VALUES (16,0);
+INSERT INTO t2 VALUES (17,0);
+INSERT INTO t2 VALUES (18,0);
+INSERT INTO t2 VALUES (19,0);
+--save_master_pos
+
+--connection server_2
+
+--connect (s1, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
+# Block one transaction on a row lock.
+BEGIN;
+SELECT * FROM t2 WHERE a=2 FOR UPDATE;
+
+--connection server_2
+
+# Wait for slave thread of the other transaction to have the commit lock.
+--source include/start_slave.inc
+--let $wait_condition= SELECT COUNT(*) > 0 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
+--source include/wait_condition.inc
+
+--connect (s2, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
+send FLUSH TABLES WITH READ LOCK;
+# The bug was that at this point we were deadlocked.
+# The FTWRL command would wait forever for T2 to commit.
+# T2 would wait for T1 to commit first, but T1 is waiting for
+# the global read lock to be released.
+
+--connection s1
+# Release the lock that blocs T1 from replicating.
+COMMIT;
+
+--connection s1
+send STOP SLAVE;
+
+--connection s2
+reap;
+
+--connection server_1
+SELECT * FROM t2 ORDER BY a;
+
+--connection s2
+UNLOCK TABLES;
+
+--connection s1
+reap;
+
+--connection server_2
+--source include/wait_for_slave_to_stop.inc
+--source include/start_slave.inc
+--sync_with_master
+
+SELECT * FROM t2 ORDER BY a;
+
+
+
+--echo *** MDEV-8318: Assertion `!pool->busy' failed in pool_mark_busy(rpl_parallel_thread_pool*) on concurrent FTWRL ***
+
+--connection server_1
+LOCK TABLE t2 WRITE;
+
+
+--connect (m1,localhost,root,,test)
+--connection m1
+--let $cid=`SELECT CONNECTION_ID()`
+send FLUSH TABLES WITH READ LOCK;
+
+--connect (m2,localhost,root,,test)
+# We cannot force the race with DEBUG_SYNC, because the race does not
+# exist after fixing the bug. At best we could force a debug sync to
+# time out, which is effectively just a sleep.
+# So just put a small sleep here; it is enough to trigger the bug in
+# most run before the bug fix, and the code should work correctly
+# however the thread scheduling happens.
+--sleep 0.1
+send FLUSH TABLES WITH READ LOCK;
+
+--connection server_1
+--replace_result $cid CID
+eval KILL QUERY $cid;
+
+--connection m1
+--error ER_QUERY_INTERRUPTED
+reap;
+
+--connection server_1
+UNLOCK TABLES;
+
+--connection m2
+reap;
+UNLOCK TABLES;
+
+
+# Clean up.
 --connection server_2
 --source include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
 --source include/start_slave.inc

 --connection server_1
-DROP TABLE t1;
+DROP TABLE t1, t2;

 --source include/rpl_end.inc