1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-31 22:22:30 +03:00

MDEV-7326: Server deadlock in connection with parallel replication

The bug occurs when a transaction does a retry after all transactions in a
batch of group commit from the master have done mark_start_commit(). In this
case, the retrying transaction can call unmark_start_commit() after the
following batch has already started running and de-allocated the GCO (group
commit orderer). Then, after the retry, the transaction will re-do
mark_start_commit() on a de-allocated GCO, and the wakeup of later GCOs can
also be lost.

This was seen "in the wild" by a user, even though it is not known exactly
what circumstances can lead to retry of one transaction after all transactions
in a group have reached the commit phase.

The lifetime handling of GCOs was somewhat clunky anyway. With this patch, a GCO
lives until rpl_parallel_entry::last_committed_sub_id has reached the last
transaction in the GCO. This guarantees that the GCO will still be alive when
a transaction does mark_start_commit(). Also, we now loop over the list of
active GCOs for wakeup, to ensure we do not lose a wakeup even in the
problematic case.
This commit is contained in:
Kristian Nielsen
2015-01-07 14:45:39 +01:00
parent 4a3251595c
commit f27817c1d0
6 changed files with 420 additions and 55 deletions

View File

@@ -1023,6 +1023,119 @@ SET GLOBAL slave_parallel_threads=0;
SET GLOBAL slave_parallel_threads=10;
CHANGE MASTER TO master_use_gtid=slave_pos;
include/start_slave.inc
*** MDEV-7326 Server deadlock in connection with parallel replication ***
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=0;
SET GLOBAL slave_parallel_threads=3;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
include/start_slave.inc
SET @old_format= @@SESSION.binlog_format;
SET binlog_format= STATEMENT;
INSERT INTO t1 VALUES (foo(50,
"rpl_parallel_start_waiting_for_prior SIGNAL t3_ready",
"rpl_parallel_end_of_group SIGNAL prep_ready WAIT_FOR prep_cont"));
SET DEBUG_SYNC= "now WAIT_FOR prep_ready";
INSERT INTO t2 VALUES (foo(50,
"rpl_parallel_simulate_temp_err_xid SIGNAL t1_ready1 WAIT_FOR t1_cont1",
"rpl_parallel_retry_after_unmark SIGNAL t1_ready2 WAIT_FOR t1_cont2"));
SET DEBUG_SYNC= "now WAIT_FOR t1_ready1";
INSERT INTO t1 VALUES (foo(51,
"rpl_parallel_before_mark_start_commit SIGNAL t2_ready1 WAIT_FOR t2_cont1",
"rpl_parallel_after_mark_start_commit SIGNAL t2_ready2"));
SET DEBUG_SYNC= "now WAIT_FOR t2_ready1";
SET DEBUG_SYNC= "now SIGNAL t1_cont1";
SET DEBUG_SYNC= "now WAIT_FOR t1_ready2";
INSERT INTO t1 VALUES (52);
SET BINLOG_FORMAT= @old_format;
SELECT * FROM t2 WHERE a>=50 ORDER BY a;
a
50
SELECT * FROM t1 WHERE a>=50 ORDER BY a;
a
50
51
52
SET DEBUG_SYNC= "now SIGNAL prep_cont";
SET DEBUG_SYNC= "now WAIT_FOR t3_ready";
SET DEBUG_SYNC= "now SIGNAL t2_cont1";
SET DEBUG_SYNC= "now WAIT_FOR t2_ready2";
SET DEBUG_SYNC= "now SIGNAL t1_cont2";
SELECT * FROM t2 WHERE a>=50 ORDER BY a;
a
50
SELECT * FROM t1 WHERE a>=50 ORDER BY a;
a
50
51
52
SET DEBUG_SYNC="reset";
include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL slave_parallel_threads=0;
SET GLOBAL slave_parallel_threads=10;
include/start_slave.inc
*** MDEV-7326 Server deadlock in connection with parallel replication ***
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=0;
SET GLOBAL slave_parallel_threads=3;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
include/start_slave.inc
SET @old_format= @@SESSION.binlog_format;
SET binlog_format= STATEMENT;
INSERT INTO t1 VALUES (foo(60,
"rpl_parallel_start_waiting_for_prior SIGNAL t3_ready",
"rpl_parallel_end_of_group SIGNAL prep_ready WAIT_FOR prep_cont"));
SET DEBUG_SYNC= "now WAIT_FOR prep_ready";
INSERT INTO t2 VALUES (foo(60,
"rpl_parallel_simulate_temp_err_xid SIGNAL t1_ready1 WAIT_FOR t1_cont1",
"rpl_parallel_retry_after_unmark SIGNAL t1_ready2 WAIT_FOR t1_cont2"));
SET DEBUG_SYNC= "now WAIT_FOR t1_ready1";
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
SET binlog_format=statement;
INSERT INTO t1 VALUES (foo(61,
"rpl_parallel_before_mark_start_commit SIGNAL t2_ready1 WAIT_FOR t2_cont1",
"rpl_parallel_after_mark_start_commit SIGNAL t2_ready2"));
SET debug_sync='now WAIT_FOR master_queued1';
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
INSERT INTO t6 VALUES (62);
SET debug_sync='now WAIT_FOR master_queued2';
SET debug_sync='now SIGNAL master_cont1';
SET debug_sync='RESET';
SET BINLOG_FORMAT= @old_format;
SELECT * FROM t2 WHERE a>=60 ORDER BY a;
a
60
SELECT * FROM t1 WHERE a>=60 ORDER BY a;
a
60
61
SELECT * FROM t6 WHERE a>=60 ORDER BY a;
a
62
SET DEBUG_SYNC= "now WAIT_FOR t2_ready1";
SET DEBUG_SYNC= "now SIGNAL t1_cont1";
SET DEBUG_SYNC= "now WAIT_FOR t1_ready2";
SET DEBUG_SYNC= "now SIGNAL prep_cont";
SET DEBUG_SYNC= "now WAIT_FOR t3_ready";
SET DEBUG_SYNC= "now SIGNAL t2_cont1";
SET DEBUG_SYNC= "now WAIT_FOR t2_ready2";
SET DEBUG_SYNC= "now SIGNAL t1_cont2";
SELECT * FROM t2 WHERE a>=60 ORDER BY a;
a
60
SELECT * FROM t1 WHERE a>=60 ORDER BY a;
a
60
61
SELECT * FROM t6 WHERE a>=60 ORDER BY a;
a
62
SET DEBUG_SYNC="reset";
include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL slave_parallel_threads=0;
SET GLOBAL slave_parallel_threads=10;
include/start_slave.inc
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc