1
0
mirror of https://github.com/MariaDB/server.git synced 2025-09-02 09:41:40 +03:00

MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill)

There was a rare race, where a deadlock error might not be correctly
handled, causing the slave to stop with something like this in the error
log:

150423 14:04:10 [ERROR] Slave SQL: Connection was killed, Gtid 0-1-2, Internal MariaDB error code: 1927
150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927
150423 14:04:10 [Warning] Slave: Deadlock found when trying to get lock; try restarting transaction Error_code: 1213
150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927
150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927
150423 14:04:10 [ERROR] Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'master-bin.000001 position 1234

The problem was incorrect error handling. When a deadlock is detected, it
causes a KILL CONNECTION on the offending thread. This error is then later
converted to a deadlock error, and the transaction is retried.

However, the deadlock error was not cleared at the start of the retry, nor
was the lingering kill signal. So it was possible to get another deadlock
kill early during retry. If this happened with particular thread
scheduling/timing, it was possible that the new KILL CONNECTION error was
masked by the earlier deadlock error, so that the second kill was not
properly converted into a deadlock error and retry.

This patch adds code that clears the old error and killed flag before
starting the retry. It also adds code to handle a deadlock kill caught in a
couple of places where it was not handled before.
This commit is contained in:
Kristian Nielsen
2015-04-23 14:09:15 +02:00
parent 4760528754
commit b616991a68
3 changed files with 259 additions and 18 deletions

View File

@@ -1537,7 +1537,7 @@ a b
120 0
121 0
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug;
SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
include/stop_slave.inc
@@ -1566,7 +1566,88 @@ a b
130 0
131 0
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug;
SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***
INSERT INTO t3 VALUES (201,0), (202,0);
include/save_master_gtid.inc
include/sync_with_master_gtid.inc
include/stop_slave.inc
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_mdev8031';
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10200;
INSERT INTO t3 VALUES (203, 1);
INSERT INTO t3 VALUES (204, 1);
INSERT INTO t3 VALUES (205, 1);
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=205;
UPDATE t3 SET b=b+1 WHERE a=205;
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 3
202 4
203 4
204 4
205 3
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 3
202 4
203 4
204 4
205 3
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** Check getting deadlock killed inside open_binlog() during retry. ***
include/stop_slave.inc
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
SET @old_max= @@GLOBAL.max_relay_log_size;
SET GLOBAL max_relay_log_size= 4096;
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10210;
Omit long queries that cause relaylog rotations and transaction retries...
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 6
202 8
203 7
204 7
205 5
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 6
202 8
203 7
204 7
205 5
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debg;
SET GLOBAL max_relay_log_size= @old_max;
include/start_slave.inc
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;