1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-08 11:22:35 +03:00

MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing

out of order at retry

The test failures were of two sorts. The first was that the number of retries
of what the slave regarded as a temporary error exceeded
the default value of the slave retry option.
The 2nd issue was an out of order commit by transactions that
were supposed to error out instead.
Both issues have the same cause: the retry after a temporary error
did not check for a possibly already existing error status.

This is mended by refining the conditions for retrying. Specifically, a retrying
worker checks `rpl_parallel_entry::stop_on_error_sub_id`, which
a failing predecessor may have set to its own sub id.
If the member is set, the retrying follower now errors out with
ER_PRIOR_COMMIT_FAILED.
This commit is contained in:
Andrei Elkin
2018-02-09 15:00:23 +02:00
parent 76ae6e725d
commit 30019a48bf
3 changed files with 194 additions and 3 deletions

View File

@@ -128,6 +128,7 @@ SELECT * FROM t1 ORDER BY a;
SET sql_log_bin=0;
CALL mtr.add_suppression("Slave worker thread retried transaction 10 time\\(s\\) in vain, giving up");
CALL mtr.add_suppression("Slave: Deadlock found when trying to get lock; try restarting transaction");
CALL mtr.add_suppression("Slave worker thread retried transaction .* in vain, giving up");
SET sql_log_bin=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
@@ -371,7 +372,7 @@ SELECT * FROM t3 ORDER BY a;
SET binlog_format=@old_format;
# Clean up.
# Clean up of the above part.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
@@ -381,4 +382,102 @@ SET GLOBAL slave_parallel_threads=@old_parallel_threads;
DROP TABLE t1, t2, t3, t4;
DROP function foo;
--sync_slave_with_master server_2
#
# MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing out of order at retry
#
# Set-up phase: create the test table on the master, replicate it, then stop
# the slave and reconfigure it before the conflicting payload is generated.
--connection server_1
CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
# Replicate create-t1 and prepare to re-start the slave in 'aggressive'
# parallel mode with 5 parallel threads.
--sync_slave_with_master server_2
--source include/stop_slave.inc
# Each global is saved into a user variable before being overridden so that
# the clean-up at the end of this test case can restore it.
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @@GLOBAL.slave_parallel_threads=5;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
SET @@GLOBAL.slave_parallel_mode='aggressive';
# NOTE(review): the short lock wait timeout presumably makes an applier that
# is blocked by an uncommitted row fail quickly with a temporary error
# -- confirm.
SET @old_lock_wait_timeout=@@GLOBAL.innodb_lock_wait_timeout;
SET @@GLOBAL.innodb_lock_wait_timeout=2;
# NOTE(review): a single allowed retry presumably lets the first worker run
# out of retries after one repeated failure -- confirm.
SET @old_slave_transaction_retries=@@GLOBAL.slave_transaction_retries;
SET @@GLOBAL.slave_transaction_retries=1;
# Two extra client connections are opened against the slave port
# ($SLAVE_MYPORT). Each one starts a transaction and inserts, without
# committing, a row whose primary key (a=1, a=2) equals the key of one of the
# master's inserts below.
--echo # Spoilers on the slave side causing temporary errors
--connect (spoiler_21,127.0.0.1,root,,test,$SLAVE_MYPORT)
BEGIN;
INSERT INTO t1 SET a=1,b=2;
--connect (spoiler_22,127.0.0.1,root,,test,$SLAVE_MYPORT)
BEGIN;
INSERT INTO t1 SET a=2,b=2;
# The master generates two single-row transactions with explicitly assigned
# GTID sequence numbers 1000 (a=1) and 1001 (a=2). The slave is stopped at
# this point, so they are applied only once it is started again.
--echo # Master payload
--connection server_1
SET @@SESSION.GTID_SEQ_NO=1000;
INSERT INTO t1 SET a=1,b=1;
SET @@SESSION.GTID_SEQ_NO=1001;
INSERT INTO t1 SET a=2,b=1;
# Start the slave with the debug keyword rpl_parallel_simulate_wait_at_retry
# enabled; the previous debug_dbug value is saved for the clean-up.
# NOTE(review): the keyword is implemented in server code not visible here;
# presumably it makes retrying workers wait for the proceed_by_1000 /
# proceed_by_1001 debug-sync signals sent below -- confirm.
--echo # Start slave whose both appliers is destined to being blocked
--connection server_2
SET @old_dbug= @@GLOBAL.debug_dbug;
SET @@GLOBAL.debug_dbug="+d,rpl_parallel_simulate_wait_at_retry";
--source include/start_slave.inc
# Poll the processlist until a thread reports a debug-sync wait state.
# NOTE(review): the condition is a bare count(*) without a comparison, unlike
# the `count(*) = 0` form used later in this test; presumably any non-zero
# count satisfies wait_condition.inc -- confirm whether `= 1` was intended.
--echo # Make sure the 2nd seqno_1001 worker has gotten to waiting
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE '%debug sync point: now%';
--source include/wait_condition.inc
# Release order: signal proceed_by_1000 first, then roll back both spoiler
# transactions (removing the uncommitted conflicting rows), and only then
# signal proceed_by_1001.
--echo # Signal to the 1st to proceed after it has reached termination state
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1000';
--connection spoiler_21
ROLLBACK;
--echo # Release the 2nd worker to proceed
--connection spoiler_22
ROLLBACK;
--connection server_2
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1001';
# Verification phase, executed on the server_2 connection.
--echo # observe how it all ends
# A row with a=1 in t1 means the first transaction (seq_no 1000) got
# committed, which this test treats as a failure: dump the table and abort.
# NOTE(review): this check runs right after the signal above without any
# wait, so a commit that lands later would not be caught here -- confirm this
# is intended.
if (`SELECT count(*) = 1 FROM t1 WHERE a = 1`)
{
--echo "*** Unexpected commit by the first Worker ***"
SELECT * from t1;
--die
}
--echo # Wait for the workers to go home and check the result of applying
--let $wait_condition=SELECT count(*) = 0 FROM information_schema.processlist WHERE command = 'Slave_worker'
--source include/wait_condition.inc
# Once no Slave_worker threads remain, a row with a=2 means the second
# transaction (seq_no 1001) was committed; the test reports that as the
# MDEV-12746 out-of-order commit and aborts.
if (`SELECT count(*) = 1 FROM t1 WHERE a = 2`)
{
--echo
--echo "*** Error: congrats, you hit MDEV-12746 issue. ***"
--echo
--die
}
--echo # which is OK
#
# Clean up
#
# Stop the slave, restore every global that the MDEV-12746 set-up overrode
# from its saved @old_* value, and reset the debug-sync state before starting
# the slave again.
# NOTE(review): the spoiler_21 and spoiler_22 connections opened earlier are
# not explicitly disconnected in the lines visible here -- confirm whether a
# --disconnect is wanted.
--connection server_2
--source include/stop_slave.inc
SET @@GLOBAL.slave_parallel_threads=@old_parallel_threads;
SET @@GLOBAL.slave_parallel_mode=@old_parallel_mode;
SET @@GLOBAL.innodb_lock_wait_timeout=@old_lock_wait_timeout;
SET @@GLOBAL.slave_transaction_retries=@old_slave_transaction_retries;
SET @@GLOBAL.debug_dbug=@old_dbug;
SET debug_sync='RESET';
--source include/start_slave.inc
# Drop the test table on the master and let the drop replicate to the slave
# before the replication test framework is shut down.
--connection server_1
DROP TABLE t1;
--sync_slave_with_master server_2
--source include/rpl_end.inc