1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-08 11:22:35 +03:00

MDEV-21117: refine the server binlog-based recovery for semisync

Problem:
=======
When a semisync master crashes and is restarted as a slave, it could
recover transactions that its former slaves may never have seen.
The existing method of clearing out all prepared transactions,
--tc-heuristic-recover=rollback, does not adjust the binlog
accordingly.

Fix:
===
The binlog-based recovery is made aware of the semisync slave role of
the server restarted after a crash.
No change in behavior is made for the "normal" binlogging server
or for the semisync master.

When the restarted server is configured with
  --rpl-semi-sync-slave-enabled=1
the refined recovery attempts to roll back prepared transactions
and truncate binlog accordingly.
A partially committed transaction (that is, one committed in at least
one of the participating engines) gets committed.
It is guaranteed that no committed (including partially committed)
transactions exist beyond the truncate position.
If a non-transactional replication event (which is, in a way,
a committed transaction) exists past the computed truncate position,
the recovery ends with an error.

After a master crash and failover to a slave, the demoted-to-slave
ex-master must be ready to receive and accept events that it generated
itself, without the otherwise necessary --replicate-same-server-id.
Therefore the acceptance conditions are relaxed so that the semisync slave
accepts its own events without that option.
While gtid_strict_mode=ON ensures that no duplicate transaction can be
(re-)executed, a slave using master_use_gtid=none still has to be
configured with --replicate-same-server-id.

*NOTE* for reviewers.

This patch does not handle user XA transactions; that is done
in the next git commit.
This commit is contained in:
Sujatha
2020-04-09 20:45:45 +05:30
committed by Andrei Elkin
parent 82c07b178a
commit 6c39eaeb12
25 changed files with 2619 additions and 127 deletions

View File

@@ -0,0 +1,129 @@
include/master-slave.inc
[connection master]
connection server_2;
include/stop_slave.inc
connection server_1;
RESET MASTER;
SET @@global.max_binlog_size= 4096;
connection server_2;
RESET MASTER;
SET @@global.max_binlog_size= 4096;
set @@global.rpl_semi_sync_slave_enabled = 1;
set @@global.gtid_slave_pos = "";
CHANGE MASTER TO master_use_gtid= slave_pos;
include/start_slave.inc
connection server_1;
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
set @@global.rpl_semi_sync_master_enabled = 1;
set @@global.rpl_semi_sync_master_wait_point=AFTER_SYNC;
call mtr.add_suppression("Can.t init tc log");
call mtr.add_suppression("Aborting");
call mtr.add_suppression("1 client is using or hasn.t closed the table properly");
call mtr.add_suppression("Table './mtr/test_suppressions' is marked as crashed and should be repaired");
CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
INSERT INTO t1 VALUES (1, 'dummy1');
connect conn_client,127.0.0.1,root,,test,$SERVER_MYPORT_1,;
SET DEBUG_SYNC= "commit_after_release_LOCK_after_binlog_sync SIGNAL con1_ready WAIT_FOR con1_go";
INSERT INTO t1 VALUES (2, REPEAT("x", 4100));
connection server_1;
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
# Kill the server
connection server_2;
include/stop_slave.inc
SELECT @@GLOBAL.gtid_current_pos;
@@GLOBAL.gtid_current_pos
0-1-8
# restart: --rpl-semi-sync-slave-enabled=1
connection server_1;
FOUND 1 /truncated binlog file:.*master.*000001/ in mysqld.1.err
disconnect conn_client;
connection server_2;
set global rpl_semi_sync_master_enabled = 1;
set global rpl_semi_sync_master_wait_point=AFTER_SYNC;
connection server_1;
CHANGE MASTER TO master_host='127.0.0.1', master_port=$new_master_port, master_user='root', master_use_gtid=SLAVE_POS;
set global rpl_semi_sync_slave_enabled = 1;
set @@global.gtid_slave_pos=@@global.gtid_binlog_pos;
include/start_slave.inc
connection server_2;
INSERT INTO t1 VALUES (3, 'dummy3');
# The gtid state on current master must be equal to ...
SHOW VARIABLES LIKE 'gtid_binlog_pos';
Variable_name Value
gtid_binlog_pos 0-2-9
connection server_1;
SELECT COUNT(*) = 3 as 'true' FROM t1;
true
1
# ... the gtid states on the slave:
SHOW VARIABLES LIKE 'gtid_slave_pos';
Variable_name Value
gtid_slave_pos 0-2-9
SHOW VARIABLES LIKE 'gtid_binlog_pos';
Variable_name Value
gtid_binlog_pos 0-2-9
connection server_2;
connect conn_client,127.0.0.1,root,,test,$SERVER_MYPORT_2,;
SET DEBUG_SYNC= "commit_after_release_LOCK_after_binlog_sync SIGNAL con1_ready WAIT_FOR con1_go";
INSERT INTO t1 VALUES (4, REPEAT("x", 4100));
connect conn_client_2,127.0.0.1,root,,test,$SERVER_MYPORT_2,;
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
SET DEBUG_SYNC= "commit_after_release_LOCK_log SIGNAL con1_ready WAIT_FOR con2_go";
INSERT INTO t1 VALUES (5, REPEAT("x", 4100));
connection server_2;
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
# Kill the server
connection server_1;
include/stop_slave.inc
SELECT @@GLOBAL.gtid_current_pos;
@@GLOBAL.gtid_current_pos
0-2-11
# restart: --rpl-semi-sync-slave-enabled=1
connection server_2;
NOT FOUND /truncated binlog file:.*slave.*000001/ in mysqld.2.err
disconnect conn_client;
connection server_1;
set global rpl_semi_sync_master_enabled = 1;
set global rpl_semi_sync_master_wait_point=AFTER_SYNC;
connection server_2;
CHANGE MASTER TO master_host='127.0.0.1', master_port=$new_master_port, master_user='root', master_use_gtid=SLAVE_POS;
set global rpl_semi_sync_slave_enabled = 1;
set @@global.gtid_slave_pos=@@global.gtid_binlog_pos;
include/start_slave.inc
connection server_1;
INSERT INTO t1 VALUES (6, 'Done');
# The gtid state on current master must be equal to ...
SHOW VARIABLES LIKE 'gtid_binlog_pos';
Variable_name Value
gtid_binlog_pos 0-1-12
connection server_2;
SELECT COUNT(*) = 6 as 'true' FROM t1;
true
1
# ... the gtid states on the slave:
SHOW VARIABLES LIKE 'gtid_slave_pos';
Variable_name Value
gtid_slave_pos 0-1-12
SHOW VARIABLES LIKE 'gtid_binlog_pos';
Variable_name Value
gtid_binlog_pos 0-1-12
include/diff_tables.inc [server_1:t1, server_2:t1]
# Cleanup
connection server_1;
DROP TABLE t1;
connection server_2;
include/stop_slave.inc
connection server_1;
set @@global.rpl_semi_sync_master_enabled = 0;
set @@global.rpl_semi_sync_slave_enabled = 0;
set @@global.rpl_semi_sync_master_wait_point=default;
RESET SLAVE;
RESET MASTER;
connection server_2;
set @@global.rpl_semi_sync_master_enabled = 0;
set @@global.rpl_semi_sync_slave_enabled = 0;
set @@global.rpl_semi_sync_master_wait_point=default;
CHANGE MASTER TO master_host='127.0.0.1', master_port=$SERVER_MYPORT_1, master_user='root', master_use_gtid=no;
include/start_slave.inc
connection default;
include/rpl_end.inc

View File

@@ -0,0 +1,77 @@
# ==== Purpose ====
#
# Kill one of the two servers while a transaction is held at a debug sync
# point, restart it with --rpl-semi-sync-slave-enabled=1, search its error
# log for a pattern, and re-attach it as a semisync slave of the other
# server, which is turned into the semisync master.
#
# ==== Parameters (read by this file) ====
#
# $failover_to_slave   non-zero: crash server 1, promote server 2
#                      zero:     crash server 2, promote server 1
# $query_to_crash      statement sent on conn_client and left in flight
#                      when the server is killed
# $query2_to_crash     second in-flight statement, sent on conn_client_2;
#                      only used when $failover_to_slave is zero
# $log_search_pattern  pattern searched for in the crashed server's error log
#
# ==== Variables set by this file ====
#
# $server_to_crash, $server_to_promote, $new_master_port, $client_port,
# $master_port, $restart_parameters, $allow_rpl_inited, SEARCH_FILE,
# SEARCH_PATTERN
if ($failover_to_slave)
{
# Crash server 1 and promote server 2; the client connects to server 1.
--let $server_to_crash=1
--let $server_to_promote=2
--let $new_master_port=$SERVER_MYPORT_2
# NOTE(review): $client_port is assigned but not read anywhere in this file.
--let $client_port=$SERVER_MYPORT_1
--connect (conn_client,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
}
if (!$failover_to_slave)
{
# Reverse direction: crash server 2 and promote server 1.
--let $server_to_crash=2
--let $server_to_promote=1
--let $new_master_port=$SERVER_MYPORT_1
# NOTE(review): $client_port is assigned but not read anywhere in this file.
--let $client_port=$SERVER_MYPORT_2
--connect (conn_client,127.0.0.1,root,,test,$SERVER_MYPORT_2,)
}
# Hold insert after write to binlog and before "run_commit_ordered" in engine
# con1_go is never signalled in this file: the statement stays blocked at the
# sync point until the server is killed below.
SET DEBUG_SYNC= "commit_after_release_LOCK_after_binlog_sync SIGNAL con1_ready WAIT_FOR con1_go";
--send_eval $query_to_crash
# complicate recovery with an extra binlog file
if (!$failover_to_slave)
{
--connect (conn_client_2,127.0.0.1,root,,test,$SERVER_MYPORT_2,)
# use the same signal with $query_to_crash
# First wait until the statement on conn_client has reached its sync point,
# then arm a second sync point that emits con1_ready again and blocks on
# con2_go (also never signalled in this file).
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
SET DEBUG_SYNC= "commit_after_release_LOCK_log SIGNAL con1_ready WAIT_FOR con2_go";
--send_eval $query2_to_crash
}
# Wait on the server about to be crashed until the last in-flight statement
# has signalled con1_ready, then kill that server.
--connection server_$server_to_crash
SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
--source include/kill_mysqld.inc
# On the surviving server stop replication and print its GTID position.
# NOTE(review): 2003 is the client "can't connect to server" error code;
# presumably it is expected here because this server's master was just
# killed - confirm how --error interacts with the following --source.
--connection server_$server_to_promote
--error 2003
--source include/stop_slave.inc
SELECT @@GLOBAL.gtid_current_pos;
# Restart with the semisync slave role enabled from startup.
# NOTE(review): start_mysqld.inc is not visible here; presumably it restarts
# the server killed by kill_mysqld.inc above - confirm.
--let $restart_parameters=--rpl-semi-sync-slave-enabled=1
--let $allow_rpl_inited=1
--source include/start_mysqld.inc
--connection server_$server_to_crash
--enable_reconnect
--source include/wait_until_connected_again.inc
# Check error log for correct messages.
let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.$server_to_crash.err;
--let SEARCH_FILE=$log_error_
--let SEARCH_PATTERN=$log_search_pattern
--source include/search_pattern_in_file.inc
# NOTE(review): conn_client_2 (opened when $failover_to_slave is zero) is not
# disconnected anywhere in this file.
--disconnect conn_client
#
# FAIL OVER now to new master
#
--connection server_$server_to_promote
set global rpl_semi_sync_master_enabled = 1;
set global rpl_semi_sync_master_wait_point=AFTER_SYNC;
--connection server_$server_to_crash
# NOTE(review): $master_port is computed here but not read anywhere in this
# file; the CHANGE MASTER below uses $new_master_port instead.
--let $master_port=$SERVER_MYPORT_2
if (`select $server_to_crash = 2`)
{
--let $master_port=$SERVER_MYPORT_1
}
evalp CHANGE MASTER TO master_host='127.0.0.1', master_port=$new_master_port, master_user='root', master_use_gtid=SLAVE_POS;
set global rpl_semi_sync_slave_enabled = 1;
# Start replicating from this server's own binlog GTID position.
set @@global.gtid_slave_pos=@@global.gtid_binlog_pos;
--source include/start_slave.inc

View File

@@ -0,0 +1,11 @@
# Two-server replication setup; both servers carry identical options.
# NOTE(review): presumably this is the config of the semisync fail-over
# test, where each server acts as master and as slave in turn - confirm.
!include suite/rpl/rpl_1slave_base.cnf
!include include/default_client.cnf
[mysqld.1]
# Write replicated events to this server's own binary log as well.
log-slave-updates
# Enable strict GTID mode.
gtid-strict-mode=1
[mysqld.2]
# Same settings as mysqld.1.
log-slave-updates
gtid-strict-mode=1

View File

@@ -0,0 +1,144 @@
# ==== Purpose ====
#
# Test verifies replication failover scenario.
#
# ==== Implementation ====
#
# Steps:
# 0 - Having two servers 1 and 2, enable semi-sync replication
#     with the master wait point 'after_sync'.
# 1 - Insert a row. While inserting the second row simulate
#     a server crash as soon as the transaction is written to binlog,
#     flushed and synced but the binlog position is not updated.
# 2 - Post crash-recovery on the old master execute there CHANGE MASTER
#     TO command to connect to server id 2.
# 3 - The old master, now the new slave, server 1 must connect to the new
#     master server 2.
# 4 - Repeat the above to crash the new master and restore the old one
#     in its role.
#
# ==== References ====
#
# MDEV-21117: recovery for --rpl-semi-sync-slave-enabled server
--source include/have_innodb.inc
--source include/have_debug_sync.inc
--source include/have_binlog_format_row.inc
--source include/master-slave.inc
# Initial slave
--connection server_2
--source include/stop_slave.inc
# Initial master
# Start from an empty binlog and cap binlog files at 4096 bytes; the crash
# statements used later insert 4100-byte values.
--connection server_1
RESET MASTER;
SET @@global.max_binlog_size= 4096;
# Same binlog setup on server 2, then restart it as a semisync slave that
# replicates by GTID from an empty slave position.
--connection server_2
RESET MASTER;
SET @@global.max_binlog_size= 4096;
set @@global.rpl_semi_sync_slave_enabled = 1;
set @@global.gtid_slave_pos = "";
CHANGE MASTER TO master_use_gtid= slave_pos;
--source include/start_slave.inc
# Configure server 1 as the semisync master with the AFTER_SYNC wait point.
--connection server_1
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
set @@global.rpl_semi_sync_master_enabled = 1;
set @@global.rpl_semi_sync_master_wait_point=AFTER_SYNC;
# Suppress error-log messages that would otherwise fail the test;
# presumably they are produced by the deliberate server kills - confirm.
call mtr.add_suppression("Can.t init tc log");
call mtr.add_suppression("Aborting");
call mtr.add_suppression("1 client is using or hasn.t closed the table properly");
call mtr.add_suppression("Table './mtr/test_suppressions' is marked as crashed and should be repaired");
# Test table and a first row committed before any crash.
CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
INSERT INTO t1 VALUES (1, 'dummy1');
#
# CRASH the original master, and FAILOVER to the new
#
# value 1 for server id 1 -> 2 failover
--let $failover_to_slave=1
# Statement left in flight on the server that gets killed; its 4100-byte
# value exceeds the max_binlog_size of 4096 configured during setup.
--let $query_to_crash= INSERT INTO t1 VALUES (2, REPEAT("x", 4100))
# Pattern searched for in the crashed server's error log after restart.
--let $log_search_pattern=truncated binlog file:.*master.*000001
--source rpl_semi_sync_crash.inc
# Server 2 is the master now: insert a row there and remember its position.
--connection server_2
--let $rows_so_far=3
--eval INSERT INTO t1 VALUES ($rows_so_far, 'dummy3')
--save_master_pos
--echo # The gtid state on current master must be equal to ...
SHOW VARIABLES LIKE 'gtid_binlog_pos';
# Server 1, now the slave, must catch up and hold $rows_so_far rows.
--connection server_1
--sync_with_master
--eval SELECT COUNT(*) = $rows_so_far as 'true' FROM t1
--echo # ... the gtid states on the slave:
SHOW VARIABLES LIKE 'gtid_slave_pos';
SHOW VARIABLES LIKE 'gtid_binlog_pos';
# Switch to the current master, which is crashed in the next round.
--connection server_2
#
# CRASH the new master and FAILOVER back to the original
#
# value 0 for the reverse server id 2 -> 1 failover
--let $failover_to_slave=0
# Two statements are left in flight this time; the include file sends the
# second one on a separate connection.
--let $query_to_crash = INSERT INTO t1 VALUES (4, REPEAT("x", 4100))
--let $query2_to_crash= INSERT INTO t1 VALUES (5, REPEAT("x", 4100))
# NOTE(review): the recorded result for this round reports the pattern as
# NOT FOUND in mysqld.2.err - confirm that is the intended outcome.
--let $log_search_pattern=truncated binlog file:.*slave.*000001
--source rpl_semi_sync_crash.inc
# Server 1 is the master again: insert the final row and remember its position.
--connection server_1
--let $rows_so_far=6
--eval INSERT INTO t1 VALUES ($rows_so_far, 'Done')
--save_master_pos
--echo # The gtid state on current master must be equal to ...
SHOW VARIABLES LIKE 'gtid_binlog_pos';
# Server 2, now the slave, must catch up and hold $rows_so_far rows.
--connection server_2
--sync_with_master
--eval SELECT COUNT(*) = $rows_so_far as 'true' FROM t1
--echo # ... the gtid states on the slave:
SHOW VARIABLES LIKE 'gtid_slave_pos';
SHOW VARIABLES LIKE 'gtid_binlog_pos';
# Both servers must end up with identical contents of t1.
--let $diff_tables=server_1:t1, server_2:t1
--source include/diff_tables.inc
#
--echo # Cleanup
#
# Drop the test table on the current master (server 1) and let the slave
# (server 2) replicate the drop before replication is stopped.
--connection server_1
DROP TABLE t1;
--save_master_pos
--connection server_2
--sync_with_master
--source include/stop_slave.inc
# Server 1: switch semisync off, restore the default wait point, and clear
# its slave and binlog state.
--connection server_1
set @@global.rpl_semi_sync_master_enabled = 0;
set @@global.rpl_semi_sync_slave_enabled = 0;
set @@global.rpl_semi_sync_master_wait_point=default;
RESET SLAVE;
RESET MASTER;
# Server 2: switch semisync off and point it back at server 1 without GTID,
# so that server 2 is again the slave of server 1.
--connection server_2
set @@global.rpl_semi_sync_master_enabled = 0;
set @@global.rpl_semi_sync_slave_enabled = 0;
set @@global.rpl_semi_sync_master_wait_point=default;
evalp CHANGE MASTER TO master_host='127.0.0.1', master_port=$SERVER_MYPORT_1, master_user='root', master_use_gtid=no;
--source include/start_slave.inc
# NOTE(review): presumably the default connection was broken by the server
# restarts and must be re-established before rpl_end.inc runs - confirm.
connection default;
--enable_reconnect
--source include/wait_until_connected_again.inc
--source include/rpl_end.inc