mirror of
https://github.com/MariaDB/server.git
synced 2025-07-30 16:24:05 +03:00
When the I/O thread was stopped while copying a long transaction, and restarted,
Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it by moving the test for corruption to Start_log_event::exec_event(). Changed Rotate_log_event::exec_event() to not increment positions when the event is seen in the middle of a transaction. I did a separate commit in 4.1 (so this should not be merged to 4.0) because code is a bit different in 4.1. A test to see if the slave detects when the master died while writing a transaction to the binlog (uses a forged truncated binlog I made). sql/log_event.cc: When the I/O thread was stopped while copying a long transaction, and restarted, Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it by moving the test for corruption to Start_log_event::exec_event(). Changed Rotate_log_event::exec_event() to not increment positions when the event is seen in the middle of a transaction.
This commit is contained in:
14
mysql-test/r/rpl_trunc_binlog.result
Normal file
14
mysql-test/r/rpl_trunc_binlog.result
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
slave stop;
|
||||||
|
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
|
||||||
|
reset master;
|
||||||
|
reset slave;
|
||||||
|
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
|
||||||
|
slave start;
|
||||||
|
stop slave;
|
||||||
|
flush logs;
|
||||||
|
reset slave;
|
||||||
|
start slave;
|
||||||
|
show slave status;
|
||||||
|
Master_Host Master_User Master_Port Connect_retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_do_db Replicate_ignore_db Last_errno Last_error Skip_counter Exec_master_log_pos Relay_log_space
|
||||||
|
127.0.0.1 root MASTER_PORT 1 master-bin.002 4 slave-relay-bin.002 161 master-bin.001 Yes No 0 there is an unfinished transaction in the relay log (could find neither COMMIT nor ROLLBACK in the relay log); it could be that the master died while writing the transaction to its binary log. Now the slave is rolling back the transaction. 0 79 317
|
||||||
|
reset master;
|
BIN
mysql-test/std_data/trunc_binlog.001
Normal file
BIN
mysql-test/std_data/trunc_binlog.001
Normal file
Binary file not shown.
22
mysql-test/t/rpl_trunc_binlog.test
Normal file
22
mysql-test/t/rpl_trunc_binlog.test
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
# We are testing if a binlog which contains BEGIN but not COMMIT (the master died
|
||||||
|
# while writing the transaction to the binlog) triggers an error on slave.
|
||||||
|
# So we use such a truncated binlog and simulate that the master restarted after
|
||||||
|
# this.
|
||||||
|
|
||||||
|
source include/master-slave.inc;
|
||||||
|
|
||||||
|
connection slave;
|
||||||
|
stop slave;
|
||||||
|
connection master;
|
||||||
|
flush logs;
|
||||||
|
system mv -f var/log/master-bin.001 var/log/master-bin.002;
|
||||||
|
system cp std_data/trunc_binlog.001 var/log/master-bin.001;
|
||||||
|
connection slave;
|
||||||
|
reset slave;
|
||||||
|
start slave;
|
||||||
|
# can't sync_with_master so we must sleep
|
||||||
|
sleep 3;
|
||||||
|
--replace_result $MASTER_MYPORT MASTER_PORT
|
||||||
|
show slave status;
|
||||||
|
connection master;
|
||||||
|
reset master;
|
@ -2091,6 +2091,23 @@ int Start_log_event::exec_event(struct st_relay_log_info* rli)
|
|||||||
*/
|
*/
|
||||||
close_temporary_tables(thd);
|
close_temporary_tables(thd);
|
||||||
cleanup_load_tmpdir();
|
cleanup_load_tmpdir();
|
||||||
|
/*
|
||||||
|
As a transaction NEVER spans on 2 or more binlogs:
|
||||||
|
if we have an active transaction at this point, the master died while
|
||||||
|
writing the transaction to the binary log, i.e. while flushing the binlog
|
||||||
|
cache to the binlog. As the write was started, the transaction had been
|
||||||
|
committed on the master, so we lack the information to replay this
|
||||||
|
transaction on the slave; all we can do is stop with error.
|
||||||
|
*/
|
||||||
|
if (rli->inside_transaction)
|
||||||
|
{
|
||||||
|
slave_print_error(rli, 0,
|
||||||
|
"there is an unfinished transaction in the relay log \
|
||||||
|
(could find neither COMMIT nor ROLLBACK in the relay log); it could be that \
|
||||||
|
the master died while writing the transaction to its binary log. Now the slave \
|
||||||
|
is rolling back the transaction.");
|
||||||
|
return(1);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
/*
|
/*
|
||||||
Now the older formats; in that case load_tmpdir is cleaned up by the I/O
|
Now the older formats; in that case load_tmpdir is cleaned up by the I/O
|
||||||
@ -2166,51 +2183,34 @@ int Stop_log_event::exec_event(struct st_relay_log_info* rli)
|
|||||||
We can't rotate the slave as this will cause infinite rotations
|
We can't rotate the slave as this will cause infinite rotations
|
||||||
in an A -> B -> A setup.
|
in an A -> B -> A setup.
|
||||||
|
|
||||||
NOTES
|
|
||||||
As a transaction NEVER spans on 2 or more binlogs:
|
|
||||||
if we have an active transaction at this point, the master died while
|
|
||||||
writing the transaction to the binary log, i.e. while flushing the binlog
|
|
||||||
cache to the binlog. As the write was started, the transaction had been
|
|
||||||
committed on the master, so we lack of information to replay this
|
|
||||||
transaction on the slave; all we can do is stop with error.
|
|
||||||
If we didn't detect it, then positions would start to become garbage (as we
|
|
||||||
are incrementing rli->relay_log_pos whereas we are in a transaction: the new
|
|
||||||
rli->relay_log_pos will be
|
|
||||||
relay_log_pos of the BEGIN + size of the Rotate event = garbage.
|
|
||||||
|
|
||||||
Since MySQL 4.0.14, the master ALWAYS sends a Rotate event when it starts
|
|
||||||
sending the next binlog, so we are sure to receive a Rotate event just
|
|
||||||
after the end of the "dead master"'s binlog; so this exec_event() is the
|
|
||||||
right place to catch the problem. If we would wait until
|
|
||||||
Start_log_event::exec_event() it would be too late, rli->relay_log_pos would
|
|
||||||
already be garbage.
|
|
||||||
|
|
||||||
RETURN VALUES
|
RETURN VALUES
|
||||||
0 ok
|
0 ok
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int Rotate_log_event::exec_event(struct st_relay_log_info* rli)
|
int Rotate_log_event::exec_event(struct st_relay_log_info* rli)
|
||||||
{
|
{
|
||||||
char* log_name = rli->master_log_name;
|
|
||||||
DBUG_ENTER("Rotate_log_event::exec_event");
|
DBUG_ENTER("Rotate_log_event::exec_event");
|
||||||
|
|
||||||
pthread_mutex_lock(&rli->data_lock);
|
pthread_mutex_lock(&rli->data_lock);
|
||||||
|
/*
|
||||||
if (rli->inside_transaction)
|
If we are in a transaction: the only normal case is when the I/O thread was
|
||||||
|
copying a big transaction, then it was stopped and restarted: we have this
|
||||||
|
in the relay log:
|
||||||
|
BEGIN
|
||||||
|
...
|
||||||
|
ROTATE (a fake one)
|
||||||
|
...
|
||||||
|
COMMIT or ROLLBACK
|
||||||
|
In that case, we don't want to touch the coordinates which correspond to the
|
||||||
|
beginning of the transaction.
|
||||||
|
*/
|
||||||
|
if (!rli->inside_transaction)
|
||||||
{
|
{
|
||||||
slave_print_error(rli, 0,
|
memcpy(rli->master_log_name, new_log_ident, ident_len+1);
|
||||||
"there is an unfinished transaction in the relay log \
|
rli->master_log_pos= pos;
|
||||||
(could find neither COMMIT nor ROLLBACK in the relay log); it could be that \
|
DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
|
||||||
the master died while writing the transaction to its binary log. Now the slave \
|
|
||||||
is rolling back the transaction.");
|
|
||||||
pthread_mutex_unlock(&rli->data_lock);
|
|
||||||
DBUG_RETURN(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(log_name, new_log_ident, ident_len+1);
|
|
||||||
rli->master_log_pos = pos;
|
|
||||||
rli->relay_log_pos += get_event_len();
|
rli->relay_log_pos += get_event_len();
|
||||||
DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
|
|
||||||
pthread_mutex_unlock(&rli->data_lock);
|
pthread_mutex_unlock(&rli->data_lock);
|
||||||
pthread_cond_broadcast(&rli->data_cond);
|
pthread_cond_broadcast(&rli->data_cond);
|
||||||
flush_relay_log_info(rli);
|
flush_relay_log_info(rli);
|
||||||
|
Reference in New Issue
Block a user