When the I/O thread was stopped while copying a long transaction, and restarted,

Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it by moving the test for corruption to Start_log_event::exec_event(). Changed Rotate_log_event::exec_event() to not increment positions when the event is seen in the middle of a transaction. I did a separate commit in 4.1 (so this should not be merged to 4.0) because code is a bit different in 4.1. A test to see if the slave detects when the master died while writing a transaction to the binlog (uses a forged truncated binlog I made). sql/log_event.cc: When the I/O thread was stopped while copying a long transaction, and restarted, Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it by moving the test for corruption to Start_log_event::exec_event(). Changed Rotate_log_event::exec_event() to not increment positions when the event is seen in the middle of a transaction.
2025-07-30 16:24:05 +03:00 · 2003-10-03 22:13:01 +02:00
parent 7a58bfee61
commit 1bd7662b89
4 changed files with 69 additions and 33 deletions
--- a/mysql-test/r/rpl_trunc_binlog.result
+++ b/mysql-test/r/rpl_trunc_binlog.result
@ -0,0 +1,14 @@
 slave stop;
 drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
 reset master;
 reset slave;
 drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
 slave start;
 stop slave;
 flush logs;
 reset slave;
 start slave;
 show slave status;
 Master_Host	Master_User	Master_Port	Connect_retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_do_db	Replicate_ignore_db	Last_errno	Last_error	Skip_counter	Exec_master_log_pos	Relay_log_space
 127.0.0.1	root	MASTER_PORT	1	master-bin.002	4	slave-relay-bin.002	161	master-bin.001	Yes	No			0	there is an unfinished transaction in the relay log (could find neither COMMIT nor ROLLBACK in the relay log); it could be that the master died while writing the transaction to its binary log. Now the slave is rolling back the transaction.	0	79	317
 reset master;
--- a/mysql-test/std_data/trunc_binlog.001
+++ b/mysql-test/std_data/trunc_binlog.001
--- a/mysql-test/t/rpl_trunc_binlog.test
+++ b/mysql-test/t/rpl_trunc_binlog.test
@ -0,0 +1,22 @@
 # We are testing if a binlog which contains BEGIN but not COMMIT (the master died
 # while writing the transaction to the binlog) triggers an error on slave.
 # So we use such a truncated binlog and simulate that the master restarted after
 # this.
 source include/master-slave.inc;
 connection slave;
 stop slave;
 connection master;
 flush logs;
 system mv -f var/log/master-bin.001 var/log/master-bin.002;
 system cp std_data/trunc_binlog.001 var/log/master-bin.001;
 connection slave;
 reset slave;
 start slave;
 # can't sync_with_master so we must sleep
 sleep 3;
 --replace_result $MASTER_MYPORT MASTER_PORT
 show slave status;
 connection master;
 reset master;
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@ -2091,6 +2091,23 @@ int Start_log_event::exec_event(struct st_relay_log_info* rli)
    */
    close_temporary_tables(thd);
    cleanup_load_tmpdir();
    /*
      As a transaction NEVER spans on 2 or more binlogs:
      if we have an active transaction at this point, the master died while
      writing the transaction to the binary log, i.e. while flushing the binlog
      cache to the binlog. As the write was started, the transaction had been
    committed on the master, so we lack the information to replay this
      transaction on the slave; all we can do is stop with error.
    */
    if (rli->inside_transaction)
    {
      slave_print_error(rli, 0,
                        "there is an unfinished transaction in the relay log \
 (could find neither COMMIT nor ROLLBACK in the relay log); it could be that \
 the master died while writing the transaction to its binary log. Now the slave \
 is rolling back the transaction.");
      return(1);
    }
    break;
  /* 
     Now the older formats; in that case load_tmpdir is cleaned up by the I/O
@ -2166,51 +2183,34 @@ int Stop_log_event::exec_event(struct st_relay_log_info* rli)
    We can't rotate the slave as this will cause infinite rotations
    in an A -> B -> A setup.
  NOTES
    As a transaction NEVER spans on 2 or more binlogs:
    if we have an active transaction at this point, the master died while
    writing the transaction to the binary log, i.e. while flushing the binlog
    cache to the binlog. As the write was started, the transaction had been
    committed on the master, so we lack the information to replay this
    transaction on the slave; all we can do is stop with error.
    If we didn't detect it, then positions would start to become garbage (as we
    are incrementing rli->relay_log_pos whereas we are in a transaction: the new
    rli->relay_log_pos will be
    relay_log_pos of the BEGIN + size of the Rotate event = garbage).
    Since MySQL 4.0.14, the master ALWAYS sends a Rotate event when it starts
    sending the next binlog, so we are sure to receive a Rotate event just
    after the end of the "dead master"'s binlog; so this exec_event() is the
    right place to catch the problem. If we would wait until
    Start_log_event::exec_event() it would be too late, rli->relay_log_pos would
    already be garbage.
  RETURN VALUES
    0	ok
 */
 int Rotate_log_event::exec_event(struct st_relay_log_info* rli)
 {
  char* log_name = rli->master_log_name;
  DBUG_ENTER("Rotate_log_event::exec_event");
  pthread_mutex_lock(&rli->data_lock);
-
+  /*
-  if (rli->inside_transaction)
+    If we are in a transaction: the only normal case is when the I/O thread was
    copying a big transaction, then it was stopped and restarted: we have this
    in the relay log:
    BEGIN
    ...
    ROTATE (a fake one)
    ...
    COMMIT or ROLLBACK
    In that case, we don't want to touch the coordinates which correspond to the
    beginning of the transaction.
  */
  if (!rli->inside_transaction)
  {
-    slave_print_error(rli, 0,
+    memcpy(rli->master_log_name, new_log_ident, ident_len+1);
-                      "there is an unfinished transaction in the relay log \
+    rli->master_log_pos= pos;
-(could find neither COMMIT nor ROLLBACK in the relay log); it could be that \
+    DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
 the master died while writing the transaction to its binary log. Now the slave \
 is rolling back the transaction.");
    pthread_mutex_unlock(&rli->data_lock);
    DBUG_RETURN(1);
  }
  memcpy(log_name, new_log_ident, ident_len+1);
  rli->master_log_pos = pos;
  rli->relay_log_pos += get_event_len();
  DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
  pthread_mutex_unlock(&rli->data_lock);
  pthread_cond_broadcast(&rli->data_cond);
  flush_relay_log_info(rli);