From 219e6a6438a531802b331c69e697eec83ba8648c Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 20 Oct 2007 20:16:12 +0200 Subject: [PATCH] BUG#24860 (Incorrect SLAVE_TRANSACTION_RETRIES code can result in slave stuck): If a temporary error occured inside a group on an event that was not the first event of the group, the slave could get stuck because the retry counter is reset whenever an event was executed successfully. This patch only reset the retry counter when an entire group has been successfully executed, or failed with a non-transient error. sql/slave.cc: Adding debug printouts to every place where Relay_log_info::trans_retries is changed and to has_temporary_error() to print the error when returning. Adding debug variable all_errors_are_temporary_errors to make all errors for slave thread temporary errors. Adding code so that the Relay_log_info::trans_retries is only reset when an entire group was sucessfully executed, or a non-temporary error occured. mysql-test/suite/rpl/r/rpl_temporary_errors.result: New BitKeeper file ``mysql-test/suite/rpl/r/rpl_temporary_errors.result'' mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt: New BitKeeper file ``mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt'' mysql-test/suite/rpl/t/rpl_temporary_errors.test: New BitKeeper file ``mysql-test/suite/rpl/t/rpl_temporary_errors.test'' --- .../suite/rpl/r/rpl_temporary_errors.result | 81 +++++++++++++++++++ .../rpl/t/rpl_temporary_errors-slave.opt | 3 + .../suite/rpl/t/rpl_temporary_errors.test | 29 +++++++ sql/slave.cc | 26 ++++-- 4 files changed, 134 insertions(+), 5 deletions(-) create mode 100644 mysql-test/suite/rpl/r/rpl_temporary_errors.result create mode 100644 mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt create mode 100644 mysql-test/suite/rpl/t/rpl_temporary_errors.test diff --git a/mysql-test/suite/rpl/r/rpl_temporary_errors.result b/mysql-test/suite/rpl/r/rpl_temporary_errors.result new file mode 100644 index 00000000000..31c8930ec4f --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_temporary_errors.result @@ -0,0 +1,81 @@ +stop slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +reset master; +reset slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +start slave; +**** On Master **** +SET SESSION BINLOG_FORMAT=ROW; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT); +INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4); +**** On Slave **** +SHOW STATUS LIKE 'Slave_retried_transactions'; +Variable_name Value +Slave_retried_transactions 0 +UPDATE t1 SET a = 5, b = 47 WHERE a = 1; +SELECT * FROM t1; +a b +5 47 +2 2 +3 3 +4 4 +**** On Master **** +UPDATE t1 SET a = 5, b = 5 WHERE a = 1; +SELECT * FROM t1; +a b +5 5 +2 2 +3 3 +4 4 +**** On Slave **** +SHOW STATUS LIKE 'Slave_retried_transactions'; +Variable_name Value +Slave_retried_transactions 2 +SELECT * FROM t1; +a b +5 47 +2 2 +3 3 +4 4 +SHOW SLAVE STATUS; +Slave_IO_State # +Master_Host 127.0.0.1 +Master_User root +Master_Port MASTER_PORT +Connect_Retry 1 +Master_Log_File master-bin.000001 +Read_Master_Log_Pos 408 +Relay_Log_File # +Relay_Log_Pos # +Relay_Master_Log_File master-bin.000001 +Slave_IO_Running Yes +Slave_SQL_Running No +Replicate_Do_DB +Replicate_Ignore_DB +Replicate_Do_Table +Replicate_Ignore_Table # +Replicate_Wild_Do_Table +Replicate_Wild_Ignore_Table +Last_Errno 1032 +Last_Error Error in Update_rows event: error during transaction execution on table test.t1. Can't find record in 't1' +Skip_Counter 0 +Exec_Master_Log_Pos 318 +Relay_Log_Space # +Until_Condition None +Until_Log_File +Until_Log_Pos 0 +Master_SSL_Allowed No +Master_SSL_CA_File +Master_SSL_CA_Path +Master_SSL_Cert +Master_SSL_Cipher +Master_SSL_Key +Seconds_Behind_Master # +Master_SSL_Verify_Server_Cert No +Last_IO_Errno # +Last_IO_Error # +Last_SQL_Errno 1032 +Last_SQL_Error Error in Update_rows event: error during transaction execution on table test.t1. Can't find record in 't1' +DROP TABLE t1; +**** On Master **** +DROP TABLE t1; diff --git a/mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt b/mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt new file mode 100644 index 00000000000..80c171170f6 --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt @@ -0,0 +1,3 @@ +--loose-debug="+d,all_errors_are_temporary_errors" --slave-transaction-retries=2 + + diff --git a/mysql-test/suite/rpl/t/rpl_temporary_errors.test b/mysql-test/suite/rpl/t/rpl_temporary_errors.test new file mode 100644 index 00000000000..0ca515fff40 --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_temporary_errors.test @@ -0,0 +1,29 @@ +source include/master-slave.inc; + +--echo **** On Master **** +connection master; +SET SESSION BINLOG_FORMAT=ROW; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT); +INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4); +--echo **** On Slave **** +sync_slave_with_master; +SHOW STATUS LIKE 'Slave_retried_transactions'; +UPDATE t1 SET a = 5, b = 47 WHERE a = 1; +SELECT * FROM t1; +--echo **** On Master **** +connection master; +UPDATE t1 SET a = 5, b = 5 WHERE a = 1; +save_master_pos; +SELECT * FROM t1; +#SHOW BINLOG EVENTS; +--echo **** On Slave **** +connection slave; +source include/wait_for_slave_sql_to_stop.inc; +SHOW STATUS LIKE 'Slave_retried_transactions'; +SELECT * FROM t1; +source include/show_slave_status.inc; +DROP TABLE t1; + +--echo **** On Master **** +connection master; +DROP TABLE t1; diff --git a/sql/slave.cc b/sql/slave.cc index 9780ae70c5b..169cf897e6a 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -1714,7 +1714,14 @@ static int has_temporary_error(THD *thd) DBUG_ENTER("has_temporary_error"); if (thd->is_fatal_error) + { + DBUG_PRINT("info", ("thd->net.last_errno: %s", ER(thd->net.last_errno))); DBUG_RETURN(0); + } + + DBUG_EXECUTE_IF("all_errors_are_temporary_errors", + if (thd->net.last_errno) + thd->net.last_errno= ER_LOCK_DEADLOCK;); /* Temporary error codes: @@ -1723,7 +1730,10 @@ static int has_temporary_error(THD *thd) */ if (thd->net.last_errno == ER_LOCK_DEADLOCK || thd->net.last_errno == ER_LOCK_WAIT_TIMEOUT) + { + DBUG_PRINT("info", ("thd->net.last_errno: %s", ER(thd->net.last_errno))); DBUG_RETURN(1); + } #ifdef HAVE_NDB_BINLOG /* @@ -1905,7 +1915,8 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli) } if (slave_trans_retries) { - if (exec_res && has_temporary_error(thd)) + int temp_err; + if (exec_res && (temp_err= has_temporary_error(thd))) { const char *errmsg; /* @@ -1953,15 +1964,19 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli) "the slave_transaction_retries variable.", slave_trans_retries); } - else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) + else if (exec_res && !temp_err || + (opt_using_transactions && + rli->group_relay_log_pos == rli->event_relay_log_pos)) { /* - Only reset the retry counter if the event succeeded or - failed with a non-transient error. On a successful event, - the execution will proceed as usual; in the case of a + Only reset the retry counter if the entire group succeeded + or failed with a non-transient error. On a successful + event, the execution will proceed as usual; in the case of a non-transient error, the slave will stop with an error. */ rli->trans_retries= 0; // restart from fresh + DBUG_PRINT("info", ("Resetting retry counter, rli->trans_retries: %d", + rli->trans_retries)); } } DBUG_RETURN(exec_res); @@ -2450,6 +2465,7 @@ pthread_handler_t handle_slave_sql(void *arg) rli->ignore_log_space_limit= 0; pthread_mutex_unlock(&rli->log_space_lock); rli->trans_retries= 0; // start from "no error" + DBUG_PRINT("info", ("rli->trans_retries: %d", rli->trans_retries)); if (init_relay_log_pos(rli, rli->group_relay_log_name,