From 21cc7d5a2582f5baa61c936beab72015027cfdc0 Mon Sep 17 00:00:00 2001 From: Andrei Elkin Date: Thu, 26 Mar 2009 10:25:06 +0200 Subject: [PATCH 1/2] Bug#38205 Row-based Replication (RBR) causes inconsistencies: HA_ERR_FOUND_DUP Bug#319 if while a non-transactional slave is replicating a transaction possible problem It is impossible to roll back a mixed engines transaction when one of the engine is non-transaction. In replication that fact is crucial because the slave can not safely re-apply a transction that was interrupted with STOP SLAVE. Fixed with making STOP SLAVE not be effective immediately in the case the current group of replication events has modified a non-transaction table. In order for slave to leave either the group needs finishing or the user issues KILL QUERY|CONNECTION slave_thread_id. mysql-test/suite/bugs/r/rpl_bug38205.result: bug#38205 non-deterministic part of tests results. mysql-test/suite/bugs/t/rpl_bug38205.test: bug#38205 non-deterministic part of tests. mysql-test/suite/rpl/r/rpl_start_stop_slave.result: bug#38205 deterministic part of tests results. mysql-test/suite/rpl/t/rpl_start_stop_slave-slave.opt: increasing `innodb_lock_wait_timeout' to make the test pass on slow env w/o timeout expired issue. mysql-test/suite/rpl/t/rpl_start_stop_slave.test: bug#38205 deterministic part of tests. sql/log_event.cc: Augmenting row-based events applying with the notion of thd->transaction.{all,stmt}.modified_non_trans_table. The pair is set and reset according to its specification for the mixed transaction processing. Particualry, once `modified_non_trans_table' is set in the row-events processing loop, it will remain till the commit of the transaction. sql/slave.cc: Consulting `thd->transaction.all.modified_non_trans_table' to decide whether to terminate by the sql thread or to continue even though the sql thread might have been STOP-ed (rli->abort_slave). --- mysql-test/suite/bugs/r/rpl_bug38205.result | 56 ++++++ mysql-test/suite/bugs/t/rpl_bug38205.test | 165 ++++++++++++++++++ .../suite/rpl/r/rpl_start_stop_slave.result | 28 +++ .../rpl/t/rpl_start_stop_slave-slave.opt | 1 + .../suite/rpl/t/rpl_start_stop_slave.test | 87 ++++++++- sql/log_event.cc | 11 +- sql/slave.cc | 3 + 7 files changed, 349 insertions(+), 2 deletions(-) create mode 100644 mysql-test/suite/bugs/r/rpl_bug38205.result create mode 100644 mysql-test/suite/bugs/t/rpl_bug38205.test create mode 100644 mysql-test/suite/rpl/t/rpl_start_stop_slave-slave.opt diff --git a/mysql-test/suite/bugs/r/rpl_bug38205.result b/mysql-test/suite/bugs/r/rpl_bug38205.result new file mode 100644 index 00000000000..195c0ca1135 --- /dev/null +++ b/mysql-test/suite/bugs/r/rpl_bug38205.result @@ -0,0 +1,56 @@ +stop slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +reset master; +reset slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +start slave; +create table t1i(n int primary key) engine=innodb; +create table t2m(n int primary key) engine=myisam; +begin; +insert into t1i values (1); +insert into t1i values (2); +insert into t1i values (3); +commit; +begin; +insert into t1i values (5); +begin; +insert into t1i values (4); +insert into t2m values (1); +update t1i set n = 5 where n = 4; +commit; +zero +0 +*** kill sql thread *** +rollback; +*** sql thread is *not* running: No *** +*** the prove: the killed slave has not finished the current transaction *** +three +3 +one +1 +zero +0 +delete from t2m; +start slave sql_thread; +delete from t1i; +delete from t2m; +begin; +insert into t1i values (5); +begin; +insert into t1i values (4); +update t1i set n = 5 where n = 4; +commit; +zero +0 +stop slave sql_thread; +rollback; +*** sql thread is running: No *** +*** the prove: the stopped slave has rolled back the current transaction *** +zero +0 +zero +0 +one +1 +start slave sql_thread; +drop table t1i, t2m; diff --git a/mysql-test/suite/bugs/t/rpl_bug38205.test b/mysql-test/suite/bugs/t/rpl_bug38205.test new file mode 100644 index 00000000000..f754d7481c7 --- /dev/null +++ b/mysql-test/suite/bugs/t/rpl_bug38205.test @@ -0,0 +1,165 @@ +# +# Bug #38205 Row-based Replication (RBR) causes inconsistencies: HA_ERR_FOUND_DUPP_KEY +# +# Verifying the fact that STOP SLAVE in the middle of a group execution waits +# for the end of the group before the slave sql thread will stop. +# The patch refines STOP SLAVE to not interrupt a transaction or other type of +# the replication events group (the part I). +# Killing the sql thread continues to provide a "hard" stop (the part II). +# +# Non-deterministic tests +# + +source include/master-slave.inc; +source include/have_innodb.inc; + + +# +# Part II, killed sql slave leaves instantly +# + +# A. multi-statement transaction as the replication group + +connection master; + +create table t1i(n int primary key) engine=innodb; +create table t2m(n int primary key) engine=myisam; + +sync_slave_with_master; + +connection master; + +begin; +insert into t1i values (1); +insert into t1i values (2); +insert into t1i values (3); +commit; + +sync_slave_with_master; + +# +# todo: first challenge is to find out the SQL thread id +# the following is not fully reliable +# + +let $id=`SELECT id from information_schema.processlist where user like 'system user' and state like '%Has read all relay log%' or user like 'system user' and state like '%Reading event from the relay log%'`; +connection slave; +begin; +insert into t1i values (5); + +connection master; +let $pos0_master= query_get_value(SHOW MASTER STATUS, Position, 1); +begin; +insert into t1i values (4); +insert into t2m values (1); # non-ta update +update t1i set n = 5 where n = 4; # to block at. can't be played with killed +commit; +let $pos1_master= query_get_value(SHOW MASTER STATUS, Position, 1); + +connection slave; +# slave sql thread must be locked out by the conn `slave' explicit lock +let $pos0_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); +--disable_query_log +eval select $pos0_master - $pos0_slave as zero; +--enable_query_log + +connection slave1; + +let $count= 1; +let $table= t2m; +source include/wait_until_rows_count.inc; +# +# todo: may fail as said above +# +--echo *** kill sql thread *** +--disable_query_log +eval kill connection $id; +--enable_query_log + +connection slave; +rollback; # release the sql thread + +connection slave1; + +source include/wait_for_slave_sql_to_stop.inc; +let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1); +--echo *** sql thread is *not* running: $sql_status *** +let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); + +connection slave; +--echo *** the prove: the killed slave has not finished the current transaction *** + +--disable_query_log +select count(*) as three from t1i; +eval select $pos1_master > $pos1_slave as one; +eval select $pos1_slave - $pos0_slave as zero; +--enable_query_log + +delete from t2m; # remove the row to be able to replay +start slave sql_thread; + +# +# Part I: B The homogenous transaction remains interuptable in between +# + +connection master; +delete from t1i; +delete from t2m; + +sync_slave_with_master; +begin; +insert into t1i values (5); + +connection master; +let $pos0_master= query_get_value(SHOW MASTER STATUS, Position, 1); +begin; +insert into t1i values (4); +update t1i set n = 5 where n = 4; # to block at. not to be played +commit; +let $pos1_master= query_get_value(SHOW MASTER STATUS, Position, 1); + + +connection slave1; +# slave sql can't advance as must be locked by the conn `slave' trans +let $pos0_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); +--disable_query_log +eval select $pos0_master - $pos0_slave as zero; +--enable_query_log + +# +# the replicated trans is blocked by the slave's local. +# However, it's not easy to catch the exact moment when it happens. +# The test issues sleep which makes the test either non-deterministic or +# wasting too much time. +# +--sleep 3 + +send stop slave sql_thread; + +connection slave; +rollback; # release the sql thread + +connection slave1; +reap; +source include/wait_for_slave_sql_to_stop.inc; +let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1); +--echo *** sql thread is running: $sql_status *** + +let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); + +--echo *** the prove: the stopped slave has rolled back the current transaction *** + +--disable_query_log +select count(*) as zero from t1i; +eval select $pos0_master - $pos0_slave as zero; +eval select $pos1_master > $pos0_slave as one; +--enable_query_log + +start slave sql_thread; + +# clean-up + +connection master; +drop table t1i, t2m; + +sync_slave_with_master; diff --git a/mysql-test/suite/rpl/r/rpl_start_stop_slave.result b/mysql-test/suite/rpl/r/rpl_start_stop_slave.result index 04ece812f35..130b65c704c 100644 --- a/mysql-test/suite/rpl/r/rpl_start_stop_slave.result +++ b/mysql-test/suite/rpl/r/rpl_start_stop_slave.result @@ -10,3 +10,31 @@ start slave; stop slave io_thread; start slave io_thread; drop table t1; +create table t1i(n int primary key) engine=innodb; +create table t2m(n int primary key) engine=myisam; +begin; +insert into t1i values (1); +insert into t1i values (2); +insert into t1i values (3); +commit; +begin; +insert into t1i values (5); +begin; +insert into t1i values (4); +insert into t2m values (1); +insert into t1i values (5); +commit; +zero +0 +stop slave; +rollback; +*** sql thread is running: No *** +*** the prove: the stopped slave has finished the current transaction *** +five +5 +zero +0 +one +1 +start slave; +drop table t1i, t2m; diff --git a/mysql-test/suite/rpl/t/rpl_start_stop_slave-slave.opt b/mysql-test/suite/rpl/t/rpl_start_stop_slave-slave.opt new file mode 100644 index 00000000000..00ea161cd6e --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_start_stop_slave-slave.opt @@ -0,0 +1 @@ +--innodb_lock_wait_timeout=60 diff --git a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test index dbf0775c978..529ed7acb32 100644 --- a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test +++ b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test @@ -1,4 +1,5 @@ source include/master-slave.inc; +source include/have_innodb.inc; # # Bug#6148 () @@ -35,4 +36,88 @@ save_master_pos; connection slave; sync_with_master; -# End of 4.1 tests + +# +# Bug #38205 Row-based Replication (RBR) causes inconsistencies... +# +# Verifying that STOP SLAVE does not interrupt excution of a group +# execution of events if the group can not roll back. +# Killing the sql thread continues to provide a "hard" stop (the +# part II, moved to the bugs suite as it's hard to make it +# deterministic with KILL). +# + +# +# Part I. The being stopped sql thread finishes first the current group of +# events if the group contains an event on a non-transaction table. + +connection master; +create table t1i(n int primary key) engine=innodb; +create table t2m(n int primary key) engine=myisam; +begin; +insert into t1i values (1); +insert into t1i values (2); +insert into t1i values (3); +commit; + +sync_slave_with_master; +connection slave; +begin; +insert into t1i values (5); + +connection master; +let $pos0_master= query_get_value(SHOW MASTER STATUS, Position, 1); +begin; +insert into t1i values (4); +insert into t2m values (1); # non-ta update to process +insert into t1i values (5); # to block at. to be played with stopped +commit; + +connection slave; +# slave sql thread must be locked out by the conn `slave' explicit lock +let $pos0_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); +--disable_query_log +eval select $pos0_master - $pos0_slave as zero; +--enable_query_log + +connection slave1; +let $count= 1; +let $table= t2m; +source include/wait_until_rows_count.inc; +send stop slave; + +connection slave; +rollback; # release the sql thread + +connection slave1; +reap; +source include/wait_for_slave_to_stop.inc; +let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1); +--echo *** sql thread is running: $sql_status *** + + +connection master; +let $pos1_master= query_get_value(SHOW MASTER STATUS, Position, 1); + +connection slave; +source include/wait_for_slave_sql_to_stop.inc; + +let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); + +--echo *** the prove: the stopped slave has finished the current transaction *** + +--disable_query_log +select count(*) as five from t1i; +eval select $pos1_master - $pos1_slave as zero; +eval select $pos1_slave > $pos0_slave as one; +--enable_query_log + +start slave; + +# clean-up +connection master; +drop table t1i, t2m; + +sync_slave_with_master; + +# End of tests diff --git a/sql/log_event.cc b/sql/log_event.cc index 1941a304e8e..9dfc7ed3198 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -7114,7 +7114,12 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) */ lex_start(thd); mysql_reset_thd_for_next_command(thd); - + /* + The current statement is just about to begin and + has not yet modified anything. Note, all.modified is reset + by mysql_reset_thd_for_next_command. + */ + thd->transaction.stmt.modified_non_trans_table= FALSE; /* Check if the slave is set to use SBR. If so, it should switch to using RBR until the end of the "statement", i.e., next @@ -7217,6 +7222,7 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) if (table) { + bool transactional_table= table->file->has_transactions(); /* table == NULL means that this table should not be replicated (this was set up by Table_map_log_event::do_apply_event() @@ -7348,6 +7354,9 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) m_curr_row= m_curr_row_end; + if (error == 0 && !transactional_table) + thd->transaction.all.modified_non_trans_table= + thd->transaction.stmt.modified_non_trans_table= TRUE; } // row processing loop DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event", diff --git a/sql/slave.cc b/sql/slave.cc index 22c61b3ec6c..5a797328bc4 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -687,6 +687,9 @@ static bool sql_slave_killed(THD* thd, Relay_log_info* rli) DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun if (abort_loop || thd->killed || rli->abort_slave) { + if (rli->abort_slave && rli->is_in_group() && + thd->transaction.all.modified_non_trans_table) + DBUG_RETURN(0); /* If we are in an unsafe situation (stopping could corrupt replication), we give one minute to the slave SQL thread of grace before really From a3299de2f7a8ec8a256274f8af889a052cade55b Mon Sep 17 00:00:00 2001 From: Andrei Elkin Date: Thu, 9 Apr 2009 16:05:41 +0300 Subject: [PATCH 2/2] Bug #38205 Row-based Replication (RBR) causes inconsistencies: HA_ERR_FOUND_DUPP_KEY Bug#319 if while a non-transactional slave is replicating a transaction possible problem only testing related: addressing reviewers' comments. mysql-test/suite/bugs/r/rpl_bug38205.result: new results. mysql-test/suite/bugs/t/rpl_bug38205.test: changing output per reviewer's offering. mysql-test/suite/rpl/r/rpl_start_stop_slave.result: new results. mysql-test/suite/rpl/t/rpl_start_stop_slave.test: refining tests for Bug #38205, Bug#319 to start slave with the necessary synchronization; changing output per reviewer's offering; removing redundant wait for stop. --- mysql-test/suite/bugs/r/rpl_bug38205.result | 2 +- mysql-test/suite/bugs/t/rpl_bug38205.test | 3 ++- mysql-test/suite/rpl/r/rpl_start_stop_slave.result | 4 ++-- mysql-test/suite/rpl/t/rpl_start_stop_slave.test | 8 ++++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mysql-test/suite/bugs/r/rpl_bug38205.result b/mysql-test/suite/bugs/r/rpl_bug38205.result index 195c0ca1135..8f1dee344fa 100644 --- a/mysql-test/suite/bugs/r/rpl_bug38205.result +++ b/mysql-test/suite/bugs/r/rpl_bug38205.result @@ -44,7 +44,7 @@ zero 0 stop slave sql_thread; rollback; -*** sql thread is running: No *** +*** sql thread is *not* running: No *** *** the prove: the stopped slave has rolled back the current transaction *** zero 0 diff --git a/mysql-test/suite/bugs/t/rpl_bug38205.test b/mysql-test/suite/bugs/t/rpl_bug38205.test index f754d7481c7..52b36636e79 100644 --- a/mysql-test/suite/bugs/t/rpl_bug38205.test +++ b/mysql-test/suite/bugs/t/rpl_bug38205.test @@ -1,5 +1,6 @@ # # Bug #38205 Row-based Replication (RBR) causes inconsistencies: HA_ERR_FOUND_DUPP_KEY +# Bug#319 if while a non-transactional slave is replicating a transaction possible problem # # Verifying the fact that STOP SLAVE in the middle of a group execution waits # for the end of the group before the slave sql thread will stop. @@ -143,7 +144,7 @@ connection slave1; reap; source include/wait_for_slave_sql_to_stop.inc; let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1); ---echo *** sql thread is running: $sql_status *** +--echo *** sql thread is *not* running: $sql_status *** let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); diff --git a/mysql-test/suite/rpl/r/rpl_start_stop_slave.result b/mysql-test/suite/rpl/r/rpl_start_stop_slave.result index 130b65c704c..e2b1935c268 100644 --- a/mysql-test/suite/rpl/r/rpl_start_stop_slave.result +++ b/mysql-test/suite/rpl/r/rpl_start_stop_slave.result @@ -28,7 +28,7 @@ zero 0 stop slave; rollback; -*** sql thread is running: No *** +*** sql thread is *not* running: No *** *** the prove: the stopped slave has finished the current transaction *** five 5 @@ -36,5 +36,5 @@ zero 0 one 1 -start slave; +include/start_slave.inc drop table t1i, t2m; diff --git a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test index 529ed7acb32..d9b87427321 100644 --- a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test +++ b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test @@ -38,7 +38,8 @@ sync_with_master; # -# Bug #38205 Row-based Replication (RBR) causes inconsistencies... +# Bug#38205 Row-based Replication (RBR) causes inconsistencies... +# Bug#319 if while a non-transactional slave is replicating a transaction... # # Verifying that STOP SLAVE does not interrupt excution of a group # execution of events if the group can not roll back. @@ -93,14 +94,13 @@ connection slave1; reap; source include/wait_for_slave_to_stop.inc; let $sql_status= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1); ---echo *** sql thread is running: $sql_status *** +--echo *** sql thread is *not* running: $sql_status *** connection master; let $pos1_master= query_get_value(SHOW MASTER STATUS, Position, 1); connection slave; -source include/wait_for_slave_sql_to_stop.inc; let $pos1_slave= query_get_value(SHOW SLAVE STATUS, Exec_Master_Log_Pos, 1); @@ -112,7 +112,7 @@ eval select $pos1_master - $pos1_slave as zero; eval select $pos1_slave > $pos0_slave as one; --enable_query_log -start slave; +source include/start_slave.inc; # clean-up connection master;