From a3cbc44b24ec467f33e445f57e2022e038b88623 Mon Sep 17 00:00:00 2001 From: sjaakola Date: Tue, 12 Sep 2023 02:37:30 +0200 Subject: [PATCH 1/4] MDEV-31833 replication breaks when using optimistic replication and replica is a galera node The MariaDB async replication SQL thread was stopped on any failure to apply a replication event, and the error message logged for the failure was: "Node has dropped from cluster". The assumption was that an event applying failure is always due to the node dropping out of the cluster. With optimistic parallel replication, event applying can fail for natural reasons, and applying should be retried to handle the failure. This retry logic was never exercised, because the slave SQL thread was stopped on the first apply failure. To support the retry logic of optimistic parallel replication, this commit now skips the replication slave abort if the node remains in the cluster (wsrep_ready==ON) and replication is configured for optimistic or aggressive retry logic. During the development of this fix, the galera.galera_as_slave_nonprim test showed some problems. The test was analyzed, and it appears to need some attention. One excessive sleep command was removed in this commit, but the test still needs more fixes to be fully deterministic. After this commit, galera_as_slave_nonprim is successful, though. 
Signed-off-by: Julius Goryavsky --- .../suite/galera/t/galera_as_slave_nonprim.test | 2 -- sql/log_event_server.cc | 3 ++- sql/rpl_parallel.cc | 15 +++++++++++++++ sql/slave.cc | 16 +++++++++++++--- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/mysql-test/suite/galera/t/galera_as_slave_nonprim.test b/mysql-test/suite/galera/t/galera_as_slave_nonprim.test index 0d878db29b8..e22f0240a59 100644 --- a/mysql-test/suite/galera/t/galera_as_slave_nonprim.test +++ b/mysql-test/suite/galera/t/galera_as_slave_nonprim.test @@ -27,7 +27,6 @@ SET SESSION wsrep_sync_wait = 0; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; --connection node_2 ---sleep 1 --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc @@ -47,7 +46,6 @@ INSERT INTO t1 VALUES (1),(2),(3),(4),(5); --connection node_2 --sleep 5 - --let $value = query_get_value(SHOW SLAVE STATUS, Last_SQL_Error, 1) --connection node_1 --disable_query_log diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index be33607f960..1c0dee41122 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -5728,7 +5728,8 @@ int Rows_log_event::do_apply_event(rpl_group_info *rgi) ignored_error_code(actual_error) : 0); #ifdef WITH_WSREP - if (WSREP(thd) && wsrep_ignored_error_code(this, actual_error)) + if (WSREP(thd) && thd->wsrep_applier && + wsrep_ignored_error_code(this, actual_error)) { idempotent_error= true; thd->wsrep_has_ignored_error= true; diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 41c99386a1f..affbdbdeab8 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -820,6 +820,10 @@ do_retry: event_count= 0; err= 0; errmsg= NULL; +#ifdef WITH_WSREP + thd->wsrep_cs().reset_error(); + WSREP_DEBUG("retrying async replication event"); +#endif /* WITH_WSREP */ /* If we already started committing before getting the deadlock (or other @@ -919,6 +923,7 @@ do_retry: err= rgi->worker_error= 1; 
my_error(ER_PRIOR_COMMIT_FAILED, MYF(0)); mysql_mutex_unlock(&entry->LOCK_parallel_entry); + goto err; } mysql_mutex_unlock(&entry->LOCK_parallel_entry); @@ -960,7 +965,17 @@ do_retry: possibility of an old deadlock kill lingering on beyond this point. */ thd->reset_killed(); +#ifdef WITH_WSREP + if (wsrep_before_command(thd)) + { + WSREP_WARN("Parallel slave worker failed at wsrep_before_command() hook"); + err= 1; + goto err; + } + wsrep_start_trx_if_not_started(thd); + WSREP_DEBUG("parallel slave retry, after trx start"); +#endif /* WITH_WSREP */ strmake_buf(log_name, ir->name); if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) { diff --git a/sql/slave.cc b/sql/slave.cc index 0948ae1d251..88bc4d49436 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -3878,9 +3878,19 @@ apply_event_and_update_pos_apply(Log_event* ev, THD* thd, rpl_group_info *rgi, default: WSREP_DEBUG("SQL apply failed, res %d conflict state: %s", exec_res, wsrep_thd_transaction_state_str(thd)); - rli->abort_slave= 1; - rli->report(ERROR_LEVEL, ER_UNKNOWN_COM_ERROR, rgi->gtid_info(), - "Node has dropped from cluster"); + /* + async replication thread should be stopped, if failure was + not due to optimistic parallel applying or if node + has dropped from cluster + */ + if (thd->system_thread == SYSTEM_THREAD_SLAVE_SQL && + ((rli->mi->using_parallel() && + rli->mi->parallel_mode <= SLAVE_PARALLEL_CONSERVATIVE) || + wsrep_ready == 0)) { + rli->abort_slave= 1; + rli->report(ERROR_LEVEL, ER_UNKNOWN_COM_ERROR, rgi->gtid_info(), + "Node has dropped from cluster"); + } break; } mysql_mutex_unlock(&thd->LOCK_thd_data); From d20a4da23d4189aba263ad54128375bf3e110206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 12 Sep 2023 15:16:31 +0300 Subject: [PATCH 2/4] MDEV-32150 InnoDB reports corruption on 32-bit platforms with ibd files sizes > 4GB buf_read_page_low(): Use 64-bit arithmetics when computing the file byte offset. 
In other calls to fil_space_t::io() the offset was being computed correctly, for example by buf_page_t::physical_offset(). --- storage/innobase/buf/buf0rea.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 822c424fe46..c81017bb024 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -335,7 +335,8 @@ nothing_read: auto fio = space->io(IORequest(sync ? IORequest::READ_SYNC : IORequest::READ_ASYNC), - page_id.page_no() * len, len, dst, bpage); + os_offset_t{page_id.page_no()} * len, len, + dst, bpage); *err= fio.err; if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { From 9e9cefde2a410c88a71fd7d07a918bce9e506db7 Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Wed, 13 Sep 2023 12:10:43 +0200 Subject: [PATCH 3/4] post-merge fix --- include/mysql/service_my_crypt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mysql/service_my_crypt.h b/include/mysql/service_my_crypt.h index ac8e427231d..1a4ebcfd067 100644 --- a/include/mysql/service_my_crypt.h +++ b/include/mysql/service_my_crypt.h @@ -45,7 +45,7 @@ extern "C" { /* The max key length of all supported algorithms */ #define MY_AES_MAX_KEY_LENGTH 32 -#define MY_AES_CTX_SIZE 1024 +#define MY_AES_CTX_SIZE 1040 enum my_aes_mode { MY_AES_ECB, MY_AES_CBC From 7de0c7b569af9120d8df38f7bf5931853901a3b2 Mon Sep 17 00:00:00 2001 From: Brandon Nesterenko Date: Thu, 24 Aug 2023 10:42:17 -0600 Subject: [PATCH 4/4] MDEV-31038: rpl.rpl_xa_prepare_gtid_fail clean up - Removed commented out and unused lines. 
- Updated test to reference true failure of timeout rather than deadlock - Switched save variables from MTR to user - Forced relay-log purge to not potentially re-execute an already prepared transaction --- .../rpl/r/rpl_xa_prepare_gtid_fail.result | 20 +++++------ .../suite/rpl/t/rpl_xa_prepare_gtid_fail.test | 36 +++++++++---------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result b/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result index f3fecbda349..dd0d132471e 100644 --- a/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result +++ b/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result @@ -2,14 +2,14 @@ include/master-slave.inc [connection master] connection slave; include/stop_slave.inc +set @save_par_thds= @@global.slave_parallel_threads; +set @save_strict_mode= @@global.gtid_strict_mode; +set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout; change master to master_use_gtid=slave_pos; set @@global.slave_parallel_threads= 4; set @@global.slave_parallel_mode= optimistic; set @@global.gtid_strict_mode=ON; -set sql_log_bin= 0; -alter table mysql.gtid_slave_pos engine=innodb; -call mtr.add_suppression("Deadlock found.*"); -set sql_log_bin= 1; +set statement sql_log_bin=0 for alter table mysql.gtid_slave_pos engine=innodb; include/start_slave.inc connection master; create table t1 (a int primary key, b int) engine=innodb; @@ -27,25 +27,25 @@ xa end '1'; xa prepare '1'; xa commit '1'; include/save_master_gtid.inc -connection slave; connection slave1; BEGIN; SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=100 FOR UPDATE; domain_id sub_id server_id seq_no connection slave; include/start_slave.inc -include/wait_for_slave_sql_error.inc [errno=1942,1213] +include/wait_for_slave_sql_error.inc [errno=1942] +include/stop_slave_io.inc connection slave1; ROLLBACK; # Cleanup connection master; drop table t1; connection slave; -include/stop_slave.inc +# TODO: Remove after fixing 
MDEV-21777 set @@global.gtid_slave_pos= "0-1-100"; -set @@global.slave_parallel_threads= 0; -set @@global.gtid_strict_mode= 0; -set @@global.innodb_lock_wait_timeout= 50; +set @@global.slave_parallel_threads= @save_par_thds; +set @@global.gtid_strict_mode= @save_strict_mode; +set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout; include/start_slave.inc include/rpl_end.inc # End of rpl_xa_prepare_gtid_fail.test diff --git a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test index 8042b355754..aa1b088ed23 100644 --- a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test +++ b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test @@ -6,8 +6,8 @@ # GTID slave state, then the slave should immediately quit in error, without # retry. # -# This tests validates the above behavior by simulating a deadlock on the -# GTID slave state table during the second part of XA PREPARE's commit, to +# This tests validates the above behavior by forcing a lock-wait timeout on +# the GTID slave state table during the second part of XA PREPARE's commit, to # ensure that the appropriate error is reported and the transaction was never # retried. 
# @@ -23,23 +23,19 @@ source include/have_innodb.inc; --connection slave --source include/stop_slave.inc ---let $save_par_thds= `SELECT @@global.slave_parallel_threads` ---let $save_strict_mode= `SELECT @@global.gtid_strict_mode` ---let $save_innodb_lock_wait_timeout= `SELECT @@global.innodb_lock_wait_timeout` +set @save_par_thds= @@global.slave_parallel_threads; +set @save_strict_mode= @@global.gtid_strict_mode; +set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout; change master to master_use_gtid=slave_pos; set @@global.slave_parallel_threads= 4; set @@global.slave_parallel_mode= optimistic; set @@global.gtid_strict_mode=ON; -set sql_log_bin= 0; -alter table mysql.gtid_slave_pos engine=innodb; -call mtr.add_suppression("Deadlock found.*"); -set sql_log_bin= 1; +set statement sql_log_bin=0 for alter table mysql.gtid_slave_pos engine=innodb; --source include/start_slave.inc --connection master -let $datadir= `select @@datadir`; create table t1 (a int primary key, b int) engine=innodb; insert t1 values (1,1); --source include/save_master_gtid.inc @@ -64,11 +60,6 @@ xa prepare '1'; xa commit '1'; --source include/save_master_gtid.inc - ---connection slave - -#--eval set statement sql_log_bin=0 for insert into mysql.gtid_slave_pos values ($gtid_domain_id, 5, $gtid_server_id, $xap_seq_no) - --connection slave1 BEGIN; --eval SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=$xap_seq_no FOR UPDATE @@ -76,9 +67,14 @@ BEGIN; --connection slave --source include/start_slave.inc ---let $slave_sql_errno= 1942,1213 +--let $slave_sql_errno= 1942 --source include/wait_for_slave_sql_error.inc +# TODO: Remove after fixing MDEV-21777 +# Stop the IO thread too, so the existing relay logs are force purged on slave +# restart, as to not re-execute the already-prepared transaction +--source include/stop_slave_io.inc + --let $retried_tx_test= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1) if ($retried_tx_initial != $retried_tx_test) { @@ -95,11 +91,11 @@ 
ROLLBACK; drop table t1; --connection slave ---source include/stop_slave.inc +--echo # TODO: Remove after fixing MDEV-21777 --eval set @@global.gtid_slave_pos= "$new_gtid" ---eval set @@global.slave_parallel_threads= $save_par_thds ---eval set @@global.gtid_strict_mode= $save_strict_mode ---eval set @@global.innodb_lock_wait_timeout= $save_innodb_lock_wait_timeout +set @@global.slave_parallel_threads= @save_par_thds; +set @@global.gtid_strict_mode= @save_strict_mode; +set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout; --source include/start_slave.inc --source include/rpl_end.inc