From a3cbc44b24ec467f33e445f57e2022e038b88623 Mon Sep 17 00:00:00 2001
From: sjaakola <seppo.jaakola@iki.fi>
Date: Tue, 12 Sep 2023 02:37:30 +0200
Subject: [PATCH 1/4] MDEV-31833 replication breaks when using optimistic
 replication and replica is a galera node

The MariaDB async replication SQL thread was stopped on any failure to
apply a replication event, and the error message logged for the failure
was: "Node has dropped from cluster". The assumption was that an event
applying failure is always due to the node dropping out of the cluster.
With optimistic parallel replication, event applying can fail for natural
reasons, and applying should be retried to handle the failure. This retry
logic was never exercised because the slave SQL thread was stopped at the
first applying failure.

To support the optimistic parallel replication retry logic, this commit
now skips the replication slave abort if the node remains in the cluster
(wsrep_ready==ON) and replication is configured for the optimistic or
aggressive parallel mode.

During the development of this fix, the galera.galera_as_slave_nonprim test
showed some problems. The test was analyzed, and it appears to need some
attention. One excessive sleep command was removed in this commit, but the
test still needs more fixes to be fully deterministic. After this commit
galera_as_slave_nonprim is successful, though.

Signed-off-by: Julius Goryavsky <julius.goryavsky@mariadb.com>
---
 .../suite/galera/t/galera_as_slave_nonprim.test  |  2 --
 sql/log_event_server.cc                          |  3 ++-
 sql/rpl_parallel.cc                              | 15 +++++++++++++++
 sql/slave.cc                                     | 16 +++++++++++++---
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/mysql-test/suite/galera/t/galera_as_slave_nonprim.test b/mysql-test/suite/galera/t/galera_as_slave_nonprim.test
index 0d878db29b8..e22f0240a59 100644
--- a/mysql-test/suite/galera/t/galera_as_slave_nonprim.test
+++ b/mysql-test/suite/galera/t/galera_as_slave_nonprim.test
@@ -27,7 +27,6 @@ SET SESSION wsrep_sync_wait = 0;
 CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
 
 --connection node_2
---sleep 1
 --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
 --source include/wait_condition.inc
 
@@ -47,7 +46,6 @@ INSERT INTO t1 VALUES (1),(2),(3),(4),(5);
 
 --connection node_2
 --sleep 5
-
 --let $value = query_get_value(SHOW SLAVE STATUS, Last_SQL_Error, 1)
 --connection node_1
 --disable_query_log
diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
index be33607f960..1c0dee41122 100644
--- a/sql/log_event_server.cc
+++ b/sql/log_event_server.cc
@@ -5728,7 +5728,8 @@ int Rows_log_event::do_apply_event(rpl_group_info *rgi)
                              ignored_error_code(actual_error) : 0);
 
 #ifdef WITH_WSREP
-        if (WSREP(thd) && wsrep_ignored_error_code(this, actual_error))
+        if (WSREP(thd) && thd->wsrep_applier &&
+            wsrep_ignored_error_code(this, actual_error))
         {
           idempotent_error= true;
           thd->wsrep_has_ignored_error= true;
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 41c99386a1f..affbdbdeab8 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -820,6 +820,10 @@ do_retry:
   event_count= 0;
   err= 0;
   errmsg= NULL;
+#ifdef WITH_WSREP
+  thd->wsrep_cs().reset_error();
+  WSREP_DEBUG("retrying async replication event");
+#endif /* WITH_WSREP */
 
   /*
     If we already started committing before getting the deadlock (or other
@@ -919,6 +923,7 @@ do_retry:
       err= rgi->worker_error= 1;
       my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
       mysql_mutex_unlock(&entry->LOCK_parallel_entry);
+
       goto err;
     }
     mysql_mutex_unlock(&entry->LOCK_parallel_entry);
@@ -960,7 +965,17 @@ do_retry:
     possibility of an old deadlock kill lingering on beyond this point.
   */
   thd->reset_killed();
+#ifdef WITH_WSREP
+  if (wsrep_before_command(thd))
+  {
+    WSREP_WARN("Parallel slave worker failed at wsrep_before_command() hook");
+    err= 1;
+    goto err;
+  }
+  wsrep_start_trx_if_not_started(thd);
+  WSREP_DEBUG("parallel slave retry, after trx start");
 
+#endif /* WITH_WSREP */
   strmake_buf(log_name, ir->name);
   if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
   {
diff --git a/sql/slave.cc b/sql/slave.cc
index 0948ae1d251..88bc4d49436 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -3878,9 +3878,19 @@ apply_event_and_update_pos_apply(Log_event* ev, THD* thd, rpl_group_info *rgi,
       default:
           WSREP_DEBUG("SQL apply failed, res %d conflict state: %s",
                       exec_res, wsrep_thd_transaction_state_str(thd));
-          rli->abort_slave= 1;
-          rli->report(ERROR_LEVEL, ER_UNKNOWN_COM_ERROR, rgi->gtid_info(),
-                      "Node has dropped from cluster");
+          /*
+            The async replication thread should be stopped if the failure
+            was not due to optimistic parallel applying, or if the node
+            has dropped from the cluster.
+          */
+          if (thd->system_thread == SYSTEM_THREAD_SLAVE_SQL &&
+              ((rli->mi->using_parallel() &&
+                rli->mi->parallel_mode <= SLAVE_PARALLEL_CONSERVATIVE) ||
+               wsrep_ready == 0)) {
+            rli->abort_slave= 1;
+            rli->report(ERROR_LEVEL, ER_UNKNOWN_COM_ERROR, rgi->gtid_info(),
+                        "Node has dropped from cluster");
+          }
           break;
       }
       mysql_mutex_unlock(&thd->LOCK_thd_data);

From d20a4da23d4189aba263ad54128375bf3e110206 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= <marko.makela@mariadb.com>
Date: Tue, 12 Sep 2023 15:16:31 +0300
Subject: [PATCH 2/4] MDEV-32150 InnoDB reports corruption on 32-bit platforms
 with ibd files sizes > 4GB

buf_read_page_low(): Use 64-bit arithmetic when computing the
file byte offset. In other calls to fil_space_t::io() the offset
was being computed correctly, for example by
buf_page_t::physical_offset().
---
 storage/innobase/buf/buf0rea.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 822c424fe46..c81017bb024 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -335,7 +335,8 @@ nothing_read:
 	auto fio = space->io(IORequest(sync
 				       ? IORequest::READ_SYNC
 				       : IORequest::READ_ASYNC),
-			     page_id.page_no() * len, len, dst, bpage);
+			     os_offset_t{page_id.page_no()} * len, len,
+			     dst, bpage);
 	*err= fio.err;
 
 	if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {

From 9e9cefde2a410c88a71fd7d07a918bce9e506db7 Mon Sep 17 00:00:00 2001
From: Sergei Golubchik <serg@mariadb.org>
Date: Wed, 13 Sep 2023 12:10:43 +0200
Subject: [PATCH 3/4] post-merge fix

---
 include/mysql/service_my_crypt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mysql/service_my_crypt.h b/include/mysql/service_my_crypt.h
index ac8e427231d..1a4ebcfd067 100644
--- a/include/mysql/service_my_crypt.h
+++ b/include/mysql/service_my_crypt.h
@@ -45,7 +45,7 @@ extern "C" {
 /* The max key length of all supported algorithms */
 #define MY_AES_MAX_KEY_LENGTH 32
 
-#define MY_AES_CTX_SIZE 1024
+#define MY_AES_CTX_SIZE 1040
 
 enum my_aes_mode {
     MY_AES_ECB, MY_AES_CBC

From 7de0c7b569af9120d8df38f7bf5931853901a3b2 Mon Sep 17 00:00:00 2001
From: Brandon Nesterenko <brandon.nesterenko@mariadb.com>
Date: Thu, 24 Aug 2023 10:42:17 -0600
Subject: [PATCH 4/4] MDEV-31038: rpl.rpl_xa_prepare_gtid_fail clean up

- Removed commented-out and unused lines.
- Updated the test to reference the true failure cause, a lock-wait
  timeout, rather than a deadlock.
- Switched the save variables from MTR variables to user variables.
- Forced a relay-log purge so that an already-prepared transaction
  is not potentially re-executed.
---
 .../rpl/r/rpl_xa_prepare_gtid_fail.result     | 20 +++++------
 .../suite/rpl/t/rpl_xa_prepare_gtid_fail.test | 36 +++++++++----------
 2 files changed, 26 insertions(+), 30 deletions(-)

diff --git a/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result b/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result
index f3fecbda349..dd0d132471e 100644
--- a/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result
+++ b/mysql-test/suite/rpl/r/rpl_xa_prepare_gtid_fail.result
@@ -2,14 +2,14 @@ include/master-slave.inc
 [connection master]
 connection slave;
 include/stop_slave.inc
+set @save_par_thds= @@global.slave_parallel_threads;
+set @save_strict_mode= @@global.gtid_strict_mode;
+set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
 change master to master_use_gtid=slave_pos;
 set @@global.slave_parallel_threads= 4;
 set @@global.slave_parallel_mode= optimistic;
 set @@global.gtid_strict_mode=ON;
-set sql_log_bin= 0;
-alter table mysql.gtid_slave_pos engine=innodb;
-call mtr.add_suppression("Deadlock found.*");
-set sql_log_bin= 1;
+set statement sql_log_bin=0 for alter table mysql.gtid_slave_pos engine=innodb;
 include/start_slave.inc
 connection master;
 create table t1 (a int primary key, b int) engine=innodb;
@@ -27,25 +27,25 @@ xa end '1';
 xa prepare '1';
 xa commit '1';
 include/save_master_gtid.inc
-connection slave;
 connection slave1;
 BEGIN;
 SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=100 FOR UPDATE;
 domain_id	sub_id	server_id	seq_no
 connection slave;
 include/start_slave.inc
-include/wait_for_slave_sql_error.inc [errno=1942,1213]
+include/wait_for_slave_sql_error.inc [errno=1942]
+include/stop_slave_io.inc
 connection slave1;
 ROLLBACK;
 # Cleanup
 connection master;
 drop table t1;
 connection slave;
-include/stop_slave.inc
+# TODO: Remove after fixing MDEV-21777
 set @@global.gtid_slave_pos= "0-1-100";
-set @@global.slave_parallel_threads= 0;
-set @@global.gtid_strict_mode= 0;
-set @@global.innodb_lock_wait_timeout= 50;
+set @@global.slave_parallel_threads= @save_par_thds;
+set @@global.gtid_strict_mode= @save_strict_mode;
+set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
 include/start_slave.inc
 include/rpl_end.inc
 # End of rpl_xa_prepare_gtid_fail.test
diff --git a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test
index 8042b355754..aa1b088ed23 100644
--- a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test
+++ b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail.test
@@ -6,8 +6,8 @@
 # GTID slave state, then the slave should immediately quit in error, without
 # retry.
 #
-#   This tests validates the above behavior by simulating a deadlock on the
-# GTID slave state table during the second part of XA PREPARE's commit, to
+#   This test validates the above behavior by forcing a lock-wait timeout on
+# the GTID slave state table during the second part of XA PREPARE's commit, to
 # ensure that the appropriate error is reported and the transaction was never
 # retried.
 #
@@ -23,23 +23,19 @@ source include/have_innodb.inc;
 --connection slave
 --source include/stop_slave.inc
 
---let $save_par_thds= `SELECT @@global.slave_parallel_threads`
---let $save_strict_mode= `SELECT @@global.gtid_strict_mode`
---let $save_innodb_lock_wait_timeout= `SELECT @@global.innodb_lock_wait_timeout`
+set @save_par_thds= @@global.slave_parallel_threads;
+set @save_strict_mode= @@global.gtid_strict_mode;
+set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
 
 change master to master_use_gtid=slave_pos;
 set @@global.slave_parallel_threads= 4;
 set @@global.slave_parallel_mode= optimistic;
 set @@global.gtid_strict_mode=ON;
 
-set sql_log_bin= 0;
-alter table mysql.gtid_slave_pos engine=innodb;
-call mtr.add_suppression("Deadlock found.*");
-set sql_log_bin= 1;
+set statement sql_log_bin=0 for alter table mysql.gtid_slave_pos engine=innodb;
 --source include/start_slave.inc
 
 --connection master
-let $datadir= `select @@datadir`;
 create table t1 (a int primary key, b int) engine=innodb;
 insert t1 values (1,1);
 --source include/save_master_gtid.inc
@@ -64,11 +60,6 @@ xa prepare '1';
 xa commit '1';
 --source include/save_master_gtid.inc
 
-
---connection slave
-
-#--eval set statement sql_log_bin=0 for insert into mysql.gtid_slave_pos values ($gtid_domain_id, 5, $gtid_server_id, $xap_seq_no)
-
 --connection slave1
 BEGIN;
 --eval SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=$xap_seq_no FOR UPDATE
@@ -76,9 +67,14 @@ BEGIN;
 --connection slave
 --source include/start_slave.inc
 
---let $slave_sql_errno= 1942,1213
+--let $slave_sql_errno= 1942
 --source include/wait_for_slave_sql_error.inc
 
+# TODO: Remove after fixing MDEV-21777
+# Stop the IO thread too, so the existing relay logs are force purged on slave
+# restart, as to not re-execute the already-prepared transaction
+--source include/stop_slave_io.inc
+
 --let $retried_tx_test= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1)
 if ($retried_tx_initial != $retried_tx_test)
 {
@@ -95,11 +91,11 @@ ROLLBACK;
 drop table t1;
 
 --connection slave
---source include/stop_slave.inc
+--echo # TODO: Remove after fixing MDEV-21777
 --eval set @@global.gtid_slave_pos= "$new_gtid"
---eval set @@global.slave_parallel_threads= $save_par_thds
---eval set @@global.gtid_strict_mode= $save_strict_mode
---eval set @@global.innodb_lock_wait_timeout= $save_innodb_lock_wait_timeout
+set @@global.slave_parallel_threads= @save_par_thds;
+set @@global.gtid_strict_mode= @save_strict_mode;
+set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
 --source include/start_slave.inc
 
 --source include/rpl_end.inc