MDEV-30780 optimistic parallel slave hangs after hitting an error

The hang could be seen as SHOW SLAVE STATUS displaying an error like "Last_Error: Could not execute Write_rows_v1" along with "Slave_SQL_Running: Yes", accompanied by one of the replication threads in SHOW PROCESSLIST characteristically having a status like: 2394 | system user | | NULL | Slave_worker | 50852| closing tables. It turns out that the "closing tables" worker got trapped in an endless loop in mark_start_commit_inner() across already garbage-collected gco items. The reclaimed gco links are explained by the (actually possible) out-of-order termination of event groups due to the Last_Error. This patch enforces the correct ordering for performing finish_event_group's cleanup actions, including unlinking gco:s from the active list.
2025-07-29 05:21:33 +03:00 · 2023-03-05 15:12:13 +02:00
parent dfdcd7ffab
commit d4339620be
5 changed files with 299 additions and 8 deletions
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@ -261,6 +261,12 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
                              STRING_WITH_LEN("now WAIT_FOR proceed_by_1000"));
      }
    });
+  DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
+      if (rgi->current_gtid.seq_no == 2001) {
+        DBUG_ASSERT(!rgi->worker_error || entry->stop_on_error_sub_id == sub_id);
+        debug_sync_set_action(thd, STRING_WITH_LEN("now SIGNAL cont_worker3"));
+      }
+    });
 #endif

  if (rgi->killed_for_retry == rpl_group_info::RETRY_KILL_PENDING)
@ -284,6 +290,11 @@ signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
    In case we get an error during commit, inform following transactions that
    we aborted our commit.
  */
+  DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
+      if (rgi->current_gtid.seq_no == 2002) {
+        debug_sync_set_action(thd, STRING_WITH_LEN("now WAIT_FOR cont_worker2"));
+      }});
+
  rgi->unmark_start_commit();
  rgi->cleanup_context(thd, true);
  rgi->rli->abort_slave= true;
@ -788,7 +799,14 @@ do_retry:
  thd->reset_killed();
  thd->clear_error();
  rgi->killed_for_retry = rpl_group_info::RETRY_KILL_NONE;
-
+#ifdef ENABLED_DEBUG_SYNC
+    DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
+      if (rgi->current_gtid.seq_no == 2003) {
+        debug_sync_set_action(thd,
+                              STRING_WITH_LEN("now WAIT_FOR cont_worker3"));
+      }
+    });
+#endif
  /*
    If we retry due to a deadlock kill that occurred during the commit step, we
    might have already updated (but not committed) an update of table
@ -806,15 +824,12 @@ do_retry:
  for (;;)
  {
    mysql_mutex_lock(&entry->LOCK_parallel_entry);
-    if (entry->stop_on_error_sub_id == (uint64) ULONGLONG_MAX ||
+    register_wait_for_prior_event_group_commit(rgi, entry);
+    if (!(entry->stop_on_error_sub_id == (uint64) ULONGLONG_MAX ||
 #ifndef DBUG_OFF
-        (DBUG_EVALUATE_IF("simulate_mdev_12746", 1, 0)) ||
+          (DBUG_EVALUATE_IF("simulate_mdev_12746", 1, 0)) ||
 #endif
-        rgi->gtid_sub_id < entry->stop_on_error_sub_id)
-    {
-      register_wait_for_prior_event_group_commit(rgi, entry);
-    }
-    else
+          rgi->gtid_sub_id < entry->stop_on_error_sub_id))
    {
      /*
        A failure of a preceeding "parent" transaction may not be
@ -1991,6 +2006,9 @@ rpl_parallel_thread::get_gco(uint64 wait_count, group_commit_orderer *prev,
  gco->prior_sub_id= prior_sub_id;
  gco->installed= false;
  gco->flags= 0;
+#ifndef DBUG_OFF
+  gco->gc_done= false;
+#endif
  return gco;
 }

@ -1998,6 +2016,10 @@ rpl_parallel_thread::get_gco(uint64 wait_count, group_commit_orderer *prev,
 void
 rpl_parallel_thread::loc_free_gco(group_commit_orderer *gco)
 {
+#ifndef DBUG_OFF
+  DBUG_ASSERT(!gco->gc_done);
+  gco->gc_done= true;
+#endif
  if (!loc_gco_list)
    loc_gco_last_ptr_ptr= &gco->next_gco;
  else