From 5d61442c85b35ce382ac2afeaad5c19e630b41dd Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Thu, 15 Jun 2023 21:35:53 +0200 Subject: [PATCH] MDEV-31448: Killing a replica thread awaiting its GCO can hang/crash a parallel replica The problem is that when a worker thread is (user) killed in wait_for_prior_commit, the event group may complete out-of-order since the wait for prior commit was aborted by the kill. This fix ensures that event groups will always complete in-order, even in the error case. This is done in finish_event_group() by doing an extra wait_for_prior_commit(), if necessary, that ignores kills. This fix supersedes the fix for MDEV-30780, so the earlier fix for that is reverted in this patch. Also fix that an error from wait_for_prior_commit() inside finish_event_group() would not signal the error to wakeup_subsequent_commits(). Based on earlier work by Brandon Nesterenko and Andrei Elkin, with some changes to simplify the semantics of wait_for_prior_commit() and make the code more robust to future changes. Reviewed-by: Andrei Elkin Signed-off-by: Kristian Nielsen --- sql/rpl_parallel.cc | 59 ++++++++++++++++++++++++++++++++++++--------- sql/sql_class.cc | 11 +++++++-- sql/sql_class.h | 10 ++++---- 3 files changed, 61 insertions(+), 19 deletions(-) diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 1dab1224286..ed8858d67ba 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -27,6 +27,9 @@ struct rpl_parallel_thread_pool global_rpl_thread_pool; static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err); +static void +register_wait_for_prior_event_group_commit(rpl_group_info *rgi, + rpl_parallel_entry *entry); static int rpt_handle_event(rpl_parallel_thread::queued_event *qev, @@ -151,15 +154,35 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id, int err; thd->get_stmt_da()->set_overwrite_status(true); + + if (unlikely(rgi->worker_error)) + { + /* + In case a previous wait was killed, we need to re-register to be able to + repeat the wait. + + And before doing that, we un-register any previous registration (in case + we got an error earlier and skipped waiting). + */ + thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); + mysql_mutex_lock(&entry->LOCK_parallel_entry); + register_wait_for_prior_event_group_commit(rgi, entry); + mysql_mutex_unlock(&entry->LOCK_parallel_entry); + } + /* Remove any left-over registration to wait for a prior commit to complete. Normally, such wait would already have been removed at this point by wait_for_prior_commit() called from within COMMIT - processing. However, in case of MyISAM and no binlog, we might not - have any commit processing, and so we need to do the wait here, - before waking up any subsequent commits, to preserve correct - order of event execution. Also, in the error case we might have - skipped waiting and thus need to remove it explicitly. + processing. + + However, in case of MyISAM and no binlog, we might not have any commit + processing, and so we need to do the wait here, before waking up any + subsequent commits, to preserve correct order of event execution. + + Also, in the error case we might have skipped waiting and thus need to + remove it explicitly. Or the wait might have been killed and we need to + repeat the registration and the wait. It is important in the non-error case to do a wait, not just an unregister. Because we might be last in a group-commit that is @@ -172,8 +195,18 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id, all earlier event groups have also committed; this way no more mark_start_commit() calls can be made and it is safe to de-allocate the GCO. + + Thus this final wait is done with kill ignored during the wait. This is + fine, at this point there is no active query or transaction to abort, and + the thread will continue as soon as earlier event groups complete. + + Note though, that in the non-error case there is no guarantee that + finish_event_group() will be run in-order. For example, a successful + binlog group commit will wakeup all participating event groups + simultaneously so only thread scheduling will decide the order in which + finish_event_group() calls acquire LOCK_parallel_entry. */ - err= wfc->wait_for_prior_commit(thd); + err= wfc->wait_for_prior_commit(thd, false); if (unlikely(err) && !rgi->worker_error) signal_error_to_sql_driver_thread(thd, rgi, err); thd->wait_for_commit_ptr= NULL; @@ -242,8 +275,7 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id, not yet started should just skip their group, preparing for stop of the SQL driver thread. */ - if (unlikely(rgi->worker_error) && - entry->stop_on_error_sub_id == (uint64)ULONGLONG_MAX) + if (unlikely(rgi->worker_error) && entry->stop_on_error_sub_id > sub_id) entry->stop_on_error_sub_id= sub_id; mysql_mutex_unlock(&entry->LOCK_parallel_entry); #ifdef ENABLED_DEBUG_SYNC @@ -820,12 +852,15 @@ do_retry: for (;;) { mysql_mutex_lock(&entry->LOCK_parallel_entry); - register_wait_for_prior_event_group_commit(rgi, entry); - if (!(entry->stop_on_error_sub_id == (uint64) ULONGLONG_MAX || + if (rgi->gtid_sub_id < entry->stop_on_error_sub_id #ifndef DBUG_OFF - (DBUG_EVALUATE_IF("simulate_mdev_12746", 1, 0)) || + || DBUG_EVALUATE_IF("simulate_mdev_12746", 1, 0) #endif - rgi->gtid_sub_id < entry->stop_on_error_sub_id)) + ) + { + register_wait_for_prior_event_group_commit(rgi, entry); + } + else { /* A failure of a preceeding "parent" transaction may not be diff --git a/sql/sql_class.cc b/sql/sql_class.cc index a0d05846ca8..d2a3cbcf613 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -7635,11 +7635,18 @@ wait_for_commit::register_wait_for_prior_commit(wait_for_commit *waitee) with register_wait_for_prior_commit(). If the commit already completed, returns immediately. + If ALLOW_KILL is set to true (the default), the wait can be aborted by a + kill. In case of kill, the wait registration is still removed, so another + call of unregister_wait_for_prior_commit() is needed to later retry the + wait. If ALLOW_KILL is set to false, then kill will be ignored and this + function will not return until the prior commit (if any) has called + wakeup_subsequent_commits(). + If thd->backup_commit_lock is set, release it while waiting for other threads */ int -wait_for_commit::wait_for_prior_commit2(THD *thd) +wait_for_commit::wait_for_prior_commit2(THD *thd, bool allow_kill) { PSI_stage_info old_stage; wait_for_commit *loc_waitee; @@ -7664,7 +7671,7 @@ wait_for_commit::wait_for_prior_commit2(THD *thd) &stage_waiting_for_prior_transaction_to_commit, &old_stage); while ((loc_waitee= this->waitee.load(std::memory_order_relaxed)) && - likely(!thd->check_killed(1))) + (!allow_kill || likely(!thd->check_killed(1)))) mysql_cond_wait(&COND_wait_commit, &LOCK_wait_commit); if (!loc_waitee) { diff --git a/sql/sql_class.h b/sql/sql_class.h index 4e5aba33443..4487a67c76d 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -2144,14 +2144,14 @@ struct wait_for_commit bool commit_started; void register_wait_for_prior_commit(wait_for_commit *waitee); - int wait_for_prior_commit(THD *thd) + int wait_for_prior_commit(THD *thd, bool allow_kill=true) { /* Quick inline check, to avoid function call and locking in the common case where no wakeup is registered, or a registered wait was already signalled. */ if (waitee.load(std::memory_order_acquire)) - return wait_for_prior_commit2(thd); + return wait_for_prior_commit2(thd, allow_kill); else { if (wakeup_error) @@ -2205,7 +2205,7 @@ struct wait_for_commit void wakeup(int wakeup_error); - int wait_for_prior_commit2(THD *thd); + int wait_for_prior_commit2(THD *thd, bool allow_kill); void wakeup_subsequent_commits2(int wakeup_error); void unregister_wait_for_prior_commit2(); @@ -4726,10 +4726,10 @@ public: } wait_for_commit *wait_for_commit_ptr; - int wait_for_prior_commit() + int wait_for_prior_commit(bool allow_kill=true) { if (wait_for_commit_ptr) - return wait_for_commit_ptr->wait_for_prior_commit(this); + return wait_for_commit_ptr->wait_for_prior_commit(this, allow_kill); return 0; } void wakeup_subsequent_commits(int wakeup_error)