diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 1cfdf96ee3b..9c4222d7817 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -1450,11 +1450,23 @@ handle_rpl_parallel_thread(void *arg) after mark_start_commit(), we have to unmark, which has at least a theoretical possibility of leaving a window where it looks like all transactions in a GCO have started committing, while in fact one - will need to rollback and retry. This is not supposed to be possible - (since there is a deadlock, at least one transaction should be - blocked from reaching commit), but this seems a fragile ensurance, - and there were historically a number of subtle bugs in this area. + will need to rollback and retry. + + Normally this will not happen, since the kill is there to resolve a + deadlock that is preventing at least one transaction from proceeding. + One case it can happen is with InnoDB dict stats update, which can + temporarily cause transactions to block each other, but locks are + released immediately, they don't linger until commit. There could be + other similar cases, there were historically a number of subtle bugs + in this area. + + But once we start the commit, we can expect that no new lock + conflicts will be introduced. So by handling any lingering deadlock + kill at this point just before mark_start_commit(), we should be + robust even towards spurious deadlock kills. */ + if (rgi->killed_for_retry != rpl_group_info::RETRY_KILL_NONE) + wait_for_pending_deadlock_kill(thd, rgi); if (!thd->killed) { DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index 9d4e09a362c..8284b07f7ff 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -2518,6 +2518,23 @@ rpl_group_info::unmark_start_commit() e= this->parallel_entry; mysql_mutex_lock(&e->LOCK_parallel_entry); + /* + Assert that we have not already wrongly completed this GCO and signalled + the next one to start, only to now unmark and make the signal invalid. + This is to catch problems like MDEV-34696. + + The error inject rpl_parallel_simulate_temp_err_xid is used to test this + precise situation, that we handle it gracefully if it somehow occurs in a + release build. So disable the assert in this case. + */ +#ifndef DBUG_OFF + bool allow_unmark_after_complete= false; + DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid", + allow_unmark_after_complete= true;); + DBUG_ASSERT(!gco->next_gco || + gco->next_gco->wait_count > e->count_committing_event_groups || + allow_unmark_after_complete); +#endif --e->count_committing_event_groups; mysql_mutex_unlock(&e->LOCK_parallel_entry); }