mirror of
https://github.com/MariaDB/server.git
synced 2025-07-30 16:24:05 +03:00
MDEV-31833 replication breaks when using optimistic replication and replica is a galera node
The MariaDB async replication SQL thread was stopped on any failure to apply replication events, and the error message logged for the failure was: "Node has dropped from cluster". The assumption was that an event applying failure is always due to the node dropping out of the cluster. With optimistic parallel replication, however, event applying can fail for natural reasons, and applying should be retried to handle the failure. This retry logic was never exercised because the slave SQL thread was stopped at the first applying failure. To support the optimistic parallel replication retry logic, this commit now skips the replication slave abort if the node remains in the cluster (wsrep_ready==ON) and replication is configured for optimistic or aggressive retry logic. During the development of this fix, the galera.galera_as_slave_nonprim test showed some problems. The test was analyzed, and it appears to need some attention. One excessive sleep command was removed in this commit, but the test will still need more fixes to be fully deterministic. After this commit, galera_as_slave_nonprim is successful, though. Signed-off-by: Julius Goryavsky <julius.goryavsky@mariadb.com>
This commit is contained in:
committed by
Julius Goryavsky
parent
ef569c324d
commit
a3cbc44b24
@ -820,6 +820,10 @@ do_retry:
|
||||
event_count= 0;
|
||||
err= 0;
|
||||
errmsg= NULL;
|
||||
#ifdef WITH_WSREP
|
||||
thd->wsrep_cs().reset_error();
|
||||
WSREP_DEBUG("retrying async replication event");
|
||||
#endif /* WITH_WSREP */
|
||||
|
||||
/*
|
||||
If we already started committing before getting the deadlock (or other
|
||||
@ -919,6 +923,7 @@ do_retry:
|
||||
err= rgi->worker_error= 1;
|
||||
my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
|
||||
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
|
||||
|
||||
goto err;
|
||||
}
|
||||
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
|
||||
@ -960,7 +965,17 @@ do_retry:
|
||||
possibility of an old deadlock kill lingering on beyond this point.
|
||||
*/
|
||||
thd->reset_killed();
|
||||
#ifdef WITH_WSREP
|
||||
if (wsrep_before_command(thd))
|
||||
{
|
||||
WSREP_WARN("Parallel slave worker failed at wsrep_before_command() hook");
|
||||
err= 1;
|
||||
goto err;
|
||||
}
|
||||
wsrep_start_trx_if_not_started(thd);
|
||||
WSREP_DEBUG("parallel slave retry, after trx start");
|
||||
|
||||
#endif /* WITH_WSREP */
|
||||
strmake_buf(log_name, ir->name);
|
||||
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
|
||||
{
|
||||
|
Reference in New Issue
Block a user