1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-30 16:24:05 +03:00

MDEV-23328 Server hang due to Galera lock conflict resolution

Mutex order violation when wsrep bf thread kills a conflicting trx,
the stack is

          wsrep_thd_LOCK()
          wsrep_kill_victim()
          lock_rec_other_has_conflicting()
          lock_clust_rec_read_check_and_lock()
          row_search_mvcc()
          ha_innobase::index_read()
          ha_innobase::rnd_pos()
          handler::ha_rnd_pos()
          handler::rnd_pos_by_record()
          handler::ha_rnd_pos_by_record()
          Rows_log_event::find_row()
          Update_rows_log_event::do_exec_row()
          Rows_log_event::do_apply_event()
          Log_event::apply_event()
          wsrep_apply_events()

and mutexes are taken in the order

          lock_sys->mutex -> victim_trx->mutex -> victim_thread->LOCK_thd_data

When a normal KILL statement is executed, the stack is

          innobase_kill_query()
          kill_handlerton()
          plugin_foreach_with_mask()
          ha_kill_query()
          THD::awake()
          kill_one_thread()

        and mutexes are

          victim_thread->LOCK_thd_data -> lock_sys->mutex -> victim_trx->mutex

This patch is the plan D variant for fixing potetial mutex locking
order exercised by BF aborting and KILL command execution.

In this approach, KILL command is replicated as TOI operation.
This guarantees total isolation for the KILL command execution
in the first node: there is no concurrent replication applying
and no concurrent DDL executing. Therefore there is no risk of
BF aborting to happen in parallel with KILL command execution
either. Potential mutex deadlocks between the different mutex
access paths with KILL command execution and BF aborting cannot
therefore happen.

TOI replication is used, in this approach,  purely as means
to provide isolated KILL command execution in the first node.
KILL command should not (and must not) be applied in secondary
nodes. In this patch, we make this sure by skipping KILL
execution in secondary nodes, in applying phase, where we
bail out if applier thread is trying to execute KILL command.
This is effective, but skipping the applying of KILL command
could happen much earlier as well.

This also fixed unprotected calls to wsrep_thd_abort
that will use wsrep_abort_transaction. This is fixed
by holding THD::LOCK_thd_data while we abort transaction.

Reviewed-by: Jan Lindström <jan.lindstrom@mariadb.com>
This commit is contained in:
sjaakola
2021-10-21 14:49:51 +03:00
committed by Jan Lindström
parent c8b39f7ee2
commit db50ea3ad3
15 changed files with 412 additions and 198 deletions

View File

@ -1,4 +1,4 @@
/* Copyright 2008-2015 Codership Oy <http://www.codership.com>
/* Copyright 2008-2021 Codership Oy <http://www.codership.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -835,13 +835,25 @@ void wsrep_thr_init()
DBUG_VOID_RETURN;
}
/* This is wrapper for wsrep_break_lock in thr_lock.c */
static int wsrep_thr_abort_thd(void *bf_thd_ptr, void *victim_thd_ptr, my_bool signal)
{
THD* victim_thd= (THD *) victim_thd_ptr;
/* We need to lock THD::LOCK_thd_data to protect victim
from concurrent usage or disconnect or delete. */
mysql_mutex_lock(&victim_thd->LOCK_thd_data);
int res= wsrep_abort_thd(bf_thd_ptr, victim_thd_ptr, signal);
return res;
}
void wsrep_init_startup (bool first)
{
if (wsrep_init()) unireg_abort(1);
wsrep_thr_lock_init(
(wsrep_thd_is_brute_force_fun)wsrep_thd_is_BF,
(wsrep_abort_thd_fun)wsrep_abort_thd,
(wsrep_abort_thd_fun)wsrep_thr_abort_thd,
wsrep_debug, wsrep_convert_LOCK_to_trx,
(wsrep_on_fun)wsrep_on);
@ -1694,6 +1706,11 @@ static int wsrep_TOI_begin(THD *thd, char *db_, char *table_,
case SQLCOM_DROP_TABLE:
buf_err= wsrep_drop_table_query(thd, &buf, &buf_len);
break;
case SQLCOM_KILL:
WSREP_DEBUG("KILL as TOI: %s", thd->query());
buf_err= wsrep_to_buf_helper(thd, thd->query(), thd->query_length(),
&buf, &buf_len);
break;
case SQLCOM_CREATE_ROLE:
if (sp_process_definer(thd))
{
@ -2058,8 +2075,13 @@ bool wsrep_grant_mdl_exception(MDL_context *requestor_ctx,
ticket->wsrep_report(true);
}
mysql_mutex_unlock(&granted_thd->LOCK_thd_data);
wsrep_abort_thd((void *) request_thd, (void *) granted_thd, 1);
/* This will call wsrep_abort_transaction so we should hold
THD::LOCK_thd_data to protect victim from concurrent usage
or disconnect or delete. */
if (request_thd->wsrep_exec_mode == REPL_RECV)
DEBUG_SYNC(request_thd, "wsrep_after_granted_lock");
wsrep_abort_thd((void *) request_thd, (void *) granted_thd, true);
ret= false;
}
}
@ -2241,6 +2263,7 @@ error:
static bool abort_replicated(THD *thd)
{
bool ret_code= false;
mysql_mutex_lock(&thd->LOCK_thd_data);
if (thd->wsrep_query_state== QUERY_COMMITTING)
{
WSREP_DEBUG("aborting replicated trx: %llu", (ulonglong)(thd->real_id));
@ -2248,6 +2271,8 @@ static bool abort_replicated(THD *thd)
(void)wsrep_abort_thd(thd, thd, TRUE);
ret_code= true;
}
else
mysql_mutex_unlock(&thd->LOCK_thd_data);
return ret_code;
}
@ -2294,6 +2319,8 @@ static bool have_client_connections()
(longlong) tmp->thread_id));
if (is_client_connection(tmp) && tmp->killed == KILL_CONNECTION)
{
WSREP_DEBUG("Informing thread %lld that it's time to die",
(longlong)tmp->thread_id);
(void)abort_replicated(tmp);
return true;
}
@ -2378,6 +2405,8 @@ void wsrep_close_client_connections(my_bool wait_to_end, THD *except_caller_thd)
{
DBUG_PRINT("quit",("Informing thread %lld that it's time to die",
(longlong) tmp->thread_id));
WSREP_DEBUG("Informing thread %lld that it's time to die",
(longlong)tmp->thread_id);
/* We skip slave threads & scheduler on this first loop through. */
if (!is_client_connection(tmp))
continue;
@ -2394,15 +2423,18 @@ void wsrep_close_client_connections(my_bool wait_to_end, THD *except_caller_thd)
continue;
}
/* replicated transactions must be skipped */
/* replicated transactions must be skipped and aborted
with wsrep_abort_thd. */
if (abort_replicated(tmp))
continue;
WSREP_DEBUG("closing connection %lld", (longlong) tmp->thread_id);
/*
instead of wsrep_close_thread() we do now soft kill by THD::awake
*/
instead of wsrep_close_thread() we do now soft kill by
THD::awake(). Here also victim needs to be protected from
concurrent usage or disconnect or delete.
*/
mysql_mutex_lock(&tmp->LOCK_thd_data);
tmp->awake(KILL_CONNECTION);
@ -2423,7 +2455,6 @@ void wsrep_close_client_connections(my_bool wait_to_end, THD *except_caller_thd)
I_List_iterator<THD> it2(threads);
while ((tmp=it2++))
{
#ifndef __bsdi__ // Bug in BSDI kernel
if (is_client_connection(tmp) &&
!abort_replicated(tmp) &&
!is_replaying_connection(tmp) &&
@ -2432,7 +2463,6 @@ void wsrep_close_client_connections(my_bool wait_to_end, THD *except_caller_thd)
WSREP_INFO("killing local connection: %lld", (longlong) tmp->thread_id);
close_connection(tmp,0);
}
#endif
}
DBUG_PRINT("quit",("Waiting for threads to die (count=%u)",thread_count));
@ -2621,7 +2651,8 @@ extern "C" void wsrep_thd_set_query_state(
void wsrep_thd_set_conflict_state(THD *thd, enum wsrep_conflict_state state)
{
if (WSREP(thd)) thd->wsrep_conflict_state= state;
mysql_mutex_assert_owner(&thd->LOCK_thd_data);
thd->wsrep_conflict_state= state;
}
@ -2762,9 +2793,10 @@ extern "C" void wsrep_thd_awake(THD *thd, my_bool signal)
{
if (signal)
{
mysql_mutex_lock(&thd->LOCK_thd_data);
/* Here we should hold THD::LOCK_thd_data to
protect from concurrent usage. */
mysql_mutex_assert_owner(&thd->LOCK_thd_data);
thd->awake(KILL_QUERY);
mysql_mutex_unlock(&thd->LOCK_thd_data);
}
else
{