From f517d8c7425257b6b9fe81c82c489e1e5619898d Mon Sep 17 00:00:00 2001 From: Andrei Elkin Date: Tue, 2 Oct 2018 14:30:44 +0300 Subject: [PATCH] MDEV-17346 parallel slave start and stop races to workers disappeared The bug appears as a slave SQL thread hanging in rpl_parallel_thread_pool::get_thread() while there are no slave worker threads to awake it. The reason for the hang is that at the parallel slave worker pool activation the being started SQL thread could read the worker pool size concurrently with pool deactivation. At reading the SQL thread did not employ necessary protection from a race. Fixed with making the SQL thread at the pool activation first to grab the same lock as potential deactivator also does prior to access the pool size. --- sql/rpl_parallel.cc | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 35cddee6d4d..8fef2d66635 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -1617,13 +1617,32 @@ int rpl_parallel_resize_pool_if_no_slaves(void) } +/** + Pool activation is preceded by taking a "lock" of pool_mark_busy + which guarantees the number of running slaves drops to zero atomically + with the number of pool workers. + This resolves race between the function caller thread and one + that may be attempting to deactivate the pool. +*/ int rpl_parallel_activate_pool(rpl_parallel_thread_pool *pool) { + int rc= 0; + + if ((rc= pool_mark_busy(pool, current_thd))) + return rc; // killed + if (!pool->count) - return rpl_parallel_change_thread_count(pool, opt_slave_parallel_threads, - 0); - return 0; + { + pool_mark_not_busy(pool); + rc= rpl_parallel_change_thread_count(pool, opt_slave_parallel_threads, + 0); + } + else + { + pool_mark_not_busy(pool); + } + return rc; }