diff --git a/include/mysql/service_wsrep.h b/include/mysql/service_wsrep.h index a0d0a338a0e..cf94364c1cf 100644 --- a/include/mysql/service_wsrep.h +++ b/include/mysql/service_wsrep.h @@ -209,7 +209,7 @@ extern "C" my_bool wsrep_thd_is_local_toi(const MYSQL_THD thd); extern "C" my_bool wsrep_thd_is_in_rsu(const MYSQL_THD thd); /* Return true if thd is in BF mode, either high_priority or TOI */ extern "C" my_bool wsrep_thd_is_BF(const MYSQL_THD thd, my_bool sync); -/* Return true if thd is streaming */ +/* Return true if thd is streaming in progress */ extern "C" my_bool wsrep_thd_is_SR(const MYSQL_THD thd); extern "C" void wsrep_handle_SR_rollback(MYSQL_THD BF_thd, MYSQL_THD victim_thd); /* Return thd retry counter */ diff --git a/mysql-test/suite/galera_sr/r/MDEV-34836.result b/mysql-test/suite/galera_sr/r/MDEV-34836.result new file mode 100644 index 00000000000..10d302c415f --- /dev/null +++ b/mysql-test/suite/galera_sr/r/MDEV-34836.result @@ -0,0 +1,23 @@ +connection node_2; +connection node_1; +connection node_1; +CREATE TABLE parent (id INT AUTO_INCREMENT PRIMARY KEY, v INT) ENGINE=InnoDB; +INSERT INTO parent VALUES (1, 1),(2, 2),(3, 3); +CREATE TABLE child (id INT AUTO_INCREMENT PRIMARY KEY, parent_id INT, CONSTRAINT parent_fk +FOREIGN KEY (parent_id) REFERENCES parent (id)) ENGINE=InnoDB; +connection node_2; +SET SESSION wsrep_trx_fragment_size = 1; +START TRANSACTION; +INSERT INTO child (parent_id) VALUES (1),(2),(3); +connection node_1; +SET SESSION wsrep_sync_wait = 15; +SELECT COUNT(*) FROM child; +COUNT(*) +0 +ALTER TABLE parent AUTO_INCREMENT = 100; +connection node_2; +COMMIT; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +DROP TABLE child, parent; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera_sr/t/MDEV-34836.test b/mysql-test/suite/galera_sr/t/MDEV-34836.test new file mode 100644 index 00000000000..840ebaede4c --- /dev/null +++ b/mysql-test/suite/galera_sr/t/MDEV-34836.test @@ -0,0 +1,56 @@ +# +# MDEV-34836: TOI transaction on FK-referenced parent table must BF abort +# SR transaction in progress on a child table. +# +# Applied SR transaction on the child table was not BF aborted by TOI running +# on the parent table for several reasons: +# Although SR correctly collected FK-referenced keys to parent, TOI in Galera +# disregards common certification index and simply sets itself to depend on the +# latest certified write set seqno. +# Since this write set was the fragment of SR transaction, TOI was allowed to run in +# parallel with SR presuming it would BF abort the latter. +# +# At the same time, DML transactions in the server don't grab MDL locks on FK-referenced +# tables, thus parent table wasn't protected by an MDL lock from SR and it couldn't +# provoke MDL lock conflict for TOI to BF abort SR transaction. +# In InnoDB, DDL transactions grab shared MDL locks on child tables, which is not enough +# to trigger MDL conflict in Galera. +# InnoDB-level Wsrep patch didn't contain correct conflict resolution logic due to the +# fact that it was believed MDL locking should always produce conflicts correctly. +# +# The fix brings conflict resolution rules similar to MDL-level checks to InnoDB, thus +# accounting for the problematic case. +# + +--source include/galera_cluster.inc +--source include/have_innodb.inc + +--connection node_1 +CREATE TABLE parent (id INT AUTO_INCREMENT PRIMARY KEY, v INT) ENGINE=InnoDB; +INSERT INTO parent VALUES (1, 1),(2, 2),(3, 3); + +CREATE TABLE child (id INT AUTO_INCREMENT PRIMARY KEY, parent_id INT, CONSTRAINT parent_fk + FOREIGN KEY (parent_id) REFERENCES parent (id)) ENGINE=InnoDB; + +--connection node_2 +# Start SR transaction and make it lock both parent and child tales. +SET SESSION wsrep_trx_fragment_size = 1; +START TRANSACTION; +INSERT INTO child (parent_id) VALUES (1),(2),(3); + +--connection node_1 +# Sync wait for SR transaction to replicate and apply fragments, thus +# locking parent table as well. +SET SESSION wsrep_sync_wait = 15; +SELECT COUNT(*) FROM child; +# Now run TOI on the parent, which BF-aborts the SR-transaction in progress. +ALTER TABLE parent AUTO_INCREMENT = 100; + +--connection node_2 +# Check that SR is BF-aborted. +--error ER_LOCK_DEADLOCK +COMMIT; + +# Cleanup +DROP TABLE child, parent; +--source include/galera_end.inc diff --git a/sql/service_wsrep.cc b/sql/service_wsrep.cc index 22ae84aa353..0a26951b47e 100644 --- a/sql/service_wsrep.cc +++ b/sql/service_wsrep.cc @@ -183,7 +183,8 @@ extern "C" my_bool wsrep_thd_is_BF(const THD *thd, my_bool sync) extern "C" my_bool wsrep_thd_is_SR(const THD *thd) { - return thd && thd->wsrep_cs().transaction().is_streaming(); + return thd && thd->wsrep_cs().transaction().is_streaming() && + thd->wsrep_cs().transaction().state() == wsrep::transaction::s_executing; } extern "C" void wsrep_handle_SR_rollback(THD *bf_thd, diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index c267e45ebe6..ecca8901c40 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -496,15 +496,13 @@ void lock_sys_t::close() #ifdef WITH_WSREP # ifdef UNIV_DEBUG -/** Check if both conflicting lock transaction and other transaction -requesting record lock are brute force (BF). If they are check is -this BF-BF wait correct and if not report BF wait and assert. +/** Check if this BF-BF wait is correct and if not report and abort. @param lock other waiting lock @param trx transaction requesting conflicting lock */ -static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx, - const unsigned type_mode = LOCK_NONE) +static void wsrep_assert_valid_bf_bf_wait(const lock_t *lock, const trx_t *trx, + const unsigned type_mode) { ut_ad(!lock->is_table()); lock_sys.assert_locked(*lock); @@ -514,12 +512,8 @@ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx, not acquire THD::LOCK_thd_data mutex below to avoid latching order violation. */ - if (!trx->is_wsrep() || !lock_trx->is_wsrep()) - return; - if (UNIV_LIKELY(!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) - || UNIV_LIKELY(!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE))) - return; - + ut_ad(wsrep_thd_is_BF(trx->mysql_thd, FALSE)); + ut_ad(wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE)); ut_ad(trx->state == TRX_STATE_ACTIVE); switch (lock_trx->state) { @@ -536,16 +530,6 @@ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx, ut_ad("invalid state" == 0); } - /* If BF - BF order is honored, i.e. trx already holding - record lock should be ordered before this new lock request - we can keep trx waiting for the lock. If conflicting - transaction is already aborting or rolling back for replaying - we can also let new transaction waiting. */ - if (wsrep_thd_order_before(lock_trx->mysql_thd, trx->mysql_thd) - || wsrep_thd_is_aborting(lock_trx->mysql_thd)) { - return; - } - if (type_mode != LOCK_NONE) ib::error() << " Requested lock " << ((type_mode & LOCK_TABLE) ? "on table " : " on record ") @@ -573,8 +557,81 @@ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx, /* BF-BF wait is a bug */ ut_error; } + +void wsrep_report_error(const lock_t* victim_lock, const trx_t *bf_trx); # endif /* UNIV_DEBUG */ +/** Check if a high priority (BF) trx has to wait for the current +lock holder based on Wsrep transaction state relations. + +This code resembles the one in `wsrep_handle_mdl_conflict()`, but +it's specific to the storage engine and it doesn't perform any +BF aborts by itself. +The decision whether to BF abort a victim may depend on other conditions +like lock compatibility between InnoDB transactions. + +@param lock other waiting lock +@param trx BF transaction requesting conflicting lock +@return TRUE if BF trx has to wait for the lock to be removed +*/ +static bool wsrep_BF_has_to_wait(const lock_t *lock, const trx_t *trx, + bool report_bf_bf_wait, + const unsigned type_mode = LOCK_NONE) +{ + THD *request_thd= trx->mysql_thd; + THD *granted_thd= lock->trx->mysql_thd; + + ut_ad(wsrep_thd_is_BF(request_thd, false)); + ut_ad(lock->trx->is_wsrep()); + + /* Granted is aborting, let it complete. */ + if (wsrep_thd_is_aborting(granted_thd)) + return true; + + /* Granted is not BF, may BF abort it. */ + if (!wsrep_thd_is_BF(granted_thd, false)) + return false; + + /* Applying SR cannot BF abort other high priority (BF). */ + if (wsrep_thd_is_SR(request_thd)) + return true; + + /* Requester is truly BF and granted is applying SR in progress. */ + if (wsrep_thd_is_SR(granted_thd)) + return false; + +#ifdef UNIV_DEBUG + if (report_bf_bf_wait) + wsrep_report_error(lock, trx); + /* We very well can let BF to wait normally as other + BF will be replayed in case of conflict. For debug + builds we will do additional sanity checks to catch + unsupported BF wait if any. */ + ut_d(wsrep_assert_valid_bf_bf_wait(lock, trx, type_mode)); +#endif + return true; +} + +/** Determine whether BF abort on the lock holder is needed. + +@param lock other waiting lock +@param trx BF transaction requesting conflicting lock +@return TRUE if BF abort is needed +*/ +static bool wsrep_will_BF_abort(const lock_t *lock, const trx_t *trx) +{ + ut_ad(wsrep_thd_is_BF(trx->mysql_thd, false)); + + /* Don't BF abort system transactions. */ + if (!lock->trx->is_wsrep()) + return false; + + /* BF trx will wait for the lock, but it doesn't have to according + to Wsrep rules, meaning it must BF abort the lock holder. */ + return lock_has_to_wait(trx->lock.wait_lock, lock) && + !wsrep_BF_has_to_wait(lock, trx, true); +} + /** check if lock timeout was for priority thread, as a side effect trigger lock monitor @param trx transaction owning the lock @@ -648,19 +705,9 @@ bool lock_rec_has_to_wait_wsrep(const trx_t *trx, return false; } - if (wsrep_thd_order_before(trx->mysql_thd, trx2->mysql_thd)) - { - /* If two high priority threads have lock conflict, we look at the - order of these transactions and honor the earlier transaction. */ - - return false; - } - - /* We very well can let bf to wait normally as other - BF will be replayed in case of conflict. For debug - builds we will do additional sanity checks to catch - unsupported bf wait if any. */ - ut_d(wsrep_assert_no_bf_bf_wait(lock2, trx, type_mode)); + /* If two high priority threads have lock conflict, we check if + new lock request has to wait for the transaction holding the lock. */ + return wsrep_BF_has_to_wait(lock2, trx, false, type_mode); } return true; @@ -997,14 +1044,13 @@ void wsrep_report_error(const lock_t* victim_lock, const trx_t *bf_trx) lock_rec_print(stderr, bf_trx->lock.wait_lock, mtr); WSREP_ERROR("victim holding lock: "); lock_rec_print(stderr, victim_lock, mtr); - wsrep_assert_no_bf_bf_wait(victim_lock, bf_trx); } #endif /* WITH_DEBUG */ /** Kill the holders of conflicting locks. @param trx brute-force applier transaction running in the current thread */ ATTRIBUTE_COLD ATTRIBUTE_NOINLINE -static void lock_wait_wsrep(trx_t *trx) +static void wsrep_handle_lock_conflict(trx_t *trx) { DBUG_ASSERT(wsrep_on(trx->mysql_thd)); if (!wsrep_thd_is_BF(trx->mysql_thd, false)) @@ -1030,8 +1076,8 @@ func_exit: for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock; lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) { - /* Victim trx needs to be different from BF trx and it has to have a - THD so that we can kill it. Victim might not have THD in two cases: + /* Victim trx has to have a THD so that we can kill it. + Victim might not have THD in two cases: (1) An incomplete transaction that was recovered from undo logs on server startup (and not yet rolled back). @@ -1039,8 +1085,8 @@ func_exit: (2) Transaction that is in XA PREPARE state and whose client connection was disconnected. - Neither of these can complete before lock_wait_wsrep() releases - lock_sys.latch. + Neither of these can complete before wsrep_handle_lock_conflict() + releases lock_sys.latch. (1) trx_t::commit_in_memory() is clearing both trx_t::state and trx_t::is_recovered before it invokes @@ -1051,24 +1097,9 @@ func_exit: (2) If is in XA PREPARE state, it would eventually be rolled back and the lock conflict would be resolved when an XA COMMIT or XA ROLLBACK statement is executed in some other connection. - - If victim has also BF status, but has earlier seqno, we have to wait. */ - if (lock->trx != trx && lock->trx->mysql_thd && - !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) && - wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd))) + if (lock->trx->mysql_thd && wsrep_will_BF_abort(lock, trx)) { - if (wsrep_thd_is_BF(lock->trx->mysql_thd, false)) - { - // There is no need to kill victim with compatible lock - if (!lock_has_to_wait(trx->lock.wait_lock, lock)) - continue; - -#ifdef UNIV_DEBUG - wsrep_report_error(lock, trx); -#endif - } - victims.emplace(lock->trx); } } @@ -1090,21 +1121,8 @@ func_exit: record-locks instead of table locks. See details from comment above. */ - if (lock->trx != trx && lock->trx->mysql_thd && - !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) && - wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd))) + if (lock->trx->mysql_thd && wsrep_will_BF_abort(lock, trx)) { - if (wsrep_thd_is_BF(lock->trx->mysql_thd, false)) - { - // There is no need to kill victim with compatible lock - if (!lock_has_to_wait(trx->lock.wait_lock, lock)) - continue; - -#ifdef UNIV_DEBUG - wsrep_report_error(lock, trx); -#endif - } - victims.emplace(lock->trx); } } while ((lock= lock_rec_get_next(heap_no, lock))); @@ -2015,7 +2033,7 @@ dberr_t lock_wait(que_thr_t *thr) ut_ad(!trx->dict_operation_lock_mode); - IF_WSREP(if (trx->is_wsrep()) lock_wait_wsrep(trx),); + IF_WSREP(if (trx->is_wsrep()) wsrep_handle_lock_conflict(trx),); const auto type_mode= wait_lock->type_mode; #ifdef HAVE_REPLICATION