Move InnoDB/XtraDB to async deadlock kill for parallel replication.

In 10.2, use the thd_rpl_deadlock_check() API. This way, all the locking hacks around thd_report_wait_for() can be removed. Now parallel replication deadlock kill happens asynchroneously, from the slave background thread. In InnoDB, remove also the buffering of wait reports, to simplify the code, as this is no longer needed when the locking issues are gone. In XtraDB, the buffering is kept for now. This is just because presumably XtraDB will eventually be updated to MySQL 5.7-based InnoDB as well, so there is little need to modify the existing code only for clean-up purposes. The old synchronous function thd_report_wait_for() is no longer used and removed in this patch. Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
2025-08-07 00:04:31 +03:00 · 2016-10-17 12:52:14 +02:00
parent 4d3e3669ec
commit cc49f00994
7 changed files with 41 additions and 211 deletions
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -4598,19 +4598,20 @@ extern "C" int thd_rpl_is_parallel(const MYSQL_THD thd)
 }

 /*
-  This function can optionally be called to check if thd_report_wait_for()
+  This function can optionally be called to check if thd_rpl_deadlock_check()
  needs to be called for waits done by a given transaction.

-  If this function returns false for a given thd, there is no need to do any
-  calls to thd_report_wait_for() on that thd.
+  If this function returns false for a given thd, there is no need to do
+  any calls to thd_rpl_deadlock_check() on that thd.

-  This call is optional; it is safe to call thd_report_wait_for() in any case.
-  This call can be used to save some redundant calls to thd_report_wait_for()
-  if desired. (This is unlikely to matter much unless there are _lots_ of
-  waits to report, as the overhead of thd_report_wait_for() is small).
+  This call is optional; it is safe to call thd_rpl_deadlock_check() in
+  any case. This call can be used to save some redundant calls to
+  thd_rpl_deadlock_check() if desired. (This is unlikely to matter much
+  unless there are _lots_ of waits to report, as the overhead of
+  thd_rpl_deadlock_check() is small).
 */
 extern "C" int
-thd_need_wait_for(const MYSQL_THD thd)
+thd_need_wait_reports(const MYSQL_THD thd)
 {
  rpl_group_info *rgi;

@@ -4625,75 +4626,9 @@ thd_need_wait_for(const MYSQL_THD thd)
 }

 /*
-  Used by InnoDB/XtraDB to report that one transaction THD is about to go to
-  wait for a transactional lock held by another transactions OTHER_THD.
-
-  This is used for parallel replication, where transactions are required to
-  commit in the same order on the slave as they did on the master. If the
-  transactions on the slave encounters lock conflicts on the slave that did
-  not exist on the master, this can cause deadlocks.
-
-  Normally, such conflicts will not occur, because the same conflict would
-  have prevented the two transactions from committing in parallel on the
-  master, thus preventing them from running in parallel on the slave in the
-  first place. However, it is possible in case when the optimizer chooses a
-  different plan on the slave than on the master (eg. table scan instead of
-  index scan).
-
-  InnoDB/XtraDB reports lock waits using this call. If a lock wait causes a
-  deadlock with the pre-determined commit order, we kill the later transaction,
-  and later re-try it, to resolve the deadlock.
-
-  This call need only receive reports about waits for locks that will remain
-  until the holding transaction commits. InnoDB/XtraDB auto-increment locks
-  are released earlier, and so need not be reported. (Such false positives are
-  not harmful, but could lead to unnecessary kill and retry, so best avoided).
-*/
-extern "C" void
-thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
-{
-  rpl_group_info *rgi;
-  rpl_group_info *other_rgi;
-
-  if (!thd)
-    return;
-  DEBUG_SYNC(thd, "thd_report_wait_for");
-  thd->transaction.stmt.mark_trans_did_wait();
-  if (!other_thd)
-    return;
-  binlog_report_wait_for(thd, other_thd);
-  rgi= thd->rgi_slave;
-  other_rgi= other_thd->rgi_slave;
-  if (!rgi || !other_rgi)
-    return;
-  if (!rgi->is_parallel_exec)
-    return;
-  if (rgi->rli != other_rgi->rli)
-    return;
-  if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
-    return;
-  if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
-    return;
-  if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
-    return;
-  /*
-    This transaction is about to wait for another transaction that is required
-    by replication binlog order to commit after. This would cause a deadlock.
-
-    So send a kill to the other transaction, with a temporary error; this will
-    cause replication to rollback (and later re-try) the other transaction,
-    releasing the lock for this transaction so replication can proceed.
-  */
-  other_rgi->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED;
-  mysql_mutex_lock(&other_thd->LOCK_thd_data);
-  other_thd->awake(KILL_CONNECTION);
-  mysql_mutex_unlock(&other_thd->LOCK_thd_data);
-}
-
-/*
-  Used by storage engines (currently TokuDB) to report that one transaction
-  THD is about to go to wait for a transactional lock held by another
-  transactions OTHER_THD.
+  Used by storage engines (currently TokuDB and InnoDB/XtraDB) to report that
+  one transaction THD is about to go to wait for a transactional lock held by
+  another transactions OTHER_THD.

  This is used for parallel replication, where transactions are required to
  commit in the same order on the slave as they did on the master. If the
@@ -4708,9 +4643,9 @@ thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
  chooses a different plan on the slave than on the master (eg. table scan
  instead of index scan).

-  InnoDB/XtraDB reports lock waits using this call. If a lock wait causes a
-  deadlock with the pre-determined commit order, we kill the later transaction,
-  and later re-try it, to resolve the deadlock.
+  Storage engines report lock waits using this call. If a lock wait causes a
+  deadlock with the pre-determined commit order, we kill the later
+  transaction, and later re-try it, to resolve the deadlock.

  This call need only receive reports about waits for locks that will remain
  until the holding transaction commits. InnoDB/XtraDB auto-increment locks,
@@ -4801,8 +4736,8 @@ thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd)

  Calling this function is just an optimisation to avoid unnecessary
  deadlocks. If it was not used, a gap lock would be set that could eventually
-  cause a deadlock; the deadlock would be caught by thd_report_wait_for() and
-  the transaction T2 killed and rolled back (and later re-tried).
+  cause a deadlock; the deadlock would be caught by thd_rpl_deadlock_check()
+  and the transaction T2 killed and rolled back (and later re-tried).
 */
 extern "C" int
 thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -5685,8 +5685,7 @@ innobase_kill_query(
 				wsrep_thd_is_BF(current_thd, FALSE));
 		}

-		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
-			trx->abort_type == TRX_SERVER_ABORT) {
+		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
 			ut_ad(!lock_mutex_own());
 			lock_mutex_enter();
 			lock_mutex_taken = true;
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -842,8 +842,7 @@ lock_sys->mutex and sometimes by trx->mutex. */

 typedef enum {
 	TRX_SERVER_ABORT = 0,
-	TRX_WSREP_ABORT  = 1,
-	TRX_REPLICATION_ABORT = 2
+	TRX_WSREP_ABORT  = 1
 } trx_abort_t;


--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -70,15 +70,8 @@ static const ulint	TABLE_LOCK_CACHE = 8;
 /** Size in bytes, of the table lock instance */
 static const ulint	TABLE_LOCK_SIZE = sizeof(ib_lock_t);

-/* Buffer to collect THDs to report waits for. */
-struct thd_wait_reports {
-	struct thd_wait_reports *next;	/*!< List link */
-	ulint used;			/*!< How many elements in waitees[] */
-	trx_t *waitees[64];		/*!< Trxs for thd_report_wait_for() */
-};
-
-extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
-extern "C" int thd_need_wait_for(const MYSQL_THD thd);
+extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
+extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
 extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);

 /** Deadlock checker. */
@@ -109,7 +102,7 @@ private:
 		const trx_t*	trx,
 		const lock_t*	wait_lock,
 		ib_uint64_t	mark_start,
-		struct thd_wait_reports* waitee_buf_ptr)
+		bool report_waiters)
 		:
 		m_cost(),
 		m_start(trx),
@@ -117,7 +110,7 @@ private:
 		m_wait_lock(wait_lock),
 		m_mark_start(mark_start),
 		m_n_elems(),
-		m_waitee_ptr(waitee_buf_ptr)
+		m_report_waiters(report_waiters)
 	{
 	}

@@ -276,8 +269,8 @@ private:
 	/** This is to avoid malloc/free calls. */
 	static state_t		s_states[MAX_STACK_SIZE];

-	/** Buffer to collect THDs to report waits for. */
-	struct thd_wait_reports* m_waitee_ptr;
+       /** Set if thd_rpl_deadlock_check() should be called for waits. */
+       bool m_report_waiters;
 };

 /** Counter to mark visited nodes during deadlock search. */
@@ -286,11 +279,6 @@ ib_uint64_t	DeadlockChecker::s_lock_mark_counter = 0;
 /** The stack used for deadlock searches. */
 DeadlockChecker::state_t	DeadlockChecker::s_states[MAX_STACK_SIZE];

-extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
-extern "C" int thd_need_wait_for(const MYSQL_THD thd);
-extern "C"
-int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
-
 #ifdef UNIV_DEBUG
 /*********************************************************************//**
 Validates the lock system.
@@ -2074,14 +2062,6 @@ RecLock::add_to_waitq(const lock_t* wait_for, const lock_prdt_t* prdt)
 	dberr_t	err = deadlock_check(lock);

 	ut_ad(trx_mutex_own(m_trx));
-
-	/* m_trx->mysql_thd is NULL if it's an internal trx. So current_thd is used */
-	if (err == DB_LOCK_WAIT) {
-		ut_ad(wait_for && wait_for->trx);
-		wait_for->trx->abort_type = TRX_REPLICATION_ABORT;
-		thd_report_wait_for(current_thd, wait_for->trx->mysql_thd);
-		wait_for->trx->abort_type = TRX_SERVER_ABORT;
-	}
 	return(err);
 }

@@ -7968,27 +7948,11 @@ DeadlockChecker::search()
 			layer. These locks are released before commit, so they
 			can not cause deadlocks with binlog-fixed commit
 			order. */
-			if (m_waitee_ptr &&
+			if (m_report_waiters &&
 			    (lock_get_type_low(lock) != LOCK_TABLE ||
 			     lock_get_mode(lock) != LOCK_AUTO_INC)) {
-				if (m_waitee_ptr->used ==
-				    sizeof(m_waitee_ptr->waitees) /
-				    sizeof(m_waitee_ptr->waitees[0])) {
-					m_waitee_ptr->next =
-						(struct thd_wait_reports *)
-						ut_malloc_nokey(sizeof(*m_waitee_ptr));
-					m_waitee_ptr = m_waitee_ptr->next;
-
-					if (!m_waitee_ptr) {
-						m_too_deep = true;
-						return (m_start);
-					}
-
-					m_waitee_ptr->next = NULL;
-					m_waitee_ptr->used = 0;
-				}
-
-				m_waitee_ptr->waitees[m_waitee_ptr->used++] = lock->trx;
+				thd_rpl_deadlock_check(m_start->mysql_thd,
+						       lock->trx->mysql_thd);
 			}

 			if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
@@ -8068,47 +8032,6 @@ DeadlockChecker::trx_rollback()
 	trx_mutex_exit(trx);
 }

-static
-void
-lock_report_waiters_to_mysql(
-/*=======================*/
-	struct thd_wait_reports*	waitee_buf_ptr,	/*!< in: set of trxs */
-	THD*				mysql_thd,	/*!< in: THD */
-	const trx_t*			victim_trx)	/*!< in: Trx selected
-							as deadlock victim, if
-							any */
-{
-	struct thd_wait_reports*	p;
-	struct thd_wait_reports*	q;
-	ulint				i;
-
-	p = waitee_buf_ptr;
-	while (p) {
-		i = 0;
-		while (i < p->used) {
-			trx_t *w_trx = p->waitees[i];
-			/*  There is no need to report waits to a trx already
-			selected as a victim. */
-			if (w_trx != victim_trx) {
-				/* If thd_report_wait_for() decides to kill the
-				transaction, then we will get a call back into
-				innobase_kill_query. We mark this by setting
-				current_lock_mutex_owner, so we can avoid trying
-				to recursively take lock_sys->mutex. */
-				w_trx->abort_type = TRX_REPLICATION_ABORT;
-				thd_report_wait_for(mysql_thd, w_trx->mysql_thd);
-				w_trx->abort_type = TRX_SERVER_ABORT;
-			}
-			++i;
-		}
-		q = p->next;
-		if (p != waitee_buf_ptr) {
-			ut_free(p);
-		}
-		p = q;
-	}
-}
-
 /** Checks if a joining lock request results in a deadlock. If a deadlock is
 found this function will resolve the deadlock by choosing a victim transaction
 and rolling it back. It will attempt to resolve all deadlocks. The returned
@@ -8127,36 +8050,20 @@ DeadlockChecker::check_and_resolve(const lock_t* lock, const trx_t* trx)
 	ut_ad(!srv_read_only_mode);

 	const trx_t*	victim_trx;
-	struct thd_wait_reports	waitee_buf;
-	struct thd_wait_reports*waitee_buf_ptr;
-	THD*			start_mysql_thd;
+	THD*		start_mysql_thd;
+	bool report_waits = false;

 	start_mysql_thd = trx->mysql_thd;

-	if (start_mysql_thd && thd_need_wait_for(start_mysql_thd)) {
-		waitee_buf_ptr = &waitee_buf;
-	} else {
-		waitee_buf_ptr = NULL;
-	}
+	if (start_mysql_thd && thd_need_wait_reports(start_mysql_thd))
+		report_waits = true;

 	/* Try and resolve as many deadlocks as possible. */
 	do {
-		if (waitee_buf_ptr) {
-			waitee_buf_ptr->next = NULL;
-			waitee_buf_ptr->used = 0;
-		}
-
-		DeadlockChecker	checker(trx, lock, s_lock_mark_counter, waitee_buf_ptr);
+		DeadlockChecker	checker(trx, lock, s_lock_mark_counter, report_waits);

 		victim_trx = checker.search();

-		/* Report waits to upper layer, as needed. */
-		if (waitee_buf_ptr) {
-			lock_report_waiters_to_mysql(waitee_buf_ptr,
-						     start_mysql_thd,
-						     victim_trx);
-		}
-
 		/* Search too deep, we rollback the joining transaction only
 		if it is possible to rollback. Otherwise we rollback the
 		transaction that is holding the lock that the joining
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -5441,8 +5441,7 @@ innobase_kill_connection(
 			wsrep_thd_is_BF(current_thd, FALSE),
 			lock_get_info(trx->lock.wait_lock).c_str());

-		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
-		    trx->abort_type == TRX_SERVER_ABORT) {
+		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
 			ut_ad(!lock_mutex_own());
 			lock_mutex_enter();
 		}
@@ -5462,8 +5461,7 @@ innobase_kill_connection(
 			trx_mutex_exit(trx);
 		}

-		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
-		    trx->abort_type == TRX_SERVER_ABORT) {
+		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
 			lock_mutex_exit();
 		}
 	}
--- a/storage/xtradb/include/trx0trx.h
+++ b/storage/xtradb/include/trx0trx.h
@@ -707,8 +707,7 @@ lock_sys->mutex and sometimes by trx->mutex. */

 typedef enum {
 	TRX_SERVER_ABORT = 0,
-	TRX_WSREP_ABORT  = 1,
-	TRX_REPLICATION_ABORT = 2
+	TRX_WSREP_ABORT  = 1
 } trx_abort_t;

 struct trx_t{
--- a/storage/xtradb/lock/lock0lock.cc
+++ b/storage/xtradb/lock/lock0lock.cc
@@ -380,8 +380,8 @@ struct lock_stack_t {
 	ulint		heap_no;		/*!< heap number if rec lock */
 };

-extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
-extern "C" int thd_need_wait_for(const MYSQL_THD thd);
+extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
+extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
 extern "C"
 int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);

@@ -406,7 +406,7 @@ UNIV_INTERN mysql_pfs_key_t	lock_sys_wait_mutex_key;
 struct thd_wait_reports {
 	struct thd_wait_reports *next;	/*!< List link */
 	ulint used;			/*!< How many elements in waitees[] */
-	trx_t *waitees[64];		/*!< Trxs for thd_report_wait_for() */
+	trx_t *waitees[64];		/*!< Trxs for thd_rpl_deadlock_check() */
 };


@@ -4489,14 +4489,7 @@ lock_report_waiters_to_mysql(
 			/*  There is no need to report waits to a trx already
 			selected as a victim. */
 			if (w_trx->id != victim_trx_id) {
-				/* If thd_report_wait_for() decides to kill the
-				transaction, then we will get a call back into
-				innobase_kill_query. We mark this by setting
-				current_lock_mutex_owner, so we can avoid trying
-				to recursively take lock_sys->mutex. */
-				w_trx->abort_type = TRX_REPLICATION_ABORT;
-				thd_report_wait_for(mysql_thd, w_trx->mysql_thd);
-				w_trx->abort_type = TRX_SERVER_ABORT;
+				thd_rpl_deadlock_check(mysql_thd, w_trx->mysql_thd);
 			}
 			++i;
 		}
@@ -4535,7 +4528,7 @@ lock_deadlock_check_and_resolve(
 	assert_trx_in_list(trx);

 	start_mysql_thd = trx->mysql_thd;
-	if (start_mysql_thd && thd_need_wait_for(start_mysql_thd)) {
+	if (start_mysql_thd && thd_need_wait_reports(start_mysql_thd)) {
 		waitee_buf_ptr = &waitee_buf;
 	} else {
 		waitee_buf_ptr = NULL;