1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

Move InnoDB/XtraDB to async deadlock kill for parallel replication.

In 10.2, use the thd_rpl_deadlock_check() API. This way, all the
locking hacks around thd_report_wait_for() can be removed. Now
parallel replication deadlock kill happens asynchroneously, from the
slave background thread.

In InnoDB, remove also the buffering of wait reports, to simplify the
code, as this is no longer needed when the locking issues are gone.

In XtraDB, the buffering is kept for now. This is just because
presumably XtraDB will eventually be updated to MySQL 5.7-based InnoDB
as well, so there is little need to modify the existing code only for
clean-up purposes.

The old synchronous function thd_report_wait_for() is no longer used
and removed in this patch.

Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
Kristian Nielsen
2016-10-17 12:52:14 +02:00
parent 4d3e3669ec
commit cc49f00994
7 changed files with 41 additions and 211 deletions

View File

@@ -4598,19 +4598,20 @@ extern "C" int thd_rpl_is_parallel(const MYSQL_THD thd)
}
/*
This function can optionally be called to check if thd_report_wait_for()
This function can optionally be called to check if thd_rpl_deadlock_check()
needs to be called for waits done by a given transaction.
If this function returns false for a given thd, there is no need to do any
calls to thd_report_wait_for() on that thd.
If this function returns false for a given thd, there is no need to do
any calls to thd_rpl_deadlock_check() on that thd.
This call is optional; it is safe to call thd_report_wait_for() in any case.
This call can be used to save some redundant calls to thd_report_wait_for()
if desired. (This is unlikely to matter much unless there are _lots_ of
waits to report, as the overhead of thd_report_wait_for() is small).
This call is optional; it is safe to call thd_rpl_deadlock_check() in
any case. This call can be used to save some redundant calls to
thd_rpl_deadlock_check() if desired. (This is unlikely to matter much
unless there are _lots_ of waits to report, as the overhead of
thd_rpl_deadlock_check() is small).
*/
extern "C" int
thd_need_wait_for(const MYSQL_THD thd)
thd_need_wait_reports(const MYSQL_THD thd)
{
rpl_group_info *rgi;
@@ -4625,75 +4626,9 @@ thd_need_wait_for(const MYSQL_THD thd)
}
/*
Used by InnoDB/XtraDB to report that one transaction THD is about to go to
wait for a transactional lock held by another transactions OTHER_THD.
This is used for parallel replication, where transactions are required to
commit in the same order on the slave as they did on the master. If the
transactions on the slave encounters lock conflicts on the slave that did
not exist on the master, this can cause deadlocks.
Normally, such conflicts will not occur, because the same conflict would
have prevented the two transactions from committing in parallel on the
master, thus preventing them from running in parallel on the slave in the
first place. However, it is possible in case when the optimizer chooses a
different plan on the slave than on the master (eg. table scan instead of
index scan).
InnoDB/XtraDB reports lock waits using this call. If a lock wait causes a
deadlock with the pre-determined commit order, we kill the later transaction,
and later re-try it, to resolve the deadlock.
This call need only receive reports about waits for locks that will remain
until the holding transaction commits. InnoDB/XtraDB auto-increment locks
are released earlier, and so need not be reported. (Such false positives are
not harmful, but could lead to unnecessary kill and retry, so best avoided).
*/
extern "C" void
thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
{
rpl_group_info *rgi;
rpl_group_info *other_rgi;
if (!thd)
return;
DEBUG_SYNC(thd, "thd_report_wait_for");
thd->transaction.stmt.mark_trans_did_wait();
if (!other_thd)
return;
binlog_report_wait_for(thd, other_thd);
rgi= thd->rgi_slave;
other_rgi= other_thd->rgi_slave;
if (!rgi || !other_rgi)
return;
if (!rgi->is_parallel_exec)
return;
if (rgi->rli != other_rgi->rli)
return;
if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
return;
if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
return;
if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
return;
/*
This transaction is about to wait for another transaction that is required
by replication binlog order to commit after. This would cause a deadlock.
So send a kill to the other transaction, with a temporary error; this will
cause replication to rollback (and later re-try) the other transaction,
releasing the lock for this transaction so replication can proceed.
*/
other_rgi->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED;
mysql_mutex_lock(&other_thd->LOCK_thd_data);
other_thd->awake(KILL_CONNECTION);
mysql_mutex_unlock(&other_thd->LOCK_thd_data);
}
/*
Used by storage engines (currently TokuDB) to report that one transaction
THD is about to go to wait for a transactional lock held by another
transactions OTHER_THD.
Used by storage engines (currently TokuDB and InnoDB/XtraDB) to report that
one transaction THD is about to go to wait for a transactional lock held by
another transactions OTHER_THD.
This is used for parallel replication, where transactions are required to
commit in the same order on the slave as they did on the master. If the
@@ -4708,9 +4643,9 @@ thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
chooses a different plan on the slave than on the master (eg. table scan
instead of index scan).
InnoDB/XtraDB reports lock waits using this call. If a lock wait causes a
deadlock with the pre-determined commit order, we kill the later transaction,
and later re-try it, to resolve the deadlock.
Storage engines report lock waits using this call. If a lock wait causes a
deadlock with the pre-determined commit order, we kill the later
transaction, and later re-try it, to resolve the deadlock.
This call need only receive reports about waits for locks that will remain
until the holding transaction commits. InnoDB/XtraDB auto-increment locks,
@@ -4801,8 +4736,8 @@ thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd)
Calling this function is just an optimisation to avoid unnecessary
deadlocks. If it was not used, a gap lock would be set that could eventually
cause a deadlock; the deadlock would be caught by thd_report_wait_for() and
the transaction T2 killed and rolled back (and later re-tried).
cause a deadlock; the deadlock would be caught by thd_rpl_deadlock_check()
and the transaction T2 killed and rolled back (and later re-tried).
*/
extern "C" int
thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)

View File

@@ -5685,8 +5685,7 @@ innobase_kill_query(
wsrep_thd_is_BF(current_thd, FALSE));
}
if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
trx->abort_type == TRX_SERVER_ABORT) {
if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
ut_ad(!lock_mutex_own());
lock_mutex_enter();
lock_mutex_taken = true;

View File

@@ -842,8 +842,7 @@ lock_sys->mutex and sometimes by trx->mutex. */
typedef enum {
TRX_SERVER_ABORT = 0,
TRX_WSREP_ABORT = 1,
TRX_REPLICATION_ABORT = 2
TRX_WSREP_ABORT = 1
} trx_abort_t;

View File

@@ -70,15 +70,8 @@ static const ulint TABLE_LOCK_CACHE = 8;
/** Size in bytes, of the table lock instance */
static const ulint TABLE_LOCK_SIZE = sizeof(ib_lock_t);
/* Buffer to collect THDs to report waits for. */
struct thd_wait_reports {
struct thd_wait_reports *next; /*!< List link */
ulint used; /*!< How many elements in waitees[] */
trx_t *waitees[64]; /*!< Trxs for thd_report_wait_for() */
};
extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_for(const MYSQL_THD thd);
extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
/** Deadlock checker. */
@@ -109,7 +102,7 @@ private:
const trx_t* trx,
const lock_t* wait_lock,
ib_uint64_t mark_start,
struct thd_wait_reports* waitee_buf_ptr)
bool report_waiters)
:
m_cost(),
m_start(trx),
@@ -117,7 +110,7 @@ private:
m_wait_lock(wait_lock),
m_mark_start(mark_start),
m_n_elems(),
m_waitee_ptr(waitee_buf_ptr)
m_report_waiters(report_waiters)
{
}
@@ -276,8 +269,8 @@ private:
/** This is to avoid malloc/free calls. */
static state_t s_states[MAX_STACK_SIZE];
/** Buffer to collect THDs to report waits for. */
struct thd_wait_reports* m_waitee_ptr;
/** Set if thd_rpl_deadlock_check() should be called for waits. */
bool m_report_waiters;
};
/** Counter to mark visited nodes during deadlock search. */
@@ -286,11 +279,6 @@ ib_uint64_t DeadlockChecker::s_lock_mark_counter = 0;
/** The stack used for deadlock searches. */
DeadlockChecker::state_t DeadlockChecker::s_states[MAX_STACK_SIZE];
extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_for(const MYSQL_THD thd);
extern "C"
int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
#ifdef UNIV_DEBUG
/*********************************************************************//**
Validates the lock system.
@@ -2074,14 +2062,6 @@ RecLock::add_to_waitq(const lock_t* wait_for, const lock_prdt_t* prdt)
dberr_t err = deadlock_check(lock);
ut_ad(trx_mutex_own(m_trx));
/* m_trx->mysql_thd is NULL if it's an internal trx. So current_thd is used */
if (err == DB_LOCK_WAIT) {
ut_ad(wait_for && wait_for->trx);
wait_for->trx->abort_type = TRX_REPLICATION_ABORT;
thd_report_wait_for(current_thd, wait_for->trx->mysql_thd);
wait_for->trx->abort_type = TRX_SERVER_ABORT;
}
return(err);
}
@@ -7968,27 +7948,11 @@ DeadlockChecker::search()
layer. These locks are released before commit, so they
can not cause deadlocks with binlog-fixed commit
order. */
if (m_waitee_ptr &&
if (m_report_waiters &&
(lock_get_type_low(lock) != LOCK_TABLE ||
lock_get_mode(lock) != LOCK_AUTO_INC)) {
if (m_waitee_ptr->used ==
sizeof(m_waitee_ptr->waitees) /
sizeof(m_waitee_ptr->waitees[0])) {
m_waitee_ptr->next =
(struct thd_wait_reports *)
ut_malloc_nokey(sizeof(*m_waitee_ptr));
m_waitee_ptr = m_waitee_ptr->next;
if (!m_waitee_ptr) {
m_too_deep = true;
return (m_start);
}
m_waitee_ptr->next = NULL;
m_waitee_ptr->used = 0;
}
m_waitee_ptr->waitees[m_waitee_ptr->used++] = lock->trx;
thd_rpl_deadlock_check(m_start->mysql_thd,
lock->trx->mysql_thd);
}
if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
@@ -8068,47 +8032,6 @@ DeadlockChecker::trx_rollback()
trx_mutex_exit(trx);
}
static
void
lock_report_waiters_to_mysql(
/*=======================*/
struct thd_wait_reports* waitee_buf_ptr, /*!< in: set of trxs */
THD* mysql_thd, /*!< in: THD */
const trx_t* victim_trx) /*!< in: Trx selected
as deadlock victim, if
any */
{
struct thd_wait_reports* p;
struct thd_wait_reports* q;
ulint i;
p = waitee_buf_ptr;
while (p) {
i = 0;
while (i < p->used) {
trx_t *w_trx = p->waitees[i];
/* There is no need to report waits to a trx already
selected as a victim. */
if (w_trx != victim_trx) {
/* If thd_report_wait_for() decides to kill the
transaction, then we will get a call back into
innobase_kill_query. We mark this by setting
current_lock_mutex_owner, so we can avoid trying
to recursively take lock_sys->mutex. */
w_trx->abort_type = TRX_REPLICATION_ABORT;
thd_report_wait_for(mysql_thd, w_trx->mysql_thd);
w_trx->abort_type = TRX_SERVER_ABORT;
}
++i;
}
q = p->next;
if (p != waitee_buf_ptr) {
ut_free(p);
}
p = q;
}
}
/** Checks if a joining lock request results in a deadlock. If a deadlock is
found this function will resolve the deadlock by choosing a victim transaction
and rolling it back. It will attempt to resolve all deadlocks. The returned
@@ -8127,36 +8050,20 @@ DeadlockChecker::check_and_resolve(const lock_t* lock, const trx_t* trx)
ut_ad(!srv_read_only_mode);
const trx_t* victim_trx;
struct thd_wait_reports waitee_buf;
struct thd_wait_reports*waitee_buf_ptr;
THD* start_mysql_thd;
THD* start_mysql_thd;
bool report_waits = false;
start_mysql_thd = trx->mysql_thd;
if (start_mysql_thd && thd_need_wait_for(start_mysql_thd)) {
waitee_buf_ptr = &waitee_buf;
} else {
waitee_buf_ptr = NULL;
}
if (start_mysql_thd && thd_need_wait_reports(start_mysql_thd))
report_waits = true;
/* Try and resolve as many deadlocks as possible. */
do {
if (waitee_buf_ptr) {
waitee_buf_ptr->next = NULL;
waitee_buf_ptr->used = 0;
}
DeadlockChecker checker(trx, lock, s_lock_mark_counter, waitee_buf_ptr);
DeadlockChecker checker(trx, lock, s_lock_mark_counter, report_waits);
victim_trx = checker.search();
/* Report waits to upper layer, as needed. */
if (waitee_buf_ptr) {
lock_report_waiters_to_mysql(waitee_buf_ptr,
start_mysql_thd,
victim_trx);
}
/* Search too deep, we rollback the joining transaction only
if it is possible to rollback. Otherwise we rollback the
transaction that is holding the lock that the joining

View File

@@ -5441,8 +5441,7 @@ innobase_kill_connection(
wsrep_thd_is_BF(current_thd, FALSE),
lock_get_info(trx->lock.wait_lock).c_str());
if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
trx->abort_type == TRX_SERVER_ABORT) {
if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
ut_ad(!lock_mutex_own());
lock_mutex_enter();
}
@@ -5462,8 +5461,7 @@ innobase_kill_connection(
trx_mutex_exit(trx);
}
if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
trx->abort_type == TRX_SERVER_ABORT) {
if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
lock_mutex_exit();
}
}

View File

@@ -707,8 +707,7 @@ lock_sys->mutex and sometimes by trx->mutex. */
typedef enum {
TRX_SERVER_ABORT = 0,
TRX_WSREP_ABORT = 1,
TRX_REPLICATION_ABORT = 2
TRX_WSREP_ABORT = 1
} trx_abort_t;
struct trx_t{

View File

@@ -380,8 +380,8 @@ struct lock_stack_t {
ulint heap_no; /*!< heap number if rec lock */
};
extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_for(const MYSQL_THD thd);
extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
extern "C"
int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
@@ -406,7 +406,7 @@ UNIV_INTERN mysql_pfs_key_t lock_sys_wait_mutex_key;
struct thd_wait_reports {
struct thd_wait_reports *next; /*!< List link */
ulint used; /*!< How many elements in waitees[] */
trx_t *waitees[64]; /*!< Trxs for thd_report_wait_for() */
trx_t *waitees[64]; /*!< Trxs for thd_rpl_deadlock_check() */
};
@@ -4489,14 +4489,7 @@ lock_report_waiters_to_mysql(
/* There is no need to report waits to a trx already
selected as a victim. */
if (w_trx->id != victim_trx_id) {
/* If thd_report_wait_for() decides to kill the
transaction, then we will get a call back into
innobase_kill_query. We mark this by setting
current_lock_mutex_owner, so we can avoid trying
to recursively take lock_sys->mutex. */
w_trx->abort_type = TRX_REPLICATION_ABORT;
thd_report_wait_for(mysql_thd, w_trx->mysql_thd);
w_trx->abort_type = TRX_SERVER_ABORT;
thd_rpl_deadlock_check(mysql_thd, w_trx->mysql_thd);
}
++i;
}
@@ -4535,7 +4528,7 @@ lock_deadlock_check_and_resolve(
assert_trx_in_list(trx);
start_mysql_thd = trx->mysql_thd;
if (start_mysql_thd && thd_need_wait_for(start_mysql_thd)) {
if (start_mysql_thd && thd_need_wait_reports(start_mysql_thd)) {
waitee_buf_ptr = &waitee_buf;
} else {
waitee_buf_ptr = NULL;