mariadb/storage/tokudb/PerconaFT/locktree/lock_request.cc
Kristian Nielsen d145d1b6ee fix bogus stalls in the lock tree for low concurrency applications
Merge into the MariaDB tree the pull request from Rich Prohaska for
PerconaFT. These changes are needed to get parallel replication to
work with TokuDB. Once the pull request is accepted by Percona and the new upstream version enters MariaDB, this commit can be superseded.

Original commit message from Rich Prohaska:

    1. Fix the release before wait race

    The release before wait race occurs when a lock is released by transaction A after transaction B tried to acquire it, but before transaction B has a chance to register its pending lock request.  There are several ways to fix this problem, but we want to optimize for the common situation of minimal lock conflicts, which is what the lock acquisition algorithm currently does.  Our solution to the release before wait race is for transaction B to retry the lock request after it has been added to the pending lock set.
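
    A condensed sketch of that fix (simplified from the lock_request
    implementation below, using hypothetical stand-in types rather than
    the real locktree API):

        #include <mutex>
        #include <set>

        enum { LOCK_GRANTED = 0, LOCK_NOT_GRANTED = -1 };

        struct Request;
        struct LockTable {
            std::mutex mtx;
            std::set<Request *> pending;    // pending lock requests
            int try_acquire(Request *) { return LOCK_NOT_GRANTED; }    // stub
        };

        // Close the release-before-wait race: a lock released after our failed
        // acquire but before we registered as pending would never signal us,
        // so we retry once more after inserting into the pending set.
        int lock_register_then_retry(LockTable &t, Request *req) {
            int r = t.try_acquire(req);         // fast path, no mutex taken
            if (r == LOCK_NOT_GRANTED) {
                std::lock_guard<std::mutex> g(t.mtx);
                t.pending.insert(req);          // releasers can now see us
                r = t.try_acquire(req);         // retry closes the race window
                if (r == LOCK_GRANTED)
                    t.pending.erase(req);
                // else: stay pending and wait on a condition variable
            }
            return r;
        }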

    2. Fix the retry race

    The retry race occurs in the current lock retry algorithm, which assumes that if some transaction is already running lock retry, then my transaction does not also need to run it.  There is a chance that some pending lock requests will be skipped, but these lock requests will eventually time out.  For applications with small numbers of concurrent transactions, timeouts will occur frequently and application throughput will be very low.

    The solution to the retry race is to use a group retry algorithm.  All threads run through the retry logic.  Sequence numbers are used to group retries into batches such that one transaction can run the retry logic on behalf of several transactions.  This amortizes the retry cost.  The sequence numbers also ensure that when a transaction releases its locks, all of the pending lock requests that it is blocking are retried.
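
    The same algorithm in sketch form (hypothetical stand-in types; the real
    logic is in retry_all_lock_requests() below):

        #include <atomic>
        #include <mutex>
        #include <vector>

        struct Request {
            int retry() { return 0; }    // stub: 0 means the lock was granted
        };

        struct LockTable {
            std::atomic<unsigned long long> retry_want{0};
            unsigned long long retry_done = 0;    // guarded by mtx
            std::mutex mtx;
            std::vector<Request *> pending;
        };

        // Group retry: every caller bumps retry_want, but one thread may run
        // the scan on behalf of all generations up to the one it observes.
        void retry_all(LockTable &t) {
            t.retry_want++;
            std::lock_guard<std::mutex> g(t.mtx);
            unsigned long long gen = t.retry_want.load();
            if (gen > t.retry_done) {    // not already covered by another thread
                for (size_t i = 0; i < t.pending.size();) {
                    if (t.pending[i]->retry() == 0)
                        t.pending.erase(t.pending.begin() + i);    // granted; next slides into slot i
                    else
                        i++;    // still blocked; leave it pending
                }
                t.retry_done = gen;    // mark this batch of generations as covered
            }
        }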

    3. Implement a mechanism to find and kill a pending lock request

    Lock requests are tagged with a client id.  The client id is used as a key into the pending lock request sets to find a lock request, which is then completed with a lock timeout error.
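
    In sketch form (hypothetical stand-in types; the real code is in the
    kill_waiter() functions below):

        #include <mutex>
        #include <vector>

        struct Request {
            void *extra = nullptr;    // client id tag supplied with the request
            void timeout() {}         // stub: complete with a lock timeout error
                                      // and wake the waiting thread
        };

        // find the pending request tagged with this client id, if any, and
        // fail it so the waiter wakes up with a lock timeout error.
        void kill_waiter(std::mutex &mtx, std::vector<Request *> &pending, void *extra) {
            std::lock_guard<std::mutex> g(mtx);
            for (Request *req : pending) {
                if (req->extra == extra) {
                    req->timeout();
                    break;
                }
            }
        }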

    Copyright (c) 2016, Rich Prohaska
    All rights reserved.

    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2016-11-23 16:48:31 +01:00


/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "portability/toku_race_tools.h"
#include "ft/txn/txn.h"
#include "locktree/locktree.h"
#include "locktree/lock_request.h"
#include "util/dbt.h"
namespace toku {
// initialize a lock request's internals
void lock_request::create(void) {
    m_txnid = TXNID_NONE;
    m_conflicting_txnid = TXNID_NONE;
    m_start_time = 0;
    m_left_key = nullptr;
    m_right_key = nullptr;
    toku_init_dbt(&m_left_key_copy);
    toku_init_dbt(&m_right_key_copy);
    m_type = type::UNKNOWN;
    m_lt = nullptr;
    m_complete_r = 0;
    m_state = state::UNINITIALIZED;
    m_info = nullptr;
    toku_cond_init(&m_wait_cond, nullptr);
    m_start_test_callback = nullptr;
    m_start_before_pending_test_callback = nullptr;
    m_retry_test_callback = nullptr;
}

// destroy a lock request.
void lock_request::destroy(void) {
    invariant(m_state != state::PENDING);
    invariant(m_state != state::DESTROYED);
    m_state = state::DESTROYED;
    toku_destroy_dbt(&m_left_key_copy);
    toku_destroy_dbt(&m_right_key_copy);
    toku_cond_destroy(&m_wait_cond);
}

// set the lock request parameters. this API allows a lock request to be reused.
void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, lock_request::type lock_type, bool big_txn, void *extra) {
    invariant(m_state != state::PENDING);
    m_lt = lt;
    m_txnid = txnid;
    m_left_key = left_key;
    m_right_key = right_key;
    toku_destroy_dbt(&m_left_key_copy);
    toku_destroy_dbt(&m_right_key_copy);
    m_type = lock_type;
    m_state = state::INITIALIZED;
    m_info = lt ? lt->get_lock_request_info() : nullptr;
    m_big_txn = big_txn;
    m_extra = extra;
}
// get rid of any stored left and right key copies and
// replace them with copies of the given left and right key
void lock_request::copy_keys() {
    if (!toku_dbt_is_infinite(m_left_key)) {
        toku_clone_dbt(&m_left_key_copy, *m_left_key);
        m_left_key = &m_left_key_copy;
    }
    if (!toku_dbt_is_infinite(m_right_key)) {
        toku_clone_dbt(&m_right_key_copy, *m_right_key);
        m_right_key = &m_right_key_copy;
    }
}

// what are the conflicts for this pending lock request?
void lock_request::get_conflicts(txnid_set *conflicts) {
    invariant(m_state == state::PENDING);
    const bool is_write_request = m_type == type::WRITE;
    m_lt->get_conflicts(is_write_request, m_txnid, m_left_key, m_right_key, conflicts);
}
// build a wait-for-graph for this lock request and the given conflict set
// for each transaction B that blocks A's lock request
// if B is blocked then
// add (A,B) to the WFG and if B is new, fill in the WFG from B
void lock_request::build_wait_graph(wfg *wait_graph, const txnid_set &conflicts) {
    size_t num_conflicts = conflicts.size();
    for (size_t i = 0; i < num_conflicts; i++) {
        TXNID conflicting_txnid = conflicts.get(i);
        lock_request *conflicting_request = find_lock_request(conflicting_txnid);
        invariant(conflicting_txnid != m_txnid);
        invariant(conflicting_request != this);
        if (conflicting_request) {
            bool already_exists = wait_graph->node_exists(conflicting_txnid);
            wait_graph->add_edge(m_txnid, conflicting_txnid);
            if (!already_exists) {
                // recursively build the wait for graph rooted at the conflicting
                // request, given its set of lock conflicts.
                txnid_set other_conflicts;
                other_conflicts.create();
                conflicting_request->get_conflicts(&other_conflicts);
                conflicting_request->build_wait_graph(wait_graph, other_conflicts);
                other_conflicts.destroy();
            }
        }
    }
}

// returns: true if the current set of lock requests contains
//          a deadlock, false otherwise.
bool lock_request::deadlock_exists(const txnid_set &conflicts) {
    wfg wait_graph;
    wait_graph.create();
    build_wait_graph(&wait_graph, conflicts);
    bool deadlock = wait_graph.cycle_exists_from_txnid(m_txnid);
    wait_graph.destroy();
    return deadlock;
}
// try to acquire a lock described by this lock request.
int lock_request::start(void) {
    int r;

    txnid_set conflicts;
    conflicts.create();
    if (m_type == type::WRITE) {
        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
    } else {
        invariant(m_type == type::READ);
        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
    }

    // if the lock is not granted, save it to the set of lock requests
    // and check for a deadlock. if there is one, complete it as failed
    if (r == DB_LOCK_NOTGRANTED) {
        copy_keys();
        m_state = state::PENDING;
        m_start_time = toku_current_time_microsec() / 1000;
        m_conflicting_txnid = conflicts.get(0);
        if (m_start_before_pending_test_callback) m_start_before_pending_test_callback();
        toku_mutex_lock(&m_info->mutex);
        insert_into_lock_requests();
        if (deadlock_exists(conflicts)) {
            remove_from_lock_requests();
            r = DB_LOCK_DEADLOCK;
        }
        toku_mutex_unlock(&m_info->mutex);
        if (m_start_test_callback) m_start_test_callback();  // test callback
    }

    if (r != DB_LOCK_NOTGRANTED) {
        complete(r);
    }

    conflicts.destroy();
    return r;
}
// sleep on the lock request until it becomes resolved or the wait time has elapsed.
int lock_request::wait(uint64_t wait_time_ms) {
    return wait(wait_time_ms, 0, nullptr);
}

int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void)) {
    uint64_t t_now = toku_current_time_microsec();
    uint64_t t_start = t_now;
    uint64_t t_end = t_start + wait_time_ms * 1000;

    toku_mutex_lock(&m_info->mutex);

    // check again, this time locking out other retry calls
    if (m_state == state::PENDING) {
        retry();
    }

    while (m_state == state::PENDING) {
        // check if this thread is killed
        if (killed_callback && killed_callback()) {
            remove_from_lock_requests();
            complete(DB_LOCK_NOTGRANTED);
            continue;
        }

        // compute next wait time
        uint64_t t_wait;
        if (killed_time_ms == 0) {
            t_wait = t_end;
        } else {
            t_wait = t_now + killed_time_ms * 1000;
            if (t_wait > t_end)
                t_wait = t_end;
        }
        struct timespec ts = {};
        ts.tv_sec = t_wait / 1000000;
        ts.tv_nsec = (t_wait % 1000000) * 1000;
        int r = toku_cond_timedwait(&m_wait_cond, &m_info->mutex, &ts);
        invariant(r == 0 || r == ETIMEDOUT);

        t_now = toku_current_time_microsec();
        if (m_state == state::PENDING && t_now >= t_end) {
            m_info->counters.timeout_count += 1;
            // if we're still pending and we timed out, then remove our
            // request from the set of lock requests and fail.
            remove_from_lock_requests();
            // complete sets m_state to COMPLETE, breaking us out of the loop
            complete(DB_LOCK_NOTGRANTED);
        }
    }

    uint64_t t_real_end = toku_current_time_microsec();
    uint64_t duration = t_real_end - t_start;
    m_info->counters.wait_count += 1;
    m_info->counters.wait_time += duration;
    if (duration >= 1000000) {
        m_info->counters.long_wait_count += 1;
        m_info->counters.long_wait_time += duration;
    }
    toku_mutex_unlock(&m_info->mutex);

    invariant(m_state == state::COMPLETE);
    return m_complete_r;
}
// complete this lock request with the given return value
void lock_request::complete(int complete_r) {
    m_complete_r = complete_r;
    m_state = state::COMPLETE;
}

const DBT *lock_request::get_left_key(void) const {
    return m_left_key;
}

const DBT *lock_request::get_right_key(void) const {
    return m_right_key;
}

TXNID lock_request::get_txnid(void) const {
    return m_txnid;
}

uint64_t lock_request::get_start_time(void) const {
    return m_start_time;
}

TXNID lock_request::get_conflicting_txnid(void) const {
    return m_conflicting_txnid;
}

int lock_request::retry(void) {
    invariant(m_state == state::PENDING);
    int r;

    txnid_set conflicts;
    conflicts.create();
    if (m_type == type::WRITE) {
        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
    } else {
        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
    }

    // if the acquisition succeeded then remove ourselves from the
    // set of lock requests, complete, and signal the waiting thread.
    if (r == 0) {
        remove_from_lock_requests();
        complete(r);
        if (m_retry_test_callback) m_retry_test_callback();  // test callback
        toku_cond_broadcast(&m_wait_cond);
    } else {
        m_conflicting_txnid = conflicts.get(0);
    }

    conflicts.destroy();
    return r;
}
void lock_request::retry_all_lock_requests(locktree *lt, void (*after_retry_all_test_callback)(void)) {
    lt_lock_request_info *info = lt->get_lock_request_info();

    info->retry_want++;

    // if there are no pending lock requests, then there is nothing to do.
    // the unlocked data race on pending_is_empty is OK since lock requests
    // are retried after being added to the pending set.
    if (info->pending_is_empty)
        return;

    toku_mutex_lock(&info->mutex);

    // here is the group retry algorithm.
    // get the latest retry_want count and use it as the generation number of
    // this retry operation. if this retry generation is > the last retry
    // generation, then do the lock retries. otherwise, no lock retries are needed.
    unsigned long long retry_gen = info->retry_want.load();
    if (retry_gen > info->retry_done) {

        // retry all of the pending lock requests.
        for (size_t i = 0; i < info->pending_lock_requests.size(); ) {
            lock_request *request;
            int r = info->pending_lock_requests.fetch(i, &request);
            invariant_zero(r);

            // retry this lock request. if it didn't succeed,
            // move on to the next lock request. otherwise
            // the request is gone from the list so we may
            // read the i'th entry for the next one.
            r = request->retry();
            if (r != 0) {
                i++;
            }
        }
        if (after_retry_all_test_callback) after_retry_all_test_callback();
        info->retry_done = retry_gen;
    }

    toku_mutex_unlock(&info->mutex);
}
void *lock_request::get_extra(void) const {
    return m_extra;
}

void lock_request::kill_waiter(void) {
    remove_from_lock_requests();
    complete(DB_LOCK_NOTGRANTED);
    toku_cond_broadcast(&m_wait_cond);
}

void lock_request::kill_waiter(locktree *lt, void *extra) {
    lt_lock_request_info *info = lt->get_lock_request_info();
    toku_mutex_lock(&info->mutex);
    for (size_t i = 0; i < info->pending_lock_requests.size(); i++) {
        lock_request *request;
        int r = info->pending_lock_requests.fetch(i, &request);
        if (r == 0 && request->get_extra() == extra) {
            request->kill_waiter();
            break;
        }
    }
    toku_mutex_unlock(&info->mutex);
}
// find another lock request by txnid. must hold the mutex.
lock_request *lock_request::find_lock_request(const TXNID &txnid) {
    lock_request *request;
    int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(txnid, &request, nullptr);
    if (r != 0) {
        request = nullptr;
    }
    return request;
}

// insert this lock request into the locktree's set. must hold the mutex.
void lock_request::insert_into_lock_requests(void) {
    uint32_t idx;
    lock_request *request;
    int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(m_txnid, &request, &idx);
    invariant(r == DB_NOTFOUND);
    r = m_info->pending_lock_requests.insert_at(this, idx);
    invariant_zero(r);
    m_info->pending_is_empty = false;
}

// remove this lock request from the locktree's set. must hold the mutex.
void lock_request::remove_from_lock_requests(void) {
    uint32_t idx;
    lock_request *request;
    int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(m_txnid, &request, &idx);
    invariant_zero(r);
    invariant(request == this);
    r = m_info->pending_lock_requests.delete_at(idx);
    invariant_zero(r);
    if (m_info->pending_lock_requests.size() == 0)
        m_info->pending_is_empty = true;
}

int lock_request::find_by_txnid(lock_request * const &request, const TXNID &txnid) {
    TXNID request_txnid = request->m_txnid;
    if (request_txnid < txnid) {
        return -1;
    } else if (request_txnid == txnid) {
        return 0;
    } else {
        return 1;
    }
}

void lock_request::set_start_test_callback(void (*f)(void)) {
    m_start_test_callback = f;
}

void lock_request::set_start_before_pending_test_callback(void (*f)(void)) {
    m_start_before_pending_test_callback = f;
}

void lock_request::set_retry_test_callback(void (*f)(void)) {
    m_retry_test_callback = f;
}

} /* namespace toku */