1
0
mirror of https://github.com/codership/wsrep-lib.git synced 2025-07-03 16:22:35 +03:00

Cache rollback events that failed to replicate for later retry

This patch introduces a queue to store the ids of transactions that failed
to send a rollback fragment in streaming_rollback(). This is to avoid
potentially missed rollback fragments when a cluster splits and then
later reforms. Rollback fragments would be missing if a node rolled
back a transaction locally (either BFed or voluntary rollback) while
non-primary, and the attempt to send the rollback fragment failed in
transaction::streaming_rollback().
Transactions that fail to send a rollback fragment can proceed to
roll back locally. However, we must ensure that rollback fragments for
those transactions are eventually delivered by the cluster. This must
be done before a potentially conflicting writeset causes BF-BF
conflicts in the rest of the cluster.
This commit is contained in:
Daniele Sciascia
2021-09-24 10:45:34 +02:00
parent efb4aab090
commit 22921e7082
7 changed files with 443 additions and 28 deletions

View File

@ -1113,6 +1113,14 @@ void wsrep::server_state::on_sync()
}
}
init_synced_ = true;
enum wsrep::provider::status status(send_pending_rollback_events(lock));
if (status)
{
// TODO should be retried?
wsrep::log_warning()
<< "Failed to flush rollback event cache: " << status;
}
}
int wsrep::server_state::on_apply(
@ -1456,6 +1464,10 @@ void wsrep::server_state::close_orphaned_sr_transactions(
// - (1 non-primary) and (2 non-primary)
// - (1,2 primary)
// We need to rollback SRs owned by both 1 and 2.
// Notice that since the introduction of rollback_event_queue_,
// checking for equal consecutive views is no longer needed.
// However, we must keep it here for the time being, for backwards
// compatibility.
const bool equal_consecutive_views =
current_view_.equal_membership(previous_primary_view_);
@ -1504,10 +1516,8 @@ void wsrep::server_state::close_orphaned_sr_transactions(
if ((streaming_applier->transaction().state() !=
wsrep::transaction::s_prepared) &&
(equal_consecutive_views ||
(std::find_if(current_view_.members().begin(),
current_view_.members().end(),
server_id_cmp(i->first.first)) ==
current_view_.members().end())))
not current_view_.is_member(
streaming_applier->transaction().server_id())))
{
WSREP_LOG_DEBUG(wsrep::log::debug_log_level(),
wsrep::log::debug_level_server_state,
@ -1580,3 +1590,50 @@ void wsrep::server_state::close_transactions_at_disconnect(
}
streaming_appliers_recovered_ = false;
}
//
// Rollback event queue
//
// Append a transaction id to the rollback event queue.
//
// The server state mutex is acquired for the duration of the call,
// so the caller must not already hold it.
//
// @param id Id of the transaction to be queued.
void wsrep::server_state::queue_rollback_event(
    const wsrep::transaction_id& id)
{
    wsrep::unique_lock<wsrep::mutex> guard(mutex_);
#ifndef NDEBUG
    // Debug builds only: verify that the id is not queued already.
    // Release builds skip the scan, since the caller
    // (streaming_rollback()) is expected to avoid duplicates.
    for (auto it = rollback_event_queue_.begin();
         it != rollback_event_queue_.end();
         ++it)
    {
        assert(id != *it);
    }
#endif
    rollback_event_queue_.push_back(id);
}
// Drain the rollback event queue by passing each queued transaction id
// to provider().rollback(), starting from the front of the queue.
//
// An id is removed from the queue only after the provider call for it
// returned a zero status. On the first non-zero status the loop stops,
// the failing id stays at the front of the queue, and that status is
// returned to the caller.
//
// @param lock Lock which must be held by the caller (asserted).
//
// @return wsrep::provider::success once the queue is empty, otherwise
//         the first non-zero status returned by the provider.
enum wsrep::provider::status
wsrep::server_state::send_pending_rollback_events(
    wsrep::unique_lock<wsrep::mutex>& lock WSREP_UNUSED)
{
    assert(lock.owns_lock());
    for (;;)
    {
        if (rollback_event_queue_.empty())
        {
            return wsrep::provider::success;
        }
        const wsrep::transaction_id& next_id(rollback_event_queue_.front());
        const enum wsrep::provider::status rc(provider().rollback(next_id));
        if (rc)
        {
            // Leave the id queued so that it can be sent again later.
            return rc;
        }
        rollback_event_queue_.pop_front();
    }
}
enum wsrep::provider::status
wsrep::server_state::send_pending_rollback_events()
{
wsrep::unique_lock<wsrep::mutex> lock(mutex_);
return send_pending_rollback_events(lock);
}