mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
MDEV-4506: Parallel replication. Intermediate commit.
Implement facility for the commit in one thread to wait for the commit of another to complete first. The wait is done in a way that does not hinder that a waiter and a waitee can group commit together with a single fsync() in both binlog and InnoDB. The wait is done efficiently with respect to locking. The patch was originally made to support TaoBao parallel replication with in-order commit; now it will be adapted to also be used for parallel replication of group-committed transactions. A waiter THD registers itself with a prior waitee THD. The waiter will then complete its commit at the earliest in the same group commit of the waitee (when using binlog). The wait can also be done explicitly by the waitee.
This commit is contained in:
193
sql/log.cc
193
sql/log.cc
@@ -6542,44 +6542,199 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
|
||||
}
|
||||
|
||||
bool
|
||||
MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
|
||||
MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *entry,
|
||||
wait_for_commit *wfc)
|
||||
{
|
||||
group_commit_entry *orig_queue;
|
||||
wait_for_commit *list, *cur, *last;
|
||||
|
||||
/*
|
||||
To facilitate group commit for the binlog, we first queue up ourselves in
|
||||
the group commit queue. Then the first thread to enter the queue waits for
|
||||
the LOCK_log mutex, and commits for everyone in the queue once it gets the
|
||||
lock. Any other threads in the queue just wait for the first one to finish
|
||||
the commit and wake them up.
|
||||
|
||||
To support in-order parallel replication with group commit, after we add
|
||||
some transaction to the queue, we check if there were other transactions
|
||||
already prepared to commit but just waiting for the first one to commit.
|
||||
If so, we add those to the queue as well, transitively for all waiters.
|
||||
*/
|
||||
|
||||
entry->thd->clear_wakeup_ready();
|
||||
mysql_mutex_lock(&LOCK_prepare_ordered);
|
||||
group_commit_entry *orig_queue= group_commit_queue;
|
||||
entry->next= orig_queue;
|
||||
group_commit_queue= entry;
|
||||
orig_queue= group_commit_queue;
|
||||
|
||||
if (entry->cache_mngr->using_xa)
|
||||
/*
|
||||
Iteratively process everything added to the queue, looking for waiters,
|
||||
and their waiters, and so on. If a waiter is ready to commit, we
|
||||
immediately add it to the queue; if not we just wake it up.
|
||||
|
||||
This would be natural to do with recursion, but we want to avoid
|
||||
potentially unbounded recursion blowing the C stack, so we use the list
|
||||
approach instead.
|
||||
*/
|
||||
list= wfc;
|
||||
cur= list;
|
||||
last= list;
|
||||
for (;;)
|
||||
{
|
||||
DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered");
|
||||
run_prepare_ordered(entry->thd, entry->all);
|
||||
DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
|
||||
/* Add the entry to the group commit queue. */
|
||||
entry->next= group_commit_queue;
|
||||
group_commit_queue= entry;
|
||||
|
||||
if (entry->cache_mngr->using_xa)
|
||||
{
|
||||
DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered");
|
||||
run_prepare_ordered(entry->thd, entry->all);
|
||||
DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
|
||||
}
|
||||
|
||||
if (!cur)
|
||||
break; // Can happen if initial entry has no wait_for_commit
|
||||
|
||||
if (cur->subsequent_commits_list)
|
||||
{
|
||||
bool have_lock;
|
||||
wait_for_commit *waiter;
|
||||
|
||||
mysql_mutex_lock(&cur->LOCK_wait_commit);
|
||||
have_lock= true;
|
||||
waiter= cur->subsequent_commits_list;
|
||||
/* Check again, now safely under lock. */
|
||||
if (waiter)
|
||||
{
|
||||
/* Grab the list of waiters and process it. */
|
||||
cur->subsequent_commits_list= NULL;
|
||||
do
|
||||
{
|
||||
wait_for_commit *next= waiter->next_subsequent_commit;
|
||||
group_commit_entry *entry2=
|
||||
(group_commit_entry *)waiter->opaque_pointer;
|
||||
if (entry2)
|
||||
{
|
||||
/*
|
||||
This is another transaction ready to be written to the binary
|
||||
log. We can put it into the queue directly, without needing a
|
||||
separate context switch to the other thread. We just set a flag
|
||||
so that the other thread will know when it wakes up that it was
|
||||
already processed.
|
||||
|
||||
So put it at the end of the list to be processed in a subsequent
|
||||
iteration of the outer loop.
|
||||
*/
|
||||
entry2->queued_by_other= true;
|
||||
last->next_subsequent_commit= waiter;
|
||||
last= waiter;
|
||||
/*
|
||||
As a small optimisation, we do not actually need to set
|
||||
waiter->next_subsequent_commit to NULL, as we can use the
|
||||
pointer `last' to check for end-of-list.
|
||||
*/
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
Wake up the waiting transaction.
|
||||
|
||||
For this, we need to set the "wakeup running" flag and release
|
||||
the waitee lock to avoid a deadlock, see comments on
|
||||
THD::wakeup_subsequent_commits2() for details.
|
||||
*/
|
||||
if (have_lock)
|
||||
{
|
||||
cur->wakeup_subsequent_commits_running= true;
|
||||
mysql_mutex_unlock(&cur->LOCK_wait_commit);
|
||||
have_lock= false;
|
||||
}
|
||||
waiter->wakeup();
|
||||
}
|
||||
waiter= next;
|
||||
} while (waiter);
|
||||
}
|
||||
if (have_lock)
|
||||
mysql_mutex_unlock(&cur->LOCK_wait_commit);
|
||||
}
|
||||
if (cur == last)
|
||||
break;
|
||||
cur= cur->next_subsequent_commit;
|
||||
entry= (group_commit_entry *)cur->opaque_pointer;
|
||||
DBUG_ASSERT(entry != NULL);
|
||||
}
|
||||
|
||||
/* Now we need to clear the wakeup_subsequent_commits_running flags. */
|
||||
if (list)
|
||||
{
|
||||
for (;;)
|
||||
{
|
||||
if (list->wakeup_subsequent_commits_running)
|
||||
{
|
||||
mysql_mutex_lock(&list->LOCK_wait_commit);
|
||||
list->wakeup_subsequent_commits_running= false;
|
||||
mysql_mutex_unlock(&list->LOCK_wait_commit);
|
||||
}
|
||||
if (list == last)
|
||||
break;
|
||||
list= list->next_subsequent_commit;
|
||||
}
|
||||
}
|
||||
|
||||
mysql_mutex_unlock(&LOCK_prepare_ordered);
|
||||
DEBUG_SYNC(entry->thd, "commit_after_release_LOCK_prepare_ordered");
|
||||
|
||||
return orig_queue == NULL;
|
||||
}
|
||||
|
||||
bool
|
||||
MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
|
||||
{
|
||||
wait_for_commit *wfc;
|
||||
bool is_leader;
|
||||
|
||||
wfc= entry->thd->wait_for_commit_ptr;
|
||||
entry->queued_by_other= false;
|
||||
if (wfc && wfc->waiting_for_commit)
|
||||
{
|
||||
mysql_mutex_lock(&wfc->LOCK_wait_commit);
|
||||
/* Do an extra check here, this time safely under lock. */
|
||||
if (wfc->waiting_for_commit)
|
||||
{
|
||||
wfc->opaque_pointer= entry;
|
||||
do
|
||||
{
|
||||
mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit);
|
||||
} while (wfc->waiting_for_commit);
|
||||
wfc->opaque_pointer= NULL;
|
||||
}
|
||||
mysql_mutex_unlock(&wfc->LOCK_wait_commit);
|
||||
}
|
||||
|
||||
if (entry->queued_by_other)
|
||||
is_leader= false;
|
||||
else
|
||||
is_leader= queue_for_group_commit(entry, wfc);
|
||||
|
||||
/*
|
||||
The first in the queue handle group commit for all; the others just wait
|
||||
The first in the queue handles group commit for all; the others just wait
|
||||
to be signalled when group commit is done.
|
||||
*/
|
||||
if (orig_queue != NULL)
|
||||
if (is_leader)
|
||||
trx_group_commit_leader(entry);
|
||||
else if (!entry->queued_by_other)
|
||||
entry->thd->wait_for_wakeup_ready();
|
||||
else
|
||||
trx_group_commit_leader(entry);
|
||||
{
|
||||
/*
|
||||
If we were queued by another prior commit, then we are woken up
|
||||
only when the leader has already completed the commit for us.
|
||||
So nothing to do here then.
|
||||
*/
|
||||
}
|
||||
|
||||
if (!opt_optimize_thread_scheduling)
|
||||
{
|
||||
/* For the leader, trx_group_commit_leader() already took the lock. */
|
||||
if (orig_queue != NULL)
|
||||
if (!is_leader)
|
||||
mysql_mutex_lock(&LOCK_commit_ordered);
|
||||
|
||||
DEBUG_SYNC(entry->thd, "commit_loop_entry_commit_ordered");
|
||||
@@ -6598,7 +6753,10 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
|
||||
|
||||
if (next)
|
||||
{
|
||||
next->thd->signal_wakeup_ready();
|
||||
if (next->queued_by_other)
|
||||
next->thd->wait_for_commit_ptr->wakeup();
|
||||
else
|
||||
next->thd->signal_wakeup_ready();
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -6884,7 +7042,12 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
|
||||
*/
|
||||
next= current->next;
|
||||
if (current != leader) // Don't wake up ourself
|
||||
current->thd->signal_wakeup_ready();
|
||||
{
|
||||
if (current->queued_by_other)
|
||||
current->thd->wait_for_commit_ptr->wakeup();
|
||||
else
|
||||
current->thd->signal_wakeup_ready();
|
||||
}
|
||||
current= next;
|
||||
}
|
||||
DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
|
||||
@@ -7514,6 +7677,8 @@ int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
|
||||
mysql_mutex_unlock(&LOCK_prepare_ordered);
|
||||
}
|
||||
|
||||
thd->wait_for_prior_commit();
|
||||
|
||||
cookie= 0;
|
||||
if (xid)
|
||||
cookie= log_one_transaction(xid);
|
||||
|
Reference in New Issue
Block a user