1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-29 05:21:33 +03:00

MDEV-4991: GTID binlog indexing

Improve the performance of slave connect using B+-Tree indexes on each binlog
file. The index allows fast lookup of a GTID position to the corresponding
offset in the binlog file, as well as lookup of a position to find the
corresponding GTID position.

This eliminates a costly sequential scan of the starting binlog file
to find the GTID starting position when a slave connects. This is
especially costly if the binlog file is not cached in memory (IO
cost), or if it is encrypted or a lot of slaves connect simultaneously
(CPU cost).

The size of the index files is generally less than 1% of the binlog data, so
not expected to be an issue.

Most of the work writing the index is done as a background task, in
the binlog background thread. This minimises the performance impact on
transaction commit. A simple global mutex is used to protect index
reads and (background) index writes; this is fine as slave connect is
a relatively infrequent operation.

Here are the user-visible options and status variables. The feature is on by
default and is expected to need no tuning or configuration for most users.

binlog_gtid_index
  On by default. Can be used to disable the indexes for testing purposes.

binlog_gtid_index_page_size (default 4096)
  Page size to use for the binlog GTID index. This is the size of the nodes
  in the B+-tree used internally in the index. A very small page size (64 is
  the minimum) will be less efficient, but can be used to stress the
  B+-tree code during testing.

binlog_gtid_index_span_min (default 65536)
  Control sparseness of the binlog GTID index. If set to N, at most one
  index record will be added for every N bytes of binlog file written.
  This can be used to reduce the number of records in the index; the
  only cost is having to scan a few more events in the binlog file
  before finding the target position.

Two status variables are available to monitor the use of the GTID indexes:

  Binlog_gtid_index_hit
  Binlog_gtid_index_miss

The "hit" status increments for each successful lookup in a GTID index.
The "miss" status increments when a lookup is not possible. This indicates
that the index file is missing (e.g. the binlog was written by an old server
version without GTID index support) or corrupt.

Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
Kristian Nielsen
2023-09-08 13:12:49 +02:00
parent 20741b9237
commit d039346a7a
32 changed files with 4315 additions and 256 deletions

View File

@ -31,6 +31,7 @@
#include "semisync_master.h"
#include "semisync_slave.h"
#include "mysys_err.h"
#include "gtid_index.h"
enum enum_gtid_until_state {
@ -1286,6 +1287,100 @@ end:
return err;
}
/*
  Helper for gtid_find_binlog_pos(): test one binlog file against the
  position requested by a connecting slave.

  The GTID index of the file is consulted first when a reader is supplied and
  the index can be opened. If the index cannot answer, the Gtid_list_log_event
  at the start of the binlog file is read and compared instead.

  Return value:
     0   This binlog file contains the position. When the answer came from
         the index, *found_in_index is set to true and *out_start_seek /
         *found_count hold what the index lookup stored. Otherwise *out_glev
         holds the Gtid_list event read from the file (possibly null).
         The normalized file name is copied into out_name.
     1   This binlog file is too new to contain the slave position.
    -1   Error; *out_errormsg points to the error string.

  A non-null *out_glev is owned by the caller, who must delete it.
*/
static int
gtid_check_binlog_file(slave_connection_state *state,
                       Gtid_index_reader_hot *reader,
                       const binlog_file_entry *list,
                       bool *found_in_index, uint32 *out_start_seek,
                       uint32 *found_count,
                       char *out_name, Gtid_list_log_event **out_glev,
                       const char **out_errormsg)
{
  char binlog_name[FN_REFLEN];

  /* Give all output parameters a defined value up front. */
  *found_in_index= false;
  *out_glev= nullptr;
  *out_errormsg= nullptr;

  if (normalize_binlog_name(binlog_name, list->name.str, false))
  {
    *out_errormsg= "Failed to determine binlog file name while looking for "
                   "GTID position in binlog";
    return -1;
  }

  /* Preferred path: resolve the position through the GTID index. */
  if (likely(reader && !reader->open_index_file(binlog_name)))
  {
    const int lookup= reader->search_gtid_pos(state, out_start_seek,
                                              found_count);
    reader->close_index_file();
    if (lookup >= 0)
    {
      statistic_increment(binlog_gtid_index_hit, &LOCK_status);
      /* A zero result is reported to the caller as "file too new". */
      if (lookup == 0)
      {
        return 1;
      }
      strmake(out_name, binlog_name, FN_REFLEN);
      *found_in_index= true;
      return 0;
    }
    /*
      Negative result: the index lookup failed. Continue below and use the
      GTID_LIST event from the binlog file itself.
    */
  }

  /* No usable index answer; count it and read the binlog file header. */
  statistic_increment(binlog_gtid_index_miss, &LOCK_status);

  IO_CACHE cache;
  bzero((char*) &cache, sizeof(cache));
  const File file= open_binlog(&cache, binlog_name, out_errormsg);
  if (unlikely(file == (File)-1))
  {
    return -1;
  }

  Gtid_list_log_event *glev= nullptr;
  *out_errormsg= get_gtid_list_event(&cache, &glev);
  end_io_cache(&cache);
  mysql_file_close(file, MYF(MY_WME));
  if (unlikely(*out_errormsg))
  {
    return -1;
  }

  /*
    With a Gtid_list event that does not cover the slave position, the file
    is too new and the event is of no further use.
  */
  if (glev && !contains_all_slave_gtid(state, glev))
  {
    delete glev;
    return 1;
  }

  /* Either no Gtid_list event was found, or it covers the slave position. */
  strmake(out_name, binlog_name, FN_REFLEN);
  *out_glev= glev;
  *out_errormsg= nullptr;
  return 0;
}
/*
Find the name of the binlog file to start reading for a slave that connects
using GTID state.
@ -1314,14 +1409,17 @@ end:
the requested GTID that was already purged.
*/
static const char *
gtid_find_binlog_file(slave_connection_state *state, char *out_name,
slave_connection_state *until_gtid_state)
gtid_find_binlog_pos(slave_connection_state *state, char *out_name,
slave_connection_state *until_gtid_state,
rpl_binlog_state *until_binlog_state,
bool *found_in_index, uint32 *out_start_seek)
{
MEM_ROOT memroot;
binlog_file_entry *list;
Gtid_list_log_event *glev= NULL;
const char *errormsg= NULL;
char buf[FN_REFLEN];
Gtid_index_reader_hot *reader= NULL;
*found_in_index= false;
init_alloc_root(PSI_INSTRUMENT_ME, &memroot,
10*(FN_REFLEN+sizeof(binlog_file_entry)), 0,
@ -1332,48 +1430,41 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
goto end;
}
if (opt_binlog_gtid_index)
reader= new Gtid_index_reader_hot();
while (list)
{
File file;
IO_CACHE cache;
if (!list->next)
{
/*
It should be safe to read the currently used binlog, as we will only
read the header part that is already written.
But if that does not work on windows, then we will need to cache the
event somewhere in memory I suppose - that could work too.
*/
}
/*
Read the Gtid_list_log_event at the start of the binlog file to
get the binlog state.
*/
if (normalize_binlog_name(buf, list->name.str, false))
{
errormsg= "Failed to determine binlog file name while looking for "
"GTID position in binlog";
uint32 found_count;
int res= gtid_check_binlog_file(state, reader, list, found_in_index,
out_start_seek, &found_count,
out_name, &glev, &errormsg);
if (res < 0)
goto end;
}
bzero((char*) &cache, sizeof(cache));
if (unlikely((file= open_binlog(&cache, buf, &errormsg)) == (File)-1))
goto end;
errormsg= get_gtid_list_event(&cache, &glev);
end_io_cache(&cache);
mysql_file_close(file, MYF(MY_WME));
if (unlikely(errormsg))
goto end;
if (!glev || contains_all_slave_gtid(state, glev))
if (res == 0)
{
strmake(out_name, buf, FN_REFLEN);
if (glev)
if (*found_in_index || glev)
{
uint32 i;
uint32 count;
rpl_gtid *gtids;
if (*found_in_index)
{
count= found_count;
gtids= reader->search_gtid_list();
/*
Load the initial GTID state corresponding to the position found in
the GTID index, as we will not have a GTID_LIST event to load it
from.
*/
until_binlog_state->load(gtids, count);
}
else
{
count= glev->count;
gtids= glev->list;
}
/*
As a special case, we allow to start from binlog file N if the
requested GTID is the last event (in the corresponding domain) in
@ -1385,9 +1476,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
from the UNTIL hash, to mark that such domains have already reached
their UNTIL condition.
*/
for (i= 0; i < glev->count; ++i)
for (i= 0; i < count; ++i)
{
const rpl_gtid *gtid= state->find(glev->list[i].domain_id);
const rpl_gtid *gtid= state->find(gtids[i].domain_id);
if (!gtid)
{
/*
@ -1400,8 +1491,8 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
further GTIDs in the Gtid_list.
*/
DBUG_ASSERT(0);
} else if (gtid->server_id == glev->list[i].server_id &&
gtid->seq_no == glev->list[i].seq_no)
} else if (gtid->server_id == gtids[i].server_id &&
gtid->seq_no == gtids[i].seq_no)
{
/*
The slave requested to start from the very beginning of this
@ -1412,9 +1503,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
}
if (until_gtid_state &&
(gtid= until_gtid_state->find(glev->list[i].domain_id)) &&
gtid->server_id == glev->list[i].server_id &&
gtid->seq_no <= glev->list[i].seq_no)
(gtid= until_gtid_state->find(gtids[i].domain_id)) &&
gtid->server_id == gtids[i].server_id &&
gtid->seq_no <= gtids[i].seq_no)
{
/*
We've already reached the stop position in UNTIL for this domain,
@ -1427,8 +1518,6 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
goto end;
}
delete glev;
glev= NULL;
list= list->next;
}
@ -1441,11 +1530,56 @@ end:
if (glev)
delete glev;
if (reader)
delete reader;
free_root(&memroot, MYF(0));
return errormsg;
}
static bool
gtid_index_lookup_pos(const char *name, uint32 offset, uint32 *out_start_seek,
slave_connection_state *out_gtid_state)
{
Gtid_index_reader_hot *reader= nullptr;
bool opened= false;
bool found= false;
uint32 found_offset, found_gtid_count;
rpl_gtid *found_gtids;
int res;
if (!(reader= new Gtid_index_reader_hot()) ||
reader->open_index_file(name))
{
statistic_increment(binlog_gtid_index_miss, &LOCK_status);
goto err;
}
opened= true;
res= reader->search_offset(offset, &found_offset, &found_gtid_count);
if (res <= 0)
{
statistic_increment(binlog_gtid_index_miss, &LOCK_status);
goto err;
}
statistic_increment(binlog_gtid_index_hit, &LOCK_status);
/* We found the position, initialize the state from the index. */
found_gtids= reader->search_gtid_list();
if (out_gtid_state->load(found_gtids, found_gtid_count))
goto err;
*out_start_seek= found_offset;
found= true;
err:
if (opened)
reader->close_index_file();
if (reader)
delete reader;
return found;
}
/*
Given an old-style binlog position with file name and file offset, find the
corresponding gtid position. If the offset is not at an event boundary, give
@ -1469,8 +1603,22 @@ gtid_state_from_pos(const char *name, uint32 offset,
int err;
String packet;
Format_description_log_event *fdev= NULL;
bool found_in_index;
uint32 UNINIT_VAR(start_seek);
bool seek_done= false;
if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0)))
/*
Try to lookup the position in the binlog gtid index. If found (as it will
usually be unless the index is corrupted somehow), we can seek directly to
a point at or just before the desired location, saving an expensive scan
of the binlog file from the start.
*/
found_in_index= opt_binlog_gtid_index ?
gtid_index_lookup_pos(name, offset, &start_seek, gtid_state) :
false;
if (found_in_index)
found_gtid_list_event= true;
else if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0)))
{
errormsg= "Internal error (out of memory?) initializing slave state "
"while scanning binlog to find start position";
@ -1559,6 +1707,25 @@ gtid_state_from_pos(const char *name, uint32 offset,
errormsg= "Could not start decryption of binlog.";
goto end;
}
if (found_in_index && !seek_done)
{
/*
Just to avoid a redundant event read before hitting the next branch.
ToDo: share this code with the below somehow.
*/
my_b_seek(&cache, start_seek);
seek_done= true;
}
}
else if (found_in_index && !seek_done)
{
/*
After reading the format_description event and possibly
start_encryption, we can seek forward to avoid most or all of the scan
(depending on the sparseness of the index).
*/
my_b_seek(&cache, start_seek);
seek_done= true;
}
else if (unlikely(typ != FORMAT_DESCRIPTION_EVENT &&
!found_format_description_event))
@ -1570,7 +1737,7 @@ gtid_state_from_pos(const char *name, uint32 offset,
else if (typ == ROTATE_EVENT || typ == STOP_EVENT ||
typ == BINLOG_CHECKPOINT_EVENT)
continue; /* Continue looking */
else if (typ == GTID_LIST_EVENT)
else if (typ == GTID_LIST_EVENT && !found_in_index)
{
rpl_gtid *gtid_list;
bool status;
@ -1798,7 +1965,7 @@ send_event_to_slave(binlog_send_info *info, Log_event_type event_type,
}
});
if (info->until_binlog_state.update_nolock(&event_gtid, false))
if (info->until_binlog_state.update_nolock(&event_gtid))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
return "Failed in internal GTID book-keeping: Out of memory";
@ -2198,6 +2365,8 @@ static int init_binlog_sender(binlog_send_info *info,
char search_file_name[FN_REFLEN];
const char *name=search_file_name;
bool found_in_index= false;
uint32 start_seek= 0;
if (info->using_gtid_state)
{
if (info->gtid_state.load(connect_gtid_state.ptr(),
@ -2223,16 +2392,26 @@ static int init_binlog_sender(binlog_send_info *info,
info->error= error;
return 1;
}
if ((info->errmsg= gtid_find_binlog_file(&info->gtid_state,
search_file_name,
info->until_gtid_state)))
if ((info->errmsg= gtid_find_binlog_pos(&info->gtid_state,
search_file_name,
info->until_gtid_state,
&info->until_binlog_state,
&found_in_index, &start_seek)))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
return 1;
}
/* start from beginning of binlog file */
*pos = 4;
if (found_in_index)
{
/* Start from a position looked up in the binlog gtid index. */
*pos = start_seek;
}
else
{
/* start from beginning of binlog file */
*pos = 4;
}
}
else
{
@ -2865,6 +3044,7 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
ushort flags)
{
LOG_INFO linfo;
ulong ev_offset;
IO_CACHE log;
File file = -1;
@ -2990,6 +3170,34 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
if (info->until_gtid_state && info->until_gtid_state->count() == 0)
info->gtid_until_group= GTID_UNTIL_STOP_AFTER_STANDALONE;
if (info->using_gtid_state && pos > BIN_LOG_HEADER_SIZE &&
( info->gtid_state.is_pos_reached() ||
info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE ) )
{
/*
We are starting a GTID connect from a point not at the start of the
binlog file (from a GTID index lookup). Send a fake GTID_LIST event
in place of the real GTID_LIST that would normally be sent from the
start of the binlog file.
If we already reached the gtid UNTIL position, then set the
FLAG_UNTIL_REACHED in the GTID_LIST event and stop immediately.
*/
uint32 flag= 0;
if (info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE)
{
flag= Gtid_list_log_event::FLAG_UNTIL_REACHED;
info->should_stop= true;
}
Gtid_list_log_event glev(&info->until_binlog_state, flag);
if (reset_transmit_packet(info, info->flags, &ev_offset, &info->errmsg) ||
fake_gtid_list_event(info, &glev, &info->errmsg, (int32)pos))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
goto err;
}
}
THD_STAGE_INFO(thd, stage_sending_binlog_event_to_slave);
if (send_one_binlog_file(info, &log, &linfo, pos))
break;