1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-08 11:22:35 +03:00

MDEV-34705: Binlog-in-engine: Implement file header page

Now the first page of each binlog tablespace file is reserved as a file
header, replacing the use of extra fields in the first gtid state record of
the file. The header is primarily used during recovery, especially to get
the file LSN before which no redo should be applied to the file.

Using a dedicated page makes it possible to durably sync the file header to
disk after RESET MASTER (and at first server startup) and not have it
overwritten (and potentially corrupted) later; this guarantees that the
recovery will have at least one file header to look at to determine from
which LSN to apply redo records.

Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
Kristian Nielsen
2025-04-02 10:21:08 +02:00
parent 21751e21f1
commit e1055af14f
7 changed files with 406 additions and 196 deletions

View File

@@ -18,11 +18,11 @@ binlog-000001.ibb 262144
FLUSH BINARY LOGS; FLUSH BINARY LOGS;
SHOW BINARY LOGS; SHOW BINARY LOGS;
Log_name File_size Log_name File_size
binlog-000000.ibb 36864 binlog-000000.ibb 40960
binlog-000001.ibb 262144 binlog-000001.ibb 262144
binlog-000002.ibb 262144 binlog-000002.ibb 262144
SET STATEMENT sql_log_bin=0 FOR SET STATEMENT sql_log_bin=0 FOR
CALL mtr.add_suppression("InnoDB: Page corruption in binlog tablespace file page number 0"); CALL mtr.add_suppression("InnoDB: Page corruption in binlog tablespace file page number 1");
FLUSH BINARY LOGS; FLUSH BINARY LOGS;
FLUSH BINARY LOGS; FLUSH BINARY LOGS;
SHOW BINLOG EVENTS IN 'binlog-000000.ibb' LIMIT 1; SHOW BINLOG EVENTS IN 'binlog-000000.ibb' LIMIT 1;
@@ -83,7 +83,7 @@ binlog-000022.ibb 262144
binlog-000023.ibb 262144 binlog-000023.ibb 262144
binlog-000024.ibb 262144 binlog-000024.ibb 262144
SET @now= NOW(); SET @now= NOW();
*** Do 187 inserts ... *** Do 149 inserts ...
PURGE BINARY LOGS BEFORE @now; PURGE BINARY LOGS BEFORE @now;
SHOW BINARY LOGS; SHOW BINARY LOGS;
Log_name File_size Log_name File_size

View File

@@ -27,22 +27,23 @@ FLUSH BINARY LOGS;
SHOW BINARY LOGS; SHOW BINARY LOGS;
# Flush couple logs so we are sure the first file is on disk. # Flush couple logs so we are sure the first file is on disk.
# Corrupt one bit in the first page of the first file to test that crc32 # Corrupt one bit in the first data page of the first file to test that crc32
# mismatch is caught. # mismatch is caught.
SET STATEMENT sql_log_bin=0 FOR SET STATEMENT sql_log_bin=0 FOR
CALL mtr.add_suppression("InnoDB: Page corruption in binlog tablespace file page number 0"); CALL mtr.add_suppression("InnoDB: Page corruption in binlog tablespace file page number 1");
FLUSH BINARY LOGS; FLUSH BINARY LOGS;
FLUSH BINARY LOGS; FLUSH BINARY LOGS;
--let $file= binlog-000000.ibb --let $file= binlog-000000.ibb
--let $datadir= `SELECT @@datadir` --let $datadir= `SELECT @@datadir`
--let BINLOG_FILE= $datadir/$file --let BINLOG_FILE= $datadir/$file
perl; perl;
my $pos= 4096 + 50; # Early byte in page 1 (page 0 is file header).
open F, '+<', $ENV{BINLOG_FILE} or die $!; open F, '+<', $ENV{BINLOG_FILE} or die $!;
sysseek F, 50, 0 or die $!; sysseek F, $pos, 0 or die $!;
my $x; my $x;
sysread F, $x, 1 or die $!; sysread F, $x, 1 or die $!;
$x= chr(ord($x) ^ (1 <<3)); $x= chr(ord($x) ^ (1 <<3));
sysseek F, 50, 0 or die $!; sysseek F, $pos, 0 or die $!;
syswrite F, $x, 1 or die $!; syswrite F, $x, 1 or die $!;
EOF EOF
@@ -146,7 +147,7 @@ SHOW BINARY LOGS;
--sleep 1 --sleep 1
SET @now= NOW(); SET @now= NOW();
--sleep 1 --sleep 1
--let $num_insert= `SELECT floor(256*1.5*1024/2100)` --let $num_insert= `SELECT floor(256*1.2*1024/2100)`
--echo *** Do $num_insert inserts ... --echo *** Do $num_insert inserts ...
--disable_query_log --disable_query_log
BEGIN; BEGIN;

View File

@@ -57,6 +57,6 @@ EOF
--source include/wait_until_connected_again.inc --source include/wait_until_connected_again.inc
--let $binlog_file= --let $binlog_file=
--let $binlog_start= 0 --let $binlog_start= 4
--source include/show_binlog_events.inc --source include/show_binlog_events.inc
DROP TABLE t1; DROP TABLE t1;

View File

@@ -58,9 +58,10 @@ ulong ibb_page_size= (1 << ibb_page_size_shift);
This value must be used over the setting innodb_binlog_state_interval, This value must be used over the setting innodb_binlog_state_interval,
because after a restart the latest binlog file will be using the value of the because after a restart the latest binlog file will be using the value of the
setting prior to the restart; the new value of the setting (if different) setting prior to the restart; the new value of the setting (if different)
will be used for newly created binlog files. will be used for newly created binlog files. The value refers to the file
of active_binlog_file_no.
*/ */
uint32_t current_binlog_state_interval; uint64_t current_binlog_state_interval;
/* /*
Mutex protecting active_binlog_file_no. Mutex protecting active_binlog_file_no.
@@ -583,7 +584,7 @@ fsp_binlog_page_fifo::flush_thread_run()
if (all_flushed && file_no <= first_file_no) if (all_flushed && file_no <= first_file_no)
all_flushed= flush_one_page(file_no + 1, false); all_flushed= flush_one_page(file_no + 1, false);
} }
if (all_flushed) if (all_flushed && !flush_thread_end)
my_cond_wait(&m_cond, &m_mutex.m_mutex); my_cond_wait(&m_cond, &m_mutex.m_mutex);
} }
@@ -597,21 +598,30 @@ size_t
crc32_pwrite_page(File fd, byte *buf, uint32_t page_no, myf MyFlags) noexcept crc32_pwrite_page(File fd, byte *buf, uint32_t page_no, myf MyFlags) noexcept
{ {
const uint32_t payload= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM; const uint32_t payload= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM;
mach_write_to_4(buf + payload, my_crc32c(0, buf, payload)); int4store(buf + payload, my_crc32c(0, buf, payload));
return my_pwrite(fd, (const uchar *)buf, ibb_page_size, return my_pwrite(fd, (const uchar *)buf, ibb_page_size,
(my_off_t)page_no << ibb_page_size_shift, MyFlags); (my_off_t)page_no << ibb_page_size_shift, MyFlags);
} }
size_t /*
Read a page, with CRC check.
Returns:
-1 error
0 EOF
1 Ok
*/
int
crc32_pread_page(File fd, byte *buf, uint32_t page_no, myf MyFlags) noexcept crc32_pread_page(File fd, byte *buf, uint32_t page_no, myf MyFlags) noexcept
{ {
size_t res= my_pread(fd, buf, ibb_page_size, size_t read= my_pread(fd, buf, ibb_page_size,
(my_off_t)page_no << ibb_page_size_shift, MyFlags); (my_off_t)page_no << ibb_page_size_shift, MyFlags);
if (UNIV_LIKELY(res == ibb_page_size)) int res= 1;
if (UNIV_LIKELY(read == ibb_page_size))
{ {
const uint32_t payload= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM; const uint32_t payload= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM;
uint32_t crc32= mach_read_from_4(buf + payload); uint32_t crc32= uint4korr(buf + payload);
/* Allow a completely zero (empty) page as well. */ /* Allow a completely zero (empty) page as well. */
if (UNIV_UNLIKELY(crc32 != my_crc32c(0, buf, payload)) && if (UNIV_UNLIKELY(crc32 != my_crc32c(0, buf, payload)) &&
(buf[0] != 0 || 0 != memcmp(buf, buf+1, ibb_page_size - 1))) (buf[0] != 0 || 0 != memcmp(buf, buf+1, ibb_page_size - 1)))
@@ -624,10 +634,46 @@ crc32_pread_page(File fd, byte *buf, uint32_t page_no, myf MyFlags) noexcept
page_no, crc32); page_no, crc32);
} }
} }
else if (read == (size_t)-1)
res= -1;
else
res= 0;
return res; return res;
} }
/*
  Read one page of a binlog tablespace file through an InnoDB file handle,
  and verify the page checksum.

  The page is read from byte offset (page_no << ibb_page_size_shift). The
  CRC32C is stored as 4 bytes at offset (ibb_page_size - BINLOG_PAGE_CHECKSUM)
  and covers all the bytes of the page before that offset. A page where every
  byte is zero is accepted even though its checksum does not match.

  Returns:
    -1  error: os_file_read() failed, or the checksum is wrong on a page that
        is not all zeroes (then my_errno is set to EIO, and an error is
        logged if MY_WME is set in MyFlags)
     0  EOF: fewer than ibb_page_size bytes were read
     1  Ok
*/
int
crc32_pread_page(pfs_os_file_t fh, byte *buf, uint32_t page_no, myf MyFlags)
  noexcept
{
  const uint32_t full_page= (uint32_t)ibb_page_size;
  const os_offset_t file_pos= (os_offset_t)page_no << ibb_page_size_shift;
  ulint got= 0;

  dberr_t rc= os_file_read(IORequestRead, fh, buf, file_pos, full_page, &got);
  if (UNIV_UNLIKELY(rc != DB_SUCCESS))
    return -1;
  if (UNIV_UNLIKELY(got < full_page))
    return 0;

  const uint32_t crc_pos= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM;
  const uint32_t stored_crc= uint4korr(buf + crc_pos);
  if (UNIV_UNLIKELY(stored_crc != my_crc32c(0, buf, crc_pos)))
  {
    /*
      Allow a completely zero (empty) page as well. Comparing the buffer
      against itself shifted by one byte checks that all bytes are equal;
      together with buf[0] == 0 this means that all bytes are zero.
    */
    const bool all_zero=
      buf[0] == 0 && 0 == memcmp(buf, buf+1, ibb_page_size - 1);
    if (!all_zero)
    {
      my_errno= EIO;
      if (MyFlags & MY_WME)
        sql_print_error("InnoDB: Page corruption in binlog tablespace file "
                        "page number %u (invalid crc32 checksum 0x%08X)",
                        page_no, stored_crc);
      return -1;
    }
  }
  return 1;
}
void void
binlog_write_up_to_now() noexcept binlog_write_up_to_now() noexcept
{ {
@@ -654,6 +700,36 @@ binlog_write_up_to_now() noexcept
} }
/*
  Decode the file header page of a binlog tablespace file.

  The header is checked using the CRC32C stored as 4 bytes at offset
  (IBB_HEADER_PAGE_SIZE - BINLOG_PAGE_CHECKSUM), the magic value at offset 0,
  and the major version at offset 8 (which must not be greater than
  IBB_FILE_VERS_MAJOR).

  On return, exactly one of these holds:
    - is_empty is set: the check failed and the first IBB_HEADER_PAGE_SIZE
      bytes of the page are all zero. No other fields are assigned.
    - is_invalid is set: the check failed and the page is not all zero.
      No other fields are assigned.
    - neither is set: the remaining header fields have been filled in from
      the page.
*/
void
fsp_binlog_extract_header_page(const byte *page_buf,
                               binlog_header_data *out_header_data) noexcept
{
  const uint32_t file_magic= uint4korr(page_buf);
  const uint32_t major= uint4korr(page_buf + 8);
  const uint32_t crc_pos= IBB_HEADER_PAGE_SIZE - BINLOG_PAGE_CHECKSUM;
  const uint32_t stored_crc= uint4korr(page_buf + crc_pos);

  const bool usable= stored_crc == my_crc32c(0, page_buf, crc_pos) &&
    file_magic == IBB_MAGIC && major <= IBB_FILE_VERS_MAJOR;
  if (!usable)
  {
    /*
      Distinguish a never-written (all zero) header from a damaged one.
      Comparing the buffer against itself shifted by one byte checks that all
      bytes are equal; with page_buf[0] == 0 this means that all are zero.
    */
    const bool all_zero= page_buf[0] == 0 &&
      0 == memcmp(page_buf, page_buf+1, IBB_HEADER_PAGE_SIZE - 1);
    out_header_data->is_empty= all_zero;
    out_header_data->is_invalid= !all_zero;
    return;
  }

  out_header_data->is_empty= false;
  out_header_data->is_invalid= false;
  out_header_data->page_size_shift= uint4korr(page_buf + 4);
  out_header_data->vers_major= major;
  out_header_data->vers_minor= uint4korr(page_buf + 12);
  out_header_data->file_no= uint8korr(page_buf + 16);
  out_header_data->page_count= uint8korr(page_buf + 24);
  out_header_data->start_lsn= uint8korr(page_buf + 32);
  out_header_data->diff_state_interval= uint8korr(page_buf + 40);
}
void void
fsp_log_binlog_write(mtr_t *mtr, fsp_binlog_page_entry *page, fsp_log_binlog_write(mtr_t *mtr, fsp_binlog_page_entry *page,
uint32_t page_offset, uint32_t len) uint32_t page_offset, uint32_t len)
@@ -678,6 +754,19 @@ fsp_log_binlog_write(mtr_t *mtr, fsp_binlog_page_entry *page,
page_offset + &page->page_buf[0], len); page_offset + &page->page_buf[0], len);
} }
/*
  Redo-log the file header page of a binlog tablespace file.

  The page is marked complete, and the first len bytes of the page buffer are
  passed to mtr->write_binlog() at page offset 0. The first argument to
  write_binlog() is the low bit of the file number.

  The page must be page number 0 (asserted in debug builds).
*/
void
fsp_log_header_page(mtr_t *mtr, fsp_binlog_page_entry *page, uint32_t len)
  noexcept
{
  const uint64_t header_file_no= page->file_no;
  const uint32_t header_page_no= page->page_no;

  ut_ad(header_page_no == 0);
  page->complete= true;
  mtr->write_binlog((header_file_no & 1), header_page_no, 0,
                    &page->page_buf[0], len);
}
/* /*
Initialize the InnoDB implementation of binlog. Initialize the InnoDB implementation of binlog.
Note that we do not create or open any binlog tablespaces here. Note that we do not create or open any binlog tablespaces here.
@@ -849,6 +938,7 @@ fsp_binlog_write_rec(chunk_data_base *chunk_data, mtr_t *mtr, byte chunk_type)
byte cont_flag= 0; byte cont_flag= 0;
for (;;) { for (;;) {
if (page_offset == BINLOG_PAGE_DATA) { if (page_offset == BINLOG_PAGE_DATA) {
ut_ad(!block);
uint32_t file_size_in_pages= binlog_page_fifo->size_in_pages(file_no); uint32_t file_size_in_pages= binlog_page_fifo->size_in_pages(file_no);
if (UNIV_UNLIKELY(page_no >= file_size_in_pages)) { if (UNIV_UNLIKELY(page_no >= file_size_in_pages)) {
/* /*
@@ -859,7 +949,7 @@ fsp_binlog_write_rec(chunk_data_base *chunk_data, mtr_t *mtr, byte chunk_type)
The normal case is that the next tablespace is already pre-allocated The normal case is that the next tablespace is already pre-allocated
and available; binlog tablespace N is active while (N+1) is being and available; binlog tablespace N is active while (N+1) is being
pre-allocated. Only under extreme I/O pressure should be need to pre-allocated. Only under extreme I/O pressure should we need to
stall here. stall here.
*/ */
ut_ad(!pending_prev_end_offset); ut_ad(!pending_prev_end_offset);
@@ -873,14 +963,24 @@ fsp_binlog_write_rec(chunk_data_base *chunk_data, mtr_t *mtr, byte chunk_type)
// ToDo: assert that a single write doesn't span more than two binlog files. // ToDo: assert that a single write doesn't span more than two binlog files.
++file_no; ++file_no;
file_size_in_pages= binlog_page_fifo->size_in_pages(file_no);
binlog_cur_written_offset[file_no & 1].store(0, std::memory_order_relaxed); binlog_cur_written_offset[file_no & 1].store(0, std::memory_order_relaxed);
binlog_cur_end_offset[file_no & 1].store(0, std::memory_order_relaxed); binlog_cur_end_offset[file_no & 1].store(0, std::memory_order_relaxed);
pthread_cond_signal(&active_binlog_cond); pthread_cond_signal(&active_binlog_cond);
mysql_mutex_unlock(&active_binlog_mutex); mysql_mutex_unlock(&active_binlog_mutex);
binlog_cur_page_no= page_no= 0; binlog_cur_page_no= page_no= 0;
/* ToDo: Here we must use the value from the file, if this file was pre-allocated before a server restart where the value of innodb_binlog_state_interval changed. Maybe just make innodb_binlog_state_interval dynamic and make the prealloc thread (and discover code at startup) supply the correct value to use for each file. */
current_binlog_state_interval= current_binlog_state_interval=
(uint32_t)(innodb_binlog_state_interval >> page_size_shift); (uint64_t)(innodb_binlog_state_interval >> page_size_shift);
}
/* Write the header page at the start of a binlog tablespace file. */
if (page_no == 0)
{
lsn_t start_lsn= log_sys.get_lsn(std::memory_order_acquire);
bool err= ibb_write_header_page(mtr, file_no, file_size_in_pages,
start_lsn, current_binlog_state_interval);
ut_a(!err /* ToDo error handling */);
page_no= 1;
} }
/* Must be a power of two. */ /* Must be a power of two. */
@@ -888,14 +988,15 @@ fsp_binlog_write_rec(chunk_data_base *chunk_data, mtr_t *mtr, byte chunk_type)
current_binlog_state_interval == current_binlog_state_interval ==
(uint64_t)1 << (63 - nlz(current_binlog_state_interval))); (uint64_t)1 << (63 - nlz(current_binlog_state_interval)));
if (0 == (page_no & (current_binlog_state_interval - 1))) { if (page_no == 1 ||
if (page_no == 0) { 0 == (page_no & (current_binlog_state_interval - 1))) {
if (page_no == 1) {
rpl_binlog_state_base full_state; rpl_binlog_state_base full_state;
bool err; bool err;
full_state.init(); full_state.init();
err= load_global_binlog_state(&full_state); err= load_global_binlog_state(&full_state);
ut_a(!err /* ToDo error handling */); ut_a(!err /* ToDo error handling */);
if (UNIV_UNLIKELY(file_no == 0 && page_no == 0) && if (UNIV_UNLIKELY(file_no == 0 && page_no == 1) &&
(full_state.count_nolock() == 1)) (full_state.count_nolock() == 1))
{ {
/* /*
@@ -932,14 +1033,14 @@ fsp_binlog_write_rec(chunk_data_base *chunk_data, mtr_t *mtr, byte chunk_type)
} }
} }
err= binlog_gtid_state(&full_state, mtr, block, page_no, err= binlog_gtid_state(&full_state, mtr, block, page_no,
page_offset, file_no, file_size_in_pages); page_offset, file_no);
ut_a(!err /* ToDo error handling */); ut_a(!err /* ToDo error handling */);
ut_ad(block); ut_ad(block);
full_state.free(); full_state.free();
binlog_diff_state.reset_nolock(); binlog_diff_state.reset_nolock();
} else { } else {
bool err= binlog_gtid_state(&binlog_diff_state, mtr, block, page_no, bool err= binlog_gtid_state(&binlog_diff_state, mtr, block, page_no,
page_offset, file_no, file_size_in_pages); page_offset, file_no);
ut_a(!err /* ToDo error handling */); ut_a(!err /* ToDo error handling */);
} }
} else } else
@@ -1256,15 +1357,15 @@ binlog_chunk_reader::fetch_current_page()
cur_file_length= ~(uint64_t)0; cur_file_length= ~(uint64_t)0;
} }
++s.file_no; ++s.file_no;
s.page_no= 0; s.page_no= 1; /* Skip the header page. */
continue; continue;
} }
size_t res= crc32_pread_page(cur_file_handle, page_buffer, s.page_no, int res= crc32_pread_page(cur_file_handle, page_buffer, s.page_no,
MYF(MY_WME)); MYF(MY_WME));
if (res == (size_t)-1) if (res < 0)
return CHUNK_READER_ERROR; return CHUNK_READER_ERROR;
if (res == 0 && my_errno == HA_ERR_FILE_TOO_SHORT) if (res == 0)
goto goto_next_file; goto goto_next_file;
page_ptr= page_buffer; page_ptr= page_buffer;
return CHUNK_READER_FOUND; return CHUNK_READER_FOUND;
@@ -1459,7 +1560,7 @@ go_next_page:
cur_file_handle= (File)-1; cur_file_handle= (File)-1;
cur_file_length= ~(uint64_t)0; cur_file_length= ~(uint64_t)0;
++s.file_no; ++s.file_no;
s.page_no= 0; s.page_no= 1; /* Skip the header page. */
} }
} }
@@ -1470,6 +1571,19 @@ go_next_page:
} }
/*
  Fetch and decode the header page of the file the reader is currently on.

  Calls seek(current_file_no(), 0) and then fetch_current_page(), so the
  reader is left pointing at page 0 of the file.

  Returns:
     0  Ok, *out_header is filled in from the header page
    -1  the page could not be fetched, or the header is invalid or empty
*/
int
binlog_chunk_reader::get_file_header(binlog_header_data *out_header)
{
  seek(current_file_no(), 0);
  if (fetch_current_page() == CHUNK_READER_FOUND)
  {
    fsp_binlog_extract_header_page(page_ptr, out_header);
    if (!out_header->is_invalid && !out_header->is_empty)
      return 0;
  }
  return -1;
}
void void
binlog_chunk_reader::restore_pos(binlog_chunk_reader::saved_position *pos) binlog_chunk_reader::restore_pos(binlog_chunk_reader::saved_position *pos)
{ {

View File

@@ -389,8 +389,7 @@ public:
~gtid_search(); ~gtid_search();
enum Read_Result read_gtid_state_file_no(rpl_binlog_state_base *state, enum Read_Result read_gtid_state_file_no(rpl_binlog_state_base *state,
uint64_t file_no, uint32_t page_no, uint64_t file_no, uint32_t page_no,
uint64_t *out_file_end, uint64_t *out_file_end);
uint32_t *out_diff_state_interval);
int find_gtid_pos(slave_connection_state *pos, int find_gtid_pos(slave_connection_state *pos,
rpl_binlog_state_base *out_state, uint64_t *out_file_no, rpl_binlog_state_base *out_state, uint64_t *out_file_no,
uint64_t *out_offset); uint64_t *out_offset);
@@ -491,7 +490,7 @@ public:
bool close_file() noexcept; bool close_file() noexcept;
bool next_file() noexcept; bool next_file() noexcept;
bool next_page() noexcept; bool next_page() noexcept;
void update_page_from_record(uint16_t offset, bool update_page_from_record(uint16_t offset,
const byte *buf, size_t size) noexcept; const byte *buf, size_t size) noexcept;
}; };
@@ -506,8 +505,8 @@ static int innodb_binlog_discover();
static bool binlog_state_recover(); static bool binlog_state_recover();
static void innodb_binlog_autopurge(uint64_t first_open_file_no); static void innodb_binlog_autopurge(uint64_t first_open_file_no);
static int read_gtid_state_from_page(rpl_binlog_state_base *state, static int read_gtid_state_from_page(rpl_binlog_state_base *state,
const byte *page, uint32_t page_no, const byte *page, uint32_t page_no)
binlog_header_data *out_header_data); noexcept;
/* /*
@@ -526,7 +525,6 @@ binlog_recovery::get_header(uint64_t file_no, lsn_t &out_lsn, bool &out_empty)
noexcept noexcept
{ {
char full_path[OS_FILE_MAX_PATH]; char full_path[OS_FILE_MAX_PATH];
rpl_binlog_state_base dummy_state;
binlog_header_data header; binlog_header_data header;
out_empty= true; out_empty= true;
@@ -547,14 +545,13 @@ binlog_recovery::get_header(uint64_t file_no, lsn_t &out_lsn, bool &out_empty)
it as an empty file. it as an empty file.
*/ */
const uint32_t payload= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM; const uint32_t payload= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM;
uint32_t crc32= mach_read_from_4(page_buf + payload); uint32_t crc32= uint4korr(page_buf + payload);
if (UNIV_UNLIKELY(crc32 != my_crc32c(0, page_buf, payload))) if (UNIV_UNLIKELY(crc32 != my_crc32c(0, page_buf, payload)))
return 0; return 0;
dummy_state.init(); fsp_binlog_extract_header_page(page_buf, &header);
int res= read_gtid_state_from_page(&dummy_state, page_buf, 0, &header); if (header.is_invalid)
if (res <= 0) return 0;
return res;
if (!header.is_empty) if (!header.is_empty)
{ {
out_empty= false; out_empty= false;
@@ -724,8 +721,8 @@ binlog_recovery::init_recovery_from(uint64_t file_no, lsn_t file_lsn,
skipping_early_lsn= false; skipping_early_lsn= false;
if (offset <= BINLOG_PAGE_DATA) if (offset <= BINLOG_PAGE_DATA)
{ {
update_page_from_record(offset, buf, size);
skipping_partial_page= false; skipping_partial_page= false;
return update_page_from_record(offset, buf, size);
} }
} }
return false; return false;
@@ -852,21 +849,23 @@ binlog_recovery::zero_out_cur_file()
return; return;
/* Recover the original size from the current file. */ /* Recover the original size from the current file. */
size_t read= crc32_pread_page(cur_file_fh, page_buf, 0, MYF(0)); int res= crc32_pread_page(cur_file_fh, page_buf, 0, MYF(0));
if (read != (size_t)ibb_page_size) if (res <= 0)
{ {
sql_print_warning("InnoDB: Could not read last binlog file during recovery"); sql_print_warning("InnoDB: Could not read last binlog file during recovery");
return; return;
} }
binlog_header_data header; binlog_header_data header;
rpl_binlog_state_base dummy_state; fsp_binlog_extract_header_page(page_buf, &header);
dummy_state.init();
int res= read_gtid_state_from_page(&dummy_state, page_buf, 0, &header); if (header.is_invalid)
if (res <= 0) {
sql_print_warning("InnoDB: Invalid header page in last binlog file "
"during recovery");
return;
}
if (header.is_empty)
{ {
if (res < 0)
sql_print_warning("InnoDB: Could not read last binlog file during recovery");
else
sql_print_warning("InnoDB: Empty binlog file header found during recovery"); sql_print_warning("InnoDB: Empty binlog file header found during recovery");
ut_ad(0); ut_ad(0);
return; return;
@@ -1040,17 +1039,56 @@ binlog_recovery::apply_redo(bool space_id, uint32_t page_no, uint16_t offset,
if (offset + size >= ibb_page_size) if (offset + size >= ibb_page_size)
return !srv_force_recovery; return !srv_force_recovery;
update_page_from_record(offset, buf, size); return update_page_from_record(offset, buf, size);
return false;
} }
void bool
binlog_recovery::update_page_from_record(uint16_t offset, binlog_recovery::update_page_from_record(uint16_t offset,
const byte *buf, size_t size) noexcept const byte *buf, size_t size) noexcept
{ {
memcpy(page_buf + offset, buf, size); memcpy(page_buf + offset, buf, size);
if (cur_page_no == 0 && offset == 0)
{
binlog_header_data header;
/*
This recovery record is for the file header page.
This record is special, it covers only the used part of the header page.
The remainder of the page must be set to zeroes.
Additionally, there is an extra CRC corresponding to a minimum
page size of IBB_PAGE_SIZE_MIN, in anticipation for future configurable
page size.
*/
memset(page_buf + size, 0, ibb_page_size - (size + BINLOG_PAGE_DATA_END));
cur_page_offset= (uint32_t)ibb_page_size - BINLOG_PAGE_DATA_END;
uint32_t payload= IBB_HEADER_PAGE_SIZE - BINLOG_PAGE_CHECKSUM;
int4store(page_buf + payload, my_crc32c(0, page_buf, payload));
fsp_binlog_extract_header_page(page_buf, &header);
if (header.is_invalid)
{
sql_print_error("InnoDB: Corrupt or invalid file header found during "
"recovery of file number %" PRIu64, cur_file_no);
return !srv_force_recovery;
}
if (header.is_empty)
{
sql_print_error("InnoDB: Empty file header found during "
"recovery of file number %" PRIu64, cur_file_no);
return !srv_force_recovery;
}
if (header.file_no != cur_file_no)
{
sql_print_error("InnoDB: Inconsistency in file header during recovery. "
"The header in file number %" PRIu64 " is for file "
"number %" PRIu64, cur_file_no, header.file_no);
return !srv_force_recovery;
}
return false;
}
cur_page_offset= offset + (uint32_t)size; cur_page_offset= offset + (uint32_t)size;
return false;
} }
@@ -1107,7 +1145,7 @@ innodb_binlog_init_state()
binlog_cur_page_no= 0; binlog_cur_page_no= 0;
binlog_cur_page_offset= BINLOG_PAGE_DATA; binlog_cur_page_offset= BINLOG_PAGE_DATA;
current_binlog_state_interval= current_binlog_state_interval=
(uint32_t)(innodb_binlog_state_interval >> ibb_page_size_shift); (uint64_t)(innodb_binlog_state_interval >> ibb_page_size_shift);
ut_a(innodb_binlog_state_interval == ut_a(innodb_binlog_state_interval ==
(current_binlog_state_interval << ibb_page_size_shift)); (current_binlog_state_interval << ibb_page_size_shift));
} }
@@ -1149,6 +1187,7 @@ binlog_sync_initial()
mtr.commit(); mtr.commit();
log_buffer_flush_to_disk(true); log_buffer_flush_to_disk(true);
binlog_page_fifo->flush_up_to(0, 0); binlog_page_fifo->flush_up_to(0, 0);
binlog_page_fifo->do_fdatasync(0);
} }
@@ -1300,41 +1339,57 @@ binlog_page_empty(const byte *page)
static int static int
find_pos_in_binlog(uint64_t file_no, size_t file_size, byte *page_buf, find_pos_in_binlog(uint64_t file_no, size_t file_size, byte *page_buf,
uint32_t *out_page_no, uint32_t *out_pos_in_page) uint32_t *out_page_no, uint32_t *out_pos_in_page,
uint64_t *out_state_interval)
{ {
const uint32_t page_size= (uint32_t)ibb_page_size; const uint32_t page_size= (uint32_t)ibb_page_size;
const uint32_t page_size_shift= (uint32_t)ibb_page_size_shift; const uint32_t page_size_shift= (uint32_t)ibb_page_size_shift;
const uint32_t idx= file_no & 1; const uint32_t idx= file_no & 1;
char file_name[OS_FILE_MAX_PATH]; char file_name[OS_FILE_MAX_PATH];
uint32_t p_0, p_1, p_2, last_nonempty; uint32_t p_0, p_1, p_2, last_nonempty;
dberr_t err;
byte *p, *page_end; byte *p, *page_end;
bool ret; bool ret;
binlog_header_data header_data;
*out_page_no= 0; *out_page_no= 0;
*out_pos_in_page= BINLOG_PAGE_DATA; *out_pos_in_page= BINLOG_PAGE_DATA;
*out_state_interval= 0;
binlog_name_make(file_name, file_no); binlog_name_make(file_name, file_no);
pfs_os_file_t fh= os_file_create(innodb_data_file_key, file_name, pfs_os_file_t fh= os_file_create(innodb_data_file_key, file_name,
OS_FILE_OPEN, OS_DATA_FILE, OS_FILE_OPEN, OS_DATA_FILE,
srv_read_only_mode, &ret); srv_read_only_mode, &ret);
if (!ret) { if (!ret) {
sql_print_warning("Unable to open file '%s'", file_name); sql_print_warning("InnoDB: Unable to open file '%s'", file_name);
return -1; return -1;
} }
err= os_file_read(IORequestRead, fh, page_buf, 0, page_size, nullptr); int res= crc32_pread_page(fh, page_buf, 0, MYF(MY_WME));
if (err != DB_SUCCESS) { if (res <= 0) {
os_file_close(fh); os_file_close(fh);
return -1; return -1;
} }
if (binlog_page_empty(page_buf)) { fsp_binlog_extract_header_page(page_buf, &header_data);
if (header_data.is_invalid)
{
sql_print_error("InnoDB: Invalid or corrupt file header in file "
"'%s'", file_name);
return -1;
}
if (header_data.is_empty) {
ret= ret=
fsp_binlog_open(file_name, fh, file_no, file_size, ~(uint32_t)0, nullptr); fsp_binlog_open(file_name, fh, file_no, file_size, ~(uint32_t)0, nullptr);
binlog_cur_written_offset[idx].store(0, std::memory_order_relaxed); binlog_cur_written_offset[idx].store(0, std::memory_order_relaxed);
binlog_cur_end_offset[idx].store(0, std::memory_order_relaxed); binlog_cur_end_offset[idx].store(0, std::memory_order_relaxed);
return (ret ? -1 : 0); return (ret ? -1 : 0);
} }
if (header_data.file_no != file_no)
{
sql_print_error("InnoDB: Inconsistent file header in file '%s', "
"wrong file_no %" PRIu64, file_name, header_data.file_no);
return -1;
}
*out_state_interval= header_data.diff_state_interval;
last_nonempty= 0; last_nonempty= 0;
/* /*
@@ -1348,9 +1403,8 @@ find_pos_in_binlog(uint64_t file_no, size_t file_size, byte *page_buf,
break; break;
ut_ad(p_0 < p_2); ut_ad(p_0 < p_2);
p_1= (p_0 + p_2) / 2; p_1= (p_0 + p_2) / 2;
err= os_file_read(IORequestRead, fh, page_buf, p_1 << page_size_shift, res= crc32_pread_page(fh, page_buf, p_1, MYF(MY_WME));
page_size, nullptr); if (res <= 0) {
if (err != DB_SUCCESS) {
os_file_close(fh); os_file_close(fh);
return -1; return -1;
} }
@@ -1368,9 +1422,8 @@ find_pos_in_binlog(uint64_t file_no, size_t file_size, byte *page_buf,
This sometimes does an extra read, but as this is only during startup it This sometimes does an extra read, but as this is only during startup it
does not matter. does not matter.
*/ */
err= os_file_read(IORequestRead, fh, page_buf, res= crc32_pread_page(fh, page_buf, last_nonempty, MYF(MY_WME));
last_nonempty << page_size_shift, page_size, nullptr); if (res <= 0) {
if (err != DB_SUCCESS) {
os_file_close(fh); os_file_close(fh);
return -1; return -1;
} }
@@ -1416,6 +1469,7 @@ innodb_binlog_discover()
const uint32_t page_size= (uint32_t)ibb_page_size; const uint32_t page_size= (uint32_t)ibb_page_size;
const uint32_t page_size_shift= (uint32_t)ibb_page_size_shift; const uint32_t page_size_shift= (uint32_t)ibb_page_size_shift;
struct found_binlogs UNINIT_VAR(binlog_files); struct found_binlogs UNINIT_VAR(binlog_files);
uint64_t diff_state_interval;
int res= scan_for_binlogs(innodb_binlog_directory, &binlog_files, false); int res= scan_for_binlogs(innodb_binlog_directory, &binlog_files, false);
if (res <= 0) if (res <= 0)
@@ -1437,10 +1491,12 @@ innodb_binlog_discover()
res= find_pos_in_binlog(binlog_files.last_file_no, res= find_pos_in_binlog(binlog_files.last_file_no,
binlog_files.last_size, binlog_files.last_size,
page_buf.get(), &page_no, &pos_in_page); page_buf.get(), &page_no, &pos_in_page,
&diff_state_interval);
if (res < 0) { if (res < 0) {
file_no= binlog_files.last_file_no; file_no= binlog_files.last_file_no;
active_binlog_file_no.store(file_no, std::memory_order_release); active_binlog_file_no.store(file_no, std::memory_order_release);
current_binlog_state_interval= innodb_binlog_state_interval;
sql_print_warning("Binlog number %llu could no be opened. Starting a new " sql_print_warning("Binlog number %llu could no be opened. Starting a new "
"binlog file from number %llu", "binlog file from number %llu",
binlog_files.last_file_no, (file_no + 1)); binlog_files.last_file_no, (file_no + 1));
@@ -1451,6 +1507,7 @@ innodb_binlog_discover()
/* Found start position in the last binlog file. */ /* Found start position in the last binlog file. */
file_no= binlog_files.last_file_no; file_no= binlog_files.last_file_no;
active_binlog_file_no.store(file_no, std::memory_order_release); active_binlog_file_no.store(file_no, std::memory_order_release);
current_binlog_state_interval= diff_state_interval;
binlog_cur_page_no= page_no; binlog_cur_page_no= page_no;
binlog_cur_page_offset= pos_in_page; binlog_cur_page_offset= pos_in_page;
ib::info() << "Continuing binlog number " << file_no << " from position " ib::info() << "Continuing binlog number " << file_no << " from position "
@@ -1465,10 +1522,12 @@ innodb_binlog_discover()
res= find_pos_in_binlog(binlog_files.prev_file_no, res= find_pos_in_binlog(binlog_files.prev_file_no,
binlog_files.prev_size, binlog_files.prev_size,
page_buf.get(), page_buf.get(),
&prev_page_no, &prev_pos_in_page); &prev_page_no, &prev_pos_in_page,
&diff_state_interval);
if (res < 0) { if (res < 0) {
file_no= binlog_files.last_file_no; file_no= binlog_files.last_file_no;
active_binlog_file_no.store(file_no, std::memory_order_release); active_binlog_file_no.store(file_no, std::memory_order_release);
current_binlog_state_interval= innodb_binlog_state_interval;
binlog_cur_page_no= page_no; binlog_cur_page_no= page_no;
binlog_cur_page_offset= pos_in_page; binlog_cur_page_offset= pos_in_page;
sql_print_warning("Binlog number %llu could not be opened, starting " sql_print_warning("Binlog number %llu could not be opened, starting "
@@ -1478,6 +1537,7 @@ innodb_binlog_discover()
} }
file_no= binlog_files.prev_file_no; file_no= binlog_files.prev_file_no;
active_binlog_file_no.store(file_no, std::memory_order_release); active_binlog_file_no.store(file_no, std::memory_order_release);
current_binlog_state_interval= diff_state_interval;
binlog_cur_page_no= prev_page_no; binlog_cur_page_no= prev_page_no;
binlog_cur_page_offset= prev_pos_in_page; binlog_cur_page_offset= prev_pos_in_page;
ib::info() << "Continuing binlog number " << file_no << " from position " ib::info() << "Continuing binlog number " << file_no << " from position "
@@ -1490,6 +1550,7 @@ innodb_binlog_discover()
/* Just one empty binlog file found. */ /* Just one empty binlog file found. */
file_no= binlog_files.last_file_no; file_no= binlog_files.last_file_no;
active_binlog_file_no.store(file_no, std::memory_order_release); active_binlog_file_no.store(file_no, std::memory_order_release);
current_binlog_state_interval= innodb_binlog_state_interval;
binlog_cur_page_no= page_no; binlog_cur_page_no= page_no;
binlog_cur_page_offset= pos_in_page; binlog_cur_page_offset= pos_in_page;
ib::info() << "Continuing binlog number " << file_no << " from position " ib::info() << "Continuing binlog number " << file_no << " from position "
@@ -1501,6 +1562,7 @@ innodb_binlog_discover()
file_no= 0; file_no= 0;
earliest_binlog_file_no= 0; earliest_binlog_file_no= 0;
total_binlog_used_size= 0; total_binlog_used_size= 0;
current_binlog_state_interval= innodb_binlog_state_interval;
ib::info() << "Starting a new binlog from file number " << file_no << "."; ib::info() << "Starting a new binlog from file number " << file_no << ".";
return 0; return 0;
} }
@@ -1532,8 +1594,8 @@ void innodb_binlog_close(bool shutdown)
if (shutdown && innodb_binlog_inited >= 1) if (shutdown && innodb_binlog_inited >= 1)
{ {
binlog_diff_state.free(); binlog_diff_state.free();
mysql_mutex_destroy(&purge_binlog_mutex);
fsp_binlog_shutdown(); fsp_binlog_shutdown();
mysql_mutex_destroy(&purge_binlog_mutex);
} }
} }
@@ -1624,38 +1686,56 @@ innodb_binlog_prealloc_thread()
} }
/*
  Create and fill in the file header page (page 0) of binlog file FILE_NO.

  The header consists of fixed-width fields stored with int4store() /
  int8store(), in this order:

    offset  0  (4 bytes)  IBB_MAGIC
    offset  4  (4 bytes)  ibb_page_size_shift
    offset  8  (4 bytes)  IBB_FILE_VERS_MAJOR
    offset 12  (4 bytes)  IBB_FILE_VERS_MINOR
    offset 16  (8 bytes)  file_no
    offset 24  (8 bytes)  file_size_in_pages
    offset 32  (8 bytes)  start_lsn
    offset 40  (8 bytes)  gtid_state_interval_in_pages

  The rest of the page, except for the last BINLOG_PAGE_CHECKSUM bytes, is
  zeroed. A CRC32C over the first IBB_HEADER_PAGE_SIZE - BINLOG_PAGE_CHECKSUM
  bytes is then stored right after those bytes.

  The page is logged with fsp_log_header_page() and released to the
  mini-transaction with release_page_mtr().

  Returns false (no error). Failure to create the page is currently caught
  by the ut_a() assertion rather than being reported to the caller.
*/
bool
ibb_write_header_page(mtr_t *mtr, uint64_t file_no, uint64_t file_size_in_pages,
                      lsn_t start_lsn, uint64_t gtid_state_interval_in_pages)
{
  fsp_binlog_page_entry *hdr_block= binlog_page_fifo->create_page(file_no, 0);
  /* ToDo: error handling? */
  ut_a(hdr_block);

  byte *const hdr= &hdr_block->page_buf[0];
  byte *cur= hdr;

  /* File identification and format. */
  int4store(cur, IBB_MAGIC);
  cur+= 4;
  int4store(cur, ibb_page_size_shift);
  cur+= 4;
  int4store(cur, IBB_FILE_VERS_MAJOR);
  cur+= 4;
  int4store(cur, IBB_FILE_VERS_MINOR);
  cur+= 4;

  /* Per-file values. */
  int8store(cur, file_no);
  cur+= 8;
  int8store(cur, file_size_in_pages);
  cur+= 8;
  int8store(cur, start_lsn);
  cur+= 8;
  int8store(cur, gtid_state_interval_in_pages);
  cur+= 8;

  /* Length of the populated header fields (48 bytes). */
  const uint32_t header_len= (uint32_t)(cur - hdr);

  ut_ad(ibb_page_size >= IBB_HEADER_PAGE_SIZE);
  /* Clear everything after the fields, leaving the page's last 4 bytes. */
  memset(cur, 0, ibb_page_size - (header_len + BINLOG_PAGE_CHECKSUM));

  /*
    For future expansion with configurable page size:
    The CRC32 goes at the end of the minimal page size, not at the end of the
    actual page. That lets a reader checksum the header page before it knows
    the page size of the file, and then take the real page size from the
    header.
  */
  const uint32_t crc_offset= IBB_HEADER_PAGE_SIZE - BINLOG_PAGE_CHECKSUM;
  int4store(hdr + crc_offset, my_crc32c(0, hdr, crc_offset));

  fsp_log_header_page(mtr, hdr_block, header_len);
  binlog_page_fifo->release_page_mtr(hdr_block, mtr);
  return false;  // No error
}
__attribute__((noinline)) __attribute__((noinline))
static ssize_t static ssize_t
serialize_gtid_state(rpl_binlog_state_base *state, byte *buf, size_t buf_size, serialize_gtid_state(rpl_binlog_state_base *state, byte *buf, size_t buf_size)
uint32_t file_size_in_pages, uint64_t file_no, noexcept
bool is_first_page)
{ {
unsigned char *p= (unsigned char *)buf; unsigned char *p= (unsigned char *)buf;
/* /*
1 uint64_t for the current LSN at start of binlog file.
1 uint64_t for the file_no.
1 uint32_t for the file size in pages.
1 uint32_t for the innodb_binlog_state_interval in pages.
1 uint64_t for the number of entries in the state stored. 1 uint64_t for the number of entries in the state stored.
2 uint32_t + 1 uint64_t for at least one GTID. 2 uint32_t + 1 uint64_t for at least one GTID.
*/ */
ut_ad(buf_size >= 4*COMPR_INT_MAX32 + 4*COMPR_INT_MAX64); ut_ad(buf_size >= 2*COMPR_INT_MAX32 + 2*COMPR_INT_MAX64);
if (is_first_page) {
/*
In the first page where we put the full state, include the value of the
setting for the interval at which differential states are binlogged, so
we know how to search them independent of how the setting changes.
We also include the current LSN for recovery purposes; and the file
length and file_no, which is also useful if we have to recover the whole
file from the redo log after a crash.
*/
p= compr_int_write(p, log_sys.get_lsn(std::memory_order_acquire));
p= compr_int_write(p, file_no);
p= compr_int_write(p, file_size_in_pages);
/* ToDo: Check that this current_binlog_state_interval is the correct value! */
p= compr_int_write(p, current_binlog_state_interval);
}
p= compr_int_write(p, state->count_nolock()); p= compr_int_write(p, state->count_nolock());
unsigned char * const pmax= unsigned char * const pmax=
p + (buf_size - (2*COMPR_INT_MAX32 + COMPR_INT_MAX64)); p + (buf_size - (2*COMPR_INT_MAX32 + COMPR_INT_MAX64));
@@ -1678,8 +1758,7 @@ serialize_gtid_state(rpl_binlog_state_base *state, byte *buf, size_t buf_size,
bool bool
binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr, binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr,
fsp_binlog_page_entry * &block, uint32_t &page_no, fsp_binlog_page_entry * &block, uint32_t &page_no,
uint32_t &page_offset, uint64_t file_no, uint32_t &page_offset, uint64_t file_no)
uint32_t file_size_in_pages)
{ {
/* /*
Use a small, efficient stack-allocated buffer by default, falling back to Use a small, efficient stack-allocated buffer by default, falling back to
@@ -1690,9 +1769,7 @@ binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr,
uint32_t block_page_no= ~(uint32_t)0; uint32_t block_page_no= ~(uint32_t)0;
block= nullptr; block= nullptr;
ssize_t used_bytes= serialize_gtid_state(state, small_buf, sizeof(small_buf), ssize_t used_bytes= serialize_gtid_state(state, small_buf, sizeof(small_buf));
file_size_in_pages, file_no,
page_no==0);
if (used_bytes >= 0) if (used_bytes >= 0)
{ {
buf= small_buf; buf= small_buf;
@@ -1706,8 +1783,7 @@ binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr,
if (UNIV_UNLIKELY(!alloced_buf)) if (UNIV_UNLIKELY(!alloced_buf))
return true; return true;
buf= alloced_buf; buf= alloced_buf;
used_bytes= serialize_gtid_state(state, buf, buf_size, file_size_in_pages, used_bytes= serialize_gtid_state(state, buf, buf_size);
file_no, page_no==0);
if (UNIV_UNLIKELY(used_bytes < 0)) if (UNIV_UNLIKELY(used_bytes < 0))
{ {
ut_ad(0 /* Shouldn't happen, as we allocated maximum needed size. */); ut_ad(0 /* Shouldn't happen, as we allocated maximum needed size. */);
@@ -1722,6 +1798,8 @@ binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr,
/* For now, GTID state always at the start of a page. */ /* For now, GTID state always at the start of a page. */
ut_ad(page_offset == BINLOG_PAGE_DATA); ut_ad(page_offset == BINLOG_PAGE_DATA);
/* Page 0 is reserved for the header page. */
ut_ad(page_no != 0);
/* /*
Only write the GTID state record if there is room for actual event data Only write the GTID state record if there is room for actual event data
@@ -1792,16 +1870,12 @@ binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr,
*/ */
static int static int
read_gtid_state_from_page(rpl_binlog_state_base *state, const byte *page, read_gtid_state_from_page(rpl_binlog_state_base *state, const byte *page,
uint32_t page_no, binlog_header_data *out_header_data) uint32_t page_no) noexcept
{ {
const byte *p= page + BINLOG_PAGE_DATA; const byte *p= page + BINLOG_PAGE_DATA;
byte t= *p; byte t= *p;
if (UNIV_UNLIKELY((t & FSP_BINLOG_TYPE_MASK) != FSP_BINLOG_TYPE_GTID_STATE)) if (UNIV_UNLIKELY((t & FSP_BINLOG_TYPE_MASK) != FSP_BINLOG_TYPE_GTID_STATE))
{
out_header_data->is_empty= binlog_page_empty(page);
return 0; return 0;
}
out_header_data->is_empty= false;
/* ToDo: Handle reading a state that spans multiple pages. For now, we assume the state fits in a single page. */ /* ToDo: Handle reading a state that spans multiple pages. For now, we assume the state fits in a single page. */
ut_a(t & FSP_BINLOG_FLAG_LAST); ut_a(t & FSP_BINLOG_FLAG_LAST);
@@ -1811,42 +1885,6 @@ read_gtid_state_from_page(rpl_binlog_state_base *state, const byte *page,
return -1; return -1;
std::pair<uint64_t, const unsigned char *> v_and_p= compr_int_read(p + 3); std::pair<uint64_t, const unsigned char *> v_and_p= compr_int_read(p + 3);
p= v_and_p.second; p= v_and_p.second;
if (page_no == 0)
{
/*
The state in the first page has four extra words: The start LSN of the
file; the file_no of the file; the file length, in pages; and the offset
between differential binlog states logged regularly in the binlog
tablespace.
*/
if (UNIV_UNLIKELY(p >= p_end))
return -1;
out_header_data->start_lsn= (uint32_t)v_and_p.first;
v_and_p= compr_int_read(p);
p= v_and_p.second;
if (UNIV_UNLIKELY(p >= p_end))
return -1;
out_header_data->file_no= v_and_p.first;
v_and_p= compr_int_read(p);
p= v_and_p.second;
if (UNIV_UNLIKELY(p >= p_end) || UNIV_UNLIKELY(v_and_p.first >= UINT32_MAX))
return -1;
out_header_data->page_count= (uint32_t)v_and_p.first;
v_and_p= compr_int_read(p);
p= v_and_p.second;
if (UNIV_UNLIKELY(p >= p_end) || UNIV_UNLIKELY(v_and_p.first >= UINT32_MAX))
return -1;
out_header_data->diff_state_interval= (uint32_t)v_and_p.first;
v_and_p= compr_int_read(p);
p= v_and_p.second;
}
else
{
out_header_data->start_lsn= 0;
out_header_data->file_no= ~(uint64_t)0;
out_header_data->page_count= 0;
out_header_data->diff_state_interval= 0;
}
if (UNIV_UNLIKELY(p > p_end)) if (UNIV_UNLIKELY(p > p_end))
return -1; return -1;
@@ -1899,8 +1937,7 @@ read_gtid_state_from_page(rpl_binlog_state_base *state, const byte *page,
-1 Error -1 Error
*/ */
static int static int
read_gtid_state(rpl_binlog_state_base *state, File file, uint32_t page_no, read_gtid_state(rpl_binlog_state_base *state, File file, uint32_t page_no)
binlog_header_data *out_header_data)
{ {
std::unique_ptr<byte [], void (*)(void *)> page_buf std::unique_ptr<byte [], void (*)(void *)> page_buf
((byte *)my_malloc(PSI_NOT_INSTRUMENTED, ibb_page_size, MYF(MY_WME)), ((byte *)my_malloc(PSI_NOT_INSTRUMENTED, ibb_page_size, MYF(MY_WME)),
@@ -1909,12 +1946,11 @@ read_gtid_state(rpl_binlog_state_base *state, File file, uint32_t page_no,
return -1; return -1;
/* ToDo: Handle encryption. */ /* ToDo: Handle encryption. */
size_t res= crc32_pread_page(file, page_buf.get(), page_no, MYF(MY_WME)); int res= crc32_pread_page(file, page_buf.get(), page_no, MYF(MY_WME));
if (UNIV_UNLIKELY(res == (size_t)-1)) if (UNIV_UNLIKELY(res <= 0))
return -1; return -1;
return read_gtid_state_from_page(state, page_buf.get(), page_no, return read_gtid_state_from_page(state, page_buf.get(), page_no);
out_header_data);
} }
@@ -1929,40 +1965,36 @@ read_gtid_state(rpl_binlog_state_base *state, File file, uint32_t page_no,
static bool static bool
binlog_state_recover() binlog_state_recover()
{ {
binlog_header_data header_data;
rpl_binlog_state_base state; rpl_binlog_state_base state;
state.init(); state.init();
uint32_t diff_state_interval= 0; uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed);
uint32_t page_no= 0; uint64_t diff_state_interval= current_binlog_state_interval;
uint32_t page_no= 1;
char filename[OS_FILE_MAX_PATH]; char filename[OS_FILE_MAX_PATH];
binlog_name_make(filename, binlog_name_make(filename, active);
active_binlog_file_no.load(std::memory_order_relaxed));
File file= my_open(filename, O_RDONLY | O_BINARY, MYF(MY_WME)); File file= my_open(filename, O_RDONLY | O_BINARY, MYF(MY_WME));
if (UNIV_UNLIKELY(file < (File)0)) if (UNIV_UNLIKELY(file < (File)0))
return true; return true;
int res= read_gtid_state(&state, file, page_no, &header_data); int res= read_gtid_state(&state, file, page_no);
if (res < 0) if (res < 0)
{ {
my_close(file, MYF(0)); my_close(file, MYF(0));
return true; return true;
} }
diff_state_interval= header_data.diff_state_interval;
if (diff_state_interval == 0) if (diff_state_interval == 0)
{ {
sql_print_warning("Invalid differential binlog state interval %llu found " sql_print_warning("Invalid differential binlog state interval %llu found "
"in binlog file, ignoring", diff_state_interval); "in binlog file, ignoring", diff_state_interval);
current_binlog_state_interval= 0; /* Disable in this binlog file */
} }
else else
{ {
current_binlog_state_interval= diff_state_interval;
page_no= (uint32_t)(binlog_cur_page_no - page_no= (uint32_t)(binlog_cur_page_no -
(binlog_cur_page_no % diff_state_interval)); (binlog_cur_page_no % diff_state_interval));
while (page_no > 0) while (page_no > 1)
{ {
res= read_gtid_state(&state, file, page_no, &header_data); res= read_gtid_state(&state, file, page_no);
if (res > 0) if (res > 0)
break; break;
page_no-= (uint32_t)diff_state_interval; page_no-= (uint32_t)diff_state_interval;
@@ -1970,9 +2002,7 @@ binlog_state_recover()
} }
my_close(file, MYF(0)); my_close(file, MYF(0));
ha_innodb_binlog_reader reader(active_binlog_file_no.load ha_innodb_binlog_reader reader(active, page_no << ibb_page_size_shift);
(std::memory_order_relaxed),
page_no << ibb_page_size_shift);
return binlog_recover_gtid_state(&state, &reader); return binlog_recover_gtid_state(&state, &reader);
} }
@@ -2410,6 +2440,8 @@ ha_innodb_binlog_reader::ha_innodb_binlog_reader(uint64_t file_no,
{ {
page_buf= (uchar *)ut_malloc(ibb_page_size, mem_key_binlog); page_buf= (uchar *)ut_malloc(ibb_page_size, mem_key_binlog);
chunk_rd.set_page_buf(page_buf); chunk_rd.set_page_buf(page_buf);
if (offset < ibb_page_size)
offset= ibb_page_size;
chunk_rd.seek(file_no, offset); chunk_rd.seek(file_no, offset);
chunk_rd.skip_partial(true); chunk_rd.skip_partial(true);
} }
@@ -2617,10 +2649,8 @@ gtid_search::~gtid_search()
enum gtid_search::Read_Result enum gtid_search::Read_Result
gtid_search::read_gtid_state_file_no(rpl_binlog_state_base *state, gtid_search::read_gtid_state_file_no(rpl_binlog_state_base *state,
uint64_t file_no, uint32_t page_no, uint64_t file_no, uint32_t page_no,
uint64_t *out_file_end, uint64_t *out_file_end)
uint32_t *out_diff_state_interval)
{ {
binlog_header_data header_data;
*out_file_end= 0; *out_file_end= 0;
uint64_t active2= active_binlog_file_no.load(std::memory_order_acquire); uint64_t active2= active_binlog_file_no.load(std::memory_order_acquire);
if (file_no > active2) if (file_no > active2)
@@ -2675,9 +2705,7 @@ gtid_search::read_gtid_state_file_no(rpl_binlog_state_base *state,
if (block) if (block)
{ {
ut_ad(end_offset != ~(uint64_t)0); ut_ad(end_offset != ~(uint64_t)0);
int res= read_gtid_state_from_page(state, block->page_buf, page_no, int res= read_gtid_state_from_page(state, block->page_buf, page_no);
&header_data);
*out_diff_state_interval= header_data.diff_state_interval;
binlog_page_fifo->release_page(block); binlog_page_fifo->release_page(block);
return (Read_Result)res; return (Read_Result)res;
} }
@@ -2716,8 +2744,7 @@ gtid_search::read_gtid_state_file_no(rpl_binlog_state_base *state,
} }
if (!*out_file_end) if (!*out_file_end)
*out_file_end= cur_open_file_length; *out_file_end= cur_open_file_length;
int res= read_gtid_state(state, cur_open_file, page_no, &header_data); int res= read_gtid_state(state, cur_open_file, page_no);
*out_diff_state_interval= header_data.diff_state_interval;
return (Read_Result)res; return (Read_Result)res;
} }
} }
@@ -2735,7 +2762,6 @@ gtid_search::read_gtid_state_file_no(rpl_binlog_state_base *state,
0 Position not found (has been purged) 0 Position not found (has been purged)
1 Position found 1 Position found
*/ */
int int
gtid_search::find_gtid_pos(slave_connection_state *pos, gtid_search::find_gtid_pos(slave_connection_state *pos,
rpl_binlog_state_base *out_state, rpl_binlog_state_base *out_state,
@@ -2750,14 +2776,39 @@ gtid_search::find_gtid_pos(slave_connection_state *pos,
/* First search backwards for the right file to start from. */ /* First search backwards for the right file to start from. */
uint64_t file_end= 0; uint64_t file_end= 0;
uint32_t diff_state_page_interval= 0; uint64_t diff_state_page_interval= 0;
rpl_binlog_state_base base_state, page0_diff_state, tmp_diff_state; rpl_binlog_state_base base_state, page0_diff_state, tmp_diff_state;
base_state.init(); base_state.init();
for (;;) for (;;)
{ {
/*
Read the header page, needed to get the binlog diff state interval.
ToDo: Here we instantiate our own binlog_chunk_reader specifically for
this. Later, when read_gtid_state_file_no() is fixed to also use a
binlog_chunk_reader, integrate and use the same single
binlog_chunk_reader object.
*/
binlog_header_data header;
int err;
byte *page_buffer= (byte *)ut_malloc(ibb_page_size, mem_key_binlog);
if (!page_buffer)
{
my_error(ER_OUTOFMEMORY, MYF(0), ibb_page_size);
return -1;
}
{
binlog_chunk_reader chunk_reader;
chunk_reader.set_page_buf(page_buffer);
chunk_reader.seek(file_no, 0);
err= chunk_reader.get_file_header(&header);
diff_state_page_interval= header.diff_state_interval;
}
ut_free(page_buffer);
if (err)
return -1;
enum Read_Result res= enum Read_Result res=
read_gtid_state_file_no(&base_state, file_no, 0, &file_end, read_gtid_state_file_no(&base_state, file_no, 1, &file_end);
&diff_state_page_interval);
if (res == READ_ENOENT) if (res == READ_ENOENT)
return 0; return 0;
if (res == READ_ERROR) if (res == READ_ERROR)
@@ -2769,7 +2820,7 @@ gtid_search::find_gtid_pos(slave_connection_state *pos,
/* Handle the special case of a completely empty binlog file. */ /* Handle the special case of a completely empty binlog file. */
out_state->reset_nolock(); out_state->reset_nolock();
*out_file_no= file_no; *out_file_no= file_no;
*out_offset= 0; *out_offset= ibb_page_size;
return 1; return 1;
} }
ut_ad(0 /* Not expected to find no state, should always be written. */); ut_ad(0 /* Not expected to find no state, should always be written. */);
@@ -2794,20 +2845,18 @@ gtid_search::find_gtid_pos(slave_connection_state *pos,
uint32_t page2= (uint32_t) uint32_t page2= (uint32_t)
(diff_state_page_interval + ((file_end - 1) >> ibb_page_size_shift)); (diff_state_page_interval + ((file_end - 1) >> ibb_page_size_shift));
/* Round to the next diff_state_page_interval after file_end. */ /* Round to the next diff_state_page_interval after file_end. */
page2-= page2 % diff_state_page_interval; page2-= page2 % (uint32_t)diff_state_page_interval;
uint32_t page1= (page0 + page2) / 2; uint32_t page1= (page0 + page2) / 2;
page0_diff_state.init(); page0_diff_state.init();
page0_diff_state.load_nolock(&base_state); page0_diff_state.load_nolock(&base_state);
tmp_diff_state.init(); tmp_diff_state.init();
while (page1 >= page0 + diff_state_page_interval) while (page1 >= page0 + diff_state_page_interval && page1 > 1)
{ {
ut_ad((page1 - page0) % diff_state_page_interval == 0); ut_ad((page1 - page0) % diff_state_page_interval == 0);
tmp_diff_state.reset_nolock(); tmp_diff_state.reset_nolock();
tmp_diff_state.load_nolock(&base_state); tmp_diff_state.load_nolock(&base_state);
uint32_t dummy;
enum Read_Result res= enum Read_Result res=
read_gtid_state_file_no(&tmp_diff_state, file_no, page1, &file_end, read_gtid_state_file_no(&tmp_diff_state, file_no, page1, &file_end);
&dummy);
if (res == READ_ENOENT) if (res == READ_ENOENT)
return 0; /* File purged while we are reading from it? */ return 0; /* File purged while we are reading from it? */
if (res == READ_ERROR) if (res == READ_ERROR)
@@ -2819,7 +2868,7 @@ gtid_search::find_gtid_pos(slave_connection_state *pos,
try the one just before. It will be safe, even if not always optimal, try the one just before. It will be safe, even if not always optimal,
and this is an abnormal situation anyway. and this is an abnormal situation anyway.
*/ */
page1= page1 - diff_state_page_interval; page1= page1 - (uint32_t)diff_state_page_interval;
continue; continue;
} }
if (tmp_diff_state.is_before_pos(pos)) if (tmp_diff_state.is_before_pos(pos))
@@ -2835,6 +2884,8 @@ gtid_search::find_gtid_pos(slave_connection_state *pos,
ut_ad(page1 >= page0); ut_ad(page1 >= page0);
out_state->load_nolock(&page0_diff_state); out_state->load_nolock(&page0_diff_state);
*out_file_no= file_no; *out_file_no= file_no;
if (page0 == 0)
page0= 1; /* Skip the initial file header page. */
*out_offset= (uint64_t)page0 << ibb_page_size_shift; *out_offset= (uint64_t)page0 << ibb_page_size_shift;
return 1; return 1;
} }
@@ -2888,6 +2939,8 @@ ha_innodb_binlog_reader::init_legacy_pos(const char *filename, ulonglong offset)
reached. This way we avoid reading garbage data for invalid request reached. This way we avoid reading garbage data for invalid request
offset. offset.
*/ */
if (offset < ibb_page_size)
offset= ibb_page_size;
chunk_rd.seek(file_no, (uint64_t)offset); chunk_rd.seek(file_no, (uint64_t)offset);
chunk_rd.skip_partial(true); chunk_rd.skip_partial(true);
cur_file_no= chunk_rd.current_file_no(); cur_file_no= chunk_rd.current_file_no();
@@ -2980,14 +3033,12 @@ innodb_binlog_get_init_state(rpl_binlog_state_base *out_state)
{ {
gtid_search search_obj; gtid_search search_obj;
uint64_t dummy_file_end; uint64_t dummy_file_end;
uint32_t dummy_diff_state_interval;
bool err= false; bool err= false;
mysql_mutex_lock(&purge_binlog_mutex); mysql_mutex_lock(&purge_binlog_mutex);
uint64_t file_no= earliest_binlog_file_no; uint64_t file_no= earliest_binlog_file_no;
enum gtid_search::Read_Result res= enum gtid_search::Read_Result res=
search_obj.read_gtid_state_file_no(out_state, file_no, 0, &dummy_file_end, search_obj.read_gtid_state_file_no(out_state, file_no, 1, &dummy_file_end);
&dummy_diff_state_interval);
mysql_mutex_unlock(&purge_binlog_mutex); mysql_mutex_unlock(&purge_binlog_mutex);
if (res != gtid_search::READ_FOUND) if (res != gtid_search::READ_FOUND)
err= true; err= true;

View File

@@ -32,7 +32,27 @@ InnoDB implementation of binlog.
struct chunk_data_base; struct chunk_data_base;
struct binlog_header_data;
/* 4-byte "magic" identifying InnoDB binlog file (little endian). */
static constexpr uint32_t IBB_MAGIC= 0x010dfefe;
/*
  File format version numbers, stored in the header page after the magic and
  the page size shift.
*/
static constexpr uint32_t IBB_FILE_VERS_MAJOR= 0;
static constexpr uint32_t IBB_FILE_VERS_MINOR= 0;
/*
The size of the header page that is stored in the first page of a file.
This is the smallest page size that can be used in a backwards compatible
way. Having a fixed-size small header page means we can get the real page
size of the file from the header page, but still be able to checksum the
header page without relying on unchecked page size field to compute the
checksum.
(The remainder of the header page is just unused or could potentially
later be used for other data as needed).
*/
static constexpr uint32_t IBB_HEADER_PAGE_SIZE= 512;
/* Smallest page size: a page must be able to hold the whole header page. */
static constexpr uint32_t IBB_PAGE_SIZE_MIN= IBB_HEADER_PAGE_SIZE;
/* Upper bound on the page size (presumably in bytes, like the minimum). */
static constexpr uint32_t IBB_PAGE_SIZE_MAX= 65536;
/** Store crc32 checksum at the end of the page */ /** Store crc32 checksum at the end of the page */
#define BINLOG_PAGE_CHECKSUM 4 #define BINLOG_PAGE_CHECKSUM 4
@@ -266,6 +286,8 @@ public:
of the current binlog (ie. end-of-file). of the current binlog (ie. end-of-file).
*/ */
int read_data(byte *buffer, int max_len, bool multipage); int read_data(byte *buffer, int max_len, bool multipage);
/* Read the file header of current file_no. */
int get_file_header(binlog_header_data *out_header);
/* Save current position, and restore it later. */ /* Save current position, and restore it later. */
void save_pos(saved_position *out_pos) { *out_pos= s; } void save_pos(saved_position *out_pos) { *out_pos= s; }
@@ -294,7 +316,8 @@ public:
extern uint32_t ibb_page_size_shift; extern uint32_t ibb_page_size_shift;
extern ulong ibb_page_size; extern ulong ibb_page_size;
extern uint32_t current_binlog_state_interval; /* The state interval (in pages) used for active_binlog_file_no. */
extern uint64_t current_binlog_state_interval;
extern mysql_mutex_t active_binlog_mutex; extern mysql_mutex_t active_binlog_mutex;
extern pthread_cond_t active_binlog_cond; extern pthread_cond_t active_binlog_cond;
extern std::atomic<uint64_t> active_binlog_file_no; extern std::atomic<uint64_t> active_binlog_file_no;
@@ -313,11 +336,18 @@ fsp_binlog_release(fsp_binlog_page_entry *page)
extern size_t crc32_pwrite_page(File fd, byte *buf, uint32_t page_no, extern size_t crc32_pwrite_page(File fd, byte *buf, uint32_t page_no,
myf MyFlags) noexcept; myf MyFlags) noexcept;
extern size_t crc32_pread_page(File fd, byte *buf, uint32_t page_no, extern int crc32_pread_page(File fd, byte *buf, uint32_t page_no,
myf MyFlags) noexcept;
extern int crc32_pread_page(pfs_os_file_t fh, byte *buf, uint32_t page_no,
myf MyFlags) noexcept; myf MyFlags) noexcept;
extern void binlog_write_up_to_now() noexcept; extern void binlog_write_up_to_now() noexcept;
extern void fsp_binlog_extract_header_page(const byte *page_buf,
binlog_header_data *out_header_data)
noexcept;
extern void fsp_log_binlog_write(mtr_t *mtr, fsp_binlog_page_entry *page, extern void fsp_log_binlog_write(mtr_t *mtr, fsp_binlog_page_entry *page,
uint32_t page_offset, uint32_t len); uint32_t page_offset, uint32_t len);
extern void fsp_log_header_page(mtr_t *mtr, fsp_binlog_page_entry *page,
uint32_t len) noexcept;
extern void fsp_binlog_init(); extern void fsp_binlog_init();
extern void fsp_binlog_shutdown(); extern void fsp_binlog_shutdown();
extern dberr_t fsp_binlog_tablespace_close(uint64_t file_no); extern dberr_t fsp_binlog_tablespace_close(uint64_t file_no);

View File

@@ -76,8 +76,8 @@ struct chunk_data_flush : public chunk_data_base {
/* /*
Data stored at the start of each binlog file. Data stored at the start of each binlog file.
(The data is stored in the file as compressed integers; this is just a (The data is stored as little-endian values in the first page of the file;
struct to pass around the values in-memory). this is just a struct to pass around the values in-memory).
*/ */
struct binlog_header_data { struct binlog_header_data {
/* /*
@@ -93,16 +93,28 @@ struct binlog_header_data {
*/ */
uint64_t file_no; uint64_t file_no;
/* The length of this binlog file, in pages. */ /* The length of this binlog file, in pages. */
uint32_t page_count; uint64_t page_count;
/* /*
The interval (in pages) at which the (differential) binlog GTID state is The interval (in pages) at which the (differential) binlog GTID state is
written into the binlog file, for faster GTID position search. This written into the binlog file, for faster GTID position search. This
corresponds to the value of --innodb-binlog-state-interval at the time the corresponds to the value of --innodb-binlog-state-interval at the time the
binlog file was created. binlog file was created.
*/ */
uint32_t diff_state_interval; uint64_t diff_state_interval;
/* The log_2 of the page size (eg. ibb_page_size_shift). */
uint32_t page_size_shift;
/*
Major and minor file format version number. The idea is that minor version
increments are backwards compatible, major version upgrades are not.
*/
uint32_t vers_major, vers_minor;
/* Whether the page was found empty. */ /* Whether the page was found empty. */
bool is_empty; bool is_empty;
/*
Whether the page was found invalid, bad magic or major version, or CRC32
error (and not empty).
*/
bool is_invalid;
}; };
@@ -149,10 +161,12 @@ binlog_name_make_short(char *name_buf, uint64_t file_no)
extern void innodb_binlog_startup_init(); extern void innodb_binlog_startup_init();
extern bool innodb_binlog_init(size_t binlog_size, const char *directory); extern bool innodb_binlog_init(size_t binlog_size, const char *directory);
extern void innodb_binlog_close(bool shutdown); extern void innodb_binlog_close(bool shutdown);
extern bool ibb_write_header_page(mtr_t *mtr, uint64_t file_no,
uint64_t file_size_in_pages, lsn_t start_lsn,
uint64_t gtid_state_interval_in_pages);
extern bool binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr, extern bool binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr,
fsp_binlog_page_entry * &block, uint32_t &page_no, fsp_binlog_page_entry * &block, uint32_t &page_no,
uint32_t &page_offset, uint64_t file_no, uint32_t &page_offset, uint64_t file_no);
uint32_t file_size_in_pages);
extern bool innodb_binlog_oob(THD *thd, const unsigned char *data, extern bool innodb_binlog_oob(THD *thd, const unsigned char *data,
size_t data_len, void **engine_data); size_t data_len, void **engine_data);
extern void innodb_free_oob(THD *thd, void *engine_data); extern void innodb_free_oob(THD *thd, void *engine_data);