From b3c6bbdbd378571a22a5dfc82c251bd2713cf432 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Wed, 12 Mar 2025 16:57:42 +0100 Subject: [PATCH] MDEV-34705: Binlog-in-engine: First working recovery Still needs more testing. Signed-off-by: Kristian Nielsen --- .../binlog_in_engine/binlog_in_engine.result | 34 +- .../binlog_in_engine/binlog_in_engine.test | 9 +- .../suite/binlog_in_engine/recovery.result | 56 ++ .../suite/binlog_in_engine/recovery.test | 12 +- sql/log.cc | 2 +- storage/innobase/fsp/fsp_binlog.cc | 32 +- storage/innobase/handler/innodb_binlog.cc | 803 +++++++++++++++++- storage/innobase/include/innodb_binlog.h | 44 +- storage/innobase/log/log0recv.cc | 71 +- 9 files changed, 934 insertions(+), 129 deletions(-) create mode 100644 mysql-test/suite/binlog_in_engine/recovery.result diff --git a/mysql-test/suite/binlog_in_engine/binlog_in_engine.result b/mysql-test/suite/binlog_in_engine/binlog_in_engine.result index 131b046dea0..cacf627cee7 100644 --- a/mysql-test/suite/binlog_in_engine/binlog_in_engine.result +++ b/mysql-test/suite/binlog_in_engine/binlog_in_engine.result @@ -16,25 +16,25 @@ SELECT @@GLOBAL.binlog_checksum; NONE SHOW MASTER STATUS; File Position Binlog_Do_DB Binlog_Ignore_DB -binlog-000000.ibb 767 -SHOW BINLOG EVENTS IN "binlog-000000.ibb"; +binlog-000000.ibb # +include/show_binlog_events.inc Log_name Pos Event_type Server_id End_log_pos Info -binlog-000000.ibb 0 Gtid 1 0 GTID 0-1-1 -binlog-000000.ibb 0 Query 1 0 use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB -binlog-000000.ibb 0 Gtid 1 0 BEGIN GTID 0-1-2 -binlog-000000.ibb 0 Query 1 0 use `test`; INSERT INTO t1 VALUES (1) -binlog-000000.ibb 0 Xid 1 0 COMMIT /* xid=34 */ -binlog-000000.ibb 0 Gtid 1 0 BEGIN GTID 0-1-3 -binlog-000000.ibb 0 Query 1 0 use `test`; INSERT INTO t1 VALUES (2) -binlog-000000.ibb 0 Query 1 0 use `test`; INSERT INTO t1 VALUES (3) -binlog-000000.ibb 0 Xid 1 0 COMMIT /* xid=36 */ -binlog-000000.ibb 0 Gtid 1 0 GTID 0-1-4 -binlog-000000.ibb 0 Query 1 
0 use `test`; DROP TABLE `t1` /* generated by server */ -SHOW BINLOG EVENTS LIMIT 2, 3; +binlog-000000.ibb # Gtid # # GTID #-#-# +binlog-000000.ibb # Query # # use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (1) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (2) +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (3) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +binlog-000000.ibb # Gtid # # GTID #-#-# +binlog-000000.ibb # Query # # use `test`; DROP TABLE `t1` /* generated by server */ +include/show_binlog_events.inc Log_name Pos Event_type Server_id End_log_pos Info -binlog-000000.ibb 0 Gtid 1 0 BEGIN GTID 0-1-2 -binlog-000000.ibb 0 Query 1 0 use `test`; INSERT INTO t1 VALUES (1) -binlog-000000.ibb 0 Xid 1 0 COMMIT /* xid=34 */ +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (1) +binlog-000000.ibb # Xid # # COMMIT /* XID */ CREATE TABLE t2 (a INT PRIMARY KEY, b VARCHAR(2048)) ENGINE=InnoDB; SET SESSION binlog_format= ROW; *** Do 1500 transactions ... diff --git a/mysql-test/suite/binlog_in_engine/binlog_in_engine.test b/mysql-test/suite/binlog_in_engine/binlog_in_engine.test index d34693c54ea..8785c3c8202 100644 --- a/mysql-test/suite/binlog_in_engine/binlog_in_engine.test +++ b/mysql-test/suite/binlog_in_engine/binlog_in_engine.test @@ -27,9 +27,14 @@ SELECT @@GLOBAL.binlog_checksum; # If this gets too annoying to do, we can replace this with something that # checks that the reported file and position is within some reasonable range # of the value left by current code. 
+--replace_column 2 # SHOW MASTER STATUS; -SHOW BINLOG EVENTS IN "binlog-000000.ibb"; -SHOW BINLOG EVENTS LIMIT 2, 3; +--let $binlog_file= binlog-000000.ibb +--let $binlog_start= 0 +--source include/show_binlog_events.inc +--let $binlog_file= +--let $binlog_limit= 2, 3 +--source include/show_binlog_events.inc CREATE TABLE t2 (a INT PRIMARY KEY, b VARCHAR(2048)) ENGINE=InnoDB; diff --git a/mysql-test/suite/binlog_in_engine/recovery.result b/mysql-test/suite/binlog_in_engine/recovery.result new file mode 100644 index 00000000000..d703d7540bf --- /dev/null +++ b/mysql-test/suite/binlog_in_engine/recovery.result @@ -0,0 +1,56 @@ +RESET MASTER; +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1); + +# Flush all dirty pages from buffer pool +SET @no_checkpoint_save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @no_checkpoint_save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0.0; +SET GLOBAL innodb_max_dirty_pages_pct=0.0; +SET GLOBAL innodb_max_dirty_pages_pct= @no_checkpoint_save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm= @no_checkpoint_save_pct_lwm; + +BEGIN; +INSERT INTO t1 VALUES (2); +INSERT INTO t1 VALUES (3); +COMMIT; +INSERT INTO t1 VALUES (4); +INSERT INTO t1 VALUES (5); +INSERT INTO t1 VALUES (6); +INSERT INTO t1 VALUES (7); +SELECT * FROM t1 ORDER BY a; +a +1 +2 +3 +4 +5 +6 +7 +SET SESSION debug_dbug="+d,crash_dispatch_command_before"; +SELECT 1; +Got one of the listed errors +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +binlog-000000.ibb # Gtid # # GTID #-#-# +binlog-000000.ibb # Query # # use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (1) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (2) 
+binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (3) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (4) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (5) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (6) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +binlog-000000.ibb # Gtid # # BEGIN GTID #-#-# +binlog-000000.ibb # Query # # use `test`; INSERT INTO t1 VALUES (7) +binlog-000000.ibb # Xid # # COMMIT /* XID */ +DROP TABLE t1; diff --git a/mysql-test/suite/binlog_in_engine/recovery.test b/mysql-test/suite/binlog_in_engine/recovery.test index 1a91679910e..88db68d20ad 100644 --- a/mysql-test/suite/binlog_in_engine/recovery.test +++ b/mysql-test/suite/binlog_in_engine/recovery.test @@ -14,7 +14,6 @@ INSERT INTO t1 VALUES (1); --let $no_checkpoint_flush= 1 --let $no_checkpoint_kill= 1 --source ../../suite/innodb/include/no_checkpoint_start.inc -SHOW MASTER STATUS; --let $file= query_get_value(SHOW MASTER STATUS, File, 1) --let $pos= query_get_value(SHOW MASTER STATUS, Position, 1) @@ -23,8 +22,11 @@ BEGIN; INSERT INTO t1 VALUES (2); INSERT INTO t1 VALUES (3); COMMIT; +INSERT INTO t1 VALUES (4); +INSERT INTO t1 VALUES (5); +INSERT INTO t1 VALUES (6); +INSERT INTO t1 VALUES (7); SELECT * FROM t1 ORDER BY a; -DROP TABLE t1; # Crash the server --write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect @@ -55,5 +57,7 @@ EOF --enable_reconnect --source include/wait_until_connected_again.inc -SHOW MASTER STATUS; -SHOW BINLOG EVENTS; +--let $binlog_file= +--let $binlog_start= 0 +--source include/show_binlog_events.inc +DROP TABLE t1; diff --git a/sql/log.cc b/sql/log.cc index a729f5e4821..086f6df3fef 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ 
-4819,11 +4819,11 @@ MYSQL_BIN_LOG::reset_engine_binlogs(THD *thd, rpl_gtid *init_state, mysql_mutex_lock(&LOCK_log); mysql_mutex_lock(&LOCK_index); - err= (*opt_binlog_engine_hton->reset_binlogs)(); if (init_state) rpl_global_gtid_binlog_state.load(init_state, init_state_len); else rpl_global_gtid_binlog_state.reset(); + err= (*opt_binlog_engine_hton->reset_binlogs)(); mysql_mutex_unlock(&LOCK_index); mysql_mutex_unlock(&LOCK_log); diff --git a/storage/innobase/fsp/fsp_binlog.cc b/storage/innobase/fsp/fsp_binlog.cc index 1fd768686a3..b68b1c84a8f 100644 --- a/storage/innobase/fsp/fsp_binlog.cc +++ b/storage/innobase/fsp/fsp_binlog.cc @@ -618,12 +618,8 @@ fsp_log_binlog_write(mtr_t *mtr, fsp_binlog_page_entry *page, page_offset= 0; page->flushed_clean= false; } - mtr->write_binlog(LOG_BINLOG_ID_0 + (file_no & 1), page_no, - (uint16_t)page_offset, page_offset + &page->page_buf[0], - len); - sql_print_information("ToDo2: %d, page=%u, off=%u, len=%u)", (int)(file_no & 1), page_no, page_offset, len); - for (uint32_t i= page_offset; i < page_offset+len; i+=8) - sql_print_information("ToDo2: 0x%04x %02X %02X %02X %02X %02X %02X %02X %02X", i, page->page_buf[i], page->page_buf[i+1], page->page_buf[i+2], page->page_buf[i+3], page->page_buf[i+4], page->page_buf[i+5], page->page_buf[i+6], page->page_buf[i+7]); + mtr->write_binlog((file_no & 1), page_no, (uint16_t)page_offset, + page_offset + &page->page_buf[0], len); } /* @@ -809,14 +805,6 @@ fsp_binlog_write_rec(chunk_data_base *chunk_data, mtr_t *mtr, byte chunk_type) and available; binlog tablespace N is active while (N+1) is being pre-allocated. Only under extreme I/O pressure should be need to stall here. - - ToDo: Handle recovery. Idea: write the current LSN at the start of - the binlog tablespace when we create it. At recovery, we should open - the (at most) 2 most recent binlog tablespaces. 
Whenever we have a - redo record, skip it if its LSN is smaller than the one stored in the - tablespace corresponding to its space_id. This way, it should be safe - to re-use tablespace ids between just two, SRV_SPACE_ID_BINLOG0 and - SRV_SPACE_ID_BINLOG1. */ ut_ad(!pending_prev_end_offset); pending_prev_end_offset= page_no << page_size_shift; @@ -981,21 +969,6 @@ fsp_binlog_write_rec(chunk_data_base *chunk_data, mtr_t *mtr, byte chunk_type) } -/* - Empty chunk data, used to pass a dummy record to fsp_binlog_write_rec() - in fsp_binlog_flush(). -*/ -struct chunk_data_flush : public chunk_data_base { - ~chunk_data_flush() { } - - virtual std::pair copy_data(byte *p, uint32_t max_len) final - { - memset(p, 0xff, max_len); - return {max_len, true}; - } -}; - - /* Implementation of FLUSH BINARY LOGS. Truncate the current binlog tablespace, fill up the last page with dummy data @@ -1080,6 +1053,7 @@ fsp_binlog_flush() mtr.start(); fsp_binlog_write_rec(&dummy_data, &mtr, FSP_BINLOG_TYPE_FILLER); mtr.commit(); + log_buffer_flush_to_disk(srv_flush_log_at_trx_commit & 1); return false; } diff --git a/storage/innobase/handler/innodb_binlog.cc b/storage/innobase/handler/innodb_binlog.cc index 1cf717c1f14..a1dd9aa2181 100644 --- a/storage/innobase/handler/innodb_binlog.cc +++ b/storage/innobase/handler/innodb_binlog.cc @@ -408,10 +408,635 @@ struct found_binlogs { }; +/* + This structure holds the state needed during InnoDB recovery for recovering + binlog tablespace files. +*/ +class binlog_recovery { +public: + struct found_binlogs scan_result; + byte *page_buf; + const char *binlog_dir; + /* + The current file number being recovered. + This starts out as the most recent existing non-empty binlog that has a + starting LSN no bigger than the recovery starting LSN. This should always be + one of the two most recent binlog files found at startup. + */ + uint64_t cur_file_no; + /* The physical length of cur_file_no file. 
*/ + uint64_t cur_phys_size; + /* + The starting LSN (as stored in the header of the binlog tablespace file). + No redo prior to this LSN should be applied to this file. + */ + lsn_t start_file_lsn; + /* Open file for cur_file_no, or -1 if not open. */ + File cur_file_fh; + /* The sofar position of redo in cur_file_no (end point of previous redo). */ + uint32_t cur_page_no; + uint32_t cur_page_offset; + + /* The path to cur_file_no. */ + char full_path[OS_FILE_MAX_PATH]; + + bool inited; + /* + Flag set in case of severe error and --innodb-force_recovery to completely + skip any binlog recovery. + */ + bool skip_recovery; + /* + Special case, if we start from completely empty (no non-empty binlog files). + This should recover into an empty binlog state. + */ + bool start_empty; + /* + Special case: The last two files are empty. Then we ignore the last empty + file and use the 2 previous files instead. The ignored file is deleted only + after successful recovery, to try to avoid destroying data in case of + recovery problems. + */ + bool ignore_last; + /* + Mark the case where the first binlog tablespace file we need to consider for + recovery has file LSN that is later than the first redo record; in this case + we need to skip records until the first one that applies to this file. + */ + bool skipping_early_lsn; + /* + Skip any initial records until the start of a page. We are guaranteed that + any page that needs to be recovered will have recovery data for the whole + page, and this way we never need to read-modify-write pages during recovery. 
+ */ + bool skipping_partial_page; + + bool init_recovery(bool space_id, uint32_t page_no, uint16_t offset, + lsn_t start_lsn, lsn_t lsn, + const byte *buf, size_t size) noexcept; + bool apply_redo(bool space_id, uint32_t page_no, uint16_t offset, + lsn_t start_lsn, lsn_t lsn, + const byte *buf, size_t size) noexcept; + int get_header(uint64_t file_no, lsn_t &out_lsn, bool &out_empty) noexcept; + bool init_recovery_from(uint64_t file_no, lsn_t file_lsn, uint32_t page_no, + uint16_t offset, lsn_t lsn, + const byte *buf, size_t size) noexcept; + void init_recovery_empty() noexcept; + void init_recovery_skip_all() noexcept; + void end_actions(bool recovery_successful) noexcept; + void release() noexcept; + bool open_cur_file() noexcept; + bool flush_page() noexcept; + void zero_out_cur_file(); + bool close_file() noexcept; + bool next_file() noexcept; + bool next_page() noexcept; + void update_page_from_record(uint16_t offset, + const byte *buf, size_t size) noexcept; +}; + + +static binlog_recovery recover_obj; + + static void innodb_binlog_prealloc_thread(); +static int scan_for_binlogs(const char *binlog_dir, found_binlogs *binlog_files, + bool error_if_missing) noexcept; static int innodb_binlog_discover(); static bool binlog_state_recover(); static void innodb_binlog_autopurge(uint64_t first_open_file_no); +static int read_gtid_state_from_page(rpl_binlog_state_base *state, + const byte *page, uint32_t page_no, + binlog_header_data *out_header_data); + + +/* + Read the header of a binlog tablespace file identified by file_no. + Sets out_empty to true if the file is empty or has a checksum error (or + is missing). + Else sets out_empty to false and sets out_lsn from the header. + + Returns: + -1 error + 0 File is missing (ENOENT) + 1 File found (but may be empty according to out_empty). 
+*/ +int +binlog_recovery::get_header(uint64_t file_no, lsn_t &out_lsn, bool &out_empty) + noexcept +{ + char full_path[OS_FILE_MAX_PATH]; + rpl_binlog_state_base dummy_state; + binlog_header_data header; + + out_empty= true; + out_lsn= 0; + + binlog_name_make(full_path, file_no, binlog_dir); + File fh= my_open(full_path, O_RDONLY | O_BINARY, MYF(0)); + if (fh < (File)0) + return (my_errno == ENOENT ? 0 : -1); + size_t read= my_pread(fh, page_buf, srv_page_size, 0, MYF(0)); + my_close(fh, MYF(0)); + if (UNIV_UNLIKELY(read == (size_t)-1)) + return -1; + if (read == 0) + return 0; + dummy_state.init(); + int res= read_gtid_state_from_page(&dummy_state, page_buf, 0, &header); + if (res <= 0) + return res; + if (!header.is_empty) + { + out_empty= false; + out_lsn= header.start_lsn; + } + return 1; +} + + +bool binlog_recovery::init_recovery(bool space_id, uint32_t page_no, + uint16_t offset, + lsn_t start_lsn, lsn_t end_lsn, + const byte *buf, size_t size) noexcept +{ + /* Start by initializing resource pointers so we are safe to release(). */ + cur_file_fh= (File)-1; + if (!(page_buf= (byte *)ut_malloc(srv_page_size, mem_key_binlog))) + { + my_error(ER_OUTOFMEMORY, MYF(MY_WME), srv_page_size); + return true; + } + memset(page_buf, 0, srv_page_size); + inited= true; + /* + ToDo: It would be good to find a way to not duplicate this logic for + where the binlog tablespace files are stored with the code in + innodb_binlog_init(). But it's a bit awkward, because InnoDB recovery + runs during plugin init, so not even available for the server to call + into until after recovery is done. + */ + binlog_dir= opt_binlog_directory; + if (!binlog_dir || !binlog_dir[0]) + binlog_dir= "."; + if (scan_for_binlogs(binlog_dir, &scan_result, true) <= 0) + return true; + + /* + Here we find the two most recent, non-empty binlogs to do recovery on. + Before we allocate binlog tablespace file N+2, we flush and fsync file N 
+ to disk. This ensures that we only ever need to apply redo records to the + two most recent files during recovery. + + A special case however arises if the two most recent binlog files are + both completely empty. Then we do not have any LSN to match against to + know if a redo record applies to one of these two files, or to an earlier + file with same value of bit 0 of the file_no. In this case, we ignore the + most recent file (deleting it later after successful recovery), and + consider instead the two prior files, the first of which is guaranteed to + have durably saved a starting LSN to use. + + Hence the loop, which can only ever have one or two iterations. + + A further special case is if there are fewer than two (or three if last + two are empty) files. If there are no files, or only empty files, then the + server must have stopped just after RESET MASTER (or just after + initializing the binlogs at first startup), and we should just start the + binlogs from scratch. + */ + ignore_last= false; + uint64_t file_no2= scan_result.last_file_no; + uint64_t file_no1= scan_result.prev_file_no; + int num_binlogs= scan_result.found_binlogs; + for (;;) + { + lsn_t lsn1= 0, lsn2= 0; + bool is_empty1= true, is_empty2= true; + int res2= get_header(file_no2, lsn2, is_empty2); + + if (num_binlogs == 0 || + (num_binlogs == 1 && is_empty2)) + { + init_recovery_empty(); + return false; + } + if (num_binlogs == 1) + return init_recovery_from(file_no2 + (space_id != (file_no2 & 1)), lsn2, + page_no, offset, start_lsn, buf, size); + + int res1= get_header(file_no1, lsn1, is_empty1); + + if (res2 < 0 && !srv_force_recovery) + { + sql_print_error("InnoDB: I/O error reading binlog file number %" PRIu64, + file_no2); + return true; + } + if (res1 < 0 && !srv_force_recovery) + { + sql_print_error("InnoDB: I/O error reading binlog file number %" PRIu64, + file_no1); + return true; + } + if (is_empty1 && is_empty2) + { + if (!ignore_last) + { + ignore_last= true; + if (file_no2 > 
scan_result.earliest_file_no) + { + --file_no2; + if (file_no1 > scan_result.earliest_file_no) + --file_no1; + else + --num_binlogs; + } + else + --num_binlogs; + continue; + } + if (srv_force_recovery) + { + /* + If the last 3 files are empty, we cannot get an LSN to know which + records apply to each file. This should not happen unless there is + damage to the file system. If force recovery is requested, we must + simply do no recovery at all on the binlog files. + */ + sql_print_warning("InnoDB: Binlog tablespace file recovery is not " + "possible. Recovery is skipped due to " + "--innodb-force-recovery"); + init_recovery_skip_all(); + return false; + } + sql_print_error("InnoDB: Last 3 binlog tablespace files are all empty. " + "Recovery is not possible"); + return true; + } + if (is_empty2) + lsn2= lsn1; + if (space_id == (file_no2 & 1) && start_lsn >= lsn1) + { + if (start_lsn < lsn2 && !srv_force_recovery) + { + sql_print_error("InnoDB: inconsistent space_id %d for lsn=" LSN_PF, + (int)space_id, start_lsn); + return true; + } + return init_recovery_from(file_no2, lsn2, + page_no, offset, start_lsn, buf, size); + } + else + return init_recovery_from(file_no1, lsn1, + page_no, offset, start_lsn, buf, size); + /* NotReached. */ + } +} + + +bool +binlog_recovery::init_recovery_from(uint64_t file_no, lsn_t file_lsn, + uint32_t page_no, uint16_t offset, + lsn_t lsn, const byte *buf, size_t size) + noexcept +{ + cur_file_no= file_no; + cur_phys_size= 0; + start_file_lsn= file_lsn; + cur_page_no= page_no; + cur_page_offset= 0; + skip_recovery= false; + start_empty= false; + skipping_partial_page= true; + if (lsn < start_file_lsn) + skipping_early_lsn= true; + else + { + skipping_early_lsn= false; + if (offset <= FIL_PAGE_DATA) + { + update_page_from_record(offset, buf, size); + skipping_partial_page= false; + } + } + return false; +} + + +/* + Initialize recovery from the state where there are no binlog files, or only + completely empty binlog files. 
In this case we have no file LSN to compare + redo records against. + + This can only happen if we crash immediately after RESET MASTER (or fresh + server installation) as an initial file header is durably written to disk + before binlogging new data. Therefore we should skip _all_ redo records and + recover into a completely empty state. +*/ +void +binlog_recovery::init_recovery_empty() noexcept +{ + cur_file_no= 0; + cur_phys_size= 0; + start_file_lsn= (lsn_t)0; + cur_page_no= 0; + cur_page_offset= 0; + skip_recovery= false; + start_empty= true; + ignore_last= false; + skipping_early_lsn= false; + skipping_partial_page= true; +} + + +void +binlog_recovery::init_recovery_skip_all() noexcept +{ + skip_recovery= true; +} + + +void +binlog_recovery::end_actions(bool recovery_successful) noexcept +{ + char full_path[OS_FILE_MAX_PATH]; + if (recovery_successful && !skip_recovery) + { + if (!start_empty) + { + if (cur_page_offset) + flush_page(); + if (cur_file_fh > (File)-1) + zero_out_cur_file(); + close_file(); + ++cur_file_no; + } + + /* + Delete any binlog tablespace files following the last recovered file. + These files could be pre-allocated but never used files, or they could be + files that were written with data that was eventually not recovered due + to --innodb-flush-log-at-trx-commit=0|2. 
+ */ + for (uint64_t i= cur_file_no; + scan_result.found_binlogs >= 1 && i <= scan_result.last_file_no; + ++i) + { + binlog_name_make(full_path, i, binlog_dir); + if (my_delete(full_path, MYF(MY_WME))) + sql_print_warning("InnoDB: Could not delete empty file '%s' (" + "error: %d)", full_path, my_errno); + } + } + release(); +} + + +void +binlog_recovery::release() noexcept +{ + if (cur_file_fh >= (File)0) + { + my_close(cur_file_fh, MYF(0)); + cur_file_fh= (File)-1; + } + ut_free(page_buf); + page_buf= nullptr; + inited= false; +} + + +bool +binlog_recovery::open_cur_file() noexcept +{ + if (cur_file_fh >= (File)0) + my_close(cur_file_fh, MYF(0)); + binlog_name_make(full_path, cur_file_no, binlog_dir); + cur_file_fh= my_open(full_path, O_RDWR | O_BINARY, MYF(MY_WME)); + if (cur_file_fh < (File)0) + return true; + cur_phys_size= (uint64_t)my_seek(cur_file_fh, 0, MY_SEEK_END, MYF(0)); + return false; +} + + +bool +binlog_recovery::flush_page() noexcept +{ + if (cur_file_fh < (File)0 && + open_cur_file()) + return true; + size_t res= my_pwrite(cur_file_fh, page_buf, srv_page_size, + (uint64_t)cur_page_no << srv_page_size_shift, + MYF(MY_WME)); + if (res != srv_page_size) + return true; + cur_page_offset= 0; + memset(page_buf, 0, srv_page_size); + return false; +} + + +void +binlog_recovery::zero_out_cur_file() +{ + if (cur_file_fh < (File)0) + return; + + /* Recover the original size from the current file. 
*/ + size_t read= my_pread(cur_file_fh, page_buf, srv_page_size, 0, MYF(0)); + if (read != (size_t)srv_page_size) + { + sql_print_warning("InnoDB: Could not read last binlog file during recovery"); + return; + } + binlog_header_data header; + rpl_binlog_state_base dummy_state; + dummy_state.init(); + int res= read_gtid_state_from_page(&dummy_state, page_buf, 0, &header); + if (res <= 0) + { + if (res < 0) + sql_print_warning("InnoDB: Could not read last binlog file during recovery"); + else + sql_print_warning("InnoDB: Empty binlog file header found during recovery"); + ut_ad(0); + return; + } + + /* Fill up or truncate the file to its original size. */ + if (my_chsize(cur_file_fh, (my_off_t)header.page_count << srv_page_size_shift, + 0, MYF(0))) + sql_print_warning("InnoDB: Could not change the size of last binlog file " + "during recovery (error: %d)", my_errno); + for (uint32_t i= cur_page_no + 1; i < header.page_count; ++i) + { + if (my_pread(cur_file_fh, page_buf, srv_page_size, + (my_off_t)i << srv_page_size_shift, MYF(0)) < + (size_t)srv_page_size) + break; + /* Check if page already zeroed out. 
+ if (page_buf[0] == 0 && !memcmp(page_buf, page_buf+1, srv_page_size - 1)) + continue; + memset(page_buf, 0, srv_page_size); + if (my_pwrite(cur_file_fh, page_buf, srv_page_size, + (uint64_t)i << srv_page_size_shift, MYF(MY_WME)) < + (size_t)srv_page_size) + { + sql_print_warning("InnoDB: Error writing to last binlog file during " + "recovery (error code: %d)", my_errno); + break; + } + } +} + + +bool +binlog_recovery::close_file() noexcept +{ + if (cur_file_fh >= (File)0) + { + if (my_sync(cur_file_fh, MYF(MY_WME))) + return true; + my_close(cur_file_fh, MYF(0)); + cur_file_fh= (File)-1; + cur_phys_size= 0; + } + return false; +} + + +bool +binlog_recovery::next_file() noexcept +{ + if (flush_page()) + return true; + if (close_file()) + return true; + ++cur_file_no; + cur_page_no= 0; + cur_page_offset= 0; + return false; +} + + +bool +binlog_recovery::next_page() noexcept +{ + if (flush_page()) + return true; + ++cur_page_no; + return false; +} + + +bool +binlog_recovery::apply_redo(bool space_id, uint32_t page_no, uint16_t offset, + lsn_t start_lsn, lsn_t end_lsn, + const byte *buf, size_t size) noexcept +{ + if (UNIV_UNLIKELY(skip_recovery) || start_empty) + return false; + + if (skipping_partial_page) + { + if (offset > FIL_PAGE_DATA) + return false; + skipping_partial_page= false; + } + + if (start_lsn < start_file_lsn) + { + if (skipping_early_lsn) + return false; /* Skip record for earlier file that's already durable. */ + if (!srv_force_recovery) + { + sql_print_error("InnoDB: Unexpected LSN " LSN_PF " during recovery, " + "expected at least " LSN_PF, start_lsn, start_file_lsn); + return true; + } + sql_print_warning("InnoDB: Ignoring unexpected LSN " LSN_PF " during " + "recovery, ", start_lsn); + return false; + } + skipping_early_lsn= false; + + /* Test for moving to the next file. */ + if (space_id != (cur_file_no & 1)) + { + /* Check that we recovered all of this file. 
+ if ( ( (cur_page_offset > FIL_PAGE_DATA && + cur_page_offset < srv_page_size - FIL_PAGE_DATA_END) || + cur_page_no + (cur_page_offset > FIL_PAGE_DATA) < + cur_phys_size >> srv_page_size_shift) && + !srv_force_recovery) + { + sql_print_error("InnoDB: Missing recovery record at end of file_no=%" + PRIu64 ", LSN " LSN_PF, cur_file_no, start_lsn); + return true; + } + + /* Check that we recover from the start of the next file. */ + if ((page_no > 0 || offset > FIL_PAGE_DATA) && !srv_force_recovery) + { + sql_print_error("InnoDB: Missing recovery record at start of file_no=%" + PRIu64 ", LSN " LSN_PF, cur_file_no+1, start_lsn); + return true; + } + + if (next_file()) + return true; + } + /* Test for moving to the next page. */ + else if (page_no != cur_page_no) + { + if (cur_page_offset < srv_page_size - FIL_PAGE_DATA_END && + !srv_force_recovery) + { + sql_print_error("InnoDB: Missing recovery record in file_no=%" + PRIu64 ", page_no=%u, LSN " LSN_PF, + cur_file_no, cur_page_no, start_lsn); + return true; + } + + if ((page_no != cur_page_no + 1 || offset > FIL_PAGE_DATA) && + !srv_force_recovery) + { + sql_print_error("InnoDB: Missing recovery record in file_no=%" + PRIu64 ", page_no=%u, LSN " LSN_PF, + cur_file_no, cur_page_no + 1, start_lsn); + return true; + } + + if (next_page()) + return true; + } + /* Test no gaps in offset. 
+ else if (offset != cur_page_offset && + offset > FIL_PAGE_DATA && + !srv_force_recovery) + { + sql_print_error("InnoDB: Missing recovery record in file_no=%" + PRIu64 ", page_no=%u, LSN " LSN_PF, + cur_file_no, cur_page_no, start_lsn); + return true; + } + + if (offset + size >= srv_page_size) + return !srv_force_recovery; + + update_page_from_record(offset, buf, size); + return false; +} + + +void +binlog_recovery::update_page_from_record(uint16_t offset, + const byte *buf, size_t size) noexcept +{ + memcpy(page_buf + offset, buf, size); + cur_page_offset= offset + (uint32_t)size; +} /* @@ -489,6 +1114,29 @@ start_binlog_prealloc_thread() } +/* + Write the initial header record to the file and durably sync it to disk in + the binlog tablespace file and in the redo log. + + This is to ensure recovery can work correctly. This way, recovery will + always find a non-empty file with an initial lsn to start recovery from. + Except in the case where we crash right here; in this case recovery will + find no binlog files at all and will know to recover to the empty state + with no binlog files present. +*/ +static void +binlog_sync_initial() +{ + chunk_data_flush dummy_data; + mtr_t mtr; + mtr.start(); + fsp_binlog_write_rec(&dummy_data, &mtr, FSP_BINLOG_TYPE_FILLER); + mtr.commit(); + log_buffer_flush_to_disk(true); + binlog_page_fifo->flush_up_to(0, 0); +} + + /* Open the InnoDB binlog implementation. This is called from server binlog layer if the user configured the binlog to @@ -539,6 +1187,7 @@ innodb_binlog_init(size_t binlog_size, const char *directory) } start_binlog_prealloc_thread(); + binlog_sync_initial(); return false; } @@ -579,6 +1228,42 @@ process_binlog_name(found_binlogs *bls, uint64_t idx, size_t size) } +/* + Scan the binlog directory for binlog files. 
+ Returns: + 1 Success + 0 Binlog directory not found + -1 Other error +*/ +static int +scan_for_binlogs(const char *binlog_dir, found_binlogs *binlog_files, + bool error_if_missing) noexcept +{ + MY_DIR *dir= my_dir(binlog_dir, MYF(MY_WANT_STAT)); + if (!dir) + { + if (my_errno != ENOENT || error_if_missing) + sql_print_error("Could not read the binlog directory '%s', error code %d", + binlog_dir, my_errno); + return (my_errno == ENOENT ? 0 : -1); + } + + binlog_files->found_binlogs= 0; + size_t num_entries= dir->number_of_files; + fileinfo *entries= dir->dir_entry; + for (size_t i= 0; i < num_entries; ++i) { + const char *name= entries[i].name; + uint64_t idx; + if (!is_binlog_name(name, &idx)) + continue; + process_binlog_name(binlog_files, idx, entries[i].mystat->st_size); + } + my_dirend(dir); + + return 1; /* Success */ +} + + static bool binlog_page_empty(const byte *page) { @@ -715,28 +1400,11 @@ innodb_binlog_discover() uint64_t file_no; const uint32_t page_size= (uint32_t)srv_page_size; const uint32_t page_size_shift= (uint32_t)srv_page_size_shift; - MY_DIR *dir= my_dir(innodb_binlog_directory, MYF(MY_WANT_STAT)); - if (!dir) - { - if (my_errno == ENOENT) - return 0; - sql_print_error("Could not read the binlog directory '%s', error code %d", - innodb_binlog_directory, my_errno); - return -1; - } - struct found_binlogs UNINIT_VAR(binlog_files); - binlog_files.found_binlogs= 0; - size_t num_entries= dir->number_of_files; - fileinfo *entries= dir->dir_entry; - for (size_t i= 0; i < num_entries; ++i) { - const char *name= entries[i].name; - uint64_t idx; - if (!is_binlog_name(name, &idx)) - continue; - process_binlog_name(&binlog_files, idx, entries[i].mystat->st_size); - } - my_dirend(dir); + + int res= scan_for_binlogs(innodb_binlog_directory, &binlog_files, false); + if (res <= 0) + return res; /* Now, if we found any binlog files, locate the point in one of them where @@ -752,9 +1420,9 @@ innodb_binlog_discover() earliest_binlog_file_no= 
binlog_files.earliest_file_no; total_binlog_used_size= binlog_files.total_size; - int res= find_pos_in_binlog(binlog_files.last_file_no, - binlog_files.last_size, - page_buf.get(), &page_no, &pos_in_page); + res= find_pos_in_binlog(binlog_files.last_file_no, + binlog_files.last_size, + page_buf.get(), &page_no, &pos_in_page); if (res < 0) { file_no= binlog_files.last_file_no; active_binlog_file_no.store(file_no, std::memory_order_release); @@ -944,17 +1612,19 @@ innodb_binlog_prealloc_thread() __attribute__((noinline)) static ssize_t serialize_gtid_state(rpl_binlog_state_base *state, byte *buf, size_t buf_size, - uint32_t file_size_in_pages, bool is_first_page) + uint32_t file_size_in_pages, uint64_t file_no, + bool is_first_page) { unsigned char *p= (unsigned char *)buf; /* 1 uint64_t for the current LSN at start of binlog file. - 1 uint32_t for the file length in pages. + 1 uint64_t for the file_no. + 1 uint32_t for the file size in pages. 1 uint32_t for the innodb_binlog_state_interval in pages. 1 uint64_t for the number of entries in the state stored. 2 uint32_t + 1 uint64_t for at least one GTID. */ - ut_ad(buf_size >= 4*COMPR_INT_MAX32 + 2*COMPR_INT_MAX64); + ut_ad(buf_size >= 4*COMPR_INT_MAX32 + 4*COMPR_INT_MAX64); if (is_first_page) { /* In the first page where we put the full state, include the value of the @@ -962,10 +1632,11 @@ serialize_gtid_state(rpl_binlog_state_base *state, byte *buf, size_t buf_size, we know how to search them independent of how the setting changes. We also include the current LSN for recovery purposes; and the file - length, which is also useful if we have to recover the whole file from - the redo log after a crash. + length and file_no, which is also useful if we have to recover the whole + file from the redo log after a crash. 
*/ p= compr_int_write(p, log_sys.get_lsn(std::memory_order_acquire)); + p= compr_int_write(p, file_no); p= compr_int_write(p, file_size_in_pages); /* ToDo: Check that this current_binlog_state_interval is the correct value! */ p= compr_int_write(p, current_binlog_state_interval); @@ -1005,7 +1676,8 @@ binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr, block= nullptr; ssize_t used_bytes= serialize_gtid_state(state, small_buf, sizeof(small_buf), - file_size_in_pages, page_no==0); + file_size_in_pages, file_no, + page_no==0); if (used_bytes >= 0) { buf= small_buf; @@ -1019,8 +1691,8 @@ binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr, if (UNIV_UNLIKELY(!alloced_buf)) return true; buf= alloced_buf; - used_bytes= serialize_gtid_state(state, buf, buf_size, - file_size_in_pages, page_no==0); + used_bytes= serialize_gtid_state(state, buf, buf_size, file_size_in_pages, + file_no, page_no==0); if (UNIV_UNLIKELY(used_bytes < 0)) { ut_ad(0 /* Shouldn't happen, as we allocated maximum needed size. */); @@ -1110,7 +1782,11 @@ read_gtid_state_from_page(rpl_binlog_state_base *state, const byte *page, const byte *p= page + FIL_PAGE_DATA; byte t= *p; if (UNIV_UNLIKELY((t & FSP_BINLOG_TYPE_MASK) != FSP_BINLOG_TYPE_GTID_STATE)) + { + out_header_data->is_empty= binlog_page_empty(page); return 0; + } + out_header_data->is_empty= false; /* ToDo: Handle reading a state that spans multiple pages. For now, we assume the state fits in a single page. */ ut_a(t & FSP_BINLOG_FLAG_LAST); @@ -1123,15 +1799,21 @@ read_gtid_state_from_page(rpl_binlog_state_base *state, const byte *page, if (page_no == 0) { /* - The state in the first page has three extra words: The start LSN of the - file; length of the file in pages; and the offset between differential - binlog states logged regularly in the binlog tablespace. 
+ The state in the first page has four extra words: The start LSN of the + file; the file_no of the file; the file length, in pages; and the offset + between differential binlog states logged regularly in the binlog + tablespace. */ if (UNIV_UNLIKELY(p >= p_end)) return -1; out_header_data->start_lsn= (uint32_t)v_and_p.first; v_and_p= compr_int_read(p); p= v_and_p.second; + if (UNIV_UNLIKELY(p >= p_end)) + return -1; + out_header_data->file_no= v_and_p.first; + v_and_p= compr_int_read(p); + p= v_and_p.second; if (UNIV_UNLIKELY(p >= p_end) || UNIV_UNLIKELY(v_and_p.first >= UINT32_MAX)) return -1; out_header_data->page_count= (uint32_t)v_and_p.first; @@ -1146,6 +1828,7 @@ read_gtid_state_from_page(rpl_binlog_state_base *state, const byte *page, else { out_header_data->start_lsn= 0; + out_header_data->file_no= ~(uint64_t)0; out_header_data->page_count= 0; out_header_data->diff_state_interval= 0; } @@ -2298,6 +2981,7 @@ innodb_binlog_get_init_state(rpl_binlog_state_base *out_state) } + bool innodb_reset_binlogs() { @@ -2308,6 +2992,15 @@ innodb_reset_binlogs() /* Close existing binlog tablespaces and stop the pre-alloc thread. */ innodb_binlog_close(false); + /* + Durably flush the redo log to disk. This is mostly to simplify + conceptually (RESET MASTER is not performance critical). This way, we will + never see a state where recovery stops at an LSN prior to the RESET + MASTER, so we do not have any question around truncating the binlog to a + point before the RESET MASTER. + */ + log_buffer_flush_to_disk(true); + /* Prevent any flushing activity while resetting. 
*/ binlog_page_fifo->lock_wait_for_idle(); binlog_page_fifo->reset(); @@ -2346,6 +3039,7 @@ innodb_reset_binlogs() innodb_binlog_init_state(); binlog_page_fifo->unlock(); start_binlog_prealloc_thread(); + binlog_sync_initial(); return err; } @@ -2384,8 +3078,10 @@ innodb_binlog_purge_low(uint64_t limit_file_no, bool by_name, uint64_t limit_name_file_no, uint64_t *out_file_no) { + uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed); + bool need_active_flush= (active <= limit_file_no + 2); ut_ad(by_date || by_size || by_name); - ut_a(limit_file_no <= active_binlog_file_no.load(std::memory_order_relaxed)); + ut_a(limit_file_no <= active); ut_a(limit_file_no <= first_open_binlog_file_no); mysql_mutex_assert_owner(&purge_binlog_mutex); @@ -2431,6 +3127,19 @@ innodb_binlog_purge_low(uint64_t limit_file_no, } else loc_total_size-= stat_buf.st_size; + + /* + Make sure that we always leave at least one binlog file durably non-empty, + by fsync()'ing the first page of the active file before deleting file + (active-2). This way, recovery will always have at least one file header + from which to determine the LSN at which to start applying redo records. 
+ */ + if (file_no + 2 >= active && need_active_flush) + { + binlog_page_fifo->flush_up_to(active, 0); + need_active_flush= false; + } + if (my_delete(filename, MYF(0))) { if (my_errno == ENOENT) @@ -2578,3 +3287,25 @@ innodb_binlog_purge(handler_binlog_purge_info *purge_info) return res; } + + +bool +binlog_recover_write_data(bool space_id, uint32_t page_no, + uint16_t offset, + lsn_t start_lsn, lsn_t lsn, + const byte *buf, size_t size) noexcept +{ + if (!recover_obj.inited) + return recover_obj.init_recovery(space_id, page_no, offset, start_lsn, lsn, + buf, size); + return recover_obj.apply_redo(space_id, page_no, offset, start_lsn, lsn, + buf, size); +} + + +void +binlog_recover_end(lsn_t lsn) noexcept +{ + if (recover_obj.inited) + recover_obj.end_actions(true); +} diff --git a/storage/innobase/include/innodb_binlog.h b/storage/innobase/include/innodb_binlog.h index 80c3a53f074..dcf4c7c0634 100644 --- a/storage/innobase/include/innodb_binlog.h +++ b/storage/innobase/include/innodb_binlog.h @@ -59,6 +59,21 @@ struct chunk_data_base { }; +/* + Empty chunk data, used to pass a dummy record to fsp_binlog_write_rec() + in fsp_binlog_flush(). +*/ +struct chunk_data_flush : public chunk_data_base { + ~chunk_data_flush() { } + + virtual std::pair copy_data(byte *p, uint32_t max_len) final + { + memset(p, 0xff, max_len); + return {max_len, true}; + } +}; + + /* Data stored at the start of each binlog file. (The data is stored in the file as compressed integers; this is just a @@ -72,10 +87,12 @@ struct binlog_header_data { */ lsn_t start_lsn; /* - The length of this binlog file, in pages. Used during recovery to know - what length to create the binlog file with (in the case where we need to - recover the whole file). + The file_no of the binlog file. This is written into the header to be able + to recover it in the case where no binlog files are present at server + start (could be due to FLUSH BINARY LOGS or RESET MASTER). 
*/ + uint64_t file_no; + /* The length of this binlog file, in pages. */ uint32_t page_count; /* The interval (in pages) at which the (differential) binlog GTID state is @@ -84,6 +101,8 @@ struct binlog_header_data { binlog file was created. */ uint32_t diff_state_interval; + /* Whether the page was found empty. */ + bool is_empty; }; @@ -104,11 +123,19 @@ extern size_t total_binlog_used_size; static inline void -binlog_name_make(char name_buf[OS_FILE_MAX_PATH], uint64_t file_no) +binlog_name_make(char name_buf[OS_FILE_MAX_PATH], uint64_t file_no, + const char *binlog_dir) { snprintf(name_buf, OS_FILE_MAX_PATH, "%s/" BINLOG_NAME_BASE "%06" PRIu64 BINLOG_NAME_EXT, - innodb_binlog_directory, file_no); + binlog_dir, file_no); +} + + +static inline void +binlog_name_make(char name_buf[OS_FILE_MAX_PATH], uint64_t file_no) +{ + binlog_name_make(name_buf, file_no, innodb_binlog_directory); } @@ -125,7 +152,7 @@ extern void innodb_binlog_close(bool shutdown); extern bool binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr, fsp_binlog_page_entry * &block, uint32_t &page_no, uint32_t &page_offset, uint64_t file_no, - uint32_t file_size_in_pages); + uint32_t file_size_in_pages); extern bool innodb_binlog_oob(THD *thd, const unsigned char *data, size_t data_len, void **engine_data); extern void innodb_free_oob(THD *thd, void *engine_data); @@ -140,5 +167,10 @@ extern void innodb_binlog_status(char out_filename[FN_REFLEN], extern bool innodb_binlog_get_init_state(rpl_binlog_state_base *out_state); extern bool innodb_reset_binlogs(); extern int innodb_binlog_purge(handler_binlog_purge_info *purge_info); +extern bool binlog_recover_write_data(bool space_id, uint32_t page_no, + uint16_t offset, + lsn_t start_lsn, lsn_t lsn, + const byte *buf, size_t size) noexcept; +extern void binlog_recover_end(lsn_t lsn) noexcept; #endif /* innodb_binlog_h */ diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index fdc8304a67d..a67e7c13138 100644 --- 
a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -53,6 +53,7 @@ Created 9/20/1997 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0pagecompress.h" +#include "innodb_binlog.h" #include "log.h" /** The recovery system */ @@ -2379,20 +2380,6 @@ void recv_sys_t::rewind(source &l, source &begin) noexcept pages_it= pages.end(); } -static void binlog_recover_write_data(bool space_id, uint32_t page_no, - uint16_t offset, - lsn_t start_lsn, lsn_t lsn, - const byte *buf, size_t size) noexcept -{ - sql_print_information("ToDo1: binlog_recover_write_data(space_id=%d page_no=%u offset=%u start_lsn=%lu lsn=%lu size=%lu)", (int)space_id, (unsigned)page_no, (unsigned)offset, (ulong)start_lsn, (ulong)lsn, (ulong)size); - for (size_t i= offset; i < offset+size; i+=8) - sql_print_information("ToDo1: 0x%04x %02X %02X %02X %02X %02X %02X %02X %02X", i, buf[i], buf[i+1], buf[i+2], buf[i+3], buf[i+4], buf[i+5], buf[i+6], buf[i+7]); -} -static void binlog_recover_end(lsn_t lsn) noexcept -{ - sql_print_information("ToDo1: binlog_recover_end(lsn=%lu)]", (ulong)lsn); -} - /** Parse and register one log_t::FORMAT_10_8 mini-transaction. 
@tparam storing whether to store the records @@ -2548,6 +2535,7 @@ restart: } ut_ad(!l.is_eof(rlen)); + bool is_binlog= false; uint32_t idlen; if ((b & 0x80) && got_page_op) { @@ -2597,6 +2585,8 @@ restart: space_id= mlog_decode_varint(l); if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR)) goto page_id_corrupted; + static_assert((LOG_BINLOG_ID_0 | 1) == LOG_BINLOG_ID_1, ""); + is_binlog= storing == YES && (space_id | 1) == LOG_BINLOG_ID_1; l+= idlen; rlen-= idlen; idlen= mlog_decode_varint_length(*l); @@ -2632,6 +2622,7 @@ restart: continue; } if (storing == YES && UNIV_LIKELY(space_id != TRX_SYS_SPACE) && + !is_binlog && !srv_is_undo_tablespace(space_id)) { ut_ad(file_checkpoint != 0); @@ -2781,28 +2772,34 @@ restart: ignore the payload and only compute the mini-transaction checksum; there will be a subsequent call with storing==YES. */ continue; + if (storing == NO) + is_binlog= false; if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1)) goto record_corrupted; ut_d(const source payload{l}); cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); - const uint32_t olen= mlog_decode_varint_length(*cl); - if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) - goto record_corrupted; - const uint32_t offset= mlog_decode_varint(cl); - ut_ad(offset != MLOG_DECODE_ERROR); - static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); - if (UNIV_UNLIKELY(offset >= srv_page_size)) - goto record_corrupted; - last_offset+= offset; - if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size)) - goto record_corrupted; - cl+= olen; - rlen-= olen; + if (!is_binlog) + { + const uint32_t olen= mlog_decode_varint_length(*cl); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(cl); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= srv_page_size)) + goto record_corrupted; + last_offset+= offset; + if (UNIV_UNLIKELY(last_offset < 8 || 
last_offset >= srv_page_size)) + goto record_corrupted; + cl+= olen; + rlen-= olen; + } if ((b & 0x70) == WRITE) { - if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size)) + if (is_binlog); + else if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size)) goto record_corrupted; - if (UNIV_UNLIKELY(!page_no) && file_checkpoint) + else if (UNIV_UNLIKELY(!page_no) && file_checkpoint) { const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE && last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4; @@ -2822,6 +2819,7 @@ restart: : file_name_t::initial_flags; if (it == recv_spaces.end()) ut_ad(storing == NO || space_id == TRX_SYS_SPACE || + is_binlog || srv_is_undo_tablespace(space_id)); else if (!it->second.space) { @@ -2883,7 +2881,7 @@ restart: #endif if (storing == YES) { - if (space_id >= LOG_BINLOG_ID_0 && space_id <= LOG_BINLOG_ID_1) + if (is_binlog) { if ((b & 0xf0) != WRITE) goto record_corrupted; @@ -2894,10 +2892,12 @@ restart: ut_ad(offset != MLOG_DECODE_ERROR); if (UNIV_UNLIKELY(offset + rlen - olen >= 65535)) goto record_corrupted; - binlog_recover_write_data(space_id & 1, page_no, uint16_t(offset), - start_lsn, lsn, - l.get_buf(cl, recs, decrypt_buf) + olen, - l - recs + rlen - olen); + const size_t head{l - recs + olen}; + if (binlog_recover_write_data(space_id & 1, page_no, uint16_t(offset), + start_lsn, lsn, + l.get_buf(cl, recs, decrypt_buf) + head, + rlen - olen)) + goto record_corrupted; continue; } if (if_exists) @@ -4256,6 +4256,9 @@ static bool recv_scan_log(bool last_phase) ut_ad(recv_sys.file_checkpoint); recv_sys.lsn= rewound_lsn; } + else if (store) + binlog_recover_end(recv_sys.lsn); + func_exit: ut_d(recv_sys.after_apply= last_phase); mysql_mutex_unlock(&recv_sys.mutex);