From 72e1cc8f5296947c0bcea8b8d682fc8c94484cd7 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Fri, 10 Jan 2025 00:06:25 +0100 Subject: [PATCH] MDEV-35806: Error in read_log_event() corrupts relay log writer, crashes server In Log_event::read_log_event(), don't use IO_CACHE::error of the relay log's IO_CACHE to signal an error back to the caller. When reading the active relay log, this flag is also being used by the IO thread, and setting it can randomly cause the IO thread to wrongly detect IO error on writing and permanently disable the relay log. This was seen sporadically in test case rpl.rpl_from_mysql80. The read error set by the SQL thread in the IO_CACHE would be interpreted as a write error by the IO thread, which would cause it to throw a fatal error and close the relay log. And this would later cause CHANGE MASTER to try to purge a closed relay log, resulting in nullptr crash. SQL thread is not able to parse an event read from the relay log. This can happen like here when replicating unknown events from a MySQL master, potentially also for other reasons. Also fix a mistake in my_b_flush_io_cache() introduced back in 2001 (fa09f2cd7e7) where my_b_flush_io_cache() could wrongly return an error set in IO_CACHE::error, even if the flush operation itself succeeded. Also fix another sporadic failure in rpl.rpl_from_mysql80 where the outout of MASTER_POS_WAIT() depended on timing of SQL and IO thread. Reviewed-by: Monty Reviewed-by: Andrei Elkin Signed-off-by: Kristian Nielsen --- client/mysqlbinlog.cc | 17 +++++++++++------ mysql-test/suite/rpl/r/rpl_from_mysql80.result | 2 -- mysql-test/suite/rpl/t/rpl_from_mysql80.test | 2 ++ mysys/mf_iocache.c | 2 +- sql/log.cc | 12 ++++++++---- sql/log_event.cc | 15 ++++++++++++--- sql/log_event.h | 2 +- sql/log_event_server.cc | 4 ++-- sql/rpl_parallel.cc | 5 +++-- sql/rpl_rli.cc | 9 +++++---- sql/slave.cc | 7 ++++--- sql/sql_repl.cc | 8 +++++--- 12 files changed, 54 insertions(+), 31 deletions(-) diff --git a/client/mysqlbinlog.cc b/client/mysqlbinlog.cc index 0cb5cf2c67c..f17bf50d4a3 100644 --- a/client/mysqlbinlog.cc +++ b/client/mysqlbinlog.cc @@ -2702,6 +2702,7 @@ static Exit_status check_header(IO_CACHE* file, uchar buf[PROBE_HEADER_LEN]; my_off_t tmp_pos, pos; MY_STAT my_file_stat; + int read_error; delete glob_description_event; if (!(glob_description_event= new Format_description_log_event(3))) @@ -2802,7 +2803,8 @@ static Exit_status check_header(IO_CACHE* file, Format_description_log_event *new_description_event; my_b_seek(file, tmp_pos); /* seek back to event's start */ if (!(new_description_event= (Format_description_log_event*) - Log_event::read_log_event(file, glob_description_event, + Log_event::read_log_event(file, &read_error, + glob_description_event, opt_verify_binlog_checksum))) /* EOF can't be hit here normally, so it's a real error */ { @@ -2835,7 +2837,8 @@ static Exit_status check_header(IO_CACHE* file, { Log_event *ev; my_b_seek(file, tmp_pos); /* seek back to event's start */ - if (!(ev= Log_event::read_log_event(file, glob_description_event, + if (!(ev= Log_event::read_log_event(file, &read_error, + glob_description_event, opt_verify_binlog_checksum))) { /* EOF can't be hit here normally, so it's a real error */ @@ -2948,8 +2951,10 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info, { char llbuff[21]; my_off_t old_off = my_b_tell(file); + int read_error; - Log_event* ev = Log_event::read_log_event(file, glob_description_event, + Log_event* ev = Log_event::read_log_event(file, &read_error, + glob_description_event, opt_verify_binlog_checksum); if (!ev) { @@ -2958,15 +2963,15 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info, about a corruption, but treat it as EOF and move to the next binlog. */ if (glob_description_event->flags & LOG_EVENT_BINLOG_IN_USE_F) - file->error= 0; - else if (file->error) + read_error= 0; + else if (read_error) { error("Could not read entry at offset %s: " "Error in log format or read error.", llstr(old_off,llbuff)); goto err; } - // else file->error == 0 means EOF, that's OK, we break in this case + // else read_error == 0 means EOF, that's OK, we break in this case /* Emit a warning in the event that we finished processing input diff --git a/mysql-test/suite/rpl/r/rpl_from_mysql80.result b/mysql-test/suite/rpl/r/rpl_from_mysql80.result index 56dca85150b..0f67f8a6235 100644 --- a/mysql-test/suite/rpl/r/rpl_from_mysql80.result +++ b/mysql-test/suite/rpl/r/rpl_from_mysql80.result @@ -11,8 +11,6 @@ START SLAVE IO_THREAD; include/wait_for_slave_io_to_start.inc START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178; SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60); -MASTER_POS_WAIT('master-bin.000001', 1178, 60) -NULL SELECT * FROM t1 ORDER BY a; a b c 1 0 diff --git a/mysql-test/suite/rpl/t/rpl_from_mysql80.test b/mysql-test/suite/rpl/t/rpl_from_mysql80.test index ec4a22056f9..a5723486ee9 100644 --- a/mysql-test/suite/rpl/t/rpl_from_mysql80.test +++ b/mysql-test/suite/rpl/t/rpl_from_mysql80.test @@ -81,7 +81,9 @@ START SLAVE IO_THREAD; # The position 1178 is the start of: INSERT INTO t1 VALUES (4, 0, 'skip'); # After that comes unknown MySQL 8.0 events, which we test error for below. START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178; +--disable_result_log SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60); +--enable_result_log SELECT * FROM t1 ORDER BY a; --source include/wait_for_slave_sql_to_stop.inc diff --git a/mysys/mf_iocache.c b/mysys/mf_iocache.c index 4ee1331bdb3..c92b0457e07 100644 --- a/mysys/mf_iocache.c +++ b/mysys/mf_iocache.c @@ -1735,7 +1735,7 @@ int my_b_flush_io_cache(IO_CACHE *info, int need_append_buffer_lock) info->write_pos= info->write_buffer; ++info->disk_writes; UNLOCK_APPEND_BUFFER; - DBUG_RETURN(info->error); + DBUG_RETURN(0); } } UNLOCK_APPEND_BUFFER; diff --git a/sql/log.cc b/sql/log.cc index 8f4f293a3d0..ce98843b2ad 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -11385,7 +11385,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, binlog_checkpoint_found= false; for (round= 1;;) { - while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log, + int error; + while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log, &error, fdle, opt_master_verify_checksum)) && ev->is_valid()) { @@ -11671,7 +11672,8 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) return 1; } - if ((ev= Log_event::read_log_event(&log, &fdle, + int read_error; + if ((ev= Log_event::read_log_event(&log, &read_error, &fdle, opt_master_verify_checksum)) && ev->get_type_code() == FORMAT_DESCRIPTION_EVENT) { @@ -11895,10 +11897,11 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list) Format_description_log_event *fdle; Log_event *ev; const char *errormsg = NULL; + int read_error; *out_gtid_list= NULL; - if (!(ev= Log_event::read_log_event(cache, &init_fdle, + if (!(ev= Log_event::read_log_event(cache, &read_error, &init_fdle, opt_master_verify_checksum)) || ev->get_type_code() != FORMAT_DESCRIPTION_EVENT) { @@ -11914,7 +11917,8 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list) { Log_event_type typ; - ev= Log_event::read_log_event(cache, fdle, opt_master_verify_checksum); + ev= Log_event::read_log_event(cache, &read_error, fdle, + opt_master_verify_checksum); if (!ev) { errormsg= "Could not read GTID list event while looking for GTID " diff --git a/sql/log_event.cc b/sql/log_event.cc index 6e2ce0ab4bf..b1657275128 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -916,7 +916,7 @@ int Log_event::read_log_event(IO_CACHE* file, String* packet, DBUG_RETURN(0); } -Log_event* Log_event::read_log_event(IO_CACHE* file, +Log_event* Log_event::read_log_event(IO_CACHE* file, int *out_error, const Format_description_log_event *fdle, my_bool crc_check, my_bool print_errors) @@ -927,6 +927,7 @@ Log_event* Log_event::read_log_event(IO_CACHE* file, const char *error= 0; Log_event *res= 0; + *out_error= 0; switch (read_log_event(file, &event, fdle, BINLOG_CHECKSUM_ALG_OFF)) { case 0: @@ -976,14 +977,22 @@ err: #endif /* - The SQL slave thread will check if file->error<0 to know + The SQL slave thread will check *out_error to know if there was an I/O error. Even if there is no "low-level" I/O errors with 'file', any of the high-level above errors is worrying enough to stop the SQL thread now ; as we are skipping the current event, going on with reading and successfully executing other events can only corrupt the slave's databases. So stop. */ - file->error= -1; + *out_error= 1; + /* + Clear any error that might have been set in the IO_CACHE from a read + error, while we are still holding the relay log mutex (if reading from + the hot log). Otherwise the error might interfere unpredictably with + write operations to the same IO_CACHE in the IO thread. + */ + file->error= 0; + #ifndef MYSQL_CLIENT if (!print_errors) diff --git a/sql/log_event.h b/sql/log_event.h index 03902cc286e..de93450669c 100644 --- a/sql/log_event.h +++ b/sql/log_event.h @@ -1344,7 +1344,7 @@ public: we detect the event's type, then call the specific event's constructor and pass description_event as an argument. */ - static Log_event* read_log_event(IO_CACHE* file, + static Log_event* read_log_event(IO_CACHE* file, int *out_error, const Format_description_log_event *description_event, my_bool crc_check, diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index cd974ce0466..8b740195b24 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -5041,7 +5041,7 @@ int Execute_load_log_event::do_apply_event(rpl_group_info *rgi) char fname[FN_REFLEN+10]; char *ext; int fd; - int error= 1; + int error= 1, read_error; IO_CACHE file; Load_log_event *lev= 0; Relay_log_info const *rli= rgi->rli; @@ -5060,7 +5060,7 @@ int Execute_load_log_event::do_apply_event(rpl_group_info *rgi) goto err; } if (!(lev= (Load_log_event*) - Log_event::read_log_event(&file, + Log_event::read_log_event(&file, &read_error, rli->relay_log.description_event_for_exec, opt_slave_sql_verify_checksum)) || lev->get_type_code() != NEW_LOAD_EVENT) diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 25bc31638d8..f2633c3e1d1 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -1037,14 +1037,15 @@ do_retry: /* The loop is here so we can try again the next relay log file on EOF. */ for (;;) { + int error; old_offset= cur_offset; - ev= Log_event::read_log_event(&rlog, description_event, + ev= Log_event::read_log_event(&rlog, &error, description_event, opt_slave_sql_verify_checksum); cur_offset= my_b_tell(&rlog); if (ev) break; - if (unlikely(rlog.error < 0)) + if (unlikely(error)) { errmsg= "slave SQL thread aborted because of I/O error"; err= 1; diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index 736fd11d14f..97d70decb2e 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -544,12 +544,13 @@ read_relay_log_description_event(IO_CACHE *cur_log, ulonglong start_pos, if (my_b_tell(cur_log) >= start_pos) break; - if (!(ev= Log_event::read_log_event(cur_log, fdev, + int read_error; + if (!(ev= Log_event::read_log_event(cur_log, &read_error, fdev, opt_slave_sql_verify_checksum))) { - DBUG_PRINT("info",("could not read event, cur_log->error=%d", - cur_log->error)); - if (cur_log->error) /* not EOF */ + DBUG_PRINT("info",("could not read event, read_error=%d", + read_error)); + if (read_error) /* not EOF */ { *errmsg= "I/O error reading event at position 4"; delete fdev; diff --git a/sql/slave.cc b/sql/slave.cc index c27a203f2d8..90680144426 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -7845,7 +7845,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size) MYSQL_BIN_LOG::open() will write the buffered description event. */ old_pos= rli->event_relay_log_pos; - if ((ev= Log_event::read_log_event(cur_log, + int error; + if ((ev= Log_event::read_log_event(cur_log, &error, rli->relay_log.description_event_for_exec, opt_slave_sql_verify_checksum))) @@ -7862,8 +7863,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size) DBUG_RETURN(ev); } if (opt_reckless_slave) // For mysql-test - cur_log->error = 0; - if (unlikely(cur_log->error < 0)) + error = 0; + if (unlikely(error)) { errmsg = "slave SQL thread aborted because of I/O error"; if (hot_log) diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index 0ed5cb5e99f..f1661fbad95 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -4188,7 +4188,8 @@ bool mysql_show_binlog_events(THD* thd) my_off_t scan_pos = BIN_LOG_HEADER_SIZE; while (scan_pos < pos) { - ev= Log_event::read_log_event(&log, description_event, + int error; + ev= Log_event::read_log_event(&log, &error, description_event, opt_master_verify_checksum); scan_pos = my_b_tell(&log); if (ev == NULL || !ev->is_valid()) @@ -4263,8 +4264,9 @@ bool mysql_show_binlog_events(THD* thd) writing about this in the server log would be confusing as it isn't related to server operational status. */ + int error; for (event_count = 0; - (ev = Log_event::read_log_event(&log, + (ev = Log_event::read_log_event(&log, &error, description_event, (opt_master_verify_checksum || verify_checksum_once), false)); ) @@ -4308,7 +4310,7 @@ bool mysql_show_binlog_events(THD* thd) break; } - if (unlikely(event_count < unit->lim.get_select_limit() && log.error)) + if (unlikely(event_count < unit->lim.get_select_limit() && error)) { errmsg = "Wrong offset or I/O error"; mysql_mutex_unlock(log_lock);