BUG#50364: FLUSH LOGS crashes the server (rpl.rpl_heartbeat_basic

fails in PB sporadically) The IO thread can concurrently access the relay log IO_CACHE while another thread is performing an FLUSH LOGS procedure. FLUSH LOGS closes and reopens the relay log and while doing so it (re)initializes its IO_CACHE. During this procedure the IO_CACHE mutex is also reinitialized, which can cause problems if some other thread (namely the IO THREAD) is concurrently accessing it at the time . This patch fixes the problem by extending the interface of the flush_master_info function to also include a second paramater, "need_relay_log_lock", stating whether the thread should grab the relay log lock or not before actually flushing the relay log. Also, IO thread now calls flush_master_info with this flag set when it flushes master info with in the event read_event loop. Finally, we also increase loop time in rpl_heartbeat_basic test case, so that the number of calls to flush logs doubles, stressing this part of the code a little more. mysql-test/suite/rpl/t/rpl_heartbeat_basic.test: Doubled the number of iterations on the FLUSH LOGS loop by doubling the time available to perform all iterations. sql/repl_failsafe.cc: Updating flush_master_info call so that it uses two parameters instead of one. sql/rpl_mi.cc: Updating flush_master_info call so that it uses two parameters instead of one. sql/rpl_mi.h: Changed flush_master_info interface. Now takes a second parameter instead of just one. The second parameter is: need_lock_relay_log. sql/rpl_rli.cc: Small fix in comment. sql/slave.cc: Updating flush_master_info call so that it uses two parameters instead of one. sql/sql_repl.cc: Updating flush_master_info call so that it uses two parameters instead of one.
2025-07-29 05:21:33 +03:00 · 2010-02-03 16:56:17 +00:00
parent 54fc7997e2
commit 55aab082a9
7 changed files with 25 additions and 10 deletions
--- a/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test
+++ b/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test
@ -382,7 +382,7 @@ let $slave_param_comparison= =;
 let $rcvd_heartbeats_before= query_get_value(SHOW STATUS LIKE 'slave_received_heartbeats', Value, 1);
 # Flush logs every 0.1 second during 5 sec
 --disable_query_log
-let $i=50;
+let $i=100;
 while ($i) {
  FLUSH LOGS;
  dec $i;
--- a/sql/repl_failsafe.cc
+++ b/sql/repl_failsafe.cc
@ -977,7 +977,7 @@ bool load_master_data(THD* thd)
          host was specified; there could have been a problem when replication
          started, which led to relay log's IO_CACHE to not be inited.
        */
-        if (flush_master_info(active_mi, 0))
+        if (flush_master_info(active_mi, FALSE, FALSE))
          sql_print_error("Failed to flush master info file");
      }
      mysql_free_result(master_status_res);
--- a/sql/rpl_mi.cc
+++ b/sql/rpl_mi.cc
@ -387,7 +387,7 @@ file '%s')", fname);
  mi->rli.is_relay_log_recovery= FALSE;
  // now change cache READ -> WRITE - must do this before flush_master_info
  reinit_io_cache(&mi->file, WRITE_CACHE, 0L, 0, 1);
-  if ((error=test(flush_master_info(mi, 1))))
+  if ((error=test(flush_master_info(mi, TRUE, TRUE))))
    sql_print_error("Failed to flush master info file");
  pthread_mutex_unlock(&mi->data_lock);
  DBUG_RETURN(error);
@ -413,7 +413,9 @@ err:
     1 - flush master info failed
     0 - all ok
 */
-int flush_master_info(Master_info* mi, bool flush_relay_log_cache)
+int flush_master_info(Master_info* mi, 
+                      bool flush_relay_log_cache, 
+                      bool need_lock_relay_log)
 {
  IO_CACHE* file = &mi->file;
  char lbuf[22];
@ -436,8 +438,19 @@ int flush_master_info(Master_info* mi, bool flush_relay_log_cache)
  */
  if (flush_relay_log_cache)
  {
+    pthread_mutex_t *log_lock= mi->rli.relay_log.get_log_lock();
    IO_CACHE *log_file= mi->rli.relay_log.get_log_file();
-    if (flush_io_cache(log_file))
+
+    if (need_lock_relay_log)
+      pthread_mutex_lock(log_lock);
+
+    safe_mutex_assert_owner(log_lock);
+    err= flush_io_cache(log_file);
+
+    if (need_lock_relay_log)
+      pthread_mutex_unlock(log_lock);
+
+    if (err)
      DBUG_RETURN(2);
  }
  
--- a/sql/rpl_mi.h
+++ b/sql/rpl_mi.h
@ -120,7 +120,9 @@ int init_master_info(Master_info* mi, const char* master_info_fname,
 		     bool abort_if_no_master_info_file,
 		     int thread_mask);
 void end_master_info(Master_info* mi);
-int flush_master_info(Master_info* mi, bool flush_relay_log_cache);
+int flush_master_info(Master_info* mi, 
+                      bool flush_relay_log_cache, 
+                      bool need_lock_relay_log);
 int change_master_server_id_cmp(ulong *id1, ulong *id2);

 #endif /* HAVE_REPLICATION */
--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@ -121,7 +121,7 @@ int init_relay_log_info(Relay_log_info* rli,
  /*
    The relay log will now be opened, as a SEQ_READ_APPEND IO_CACHE.
    Note that the I/O thread flushes it to disk after writing every
-    event, in flush_master_info(mi, 1).
+    event, in flush_master_info(mi, 1, ?).
  */

  /*
--- a/sql/slave.cc
+++ b/sql/slave.cc
@ -1726,7 +1726,7 @@ static void write_ignored_events_info_to_relay_log(THD *thd, Master_info *mi)
                   " to the relay log, SHOW SLAVE STATUS may be"
                   " inaccurate");
      rli->relay_log.harvest_bytes_written(&rli->log_space_total);
-      if (flush_master_info(mi, 1))
+      if (flush_master_info(mi, TRUE, FALSE))
        sql_print_error("Failed to flush master info file");
      delete ev;
    }
@ -3047,7 +3047,7 @@ Stopping slave I/O thread due to out-of-memory error from master");
        goto err;
      }

-      if (flush_master_info(mi, 1))
+      if (flush_master_info(mi, TRUE, TRUE))
      {
        sql_print_error("Failed to flush master info file");
        goto err;
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@ -1532,7 +1532,7 @@ bool change_master(THD* thd, Master_info* mi)
    Relay log's IO_CACHE may not be inited, if rli->inited==0 (server was never
    a slave before).
  */
-  if (flush_master_info(mi, 0))
+  if (flush_master_info(mi, FALSE, FALSE))
  {
    my_error(ER_RELAY_LOG_INIT, MYF(0), "Failed to flush master info file");
    ret= TRUE;