From cf287341cc4e418b5310f8882970ad32a00c728a Mon Sep 17 00:00:00 2001 From: "guilhem@mysql.com" <> Date: Thu, 11 Mar 2004 16:23:35 +0100 Subject: [PATCH] Fix for BUG#2921 "Replication problem on mutex lock in mySQL-4.0.18": re-using unused LOCK_active_mi to serialize all administrative commands related to replication: START SLAVE, STOP SLAVE, RESET SLAVE, CHANGE MASTER, init_slave() (replication autostart at server startup), end_slave() (replication autostop at server shutdown), LOAD DATA FROM MASTER. This protects us against a handful of deadlocks (like BUG#2921 when two START SLAVE, but when two STOP SLAVE too). Removing unused variables. --- sql/item_func.cc | 2 -- sql/repl_failsafe.cc | 8 ++++---- sql/set_var.cc | 8 ++++---- sql/slave.cc | 21 +++++++++++++++++---- sql/slave.h | 36 ++++++++++++++++-------------------- sql/sql_parse.cc | 28 ++++++++++++++-------------- sql/sql_show.cc | 4 ++-- 7 files changed, 57 insertions(+), 50 deletions(-) diff --git a/sql/item_func.cc b/sql/item_func.cc index 656dff63609..ab96915e746 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -1543,13 +1543,11 @@ longlong Item_master_pos_wait::val_int() } longlong pos = args[1]->val_int(); longlong timeout = (arg_count==3) ? args[2]->val_int() : 0 ; - LOCK_ACTIVE_MI; if ((event_count = active_mi->rli.wait_for_pos(thd, log_name, pos, timeout)) == -2) { null_value = 1; event_count=0; } - UNLOCK_ACTIVE_MI; return event_count; } diff --git a/sql/repl_failsafe.cc b/sql/repl_failsafe.cc index 56789a01bf4..e687b227946 100644 --- a/sql/repl_failsafe.cc +++ b/sql/repl_failsafe.cc @@ -750,7 +750,7 @@ int load_master_data(THD* thd) We do not want anyone messing with the slave at all for the entire duration of the data load. */ - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); lock_slave_threads(active_mi); init_thread_mask(&restart_thread_mask,active_mi,0 /*not inverse*/); if (restart_thread_mask && @@ -759,7 +759,7 @@ int load_master_data(THD* thd) { send_error(&thd->net,error); unlock_slave_threads(active_mi); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); return 1; } @@ -913,7 +913,7 @@ int load_master_data(THD* thd) { send_error(&thd->net, 0, "Failed purging old relay logs"); unlock_slave_threads(active_mi); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); return 1; } pthread_mutex_lock(&active_mi->rli.data_lock); @@ -934,7 +934,7 @@ int load_master_data(THD* thd) err: unlock_slave_threads(active_mi); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); thd->proc_info = 0; mc_mysql_close(&mysql); // safe to call since we always do mc_mysql_init() diff --git a/sql/set_var.cc b/sql/set_var.cc index b9229abfcc4..b28b6cda23e 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -1271,7 +1271,7 @@ byte *sys_var_insert_id::value_ptr(THD *thd, enum_var_type type) bool sys_var_slave_skip_counter::check(THD *thd, set_var *var) { int result= 0; - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); pthread_mutex_lock(&active_mi->rli.run_lock); if (active_mi->rli.slave_running) { @@ -1279,14 +1279,14 @@ bool sys_var_slave_skip_counter::check(THD *thd, set_var *var) result=1; } pthread_mutex_unlock(&active_mi->rli.run_lock); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); return result; } bool sys_var_slave_skip_counter::update(THD *thd, set_var *var) { - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); pthread_mutex_lock(&active_mi->rli.run_lock); /* The following test should normally never be true as we test this @@ -1300,7 +1300,7 @@ bool sys_var_slave_skip_counter::update(THD *thd, set_var *var) pthread_mutex_unlock(&active_mi->rli.data_lock); } pthread_mutex_unlock(&active_mi->rli.run_lock); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); return 0; } diff --git a/sql/slave.cc b/sql/slave.cc index 84f4ff4f3e0..30052e874b3 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -34,7 +34,6 @@ typedef bool (*CHECK_KILLED_FUNC)(THD*,void*); volatile bool slave_sql_running = 0, slave_io_running = 0; char* slave_load_tmpdir = 0; MASTER_INFO *active_mi; -volatile int active_mi_in_use = 0; HASH replicate_do_table, replicate_ignore_table; DYNAMIC_ARRAY replicate_wild_do_table, replicate_wild_ignore_table; bool do_table_inited = 0, ignore_table_inited = 0; @@ -114,8 +113,12 @@ int init_slave() { DBUG_ENTER("init_slave"); - /* This is called when mysqld starts */ - + /* + This is called when mysqld starts. Before client connections are + accepted. However bootstrap may conflict with us if it does START SLAVE. + So it's safer to take the lock. + */ + pthread_mutex_lock(&LOCK_active_mi); /* TODO: re-write this to interate through the list of files for multi-master @@ -160,9 +163,11 @@ int init_slave() goto err; } } + pthread_mutex_unlock(&LOCK_active_mi); DBUG_RETURN(0); err: + pthread_mutex_unlock(&LOCK_active_mi); DBUG_RETURN(1); } @@ -806,7 +811,14 @@ static int end_slave_on_walk(MASTER_INFO* mi, gptr /*unused*/) void end_slave() { - /* This is called when the server terminates, in close_connections(). */ + /* + This is called when the server terminates, in close_connections(). + It terminates slave threads. However, some CHANGE MASTER etc may still be + running presently. If a START SLAVE was in progress, the mutex lock below + will make us wait until slave threads have started, and START SLAVE + returns, then we terminate them here. + */ + pthread_mutex_lock(&LOCK_active_mi); if (active_mi) { /* @@ -827,6 +839,7 @@ void end_slave() delete active_mi; active_mi= 0; } + pthread_mutex_unlock(&LOCK_active_mi); } diff --git a/sql/slave.h b/sql/slave.h index 9877582d094..4a6389e9f87 100644 --- a/sql/slave.h +++ b/sql/slave.h @@ -27,12 +27,19 @@ /* MUTEXES in replication: - LOCK_active_mi: this is meant for multimaster, when we can switch from a - master to another. It protects active_mi. We don't care of it for the moment, - as active_mi never moves (it's created at startup and deleted at shutdown, and - not changed: it always points to the same MASTER_INFO struct), because we - don't have multimaster. So for the moment, mi does not move, and mi->rli does - not either. + LOCK_active_mi: [note: this was originally meant for multimaster, to switch + from a master to another, to protect active_mi] It is used to SERIALIZE ALL + administrative commands of replication: START SLAVE, STOP SLAVE, CHANGE + MASTER, RESET SLAVE, end_slave() (when mysqld stops) [init_slave() does not + need it it's called early]. Any of these commands holds the mutex from the + start till the end. This thus protects us against a handful of deadlocks + (consider start_slave_thread() which, when starting the I/O thread, releases + mi->run_lock, keeps rli->run_lock, and tries to re-acquire mi->run_lock). + + Currently active_mi never moves (it's created at startup and deleted at + shutdown, and not changed: it always points to the same MASTER_INFO struct), + because we don't have multimaster. So for the moment, mi does not move, and + mi->rli does not either. In MASTER_INFO: run_lock, data_lock run_lock protects all information about the run state: slave_running, and the @@ -43,6 +50,9 @@ In RELAY_LOG_INFO: run_lock, data_lock see MASTER_INFO + Order of acquisition: if you want to have LOCK_active_mi and a run_lock, you + must acquire LOCK_active_mi first. + In MYSQL_LOG: LOCK_log, LOCK_index of the binlog and the relay log LOCK_log: when you write to it. LOCK_index: when you create/delete a binlog (so that you have to update the .index file). @@ -64,19 +74,6 @@ enum enum_binlog_formats { BINLOG_FORMAT_323_LESS_57, BINLOG_FORMAT_323_GEQ_57 }; -/* - TODO: this needs to be redone, but for now it does not matter since - we do not have multi-master yet. -*/ - -#define LOCK_ACTIVE_MI { pthread_mutex_lock(&LOCK_active_mi); \ - ++active_mi_in_use; \ - pthread_mutex_unlock(&LOCK_active_mi);} - -#define UNLOCK_ACTIVE_MI { pthread_mutex_lock(&LOCK_active_mi); \ - --active_mi_in_use; \ - pthread_mutex_unlock(&LOCK_active_mi); } - /* st_relay_log_info contains information on the current relay log and relay log offset, and master log name and log sequence corresponding to the @@ -441,7 +438,6 @@ extern "C" pthread_handler_decl(handle_slave_io,arg); extern "C" pthread_handler_decl(handle_slave_sql,arg); extern bool volatile abort_loop; extern MASTER_INFO main_mi, *active_mi; /* active_mi for multi-master */ -extern volatile int active_mi_in_use; extern LIST master_list; extern HASH replicate_do_table, replicate_ignore_table; extern DYNAMIC_ARRAY replicate_wild_do_table, replicate_wild_ignore_table; diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index bc69f7136ad..88af18491ec 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -1519,9 +1519,9 @@ mysql_execute_command(void) { if (check_global_access(thd, SUPER_ACL)) goto error; - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); res = change_master(thd,active_mi); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); break; } case SQLCOM_SHOW_SLAVE_STAT: @@ -1529,9 +1529,9 @@ mysql_execute_command(void) /* Accept one of two privileges */ if (check_global_access(thd, SUPER_ACL | REPL_CLIENT_ACL)) goto error; - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); res = show_master_info(thd,active_mi); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); break; } case SQLCOM_SHOW_MASTER_STAT: @@ -1581,7 +1581,7 @@ mysql_execute_command(void) if (error) goto error; } - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); /* fetch_master_table will send the error to the client on failure. Give error if the table already exists. @@ -1591,7 +1591,7 @@ mysql_execute_command(void) { send_ok(&thd->net); } - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); break; } #endif /* HAVE_REPLICATION */ @@ -1702,9 +1702,9 @@ mysql_execute_command(void) #ifdef HAVE_REPLICATION case SQLCOM_SLAVE_START: { - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); start_slave(thd,active_mi,1 /* net report*/); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); break; } case SQLCOM_SLAVE_STOP: @@ -1727,9 +1727,9 @@ mysql_execute_command(void) break; } { - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); stop_slave(thd,active_mi,1/* net report*/); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); break; } #endif /* HAVE_REPLICATION */ @@ -3638,9 +3638,9 @@ bool reload_acl_and_cache(THD *thd, ulong options, TABLE_LIST *tables) mysql_update_log.new_file(1); mysql_bin_log.new_file(1); mysql_slow_log.new_file(1); - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); rotate_relay_log(active_mi); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); if (ha_flush_logs()) result=1; @@ -3685,7 +3685,7 @@ bool reload_acl_and_cache(THD *thd, ulong options, TABLE_LIST *tables) #endif if (options & REFRESH_SLAVE) { - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); if (reset_slave(thd, active_mi)) { result=1; @@ -3697,7 +3697,7 @@ bool reload_acl_and_cache(THD *thd, ulong options, TABLE_LIST *tables) */ error_already_sent=1; } - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); } if (options & REFRESH_USER_RESOURCES) reset_mqh(thd,(LEX_USER *) NULL); diff --git a/sql/sql_show.cc b/sql/sql_show.cc index 3d727a33173..63f3afab128 100644 --- a/sql/sql_show.cc +++ b/sql/sql_show.cc @@ -1270,11 +1270,11 @@ int mysqld_show(THD *thd, const char *wild, show_var_st *variables, #ifdef HAVE_REPLICATION case SHOW_SLAVE_RUNNING: { - LOCK_ACTIVE_MI; + pthread_mutex_lock(&LOCK_active_mi); net_store_data(&packet2, (active_mi->slave_running && active_mi->rli.slave_running) ? "ON" : "OFF"); - UNLOCK_ACTIVE_MI; + pthread_mutex_unlock(&LOCK_active_mi); break; } #endif