mirror of
https://github.com/MariaDB/server.git
synced 2025-12-24 11:21:21 +03:00
"After Monty's review" changes to the fix for BUG#8325 "Deadlock in replication thread stops replication":
Replace sleep() with safe_sleep() (thread safe); sleep 0/1/2/3/4/5/5/5 seconds on successive retries (so that the slave falls less far behind); no message in the error log on each retry (deadlocks are sometimes too common), a global counter instead (SHOW STATUS LIKE 'slave_retried_transactions'). Plus a fix for libmysql/Makefile.shared.
This commit is contained in:
@@ -44,6 +44,7 @@ dlenev@build.mysql.com
|
||||
dlenev@jabberwock.localdomain
|
||||
dlenev@mysql.com
|
||||
ejonore@mc03.ndb.mysql.com
|
||||
gbichot@quadita2.mysql.com
|
||||
gbichot@quadxeon.mysql.com
|
||||
georg@beethoven.local
|
||||
georg@lmy002.wdf.sap.corp
|
||||
|
||||
@@ -94,7 +94,8 @@ clean-local:
|
||||
`echo $(sql_cmn_objects) | sed "s;\.lo;.c;g"` \
|
||||
$(CHARSET_SRCS) $(CHARSET_OBJS) \
|
||||
$(mystringsextra) $(mysysheaders) $(vioheaders)\
|
||||
../linked_client_sources net.c
|
||||
../linked_libmysql_sources ../linked_libmysql_r_sources \
|
||||
net.c
|
||||
|
||||
conf_to_src_SOURCES = conf_to_src.c
|
||||
conf_to_src_LDADD=
|
||||
|
||||
@@ -8,6 +8,9 @@ create table t1 (a int not null, key(a)) engine=innodb;
|
||||
create table t2 (a int not null, key(a)) engine=innodb;
|
||||
create table t3 (a int) engine=innodb;
|
||||
create table t4 (a int) engine=innodb;
|
||||
show variables like 'slave_transaction_retries';
|
||||
Variable_name Value
|
||||
slave_transaction_retries 0
|
||||
show create table t1;
|
||||
Table Create Table
|
||||
t1 CREATE TABLE `t1` (
|
||||
@@ -20,6 +23,9 @@ t2 CREATE TABLE `t2` (
|
||||
`a` int(11) NOT NULL default '0',
|
||||
KEY `a` (`a`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=latin1
|
||||
show variables like 'slave_transaction_retries';
|
||||
Variable_name Value
|
||||
slave_transaction_retries 2
|
||||
stop slave;
|
||||
begin;
|
||||
insert into t3 select * from t2 for update;
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
# (Guilhem) have seen the test manage to provoke lock wait timeout
|
||||
# error but not deadlock error; that is ok as code deals with the two
|
||||
# errors in exactly the same way.
|
||||
# We don't run "show status like 'slave_retried_transactions'" because this
|
||||
# is not repeatable (depends on sleeps).
|
||||
|
||||
source include/have_innodb.inc;
|
||||
source include/master-slave.inc;
|
||||
@@ -16,10 +18,12 @@ create table t1 (a int not null, key(a)) engine=innodb;
|
||||
create table t2 (a int not null, key(a)) engine=innodb;
|
||||
create table t3 (a int) engine=innodb;
|
||||
create table t4 (a int) engine=innodb;
|
||||
show variables like 'slave_transaction_retries';
|
||||
sync_slave_with_master;
|
||||
|
||||
show create table t1;
|
||||
show create table t2;
|
||||
show variables like 'slave_transaction_retries';
|
||||
stop slave;
|
||||
|
||||
# 1) Test deadlock
|
||||
|
||||
@@ -3062,8 +3062,17 @@ we force server id to 2, but this MySQL server will not act as a slave.");
|
||||
#endif
|
||||
if (opt_bootstrap) /* If running with bootstrap, do not start replication. */
|
||||
opt_skip_slave_start= 1;
|
||||
/* init_slave() must be called after the thread keys are created */
|
||||
init_slave();
|
||||
/*
|
||||
init_slave() must be called after the thread keys are created.
|
||||
Some parts of the code (e.g. SHOW STATUS LIKE 'slave_running' and other
|
||||
places) assume that active_mi != 0, so let's fail if it's 0 (out of
|
||||
memory); a message has already been printed.
|
||||
*/
|
||||
if (init_slave() && !active_mi)
|
||||
{
|
||||
end_thr_alarm(1); // Don't allow alarms
|
||||
unireg_abort(1);
|
||||
}
|
||||
|
||||
if (opt_bootstrap)
|
||||
{
|
||||
@@ -5494,7 +5503,8 @@ struct show_var_st status_vars[]= {
|
||||
{"Select_range_check", (char*) &select_range_check_count, SHOW_LONG},
|
||||
{"Select_scan", (char*) &select_scan_count, SHOW_LONG},
|
||||
{"Slave_open_temp_tables", (char*) &slave_open_temp_tables, SHOW_LONG},
|
||||
{"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING},
|
||||
{"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING},
|
||||
{"Slave_retried_transactions",(char*) 0, SHOW_SLAVE_RETRIED_TRANS},
|
||||
{"Slow_launch_threads", (char*) &slow_launch_threads, SHOW_LONG},
|
||||
{"Slow_queries", (char*) &long_query_count, SHOW_LONG},
|
||||
{"Sort_merge_passes", (char*) &filesort_merge_passes, SHOW_LONG},
|
||||
|
||||
33
sql/slave.cc
33
sql/slave.cc
@@ -27,6 +27,7 @@
|
||||
#include <my_dir.h>
|
||||
#include <sql_common.h>
|
||||
|
||||
#define MAX_SLAVE_RETRY_PAUSE 5
|
||||
bool use_slave_mask = 0;
|
||||
MY_BITMAP slave_error_mask;
|
||||
|
||||
@@ -2335,7 +2336,7 @@ st_relay_log_info::st_relay_log_info()
|
||||
ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0),
|
||||
abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0),
|
||||
inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE),
|
||||
until_log_pos(0)
|
||||
until_log_pos(0), retried_trans(0)
|
||||
{
|
||||
group_relay_log_name[0]= event_relay_log_name[0]=
|
||||
group_master_log_name[0]= 0;
|
||||
@@ -2980,9 +2981,8 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
|
||||
init_master_info()).
|
||||
b) init_relay_log_pos(), because the BEGIN may be an older relay log.
|
||||
*/
|
||||
if (rli->trans_retries--)
|
||||
if (rli->trans_retries < slave_trans_retries)
|
||||
{
|
||||
sql_print_information("Slave SQL thread retries transaction");
|
||||
if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL))
|
||||
sql_print_error("Failed to initialize the master info structure");
|
||||
else if (init_relay_log_pos(rli,
|
||||
@@ -2994,8 +2994,16 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
|
||||
else
|
||||
{
|
||||
exec_res= 0;
|
||||
sleep(2); // chance for concurrent connection to get more locks
|
||||
}
|
||||
/* chance for concurrent connection to get more locks */
|
||||
safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
|
||||
(CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
|
||||
pthread_mutex_lock(&rli->data_lock); // because of SHOW STATUS
|
||||
rli->trans_retries++;
|
||||
rli->retried_trans++;
|
||||
pthread_mutex_unlock(&rli->data_lock);
|
||||
DBUG_PRINT("info", ("Slave retries transaction "
|
||||
"rli->trans_retries: %lu", rli->trans_retries));
|
||||
}
|
||||
}
|
||||
else
|
||||
sql_print_error("Slave SQL thread retried transaction %lu time(s) "
|
||||
@@ -3004,17 +3012,8 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
|
||||
slave_trans_retries);
|
||||
}
|
||||
if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
|
||||
{
|
||||
rli->trans_retries= slave_trans_retries; // restart from fresh
|
||||
/*
|
||||
TODO: when merged into 5.0, when slave does auto-rollback if
|
||||
corrupted binlog, this should reset the retry counter too
|
||||
(any rollback should). In fact it will work, as here we are just out
|
||||
of a Format_description_log_event::exec_event() which rolled back.
|
||||
But check repl code in 5.0 for new ha_rollback calls, just in case.
|
||||
*/
|
||||
}
|
||||
}
|
||||
rli->trans_retries= 0; // restart from fresh
|
||||
}
|
||||
return exec_res;
|
||||
}
|
||||
else
|
||||
@@ -3426,7 +3425,7 @@ slave_begin:
|
||||
pthread_mutex_lock(&rli->log_space_lock);
|
||||
rli->ignore_log_space_limit= 0;
|
||||
pthread_mutex_unlock(&rli->log_space_lock);
|
||||
rli->trans_retries= slave_trans_retries; // start from "no error"
|
||||
rli->trans_retries= 0; // start from "no error"
|
||||
|
||||
if (init_relay_log_pos(rli,
|
||||
rli->group_relay_log_name,
|
||||
|
||||
@@ -295,7 +295,14 @@ typedef struct st_relay_log_info
|
||||
UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1
|
||||
} until_log_names_cmp_result;
|
||||
|
||||
ulong trans_retries;
|
||||
/*
|
||||
trans_retries varies between 0 and slave_transaction_retries and counts how
|
||||
many times the slave has retried the present transaction; gets reset to 0
|
||||
when the transaction finally succeeds. retried_trans is a cumulative
|
||||
counter: how many times the slave has retried a transaction (any) since
|
||||
slave started.
|
||||
*/
|
||||
ulong trans_retries, retried_trans;
|
||||
|
||||
st_relay_log_info();
|
||||
~st_relay_log_info();
|
||||
|
||||
@@ -1887,6 +1887,19 @@ int mysqld_show(THD *thd, const char *wild, show_var_st *variables,
|
||||
pthread_mutex_unlock(&LOCK_active_mi);
|
||||
break;
|
||||
}
|
||||
case SHOW_SLAVE_RETRIED_TRANS:
|
||||
{
|
||||
/*
|
||||
TODO: in 5.1 with multimaster, have one such counter per line in SHOW
|
||||
SLAVE STATUS, and have the sum over all lines here.
|
||||
*/
|
||||
pthread_mutex_lock(&LOCK_active_mi);
|
||||
pthread_mutex_lock(&active_mi->rli.data_lock);
|
||||
end= int10_to_str(active_mi->rli.retried_trans, buff, 10);
|
||||
pthread_mutex_unlock(&active_mi->rli.data_lock);
|
||||
pthread_mutex_unlock(&LOCK_active_mi);
|
||||
break;
|
||||
}
|
||||
#endif /* HAVE_REPLICATION */
|
||||
case SHOW_OPENTABLES:
|
||||
end= int10_to_str((long) cached_tables(), buff, 10);
|
||||
|
||||
@@ -180,7 +180,7 @@ enum SHOW_TYPE
|
||||
SHOW_SSL_CTX_SESS_TIMEOUTS, SHOW_SSL_CTX_SESS_CACHE_FULL,
|
||||
SHOW_SSL_GET_CIPHER_LIST,
|
||||
#endif /* HAVE_OPENSSL */
|
||||
SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING,
|
||||
SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, SHOW_SLAVE_RETRIED_TRANS,
|
||||
SHOW_KEY_CACHE_LONG, SHOW_KEY_CACHE_CONST_LONG
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user