mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
MDEV-4506: Parallel replication.
MDEV-5189: Error handling in parallel replication.
Fix error handling in parallel worker threads when a query fails:
- Report the error to the error log.
- Return the error back, and set rli->abort_slave.
- Stop executing more events after the error.
This commit is contained in:
@@ -9,11 +9,6 @@
|
|||||||
|
|
||||||
ToDo list:
|
ToDo list:
|
||||||
|
|
||||||
- Error handling. If we fail in one of multiple parallel executions, we
|
|
||||||
need to make a best effort to complete prior transactions and roll back
|
|
||||||
following transactions, so slave binlog position will be correct.
|
|
||||||
And all the retry logic for temporary errors like deadlock.
|
|
||||||
|
|
||||||
- Retry of failed transactions is not yet implemented for the parallel case.
|
- Retry of failed transactions is not yet implemented for the parallel case.
|
||||||
|
|
||||||
- All the waits (eg. in struct wait_for_commit and in
|
- All the waits (eg. in struct wait_for_commit and in
|
||||||
@@ -212,7 +207,7 @@ handle_rpl_parallel_thread(void *arg)
|
|||||||
processing between the event groups as a simple way to ensure that
|
processing between the event groups as a simple way to ensure that
|
||||||
everything is stopped and cleaned up correctly.
|
everything is stopped and cleaned up correctly.
|
||||||
*/
|
*/
|
||||||
if (!sql_worker_killed(thd, rgi, in_event_group))
|
if (!rgi->is_error && !sql_worker_killed(thd, rgi, in_event_group))
|
||||||
err= rpt_handle_event(events, rpt);
|
err= rpt_handle_event(events, rpt);
|
||||||
else
|
else
|
||||||
err= thd->wait_for_prior_commit();
|
err= thd->wait_for_prior_commit();
|
||||||
@@ -228,6 +223,13 @@ handle_rpl_parallel_thread(void *arg)
|
|||||||
delete_or_keep_event_post_apply(rgi, event_type, events->ev);
|
delete_or_keep_event_post_apply(rgi, event_type, events->ev);
|
||||||
my_free(events);
|
my_free(events);
|
||||||
|
|
||||||
|
if (err)
|
||||||
|
{
|
||||||
|
rgi->is_error= true;
|
||||||
|
slave_output_error_info(rgi->rli, thd);
|
||||||
|
rgi->cleanup_context(thd, true);
|
||||||
|
rgi->rli->abort_slave= true;
|
||||||
|
}
|
||||||
if (end_of_group)
|
if (end_of_group)
|
||||||
{
|
{
|
||||||
in_event_group= false;
|
in_event_group= false;
|
||||||
@@ -785,6 +787,7 @@ rpl_parallel::do_event(rpl_group_info *serial_rgi, Log_event *ev,
|
|||||||
}
|
}
|
||||||
else if (!is_group_event || !current)
|
else if (!is_group_event || !current)
|
||||||
{
|
{
|
||||||
|
int err;
|
||||||
/*
|
/*
|
||||||
Events like ROTATE and FORMAT_DESCRIPTION. Do not run in worker thread.
|
Events like ROTATE and FORMAT_DESCRIPTION. Do not run in worker thread.
|
||||||
Same for events not preceeded by GTID (we should not see those normally,
|
Same for events not preceeded by GTID (we should not see those normally,
|
||||||
@@ -802,11 +805,11 @@ rpl_parallel::do_event(rpl_group_info *serial_rgi, Log_event *ev,
|
|||||||
rev->new_log_ident, rev->ident_len+1);
|
rev->new_log_ident, rev->ident_len+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
rpt_handle_event(qev, NULL);
|
err= rpt_handle_event(qev, NULL);
|
||||||
delete_or_keep_event_post_apply(serial_rgi, typ, qev->ev);
|
delete_or_keep_event_post_apply(serial_rgi, typ, qev->ev);
|
||||||
my_free(qev);
|
my_free(qev);
|
||||||
|
|
||||||
return false;
|
return (err != 0);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@@ -1274,12 +1274,10 @@ void Relay_log_info::stmt_done(my_off_t event_master_log_pos,
|
|||||||
DBA aware of the problem in the error log.
|
DBA aware of the problem in the error log.
|
||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
DBUG_EXECUTE_IF("inject_crash_before_flush_rli", DBUG_SUICIDE(););
|
||||||
if (mi->using_gtid == Master_info::USE_GTID_NO)
|
if (mi->using_gtid == Master_info::USE_GTID_NO)
|
||||||
{
|
|
||||||
DBUG_EXECUTE_IF("inject_crash_before_flush_rli", DBUG_SUICIDE(););
|
|
||||||
flush_relay_log_info(this);
|
flush_relay_log_info(this);
|
||||||
DBUG_EXECUTE_IF("inject_crash_after_flush_rli", DBUG_SUICIDE(););
|
DBUG_EXECUTE_IF("inject_crash_after_flush_rli", DBUG_SUICIDE(););
|
||||||
}
|
|
||||||
/*
|
/*
|
||||||
Note that Rotate_log_event::do_apply_event() does not call this
|
Note that Rotate_log_event::do_apply_event() does not call this
|
||||||
function, so there is no chance that a fake rotate event resets
|
function, so there is no chance that a fake rotate event resets
|
||||||
@@ -1453,7 +1451,7 @@ rpl_group_info::rpl_group_info(Relay_log_info *rli_)
|
|||||||
wait_commit_group_info(0), wait_start_sub_id(0), parallel_entry(0),
|
wait_commit_group_info(0), wait_start_sub_id(0), parallel_entry(0),
|
||||||
deferred_events(NULL), m_annotate_event(0), tables_to_lock(0),
|
deferred_events(NULL), m_annotate_event(0), tables_to_lock(0),
|
||||||
tables_to_lock_count(0), trans_retries(0), last_event_start_time(0),
|
tables_to_lock_count(0), trans_retries(0), last_event_start_time(0),
|
||||||
is_parallel_exec(false),
|
is_parallel_exec(false), is_error(false),
|
||||||
row_stmt_start_timestamp(0), long_find_row_note_printed(false)
|
row_stmt_start_timestamp(0), long_find_row_note_printed(false)
|
||||||
{
|
{
|
||||||
bzero(&current_gtid, sizeof(current_gtid));
|
bzero(&current_gtid, sizeof(current_gtid));
|
||||||
|
@@ -558,6 +558,7 @@ struct rpl_group_info
|
|||||||
*/
|
*/
|
||||||
char future_event_master_log_name[FN_REFLEN];
|
char future_event_master_log_name[FN_REFLEN];
|
||||||
bool is_parallel_exec;
|
bool is_parallel_exec;
|
||||||
|
bool is_error;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/*
|
/*
|
||||||
|
168
sql/slave.cc
168
sql/slave.cc
@@ -4078,6 +4078,92 @@ end:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
slave_output_error_info(Relay_log_info *rli, THD *thd)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
retrieve as much info as possible from the thd and, error
|
||||||
|
codes and warnings and print this to the error log as to
|
||||||
|
allow the user to locate the error
|
||||||
|
*/
|
||||||
|
uint32 const last_errno= rli->last_error().number;
|
||||||
|
char llbuff[22];
|
||||||
|
|
||||||
|
if (thd->is_error())
|
||||||
|
{
|
||||||
|
char const *const errmsg= thd->stmt_da->message();
|
||||||
|
|
||||||
|
DBUG_PRINT("info",
|
||||||
|
("thd->stmt_da->sql_errno()=%d; rli->last_error.number=%d",
|
||||||
|
thd->stmt_da->sql_errno(), last_errno));
|
||||||
|
if (last_errno == 0)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
This function is reporting an error which was not reported
|
||||||
|
while executing exec_relay_log_event().
|
||||||
|
*/
|
||||||
|
rli->report(ERROR_LEVEL, thd->stmt_da->sql_errno(), "%s", errmsg);
|
||||||
|
}
|
||||||
|
else if (last_errno != thd->stmt_da->sql_errno())
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* An error was reported while executing exec_relay_log_event()
|
||||||
|
* however the error code differs from what is in the thread.
|
||||||
|
* This function prints out more information to help finding
|
||||||
|
* what caused the problem.
|
||||||
|
*/
|
||||||
|
sql_print_error("Slave (additional info): %s Error_code: %d",
|
||||||
|
errmsg, thd->stmt_da->sql_errno());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Print any warnings issued */
|
||||||
|
List_iterator_fast<MYSQL_ERROR> it(thd->warning_info->warn_list());
|
||||||
|
MYSQL_ERROR *err;
|
||||||
|
/*
|
||||||
|
Added controlled slave thread cancel for replication
|
||||||
|
of user-defined variables.
|
||||||
|
*/
|
||||||
|
bool udf_error = false;
|
||||||
|
while ((err= it++))
|
||||||
|
{
|
||||||
|
if (err->get_sql_errno() == ER_CANT_OPEN_LIBRARY)
|
||||||
|
udf_error = true;
|
||||||
|
sql_print_warning("Slave: %s Error_code: %d", err->get_message_text(), err->get_sql_errno());
|
||||||
|
}
|
||||||
|
if (udf_error)
|
||||||
|
{
|
||||||
|
String tmp;
|
||||||
|
if (rli->mi->using_gtid != Master_info::USE_GTID_NO)
|
||||||
|
{
|
||||||
|
tmp.append(STRING_WITH_LEN("; GTID position '"));
|
||||||
|
rpl_append_gtid_state(&tmp, false);
|
||||||
|
tmp.append(STRING_WITH_LEN("'"));
|
||||||
|
}
|
||||||
|
sql_print_error("Error loading user-defined library, slave SQL "
|
||||||
|
"thread aborted. Install the missing library, and restart the "
|
||||||
|
"slave SQL thread with \"SLAVE START\". We stopped at log '%s' "
|
||||||
|
"position %s%s", RPL_LOG_NAME, llstr(rli->group_master_log_pos,
|
||||||
|
llbuff), tmp.c_ptr_safe());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
String tmp;
|
||||||
|
if (rli->mi->using_gtid != Master_info::USE_GTID_NO)
|
||||||
|
{
|
||||||
|
tmp.append(STRING_WITH_LEN("; GTID position '"));
|
||||||
|
rpl_append_gtid_state(&tmp, false);
|
||||||
|
tmp.append(STRING_WITH_LEN("'"));
|
||||||
|
}
|
||||||
|
sql_print_error("\
|
||||||
|
Error running query, slave SQL thread aborted. Fix the problem, and restart \
|
||||||
|
the slave SQL thread with \"SLAVE START\". We stopped at log \
|
||||||
|
'%s' position %s%s", RPL_LOG_NAME, llstr(rli->group_master_log_pos, llbuff),
|
||||||
|
tmp.c_ptr_safe());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Slave SQL thread entry point.
|
Slave SQL thread entry point.
|
||||||
|
|
||||||
@@ -4335,87 +4421,7 @@ log '%s' at position %s, relay log '%s' position: %s%s", RPL_LOG_NAME,
|
|||||||
DBUG_PRINT("info", ("exec_relay_log_event() failed"));
|
DBUG_PRINT("info", ("exec_relay_log_event() failed"));
|
||||||
// do not scare the user if SQL thread was simply killed or stopped
|
// do not scare the user if SQL thread was simply killed or stopped
|
||||||
if (!sql_slave_killed(serial_rgi))
|
if (!sql_slave_killed(serial_rgi))
|
||||||
{
|
slave_output_error_info(rli, thd);
|
||||||
/*
|
|
||||||
retrieve as much info as possible from the thd and, error
|
|
||||||
codes and warnings and print this to the error log as to
|
|
||||||
allow the user to locate the error
|
|
||||||
*/
|
|
||||||
uint32 const last_errno= rli->last_error().number;
|
|
||||||
|
|
||||||
if (thd->is_error())
|
|
||||||
{
|
|
||||||
char const *const errmsg= thd->stmt_da->message();
|
|
||||||
|
|
||||||
DBUG_PRINT("info",
|
|
||||||
("thd->stmt_da->sql_errno()=%d; rli->last_error.number=%d",
|
|
||||||
thd->stmt_da->sql_errno(), last_errno));
|
|
||||||
if (last_errno == 0)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
This function is reporting an error which was not reported
|
|
||||||
while executing exec_relay_log_event().
|
|
||||||
*/
|
|
||||||
rli->report(ERROR_LEVEL, thd->stmt_da->sql_errno(), "%s", errmsg);
|
|
||||||
}
|
|
||||||
else if (last_errno != thd->stmt_da->sql_errno())
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* An error was reported while executing exec_relay_log_event()
|
|
||||||
* however the error code differs from what is in the thread.
|
|
||||||
* This function prints out more information to help finding
|
|
||||||
* what caused the problem.
|
|
||||||
*/
|
|
||||||
sql_print_error("Slave (additional info): %s Error_code: %d",
|
|
||||||
errmsg, thd->stmt_da->sql_errno());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Print any warnings issued */
|
|
||||||
List_iterator_fast<MYSQL_ERROR> it(thd->warning_info->warn_list());
|
|
||||||
MYSQL_ERROR *err;
|
|
||||||
/*
|
|
||||||
Added controlled slave thread cancel for replication
|
|
||||||
of user-defined variables.
|
|
||||||
*/
|
|
||||||
bool udf_error = false;
|
|
||||||
while ((err= it++))
|
|
||||||
{
|
|
||||||
if (err->get_sql_errno() == ER_CANT_OPEN_LIBRARY)
|
|
||||||
udf_error = true;
|
|
||||||
sql_print_warning("Slave: %s Error_code: %d", err->get_message_text(), err->get_sql_errno());
|
|
||||||
}
|
|
||||||
if (udf_error)
|
|
||||||
{
|
|
||||||
String tmp;
|
|
||||||
if (mi->using_gtid != Master_info::USE_GTID_NO)
|
|
||||||
{
|
|
||||||
tmp.append(STRING_WITH_LEN("; GTID position '"));
|
|
||||||
rpl_append_gtid_state(&tmp, false);
|
|
||||||
tmp.append(STRING_WITH_LEN("'"));
|
|
||||||
}
|
|
||||||
sql_print_error("Error loading user-defined library, slave SQL "
|
|
||||||
"thread aborted. Install the missing library, and restart the "
|
|
||||||
"slave SQL thread with \"SLAVE START\". We stopped at log '%s' "
|
|
||||||
"position %s%s", RPL_LOG_NAME, llstr(rli->group_master_log_pos,
|
|
||||||
llbuff), tmp.c_ptr_safe());
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
String tmp;
|
|
||||||
if (mi->using_gtid != Master_info::USE_GTID_NO)
|
|
||||||
{
|
|
||||||
tmp.append(STRING_WITH_LEN("; GTID position '"));
|
|
||||||
rpl_append_gtid_state(&tmp, false);
|
|
||||||
tmp.append(STRING_WITH_LEN("'"));
|
|
||||||
}
|
|
||||||
sql_print_error("\
|
|
||||||
Error running query, slave SQL thread aborted. Fix the problem, and restart \
|
|
||||||
the slave SQL thread with \"SLAVE START\". We stopped at log \
|
|
||||||
'%s' position %s%s", RPL_LOG_NAME, llstr(rli->group_master_log_pos, llbuff),
|
|
||||||
tmp.c_ptr_safe());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
goto err;
|
goto err;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -233,6 +233,7 @@ int apply_event_and_update_pos(Log_event* ev, THD* thd,
|
|||||||
rpl_parallel_thread *rpt);
|
rpl_parallel_thread *rpt);
|
||||||
|
|
||||||
pthread_handler_t handle_slave_io(void *arg);
|
pthread_handler_t handle_slave_io(void *arg);
|
||||||
|
void slave_output_error_info(Relay_log_info *rli, THD *thd);
|
||||||
pthread_handler_t handle_slave_sql(void *arg);
|
pthread_handler_t handle_slave_sql(void *arg);
|
||||||
bool net_request_file(NET* net, const char* fname);
|
bool net_request_file(NET* net, const char* fname);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user