mirror of
https://github.com/MariaDB/server.git
synced 2025-08-01 03:47:19 +03:00
MWL#116: Efficient group commit for binary log
Preliminary commit for testing
This commit is contained in:
63
mysql-test/r/group_commit.result
Normal file
63
mysql-test/r/group_commit.result
Normal file
@ -0,0 +1,63 @@
|
||||
CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
|
||||
SELECT variable_value INTO @commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_commits';
|
||||
SELECT variable_value INTO @group_commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_group_commits';
|
||||
SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued";
|
||||
INSERT INTO t1 VALUES ("con1");
|
||||
set DEBUG_SYNC= "now WAIT_FOR group1_running";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2";
|
||||
SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed";
|
||||
SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
|
||||
INSERT INTO t1 VALUES ("con2");
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
|
||||
INSERT INTO t1 VALUES ("con3");
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4";
|
||||
INSERT INTO t1 VALUES ("con4");
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
|
||||
SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
a
|
||||
SET DEBUG_SYNC= "now SIGNAL group2_queued";
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
a
|
||||
con1
|
||||
SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
|
||||
SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued";
|
||||
INSERT INTO t1 VALUES ("con5");
|
||||
SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued";
|
||||
INSERT INTO t1 VALUES ("con6");
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
a
|
||||
con1
|
||||
SET DEBUG_SYNC= "now SIGNAL group3_committed";
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
a
|
||||
con1
|
||||
con2
|
||||
con3
|
||||
con4
|
||||
SET DEBUG_SYNC= "now SIGNAL group2_checked";
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
a
|
||||
con1
|
||||
con2
|
||||
con3
|
||||
con4
|
||||
con5
|
||||
con6
|
||||
SELECT variable_value - @commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_commits';
|
||||
variable_value - @commits
|
||||
6
|
||||
SELECT variable_value - @group_commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_group_commits';
|
||||
variable_value - @group_commits
|
||||
3
|
||||
SET DEBUG_SYNC= 'RESET';
|
||||
DROP TABLE t1;
|
28
mysql-test/suite/binlog/r/binlog_ioerr.result
Normal file
28
mysql-test/suite/binlog/r/binlog_ioerr.result
Normal file
@ -0,0 +1,28 @@
|
||||
CALL mtr.add_suppression("Error writing file 'master-bin'");
|
||||
RESET MASTER;
|
||||
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
|
||||
INSERT INTO t1 VALUES(0);
|
||||
SET SESSION debug='+d,fail_binlog_write_1';
|
||||
INSERT INTO t1 VALUES(1);
|
||||
ERROR HY000: Error writing file 'master-bin' (errno: 22)
|
||||
INSERT INTO t1 VALUES(2);
|
||||
ERROR HY000: Error writing file 'master-bin' (errno: 22)
|
||||
SET SESSION debug='';
|
||||
INSERT INTO t1 VALUES(3);
|
||||
SELECT * FROM t1;
|
||||
a
|
||||
0
|
||||
3
|
||||
SHOW BINLOG EVENTS;
|
||||
Log_name Pos Event_type Server_id End_log_pos Info
|
||||
BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: #
|
||||
BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
|
||||
BINLOG POS Query 1 ENDPOS BEGIN
|
||||
BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0)
|
||||
BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
|
||||
BINLOG POS Query 1 ENDPOS BEGIN
|
||||
BINLOG POS Query 1 ENDPOS BEGIN
|
||||
BINLOG POS Query 1 ENDPOS BEGIN
|
||||
BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3)
|
||||
BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
|
||||
DROP TABLE t1;
|
29
mysql-test/suite/binlog/t/binlog_ioerr.test
Normal file
29
mysql-test/suite/binlog/t/binlog_ioerr.test
Normal file
@ -0,0 +1,29 @@
|
||||
source include/have_debug.inc;
|
||||
source include/have_innodb.inc;
|
||||
source include/have_log_bin.inc;
|
||||
source include/have_binlog_format_mixed_or_statement.inc;
|
||||
|
||||
CALL mtr.add_suppression("Error writing file 'master-bin'");
|
||||
|
||||
RESET MASTER;
|
||||
|
||||
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
|
||||
INSERT INTO t1 VALUES(0);
|
||||
SET SESSION debug='+d,fail_binlog_write_1';
|
||||
--error ER_ERROR_ON_WRITE
|
||||
INSERT INTO t1 VALUES(1);
|
||||
--error ER_ERROR_ON_WRITE
|
||||
INSERT INTO t1 VALUES(2);
|
||||
SET SESSION debug='';
|
||||
INSERT INTO t1 VALUES(3);
|
||||
SELECT * FROM t1;
|
||||
|
||||
# Actually the output from this currently shows a bug.
|
||||
# The injected IO error leaves partially written transactions in the binlog in
|
||||
# the form of stray "BEGIN" events.
|
||||
# These should disappear from the output if binlog error handling is improved.
|
||||
--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
|
||||
--replace_column 1 BINLOG 2 POS 5 ENDPOS
|
||||
SHOW BINLOG EVENTS;
|
||||
|
||||
DROP TABLE t1;
|
115
mysql-test/t/group_commit.test
Normal file
115
mysql-test/t/group_commit.test
Normal file
@ -0,0 +1,115 @@
|
||||
--source include/have_debug_sync.inc
|
||||
--source include/have_innodb.inc
|
||||
--source include/have_log_bin.inc
|
||||
|
||||
# Test some group commit code paths by using debug_sync to do controlled
|
||||
# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
|
||||
# group.
|
||||
#
|
||||
# Group 3 is allowed to race as far as possible ahead before group 2 finishes
|
||||
# to check some edge case for concurrency control.
|
||||
|
||||
CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
|
||||
|
||||
SELECT variable_value INTO @commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_commits';
|
||||
SELECT variable_value INTO @group_commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_group_commits';
|
||||
|
||||
connect(con1,localhost,root,,);
|
||||
connect(con2,localhost,root,,);
|
||||
connect(con3,localhost,root,,);
|
||||
connect(con4,localhost,root,,);
|
||||
connect(con5,localhost,root,,);
|
||||
connect(con6,localhost,root,,);
|
||||
|
||||
# Start group1 (with one thread) doing commit, waiting for
|
||||
# group2 to queue up before finishing.
|
||||
|
||||
connection con1;
|
||||
SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued";
|
||||
send INSERT INTO t1 VALUES ("con1");
|
||||
|
||||
# Make group2 (with three threads) queue up.
|
||||
# Make sure con2 is the group commit leader for group2.
|
||||
# Make group2 wait with running commit_ordered() until group3 has committed.
|
||||
|
||||
connection con2;
|
||||
set DEBUG_SYNC= "now WAIT_FOR group1_running";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2";
|
||||
SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed";
|
||||
SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
|
||||
send INSERT INTO t1 VALUES ("con2");
|
||||
connection con3;
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
|
||||
send INSERT INTO t1 VALUES ("con3");
|
||||
connection con4;
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4";
|
||||
send INSERT INTO t1 VALUES ("con4");
|
||||
|
||||
# When group2 is queued, let group1 continue and queue group3.
|
||||
|
||||
connection default;
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
|
||||
|
||||
# At this point, transaction 1 is still not visible as commit_ordered() has not
|
||||
# been called yet.
|
||||
SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
|
||||
SET DEBUG_SYNC= "now SIGNAL group2_queued";
|
||||
connection con1;
|
||||
reap;
|
||||
|
||||
# Now transaction 1 is visible.
|
||||
connection default;
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
|
||||
connection con5;
|
||||
SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
|
||||
SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued";
|
||||
send INSERT INTO t1 VALUES ("con5");
|
||||
|
||||
connection con6;
|
||||
SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
|
||||
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued";
|
||||
send INSERT INTO t1 VALUES ("con6");
|
||||
|
||||
connection default;
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
|
||||
# Still only transaction 1 visible, as group2 has not yet run commit_ordered().
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
SET DEBUG_SYNC= "now SIGNAL group3_committed";
|
||||
SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
|
||||
# Now transactions 1-4 visible.
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
SET DEBUG_SYNC= "now SIGNAL group2_checked";
|
||||
|
||||
connection con2;
|
||||
reap;
|
||||
|
||||
connection con3;
|
||||
reap;
|
||||
|
||||
connection con4;
|
||||
reap;
|
||||
|
||||
connection con5;
|
||||
reap;
|
||||
|
||||
connection con6;
|
||||
reap;
|
||||
|
||||
connection default;
|
||||
# Check all transactions finally visible.
|
||||
SELECT * FROM t1 ORDER BY a;
|
||||
|
||||
SELECT variable_value - @commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_commits';
|
||||
SELECT variable_value - @group_commits FROM information_schema.global_status
|
||||
WHERE variable_name = 'binlog_group_commits';
|
||||
|
||||
SET DEBUG_SYNC= 'RESET';
|
||||
DROP TABLE t1;
|
224
sql/handler.cc
224
sql/handler.cc
@ -76,6 +76,8 @@ TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
|
||||
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
|
||||
uint known_extensions_id= 0;
|
||||
|
||||
static int commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans,
|
||||
bool is_real_trans);
|
||||
|
||||
|
||||
static plugin_ref ha_default_plugin(THD *thd)
|
||||
@ -1070,7 +1072,7 @@ ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
|
||||
*/
|
||||
int ha_commit_trans(THD *thd, bool all)
|
||||
{
|
||||
int error= 0, cookie= 0;
|
||||
int error= 0, cookie;
|
||||
/*
|
||||
'all' means that this is either an explicit commit issued by
|
||||
user, or an implicit commit issued by a DDL.
|
||||
@ -1085,7 +1087,8 @@ int ha_commit_trans(THD *thd, bool all)
|
||||
*/
|
||||
bool is_real_trans= all || thd->transaction.all.ha_list == 0;
|
||||
Ha_trx_info *ha_info= trans->ha_list;
|
||||
my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
|
||||
bool need_prepare_ordered, need_commit_ordered;
|
||||
my_xid xid;
|
||||
DBUG_ENTER("ha_commit_trans");
|
||||
|
||||
/*
|
||||
@ -1118,85 +1121,112 @@ int ha_commit_trans(THD *thd, bool all)
|
||||
DBUG_RETURN(2);
|
||||
}
|
||||
#ifdef USING_TRANSACTIONS
|
||||
if (ha_info)
|
||||
if (!ha_info)
|
||||
{
|
||||
uint rw_ha_count;
|
||||
bool rw_trans;
|
||||
|
||||
DBUG_EXECUTE_IF("crash_commit_before", abort(););
|
||||
|
||||
/* Close all cursors that can not survive COMMIT */
|
||||
if (is_real_trans) /* not a statement commit */
|
||||
thd->stmt_map.close_transient_cursors();
|
||||
|
||||
rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
|
||||
/* rw_trans is TRUE when we are in a transaction changing data */
|
||||
rw_trans= is_real_trans && (rw_ha_count > 0);
|
||||
|
||||
if (rw_trans &&
|
||||
wait_if_global_read_lock(thd, 0, 0))
|
||||
{
|
||||
ha_rollback_trans(thd, all);
|
||||
DBUG_RETURN(1);
|
||||
}
|
||||
|
||||
if (rw_trans &&
|
||||
opt_readonly &&
|
||||
!(thd->security_ctx->master_access & SUPER_ACL) &&
|
||||
!thd->slave_thread)
|
||||
{
|
||||
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
|
||||
ha_rollback_trans(thd, all);
|
||||
error= 1;
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (!trans->no_2pc && (rw_ha_count > 1))
|
||||
{
|
||||
for (; ha_info && !error; ha_info= ha_info->next())
|
||||
{
|
||||
int err;
|
||||
handlerton *ht= ha_info->ht();
|
||||
/*
|
||||
Do not call two-phase commit if this particular
|
||||
transaction is read-only. This allows for simpler
|
||||
implementation in engines that are always read-only.
|
||||
*/
|
||||
if (! ha_info->is_trx_read_write())
|
||||
continue;
|
||||
/*
|
||||
Sic: we know that prepare() is not NULL since otherwise
|
||||
trans->no_2pc would have been set.
|
||||
*/
|
||||
if ((err= ht->prepare(ht, thd, all)))
|
||||
{
|
||||
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
|
||||
error= 1;
|
||||
}
|
||||
status_var_increment(thd->status_var.ha_prepare_count);
|
||||
}
|
||||
DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
|
||||
if (error || (is_real_trans && xid &&
|
||||
(error= !(cookie= tc_log->log_xid(thd, xid)))))
|
||||
{
|
||||
ha_rollback_trans(thd, all);
|
||||
error= 1;
|
||||
goto end;
|
||||
}
|
||||
DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
|
||||
}
|
||||
error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
|
||||
DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
|
||||
if (cookie)
|
||||
tc_log->unlog(cookie, xid);
|
||||
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
|
||||
end:
|
||||
if (rw_trans)
|
||||
start_waiting_global_read_lock(thd);
|
||||
/* Free resources and perform other cleanup even for 'empty' transactions. */
|
||||
if (is_real_trans)
|
||||
thd->transaction.cleanup();
|
||||
DBUG_RETURN(0);
|
||||
}
|
||||
/* Free resources and perform other cleanup even for 'empty' transactions. */
|
||||
else if (is_real_trans)
|
||||
thd->transaction.cleanup();
|
||||
|
||||
DBUG_EXECUTE_IF("crash_commit_before", abort(););
|
||||
|
||||
/* Close all cursors that can not survive COMMIT */
|
||||
if (is_real_trans) /* not a statement commit */
|
||||
thd->stmt_map.close_transient_cursors();
|
||||
|
||||
uint rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
|
||||
/* rw_trans is TRUE when we are in a transaction changing data */
|
||||
bool rw_trans= is_real_trans && (rw_ha_count > 0);
|
||||
|
||||
if (rw_trans &&
|
||||
wait_if_global_read_lock(thd, 0, 0))
|
||||
{
|
||||
ha_rollback_trans(thd, all);
|
||||
DBUG_RETURN(1);
|
||||
}
|
||||
|
||||
if (rw_trans &&
|
||||
opt_readonly &&
|
||||
!(thd->security_ctx->master_access & SUPER_ACL) &&
|
||||
!thd->slave_thread)
|
||||
{
|
||||
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (trans->no_2pc || (rw_ha_count <= 1))
|
||||
{
|
||||
error= ha_commit_one_phase(thd, all);
|
||||
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
|
||||
goto end;
|
||||
}
|
||||
|
||||
need_prepare_ordered= FALSE;
|
||||
need_commit_ordered= FALSE;
|
||||
xid= thd->transaction.xid_state.xid.get_my_xid();
|
||||
|
||||
for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
|
||||
{
|
||||
int err;
|
||||
handlerton *ht= hi->ht();
|
||||
/*
|
||||
Do not call two-phase commit if this particular
|
||||
transaction is read-only. This allows for simpler
|
||||
implementation in engines that are always read-only.
|
||||
*/
|
||||
if (! hi->is_trx_read_write())
|
||||
continue;
|
||||
/*
|
||||
Sic: we know that prepare() is not NULL since otherwise
|
||||
trans->no_2pc would have been set.
|
||||
*/
|
||||
if ((err= ht->prepare(ht, thd, all)))
|
||||
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
|
||||
status_var_increment(thd->status_var.ha_prepare_count);
|
||||
|
||||
if (err)
|
||||
goto err;
|
||||
|
||||
if (ht->prepare_ordered)
|
||||
need_prepare_ordered= TRUE;
|
||||
if (ht->commit_ordered)
|
||||
need_commit_ordered= TRUE;
|
||||
}
|
||||
DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
|
||||
|
||||
if (!is_real_trans)
|
||||
{
|
||||
error= commit_one_phase_2(thd, all, trans, is_real_trans);
|
||||
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
|
||||
goto end;
|
||||
}
|
||||
|
||||
cookie= tc_log->log_and_order(thd, xid, all, need_prepare_ordered,
|
||||
need_commit_ordered);
|
||||
if (!cookie)
|
||||
goto err;
|
||||
|
||||
DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
|
||||
|
||||
error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0;
|
||||
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
|
||||
|
||||
DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
|
||||
tc_log->unlog(cookie, xid);
|
||||
|
||||
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
|
||||
goto end;
|
||||
|
||||
/* Come here if error and we need to rollback. */
|
||||
err:
|
||||
if (!error)
|
||||
error= 1;
|
||||
ha_rollback_trans(thd, all);
|
||||
|
||||
end:
|
||||
if (rw_trans)
|
||||
start_waiting_global_read_lock(thd);
|
||||
#endif /* USING_TRANSACTIONS */
|
||||
DBUG_RETURN(error);
|
||||
}
|
||||
@ -1207,7 +1237,6 @@ end:
|
||||
*/
|
||||
int ha_commit_one_phase(THD *thd, bool all)
|
||||
{
|
||||
int error=0;
|
||||
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
|
||||
/*
|
||||
"real" is a nick name for a transaction for which a commit will
|
||||
@ -1217,8 +1246,41 @@ int ha_commit_one_phase(THD *thd, bool all)
|
||||
enclosing 'all' transaction is rolled back.
|
||||
*/
|
||||
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
|
||||
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
|
||||
Ha_trx_info *ha_info= trans->ha_list;
|
||||
DBUG_ENTER("ha_commit_one_phase");
|
||||
#ifdef USING_TRANSACTIONS
|
||||
if (ha_info)
|
||||
{
|
||||
if (is_real_trans)
|
||||
{
|
||||
bool locked= false;
|
||||
for (; ha_info; ha_info= ha_info->next())
|
||||
{
|
||||
handlerton *ht= ha_info->ht();
|
||||
if (ht->commit_ordered)
|
||||
{
|
||||
if (ha_info->is_trx_read_write() && !locked)
|
||||
{
|
||||
pthread_mutex_lock(&LOCK_commit_ordered);
|
||||
locked= 1;
|
||||
}
|
||||
ht->commit_ordered(ht, thd, all);
|
||||
}
|
||||
}
|
||||
if (locked)
|
||||
pthread_mutex_unlock(&LOCK_commit_ordered);
|
||||
}
|
||||
}
|
||||
#endif /* USING_TRANSACTIONS */
|
||||
DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans));
|
||||
}
|
||||
|
||||
static int
|
||||
commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
|
||||
{
|
||||
int error= 0;
|
||||
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
|
||||
DBUG_ENTER("commit_one_phase_2");
|
||||
#ifdef USING_TRANSACTIONS
|
||||
if (ha_info)
|
||||
{
|
||||
|
@ -656,9 +656,96 @@ struct handlerton
|
||||
NOTE 'all' is also false in auto-commit mode where 'end of statement'
|
||||
and 'real commit' mean the same event.
|
||||
*/
|
||||
int (*commit)(handlerton *hton, THD *thd, bool all);
|
||||
int (*commit)(handlerton *hton, THD *thd, bool all);
|
||||
/*
|
||||
The commit_ordered() method is called prior to the commit() method, after
|
||||
the transaction manager has decided to commit (not rollback) the
|
||||
transaction. Unlike commit(), commit_ordered() is called only when the
|
||||
full transaction is committed, not for each commit of statement
|
||||
transaction in a multi-statement transaction.
|
||||
|
||||
The calls to commit_ordered() in multiple parallel transactions are
|
||||
guaranteed to happen in the same order in every participating
|
||||
handler. This can be used to ensure the same commit order among multiple
|
||||
handlers (eg. in table handler and binlog). So if transaction T1 calls
|
||||
into commit_ordered() of handler A before T2, then T1 will also call
|
||||
commit_ordered() of handler B before T2.
|
||||
|
||||
Engines that implement this method should during this call make the
|
||||
transaction visible to other transactions, thereby making the order of
|
||||
transaction commits be defined by the order of commit_ordered() calls.
|
||||
|
||||
The intention is that commit_ordered() should do the minimal amount of
|
||||
work that needs to happen in consistent commit order among handlers. To
|
||||
preserve ordering, calls need to be serialised on a global mutex, so
|
||||
doing any time-consuming or blocking operations in commit_ordered() will
|
||||
limit scalability.
|
||||
|
||||
Handlers can rely on commit_ordered() calls for transactions that updated
|
||||
data to be serialised (no two calls can run in parallel, so no extra
|
||||
locking on the handler part is required to ensure this). However, calls
|
||||
for SELECT-only transactions are not serialised, so can occur in parallel
|
||||
with each other and with at most one write-transaction.
|
||||
|
||||
Note that commit_ordered() can be called from a different thread than the
|
||||
one handling the transaction! So it can not do anything that depends on
|
||||
thread local storage, in particular it can not call my_error() and
|
||||
friends (instead it can store the error code and delay the call of
|
||||
my_error() to the commit() method).
|
||||
|
||||
Similarly, since commit_ordered() returns void, any return error code
|
||||
must be saved and returned from the commit() method instead.
|
||||
|
||||
The commit_ordered method is optional, and can be left unset if not
|
||||
needed in a particular handler.
|
||||
*/
|
||||
void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
|
||||
int (*rollback)(handlerton *hton, THD *thd, bool all);
|
||||
int (*prepare)(handlerton *hton, THD *thd, bool all);
|
||||
/*
|
||||
The prepare_ordered method is optional. If set, it will be called after
|
||||
successful prepare() in all handlers participating in 2-phase
|
||||
commit. Like commit_ordered(), it is called only when the full
|
||||
transaction is committed, not for each commit of statement transaction.
|
||||
|
||||
The calls to prepare_ordered() among multiple parallel transactions are
|
||||
ordered consistently with calls to commit_ordered(). This means that
|
||||
calls to prepare_ordered() effectively define the commit order, and that
|
||||
each handler will see the same sequence of transactions calling into
|
||||
prepare_ordered() and commit_ordered().
|
||||
|
||||
Thus, prepare_ordered() can be used to define commit order for handlers
|
||||
that need to do this in the prepare step (like binlog). It can also be
|
||||
used to release transaction's locks early in an order consistent with the
|
||||
order transactions will be eventually committed.
|
||||
|
||||
Like commit_ordered(), prepare_ordered() calls are serialised to maintain
|
||||
ordering, so the intention is that they should execute fast, with only
|
||||
the minimal amount of work needed to define commit order. Handlers can
|
||||
rely on this serialisation, and do not need to do any extra locking to
|
||||
avoid two prepare_ordered() calls running in parallel.
|
||||
|
||||
Like commit_ordered(), prepare_ordered() is not guaranteed to be called
|
||||
in the context of the thread handling the rest of the transaction. So it
|
||||
cannot invoke code that relies on thread local storage, in particular it
|
||||
cannot call my_error().
|
||||
|
||||
When prepare_ordered() is called, the transaction coordinator has already
|
||||
decided to commit (not rollback) the transaction. So prepare_ordered()
|
||||
cannot cause a rollback by returning an error, all possible errors must
|
||||
be handled in prepare() (the prepare_ordered() method returns void). In
|
||||
case of some fatal error, a record of the error must be made internally
|
||||
by the engine and returned from commit() later.
|
||||
|
||||
Note that for user-level XA SQL commands, no consistent ordering among
|
||||
prepare_ordered() and commit_ordered() is guaranteed (as that would
|
||||
require blocking all other commits for an indefinite time).
|
||||
|
||||
When 2-phase commit is not used (eg. only one engine (and no binlog) in
|
||||
transaction), prepare() is not called and in such cases prepare_ordered()
|
||||
also is not called.
|
||||
*/
|
||||
void (*prepare_ordered)(handlerton *hton, THD *thd, bool all);
|
||||
int (*recover)(handlerton *hton, XID *xid_list, uint len);
|
||||
int (*commit_by_xid)(handlerton *hton, XID *xid);
|
||||
int (*rollback_by_xid)(handlerton *hton, XID *xid);
|
||||
|
1292
sql/log.cc
1292
sql/log.cc
File diff suppressed because it is too large
Load Diff
209
sql/log.h
209
sql/log.h
@ -33,11 +33,173 @@ class TC_LOG
|
||||
|
||||
virtual int open(const char *opt_name)=0;
|
||||
virtual void close()=0;
|
||||
virtual int log_xid(THD *thd, my_xid xid)=0;
|
||||
virtual int log_and_order(THD *thd, my_xid xid, bool all,
|
||||
bool need_prepare_ordered,
|
||||
bool need_commit_ordered) = 0;
|
||||
virtual void unlog(ulong cookie, my_xid xid)=0;
|
||||
|
||||
protected:
|
||||
/*
|
||||
These methods are meant to be invoked from log_and_order() implementations
|
||||
to run any prepare_ordered() respectively commit_ordered() methods in
|
||||
participating handlers.
|
||||
|
||||
They must be called using suitable thread synchronisation to ensure that
|
||||
they are each called in the correct commit order among all
|
||||
transactions. However, it is only necessary to call them if the
|
||||
corresponding flag passed to log_and_order is set (it is safe, but not
|
||||
required, to call them when the flag is false).
|
||||
|
||||
The caller must be holding LOCK_prepare_ordered respectively
|
||||
LOCK_commit_ordered when calling these methods.
|
||||
*/
|
||||
void run_prepare_ordered(THD *thd, bool all);
|
||||
void run_commit_ordered(THD *thd, bool all);
|
||||
};
|
||||
|
||||
class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
|
||||
/*
|
||||
Locks used to ensure serialised execution of TC_LOG::run_prepare_ordered()
|
||||
and TC_LOG::run_commit_ordered(), or any other code that calls handler
|
||||
prepare_ordered() or commit_ordered() methods.
|
||||
*/
|
||||
extern pthread_mutex_t LOCK_prepare_ordered;
|
||||
extern pthread_mutex_t LOCK_commit_ordered;
|
||||
|
||||
extern void TC_init();
|
||||
extern void TC_destroy();
|
||||
|
||||
/*
|
||||
Base class for two TC implementations TC_LOG_unordered and
|
||||
TC_LOG_group_commit that both use a queue of threads waiting for group
|
||||
commit.
|
||||
*/
|
||||
class TC_LOG_queued: public TC_LOG
|
||||
{
|
||||
protected:
|
||||
TC_LOG_queued();
|
||||
~TC_LOG_queued();
|
||||
|
||||
/* Structure used to link list of THDs waiting for group commit. */
|
||||
struct TC_group_commit_entry
|
||||
{
|
||||
struct TC_group_commit_entry *next;
|
||||
THD *thd;
|
||||
/* This is the `all' parameter for ha_commit_trans() etc. */
|
||||
bool all;
|
||||
/*
|
||||
Flag set true when it is time for this thread to wake up after group
|
||||
commit. Used with THD::LOCK_commit_ordered and THD::COND_commit_ordered.
|
||||
*/
|
||||
bool group_commit_ready;
|
||||
/*
|
||||
Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and
|
||||
cookie.
|
||||
*/
|
||||
int xid_error;
|
||||
};
|
||||
|
||||
TC_group_commit_entry * reverse_queue(TC_group_commit_entry *queue);
|
||||
|
||||
void group_commit_wait_for_wakeup(TC_group_commit_entry *entry);
|
||||
void group_commit_wakeup_other(TC_group_commit_entry *other);
|
||||
|
||||
/*
|
||||
This is a queue of threads waiting for being allowed to commit.
|
||||
Access to the queue must be protected by LOCK_prepare_ordered.
|
||||
*/
|
||||
TC_group_commit_entry *group_commit_queue;
|
||||
};
|
||||
|
||||
class TC_LOG_unordered: public TC_LOG_queued
|
||||
{
|
||||
public:
|
||||
TC_LOG_unordered();
|
||||
~TC_LOG_unordered();
|
||||
|
||||
int log_and_order(THD *thd, my_xid xid, bool all,
|
||||
bool need_prepare_ordered, bool need_commit_ordered);
|
||||
|
||||
protected:
|
||||
virtual int log_xid(THD *thd, my_xid xid)=0;
|
||||
|
||||
private:
|
||||
/*
|
||||
This flag and condition is used to reserve the queue while threads in it
|
||||
each run the commit_ordered() methods one after the other. Only once the
|
||||
last commit_ordered() in the queue is done can we start on a new queue
|
||||
run.
|
||||
|
||||
Since we start this process in the first thread in the queue and finish in
|
||||
the last (and possibly different) thread, we need a condition variable for
|
||||
this (we cannot unlock a mutex in a different thread than the one who
|
||||
locked it).
|
||||
|
||||
The condition is used together with the LOCK_prepare_ordered mutex.
|
||||
*/
|
||||
my_bool group_commit_queue_busy;
|
||||
pthread_cond_t COND_queue_busy;
|
||||
};
|
||||
|
||||
class TC_LOG_group_commit: public TC_LOG_queued
|
||||
{
|
||||
public:
|
||||
TC_LOG_group_commit();
|
||||
~TC_LOG_group_commit();
|
||||
|
||||
int log_and_order(THD *thd, my_xid xid, bool all,
|
||||
bool need_prepare_ordered, bool need_commit_ordered);
|
||||
|
||||
protected:
|
||||
/* Total number of committed transactions. */
|
||||
ulonglong num_commits;
|
||||
/* Number of group commits done. */
|
||||
ulonglong num_group_commits;
|
||||
|
||||
/*
|
||||
When using this class, this method is used instead of log_xid() to do
|
||||
logging of a group of transactions all at once.
|
||||
|
||||
The transactions will be linked through THD::next_commit_ordered.
|
||||
|
||||
Additionally, when this method is used instead of log_xid(), the order in
|
||||
which handler->prepare_ordered() and handler->commit_ordered() are called
|
||||
is guaranteed to be the same as the order of calls and THD list elements
|
||||
for group_log_xid().
|
||||
|
||||
This can be used to efficiently implement group commit that at the same
|
||||
time preserves the order of commits among handlers and TC (eg. to get same
|
||||
commit order in InnoDB and binary log).
|
||||
|
||||
For TCs that do not need this, it can be preferable to use plain log_xid()
|
||||
with class TC_LOG_unordered instead, as it allows threads to run log_xid()
|
||||
in parallel with each other. In contrast, group_log_xid() runs under a
|
||||
global mutex, so it is guaranteed that only one call into it will be
|
||||
active at once.
|
||||
|
||||
Since this call handles multiple threads/THDs at once, my_error() (and
|
||||
other code that relies on thread local storage) cannot be used in this
|
||||
method. Instead, the implementation must record any error and report it as
|
||||
the return value from xid_log_after(), which will be invoked individually
|
||||
for each thread.
|
||||
|
||||
In the success case, this method must set thd->xid_cookie for each thread
|
||||
to the cookie that is normally returned from log_xid() (which must be
|
||||
non-zero in the non-error case).
|
||||
*/
|
||||
virtual void group_log_xid(TC_group_commit_entry *first) = 0;
|
||||
/*
|
||||
Called for each transaction (in correct thread context) after
|
||||
group_log_xid() has finished, but with no guarantee on ordering among
|
||||
threads.
|
||||
Can be used to do error reporting etc. */
|
||||
virtual int xid_log_after(TC_group_commit_entry *entry) = 0;
|
||||
|
||||
private:
|
||||
/* Mutex used to serialise calls to group_log_xid(). */
|
||||
pthread_mutex_t LOCK_group_commit;
|
||||
};
|
||||
|
||||
class TC_LOG_DUMMY: public TC_LOG_unordered // use it to disable the logging
|
||||
{
|
||||
public:
|
||||
TC_LOG_DUMMY() {}
|
||||
@ -48,7 +210,7 @@ public:
|
||||
};
|
||||
|
||||
#ifdef HAVE_MMAP
|
||||
class TC_LOG_MMAP: public TC_LOG
|
||||
class TC_LOG_MMAP: public TC_LOG_unordered
|
||||
{
|
||||
public: // only to keep Sun Forte on sol9x86 happy
|
||||
typedef enum {
|
||||
@ -227,12 +389,19 @@ private:
|
||||
time_t last_time;
|
||||
};
|
||||
|
||||
class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
|
||||
class binlog_trx_data;
|
||||
class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG
|
||||
{
|
||||
private:
|
||||
/* LOCK_log and LOCK_index are inited by init_pthread_objects() */
|
||||
pthread_mutex_t LOCK_index;
|
||||
pthread_mutex_t LOCK_prep_xids;
|
||||
/*
|
||||
Mutex to protect the queue of transactions waiting to participate in group
|
||||
commit. (Only used on platforms without native atomic operations).
|
||||
*/
|
||||
pthread_mutex_t LOCK_queue;
|
||||
|
||||
pthread_cond_t COND_prep_xids;
|
||||
pthread_cond_t update_cond;
|
||||
ulonglong bytes_written;
|
||||
@ -271,8 +440,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
|
||||
In 5.0 it's 0 for relay logs too!
|
||||
*/
|
||||
bool no_auto_events;
|
||||
|
||||
ulonglong m_table_map_version;
|
||||
/* Queue of transactions queued up to participate in group commit. */
|
||||
binlog_trx_data *group_commit_queue;
|
||||
|
||||
int write_to_file(IO_CACHE *cache);
|
||||
/*
|
||||
@ -282,6 +451,14 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
|
||||
*/
|
||||
void new_file_without_locking();
|
||||
void new_file_impl(bool need_lock);
|
||||
int write_transaction(binlog_trx_data *trx_data);
|
||||
bool write_transaction_to_binlog_events(binlog_trx_data *trx_data);
|
||||
void trx_group_commit_participant(binlog_trx_data *trx_data);
|
||||
void trx_group_commit_leader(TC_group_commit_entry *first);
|
||||
binlog_trx_data *atomic_enqueue_trx(binlog_trx_data *trx_data);
|
||||
binlog_trx_data *atomic_grab_trx_queue();
|
||||
void mark_xid_done();
|
||||
void mark_xids_active(uint xid_count);
|
||||
|
||||
public:
|
||||
MYSQL_LOG::generate_name;
|
||||
@ -310,18 +487,11 @@ public:
|
||||
|
||||
int open(const char *opt_name);
|
||||
void close();
|
||||
int log_xid(THD *thd, my_xid xid);
|
||||
void group_log_xid(TC_group_commit_entry *first);
|
||||
int xid_log_after(TC_group_commit_entry *entry);
|
||||
void unlog(ulong cookie, my_xid xid);
|
||||
int recover(IO_CACHE *log, Format_description_log_event *fdle);
|
||||
#if !defined(MYSQL_CLIENT)
|
||||
bool is_table_mapped(TABLE *table) const
|
||||
{
|
||||
return table->s->table_map_version == table_map_version();
|
||||
}
|
||||
|
||||
ulonglong table_map_version() const { return m_table_map_version; }
|
||||
void update_table_map_version() { ++m_table_map_version; }
|
||||
|
||||
int flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event);
|
||||
int remove_pending_rows_event(THD *thd);
|
||||
|
||||
@ -362,10 +532,12 @@ public:
|
||||
void new_file();
|
||||
|
||||
bool write(Log_event* event_info); // binary log write
|
||||
bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
|
||||
bool write_incident(THD *thd, bool lock);
|
||||
bool write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data,
|
||||
Log_event *end_ev);
|
||||
bool trx_group_commit_finish(binlog_trx_data *trx_data);
|
||||
bool write_incident(THD *thd);
|
||||
|
||||
int write_cache(IO_CACHE *cache, bool lock_log, bool flush_and_sync);
|
||||
int write_cache(IO_CACHE *cache);
|
||||
void set_write_error(THD *thd);
|
||||
bool check_write_error(THD *thd);
|
||||
|
||||
@ -420,6 +592,7 @@ public:
|
||||
inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);}
|
||||
inline IO_CACHE *get_index_file() { return &index_file;}
|
||||
inline uint32 get_open_count() { return open_count; }
|
||||
void set_status_variables();
|
||||
};
|
||||
|
||||
class Log_event_handler
|
||||
|
@ -463,10 +463,9 @@ struct sql_ex_info
|
||||
#define LOG_EVENT_SUPPRESS_USE_F 0x8
|
||||
|
||||
/*
|
||||
The table map version internal to the log should be increased after
|
||||
the event has been written to the binary log.
|
||||
This used to be LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F, but is now unused.
|
||||
*/
|
||||
#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x10
|
||||
#define LOG_EVENT_UNUSED1_F 0x10
|
||||
|
||||
/**
|
||||
@def LOG_EVENT_ARTIFICIAL_F
|
||||
|
@ -1333,6 +1333,7 @@ void clean_up(bool print_message)
|
||||
ha_end();
|
||||
if (tc_log)
|
||||
tc_log->close();
|
||||
TC_destroy();
|
||||
xid_cache_free();
|
||||
wt_end();
|
||||
delete_elements(&key_caches, (void (*)(const char*, uchar*)) free_key_cache);
|
||||
@ -4124,6 +4125,8 @@ a file name for --log-bin-index option", opt_binlog_index_name);
|
||||
if (!errmesg[0][0])
|
||||
unireg_abort(1);
|
||||
|
||||
TC_init();
|
||||
|
||||
/* We have to initialize the storage engines before CSV logging */
|
||||
if (ha_init())
|
||||
{
|
||||
|
@ -673,6 +673,8 @@ THD::THD()
|
||||
active_vio = 0;
|
||||
#endif
|
||||
pthread_mutex_init(&LOCK_thd_data, MY_MUTEX_INIT_FAST);
|
||||
pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_FAST);
|
||||
pthread_cond_init(&COND_commit_ordered, 0);
|
||||
|
||||
/* Variables with default values */
|
||||
proc_info="login";
|
||||
@ -999,6 +1001,8 @@ THD::~THD()
|
||||
free_root(&transaction.mem_root,MYF(0));
|
||||
#endif
|
||||
mysys_var=0; // Safety (shouldn't be needed)
|
||||
pthread_cond_destroy(&COND_commit_ordered);
|
||||
pthread_mutex_destroy(&LOCK_commit_ordered);
|
||||
pthread_mutex_destroy(&LOCK_thd_data);
|
||||
#ifndef DBUG_OFF
|
||||
dbug_sentry= THD_SENTRY_GONE;
|
||||
@ -3773,7 +3777,6 @@ int THD::binlog_flush_pending_rows_event(bool stmt_end)
|
||||
if (stmt_end)
|
||||
{
|
||||
pending->set_flags(Rows_log_event::STMT_END_F);
|
||||
pending->flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
|
||||
binlog_table_maps= 0;
|
||||
}
|
||||
|
||||
@ -3901,7 +3904,6 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
|
||||
{
|
||||
Query_log_event qinfo(this, query_arg, query_len, is_trans, suppress_use,
|
||||
errcode);
|
||||
qinfo.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
|
||||
/*
|
||||
Binlog table maps will be irrelevant after a Query_log_event
|
||||
(they are just removed on the slave side) so after the query
|
||||
|
@ -1438,6 +1438,10 @@ public:
|
||||
/* container for handler's private per-connection data */
|
||||
Ha_data ha_data[MAX_HA];
|
||||
|
||||
/* Mutex and condition for waking up threads after group commit. */
|
||||
pthread_mutex_t LOCK_commit_ordered;
|
||||
pthread_cond_t COND_commit_ordered;
|
||||
|
||||
#ifndef MYSQL_CLIENT
|
||||
int binlog_setup_trx_data();
|
||||
|
||||
|
@ -516,7 +516,6 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
|
||||
else
|
||||
{
|
||||
Delete_file_log_event d(thd, db, transactional_table);
|
||||
d.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
|
||||
(void) mysql_bin_log.write(&d);
|
||||
}
|
||||
}
|
||||
@ -698,7 +697,6 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
|
||||
(duplicates == DUP_REPLACE) ? LOAD_DUP_REPLACE :
|
||||
(ignore ? LOAD_DUP_IGNORE : LOAD_DUP_ERROR),
|
||||
transactional_table, FALSE, errcode);
|
||||
e.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
|
||||
return mysql_bin_log.write(&e);
|
||||
}
|
||||
|
||||
|
10
sql/table.cc
10
sql/table.cc
@ -296,13 +296,6 @@ TABLE_SHARE *alloc_table_share(TABLE_LIST *table_list, char *key,
|
||||
|
||||
share->version= refresh_version;
|
||||
|
||||
/*
|
||||
This constant is used to mark that no table map version has been
|
||||
assigned. No arithmetic is done on the value: it will be
|
||||
overwritten with a value taken from MYSQL_BIN_LOG.
|
||||
*/
|
||||
share->table_map_version= ~(ulonglong)0;
|
||||
|
||||
/*
|
||||
Since alloc_table_share() can be called without any locking (for
|
||||
example, ha_create_table... functions), we do not assign a table
|
||||
@ -367,10 +360,9 @@ void init_tmp_table_share(THD *thd, TABLE_SHARE *share, const char *key,
|
||||
share->frm_version= FRM_VER_TRUE_VARCHAR;
|
||||
|
||||
/*
|
||||
Temporary tables are not replicated, but we set up these fields
|
||||
Temporary tables are not replicated, but we set up this field
|
||||
anyway to be able to catch errors.
|
||||
*/
|
||||
share->table_map_version= ~(ulonglong)0;
|
||||
share->cached_row_logging_check= -1;
|
||||
|
||||
/*
|
||||
|
@ -433,7 +433,6 @@ typedef struct st_table_share
|
||||
bool waiting_on_cond; /* Protection against free */
|
||||
bool deleting; /* going to delete this table */
|
||||
ulong table_map_id; /* for row-based replication */
|
||||
ulonglong table_map_version;
|
||||
|
||||
/*
|
||||
Cache for row-based replication table share checks that does not
|
||||
|
@ -138,8 +138,6 @@ bool check_global_access(THD *thd, ulong want_access);
|
||||
|
||||
/** to protect innobase_open_files */
|
||||
static pthread_mutex_t innobase_share_mutex;
|
||||
/** to force correct commit order in binlog */
|
||||
static pthread_mutex_t prepare_commit_mutex;
|
||||
static ulong commit_threads = 0;
|
||||
static pthread_mutex_t commit_threads_m;
|
||||
static pthread_cond_t commit_cond;
|
||||
@ -239,6 +237,7 @@ static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
|
||||
static INNOBASE_SHARE *get_share(const char *table_name);
|
||||
static void free_share(INNOBASE_SHARE *share);
|
||||
static int innobase_close_connection(handlerton *hton, THD* thd);
|
||||
static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
|
||||
static int innobase_commit(handlerton *hton, THD* thd, bool all);
|
||||
static int innobase_rollback(handlerton *hton, THD* thd, bool all);
|
||||
static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
|
||||
@ -1356,7 +1355,6 @@ innobase_trx_init(
|
||||
trx_t* trx) /*!< in/out: InnoDB transaction handle */
|
||||
{
|
||||
DBUG_ENTER("innobase_trx_init");
|
||||
DBUG_ASSERT(EQ_CURRENT_THD(thd));
|
||||
DBUG_ASSERT(thd == trx->mysql_thd);
|
||||
|
||||
trx->check_foreigns = !thd_test_options(
|
||||
@ -1416,8 +1414,6 @@ check_trx_exists(
|
||||
{
|
||||
trx_t*& trx = thd_to_trx(thd);
|
||||
|
||||
ut_ad(EQ_CURRENT_THD(thd));
|
||||
|
||||
if (trx == NULL) {
|
||||
trx = innobase_trx_allocate(thd);
|
||||
} else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
|
||||
@ -2024,6 +2020,7 @@ innobase_init(
|
||||
innobase_hton->savepoint_set=innobase_savepoint;
|
||||
innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
|
||||
innobase_hton->savepoint_release=innobase_release_savepoint;
|
||||
innobase_hton->commit_ordered=innobase_commit_ordered;
|
||||
innobase_hton->commit=innobase_commit;
|
||||
innobase_hton->rollback=innobase_rollback;
|
||||
innobase_hton->prepare=innobase_xa_prepare;
|
||||
@ -2492,7 +2489,6 @@ skip_overwrite:
|
||||
|
||||
innobase_open_tables = hash_create(200);
|
||||
pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST);
|
||||
pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST);
|
||||
pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST);
|
||||
pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST);
|
||||
pthread_mutex_init(&analyze_mutex, MY_MUTEX_INIT_FAST);
|
||||
@ -2547,7 +2543,6 @@ innobase_end(
|
||||
my_free(internal_innobase_data_file_path,
|
||||
MYF(MY_ALLOW_ZERO_PTR));
|
||||
pthread_mutex_destroy(&innobase_share_mutex);
|
||||
pthread_mutex_destroy(&prepare_commit_mutex);
|
||||
pthread_mutex_destroy(&commit_threads_m);
|
||||
pthread_mutex_destroy(&commit_cond_m);
|
||||
pthread_mutex_destroy(&analyze_mutex);
|
||||
@ -2680,6 +2675,101 @@ innobase_start_trx_and_assign_read_view(
|
||||
DBUG_RETURN(0);
|
||||
}
|
||||
|
||||
/*****************************************************************//**
|
||||
Perform the first, fast part of InnoDB commit.
|
||||
|
||||
Doing it in this call ensures that we get the same commit order here
|
||||
as in binlog and any other participating transactional storage engines.
|
||||
|
||||
Note that we want to do as little as really needed here, as we run
|
||||
under a global mutex. The expensive fsync() is done later, in
|
||||
innobase_commit(), without a lock so group commit can take place.
|
||||
|
||||
Note also that this method can be called from a different thread than
|
||||
the one handling the rest of the transaction. */
|
||||
static
|
||||
void
|
||||
innobase_commit_ordered(
|
||||
/*============*/
|
||||
handlerton *hton, /*!< in: Innodb handlerton */
|
||||
THD* thd, /*!< in: MySQL thread handle of the user for whom
|
||||
the transaction should be committed */
|
||||
bool all) /*!< in: TRUE - commit transaction
|
||||
FALSE - the current SQL statement ended */
|
||||
{
|
||||
trx_t* trx;
|
||||
DBUG_ENTER("innobase_commit_ordered");
|
||||
DBUG_ASSERT(hton == innodb_hton_ptr);
|
||||
|
||||
trx = check_trx_exists(thd);
|
||||
|
||||
if (trx->active_trans == 0
|
||||
&& trx->conc_state != TRX_NOT_STARTED) {
|
||||
/* We cannot throw error here; instead we will catch this error
|
||||
again in innobase_commit() and report it from there. */
|
||||
DBUG_VOID_RETURN;
|
||||
}
|
||||
/* Since we will reserve the kernel mutex, we have to release
|
||||
the search system latch first to obey the latching order. */
|
||||
|
||||
if (trx->has_search_latch) {
|
||||
trx_search_latch_release_if_reserved(trx);
|
||||
}
|
||||
|
||||
/* commit_ordered is only called when committing the whole transaction
|
||||
(or an SQL statement when autocommit is on). */
|
||||
DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
|
||||
|
||||
/* We need current binlog position for ibbackup to work.
|
||||
Note, the position is current because commit_ordered is guaranteed
|
||||
to be called in the same sequence as writing to binlog. */
|
||||
|
||||
retry:
|
||||
if (innobase_commit_concurrency > 0) {
|
||||
pthread_mutex_lock(&commit_cond_m);
|
||||
commit_threads++;
|
||||
|
||||
if (commit_threads > innobase_commit_concurrency) {
|
||||
commit_threads--;
|
||||
pthread_cond_wait(&commit_cond,
|
||||
&commit_cond_m);
|
||||
pthread_mutex_unlock(&commit_cond_m);
|
||||
goto retry;
|
||||
}
|
||||
else {
|
||||
pthread_mutex_unlock(&commit_cond_m);
|
||||
}
|
||||
}
|
||||
|
||||
/* The following calls to read the MySQL binary log
|
||||
file name and the position return consistent results:
|
||||
1) We use commit_ordered() to get same commit order
|
||||
in InnoDB as in binary log.
|
||||
2) A MySQL log file rotation cannot happen because
|
||||
MySQL protects against this by having a counter of
|
||||
transactions in prepared state and it only allows
|
||||
a rotation when the counter drops to zero. See
|
||||
LOCK_prep_xids and COND_prep_xids in log.cc. */
|
||||
trx->mysql_log_file_name = mysql_bin_log_file_name();
|
||||
trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos();
|
||||
|
||||
/* Don't do write + flush right now. For group commit
|
||||
to work we want to do the flush in the innobase_commit()
|
||||
method, which runs without holding any locks. */
|
||||
trx->flush_log_later = TRUE;
|
||||
innobase_commit_low(trx);
|
||||
trx->flush_log_later = FALSE;
|
||||
|
||||
if (innobase_commit_concurrency > 0) {
|
||||
pthread_mutex_lock(&commit_cond_m);
|
||||
commit_threads--;
|
||||
pthread_cond_signal(&commit_cond);
|
||||
pthread_mutex_unlock(&commit_cond_m);
|
||||
}
|
||||
|
||||
DBUG_VOID_RETURN;
|
||||
}
|
||||
|
||||
/*****************************************************************//**
|
||||
Commits a transaction in an InnoDB database or marks an SQL statement
|
||||
ended.
|
||||
@ -2702,13 +2792,6 @@ innobase_commit(
|
||||
|
||||
trx = check_trx_exists(thd);
|
||||
|
||||
/* Since we will reserve the kernel mutex, we have to release
|
||||
the search system latch first to obey the latching order. */
|
||||
|
||||
if (trx->has_search_latch) {
|
||||
trx_search_latch_release_if_reserved(trx);
|
||||
}
|
||||
|
||||
/* The flag trx->active_trans is set to 1 in
|
||||
|
||||
1. ::external_lock(),
|
||||
@ -2736,62 +2819,8 @@ innobase_commit(
|
||||
/* We were instructed to commit the whole transaction, or
|
||||
this is an SQL statement end and autocommit is on */
|
||||
|
||||
/* We need current binlog position for ibbackup to work.
|
||||
Note, the position is current because of
|
||||
prepare_commit_mutex */
|
||||
retry:
|
||||
if (innobase_commit_concurrency > 0) {
|
||||
pthread_mutex_lock(&commit_cond_m);
|
||||
commit_threads++;
|
||||
|
||||
if (commit_threads > innobase_commit_concurrency) {
|
||||
commit_threads--;
|
||||
pthread_cond_wait(&commit_cond,
|
||||
&commit_cond_m);
|
||||
pthread_mutex_unlock(&commit_cond_m);
|
||||
goto retry;
|
||||
}
|
||||
else {
|
||||
pthread_mutex_unlock(&commit_cond_m);
|
||||
}
|
||||
}
|
||||
|
||||
/* The following calls to read the MySQL binary log
|
||||
file name and the position return consistent results:
|
||||
1) Other InnoDB transactions cannot intervene between
|
||||
these calls as we are holding prepare_commit_mutex.
|
||||
2) Binary logging of other engines is not relevant
|
||||
to InnoDB as all InnoDB requires is that committing
|
||||
InnoDB transactions appear in the same order in the
|
||||
MySQL binary log as they appear in InnoDB logs.
|
||||
3) A MySQL log file rotation cannot happen because
|
||||
MySQL protects against this by having a counter of
|
||||
transactions in prepared state and it only allows
|
||||
a rotation when the counter drops to zero. See
|
||||
LOCK_prep_xids and COND_prep_xids in log.cc. */
|
||||
trx->mysql_log_file_name = mysql_bin_log_file_name();
|
||||
trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos();
|
||||
|
||||
/* Don't do write + flush right now. For group commit
|
||||
to work we want to do the flush after releasing the
|
||||
prepare_commit_mutex. */
|
||||
trx->flush_log_later = TRUE;
|
||||
innobase_commit_low(trx);
|
||||
trx->flush_log_later = FALSE;
|
||||
|
||||
if (innobase_commit_concurrency > 0) {
|
||||
pthread_mutex_lock(&commit_cond_m);
|
||||
commit_threads--;
|
||||
pthread_cond_signal(&commit_cond);
|
||||
pthread_mutex_unlock(&commit_cond_m);
|
||||
}
|
||||
|
||||
if (trx->active_trans == 2) {
|
||||
|
||||
pthread_mutex_unlock(&prepare_commit_mutex);
|
||||
}
|
||||
|
||||
/* Now do a write + flush of logs. */
|
||||
/* We did the first part already in innobase_commit_ordered(),
|
||||
Now finish by doing a write + flush of logs. */
|
||||
trx_commit_complete_for_mysql(trx);
|
||||
trx->active_trans = 0;
|
||||
|
||||
@ -4621,6 +4650,7 @@ no_commit:
|
||||
no need to re-acquire locks on it. */
|
||||
|
||||
/* Altering to InnoDB format */
|
||||
innobase_commit_ordered(ht, user_thd, 1);
|
||||
innobase_commit(ht, user_thd, 1);
|
||||
/* Note that this transaction is still active. */
|
||||
prebuilt->trx->active_trans = 1;
|
||||
@ -4637,6 +4667,7 @@ no_commit:
|
||||
|
||||
/* Commit the transaction. This will release the table
|
||||
locks, so they have to be acquired again. */
|
||||
innobase_commit_ordered(ht, user_thd, 1);
|
||||
innobase_commit(ht, user_thd, 1);
|
||||
/* Note that this transaction is still active. */
|
||||
prebuilt->trx->active_trans = 1;
|
||||
@ -8339,6 +8370,7 @@ ha_innobase::external_lock(
|
||||
|
||||
if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
|
||||
if (trx->active_trans != 0) {
|
||||
innobase_commit_ordered(ht, thd, TRUE);
|
||||
innobase_commit(ht, thd, TRUE);
|
||||
}
|
||||
} else {
|
||||
@ -9448,36 +9480,6 @@ innobase_xa_prepare(
|
||||
|
||||
srv_active_wake_master_thread();
|
||||
|
||||
if (thd_sql_command(thd) != SQLCOM_XA_PREPARE &&
|
||||
(all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
|
||||
{
|
||||
if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) {
|
||||
/* choose group commit rather than binlog order */
|
||||
return(error);
|
||||
}
|
||||
|
||||
/* For ibbackup to work the order of transactions in binlog
|
||||
and InnoDB must be the same. Consider the situation
|
||||
|
||||
thread1> prepare; write to binlog; ...
|
||||
<context switch>
|
||||
thread2> prepare; write to binlog; commit
|
||||
thread1> ... commit
|
||||
|
||||
To ensure this will not happen we're taking the mutex on
|
||||
prepare, and releasing it on commit.
|
||||
|
||||
Note: only do it for normal commits, done via ha_commit_trans.
|
||||
If 2pc protocol is executed by external transaction
|
||||
coordinator, it will be just a regular MySQL client
|
||||
executing XA PREPARE and XA COMMIT commands.
|
||||
In this case we cannot know how many minutes or hours
|
||||
will be between XA PREPARE and XA COMMIT, and we don't want
|
||||
to block for undefined period of time. */
|
||||
pthread_mutex_lock(&prepare_commit_mutex);
|
||||
trx->active_trans = 2;
|
||||
}
|
||||
|
||||
return(error);
|
||||
}
|
||||
|
||||
@ -10669,11 +10671,6 @@ static MYSQL_SYSVAR_ENUM(adaptive_checkpoint, srv_adaptive_checkpoint,
|
||||
"Enable/Disable flushing along modified age. (none, reflex, [estimate])",
|
||||
NULL, innodb_adaptive_checkpoint_update, 2, &adaptive_checkpoint_typelib);
|
||||
|
||||
static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit,
|
||||
PLUGIN_VAR_RQCMDARG,
|
||||
"Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
|
||||
NULL, NULL, 0, 0, 1, 0);
|
||||
|
||||
static MYSQL_SYSVAR_ULONG(expand_import, srv_expand_import,
|
||||
PLUGIN_VAR_RQCMDARG,
|
||||
"Enable/Disable converting automatically *.ibd files when import tablespace.",
|
||||
@ -10763,7 +10760,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
|
||||
MYSQL_SYSVAR(flush_neighbor_pages),
|
||||
MYSQL_SYSVAR(read_ahead),
|
||||
MYSQL_SYSVAR(adaptive_checkpoint),
|
||||
MYSQL_SYSVAR(enable_unsafe_group_commit),
|
||||
MYSQL_SYSVAR(expand_import),
|
||||
MYSQL_SYSVAR(extra_rsegments),
|
||||
MYSQL_SYSVAR(dict_size_limit),
|
||||
|
Reference in New Issue
Block a user