From 0394cf203042eb6e408b9c88802c93444f226af9 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 30 Sep 2010 15:20:15 +0200 Subject: [PATCH 01/13] MWL#116: Efficient group commit for binary log Preliminary commit for testing --- mysql-test/r/group_commit.result | 63 + mysql-test/suite/binlog/r/binlog_ioerr.result | 28 + mysql-test/suite/binlog/t/binlog_ioerr.test | 29 + mysql-test/t/group_commit.test | 115 ++ sql/handler.cc | 224 +-- sql/handler.h | 89 +- sql/log.cc | 1292 +++++++++++++---- sql/log.h | 209 ++- sql/log_event.h | 5 +- sql/mysqld.cc | 3 + sql/sql_class.cc | 6 +- sql/sql_class.h | 4 + sql/sql_load.cc | 2 - sql/table.cc | 10 +- sql/table.h | 1 - storage/xtradb/handler/ha_innodb.cc | 208 ++- 16 files changed, 1821 insertions(+), 467 deletions(-) create mode 100644 mysql-test/r/group_commit.result create mode 100644 mysql-test/suite/binlog/r/binlog_ioerr.result create mode 100644 mysql-test/suite/binlog/t/binlog_ioerr.test create mode 100644 mysql-test/t/group_commit.test diff --git a/mysql-test/r/group_commit.result b/mysql-test/r/group_commit.result new file mode 100644 index 00000000000..c7993227f8f --- /dev/null +++ b/mysql-test/r/group_commit.result @@ -0,0 +1,63 @@ +CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb; +SELECT variable_value INTO @commits FROM information_schema.global_status +WHERE variable_name = 'binlog_commits'; +SELECT variable_value INTO @group_commits FROM information_schema.global_status +WHERE variable_name = 'binlog_group_commits'; +SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued"; +INSERT INTO t1 VALUES ("con1"); +set DEBUG_SYNC= "now WAIT_FOR group1_running"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2"; +SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed"; +SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked"; +INSERT INTO t1 VALUES ("con2"); +SET DEBUG_SYNC= "now WAIT_FOR 
group2_con2"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3"; +INSERT INTO t1 VALUES ("con3"); +SET DEBUG_SYNC= "now WAIT_FOR group2_con3"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4"; +INSERT INTO t1 VALUES ("con4"); +SET DEBUG_SYNC= "now WAIT_FOR group2_con4"; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +SELECT * FROM t1 ORDER BY a; +a +SET DEBUG_SYNC= "now SIGNAL group2_queued"; +SELECT * FROM t1 ORDER BY a; +a +con1 +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5"; +SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued"; +INSERT INTO t1 VALUES ("con5"); +SET DEBUG_SYNC= "now WAIT_FOR con5_leader"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued"; +INSERT INTO t1 VALUES ("con6"); +SET DEBUG_SYNC= "now WAIT_FOR group3_con5"; +SELECT * FROM t1 ORDER BY a; +a +con1 +SET DEBUG_SYNC= "now SIGNAL group3_committed"; +SET DEBUG_SYNC= "now WAIT_FOR group2_visible"; +SELECT * FROM t1 ORDER BY a; +a +con1 +con2 +con3 +con4 +SET DEBUG_SYNC= "now SIGNAL group2_checked"; +SELECT * FROM t1 ORDER BY a; +a +con1 +con2 +con3 +con4 +con5 +con6 +SELECT variable_value - @commits FROM information_schema.global_status +WHERE variable_name = 'binlog_commits'; +variable_value - @commits +6 +SELECT variable_value - @group_commits FROM information_schema.global_status +WHERE variable_name = 'binlog_group_commits'; +variable_value - @group_commits +3 +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; diff --git a/mysql-test/suite/binlog/r/binlog_ioerr.result b/mysql-test/suite/binlog/r/binlog_ioerr.result new file mode 100644 index 00000000000..2300f3c5f82 --- /dev/null +++ b/mysql-test/suite/binlog/r/binlog_ioerr.result @@ -0,0 +1,28 @@ +CALL mtr.add_suppression("Error writing file 'master-bin'"); +RESET MASTER; +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; +INSERT INTO t1 VALUES(0); +SET SESSION debug='+d,fail_binlog_write_1'; +INSERT INTO t1 VALUES(1); 
+ERROR HY000: Error writing file 'master-bin' (errno: 22) +INSERT INTO t1 VALUES(2); +ERROR HY000: Error writing file 'master-bin' (errno: 22) +SET SESSION debug=''; +INSERT INTO t1 VALUES(3); +SELECT * FROM t1; +a +0 +3 +SHOW BINLOG EVENTS; +Log_name Pos Event_type Server_id End_log_pos Info +BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: # +BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb +BINLOG POS Query 1 ENDPOS BEGIN +BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0) +BINLOG POS Xid 1 ENDPOS COMMIT /* XID */ +BINLOG POS Query 1 ENDPOS BEGIN +BINLOG POS Query 1 ENDPOS BEGIN +BINLOG POS Query 1 ENDPOS BEGIN +BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3) +BINLOG POS Xid 1 ENDPOS COMMIT /* XID */ +DROP TABLE t1; diff --git a/mysql-test/suite/binlog/t/binlog_ioerr.test b/mysql-test/suite/binlog/t/binlog_ioerr.test new file mode 100644 index 00000000000..e58cd93361a --- /dev/null +++ b/mysql-test/suite/binlog/t/binlog_ioerr.test @@ -0,0 +1,29 @@ +source include/have_debug.inc; +source include/have_innodb.inc; +source include/have_log_bin.inc; +source include/have_binlog_format_mixed_or_statement.inc; + +CALL mtr.add_suppression("Error writing file 'master-bin'"); + +RESET MASTER; + +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; +INSERT INTO t1 VALUES(0); +SET SESSION debug='+d,fail_binlog_write_1'; +--error ER_ERROR_ON_WRITE +INSERT INTO t1 VALUES(1); +--error ER_ERROR_ON_WRITE +INSERT INTO t1 VALUES(2); +SET SESSION debug=''; +INSERT INTO t1 VALUES(3); +SELECT * FROM t1; + +# Actually the output from this currently shows a bug. +# The injected IO error leaves partially written transactions in the binlog in +# the form of stray "BEGIN" events. +# These should disappear from the output if binlog error handling is improved. 
+--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/ +--replace_column 1 BINLOG 2 POS 5 ENDPOS +SHOW BINLOG EVENTS; + +DROP TABLE t1; diff --git a/mysql-test/t/group_commit.test b/mysql-test/t/group_commit.test new file mode 100644 index 00000000000..df4ea6654d4 --- /dev/null +++ b/mysql-test/t/group_commit.test @@ -0,0 +1,115 @@ +--source include/have_debug_sync.inc +--source include/have_innodb.inc +--source include/have_log_bin.inc + +# Test some group commit code paths by using debug_sync to do controlled +# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a +# group. +# +# Group 3 is allowed to race as far as possible ahead before group 2 finishes +# to check some edge case for concurrency control. + +CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb; + +SELECT variable_value INTO @commits FROM information_schema.global_status + WHERE variable_name = 'binlog_commits'; +SELECT variable_value INTO @group_commits FROM information_schema.global_status + WHERE variable_name = 'binlog_group_commits'; + +connect(con1,localhost,root,,); +connect(con2,localhost,root,,); +connect(con3,localhost,root,,); +connect(con4,localhost,root,,); +connect(con5,localhost,root,,); +connect(con6,localhost,root,,); + +# Start group1 (with one thread) doing commit, waiting for +# group2 to queue up before finishing. + +connection con1; +SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued"; +send INSERT INTO t1 VALUES ("con1"); + +# Make group2 (with three threads) queue up. +# Make sure con2 is the group commit leader for group2. +# Make group2 wait with running commit_ordered() until group3 has committed. 
+ +connection con2; +set DEBUG_SYNC= "now WAIT_FOR group1_running"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2"; +SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed"; +SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked"; +send INSERT INTO t1 VALUES ("con2"); +connection con3; +SET DEBUG_SYNC= "now WAIT_FOR group2_con2"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3"; +send INSERT INTO t1 VALUES ("con3"); +connection con4; +SET DEBUG_SYNC= "now WAIT_FOR group2_con3"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4"; +send INSERT INTO t1 VALUES ("con4"); + +# When group2 is queued, let group1 continue and queue group3. + +connection default; +SET DEBUG_SYNC= "now WAIT_FOR group2_con4"; + +# At this point, transaction 1 is still not visible as commit_ordered() has not +# been called yet. +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +SELECT * FROM t1 ORDER BY a; + +SET DEBUG_SYNC= "now SIGNAL group2_queued"; +connection con1; +reap; + +# Now transaction 1 is visible. +connection default; +SELECT * FROM t1 ORDER BY a; + +connection con5; +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5"; +SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued"; +send INSERT INTO t1 VALUES ("con5"); + +connection con6; +SET DEBUG_SYNC= "now WAIT_FOR con5_leader"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued"; +send INSERT INTO t1 VALUES ("con6"); + +connection default; +SET DEBUG_SYNC= "now WAIT_FOR group3_con5"; +# Still only transaction 1 visible, as group2 has not yet run commit_ordered(). +SELECT * FROM t1 ORDER BY a; +SET DEBUG_SYNC= "now SIGNAL group3_committed"; +SET DEBUG_SYNC= "now WAIT_FOR group2_visible"; +# Now transactions 1-4 visible. 
+SELECT * FROM t1 ORDER BY a; +SET DEBUG_SYNC= "now SIGNAL group2_checked"; + +connection con2; +reap; + +connection con3; +reap; + +connection con4; +reap; + +connection con5; +reap; + +connection con6; +reap; + +connection default; +# Check all transactions finally visible. +SELECT * FROM t1 ORDER BY a; + +SELECT variable_value - @commits FROM information_schema.global_status + WHERE variable_name = 'binlog_commits'; +SELECT variable_value - @group_commits FROM information_schema.global_status + WHERE variable_name = 'binlog_group_commits'; + +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; diff --git a/sql/handler.cc b/sql/handler.cc index b817673ed23..0bce67596fa 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -76,6 +76,8 @@ TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"", static TYPELIB known_extensions= {0,"known_exts", NULL, NULL}; uint known_extensions_id= 0; +static int commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, + bool is_real_trans); static plugin_ref ha_default_plugin(THD *thd) @@ -1070,7 +1072,7 @@ ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list, */ int ha_commit_trans(THD *thd, bool all) { - int error= 0, cookie= 0; + int error= 0, cookie; /* 'all' means that this is either an explicit commit issued by user, or an implicit commit issued by a DDL. 
@@ -1085,7 +1087,8 @@ int ha_commit_trans(THD *thd, bool all) */ bool is_real_trans= all || thd->transaction.all.ha_list == 0; Ha_trx_info *ha_info= trans->ha_list; - my_xid xid= thd->transaction.xid_state.xid.get_my_xid(); + bool need_prepare_ordered, need_commit_ordered; + my_xid xid; DBUG_ENTER("ha_commit_trans"); /* @@ -1118,85 +1121,112 @@ int ha_commit_trans(THD *thd, bool all) DBUG_RETURN(2); } #ifdef USING_TRANSACTIONS - if (ha_info) + if (!ha_info) { - uint rw_ha_count; - bool rw_trans; - - DBUG_EXECUTE_IF("crash_commit_before", abort();); - - /* Close all cursors that can not survive COMMIT */ - if (is_real_trans) /* not a statement commit */ - thd->stmt_map.close_transient_cursors(); - - rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all); - /* rw_trans is TRUE when we in a transaction changing data */ - rw_trans= is_real_trans && (rw_ha_count > 0); - - if (rw_trans && - wait_if_global_read_lock(thd, 0, 0)) - { - ha_rollback_trans(thd, all); - DBUG_RETURN(1); - } - - if (rw_trans && - opt_readonly && - !(thd->security_ctx->master_access & SUPER_ACL) && - !thd->slave_thread) - { - my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only"); - ha_rollback_trans(thd, all); - error= 1; - goto end; - } - - if (!trans->no_2pc && (rw_ha_count > 1)) - { - for (; ha_info && !error; ha_info= ha_info->next()) - { - int err; - handlerton *ht= ha_info->ht(); - /* - Do not call two-phase commit if this particular - transaction is read-only. This allows for simpler - implementation in engines that are always read-only. - */ - if (! ha_info->is_trx_read_write()) - continue; - /* - Sic: we know that prepare() is not NULL since otherwise - trans->no_2pc would have been set. 
- */ - if ((err= ht->prepare(ht, thd, all))) - { - my_error(ER_ERROR_DURING_COMMIT, MYF(0), err); - error= 1; - } - status_var_increment(thd->status_var.ha_prepare_count); - } - DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT();); - if (error || (is_real_trans && xid && - (error= !(cookie= tc_log->log_xid(thd, xid))))) - { - ha_rollback_trans(thd, all); - error= 1; - goto end; - } - DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT();); - } - error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0; - DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT();); - if (cookie) - tc_log->unlog(cookie, xid); - DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT();); -end: - if (rw_trans) - start_waiting_global_read_lock(thd); + /* Free resources and perform other cleanup even for 'empty' transactions. */ + if (is_real_trans) + thd->transaction.cleanup(); + DBUG_RETURN(0); } - /* Free resources and perform other cleanup even for 'empty' transactions. */ - else if (is_real_trans) - thd->transaction.cleanup(); + + DBUG_EXECUTE_IF("crash_commit_before", abort();); + + /* Close all cursors that can not survive COMMIT */ + if (is_real_trans) /* not a statement commit */ + thd->stmt_map.close_transient_cursors(); + + uint rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all); + /* rw_trans is TRUE when we in a transaction changing data */ + bool rw_trans= is_real_trans && (rw_ha_count > 0); + + if (rw_trans && + wait_if_global_read_lock(thd, 0, 0)) + { + ha_rollback_trans(thd, all); + DBUG_RETURN(1); + } + + if (rw_trans && + opt_readonly && + !(thd->security_ctx->master_access & SUPER_ACL) && + !thd->slave_thread) + { + my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only"); + goto err; + } + + if (trans->no_2pc || (rw_ha_count <= 1)) + { + error= ha_commit_one_phase(thd, all); + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT();); + goto end; + } + + need_prepare_ordered= FALSE; + need_commit_ordered= FALSE; + xid= 
thd->transaction.xid_state.xid.get_my_xid(); + + for (Ha_trx_info *hi= ha_info; hi; hi= hi->next()) + { + int err; + handlerton *ht= hi->ht(); + /* + Do not call two-phase commit if this particular + transaction is read-only. This allows for simpler + implementation in engines that are always read-only. + */ + if (! hi->is_trx_read_write()) + continue; + /* + Sic: we know that prepare() is not NULL since otherwise + trans->no_2pc would have been set. + */ + if ((err= ht->prepare(ht, thd, all))) + my_error(ER_ERROR_DURING_COMMIT, MYF(0), err); + status_var_increment(thd->status_var.ha_prepare_count); + + if (err) + goto err; + + if (ht->prepare_ordered) + need_prepare_ordered= TRUE; + if (ht->commit_ordered) + need_commit_ordered= TRUE; + } + DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT();); + + if (!is_real_trans) + { + error= commit_one_phase_2(thd, all, trans, is_real_trans); + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT();); + goto end; + } + + cookie= tc_log->log_and_order(thd, xid, all, need_prepare_ordered, + need_commit_ordered); + if (!cookie) + goto err; + + DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT();); + + error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0; + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT();); + + DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT();); + tc_log->unlog(cookie, xid); + + DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT();); + goto end; + + /* Come here if error and we need to rollback. */ +err: + if (!error) + error= 1; + ha_rollback_trans(thd, all); + +end: + if (rw_trans) + start_waiting_global_read_lock(thd); #endif /* USING_TRANSACTIONS */ DBUG_RETURN(error); } @@ -1207,7 +1237,6 @@ end: */ int ha_commit_one_phase(THD *thd, bool all) { - int error=0; THD_TRANS *trans=all ? 
&thd->transaction.all : &thd->transaction.stmt; /* "real" is a nick name for a transaction for which a commit will @@ -1217,8 +1246,41 @@ int ha_commit_one_phase(THD *thd, bool all) enclosing 'all' transaction is rolled back. */ bool is_real_trans=all || thd->transaction.all.ha_list == 0; - Ha_trx_info *ha_info= trans->ha_list, *ha_info_next; + Ha_trx_info *ha_info= trans->ha_list; DBUG_ENTER("ha_commit_one_phase"); +#ifdef USING_TRANSACTIONS + if (ha_info) + { + if (is_real_trans) + { + bool locked= false; + for (; ha_info; ha_info= ha_info->next()) + { + handlerton *ht= ha_info->ht(); + if (ht->commit_ordered) + { + if (ha_info->is_trx_read_write() && !locked) + { + pthread_mutex_lock(&LOCK_commit_ordered); + locked= 1; + } + ht->commit_ordered(ht, thd, all); + } + } + if (locked) + pthread_mutex_unlock(&LOCK_commit_ordered); + } + } +#endif /* USING_TRANSACTIONS */ + DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans)); +} + +static int +commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans) +{ + int error= 0; + Ha_trx_info *ha_info= trans->ha_list, *ha_info_next; + DBUG_ENTER("commit_one_phase_2"); #ifdef USING_TRANSACTIONS if (ha_info) { diff --git a/sql/handler.h b/sql/handler.h index d03264a23db..17dcc294099 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -656,9 +656,96 @@ struct handlerton NOTE 'all' is also false in auto-commit mode where 'end of statement' and 'real commit' mean the same event. */ - int (*commit)(handlerton *hton, THD *thd, bool all); + int (*commit)(handlerton *hton, THD *thd, bool all); + /* + The commit_ordered() method is called prior to the commit() method, after + the transaction manager has decided to commit (not rollback) the + transaction. Unlike commit(), commit_ordered() is called only when the + full transaction is committed, not for each commit of statement + transaction in a multi-statement transaction. 
+ + The calls to commit_ordered() in multiple parallel transactions are + guaranteed to happen in the same order in every participating + handler. This can be used to ensure the same commit order among multiple + handlers (eg. in table handler and binlog). So if transaction T1 calls + into commit_ordered() of handler A before T2, then T1 will also call + commit_ordered() of handler B before T2. + + Engines that implement this method should during this call make the + transaction visible to other transactions, thereby making the order of + transaction commits be defined by the order of commit_ordered() calls. + + The intention is that commit_ordered() should do the minimal amount of + work that needs to happen in consistent commit order among handlers. To + preserve ordering, calls need to be serialised on a global mutex, so + doing any time-consuming or blocking operations in commit_ordered() will + limit scalability. + + Handlers can rely on commit_ordered() calls for transactions that updated + data to be serialised (no two calls can run in parallel, so no extra + locking on the handler part is required to ensure this). However, calls + for SELECT-only transactions are not serialised, so can occur in parallel + with each other and with at most one write-transaction. + + Note that commit_ordered() can be called from a different thread than the + one handling the transaction! So it can not do anything that depends on + thread local storage, in particular it can not call my_error() and + friends (instead it can store the error code and delay the call of + my_error() to the commit() method). + + Similarly, since commit_ordered() returns void, any return error code + must be saved and returned from the commit() method instead. + + The commit_ordered method is optional, and can be left unset if not + needed in a particular handler. 
+ */ + void (*commit_ordered)(handlerton *hton, THD *thd, bool all); int (*rollback)(handlerton *hton, THD *thd, bool all); int (*prepare)(handlerton *hton, THD *thd, bool all); + /* + The prepare_ordered method is optional. If set, it will be called after + successful prepare() in all handlers participating in 2-phase + commit. Like commit_ordered(), it is called only when the full + transaction is committed, not for each commit of statement transaction. + + The calls to prepare_ordered() among multiple parallel transactions are + ordered consistently with calls to commit_ordered(). This means that + calls to prepare_ordered() effectively define the commit order, and that + each handler will see the same sequence of transactions calling into + prepare_ordered() and commit_ordered(). + + Thus, prepare_ordered() can be used to define commit order for handlers + that need to do this in the prepare step (like binlog). It can also be + used to release transaction's locks early in an order consistent with the + order transactions will be eventually committed. + + Like commit_ordered(), prepare_ordered() calls are serialised to maintain + ordering, so the intention is that they should execute fast, with only + the minimal amount of work needed to define commit order. Handlers can + rely on this serialisation, and do not need to do any extra locking to + avoid two prepare_ordered() calls running in parallel. + + Like commit_ordered(), prepare_ordered() is not guaranteed to be called + in the context of the thread handling the rest of the transaction. So it + cannot invoke code that relies on thread local storage, in particular it + cannot call my_error(). + + When prepare_ordered() is called, the transaction coordinator has already + decided to commit (not rollback) the transaction. So prepare_ordered() + cannot cause a rollback by returning an error, all possible errors must + be handled in prepare() (the prepare_ordered() method returns void). 
In + case of some fatal error, a record of the error must be made internally + by the engine and returned from commit() later. + + Note that for user-level XA SQL commands, no consistent ordering among + prepare_ordered() and commit_ordered() is guaranteed (as that would + require blocking all other commits for an indefinite time). + + When 2-phase commit is not used (eg. only one engine (and no binlog) in + transaction), prepare() is not called and in such cases prepare_ordered() + also is not called. + */ + void (*prepare_ordered)(handlerton *hton, THD *thd, bool all); int (*recover)(handlerton *hton, XID *xid_list, uint len); int (*commit_by_xid)(handlerton *hton, XID *xid); int (*rollback_by_xid)(handlerton *hton, XID *xid); diff --git a/sql/log.cc b/sql/log.cc index f52e68dd1b9..8440a835158 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -38,6 +38,7 @@ #endif #include +#include "debug_sync.h" /* max size of the log message */ #define MAX_LOG_BUFFER_SIZE 1024 @@ -154,9 +155,12 @@ class binlog_trx_data { public: binlog_trx_data() : at_least_one_stmt_committed(0), incident(FALSE), m_pending(0), - before_stmt_pos(MY_OFF_T_UNDEF) + before_stmt_pos(MY_OFF_T_UNDEF), using_xa(0) { trans_log.end_of_file= max_binlog_cache_size; + (void) my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW, + "LOCK_group_commit", MYF(0)); + (void) pthread_cond_init(&COND_group_commit, 0); } ~binlog_trx_data() @@ -208,11 +212,12 @@ public: completely. */ void reset() { - if (!empty()) + if (trans_log.type != WRITE_CACHE || !empty()) truncate(0); before_stmt_pos= MY_OFF_T_UNDEF; incident= FALSE; trans_log.end_of_file= max_binlog_cache_size; + using_xa= FALSE; DBUG_ASSERT(empty()); } @@ -257,6 +262,41 @@ public: Binlog position before the start of the current statement. */ my_off_t before_stmt_pos; + + /* 0 or error when writing to binlog; set during group commit. */ + int error; + /* If error != 0, value of errno (for my_error() reporting). 
*/ + int commit_errno; + /* Link for queueing transactions up for group commit to binlog. */ + binlog_trx_data *next; + /* + Flag set true when group commit for this transaction is finished; used + with pthread_cond_wait() to wait until commit is done. + This flag is protected by LOCK_group_commit. + */ + bool done; + /* + Flag set if this transaction is the group commit leader that will handle + the actual writing to the binlog. + This flag is protected by LOCK_group_commit. + */ + bool group_commit_leader; + /* + Flag set true if this transaction is committed with log_xid() as part of + XA, false if not. + */ + bool using_xa; + /* + Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be + written during group commit. The incident_event is only valid if + has_incident() is true. + */ + Log_event *begin_event; + Log_event *end_event; + Log_event *incident_event; + /* Mutex and condition for wakeup after group commit. */ + pthread_mutex_t LOCK_group_commit; + pthread_cond_t COND_group_commit; }; handlerton *binlog_hton; @@ -1391,117 +1431,188 @@ static int binlog_close_connection(handlerton *hton, THD *thd) return 0; } +/* Helper functions for binlog_flush_trx_cache(). */ +static int +binlog_flush_trx_cache_prepare(THD *thd) +{ + if (thd->binlog_flush_pending_rows_event(TRUE)) + return 1; + return 0; +} + +static void +binlog_flush_trx_cache_finish(THD *thd, binlog_trx_data *trx_data) +{ + IO_CACHE *trans_log= &trx_data->trans_log; + + trx_data->reset(); + + statistic_increment(binlog_cache_use, &LOCK_status); + if (trans_log->disk_writes != 0) + { + statistic_increment(binlog_cache_disk_use, &LOCK_status); + trans_log->disk_writes= 0; + } +} + /* - End a transaction. + End a transaction, writing events to the binary log. 
SYNOPSIS - binlog_end_trans() + binlog_flush_trx_cache() thd The thread whose transaction should be ended trx_data Pointer to the transaction data to use - end_ev The end event to use, or NULL - all True if the entire transaction should be ended, false if - only the statement transaction should be ended. + end_ev The end event to use (COMMIT, ROLLBACK, or commit XID) DESCRIPTION End the currently open transaction. The transaction can be either - a real transaction (if 'all' is true) or a statement transaction - (if 'all' is false). + a real transaction or a statement transaction. - If 'end_ev' is NULL, the transaction is a rollback of only - transactional tables, so the transaction cache will be truncated - to either just before the last opened statement transaction (if - 'all' is false), or reset completely (if 'all' is true). + This can be to commit a transaction, with a COMMIT query event or an XA + commit XID event. But it can also be to rollback a transaction with a + ROLLBACK query event, used for rolling back transactions which also + contain updates to non-transactional tables. */ static int -binlog_end_trans(THD *thd, binlog_trx_data *trx_data, - Log_event *end_ev, bool all) +binlog_flush_trx_cache(THD *thd, binlog_trx_data *trx_data, + Log_event *end_ev) { - DBUG_ENTER("binlog_end_trans"); - int error=0; - IO_CACHE *trans_log= &trx_data->trans_log; - DBUG_PRINT("enter", ("transaction: %s end_ev: 0x%lx", - all ? "all" : "stmt", (long) end_ev)); + DBUG_ENTER("binlog_flush_trx_cache"); DBUG_PRINT("info", ("thd->options={ %s%s}", FLAGSTR(thd->options, OPTION_NOT_AUTOCOMMIT), FLAGSTR(thd->options, OPTION_BEGIN))); + if (binlog_flush_trx_cache_prepare(thd)) + DBUG_RETURN(1); + /* - NULL denotes ROLLBACK with nothing to replicate: i.e., rollback of - only transactional tables. If the transaction contain changes to - any non-transactiona tables, we need write the transaction and log - a ROLLBACK last. 
- */ - if (end_ev != NULL) - { - if (thd->binlog_flush_pending_rows_event(TRUE)) - DBUG_RETURN(1); - /* - Doing a commit or a rollback including non-transactional tables, - i.e., ending a transaction where we might write the transaction - cache to the binary log. + Doing a commit or a rollback including non-transactional tables, + i.e., ending a transaction where we might write the transaction + cache to the binary log. - We can always end the statement when ending a transaction since - transactions are not allowed inside stored functions. If they - were, we would have to ensure that we're not ending a statement - inside a stored function. - */ - error= mysql_bin_log.write(thd, &trx_data->trans_log, end_ev, - trx_data->has_incident()); - trx_data->reset(); + We can always end the statement when ending a transaction since + transactions are not allowed inside stored functions. If they + were, we would have to ensure that we're not ending a statement + inside a stored function. + */ + int error= mysql_bin_log.write_transaction_to_binlog(thd, trx_data, end_ev); - /* - We need to step the table map version after writing the - transaction cache to disk. - */ - mysql_bin_log.update_table_map_version(); - statistic_increment(binlog_cache_use, &LOCK_status); - if (trans_log->disk_writes != 0) - { - statistic_increment(binlog_cache_disk_use, &LOCK_status); - trans_log->disk_writes= 0; - } - } - else - { - /* - If rolling back an entire transaction or a single statement not - inside a transaction, we reset the transaction cache. - - If rolling back a statement in a transaction, we truncate the - transaction cache to remove the statement. 
- */ - thd->binlog_remove_pending_rows_event(TRUE); - if (all || !(thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT))) - { - if (trx_data->has_incident()) - error= mysql_bin_log.write_incident(thd, TRUE); - trx_data->reset(); - } - else // ...statement - trx_data->truncate(trx_data->before_stmt_pos); - - /* - We need to step the table map version on a rollback to ensure - that a new table map event is generated instead of the one that - was written to the thrown-away transaction cache. - */ - mysql_bin_log.update_table_map_version(); - } + binlog_flush_trx_cache_finish(thd, trx_data); DBUG_ASSERT(thd->binlog_get_pending_rows_event() == NULL); DBUG_RETURN(error); } +/* + Discard a transaction, ie. ROLLBACK with only transactional table updates. + + SYNOPSIS + binlog_truncate_trx_cache() + + thd The thread whose transaction should be ended + trx_data Pointer to the transaction data to use + all True if the entire transaction should be ended, false if + only the statement transaction should be ended. + + DESCRIPTION + + Rollback (and end) a transaction that only modifies transactional + tables. The transaction can be either a real transaction (if 'all' is + true) or a statement transaction (if 'all' is false). + + The transaction cache will be truncated to either just before the last + opened statement transaction (if 'all' is false), or reset completely (if + 'all' is true). + */ +static int +binlog_truncate_trx_cache(THD *thd, binlog_trx_data *trx_data, bool all) +{ + DBUG_ENTER("binlog_truncate_trx_cache"); + int error= 0; + DBUG_PRINT("enter", ("transaction: %s", all ? "all" : "stmt")); + DBUG_PRINT("info", ("thd->options={ %s%s}", + FLAGSTR(thd->options, OPTION_NOT_AUTOCOMMIT), + FLAGSTR(thd->options, OPTION_BEGIN))); + + /* + ROLLBACK with nothing to replicate: i.e., rollback of only transactional + tables. + */ + + /* + If rolling back an entire transaction or a single statement not + inside a transaction, we reset the transaction cache. 
+ + If rolling back a statement in a transaction, we truncate the + transaction cache to remove the statement. + */ + thd->binlog_remove_pending_rows_event(TRUE); + if (all || !(thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT))) + { + if (trx_data->has_incident()) + error= mysql_bin_log.write_incident(thd); + trx_data->reset(); + } + else // ...statement + trx_data->truncate(trx_data->before_stmt_pos); + + DBUG_ASSERT(thd->binlog_get_pending_rows_event() == NULL); + DBUG_RETURN(error); +} + +static LEX_STRING const write_error_msg= + { C_STRING_WITH_LEN("error writing to the binary log") }; + static int binlog_prepare(handlerton *hton, THD *thd, bool all) { /* - do nothing. - just pretend we can do 2pc, so that MySQL won't - switch to 1pc. - real work will be done in MYSQL_BIN_LOG::log_xid() + If this prepare is for a single statement in the middle of a transactions, + not the actual transaction commit, then we do nothing. The real work is + only done later, in the prepare for making persistent changes. */ + if (!all && (thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT))) + return 0; + + binlog_trx_data *trx_data= + (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton); + + trx_data->using_xa= TRUE; + + if (binlog_flush_trx_cache_prepare(thd)) + return 1; + + my_xid xid= thd->transaction.xid_state.xid.get_my_xid(); + if (!xid) + { + /* Skip logging this transaction, marked by setting end_event to NULL. */ + trx_data->end_event= NULL; + return 0; + } + + /* + Allocate the extra events that will be logged to the binlog in binlog group + commit. Use placement new to allocate them on the THD memroot, as they need + to remain live until log_xid() returns. 
+ */ + size_t needed_size= sizeof(Query_log_event) + sizeof(Xid_log_event); + if (trx_data->has_incident()) + needed_size+= sizeof(Incident_log_event); + uchar *mem= (uchar *)thd->alloc(needed_size); + if (!mem) + return 1; + + trx_data->begin_event= new ((void *)mem) + Query_log_event(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0); + mem+= sizeof(Query_log_event); + + trx_data->end_event= new ((void *)mem) Xid_log_event(thd, xid); + + if (trx_data->has_incident()) + trx_data->incident_event= new ((void *)(mem + sizeof(Xid_log_event))) + Incident_log_event(thd, INCIDENT_LOST_EVENTS, write_error_msg); + return 0; } @@ -1525,11 +1636,11 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all) binlog_trx_data *const trx_data= (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton); - if (trx_data->empty()) + if (trx_data->using_xa) { // we're here because trans_log was flushed in MYSQL_BIN_LOG::log_xid() - trx_data->reset(); - DBUG_RETURN(0); + binlog_flush_trx_cache_finish(thd, trx_data); + DBUG_RETURN(error); } /* @@ -1556,8 +1667,8 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all) !stmt_has_updated_trans_table(thd) && thd->transaction.stmt.modified_non_trans_table)) { - Query_log_event qev(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0); - error= binlog_end_trans(thd, trx_data, &qev, all); + Query_log_event end_ev(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0); + error= binlog_flush_trx_cache(thd, trx_data, &end_ev); } trx_data->at_least_one_stmt_committed = my_b_tell(&trx_data->trans_log) > 0; @@ -1621,7 +1732,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all) (thd->options & OPTION_KEEP_LOG)) && mysql_bin_log.check_write_error(thd)) trx_data->set_incident(); - error= binlog_end_trans(thd, trx_data, 0, all); + error= binlog_truncate_trx_cache(thd, trx_data, all); } else { @@ -1641,8 +1752,8 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all) thd->current_stmt_binlog_row_based) || ((thd->options & 
OPTION_KEEP_LOG))) { - Query_log_event qev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, TRUE, 0); - error= binlog_end_trans(thd, trx_data, &qev, all); + Query_log_event end_ev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, TRUE, 0); + error= binlog_flush_trx_cache(thd, trx_data, &end_ev); } /* Otherwise, we simply truncate the cache as there is no change on @@ -1650,7 +1761,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all) */ else if ((all && !thd->transaction.all.modified_non_trans_table) || (!all && !thd->transaction.stmt.modified_non_trans_table)) - error= binlog_end_trans(thd, trx_data, 0, all); + error= binlog_truncate_trx_cache(thd, trx_data, all); } if (!all) trx_data->before_stmt_pos = MY_OFF_T_UNDEF; // part of the stmt rollback @@ -2464,7 +2575,7 @@ const char *MYSQL_LOG::generate_name(const char *log_name, MYSQL_BIN_LOG::MYSQL_BIN_LOG() :bytes_written(0), prepared_xids(0), file_id(1), open_count(1), - need_start_event(TRUE), m_table_map_version(0), + need_start_event(TRUE), is_relay_log(0), description_event_for_exec(0), description_event_for_queue(0) { @@ -2492,6 +2603,7 @@ void MYSQL_BIN_LOG::cleanup() delete description_event_for_exec; (void) pthread_mutex_destroy(&LOCK_log); (void) pthread_mutex_destroy(&LOCK_index); + (void) pthread_mutex_destroy(&LOCK_queue); (void) pthread_cond_destroy(&update_cond); } DBUG_VOID_RETURN; @@ -2520,6 +2632,8 @@ void MYSQL_BIN_LOG::init_pthread_objects() */ (void) my_pthread_mutex_init(&LOCK_index, MY_MUTEX_INIT_SLOW, "LOCK_index", MYF_NO_DEADLOCK_DETECTION); + (void) my_pthread_mutex_init(&LOCK_queue, MY_MUTEX_INIT_FAST, "LOCK_queue", + MYF(0)); (void) pthread_cond_init(&update_cond, 0); } @@ -3943,6 +4057,10 @@ err: } +#ifndef DBUG_OFF +static ulong opt_binlog_dbug_fsync_sleep= 0; +#endif + bool MYSQL_BIN_LOG::flush_and_sync() { int err=0, fd=log_file.file; @@ -3953,6 +4071,11 @@ bool MYSQL_BIN_LOG::flush_and_sync() { sync_binlog_counter= 0; err=my_sync(fd, MYF(MY_WME)); +#ifndef DBUG_OFF + ulong usec_sleep= 
opt_binlog_dbug_fsync_sleep; + if (usec_sleep > 0) + my_sleep(usec_sleep); +#endif } return err; } @@ -4113,7 +4236,6 @@ int THD::binlog_write_table_map(TABLE *table, bool is_trans) DBUG_RETURN(error); binlog_table_maps++; - table->s->table_map_version= mysql_bin_log.table_map_version(); DBUG_RETURN(0); } @@ -4194,64 +4316,41 @@ MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd, if (Rows_log_event* pending= trx_data->pending()) { - IO_CACHE *file= &log_file; - /* Decide if we should write to the log file directly or to the transaction log. */ if (pending->get_cache_stmt() || my_b_tell(&trx_data->trans_log)) - file= &trx_data->trans_log; - - /* - If we are writing to the log file directly, we could avoid - locking the log. This does not work since we need to step the - m_table_map_version below, and that change has to be protected - by the LOCK_log mutex. - */ - pthread_mutex_lock(&LOCK_log); - - /* - Write pending event to log file or transaction cache - */ - if (pending->write(file)) { - pthread_mutex_unlock(&LOCK_log); - set_write_error(thd); - DBUG_RETURN(1); + /* Write to transaction log/cache. */ + if (pending->write(&trx_data->trans_log)) + { + set_write_error(thd); + DBUG_RETURN(1); + } } - - /* - We step the table map version if we are writing an event - representing the end of a statement. We do this regardless of - wheather we write to the transaction cache or to directly to the - file. - - In an ideal world, we could avoid stepping the table map version - if we were writing to a transaction cache, since we could then - reuse the table map that was written earlier in the transaction - cache. This does not work since STMT_END_F implies closing all - table mappings on the slave side. - - TODO: Find a solution so that table maps does not have to be - written several times within a transaction. 
- */ - if (pending->get_flags(Rows_log_event::STMT_END_F)) - ++m_table_map_version; - - delete pending; - - if (file == &log_file) + else { + /* Write directly to log file. */ + pthread_mutex_lock(&LOCK_log); + if (pending->write(&log_file)) + { + pthread_mutex_unlock(&LOCK_log); + set_write_error(thd); + DBUG_RETURN(1); + } + error= flush_and_sync(); if (!error) { signal_update(); rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED); } + + pthread_mutex_unlock(&LOCK_log); } - pthread_mutex_unlock(&LOCK_log); + delete pending; } thd->binlog_set_pending_rows_event(event); @@ -4450,9 +4549,6 @@ err: set_write_error(thd); } - if (event_info->flags & LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F) - ++m_table_map_version; - pthread_mutex_unlock(&LOCK_log); DBUG_RETURN(error); } @@ -4575,18 +4671,14 @@ uint MYSQL_BIN_LOG::next_file_id() SYNOPSIS write_cache() cache Cache to write to the binary log - lock_log True if the LOCK_log mutex should be aquired, false otherwise - sync_log True if the log should be flushed and sync:ed DESCRIPTION Write the contents of the cache to the binary log. The cache will be reset as a READ_CACHE to be able to read the contents from it. */ -int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache, bool lock_log, bool sync_log) +int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache) { - Mutex_sentry sentry(lock_log ? 
&LOCK_log : NULL); - if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0)) return ER_ERROR_ON_WRITE; uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs; @@ -4697,6 +4789,7 @@ int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache, bool lock_log, bool sync_log) } /* Write data to the binary log file */ + DBUG_EXECUTE_IF("fail_binlog_write_1", return ER_ERROR_ON_WRITE;); if (my_b_write(&log_file, cache->read_pos, length)) return ER_ERROR_ON_WRITE; cache->read_pos=cache->read_end; // Mark buffer used up @@ -4704,9 +4797,6 @@ int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache, bool lock_log, bool sync_log) DBUG_ASSERT(carry == 0); - if (sync_log) - flush_and_sync(); - return 0; // All OK } @@ -4739,26 +4829,22 @@ int query_error_code(THD *thd, bool not_killed) return error; } -bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock) +bool MYSQL_BIN_LOG::write_incident(THD *thd) { uint error= 0; DBUG_ENTER("MYSQL_BIN_LOG::write_incident"); - LEX_STRING const write_error_msg= - { C_STRING_WITH_LEN("error writing to the binary log") }; Incident incident= INCIDENT_LOST_EVENTS; Incident_log_event ev(thd, incident, write_error_msg); - if (lock) - pthread_mutex_lock(&LOCK_log); + + pthread_mutex_lock(&LOCK_log); error= ev.write(&log_file); - if (lock) + if (!error && !(error= flush_and_sync())) { - if (!error && !(error= flush_and_sync())) - { - signal_update(); - rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED); - } - pthread_mutex_unlock(&LOCK_log); + signal_update(); + rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED); } + pthread_mutex_unlock(&LOCK_log); + DBUG_RETURN(error); } @@ -4786,103 +4872,366 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock) 'cache' needs to be reinitialized after this functions returns. 
*/ -bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event, - bool incident) +bool +MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data, + Log_event *end_ev) { - DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)"); + DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog"); + + /* + Create the necessary events here, where we have the correct THD (and + thread context). + + Due to group commit the actual writing to binlog may happen in a different + thread. + */ + Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0); + trx_data->begin_event= &qinfo; + trx_data->end_event= end_ev; + if (trx_data->has_incident()) + { + Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg); + trx_data->incident_event= &inc_ev; + DBUG_RETURN(write_transaction_to_binlog_events(trx_data)); + } + else + { + trx_data->incident_event= NULL; + DBUG_RETURN(write_transaction_to_binlog_events(trx_data)); + } +} + +bool +MYSQL_BIN_LOG::write_transaction_to_binlog_events(binlog_trx_data *trx_data) +{ + /* + To facilitate group commit for the binlog, we first queue up ourselves in + the group commit queue. Then the first thread to enter the queue waits for + the LOCK_log mutex, and commits for everyone in the queue once it gets the + lock. Any other threads in the queue just wait for the first one to finish + the commit and wake them up. + */ + + pthread_mutex_lock(&trx_data->LOCK_group_commit); + const binlog_trx_data *orig_queue= atomic_enqueue_trx(trx_data); + + if (orig_queue != NULL) + { + trx_data->group_commit_leader= FALSE; + trx_data->done= FALSE; + trx_group_commit_participant(trx_data); + } + else + { + trx_data->group_commit_leader= TRUE; + pthread_mutex_unlock(&trx_data->LOCK_group_commit); + trx_group_commit_leader(NULL); + } + + return trx_group_commit_finish(trx_data); +} + +/* + Participate as secondary transaction in group commit. 
+ + Another thread is already waiting to obtain the LOCK_log, and should include + this thread in the group commit once the log is obtained. So here we put + ourselves in the queue and wait to be signalled that the group commit is done. + + Note that this function must be called with the trx_data->LOCK_group_commit + locked; the mutex will be released before return. +*/ +void +MYSQL_BIN_LOG::trx_group_commit_participant(binlog_trx_data *trx_data) +{ + safe_mutex_assert_owner(&trx_data->LOCK_group_commit); + + /* Wait until trx_data.done == true and woken up by the leader. */ + while (!trx_data->done) + pthread_cond_wait(&trx_data->COND_group_commit, + &trx_data->LOCK_group_commit); + pthread_mutex_unlock(&trx_data->LOCK_group_commit); +} + +bool +MYSQL_BIN_LOG::trx_group_commit_finish(binlog_trx_data *trx_data) +{ + DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_finish"); + DBUG_PRINT("info", ("trx_data->error=%d\n", trx_data->error)); + if (trx_data->error) + { + switch (trx_data->error) + { + case ER_ERROR_ON_WRITE: + my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, trx_data->commit_errno); + break; + case ER_ERROR_ON_READ: + my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH), + trx_data->trans_log.file_name, trx_data->commit_errno); + break; + default: + /* + There are not (and should not be) any errors thrown not covered above. + But just in case one is added later without updating the above switch + statement, include a catch-all. + */ + my_printf_error(trx_data->error, + "Error writing transaction to binary log: %d", + MYF(ME_NOREFRESH), trx_data->error); + } + + /* + Since we return error, this transaction XID will not be committed, so + we need to mark it as not needed for recovery (unlog() is not called + for a transaction if log_xid() fails). + */ + if (trx_data->end_event->get_type_code() == XID_EVENT) + mark_xid_done(); + + DBUG_RETURN(1); + } + + DBUG_RETURN(0); +} + +/* + Do binlog group commit as the lead thread.
+ + This must be called when this thread/transaction is queued at the start of + the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group + commit all the transactions in the queue (more may have entered while waiting + for LOCK_log). After commit is done, all other threads in the queue will be + signalled. + + */ +void +MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) +{ + uint xid_count= 0; + uint write_count= 0; + + /* First, put anything from group_log_xid into the queue. */ + binlog_trx_data *full_queue= NULL; + binlog_trx_data **next_ptr= &full_queue; + for (TC_group_commit_entry *entry= first; entry; entry= entry->next) + { + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton); + + /* Skip log_xid for transactions without xid, marked by NULL end_event. */ + if (!trx_data->end_event) + continue; + + trx_data->error= 0; + *next_ptr= trx_data; + next_ptr= &(trx_data->next); + } + + /* + Next, lock the LOCK_log(), and once we get it, add any additional writes + that queued up while we were waiting. + + Note that if some writer not going through log_xid() comes in and gets the + LOCK_log before us, they will not be able to include us in their group + commit (and they are not able to handle ensuring same commit order between + us and participating transactional storage engines anyway). + + On the other hand, when we get the LOCK_log, we will be able to include + any non-transactional writes that queued up in our group commit. This + should hopefully not be too big of a problem, as group commit is most + important for the transactional case anyway when durability (fsync) is + enabled. + */ VOID(pthread_mutex_lock(&LOCK_log)); - /* NULL would represent nothing to replicate after ROLLBACK */ - DBUG_ASSERT(commit_event != NULL); + /* + As the queue is in reverse order of entering, reverse the queue as we add + it to the existing one.
Note that there is no ordering defined between + transactional and non-transactional commits. + */ + binlog_trx_data *current= atomic_grab_trx_queue(); + binlog_trx_data *xtra_queue= NULL; + while (current) + { + current->error= 0; + binlog_trx_data *next= current->next; + current->next= xtra_queue; + xtra_queue= current; + current= next; + } + *next_ptr= xtra_queue; + /* + Now we have in full_queue the list of transactions to be committed in + order. + */ DBUG_ASSERT(is_open()); if (likely(is_open())) // Should always be true { /* - We only bother to write to the binary log if there is anything - to write. - */ - if (my_b_tell(cache) > 0) + Commit every transaction in the queue. + + Note that we are doing this in a different thread than the one running + the transaction! So we are limited in the operations we can do. In + particular, we cannot call my_error() on behalf of a transaction, as + that obtains the THD from thread local storage. Instead, we must set + current->error and let the thread do the error reporting itself once + we wake it up. + */ + for (current= full_queue; current != NULL; current= current->next) { - /* - Log "BEGIN" at the beginning of every transaction. Here, a - transaction is either a BEGIN..COMMIT block or a single - statement in autocommit mode. - */ - Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0); + IO_CACHE *cache= &current->trans_log; /* - Now this Query_log_event has artificial log_pos 0. It must be - adjusted to reflect the real position in the log. Not doing it - would confuse the slave: it would prevent this one from - knowing where he is in the master's binlog, which would result - in wrong positions being shown to the user, MASTER_POS_WAIT - undue waiting etc. + We only bother to write to the binary log if there is anything + to write.
*/ - if (qinfo.write(&log_file)) - goto err; - - DBUG_EXECUTE_IF("crash_before_writing_xid", - { - if ((write_error= write_cache(cache, false, true))) - DBUG_PRINT("info", ("error writing binlog cache: %d", - write_error)); - DBUG_PRINT("info", ("crashing before writing xid")); - abort(); - }); - - if ((write_error= write_cache(cache, false, false))) - goto err; - - if (commit_event && commit_event->write(&log_file)) - goto err; - - if (incident && write_incident(thd, FALSE)) - goto err; - - if (flush_and_sync()) - goto err; - DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_ABORT();); - if (cache->error) // Error on read + if (my_b_tell(cache) > 0) { - sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno); - write_error=1; // Don't give more errors - goto err; + current->error= write_transaction(current); + if (current->error) + current->commit_errno= errno; + + write_count++; + } + + if (current->end_event->get_type_code() == XID_EVENT) + xid_count++; + } + + if (write_count > 0) + { + if (flush_and_sync()) + { + for (current= full_queue; current != NULL; current= current->next) + { + if (!current->error) + { + current->error= ER_ERROR_ON_WRITE; + current->commit_errno= errno; + } + } + } + else + { + signal_update(); } - signal_update(); } /* - if commit_event is Xid_log_event, increase the number of + if any commit_events are Xid_log_event, increase the number of prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated if there're prepared xids in it - see the comment in new_file() for an explanation. - If the commit_event is not Xid_log_event (then it's a Query_log_event) - rotate binlog, if necessary. + If no Xid_log_events (then it's all Query_log_event) rotate binlog, + if necessary. 
- */ - if (commit_event && commit_event->get_type_code() == XID_EVENT) + if (xid_count > 0) { - pthread_mutex_lock(&LOCK_prep_xids); - prepared_xids++; - pthread_mutex_unlock(&LOCK_prep_xids); + mark_xids_active(xid_count); } else rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED); } + VOID(pthread_mutex_unlock(&LOCK_log)); - DBUG_RETURN(0); + /* + Signal those that are not part of group_log_xid, and are not group leaders + running the queue. -err: - if (!write_error) + Since a group leader runs the queue itself if a group_log_xid does not get + to do it first, such leader threads do not need wait or wakeup. + */ + for (current= xtra_queue; current != NULL; current= current->next) { - write_error= 1; - sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno); + /* + Note that we need to take LOCK_group_commit even in the case of a leader! + + Otherwise there is a race between setting and testing the + group_commit_leader flag. + */ + pthread_mutex_lock(&current->LOCK_group_commit); + if (!current->group_commit_leader) + { + current->done= true; + pthread_cond_signal(&current->COND_group_commit); + } + pthread_mutex_unlock(&current->LOCK_group_commit); } - VOID(pthread_mutex_unlock(&LOCK_log)); - DBUG_RETURN(1); } +int +MYSQL_BIN_LOG::write_transaction(binlog_trx_data *trx_data) +{ + IO_CACHE *cache= &trx_data->trans_log; + /* + Log "BEGIN" at the beginning of every transaction. Here, a transaction is + either a BEGIN..COMMIT block or a single statement in autocommit mode. The + event was constructed in write_transaction_to_binlog(), in the thread + running the transaction. + + Now this Query_log_event has artificial log_pos 0. It must be + adjusted to reflect the real position in the log. Not doing it + would confuse the slave: it would prevent this one from + knowing where he is in the master's binlog, which would result + in wrong positions being shown to the user, MASTER_POS_WAIT + undue waiting etc.
+ */ + if (trx_data->begin_event->write(&log_file)) + return ER_ERROR_ON_WRITE; + + DBUG_EXECUTE_IF("crash_before_writing_xid", + { + if ((write_cache(cache))) + DBUG_PRINT("info", ("error writing binlog cache")); + else + flush_and_sync(); + + DBUG_PRINT("info", ("crashing before writing xid")); + abort(); + }); + + if (write_cache(cache)) + return ER_ERROR_ON_WRITE; + + if (trx_data->end_event->write(&log_file)) + return ER_ERROR_ON_WRITE; + + if (trx_data->has_incident() && trx_data->incident_event->write(&log_file)) + return ER_ERROR_ON_WRITE; + + if (cache->error) // Error on read + return ER_ERROR_ON_READ; + + return 0; +} + +binlog_trx_data * +MYSQL_BIN_LOG::atomic_enqueue_trx(binlog_trx_data *trx_data) +{ + my_atomic_rwlock_wrlock(&LOCK_queue); + trx_data->next= group_commit_queue; + while (!my_atomic_casptr((void **)(&group_commit_queue), + (void **)(&trx_data->next), + trx_data)) + ; + my_atomic_rwlock_wrunlock(&LOCK_queue); + return trx_data->next; +} + +binlog_trx_data * +MYSQL_BIN_LOG::atomic_grab_trx_queue() +{ + my_atomic_rwlock_wrlock(&LOCK_queue); + binlog_trx_data *queue= group_commit_queue; + while (!my_atomic_casptr((void **)(&group_commit_queue), + (void **)(&queue), + NULL)) + ; + my_atomic_rwlock_wrunlock(&LOCK_queue); + return queue; +} /** Wait until we get a signal that the binary log has been updated. @@ -5276,6 +5625,344 @@ void sql_print_information(const char *format, ...) 
} +static my_bool mutexes_inited; +pthread_mutex_t LOCK_prepare_ordered; +pthread_mutex_t LOCK_commit_ordered; + +void +TC_init() +{ + my_pthread_mutex_init(&LOCK_prepare_ordered, MY_MUTEX_INIT_SLOW, + "LOCK_prepare_ordered", MYF(0)); + my_pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_SLOW, + "LOCK_commit_ordered", MYF(0)); + mutexes_inited= TRUE; +} + +void +TC_destroy() +{ + if (mutexes_inited) + { + pthread_mutex_destroy(&LOCK_prepare_ordered); + pthread_mutex_destroy(&LOCK_commit_ordered); + mutexes_inited= FALSE; + } +} + +void +TC_LOG::run_prepare_ordered(THD *thd, bool all) +{ + Ha_trx_info *ha_info= + all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list; + + for (; ha_info; ha_info= ha_info->next()) + { + handlerton *ht= ha_info->ht(); + if (!ht->prepare_ordered) + continue; + safe_mutex_assert_owner(&LOCK_prepare_ordered); + ht->prepare_ordered(ht, thd, all); + } +} + +void +TC_LOG::run_commit_ordered(THD *thd, bool all) +{ + Ha_trx_info *ha_info= + all ? 
thd->transaction.all.ha_list : thd->transaction.stmt.ha_list; + + for (; ha_info; ha_info= ha_info->next()) + { + handlerton *ht= ha_info->ht(); + if (!ht->commit_ordered) + continue; + safe_mutex_assert_owner(&LOCK_commit_ordered); + ht->commit_ordered(ht, thd, all); + DEBUG_SYNC(thd, "commit_after_run_commit_ordered"); + } +} + +TC_LOG_queued::TC_LOG_queued() : group_commit_queue(NULL) +{ +} + +TC_LOG_queued::~TC_LOG_queued() +{ +} + +TC_LOG_queued::TC_group_commit_entry * +TC_LOG_queued::reverse_queue(TC_LOG_queued::TC_group_commit_entry *queue) +{ + TC_group_commit_entry *entry= queue; + TC_group_commit_entry *prev= NULL; + while (entry) + { + TC_group_commit_entry *next= entry->next; + entry->next= prev; + prev= entry; + entry= next; + } + + return prev; +} + +void +TC_LOG_queued::group_commit_wait_for_wakeup(TC_group_commit_entry *entry) +{ + THD *thd= entry->thd; + pthread_mutex_lock(&thd->LOCK_commit_ordered); + while (!entry->group_commit_ready) + pthread_cond_wait(&thd->COND_commit_ordered, + &thd->LOCK_commit_ordered); + pthread_mutex_unlock(&thd->LOCK_commit_ordered); +} + +void +TC_LOG_queued::group_commit_wakeup_other(TC_group_commit_entry *other) +{ + THD *thd= other->thd; + pthread_mutex_lock(&thd->LOCK_commit_ordered); + other->group_commit_ready= TRUE; + pthread_cond_signal(&thd->COND_commit_ordered); + pthread_mutex_unlock(&thd->LOCK_commit_ordered); +} + +TC_LOG_unordered::TC_LOG_unordered() : group_commit_queue_busy(0) +{ + pthread_cond_init(&COND_queue_busy, 0); +} + +TC_LOG_unordered::~TC_LOG_unordered() +{ + pthread_cond_destroy(&COND_queue_busy); +} + +int TC_LOG_unordered::log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, + bool need_commit_ordered) +{ + int cookie; + struct TC_group_commit_entry entry; + bool is_group_commit_leader; + LINT_INIT(is_group_commit_leader); + + if (need_prepare_ordered) + { + pthread_mutex_lock(&LOCK_prepare_ordered); + run_prepare_ordered(thd, all); + if (need_commit_ordered) + { + /* 
+ Must put us in queue so we can run_commit_ordered() in same sequence + as we did run_prepare_ordered(). + */ + entry.thd= thd; + entry.group_commit_ready= false; + TC_group_commit_entry *previous_queue= group_commit_queue; + entry.next= previous_queue; + group_commit_queue= &entry; + is_group_commit_leader= (previous_queue == NULL); + } + pthread_mutex_unlock(&LOCK_prepare_ordered); + } + + if (xid) + cookie= log_xid(thd, xid); + else + cookie= 0; + + if (need_commit_ordered) + { + if (need_prepare_ordered) + { + /* + We did the run_prepare_ordered() serialised, then ran the log_xid() in + parallel. Now we have to do run_commit_ordered() serialised in the + same sequence as run_prepare_ordered(). + + We do this starting from the head of the queue, each thread doing + run_commit_ordered() and signalling the next in queue. + */ + if (is_group_commit_leader) + { + /* The first in queue starts the ball rolling. */ + pthread_mutex_lock(&LOCK_prepare_ordered); + while (group_commit_queue_busy) + pthread_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered); + TC_group_commit_entry *queue= group_commit_queue; + group_commit_queue= NULL; + /* + Mark the queue busy while we bounce it from one thread to the + next. + */ + group_commit_queue_busy= TRUE; + pthread_mutex_unlock(&LOCK_prepare_ordered); + + queue= reverse_queue(queue); + DBUG_ASSERT(queue == &entry && queue->thd == thd); + } + else + { + /* Not first in queue; just wait until previous thread wakes us up. */ + group_commit_wait_for_wakeup(&entry); + } + } + + /* Only run commit_ordered() if log_xid was successful. 
*/ + if (cookie) + { + pthread_mutex_lock(&LOCK_commit_ordered); + run_commit_ordered(thd, all); + pthread_mutex_unlock(&LOCK_commit_ordered); + } + + if (need_prepare_ordered) + { + TC_group_commit_entry *next= entry.next; + if (next) + { + group_commit_wakeup_other(next); + } + else + { + pthread_mutex_lock(&LOCK_prepare_ordered); + group_commit_queue_busy= FALSE; + pthread_cond_signal(&COND_queue_busy); + pthread_mutex_unlock(&LOCK_prepare_ordered); + } + } + } + + return cookie; +} + + +TC_LOG_group_commit::TC_LOG_group_commit() + : num_commits(0), num_group_commits(0) +{ + my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW, + "LOCK_group_commit", MYF(0)); +} + +TC_LOG_group_commit::~TC_LOG_group_commit() +{ + pthread_mutex_destroy(&LOCK_group_commit); +} + +int TC_LOG_group_commit::log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, + bool need_commit_ordered) +{ + IF_DBUG(int err;) + int cookie; + struct TC_group_commit_entry entry; + bool is_group_commit_leader; + + entry.thd= thd; + entry.all= all; + entry.group_commit_ready= false; + entry.xid_error= 0; + + pthread_mutex_lock(&LOCK_prepare_ordered); + TC_group_commit_entry *previous_queue= group_commit_queue; + entry.next= previous_queue; + group_commit_queue= &entry; + + DEBUG_SYNC(thd, "commit_before_prepare_ordered"); + run_prepare_ordered(thd, all); + DEBUG_SYNC(thd, "commit_after_prepare_ordered"); + pthread_mutex_unlock(&LOCK_prepare_ordered); + + is_group_commit_leader= (previous_queue == NULL); + + if (is_group_commit_leader) + { + TC_group_commit_entry *current; + + pthread_mutex_lock(&LOCK_group_commit); + DEBUG_SYNC(thd, "commit_after_get_LOCK_group_commit"); + + pthread_mutex_lock(&LOCK_prepare_ordered); + TC_group_commit_entry *queue= group_commit_queue; + group_commit_queue= NULL; + pthread_mutex_unlock(&LOCK_prepare_ordered); + + /* + Since we enqueue at the head, the queue is actually in reverse order. 
+ So reverse it back into correct commit order before returning. + */ + queue= reverse_queue(queue); + + /* The first in the queue is the leader. */ + DBUG_ASSERT(queue == &entry && queue->thd == thd); + + DEBUG_SYNC(thd, "commit_before_group_log_xid"); + /* This will set individual error codes in each thd->xid_error. */ + group_log_xid(queue); + DEBUG_SYNC(thd, "commit_after_group_log_xid"); + + /* + Call commit_ordered methods for all transactions in the queue + (that did not get an error in group_log_xid()). + + We do this under an additional global LOCK_commit_ordered; this is + so that transactions that do not need 2-phase commit do not have + to wait for the potentially long duration of LOCK_group_commit. + */ + current= queue; + + DEBUG_SYNC(thd, "commit_before_get_LOCK_commit_ordered"); + pthread_mutex_lock(&LOCK_commit_ordered); + /* + We cannot unlock LOCK_group_commit until we have locked + LOCK_commit_ordered; otherwise scheduling could allow the next + group commit to run ahead of us, messing up the order of + commit_ordered() calls. But as soon as LOCK_commit_ordered is + obtained, we can let the next group commit start. + */ + pthread_mutex_unlock(&LOCK_group_commit); + DEBUG_SYNC(thd, "commit_after_release_LOCK_group_commit"); + + ++num_group_commits; + do + { + ++num_commits; + if (!current->xid_error) + run_commit_ordered(current->thd, current->all); + + /* + Careful not to access current->next_commit_ordered after waking up + the other thread! As it may change immediately after wakeup. + */ + TC_group_commit_entry *next= current->next; + if (current != &entry) // Don't wake up ourself + group_commit_wakeup_other(current); + current= next; + } while (current != NULL); + DEBUG_SYNC(thd, "commit_after_group_run_commit_ordered"); + + pthread_mutex_unlock(&LOCK_commit_ordered); + } + else + { + /* If not leader, just wait until leader wakes us up. 
*/ + group_commit_wait_for_wakeup(&entry); + } + + /* + Now that we're back in our own thread context, do any delayed processing + and error reporting. + */ + IF_DBUG(err= entry.xid_error;) + cookie= xid_log_after(&entry); + /* The cookie must be non-zero in the non-error case. */ + DBUG_ASSERT(err || cookie); + + return cookie; +} + + /********* transaction coordinator log for 2pc - mmap() based solution *******/ /* @@ -5878,30 +6565,68 @@ void TC_LOG_BINLOG::close() pthread_cond_destroy (&COND_prep_xids); } -/** - @todo - group commit - - @retval - 0 error - @retval - 1 success +/* + Do a binlog log_xid() for a group of transactions, linked through + thd->next_commit_ordered. */ -int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid) +void +TC_LOG_BINLOG::group_log_xid(TC_group_commit_entry *first) { - DBUG_ENTER("TC_LOG_BINLOG::log"); - Xid_log_event xle(thd, xid); - binlog_trx_data *trx_data= - (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton); - /* - We always commit the entire transaction when writing an XID. Also - note that the return value is inverted. - */ - DBUG_RETURN(!binlog_end_trans(thd, trx_data, &xle, TRUE)); + DBUG_ENTER("TC_LOG_BINLOG::group_log_xid"); + trx_group_commit_leader(first); + for (TC_group_commit_entry *entry= first; entry; entry= entry->next) + { + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton); + entry->xid_error= trx_data->error; + } + DBUG_VOID_RETURN; } -void TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid) +int +TC_LOG_BINLOG::xid_log_after(TC_group_commit_entry *entry) { + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton); + if (trx_group_commit_finish(trx_data)) + return 0; // Returning zero cookie signals error + else + return 1; +} + +/* + After an XID is logged, we need to hold on to the current binlog file until + it is fully committed in the storage engine. 
The reason is that crash + recovery only looks at the latest binlog, so we must make sure there are no + outstanding prepared (but not committed) transactions before rotating the + binlog. + + To handle this, we keep a count of outstanding XIDs. This function is used + to increase this count when committing one or more transactions to the + binary log. +*/ +void +TC_LOG_BINLOG::mark_xids_active(uint xid_count) +{ + DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active"); + DBUG_PRINT("info", ("xid_count=%u", xid_count)); + pthread_mutex_lock(&LOCK_prep_xids); + prepared_xids+= xid_count; + pthread_mutex_unlock(&LOCK_prep_xids); + DBUG_VOID_RETURN; +} + +/* + Once an XID is committed, it is safe to rotate the binary log, as it can no + longer be needed during crash recovery. + + This function is called to mark an XID this way. It needs to decrease the + count of pending XIDs, and signal the log rotator thread when it reaches zero. +*/ +void +TC_LOG_BINLOG::mark_xid_done() +{ + DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done"); pthread_mutex_lock(&LOCK_prep_xids); DBUG_ASSERT(prepared_xids > 0); if (--prepared_xids == 0) { @@ -5909,7 +6634,16 @@ void TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid) pthread_cond_signal(&COND_prep_xids); } pthread_mutex_unlock(&LOCK_prep_xids); - rotate_and_purge(0); // as ::write() did not rotate + DBUG_VOID_RETURN; +} + +void TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid) +{ + DBUG_ENTER("TC_LOG_BINLOG::unlog"); + if (xid) + mark_xid_done(); + rotate_and_purge(0); // as ::write_transaction_to_binlog() did not rotate + DBUG_VOID_RETURN; } int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle) @@ -5981,6 +6715,72 @@ ulonglong mysql_bin_log_file_pos(void) #endif /* INNODB_COMPATIBILITY_HOOKS */ +static ulonglong binlog_status_var_num_commits; +static ulonglong binlog_status_var_num_group_commits; + +static SHOW_VAR binlog_status_vars_detail[]= +{ + {"commits", + (char *)&binlog_status_var_num_commits, SHOW_LONGLONG}, + 
{"group_commits", + (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} +}; + +static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff) +{ + mysql_bin_log.set_status_variables(); + var->type= SHOW_ARRAY; + var->value= (char *)&binlog_status_vars_detail; + return 0; +} + +static SHOW_VAR binlog_status_vars_top[]= { + {"binlog", (char *) &show_binlog_vars, SHOW_FUNC}, + {NullS, NullS, SHOW_LONG} +}; + +#ifndef DBUG_OFF +static MYSQL_SYSVAR_ULONG( + dbug_fsync_sleep, + opt_binlog_dbug_fsync_sleep, + PLUGIN_VAR_RQCMDARG, + "Extra sleep (in microseconds) to add to binlog fsync(), for debugging", + NULL, + NULL, + 0, + 0, + ULONG_MAX, + 0); + +static struct st_mysql_sys_var *binlog_sys_vars[]= +{ + MYSQL_SYSVAR(dbug_fsync_sleep), + NULL +}; +#endif + + +/* + Copy out current values of status variables, for SHOW STATUS or + information_schema.global_status. + + This is called only under LOCK_status, so we can fill in a static array. +*/ +void +TC_LOG_BINLOG::set_status_variables() +{ + ulonglong num_commits, num_group_commits; + + pthread_mutex_lock(&LOCK_commit_ordered); + num_commits= this->num_commits; + num_group_commits= this->num_group_commits; + pthread_mutex_unlock(&LOCK_commit_ordered); + + binlog_status_var_num_commits= num_commits; + binlog_status_var_num_group_commits= num_group_commits; +} + struct st_mysql_storage_engine binlog_storage_engine= { MYSQL_HANDLERTON_INTERFACE_VERSION }; @@ -5995,8 +6795,12 @@ mysql_declare_plugin(binlog) binlog_init, /* Plugin Init */ NULL, /* Plugin Deinit */ 0x0100 /* 1.0 */, - NULL, /* status variables */ + binlog_status_vars_top, /* status variables */ +#ifndef DBUG_OFF + binlog_sys_vars, /* system variables */ +#else NULL, /* system variables */ +#endif NULL /* config options */ } mysql_declare_plugin_end; diff --git a/sql/log.h b/sql/log.h index 8b5dfcb3935..ac0ebea6db4 100644 --- a/sql/log.h +++ b/sql/log.h @@ -33,11 +33,173 @@ class TC_LOG virtual int open(const char 
*opt_name)=0; virtual void close()=0; - virtual int log_xid(THD *thd, my_xid xid)=0; + virtual int log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, + bool need_commit_ordered) = 0; virtual void unlog(ulong cookie, my_xid xid)=0; + +protected: + /* + These methods are meant to be invoked from log_and_order() implementations + to run any prepare_ordered() respectively commit_ordered() methods in + participating handlers. + + They must be called using suitable thread synchronisation to ensure that + they are each called in the correct commit order among all + transactions. However, it is only necessary to call them if the + corresponding flag passed to log_and_order is set (it is safe, but not + required, to call them when the flag is false). + + The caller must be holding LOCK_prepare_ordered respectively + LOCK_commit_ordered when calling these methods. + */ + void run_prepare_ordered(THD *thd, bool all); + void run_commit_ordered(THD *thd, bool all); }; -class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging +/* + Locks used to ensure serialised execution of TC_LOG::run_prepare_ordered() + and TC_LOG::run_commit_ordered(), or any other code that calls handler + prepare_ordered() or commit_ordered() methods. +*/ +extern pthread_mutex_t LOCK_prepare_ordered; +extern pthread_mutex_t LOCK_commit_ordered; + +extern void TC_init(); +extern void TC_destroy(); + +/* + Base class for two TC implementations TC_LOG_unordered and + TC_LOG_group_commit that both use a queue of threads waiting for group + commit. +*/ +class TC_LOG_queued: public TC_LOG +{ +protected: + TC_LOG_queued(); + ~TC_LOG_queued(); + + /* Structure used to link list of THDs waiting for group commit. */ + struct TC_group_commit_entry + { + struct TC_group_commit_entry *next; + THD *thd; + /* This is the `all' parameter for ha_commit_trans() etc. */ + bool all; + /* + Flag set true when it is time for this thread to wake up after group + commit.
Used with THD::LOCK_commit_ordered and THD::COND_commit_ordered. + */ + bool group_commit_ready; + /* + Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and + cookie. + */ + int xid_error; + }; + + TC_group_commit_entry * reverse_queue(TC_group_commit_entry *queue); + + void group_commit_wait_for_wakeup(TC_group_commit_entry *entry); + void group_commit_wakeup_other(TC_group_commit_entry *other); + + /* + This is a queue of threads waiting for being allowed to commit. + Access to the queue must be protected by LOCK_prepare_ordered. + */ + TC_group_commit_entry *group_commit_queue; +}; + +class TC_LOG_unordered: public TC_LOG_queued +{ +public: + TC_LOG_unordered(); + ~TC_LOG_unordered(); + + int log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, bool need_commit_ordered); + +protected: + virtual int log_xid(THD *thd, my_xid xid)=0; + +private: + /* + This flag and condition is used to reserve the queue while threads in it + each run the commit_ordered() methods one after the other. Only once the + last commit_ordered() in the queue is done can we start on a new queue + run. + + Since we start this process in the first thread in the queue and finish in + the last (and possibly different) thread, we need a condition variable for + this (we cannot unlock a mutex in a different thread than the one who + locked it). + + The condition is used together with the LOCK_prepare_ordered mutex. + */ + my_bool group_commit_queue_busy; + pthread_cond_t COND_queue_busy; +}; + +class TC_LOG_group_commit: public TC_LOG_queued +{ +public: + TC_LOG_group_commit(); + ~TC_LOG_group_commit(); + + int log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, bool need_commit_ordered); + +protected: + /* Total number of committed transactions. */ + ulonglong num_commits; + /* Number of group commits done. 
*/ + ulonglong num_group_commits; + + /* + When using this class, this method is used instead of log_xid() to do + logging of a group of transactions all at once. + + The transactions will be linked through THD::next_commit_ordered. + + Additionally, when this method is used instead of log_xid(), the order in + which handler->prepare_ordered() and handler->commit_ordered() are called + is guaranteed to be the same as the order of calls and THD list elements + for group_log_xid(). + + This can be used to efficiently implement group commit that at the same + time preserves the order of commits among handlers and TC (eg. to get same + commit order in InnoDB and binary log). + + For TCs that do not need this, it can be preferable to use plain log_xid() + with class TC_LOG_unordered instead, as it allows threads to run log_xid() + in parallel with each other. In contrast, group_log_xid() runs under a + global mutex, so it is guaranteed that only one call into it will be + active at once. + + Since this call handles multiple threads/THDs at once, my_error() (and + other code that relies on thread local storage) cannot be used in this + method. Instead, the implementation must record any error and report it as + the return value from xid_log_after(), which will be invoked individually + for each thread. + + In the success case, this method must set thd->xid_cookie for each thread + to the cookie that is normally returned from log_xid() (which must be + non-zero in the non-error case). + */ + virtual void group_log_xid(TC_group_commit_entry *first) = 0; + /* + Called for each transaction (in correct thread context) after + group_log_xid() has finished, but with no guarantee on ordering among + threads. + Can be used to do error reporting etc. */ + virtual int xid_log_after(TC_group_commit_entry *entry) = 0; + +private: + /* Mutex used to serialise calls to group_log_xid().
*/ + pthread_mutex_t LOCK_group_commit; +}; + +class TC_LOG_DUMMY: public TC_LOG_unordered // use it to disable the logging { public: TC_LOG_DUMMY() {} @@ -48,7 +210,7 @@ public: }; #ifdef HAVE_MMAP -class TC_LOG_MMAP: public TC_LOG +class TC_LOG_MMAP: public TC_LOG_unordered { public: // only to keep Sun Forte on sol9x86 happy typedef enum { @@ -227,12 +389,19 @@ private: time_t last_time; }; -class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG +class binlog_trx_data; +class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG { private: /* LOCK_log and LOCK_index are inited by init_pthread_objects() */ pthread_mutex_t LOCK_index; pthread_mutex_t LOCK_prep_xids; + /* + Mutex to protect the queue of transactions waiting to participate in group + commit. (Only used on platforms without native atomic operations). + */ + pthread_mutex_t LOCK_queue; + pthread_cond_t COND_prep_xids; pthread_cond_t update_cond; ulonglong bytes_written; @@ -271,8 +440,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG In 5.0 it's 0 for relay logs too! */ bool no_auto_events; - - ulonglong m_table_map_version; + /* Queue of transactions queued up to participate in group commit. 
*/ + binlog_trx_data *group_commit_queue; int write_to_file(IO_CACHE *cache); /* @@ -282,6 +451,14 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG */ void new_file_without_locking(); void new_file_impl(bool need_lock); + int write_transaction(binlog_trx_data *trx_data); + bool write_transaction_to_binlog_events(binlog_trx_data *trx_data); + void trx_group_commit_participant(binlog_trx_data *trx_data); + void trx_group_commit_leader(TC_group_commit_entry *first); + binlog_trx_data *atomic_enqueue_trx(binlog_trx_data *trx_data); + binlog_trx_data *atomic_grab_trx_queue(); + void mark_xid_done(); + void mark_xids_active(uint xid_count); public: MYSQL_LOG::generate_name; @@ -310,18 +487,11 @@ public: int open(const char *opt_name); void close(); - int log_xid(THD *thd, my_xid xid); + void group_log_xid(TC_group_commit_entry *first); + int xid_log_after(TC_group_commit_entry *entry); void unlog(ulong cookie, my_xid xid); int recover(IO_CACHE *log, Format_description_log_event *fdle); #if !defined(MYSQL_CLIENT) - bool is_table_mapped(TABLE *table) const - { - return table->s->table_map_version == table_map_version(); - } - - ulonglong table_map_version() const { return m_table_map_version; } - void update_table_map_version() { ++m_table_map_version; } - int flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event); int remove_pending_rows_event(THD *thd); @@ -362,10 +532,12 @@ public: void new_file(); bool write(Log_event* event_info); // binary log write - bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident); - bool write_incident(THD *thd, bool lock); + bool write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data, + Log_event *end_ev); + bool trx_group_commit_finish(binlog_trx_data *trx_data); + bool write_incident(THD *thd); - int write_cache(IO_CACHE *cache, bool lock_log, bool flush_and_sync); + int write_cache(IO_CACHE *cache); void set_write_error(THD *thd); bool check_write_error(THD *thd); @@ -420,6 +592,7 @@ 
public: inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);} inline IO_CACHE *get_index_file() { return &index_file;} inline uint32 get_open_count() { return open_count; } + void set_status_variables(); }; class Log_event_handler diff --git a/sql/log_event.h b/sql/log_event.h index 36715b1d151..46d02f5d2c5 100644 --- a/sql/log_event.h +++ b/sql/log_event.h @@ -463,10 +463,9 @@ struct sql_ex_info #define LOG_EVENT_SUPPRESS_USE_F 0x8 /* - The table map version internal to the log should be increased after - the event has been written to the binary log. + This used to be LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F, but is now unused. */ -#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x10 +#define LOG_EVENT_UNUSED1_F 0x10 /** @def LOG_EVENT_ARTIFICIAL_F diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 645b7498042..fd39b979f4c 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -1333,6 +1333,7 @@ void clean_up(bool print_message) ha_end(); if (tc_log) tc_log->close(); + TC_destroy(); xid_cache_free(); wt_end(); delete_elements(&key_caches, (void (*)(const char*, uchar*)) free_key_cache); @@ -4124,6 +4125,8 @@ a file name for --log-bin-index option", opt_binlog_index_name); if (!errmesg[0][0]) unireg_abort(1); + TC_init(); + /* We have to initialize the storage engines before CSV logging */ if (ha_init()) { diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 580fe8057cd..8dbba6b2ec5 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -673,6 +673,8 @@ THD::THD() active_vio = 0; #endif pthread_mutex_init(&LOCK_thd_data, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_FAST); + pthread_cond_init(&COND_commit_ordered, 0); /* Variables with default values */ proc_info="login"; @@ -999,6 +1001,8 @@ THD::~THD() free_root(&transaction.mem_root,MYF(0)); #endif mysys_var=0; // Safety (shouldn't be needed) + pthread_cond_destroy(&COND_commit_ordered); + pthread_mutex_destroy(&LOCK_commit_ordered); pthread_mutex_destroy(&LOCK_thd_data); #ifndef 
DBUG_OFF dbug_sentry= THD_SENTRY_GONE; @@ -3773,7 +3777,6 @@ int THD::binlog_flush_pending_rows_event(bool stmt_end) if (stmt_end) { pending->set_flags(Rows_log_event::STMT_END_F); - pending->flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; binlog_table_maps= 0; } @@ -3901,7 +3904,6 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg, { Query_log_event qinfo(this, query_arg, query_len, is_trans, suppress_use, errcode); - qinfo.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; /* Binlog table maps will be irrelevant after a Query_log_event (they are just removed on the slave side) so after the query diff --git a/sql/sql_class.h b/sql/sql_class.h index aa39ddb2b15..aa2933e4070 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -1438,6 +1438,10 @@ public: /* container for handler's private per-connection data */ Ha_data ha_data[MAX_HA]; + /* Mutex and condition for waking up threads after group commit. */ + pthread_mutex_t LOCK_commit_ordered; + pthread_cond_t COND_commit_ordered; + #ifndef MYSQL_CLIENT int binlog_setup_trx_data(); diff --git a/sql/sql_load.cc b/sql/sql_load.cc index 82cc8f81b4a..441fe93aaef 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -516,7 +516,6 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list, else { Delete_file_log_event d(thd, db, transactional_table); - d.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; (void) mysql_bin_log.write(&d); } } @@ -698,7 +697,6 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex, (duplicates == DUP_REPLACE) ? LOAD_DUP_REPLACE : (ignore ? 
LOAD_DUP_IGNORE : LOAD_DUP_ERROR), transactional_table, FALSE, errcode); - e.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F; return mysql_bin_log.write(&e); } diff --git a/sql/table.cc b/sql/table.cc index 733aa3e6887..2ddde40778d 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -296,13 +296,6 @@ TABLE_SHARE *alloc_table_share(TABLE_LIST *table_list, char *key, share->version= refresh_version; - /* - This constant is used to mark that no table map version has been - assigned. No arithmetic is done on the value: it will be - overwritten with a value taken from MYSQL_BIN_LOG. - */ - share->table_map_version= ~(ulonglong)0; - /* Since alloc_table_share() can be called without any locking (for example, ha_create_table... functions), we do not assign a table @@ -367,10 +360,9 @@ void init_tmp_table_share(THD *thd, TABLE_SHARE *share, const char *key, share->frm_version= FRM_VER_TRUE_VARCHAR; /* - Temporary tables are not replicated, but we set up these fields + Temporary tables are not replicated, but we set up this field anyway to be able to catch errors.
*/ - share->table_map_version= ~(ulonglong)0; share->cached_row_logging_check= -1; /* diff --git a/sql/table.h b/sql/table.h index a24e79e26cf..efc48090b3b 100644 --- a/sql/table.h +++ b/sql/table.h @@ -433,7 +433,6 @@ typedef struct st_table_share bool waiting_on_cond; /* Protection against free */ bool deleting; /* going to delete this table */ ulong table_map_id; /* for row-based replication */ - ulonglong table_map_version; /* Cache for row-based replication table share checks that does not diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index c28e0c57d2b..e19ca7206fd 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -138,8 +138,6 @@ bool check_global_access(THD *thd, ulong want_access); /** to protect innobase_open_files */ static pthread_mutex_t innobase_share_mutex; -/** to force correct commit order in binlog */ -static pthread_mutex_t prepare_commit_mutex; static ulong commit_threads = 0; static pthread_mutex_t commit_threads_m; static pthread_cond_t commit_cond; @@ -239,6 +237,7 @@ static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = { static INNOBASE_SHARE *get_share(const char *table_name); static void free_share(INNOBASE_SHARE *share); static int innobase_close_connection(handlerton *hton, THD* thd); +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all); static int innobase_commit(handlerton *hton, THD* thd, bool all); static int innobase_rollback(handlerton *hton, THD* thd, bool all); static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd, @@ -1356,7 +1355,6 @@ innobase_trx_init( trx_t* trx) /*!< in/out: InnoDB transaction handle */ { DBUG_ENTER("innobase_trx_init"); - DBUG_ASSERT(EQ_CURRENT_THD(thd)); DBUG_ASSERT(thd == trx->mysql_thd); trx->check_foreigns = !thd_test_options( @@ -1416,8 +1414,6 @@ check_trx_exists( { trx_t*& trx = thd_to_trx(thd); - ut_ad(EQ_CURRENT_THD(thd)); - if (trx == NULL) { trx = 
innobase_trx_allocate(thd); } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) { @@ -2024,6 +2020,7 @@ innobase_init( innobase_hton->savepoint_set=innobase_savepoint; innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint; innobase_hton->savepoint_release=innobase_release_savepoint; + innobase_hton->commit_ordered=innobase_commit_ordered; innobase_hton->commit=innobase_commit; innobase_hton->rollback=innobase_rollback; innobase_hton->prepare=innobase_xa_prepare; @@ -2492,7 +2489,6 @@ skip_overwrite: innobase_open_tables = hash_create(200); pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST); - pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST); pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST); pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST); pthread_mutex_init(&analyze_mutex, MY_MUTEX_INIT_FAST); @@ -2547,7 +2543,6 @@ innobase_end( my_free(internal_innobase_data_file_path, MYF(MY_ALLOW_ZERO_PTR)); pthread_mutex_destroy(&innobase_share_mutex); - pthread_mutex_destroy(&prepare_commit_mutex); pthread_mutex_destroy(&commit_threads_m); pthread_mutex_destroy(&commit_cond_m); pthread_mutex_destroy(&analyze_mutex); @@ -2680,6 +2675,101 @@ innobase_start_trx_and_assign_read_view( DBUG_RETURN(0); } +/*****************************************************************//** +Perform the first, fast part of InnoDB commit. + +Doing it in this call ensures that we get the same commit order here +as in binlog and any other participating transactional storage engines. + +Note that we want to do as little as really needed here, as we run +under a global mutex. The expensive fsync() is done later, in +innobase_commit(), without a lock so group commit can take place. + +Note also that this method can be called from a different thread than +the one handling the rest of the transaction. 
*/ +static +void +innobase_commit_ordered( +/*============*/ + handlerton *hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the user for whom + the transaction should be committed */ + bool all) /*!< in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + trx_t* trx; + DBUG_ENTER("innobase_commit_ordered"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + if (trx->active_trans == 0 + && trx->conc_state != TRX_NOT_STARTED) { + /* We cannot throw error here; instead we will catch this error + again in innobase_commit() and report it from there. */ + DBUG_VOID_RETURN; + } + /* Since we will reserve the kernel mutex, we have to release + the search system latch first to obey the latching order. */ + + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + /* commit_ordered is only called when committing the whole transaction + (or an SQL statement when autocommit is on). */ + DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); + + /* We need current binlog position for ibbackup to work. + Note, the position is current because commit_ordered is guaranteed + to be called in same sequence as writing to binlog. */ + +retry: + if (innobase_commit_concurrency > 0) { + pthread_mutex_lock(&commit_cond_m); + commit_threads++; + + if (commit_threads > innobase_commit_concurrency) { + commit_threads--; + pthread_cond_wait(&commit_cond, + &commit_cond_m); + pthread_mutex_unlock(&commit_cond_m); + goto retry; + } + else { + pthread_mutex_unlock(&commit_cond_m); + } + } + + /* The following calls to read the MySQL binary log + file name and the position return consistent results: + 1) We use commit_ordered() to get same commit order + in InnoDB as in binary log.
+ 2) A MySQL log file rotation cannot happen because + MySQL protects against this by having a counter of + transactions in prepared state and it only allows + a rotation when the counter drops to zero. See + LOCK_prep_xids and COND_prep_xids in log.cc. */ + trx->mysql_log_file_name = mysql_bin_log_file_name(); + trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos(); + + /* Don't do write + flush right now. For group commit + to work we want to do the flush in the innobase_commit() + method, which runs without holding any locks. */ + trx->flush_log_later = TRUE; + innobase_commit_low(trx); + trx->flush_log_later = FALSE; + + if (innobase_commit_concurrency > 0) { + pthread_mutex_lock(&commit_cond_m); + commit_threads--; + pthread_cond_signal(&commit_cond); + pthread_mutex_unlock(&commit_cond_m); + } + + DBUG_VOID_RETURN; +} + /*****************************************************************//** Commits a transaction in an InnoDB database or marks an SQL statement ended. @@ -2702,13 +2792,6 @@ innobase_commit( trx = check_trx_exists(thd); - /* Since we will reserve the kernel mutex, we have to release - the search system latch first to obey the latching order. */ - - if (trx->has_search_latch) { - trx_search_latch_release_if_reserved(trx); - } - /* The flag trx->active_trans is set to 1 in 1. ::external_lock(), @@ -2736,62 +2819,8 @@ innobase_commit( /* We were instructed to commit the whole transaction, or this is an SQL statement end and autocommit is on */ - /* We need current binlog position for ibbackup to work. 
- Note, the position is current because of - prepare_commit_mutex */ -retry: - if (innobase_commit_concurrency > 0) { - pthread_mutex_lock(&commit_cond_m); - commit_threads++; - - if (commit_threads > innobase_commit_concurrency) { - commit_threads--; - pthread_cond_wait(&commit_cond, - &commit_cond_m); - pthread_mutex_unlock(&commit_cond_m); - goto retry; - } - else { - pthread_mutex_unlock(&commit_cond_m); - } - } - - /* The following calls to read the MySQL binary log - file name and the position return consistent results: - 1) Other InnoDB transactions cannot intervene between - these calls as we are holding prepare_commit_mutex. - 2) Binary logging of other engines is not relevant - to InnoDB as all InnoDB requires is that committing - InnoDB transactions appear in the same order in the - MySQL binary log as they appear in InnoDB logs. - 3) A MySQL log file rotation cannot happen because - MySQL protects against this by having a counter of - transactions in prepared state and it only allows - a rotation when the counter drops to zero. See - LOCK_prep_xids and COND_prep_xids in log.cc. */ - trx->mysql_log_file_name = mysql_bin_log_file_name(); - trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos(); - - /* Don't do write + flush right now. For group commit - to work we want to do the flush after releasing the - prepare_commit_mutex. */ - trx->flush_log_later = TRUE; - innobase_commit_low(trx); - trx->flush_log_later = FALSE; - - if (innobase_commit_concurrency > 0) { - pthread_mutex_lock(&commit_cond_m); - commit_threads--; - pthread_cond_signal(&commit_cond); - pthread_mutex_unlock(&commit_cond_m); - } - - if (trx->active_trans == 2) { - - pthread_mutex_unlock(&prepare_commit_mutex); - } - - /* Now do a write + flush of logs. */ + /* We did the first part already in innobase_commit_ordered(), + Now finish by doing a write + flush of logs. 
*/ trx_commit_complete_for_mysql(trx); trx->active_trans = 0; @@ -4621,6 +4650,7 @@ no_commit: no need to re-acquire locks on it. */ /* Altering to InnoDB format */ + innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ prebuilt->trx->active_trans = 1; @@ -4637,6 +4667,7 @@ no_commit: /* Commit the transaction. This will release the table locks, so they have to be acquired again. */ + innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ prebuilt->trx->active_trans = 1; @@ -8339,6 +8370,7 @@ ha_innobase::external_lock( if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { if (trx->active_trans != 0) { + innobase_commit_ordered(ht, thd, TRUE); innobase_commit(ht, thd, TRUE); } } else { @@ -9448,36 +9480,6 @@ innobase_xa_prepare( srv_active_wake_master_thread(); - if (thd_sql_command(thd) != SQLCOM_XA_PREPARE && - (all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) - { - if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) { - /* choose group commit rather than binlog order */ - return(error); - } - - /* For ibbackup to work the order of transactions in binlog - and InnoDB must be the same. Consider the situation - - thread1> prepare; write to binlog; ... - - thread2> prepare; write to binlog; commit - thread1> ... commit - - To ensure this will not happen we're taking the mutex on - prepare, and releasing it on commit. - - Note: only do it for normal commits, done via ha_commit_trans. - If 2pc protocol is executed by external transaction - coordinator, it will be just a regular MySQL client - executing XA PREPARE and XA COMMIT commands. - In this case we cannot know how many minutes or hours - will be between XA PREPARE and XA COMMIT, and we don't want - to block for undefined period of time. 
*/ - pthread_mutex_lock(&prepare_commit_mutex); - trx->active_trans = 2; - } - return(error); } @@ -10669,11 +10671,6 @@ static MYSQL_SYSVAR_ENUM(adaptive_checkpoint, srv_adaptive_checkpoint, "Enable/Disable flushing along modified age. (none, reflex, [estimate])", NULL, innodb_adaptive_checkpoint_update, 2, &adaptive_checkpoint_typelib); -static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit, - PLUGIN_VAR_RQCMDARG, - "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.", - NULL, NULL, 0, 0, 1, 0); - static MYSQL_SYSVAR_ULONG(expand_import, srv_expand_import, PLUGIN_VAR_RQCMDARG, "Enable/Disable converting automatically *.ibd files when import tablespace.", @@ -10763,7 +10760,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(flush_neighbor_pages), MYSQL_SYSVAR(read_ahead), MYSQL_SYSVAR(adaptive_checkpoint), - MYSQL_SYSVAR(enable_unsafe_group_commit), MYSQL_SYSVAR(expand_import), MYSQL_SYSVAR(extra_rsegments), MYSQL_SYSVAR(dict_size_limit), From 8bc445360ee9f21560b5613e0bdc8b363f34d5b6 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 4 Oct 2010 20:40:31 +0200 Subject: [PATCH 02/13] MWL#116: Efficient group commit Tweak the commit_ordered() semantics. Now it is only called for transactions that go through 2-phase commit. This avoids forcing engines to make commits visible before they are durable. Also take LOCK_commit_ordered() around START TRANSACTION WITH CONSISTENT SNAPSHOT, to get a truly consistent snapshot. 
--- sql/handler.cc | 31 ++---- sql/handler.h | 16 +-- storage/xtradb/handler/ha_innodb.cc | 157 ++++++++++++++++------------ storage/xtradb/include/trx0trx.h | 11 +- 4 files changed, 117 insertions(+), 98 deletions(-) diff --git a/sql/handler.cc b/sql/handler.cc index 4975b4a1230..6503516a2f8 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1251,32 +1251,7 @@ int ha_commit_one_phase(THD *thd, bool all) enclosing 'all' transaction is rolled back. */ bool is_real_trans=all || thd->transaction.all.ha_list == 0; - Ha_trx_info *ha_info= trans->ha_list; DBUG_ENTER("ha_commit_one_phase"); -#ifdef USING_TRANSACTIONS - if (ha_info) - { - if (is_real_trans) - { - bool locked= false; - for (; ha_info; ha_info= ha_info->next()) - { - handlerton *ht= ha_info->ht(); - if (ht->commit_ordered) - { - if (ha_info->is_trx_read_write() && !locked) - { - pthread_mutex_lock(&LOCK_commit_ordered); - locked= 1; - } - ht->commit_ordered(ht, thd, all); - } - } - if (locked) - pthread_mutex_unlock(&LOCK_commit_ordered); - } - } -#endif /* USING_TRANSACTIONS */ DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans)); } @@ -1901,7 +1876,13 @@ int ha_start_consistent_snapshot(THD *thd) { bool warn= true; + /* + Holding the LOCK_commit_ordered mutex ensures that for any transaction + we either see it committed in all engines, or in none. + */ + pthread_mutex_lock(&LOCK_commit_ordered); plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn); + pthread_mutex_unlock(&LOCK_commit_ordered); /* Same idea as when one wants to CREATE TABLE in one engine which does not diff --git a/sql/handler.h b/sql/handler.h index a5c4af533be..82926dc08c0 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -667,6 +667,11 @@ struct handlerton full transaction is committed, not for each commit of statement transaction in a multi-statement transaction. + Note that like prepare(), commit_ordered() is only called when 2-phase + commit takes place. Ie.
when no binary log and only a single engine + participates in a transaction, one commit() is called, no + commit_ordered(). So engines must be prepared for this. + The calls to commit_ordered() in multiple parallel transactions is guaranteed to happen in the same order in every participating handler. This can be used to ensure the same commit order among multiple @@ -684,11 +689,9 @@ struct handlerton doing any time-consuming or blocking operations in commit_ordered() will limit scalability. - Handlers can rely on commit_ordered() calls for transactions that updated - data to be serialised (no two calls can run in parallel, so no extra - locking on the handler part is required to ensure this). However, calls - for SELECT-only transactions are not serialised, so can occur in parallel - with each other and with at most one write-transaction. + Handlers can rely on commit_ordered() calls to be serialised (no two + calls can run in parallel, so no extra locking on the handler part is + required to ensure this). Note that commit_ordered() can be called from a different thread than the one handling the transaction! So it can not do anything that depends on @@ -700,7 +703,8 @@ struct handlerton must be saved and returned from the commit() method instead. The commit_ordered method is optional, and can be left unset if not - needed in a particular handler. + needed in a particular handler (then there will be no ordering guarantees + wrt. other engines and binary log). */ void (*commit_ordered)(handlerton *hton, THD *thd, bool all); int (*rollback)(handlerton *hton, THD *thd, bool all); diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 7b1f3bc01dc..d9abb5ae032 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -1700,10 +1700,10 @@ innobase_query_caching_of_table_permitted( /* The call of row_search_.. 
will start a new transaction if it is not yet started */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(innodb_hton_ptr, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } if (row_search_check_if_query_cache_permitted(trx, norm_name)) { @@ -1973,11 +1973,11 @@ ha_innobase::init_table_handle_for_HANDLER(void) /* Set the MySQL flag to mark that there is an active transaction */ - if (prebuilt->trx->active_trans == 0) { + if ((prebuilt->trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, user_thd); - prebuilt->trx->active_trans = 1; + prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } /* We did the necessary inits in this function, no need to repeat them @@ -2704,58 +2704,21 @@ innobase_start_trx_and_assign_read_view( /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(hton, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } DBUG_RETURN(0); } -/*****************************************************************//** -Perform the first, fast part of InnoDB commit. - -Doing it in this call ensures that we get the same commit order here -as in binlog and any other participating transactional storage engines. - -Note that we want to do as little as really needed here, as we run -under a global mutex. The expensive fsync() is done later, in -innobase_commit(), without a lock so group commit can take place. - -Note also that this method can be called from a different thread than -the one handling the rest of the transaction. 
*/ static void -innobase_commit_ordered( +innobase_commit_ordered_2( /*============*/ - handlerton *hton, /*!< in: Innodb handlerton */ - THD* thd, /*!< in: MySQL thread handle of the user for whom - the transaction should be committed */ - bool all) /*!< in: TRUE - commit transaction - FALSE - the current SQL statement ended */ + trx_t* trx) /*!< in: Innodb transaction */ { - trx_t* trx; DBUG_ENTER("innobase_commit_ordered"); - DBUG_ASSERT(hton == innodb_hton_ptr); - - trx = check_trx_exists(thd); - - if (trx->active_trans == 0 - && trx->conc_state != TRX_NOT_STARTED) { - /* We cannot throw error here; instead we will catch this error - again in innobase_commit() and report it from there. */ - DBUG_VOID_RETURN; - } - /* Since we will reserve the kernel mutex, we have to release - the search system latch first to obey the latching order. */ - - if (trx->has_search_latch) { - trx_search_latch_release_if_reserved(trx); - } - - /* commit_ordered is only called when committing the whole transaction - (or an SQL statement when autocommit is on). */ - DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); /* We need current binlog position for ibbackup to work. Note, the position is current because commit_ordered is guaranteed @@ -2807,6 +2770,60 @@ retry: DBUG_VOID_RETURN; } +/*****************************************************************//** +Perform the first, fast part of InnoDB commit. + +Doing it in this call ensures that we get the same commit order here +as in binlog and any other participating transactional storage engines. + +Note that we want to do as little as really needed here, as we run +under a global mutex. The expensive fsync() is done later, in +innobase_commit(), without a lock so group commit can take place. + +Note also that this method can be called from a different thread than +the one handling the rest of the transaction. 
*/ +static +void +innobase_commit_ordered( +/*============*/ + handlerton *hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the user for whom + the transaction should be committed */ + bool all) /*!< in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + trx_t* trx; + DBUG_ENTER("innobase_commit_ordered"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + /* Since we will reserve the kernel mutex, we have to release + the search system latch first to obey the latching order. */ + + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 + && trx->conc_state != TRX_NOT_STARTED) { + /* We cannot throw error here; instead we will catch this error + again in innobase_commit() and report it from there. */ + DBUG_VOID_RETURN; + } + + /* commit_ordered is only called when committing the whole transaction + (or an SQL statement when autocommit is on). */ + DBUG_ASSERT(all || + (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); + + innobase_commit_ordered_2(trx); + + trx->active_trans |= TRX_ACTIVE_COMMIT_ORDERED; + + DBUG_VOID_RETURN; +} + /*****************************************************************//** Commits a transaction in an InnoDB database or marks an SQL statement ended. @@ -2829,7 +2846,15 @@ innobase_commit( trx = check_trx_exists(thd); - /* The flag trx->active_trans is set to 1 in + /* Since we will reserve the kernel mutex, we have to release + the search system latch first to obey the latching order. */ + + if (trx->has_search_latch && + (trx->active_trans & TRX_ACTIVE_COMMIT_ORDERED) == 0) { + trx_search_latch_release_if_reserved(trx); + } + + /* The flag TRX_ACTIVE_IN_MYSQL in trx->active_trans is set in 1. ::external_lock(), 2. ::start_stmt(), @@ -2839,20 +2864,26 @@ innobase_commit( 6. innobase_start_trx_and_assign_read_view(), 7. 
::transactional_table_lock() - and it is only set to 0 in a commit or a rollback. If it is 0 we know + and it is only cleared in a commit or a rollback. If it is unset we know there cannot be resources to be freed and we could return immediately. For the time being, we play safe and do the cleanup though there should be nothing to clean up. */ - if (trx->active_trans == 0 + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 && trx->conc_state != TRX_NOT_STARTED) { sql_print_error("trx->active_trans == 0, but" " trx->conc_state != TRX_NOT_STARTED"); } + if (all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + /* Run the fast part of commit if we did not already. */ + if ((trx->active_trans & TRX_ACTIVE_COMMIT_ORDERED) == 0) { + innobase_commit_ordered_2(trx); + } + /* We were instructed to commit the whole transaction, or this is an SQL statement end and autocommit is on */ @@ -3076,7 +3107,7 @@ innobase_savepoint( innobase_release_stat_resources(trx); /* cannot happen outside of transaction */ - DBUG_ASSERT(trx->active_trans); + DBUG_ASSERT(trx->active_trans & TRX_ACTIVE_IN_MYSQL); /* TODO: use provided savepoint data area to store savepoint data */ char name[64]; @@ -3106,7 +3137,7 @@ innobase_close_connection( ut_a(trx); - if (trx->active_trans == 0 + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 && trx->conc_state != TRX_NOT_STARTED) { sql_print_error("trx->active_trans == 0, but" @@ -5021,10 +5052,9 @@ no_commit: no need to re-acquire locks on it. */ /* Altering to InnoDB format */ - innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ - prebuilt->trx->active_trans = 1; + prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL; /* We will need an IX lock on the destination table. */ prebuilt->sql_stat_start = TRUE; } else { @@ -5038,10 +5068,9 @@ no_commit: /* Commit the transaction. This will release the table locks, so they have to be acquired again. 
*/ - innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ - prebuilt->trx->active_trans = 1; + prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL; /* Re-acquire the table lock on the source table. */ row_lock_table_for_mysql(prebuilt, src_table, mode); /* We will need an IX lock on the destination table. */ @@ -8929,10 +8958,10 @@ ha_innobase::start_stmt( trx->detailed_error[0] = '\0'; /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } else { innobase_register_stmt(ht, thd); } @@ -9030,10 +9059,10 @@ ha_innobase::external_lock( /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } else if (trx->n_mysql_tables_in_use == 0) { innobase_register_stmt(ht, thd); } @@ -9131,8 +9160,7 @@ ha_innobase::external_lock( prebuilt->used_in_HANDLER = FALSE; if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { - if (trx->active_trans != 0) { - innobase_commit_ordered(ht, thd, TRUE); + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) != 0) { innobase_commit(ht, thd, TRUE); } } else { @@ -9217,10 +9245,10 @@ ha_innobase::transactional_table_lock( /* MySQL is setting a new transactional table lock */ /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) { @@ -10272,7 +10300,8 @@ innobase_xa_prepare( 
innobase_release_stat_resources(trx); - if (trx->active_trans == 0 && trx->conc_state != TRX_NOT_STARTED) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 && + trx->conc_state != TRX_NOT_STARTED) { sql_print_error("trx->active_trans == 0, but trx->conc_state != " "TRX_NOT_STARTED"); @@ -10284,7 +10313,7 @@ innobase_xa_prepare( /* We were instructed to prepare the whole transaction, or this is an SQL statement end and autocommit is on */ - ut_ad(trx->active_trans); + ut_ad(trx->active_trans & TRX_ACTIVE_IN_MYSQL); error = (int) trx_prepare_for_mysql(trx); } else { diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h index 4c0ce392bcd..36f9c464c2b 100644 --- a/storage/xtradb/include/trx0trx.h +++ b/storage/xtradb/include/trx0trx.h @@ -511,9 +511,10 @@ struct trx_struct{ in that case we must flush the log in trx_commit_complete_for_mysql() */ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ - ulint active_trans; /*!< 1 - if a transaction in MySQL - is active. 2 - if prepare_commit_mutex - was taken */ + ulint active_trans; /*!< TRX_ACTIVE_IN_MYSQL - set if a + transaction in MySQL is active. + TRX_ACTIVE_COMMIT_ORDERED - set if + innobase_commit_ordered has run */ ulint has_search_latch; /* TRUE if this trx has latched the search system latch in S-mode */ @@ -824,6 +825,10 @@ Multiple flags can be combined with bitwise OR. */ #define TRX_SIG_OTHER_SESS 1 /* sent by another session (which must hold rights to this) */ +/* Flag bits for trx_struct.active_trans */ +#define TRX_ACTIVE_IN_MYSQL (1<<0) +#define TRX_ACTIVE_COMMIT_ORDERED (1<<1) + /** Commit node states */ enum commit_node_state { COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to From ffa7e0fabe50ceb00d2d64f4806fb87f9f607b3a Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 18 Oct 2010 11:25:06 +0200 Subject: [PATCH 03/13] MWL#116: Efficient group commit: Fix bug that binlog pos stored by XtraDB during commit was wrong when more than one commit in group. 
Now the actual binlog position for each commit is stored in THD, and XtraDB can fetch the correct value from within commit_ordered() or commit(). mysql-test/r/group_commit_binlog_pos.result: Test case for XtraDB binlog position. mysql-test/t/group_commit_binlog_pos-master.opt: Test case for XtraDB binlog position. mysql-test/t/group_commit_binlog_pos.test: Test case for XtraDB binlog position. sql/log.cc: Save binlog position corresponding to commit in THD, and make accessible to storage engine. sql/sql_parse.cc: Add generic crash point for use in test cases. storage/xtradb/handler/ha_innodb.cc: Update to use new method of getting current binlog position that works with group commit. storage/xtradb/handler/ha_innodb.h: Update to use new method of getting current binlog position that works with group commit. --- mysql-test/r/group_commit_binlog_pos.result | 35 ++++++++ .../t/group_commit_binlog_pos-master.opt | 1 + mysql-test/t/group_commit_binlog_pos.test | 83 +++++++++++++++++++ sql/log.cc | 41 ++++++++- sql/sql_parse.cc | 4 + storage/xtradb/handler/ha_innodb.cc | 21 ++--- storage/xtradb/handler/ha_innodb.h | 15 ++-- 7 files changed, 175 insertions(+), 25 deletions(-) create mode 100644 mysql-test/r/group_commit_binlog_pos.result create mode 100644 mysql-test/t/group_commit_binlog_pos-master.opt create mode 100644 mysql-test/t/group_commit_binlog_pos.test diff --git a/mysql-test/r/group_commit_binlog_pos.result b/mysql-test/r/group_commit_binlog_pos.result new file mode 100644 index 00000000000..0d7c23cbbbd --- /dev/null +++ b/mysql-test/r/group_commit_binlog_pos.result @@ -0,0 +1,35 @@ +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; +INSERT INTO t1 VALUES (0); +SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con1_waiting WAIT_FOR con3_queued"; +SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3"; +INSERT INTO t1 VALUES (1); +SET DEBUG_SYNC= "now WAIT_FOR con1_waiting"; +SET DEBUG_SYNC= 
"commit_after_prepare_ordered SIGNAL con2_queued"; +INSERT INTO t1 VALUES (2); +SET DEBUG_SYNC= "now WAIT_FOR con2_queued"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con3_queued"; +INSERT INTO t1 VALUES (3); +SET DEBUG_SYNC= "now WAIT_FOR con1_loop"; +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont"; +SET DEBUG_SYNC= "now WAIT_FOR con1_loop"; +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont"; +SET DEBUG_SYNC= "now WAIT_FOR con1_loop"; +SELECT * FROM t1 ORDER BY a; +a +0 +1 +2 +SET SESSION debug="+d,crash_dispatch_command_before"; +SELECT 1; +ERROR HY000: Lost connection to MySQL server during query +ERROR HY000: Lost connection to MySQL server during query +ERROR HY000: Lost connection to MySQL server during query +SELECT * FROM t1 ORDER BY a; +a +0 +1 +2 +3 +InnoDB: Last MySQL binlog file position 0 767, file name ./master-bin.000001 +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; diff --git a/mysql-test/t/group_commit_binlog_pos-master.opt b/mysql-test/t/group_commit_binlog_pos-master.opt new file mode 100644 index 00000000000..425fda95086 --- /dev/null +++ b/mysql-test/t/group_commit_binlog_pos-master.opt @@ -0,0 +1 @@ +--skip-stack-trace --skip-core-file diff --git a/mysql-test/t/group_commit_binlog_pos.test b/mysql-test/t/group_commit_binlog_pos.test new file mode 100644 index 00000000000..ad1a0e4b508 --- /dev/null +++ b/mysql-test/t/group_commit_binlog_pos.test @@ -0,0 +1,83 @@ +--source include/have_debug_sync.inc +--source include/have_innodb.inc +--source include/have_log_bin.inc +# Need DBUG to crash the server intentionally +--source include/have_debug.inc +# Don't test this under valgrind, memory leaks will occur as we crash +--source include/not_valgrind.inc + +# XtraDB stores the binlog position corresponding to the last commit, and +# prints it during crash recovery. +# Test that we get the correct position when we group commit several +# transactions together. 
+ +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; +INSERT INTO t1 VALUES (0); + +connect(con1,localhost,root,,); +connect(con2,localhost,root,,); +connect(con3,localhost,root,,); + +# Queue up three commits for group commit. + +connection con1; +SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con1_waiting WAIT_FOR con3_queued"; +SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3"; +send INSERT INTO t1 VALUES (1); + +connection con2; +SET DEBUG_SYNC= "now WAIT_FOR con1_waiting"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con2_queued"; +send INSERT INTO t1 VALUES (2); + +connection con3; +SET DEBUG_SYNC= "now WAIT_FOR con2_queued"; +SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con3_queued"; +send INSERT INTO t1 VALUES (3); + +connection default; +SET DEBUG_SYNC= "now WAIT_FOR con1_loop"; +# At this point, no transactions are committed. +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont"; +SET DEBUG_SYNC= "now WAIT_FOR con1_loop"; +# At this point, 1 transaction is committed. +SET DEBUG_SYNC= "now SIGNAL con1_loop_cont"; +SET DEBUG_SYNC= "now WAIT_FOR con1_loop"; + +# At this point, 2 transactions are committed. +SELECT * FROM t1 ORDER BY a; + +connection con2; +reap; + +# Now crash the server with 1+2 in-memory committed, 3 only prepared. +connection default; +system echo wait-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; +SET SESSION debug="+d,crash_dispatch_command_before"; +--error 2013 +SELECT 1; + +connection con1; +--error 2013 +reap; +connection con3; +--error 2013 +reap; + +system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; + +connection default; +--enable_reconnect +--source include/wait_until_connected_again.inc + +# Crash recovery should recover all three transactions. 
+SELECT * FROM t1 ORDER BY a; + +# Check that the binlog position reported by InnoDB is the correct one +# for the end of the second transaction (as can be checked with +# mysqlbinlog). +let $MYSQLD_DATADIR= `SELECT @@datadir`; +--exec grep 'InnoDB: Last MySQL binlog file position' $MYSQLD_DATADIR/../../log/mysqld.1.err | tail -1 + +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; diff --git a/sql/log.cc b/sql/log.cc index 4e489662079..d99f04d4425 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -155,7 +155,7 @@ class binlog_trx_data { public: binlog_trx_data() : at_least_one_stmt_committed(0), incident(FALSE), m_pending(0), - before_stmt_pos(MY_OFF_T_UNDEF), using_xa(0) + before_stmt_pos(MY_OFF_T_UNDEF), using_xa(0), commit_bin_log_file_pos(0) { trans_log.end_of_file= max_binlog_cache_size; (void) my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW, @@ -218,6 +218,7 @@ public: incident= FALSE; trans_log.end_of_file= max_binlog_cache_size; using_xa= FALSE; + commit_bin_log_file_pos= 0; DBUG_ASSERT(empty()); } @@ -297,6 +298,11 @@ public: /* Mutex and condition for wakeup after group commit. */ pthread_mutex_t LOCK_group_commit; pthread_cond_t COND_group_commit; + /* + Binlog position after current commit, available to storage engines during + commit() and commit_ordered(). 
+ */ + ulonglong commit_bin_log_file_pos; }; handlerton *binlog_hton; @@ -5170,6 +5176,8 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) write_count++; } + current->commit_bin_log_file_pos= + log_file.pos_in_file + (log_file.write_pos - log_file.write_buffer); if (current->end_event->get_type_code() == XID_EVENT) xid_count++; } @@ -6005,6 +6013,7 @@ int TC_LOG_group_commit::log_and_order(THD *thd, my_xid xid, bool all, ++num_group_commits; do { + DEBUG_SYNC(thd, "commit_loop_entry_commit_ordered"); ++num_commits; if (!current->xid_error) run_commit_ordered(current->thd, current->all); @@ -6813,6 +6822,36 @@ ulonglong mysql_bin_log_file_pos(void) { return (ulonglong) mysql_bin_log.get_log_file()->pos_in_file; } +/* + Get the current position of the MySQL binlog for transaction currently being + committed. + + This is valid to call from within storage engine commit_ordered() and + commit() methods only. + + Since it stores the position inside THD, it is safe to call without any + locking. + + Note that currently the binlog file name is not stored inside THD, but this + is still safe as it can only change when the log is rotated, and we never + rotate the binlog while commits are pending inside storage engines. 
+*/ +void +mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file) +{ + binlog_trx_data *const trx_data= + (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton); + if (trx_data) + { + *out_pos= trx_data->commit_bin_log_file_pos; + *out_file= mysql_bin_log.get_log_fname(); + } + else + { + *out_pos= NULL; + *out_file= NULL; + } +} #endif /* INNODB_COMPATIBILITY_HOOKS */ diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index caec049c244..8dee0155f2b 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -999,6 +999,10 @@ bool dispatch_command(enum enum_server_command command, THD *thd, DBUG_ENTER("dispatch_command"); DBUG_PRINT("info", ("command: %d", command)); + DBUG_EXECUTE_IF("crash_dispatch_command_before", + { DBUG_PRINT("crash_dispatch_command_before", ("now")); + DBUG_ABORT(); }); + thd->command=command; /* Commands which always take a long time are logged into diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index d9abb5ae032..b05670e6c6e 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -2716,8 +2716,10 @@ static void innobase_commit_ordered_2( /*============*/ - trx_t* trx) /*!< in: Innodb transaction */ + trx_t* trx, /*!< in: Innodb transaction */ + THD* thd) /*!< in: MySQL thread handle */ { + ulonglong tmp_pos; DBUG_ENTER("innobase_commit_ordered"); /* We need current binlog position for ibbackup to work. @@ -2741,17 +2743,8 @@ retry: } } - /* The following calls to read the MySQL binary log - file name and the position return consistent results: - 1) We use commit_ordered() to get same commit order - in InnoDB as in binary log. - 2) A MySQL log file rotation cannot happen because - MySQL protects against this by having a counter of - transactions in prepared state and it only allows - a rotation when the counter drops to zero. See - LOCK_prep_xids and COND_prep_xids in log.cc. 
*/ - trx->mysql_log_file_name = mysql_bin_log_file_name(); - trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos(); + mysql_bin_log_commit_pos(thd, &tmp_pos, &(trx->mysql_log_file_name)); + trx->mysql_log_offset = (ib_int64_t) tmp_pos; /* Don't do write + flush right now. For group commit to work we want to do the flush in the innobase_commit() @@ -2817,7 +2810,7 @@ innobase_commit_ordered( DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); - innobase_commit_ordered_2(trx); + innobase_commit_ordered_2(trx, thd); trx->active_trans |= TRX_ACTIVE_COMMIT_ORDERED; @@ -2881,7 +2874,7 @@ innobase_commit( /* Run the fast part of commit if we did not already. */ if ((trx->active_trans & TRX_ACTIVE_COMMIT_ORDERED) == 0) { - innobase_commit_ordered_2(trx); + innobase_commit_ordered_2(trx, thd); } /* We were instructed to commit the whole transaction, or diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index 04224277deb..20363b19abd 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ b/storage/xtradb/handler/ha_innodb.h @@ -239,16 +239,6 @@ LEX_STRING *thd_query_string(MYSQL_THD thd); char **thd_query(MYSQL_THD thd); #endif -/** Get the file name of the MySQL binlog. - * @return the name of the binlog file - */ -const char* mysql_bin_log_file_name(void); - -/** Get the current position of the MySQL binlog. - * @return byte offset from the beginning of the binlog - */ -ulonglong mysql_bin_log_file_pos(void); - /** Check if a user thread is a replication slave thread @param thd user thread @@ -289,6 +279,11 @@ bool thd_binlog_filter_ok(const MYSQL_THD thd); #endif /* MYSQL_VERSION_ID > 50140 */ } +/** Get the file name and position of the MySQL binlog corresponding to the + * current commit. 
+ */ +extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file); + typedef struct trx_struct trx_t; /********************************************************************//** @file handler/ha_innodb.h From dcb111ff6e8deb077530a6e0779277afc3eab7ff Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 18 Oct 2010 11:43:05 +0200 Subject: [PATCH 04/13] MWL#116: Efficient group commit: Fix memory leak. --- sql/log.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/log.cc b/sql/log.cc index d99f04d4425..6ec186b3786 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -166,6 +166,7 @@ public: ~binlog_trx_data() { DBUG_ASSERT(pending() == NULL); + (void) pthread_mutex_destroy(&LOCK_group_commit); close_cached_file(&trans_log); } From 45131a5b103d5a8330c50c8b9c0a7922fe22298e Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 22 Oct 2010 10:51:02 +0200 Subject: [PATCH 05/13] MWL#116: Efficient group commit: Fix assertion by moving init/deinit out of constructor in global static object. 
--- sql/log.cc | 14 +++++++++++++- sql/log.h | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sql/log.cc b/sql/log.cc index 6ec186b3786..85f47d3edf9 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -5927,12 +5927,22 @@ int TC_LOG_unordered::log_and_order(THD *thd, my_xid xid, bool all, TC_LOG_group_commit::TC_LOG_group_commit() : num_commits(0), num_group_commits(0) +{ +} + +TC_LOG_group_commit::~TC_LOG_group_commit() +{ +} + +void +TC_LOG_group_commit::init() { my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW, "LOCK_group_commit", MYF(0)); } -TC_LOG_group_commit::~TC_LOG_group_commit() +void +TC_LOG_group_commit::deinit() { pthread_mutex_destroy(&LOCK_group_commit); } @@ -6591,6 +6601,7 @@ int TC_LOG_BINLOG::open(const char *opt_name) DBUG_ASSERT(total_ha_2pc > 1); DBUG_ASSERT(opt_name && opt_name[0]); + TC_LOG_group_commit::init(); pthread_mutex_init(&LOCK_prep_xids, MY_MUTEX_INIT_FAST); pthread_cond_init (&COND_prep_xids, 0); @@ -6674,6 +6685,7 @@ void TC_LOG_BINLOG::close() DBUG_ASSERT(prepared_xids==0); pthread_mutex_destroy(&LOCK_prep_xids); pthread_cond_destroy (&COND_prep_xids); + TC_LOG_group_commit::deinit(); } /* diff --git a/sql/log.h b/sql/log.h index 7a6b7f070b5..f43ae433c8a 100644 --- a/sql/log.h +++ b/sql/log.h @@ -150,6 +150,8 @@ class TC_LOG_group_commit: public TC_LOG_queued public: TC_LOG_group_commit(); ~TC_LOG_group_commit(); + void init(); + void deinit(); int log_and_order(THD *thd, my_xid xid, bool all, bool need_prepare_ordered, bool need_commit_ordered); From f0707b38ea64688735183f2e43927857236ca2ed Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 23 Oct 2010 11:50:02 +0200 Subject: [PATCH 06/13] MWL#116: Efficient group commit: Some fixes for test failures. client/mysqltest.cc: Apply Serg's work-around for MySQL BUG#57491. mysql-test/r/group_commit_binlog_pos.result: Accept different ways the server crash is detected. 
mysql-test/t/group_commit_binlog_pos.test: Accept different ways the server crash is detected. Skip in row-based binlogging, as otherwise the binlog offsets will be different. --- client/mysqltest.cc | 4 ++++ mysql-test/r/group_commit_binlog_pos.result | 6 +++--- mysql-test/t/group_commit_binlog_pos.test | 8 +++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/client/mysqltest.cc b/client/mysqltest.cc index e4981bec997..78ae9a63f87 100644 --- a/client/mysqltest.cc +++ b/client/mysqltest.cc @@ -717,6 +717,10 @@ void handle_no_error(struct st_command*); #ifdef EMBEDDED_LIBRARY +/* workaround for MySQL BUG#57491 */ +#undef MY_WME +#define MY_WME 0 + /* attributes of the query thread */ pthread_attr_t cn_thd_attrib; diff --git a/mysql-test/r/group_commit_binlog_pos.result b/mysql-test/r/group_commit_binlog_pos.result index 0d7c23cbbbd..a0bb5ee2d8e 100644 --- a/mysql-test/r/group_commit_binlog_pos.result +++ b/mysql-test/r/group_commit_binlog_pos.result @@ -21,9 +21,9 @@ a 2 SET SESSION debug="+d,crash_dispatch_command_before"; SELECT 1; -ERROR HY000: Lost connection to MySQL server during query -ERROR HY000: Lost connection to MySQL server during query -ERROR HY000: Lost connection to MySQL server during query +Got one of the listed errors +Got one of the listed errors +Got one of the listed errors SELECT * FROM t1 ORDER BY a; a 0 diff --git a/mysql-test/t/group_commit_binlog_pos.test b/mysql-test/t/group_commit_binlog_pos.test index ad1a0e4b508..f8c5e719f11 100644 --- a/mysql-test/t/group_commit_binlog_pos.test +++ b/mysql-test/t/group_commit_binlog_pos.test @@ -1,6 +1,8 @@ --source include/have_debug_sync.inc --source include/have_innodb.inc --source include/have_log_bin.inc +--source include/have_binlog_format_mixed_or_statement.inc + # Need DBUG to crash the server intentionally --source include/have_debug.inc # Don't test this under valgrind, memory leaks will occur as we crash @@ -54,14 +56,14 @@ reap; connection default; system echo 
wait-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; SET SESSION debug="+d,crash_dispatch_command_before"; ---error 2013 +--error 2006,2013 SELECT 1; connection con1; ---error 2013 +--error 2006,2013 reap; connection con3; ---error 2013 +--error 2006,2013 reap; system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; From aa23fe7dc1f8c215fb7cbff7be9e26255ad0ef93 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 26 Oct 2010 12:45:34 +0200 Subject: [PATCH 07/13] MWL#116: Replace atomic queue with simple mutex locking for non-transactional binlog writes. Also add missing destroy of pthread_cond. --- sql/log.cc | 69 ++++++++++++++++++++---------------------------------- sql/log.h | 6 ++--- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/sql/log.cc b/sql/log.cc index 85f47d3edf9..3ee848cde19 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -158,15 +158,15 @@ public: before_stmt_pos(MY_OFF_T_UNDEF), using_xa(0), commit_bin_log_file_pos(0) { trans_log.end_of_file= max_binlog_cache_size; - (void) my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW, - "LOCK_group_commit", MYF(0)); + (void) my_pthread_mutex_init(&LOCK_binlog_participant, MY_MUTEX_INIT_SLOW, + "LOCK_binlog_participant", MYF(0)); (void) pthread_cond_init(&COND_group_commit, 0); } ~binlog_trx_data() { DBUG_ASSERT(pending() == NULL); - (void) pthread_mutex_destroy(&LOCK_group_commit); + (void) pthread_mutex_destroy(&LOCK_binlog_participant); close_cached_file(&trans_log); } @@ -274,13 +274,13 @@ public: /* Flag set true when group commit for this transaction is finished; used with pthread_cond_wait() to wait until commit is done. - This flag is protected by LOCK_group_commit. + This flag is protected by LOCK_binlog_participant. */ bool done; /* Flag set if this transaction is the group commit leader that will handle the actual writing to the binlog. - This flag is protected by LOCK_group_commit. 
+ This flag is protected by LOCK_binlog_participant. */ bool group_commit_leader; /* @@ -297,7 +297,7 @@ public: Log_event *end_event; Log_event *incident_event; /* Mutex and condition for wakeup after group commit. */ - pthread_mutex_t LOCK_group_commit; + pthread_mutex_t LOCK_binlog_participant; pthread_cond_t COND_group_commit; /* Binlog position after current commit, available to storage engines during @@ -4994,8 +4994,13 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(binlog_trx_data *trx_data) the commit and wake them up. */ - pthread_mutex_lock(&trx_data->LOCK_group_commit); - const binlog_trx_data *orig_queue= atomic_enqueue_trx(trx_data); + pthread_mutex_lock(&trx_data->LOCK_binlog_participant); + + pthread_mutex_lock(&LOCK_queue); + binlog_trx_data *orig_queue= group_commit_queue; + trx_data->next= orig_queue; + group_commit_queue= trx_data; + pthread_mutex_unlock(&LOCK_queue); if (orig_queue != NULL) { @@ -5006,7 +5011,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(binlog_trx_data *trx_data) else { trx_data->group_commit_leader= TRUE; - pthread_mutex_unlock(&trx_data->LOCK_group_commit); + pthread_mutex_unlock(&trx_data->LOCK_binlog_participant); trx_group_commit_leader(NULL); } @@ -5020,19 +5025,19 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(binlog_trx_data *trx_data) this thread in the group commit once the log is obtained. So here we put ourself in the queue and wait to be signalled that the group commit is done. - Note that this function must be called with the trs_data->LOCK_group_commit + Note that this function must be called with trx_data->LOCK_binlog_participant locked; the mutex will be released before return. */ void MYSQL_BIN_LOG::trx_group_commit_participant(binlog_trx_data *trx_data) { - safe_mutex_assert_owner(&trx_data->LOCK_group_commit); + safe_mutex_assert_owner(&trx_data->LOCK_binlog_participant); /* Wait until trx_data.done == true and woken up by the leader. 
*/ while (!trx_data->done) pthread_cond_wait(&trx_data->COND_group_commit, - &trx_data->LOCK_group_commit); - pthread_mutex_unlock(&trx_data->LOCK_group_commit); + &trx_data->LOCK_binlog_participant); + pthread_mutex_unlock(&trx_data->LOCK_binlog_participant); } bool @@ -5131,7 +5136,10 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) it to the existing one. Note that there is no ordering defined between transactional and non-transactional commits. */ - binlog_trx_data *current= atomic_grab_trx_queue(); + pthread_mutex_lock(&LOCK_queue); + binlog_trx_data *current= group_commit_queue; + group_commit_queue= NULL; + pthread_mutex_unlock(&LOCK_queue); binlog_trx_data *xtra_queue= NULL; while (current) { @@ -5230,18 +5238,19 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) for (current= xtra_queue; current != NULL; current= current->next) { /* - Note that we need to take LOCK_group_commit even in the case of a leader! + Note that we need to take LOCK_binlog_participant even in the case of a + leader! Otherwise there is a race between setting and testing the group_commit_leader flag. 
+ */ - pthread_mutex_lock(&current->LOCK_group_commit); + pthread_mutex_lock(&current->LOCK_binlog_participant); if (!current->group_commit_leader) { current->done= true; pthread_cond_signal(&current->COND_group_commit); } - pthread_mutex_unlock(&current->LOCK_group_commit); + pthread_mutex_unlock(&current->LOCK_binlog_participant); } } @@ -5291,32 +5300,6 @@ MYSQL_BIN_LOG::write_transaction(binlog_trx_data *trx_data) return 0; } -binlog_trx_data * -MYSQL_BIN_LOG::atomic_enqueue_trx(binlog_trx_data *trx_data) -{ - my_atomic_rwlock_wrlock(&LOCK_queue); - trx_data->next= group_commit_queue; - while (!my_atomic_casptr((void **)(&group_commit_queue), - (void **)(&trx_data->next), - trx_data)) - ; - my_atomic_rwlock_wrunlock(&LOCK_queue); - return trx_data->next; -} - -binlog_trx_data * -MYSQL_BIN_LOG::atomic_grab_trx_queue() -{ - my_atomic_rwlock_wrlock(&LOCK_queue); - binlog_trx_data *queue= group_commit_queue; - while (!my_atomic_casptr((void **)(&group_commit_queue), - (void **)(&queue), - NULL)) - ; - my_atomic_rwlock_wrunlock(&LOCK_queue); - return queue; -} - /** Wait until we get a signal that the binary log has been updated. diff --git a/sql/log.h b/sql/log.h index f43ae433c8a..863cb188866 100644 --- a/sql/log.h +++ b/sql/log.h @@ -404,8 +404,8 @@ class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG pthread_mutex_t LOCK_index; pthread_mutex_t LOCK_prep_xids; /* - Mutex to protect the queue of transactions waiting to participate in group - commit. (Only used on platforms without native atomic operations). + Mutex to protect the queue of non-transactional binlog writes waiting to + participate in group commit. 
*/ pthread_mutex_t LOCK_queue; @@ -462,8 +462,6 @@ class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG bool write_transaction_to_binlog_events(binlog_trx_data *trx_data); void trx_group_commit_participant(binlog_trx_data *trx_data); void trx_group_commit_leader(TC_group_commit_entry *first); - binlog_trx_data *atomic_enqueue_trx(binlog_trx_data *trx_data); - binlog_trx_data *atomic_grab_trx_queue(); void mark_xid_done(); void mark_xids_active(uint xid_count); From b357fca4de07c932a32a00eb42073fdd589250fc Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 26 Oct 2010 12:58:17 +0200 Subject: [PATCH 08/13] MWL#116: better name for pthread_cond + add missing pthread_cond_destroy(). --- sql/log.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sql/log.cc b/sql/log.cc index 3ee848cde19..49688a8b3c6 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -160,12 +160,13 @@ public: trans_log.end_of_file= max_binlog_cache_size; (void) my_pthread_mutex_init(&LOCK_binlog_participant, MY_MUTEX_INIT_SLOW, "LOCK_binlog_participant", MYF(0)); - (void) pthread_cond_init(&COND_group_commit, 0); + (void) pthread_cond_init(&COND_binlog_participant, 0); } ~binlog_trx_data() { DBUG_ASSERT(pending() == NULL); + (void) pthread_cond_destroy(&COND_binlog_participant); (void) pthread_mutex_destroy(&LOCK_binlog_participant); close_cached_file(&trans_log); } @@ -298,7 +299,7 @@ public: Log_event *incident_event; /* Mutex and condition for wakeup after group commit. */ pthread_mutex_t LOCK_binlog_participant; - pthread_cond_t COND_group_commit; + pthread_cond_t COND_binlog_participant; /* Binlog position after current commit, available to storage engines during commit() and commit_ordered(). @@ -5035,7 +5036,7 @@ MYSQL_BIN_LOG::trx_group_commit_participant(binlog_trx_data *trx_data) /* Wait until trx_data.done == true and woken up by the leader. 
*/ while (!trx_data->done) - pthread_cond_wait(&trx_data->COND_group_commit, + pthread_cond_wait(&trx_data->COND_binlog_participant, &trx_data->LOCK_binlog_participant); pthread_mutex_unlock(&trx_data->LOCK_binlog_participant); } @@ -5248,7 +5249,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) if (!current->group_commit_leader) { current->done= true; - pthread_cond_signal(&current->COND_group_commit); + pthread_cond_signal(&current->COND_binlog_participant); } pthread_mutex_unlock(&current->LOCK_binlog_participant); } From b91ad17cea8572f7cd21fb97d0b0ddce9f8b3c46 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 28 Oct 2010 12:40:42 +0200 Subject: [PATCH 09/13] MWL#116: Code simplifications for TC_LOG_MMAP. Make TC_LOG_MMAP (and TC_LOG_DUMMY) derive directly from TC_LOG, avoiding the inheritance hierarchy TC_LOG_queued->TC_LOG_unordered. Put the wakeup facility for commit_ordered() calls into the THD class. Some renaming to get better names. --- sql/log.cc | 88 +++++++++++++++++++----------------------------- sql/log.h | 85 +++++++++++++++++++++++----------------------- sql/sql_class.cc | 27 ++++++++++++--- sql/sql_class.h | 22 +++++++++--- 4 files changed, 119 insertions(+), 103 deletions(-) diff --git a/sql/log.cc b/sql/log.cc index 49688a8b3c6..e29758b7f0b 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -5778,43 +5778,12 @@ TC_LOG_queued::reverse_queue(TC_LOG_queued::TC_group_commit_entry *queue) return prev; } -void -TC_LOG_queued::group_commit_wait_for_wakeup(TC_group_commit_entry *entry) -{ - THD *thd= entry->thd; - pthread_mutex_lock(&thd->LOCK_commit_ordered); - while (!entry->group_commit_ready) - pthread_cond_wait(&thd->COND_commit_ordered, - &thd->LOCK_commit_ordered); - pthread_mutex_unlock(&thd->LOCK_commit_ordered); -} - -void -TC_LOG_queued::group_commit_wakeup_other(TC_group_commit_entry *other) -{ - THD *thd= other->thd; - pthread_mutex_lock(&thd->LOCK_commit_ordered); - other->group_commit_ready= TRUE; - 
pthread_cond_signal(&thd->COND_commit_ordered); - pthread_mutex_unlock(&thd->LOCK_commit_ordered); -} - -TC_LOG_unordered::TC_LOG_unordered() : group_commit_queue_busy(0) -{ - pthread_cond_init(&COND_queue_busy, 0); -} - -TC_LOG_unordered::~TC_LOG_unordered() -{ - pthread_cond_destroy(&COND_queue_busy); -} - -int TC_LOG_unordered::log_and_order(THD *thd, my_xid xid, bool all, - bool need_prepare_ordered, - bool need_commit_ordered) +int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, + bool need_commit_ordered) { int cookie; - struct TC_group_commit_entry entry; + struct commit_entry entry; bool is_group_commit_leader; LINT_INIT(is_group_commit_leader); @@ -5828,18 +5797,18 @@ int TC_LOG_unordered::log_and_order(THD *thd, my_xid xid, bool all, Must put us in queue so we can run_commit_ordered() in same sequence as we did run_prepare_ordered(). */ + thd->clear_wakeup_ready(); entry.thd= thd; - entry.group_commit_ready= false; - TC_group_commit_entry *previous_queue= group_commit_queue; + commit_entry *previous_queue= commit_ordered_queue; entry.next= previous_queue; - group_commit_queue= &entry; + commit_ordered_queue= &entry; is_group_commit_leader= (previous_queue == NULL); } pthread_mutex_unlock(&LOCK_prepare_ordered); } if (xid) - cookie= log_xid(thd, xid); + cookie= log_one_transaction(xid); else cookie= 0; @@ -5859,24 +5828,32 @@ int TC_LOG_unordered::log_and_order(THD *thd, my_xid xid, bool all, { /* The first in queue starts the ball rolling. */ pthread_mutex_lock(&LOCK_prepare_ordered); - while (group_commit_queue_busy) + while (commit_ordered_queue_busy) pthread_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered); - TC_group_commit_entry *queue= group_commit_queue; - group_commit_queue= NULL; + commit_entry *queue= commit_ordered_queue; + commit_ordered_queue= NULL; /* Mark the queue busy while we bounce it from one thread to the next. 
*/ - group_commit_queue_busy= TRUE; + commit_ordered_queue_busy= true; pthread_mutex_unlock(&LOCK_prepare_ordered); - queue= reverse_queue(queue); - DBUG_ASSERT(queue == &entry && queue->thd == thd); + /* Reverse the queue list so we get correct order. */ + commit_entry *prev= NULL; + while (queue) + { + commit_entry *next= queue->next; + queue->next= prev; + prev= queue; + queue= next; + } + DBUG_ASSERT(prev == &entry && prev->thd == thd); } else { /* Not first in queue; just wait until previous thread wakes us up. */ - group_commit_wait_for_wakeup(&entry); + thd->wait_for_wakeup_ready(); } } @@ -5890,15 +5867,15 @@ int TC_LOG_unordered::log_and_order(THD *thd, my_xid xid, bool all, if (need_prepare_ordered) { - TC_group_commit_entry *next= entry.next; + commit_entry *next= entry.next; if (next) { - group_commit_wakeup_other(next); + next->thd->signal_wakeup_ready(); } else { pthread_mutex_lock(&LOCK_prepare_ordered); - group_commit_queue_busy= FALSE; + commit_ordered_queue_busy= false; pthread_cond_signal(&COND_queue_busy); pthread_mutex_unlock(&LOCK_prepare_ordered); } @@ -5940,9 +5917,9 @@ int TC_LOG_group_commit::log_and_order(THD *thd, my_xid xid, bool all, struct TC_group_commit_entry entry; bool is_group_commit_leader; + thd->clear_wakeup_ready(); entry.thd= thd; entry.all= all; - entry.group_commit_ready= false; entry.xid_error= 0; pthread_mutex_lock(&LOCK_prepare_ordered); @@ -6019,7 +5996,7 @@ int TC_LOG_group_commit::log_and_order(THD *thd, my_xid xid, bool all, */ TC_group_commit_entry *next= current->next; if (current != &entry) // Don't wake up ourself - group_commit_wakeup_other(current); + current->thd->signal_wakeup_ready(); current= next; } while (current != NULL); DEBUG_SYNC(thd, "commit_after_group_run_commit_ordered"); @@ -6029,7 +6006,7 @@ int TC_LOG_group_commit::log_and_order(THD *thd, my_xid xid, bool all, else { /* If not leader, just wait until leader wakes us up. 
*/ - group_commit_wait_for_wakeup(&entry); + thd->wait_for_wakeup_ready(); } /* @@ -6181,6 +6158,7 @@ int TC_LOG_MMAP::open(const char *opt_name) pthread_mutex_init(&LOCK_pool, MY_MUTEX_INIT_FAST); pthread_cond_init(&COND_active, 0); pthread_cond_init(&COND_pool, 0); + pthread_cond_init(&COND_queue_busy, 0); inited=6; @@ -6188,6 +6166,8 @@ int TC_LOG_MMAP::open(const char *opt_name) active=pages; pool=pages+1; pool_last=pages+npages-1; + commit_ordered_queue= NULL; + commit_ordered_queue_busy= false; return 0; @@ -6293,7 +6273,7 @@ int TC_LOG_MMAP::overflow() to the position in memory where xid was logged to. */ -int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid) +int TC_LOG_MMAP::log_one_transaction(my_xid xid) { int err; PAGE *p; @@ -6462,6 +6442,8 @@ void TC_LOG_MMAP::close() pthread_mutex_destroy(&LOCK_active); pthread_mutex_destroy(&LOCK_pool); pthread_cond_destroy(&COND_pool); + pthread_cond_destroy(&COND_active); + pthread_cond_destroy(&COND_queue_busy); case 5: data[0]='A'; // garble the first (signature) byte, in case my_delete fails case 4: diff --git a/sql/log.h b/sql/log.h index 863cb188866..37f5462f198 100644 --- a/sql/log.h +++ b/sql/log.h @@ -91,11 +91,6 @@ protected: THD *thd; /* This is the `all' parameter for ha_commit_trans() etc. */ bool all; - /* - Flag set true when it is time for this thread to wake up after group - commit. Used with THD::LOCK_commit_ordered and THD::COND_commit_ordered. - */ - bool group_commit_ready; /* Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and cookie. @@ -105,9 +100,6 @@ protected: TC_group_commit_entry * reverse_queue(TC_group_commit_entry *queue); - void group_commit_wait_for_wakeup(TC_group_commit_entry *entry); - void group_commit_wakeup_other(TC_group_commit_entry *other); - /* This is a queue of threads waiting for being allowed to commit. Access to the queue must be protected by LOCK_prepare_ordered. 
@@ -115,36 +107,6 @@ protected: TC_group_commit_entry *group_commit_queue; }; -class TC_LOG_unordered: public TC_LOG_queued -{ -public: - TC_LOG_unordered(); - ~TC_LOG_unordered(); - - int log_and_order(THD *thd, my_xid xid, bool all, - bool need_prepare_ordered, bool need_commit_ordered); - -protected: - virtual int log_xid(THD *thd, my_xid xid)=0; - -private: - /* - This flag and condition is used to reserve the queue while threads in it - each run the commit_ordered() methods one after the other. Only once the - last commit_ordered() in the queue is done can we start on a new queue - run. - - Since we start this process in the first thread in the queue and finish in - the last (and possibly different) thread, we need a condition variable for - this (we cannot unlock a mutex in a different thread than the one who - locked it). - - The condition is used together with the LOCK_prepare_ordered mutex. - */ - my_bool group_commit_queue_busy; - pthread_cond_t COND_queue_busy; -}; - class TC_LOG_group_commit: public TC_LOG_queued { public: @@ -206,18 +168,28 @@ private: pthread_mutex_t LOCK_group_commit; }; -class TC_LOG_DUMMY: public TC_LOG_unordered // use it to disable the logging +class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging { public: TC_LOG_DUMMY() {} int open(const char *opt_name) { return 0; } void close() { } - int log_xid(THD *thd, my_xid xid) { return 1; } + /* + TC_LOG_DUMMY is only used when there are <= 1 XA-capable engines, and we + only use internal XA during commit when >= 2 XA-capable engines + participate. 
+ */ + int log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, bool need_commit_ordered) + { + DBUG_ASSERT(0 /* Internal error - TC_LOG_DUMMY::log_and_order() called */); + return 1; + } void unlog(ulong cookie, my_xid xid) { } }; #ifdef HAVE_MMAP -class TC_LOG_MMAP: public TC_LOG_unordered +class TC_LOG_MMAP: public TC_LOG { public: // only to keep Sun Forte on sol9x86 happy typedef enum { @@ -238,6 +210,13 @@ class TC_LOG_MMAP: public TC_LOG_unordered pthread_cond_t cond; // to wait for a sync } PAGE; + /* List of THDs for which to invoke commit_ordered(), in order. */ + struct commit_entry + { + struct commit_entry *next; + THD *thd; + }; + char logname[FN_REFLEN]; File fd; my_off_t file_length; @@ -252,16 +231,38 @@ class TC_LOG_MMAP: public TC_LOG_unordered */ pthread_mutex_t LOCK_active, LOCK_pool, LOCK_sync; pthread_cond_t COND_pool, COND_active; + /* + Queue of threads that need to call commit_ordered(). + Access to this queue must be protected by LOCK_prepare_ordered. + */ + commit_entry *commit_ordered_queue; + /* + This flag and condition is used to reserve the queue while threads in it + each run the commit_ordered() methods one after the other. Only once the + last commit_ordered() in the queue is done can we start on a new queue + run. + + Since we start this process in the first thread in the queue and finish in + the last (and possibly different) thread, we need a condition variable for + this (we cannot unlock a mutex in a different thread than the one who + locked it). + + The condition is used together with the LOCK_prepare_ordered mutex. 
+ */ + my_bool commit_ordered_queue_busy; + pthread_cond_t COND_queue_busy; public: TC_LOG_MMAP(): inited(0) {} int open(const char *opt_name); void close(); - int log_xid(THD *thd, my_xid xid); + int log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, bool need_commit_ordered); void unlog(ulong cookie, my_xid xid); int recover(); private: + int log_one_transaction(my_xid xid); void get_active_from_pool(); int sync(); int overflow(); diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 7c8ed46347f..563617f0a5b 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -704,8 +704,8 @@ THD::THD() active_vio = 0; #endif pthread_mutex_init(&LOCK_thd_data, MY_MUTEX_INIT_FAST); - pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_FAST); - pthread_cond_init(&COND_commit_ordered, 0); + pthread_mutex_init(&LOCK_wakeup_ready, MY_MUTEX_INIT_FAST); + pthread_cond_init(&COND_wakeup_ready, 0); /* Variables with default values */ proc_info="login"; @@ -1037,8 +1037,8 @@ THD::~THD() free_root(&transaction.mem_root,MYF(0)); #endif mysys_var=0; // Safety (shouldn't be needed) - pthread_cond_destroy(&COND_commit_ordered); - pthread_mutex_destroy(&LOCK_commit_ordered); + pthread_cond_destroy(&COND_wakeup_ready); + pthread_mutex_destroy(&LOCK_wakeup_ready); pthread_mutex_destroy(&LOCK_thd_data); #ifndef DBUG_OFF dbug_sentry= THD_SENTRY_GONE; @@ -4009,6 +4009,25 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg, DBUG_RETURN(0); } +void +THD::wait_for_wakeup_ready() +{ + pthread_mutex_lock(&LOCK_wakeup_ready); + while (!wakeup_ready) + pthread_cond_wait(&COND_wakeup_ready, &LOCK_wakeup_ready); + pthread_mutex_unlock(&LOCK_wakeup_ready); +} + +void +THD::signal_wakeup_ready() +{ + pthread_mutex_lock(&LOCK_wakeup_ready); + wakeup_ready= true; + pthread_cond_signal(&COND_wakeup_ready); + pthread_mutex_unlock(&LOCK_wakeup_ready); +} + + bool Discrete_intervals_list::append(ulonglong start, ulonglong val, ulonglong incr) { diff --git 
a/sql/sql_class.h b/sql/sql_class.h index ed02504e3ab..618d6a6e089 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -1447,10 +1447,6 @@ public: /* container for handler's private per-connection data */ Ha_data ha_data[MAX_HA]; - /* Mutex and condition for waking up threads after group commit. */ - pthread_mutex_t LOCK_commit_ordered; - pthread_cond_t COND_commit_ordered; - #ifndef MYSQL_CLIENT int binlog_setup_trx_data(); @@ -2380,6 +2376,14 @@ public: LEX_STRING get_invoker_user() { return invoker_user; } LEX_STRING get_invoker_host() { return invoker_host; } bool has_invoker() { return invoker_user.length > 0; } + void clear_wakeup_ready() { wakeup_ready= false; } + /* + Sleep waiting for others to wake us up with signal_wakeup_ready(). + Must call clear_wakeup_ready() before waiting. + */ + void wait_for_wakeup_ready(); + /* Wake this thread up from wait_for_wakeup_ready(). */ + void signal_wakeup_ready(); private: /** The current internal error handler for this thread, or NULL. */ Internal_error_handler *m_internal_handler; @@ -2418,6 +2422,16 @@ private: */ LEX_STRING invoker_user; LEX_STRING invoker_host; + /* + Flag, mutex and condition for a thread to wait for a signal from another + thread. + + Currently used to wait for group commit to complete, can also be used for + other purposes. + */ + bool wakeup_ready; + pthread_mutex_t LOCK_wakeup_ready; + pthread_cond_t COND_wakeup_ready; }; From 5614ebe7ed8e56cbd345158395c1c1930b0752d1 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 29 Oct 2010 13:58:47 +0200 Subject: [PATCH 10/13] MWL#116: after-architecture-review code refactoring and cleanup. Remove the extra class hierarchy with classes TC_LOG_queued, TC_LOG_unordered, and TC_LOG_group_commit, folding the code into the TC_LOG_MMAP and TC_LOG_BINLOG classes. In particular TC_LOG_BINLOG is greatly simplified by this, unifying the code path for transactional and non-transactional commit. 
Remove unnecessary locking of LOCK_log in MYSQL_BIN_LOG::write() (backport of same fix from mysql-5.5). --- mysql-test/r/group_commit.result | 6 +- mysql-test/r/group_commit_binlog_pos.result | 2 +- mysql-test/t/group_commit.test | 6 +- mysql-test/t/group_commit_binlog_pos.test | 2 +- sql/log.cc | 664 ++++++-------------- sql/log.h | 143 +---- 6 files changed, 221 insertions(+), 602 deletions(-) diff --git a/mysql-test/r/group_commit.result b/mysql-test/r/group_commit.result index c7993227f8f..9e80dc6da6e 100644 --- a/mysql-test/r/group_commit.result +++ b/mysql-test/r/group_commit.result @@ -3,11 +3,11 @@ SELECT variable_value INTO @commits FROM information_schema.global_status WHERE variable_name = 'binlog_commits'; SELECT variable_value INTO @group_commits FROM information_schema.global_status WHERE variable_name = 'binlog_group_commits'; -SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued"; +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued"; INSERT INTO t1 VALUES ("con1"); set DEBUG_SYNC= "now WAIT_FOR group1_running"; SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2"; -SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed"; +SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed"; SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked"; INSERT INTO t1 VALUES ("con2"); SET DEBUG_SYNC= "now WAIT_FOR group2_con2"; @@ -25,7 +25,7 @@ SELECT * FROM t1 ORDER BY a; a con1 SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5"; -SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued"; +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued"; INSERT INTO t1 VALUES ("con5"); SET DEBUG_SYNC= "now WAIT_FOR con5_leader"; SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued"; diff --git 
a/mysql-test/r/group_commit_binlog_pos.result b/mysql-test/r/group_commit_binlog_pos.result index a0bb5ee2d8e..67ae30bbb79 100644 --- a/mysql-test/r/group_commit_binlog_pos.result +++ b/mysql-test/r/group_commit_binlog_pos.result @@ -1,6 +1,6 @@ CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; INSERT INTO t1 VALUES (0); -SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con1_waiting WAIT_FOR con3_queued"; +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued"; SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3"; INSERT INTO t1 VALUES (1); SET DEBUG_SYNC= "now WAIT_FOR con1_waiting"; diff --git a/mysql-test/t/group_commit.test b/mysql-test/t/group_commit.test index df4ea6654d4..7c87c166844 100644 --- a/mysql-test/t/group_commit.test +++ b/mysql-test/t/group_commit.test @@ -27,7 +27,7 @@ connect(con6,localhost,root,,); # group2 to queue up before finishing. connection con1; -SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued"; +SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group1_running WAIT_FOR group2_queued"; send INSERT INTO t1 VALUES ("con1"); # Make group2 (with three threads) queue up. 
@@ -37,7 +37,7 @@ send INSERT INTO t1 VALUES ("con1"); connection con2; set DEBUG_SYNC= "now WAIT_FOR group1_running"; SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2"; -SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed"; +SET DEBUG_SYNC= "commit_after_release_LOCK_log WAIT_FOR group3_committed"; SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked"; send INSERT INTO t1 VALUES ("con2"); connection con3; @@ -69,7 +69,7 @@ SELECT * FROM t1 ORDER BY a; connection con5; SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5"; -SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued"; +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con5_leader WAIT_FOR con6_queued"; send INSERT INTO t1 VALUES ("con5"); connection con6; diff --git a/mysql-test/t/group_commit_binlog_pos.test b/mysql-test/t/group_commit_binlog_pos.test index f8c5e719f11..00cf6ab685f 100644 --- a/mysql-test/t/group_commit_binlog_pos.test +++ b/mysql-test/t/group_commit_binlog_pos.test @@ -23,7 +23,7 @@ connect(con3,localhost,root,,); # Queue up three commits for group commit. 
connection con1; -SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con1_waiting WAIT_FOR con3_queued"; +SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued"; SET DEBUG_SYNC= "commit_loop_entry_commit_ordered SIGNAL con1_loop WAIT_FOR con1_loop_cont EXECUTE 3"; send INSERT INTO t1 VALUES (1); diff --git a/sql/log.cc b/sql/log.cc index e29758b7f0b..f2884c1ad38 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -155,19 +155,14 @@ class binlog_trx_data { public: binlog_trx_data() : at_least_one_stmt_committed(0), incident(FALSE), m_pending(0), - before_stmt_pos(MY_OFF_T_UNDEF), using_xa(0), commit_bin_log_file_pos(0) + before_stmt_pos(MY_OFF_T_UNDEF), commit_bin_log_file_pos(0), using_xa(0) { trans_log.end_of_file= max_binlog_cache_size; - (void) my_pthread_mutex_init(&LOCK_binlog_participant, MY_MUTEX_INIT_SLOW, - "LOCK_binlog_participant", MYF(0)); - (void) pthread_cond_init(&COND_binlog_participant, 0); } ~binlog_trx_data() { DBUG_ASSERT(pending() == NULL); - (void) pthread_cond_destroy(&COND_binlog_participant); - (void) pthread_mutex_destroy(&LOCK_binlog_participant); close_cached_file(&trans_log); } @@ -265,46 +260,17 @@ public: Binlog position before the start of the current statement. */ my_off_t before_stmt_pos; + /* + Binlog position after current commit, available to storage engines during + commit_ordered() and commit(). + */ + ulonglong commit_bin_log_file_pos; - /* 0 or error when writing to binlog; set during group commit. */ - int error; - /* If error != 0, value of errno (for my_error() reporting). */ - int commit_errno; - /* Link for queueing transactions up for group commit to binlog. */ - binlog_trx_data *next; - /* - Flag set true when group commit for this transaction is finished; used - with pthread_cond_wait() to wait until commit is done. - This flag is protected by LOCK_binlog_participant. 
- */ - bool done; - /* - Flag set if this transaction is the group commit leader that will handle - the actual writing to the binlog. - This flag is protected by LOCK_binlog_participant. - */ - bool group_commit_leader; /* Flag set true if this transaction is committed with log_xid() as part of XA, false if not. */ bool using_xa; - /* - Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be - written during group commit. The incident_event is only valid if - has_incident() is true. - */ - Log_event *begin_event; - Log_event *end_event; - Log_event *incident_event; - /* Mutex and condition for wakeup after group commit. */ - pthread_mutex_t LOCK_binlog_participant; - pthread_cond_t COND_binlog_participant; - /* - Binlog position after current commit, available to storage engines during - commit() and commit_ordered(). - */ - ulonglong commit_bin_log_file_pos; }; handlerton *binlog_hton; @@ -1441,30 +1407,6 @@ static int binlog_close_connection(handlerton *hton, THD *thd) return 0; } -/* Helper functions for binlog_flush_trx_cache(). */ -static int -binlog_flush_trx_cache_prepare(THD *thd) -{ - if (thd->binlog_flush_pending_rows_event(TRUE)) - return 1; - return 0; -} - -static void -binlog_flush_trx_cache_finish(THD *thd, binlog_trx_data *trx_data) -{ - IO_CACHE *trans_log= &trx_data->trans_log; - - trx_data->reset(); - - statistic_increment(binlog_cache_use, &LOCK_status); - if (trans_log->disk_writes != 0) - { - statistic_increment(binlog_cache_disk_use, &LOCK_status); - trans_log->disk_writes= 0; - } -} - /* End a transaction, writing events to the binary log. 
@@ -1487,14 +1429,15 @@ binlog_flush_trx_cache_finish(THD *thd, binlog_trx_data *trx_data) */ static int binlog_flush_trx_cache(THD *thd, binlog_trx_data *trx_data, - Log_event *end_ev) + Log_event *end_ev, bool all) { DBUG_ENTER("binlog_flush_trx_cache"); + IO_CACHE *trans_log= &trx_data->trans_log; DBUG_PRINT("info", ("thd->options={ %s%s}", FLAGSTR(thd->options, OPTION_NOT_AUTOCOMMIT), FLAGSTR(thd->options, OPTION_BEGIN))); - if (binlog_flush_trx_cache_prepare(thd)) + if (thd->binlog_flush_pending_rows_event(TRUE)) DBUG_RETURN(1); /* @@ -1507,9 +1450,17 @@ binlog_flush_trx_cache(THD *thd, binlog_trx_data *trx_data, were, we would have to ensure that we're not ending a statement inside a stored function. */ - int error= mysql_bin_log.write_transaction_to_binlog(thd, trx_data, end_ev); + int error= mysql_bin_log.write_transaction_to_binlog(thd, trx_data, + end_ev, all); - binlog_flush_trx_cache_finish(thd, trx_data); + trx_data->reset(); + + statistic_increment(binlog_cache_use, &LOCK_status); + if (trans_log->disk_writes != 0) + { + statistic_increment(binlog_cache_disk_use, &LOCK_status); + trans_log->disk_writes= 0; + } DBUG_ASSERT(thd->binlog_get_pending_rows_event() == NULL); DBUG_RETURN(error); @@ -1578,51 +1529,11 @@ static LEX_STRING const write_error_msg= static int binlog_prepare(handlerton *hton, THD *thd, bool all) { /* - If this prepare is for a single statement in the middle of a transactions, - not the actual transaction commit, then we do nothing. The real work is - only done later, in the prepare for making persistent changes. + do nothing. + just pretend we can do 2pc, so that MySQL won't + switch to 1pc. 
+ real work will be done in MYSQL_BIN_LOG::log_and_order() */ - if (!all && (thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT))) - return 0; - - binlog_trx_data *trx_data= - (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton); - - trx_data->using_xa= TRUE; - - if (binlog_flush_trx_cache_prepare(thd)) - return 1; - - my_xid xid= thd->transaction.xid_state.xid.get_my_xid(); - if (!xid) - { - /* Skip logging this transaction, marked by setting end_event to NULL. */ - trx_data->end_event= NULL; - return 0; - } - - /* - Allocate the extra events that will be logged to the binlog in binlog group - commit. Use placement new to allocate them on the THD memroot, as they need - to remain live until log_xid() returns. - */ - size_t needed_size= sizeof(Query_log_event) + sizeof(Xid_log_event); - if (trx_data->has_incident()) - needed_size+= sizeof(Incident_log_event); - uchar *mem= (uchar *)thd->alloc(needed_size); - if (!mem) - return 1; - - trx_data->begin_event= new ((void *)mem) - Query_log_event(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0); - mem+= sizeof(Query_log_event); - - trx_data->end_event= new ((void *)mem) Xid_log_event(thd, xid); - - if (trx_data->has_incident()) - trx_data->incident_event= new ((void *)(mem + sizeof(Xid_log_event))) - Incident_log_event(thd, INCIDENT_LOST_EVENTS, write_error_msg); - return 0; } @@ -1646,11 +1557,11 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all) binlog_trx_data *const trx_data= (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton); - if (trx_data->using_xa) + if (trx_data->empty()) { // we're here because trans_log was flushed in MYSQL_BIN_LOG::log_xid() - binlog_flush_trx_cache_finish(thd, trx_data); - DBUG_RETURN(error); + trx_data->reset(); + DBUG_RETURN(0); } /* @@ -1673,7 +1584,7 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all) !stmt_has_updated_trans_table(thd) && stmt_has_updated_non_trans_table(thd))) { Query_log_event end_ev(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0); - 
error= binlog_flush_trx_cache(thd, trx_data, &end_ev); + error= binlog_flush_trx_cache(thd, trx_data, &end_ev, all); } trx_data->at_least_one_stmt_committed = my_b_tell(&trx_data->trans_log) > 0; @@ -1757,7 +1668,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all) thd->current_stmt_binlog_row_based)) { Query_log_event end_ev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, TRUE, 0); - error= binlog_flush_trx_cache(thd, trx_data, &end_ev); + error= binlog_flush_trx_cache(thd, trx_data, &end_ev, all); } /* Otherwise, we simply truncate the cache as there is no change on @@ -2599,6 +2510,7 @@ const char *MYSQL_LOG::generate_name(const char *log_name, MYSQL_BIN_LOG::MYSQL_BIN_LOG() :bytes_written(0), prepared_xids(0), file_id(1), open_count(1), need_start_event(TRUE), + group_commit_queue(0), num_commits(0), num_group_commits(0), is_relay_log(0), description_event_for_exec(0), description_event_for_queue(0) { @@ -2626,7 +2538,6 @@ void MYSQL_BIN_LOG::cleanup() delete description_event_for_exec; (void) pthread_mutex_destroy(&LOCK_log); (void) pthread_mutex_destroy(&LOCK_index); - (void) pthread_mutex_destroy(&LOCK_queue); (void) pthread_cond_destroy(&update_cond); } DBUG_VOID_RETURN; @@ -2655,8 +2566,6 @@ void MYSQL_BIN_LOG::init_pthread_objects() */ (void) my_pthread_mutex_init(&LOCK_index, MY_MUTEX_INIT_SLOW, "LOCK_index", MYF_NO_DEADLOCK_DETECTION); - (void) my_pthread_mutex_init(&LOCK_queue, MY_MUTEX_INIT_FAST, "LOCK_queue", - MYF(0)); (void) pthread_cond_init(&update_cond, 0); } @@ -4461,11 +4370,6 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) } /* - Flush the pending rows event to the transaction cache or to the - log file. Since this function potentially aquire the LOCK_log - mutex, we do this before aquiring the LOCK_log mutex in this - function. - We only end the statement if we are in a top-level statement. If we are inside a stored function, we do not end the statement since this will close all tables on the slave. 
@@ -4475,8 +4379,6 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) if (thd->binlog_flush_pending_rows_event(end_stmt)) DBUG_RETURN(error); - pthread_mutex_lock(&LOCK_log); - /* In most cases this is only called if 'is_open()' is true; in fact this is mostly called if is_open() *was* true a few instructions before, but it @@ -4497,7 +4399,6 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) thd->lex->sql_command != SQLCOM_SAVEPOINT && !binlog_filter->db_ok(local_db))) { - VOID(pthread_mutex_unlock(&LOCK_log)); DBUG_RETURN(0); } #endif /* HAVE_REPLICATION */ @@ -4539,15 +4440,11 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) thd->binlog_start_trans_and_stmt(); file= trans_log; } - /* - TODO as Mats suggested, for all the cases above where we write to - trans_log, it sounds unnecessary to lock LOCK_log. We should rather - test first if we want to write to trans_log, and if not, lock - LOCK_log. - */ } #endif /* USING_TRANSACTIONS */ DBUG_PRINT("info",("event type: %d",event_info->get_type_code())); + if (file == &log_file) + pthread_mutex_lock(&LOCK_log); /* No check for auto events flag here - this write method should @@ -4572,7 +4469,7 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT, thd->first_successful_insert_id_in_prev_stmt_for_binlog); if (e.write(file)) - goto err; + goto err_unlock; } if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0) { @@ -4583,13 +4480,13 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) thd->auto_inc_intervals_in_cur_stmt_for_binlog. 
minimum()); if (e.write(file)) - goto err; + goto err_unlock; } if (thd->rand_used) { Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2); if (e.write(file)) - goto err; + goto err_unlock; } if (thd->user_var_events.elements) { @@ -4604,7 +4501,7 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) user_var_event->type, user_var_event->charset_number); if (e.write(file)) - goto err; + goto err_unlock; } } } @@ -4616,23 +4513,26 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info) if (event_info->write(file) || DBUG_EVALUATE_IF("injecting_fault_writing", 1, 0)) - goto err; + goto err_unlock; if (file == &log_file) // we are writing to the real log (disk) { if (flush_and_sync()) - goto err; + goto err_unlock; signal_update(); rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED); } error=0; +err_unlock: + if (file == &log_file) + pthread_mutex_unlock(&LOCK_log); + err: if (error) set_write_error(thd); } - pthread_mutex_unlock(&LOCK_log); DBUG_RETURN(error); } @@ -4957,10 +4857,16 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd) bool MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data, - Log_event *end_ev) + Log_event *end_ev, bool all) { + group_commit_entry entry; DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog"); + entry.thd= thd; + entry.trx_data= trx_data; + entry.error= 0; + entry.all= all; + /* Create the necessary events here, where we have the correct THD (and thread context). @@ -4969,23 +4875,23 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data, thread. 
*/ Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0); - trx_data->begin_event= &qinfo; - trx_data->end_event= end_ev; + entry.begin_event= &qinfo; + entry.end_event= end_ev; if (trx_data->has_incident()) { Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg); - trx_data->incident_event= &inc_ev; - DBUG_RETURN(write_transaction_to_binlog_events(trx_data)); + entry.incident_event= &inc_ev; + DBUG_RETURN(write_transaction_to_binlog_events(&entry)); } else { - trx_data->incident_event= NULL; - DBUG_RETURN(write_transaction_to_binlog_events(trx_data)); + entry.incident_event= NULL; + DBUG_RETURN(write_transaction_to_binlog_events(&entry)); } } bool -MYSQL_BIN_LOG::write_transaction_to_binlog_events(binlog_trx_data *trx_data) +MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry) { /* To facilitate group commit for the binlog, we first queue up ourselves in @@ -4995,91 +4901,61 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(binlog_trx_data *trx_data) the commit and wake them up. */ - pthread_mutex_lock(&trx_data->LOCK_binlog_participant); + entry->thd->clear_wakeup_ready(); + pthread_mutex_lock(&LOCK_prepare_ordered); + group_commit_entry *orig_queue= group_commit_queue; + entry->next= orig_queue; + group_commit_queue= entry; - pthread_mutex_lock(&LOCK_queue); - binlog_trx_data *orig_queue= group_commit_queue; - trx_data->next= orig_queue; - group_commit_queue= trx_data; - pthread_mutex_unlock(&LOCK_queue); + if (entry->trx_data->using_xa) + { + DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered"); + run_prepare_ordered(entry->thd, entry->all); + DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered"); + } + pthread_mutex_unlock(&LOCK_prepare_ordered); + /* + The first in the queue handle group commit for all; the others just wait + to be signalled when group commit is done. 
+ */ if (orig_queue != NULL) - { - trx_data->group_commit_leader= FALSE; - trx_data->done= FALSE; - trx_group_commit_participant(trx_data); - } + entry->thd->wait_for_wakeup_ready(); else + trx_group_commit_leader(entry); + + if (!entry->error) + return 0; + + switch (entry->error) { - trx_data->group_commit_leader= TRUE; - pthread_mutex_unlock(&trx_data->LOCK_binlog_participant); - trx_group_commit_leader(NULL); - } - - return trx_group_commit_finish(trx_data); -} - -/* - Participate as secondary transaction in group commit. - - Another thread is already waiting to obtain the LOCK_log, and should include - this thread in the group commit once the log is obtained. So here we put - ourself in the queue and wait to be signalled that the group commit is done. - - Note that this function must be called with trx_data->LOCK_binlog_participant - locked; the mutex will be released before return. -*/ -void -MYSQL_BIN_LOG::trx_group_commit_participant(binlog_trx_data *trx_data) -{ - safe_mutex_assert_owner(&trx_data->LOCK_binlog_participant); - - /* Wait until trx_data.done == true and woken up by the leader. */ - while (!trx_data->done) - pthread_cond_wait(&trx_data->COND_binlog_participant, - &trx_data->LOCK_binlog_participant); - pthread_mutex_unlock(&trx_data->LOCK_binlog_participant); -} - -bool -MYSQL_BIN_LOG::trx_group_commit_finish(binlog_trx_data *trx_data) -{ - DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_finish"); - DBUG_PRINT("info", ("trx_data->error=%d\n", trx_data->error)); - if (trx_data->error) - { - switch (trx_data->error) - { - case ER_ERROR_ON_WRITE: - my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, trx_data->commit_errno); - break; - case ER_ERROR_ON_READ: - my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH), - trx_data->trans_log.file_name, trx_data->commit_errno); - break; - default: - /* - There are not (and should not be) any errors thrown not covered above. 
- But just in case one is added later without updating the above switch - statement, include a catch-all. - */ - my_printf_error(trx_data->error, - "Error writing transaction to binary log: %d", - MYF(ME_NOREFRESH), trx_data->error); - } - + case ER_ERROR_ON_WRITE: + my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno); + break; + case ER_ERROR_ON_READ: + my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH), + entry->trx_data->trans_log.file_name, entry->commit_errno); + break; + default: /* - Since we return error, this transaction XID will not be committed, so - we need to mark it as not needed for recovery (unlog() is not called - for a transaction if log_xid() fails). - */ - if (trx_data->end_event->get_type_code() == XID_EVENT) - mark_xid_done(); - - DBUG_RETURN(1); + There are not (and should not be) any errors thrown not covered above. + But just in case one is added later without updating the above switch + statement, include a catch-all. + */ + my_printf_error(entry->error, + "Error writing transaction to binary log: %d", + MYF(ME_NOREFRESH), entry->error); } - DBUG_RETURN(0); + /* + Since we return error, this transaction XID will not be committed, so + we need to mark it as not needed for recovery (unlog() is not called + for a transaction if log_xid() fails). + */ + if (entry->trx_data->using_xa) + mark_xid_done(); + + return 1; } /* @@ -5093,69 +4969,36 @@ MYSQL_BIN_LOG::trx_group_commit_finish(binlog_trx_data *trx_data) */ void -MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) +MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) { + DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader"); uint xid_count= 0; uint write_count= 0; - /* First, put anything from group_log_xid into the queue. 
*/ - binlog_trx_data *full_queue= NULL; - binlog_trx_data **next_ptr= &full_queue; - for (TC_group_commit_entry *entry= first; entry; entry= entry->next) - { - binlog_trx_data *const trx_data= - (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton); - - /* Skip log_xid for transactions without xid, marked by NULL end_event. */ - if (!trx_data->end_event) - continue; - - trx_data->error= 0; - *next_ptr= trx_data; - next_ptr= &(trx_data->next); - } - /* - Next, lock the LOCK_log(), and once we get it, add any additional writes + Lock the LOCK_log(), and once we get it, collect any additional writes that queued up while we were waiting. - - Note that if some writer not going through log_xid() comes in and gets the - LOCK_log before us, they will not be able to include us in their group - commit (and they are not able to handle ensuring same commit order between - us and participating transactional storage engines anyway). - - On the other hand, when we get the LOCK_log, we will be able to include - any non-trasactional writes that queued up in our group commit. This - should hopefully not be too big of a problem, as group commit is most - important for the transactional case anyway when durability (fsync) is - enabled. */ VOID(pthread_mutex_lock(&LOCK_log)); + DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log"); - /* - As the queue is in reverse order of entering, reverse the queue as we add - it to the existing one. Note that there is no ordering defined between - transactional and non-transactional commits. - */ - pthread_mutex_lock(&LOCK_queue); - binlog_trx_data *current= group_commit_queue; + pthread_mutex_lock(&LOCK_prepare_ordered); + group_commit_entry *current= group_commit_queue; group_commit_queue= NULL; - pthread_mutex_unlock(&LOCK_queue); - binlog_trx_data *xtra_queue= NULL; + pthread_mutex_unlock(&LOCK_prepare_ordered); + + /* As the queue is in reverse order of entering, reverse it. 
*/ + group_commit_entry *queue= NULL; while (current) { - current->error= 0; - binlog_trx_data *next= current->next; - current->next= xtra_queue; - xtra_queue= current; + group_commit_entry *next= current->next; + current->next= queue; + queue= current; current= next; } - *next_ptr= xtra_queue; + DBUG_ASSERT(leader == queue /* the leader should be first in queue */); - /* - Now we have in full_queue the list of transactions to be committed in - order. - */ + /* Now we have in queue the list of transactions to be committed in order. */ DBUG_ASSERT(is_open()); if (likely(is_open())) // Should always be true { @@ -5169,9 +5012,14 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) current->error and let the thread do the error reporting itself once we wake it up. */ - for (current= full_queue; current != NULL; current= current->next) + for (current= queue; current != NULL; current= current->next) { - IO_CACHE *cache= ¤t->trans_log; + binlog_trx_data *trx_data= current->trx_data; + IO_CACHE *cache= &trx_data->trans_log; + + /* Skip log_xid for transactions without xid, marked by NULL end_event. 
*/ + if (!current->end_event) + continue; /* We only bother to write to the binary log if there is anything @@ -5186,9 +5034,9 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) write_count++; } - current->commit_bin_log_file_pos= + trx_data->commit_bin_log_file_pos= log_file.pos_in_file + (log_file.write_pos - log_file.write_buffer); - if (current->end_event->get_type_code() == XID_EVENT) + if (trx_data->using_xa) xid_count++; } @@ -5196,7 +5044,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) { if (flush_and_sync()) { - for (current= full_queue; current != NULL; current= current->next) + for (current= queue; current != NULL; current= current->next) { if (!current->error) { @@ -5213,7 +5061,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) /* if any commit_events are Xid_log_event, increase the number of - prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated + prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated if there're prepared xids in it - see the comment in new_file() for an explanation. If no Xid_log_events (then it's all Query_log_event) rotate binlog, @@ -5227,37 +5075,49 @@ MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first) rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED); } - VOID(pthread_mutex_unlock(&LOCK_log)); + DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered"); + pthread_mutex_lock(&LOCK_commit_ordered); + /* + We cannot unlock LOCK_log until we have locked LOCK_commit_ordered; + otherwise scheduling could allow the next group commit to run ahead of us, + messing up the order of commit_ordered() calls. But as soon as + LOCK_commit_ordered is obtained, we can let the next group commit start. + */ + pthread_mutex_unlock(&LOCK_log); + DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log"); + ++num_group_commits; /* - Signal those that are not part of group_log_xid, and are not group leaders - running the queue. 
- - Since a group leader runs the queue itself if a group_log_xid does not get - to do it forst, such leader threads do not need wait or wakeup. + Wakeup each participant waiting for our group commit, first calling the + commit_ordered() methods for any transactions doing 2-phase commit. */ - for (current= xtra_queue; current != NULL; current= current->next) + current= queue; + while (current != NULL) { - /* - Note that we need to take LOCK_binlog_participant even in the case of a - leader! + DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered"); + ++num_commits; + if (current->trx_data->using_xa && !current->error) + run_commit_ordered(current->thd, current->all); - Otherwise there is a race between setting and testing the - group_commit_leader flag. + /* + Careful not to access current->next after waking up the other thread! As + it may change immediately after wakeup. */ - pthread_mutex_lock(¤t->LOCK_binlog_participant); - if (!current->group_commit_leader) - { - current->done= true; - pthread_cond_signal(¤t->COND_binlog_participant); - } - pthread_mutex_unlock(¤t->LOCK_binlog_participant); + group_commit_entry *next= current->next; + if (current != leader) // Don't wake up ourself + current->thd->signal_wakeup_ready(); + current= next; } + DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered"); + pthread_mutex_unlock(&LOCK_commit_ordered); + + DBUG_VOID_RETURN; } int -MYSQL_BIN_LOG::write_transaction(binlog_trx_data *trx_data) +MYSQL_BIN_LOG::write_transaction(group_commit_entry *entry) { + binlog_trx_data *trx_data= entry->trx_data; IO_CACHE *cache= &trx_data->trans_log; /* Log "BEGIN" at the beginning of every transaction. Here, a transaction is @@ -5272,7 +5132,7 @@ MYSQL_BIN_LOG::write_transaction(binlog_trx_data *trx_data) in wrong positions being shown to the user, MASTER_POS_WAIT undue waiting etc. 
*/ - if (trx_data->begin_event->write(&log_file)) + if (entry->begin_event->write(&log_file)) return ER_ERROR_ON_WRITE; DBUG_EXECUTE_IF("crash_before_writing_xid", @@ -5289,10 +5149,10 @@ MYSQL_BIN_LOG::write_transaction(binlog_trx_data *trx_data) if (write_cache(cache)) return ER_ERROR_ON_WRITE; - if (trx_data->end_event->write(&log_file)) + if (entry->end_event->write(&log_file)) return ER_ERROR_ON_WRITE; - if (trx_data->has_incident() && trx_data->incident_event->write(&log_file)) + if (entry->incident_event && entry->incident_event->write(&log_file)) return ER_ERROR_ON_WRITE; if (cache->error) // Error on read @@ -5754,30 +5614,6 @@ TC_LOG::run_commit_ordered(THD *thd, bool all) } } -TC_LOG_queued::TC_LOG_queued() : group_commit_queue(NULL) -{ -} - -TC_LOG_queued::~TC_LOG_queued() -{ -} - -TC_LOG_queued::TC_group_commit_entry * -TC_LOG_queued::reverse_queue(TC_LOG_queued::TC_group_commit_entry *queue) -{ - TC_group_commit_entry *entry= queue; - TC_group_commit_entry *prev= NULL; - while (entry) - { - TC_group_commit_entry *next= entry->next; - entry->next= prev; - prev= entry; - entry= next; - } - - return prev; -} - int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all, bool need_prepare_ordered, bool need_commit_ordered) @@ -5886,142 +5722,6 @@ int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all, } -TC_LOG_group_commit::TC_LOG_group_commit() - : num_commits(0), num_group_commits(0) -{ -} - -TC_LOG_group_commit::~TC_LOG_group_commit() -{ -} - -void -TC_LOG_group_commit::init() -{ - my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW, - "LOCK_group_commit", MYF(0)); -} - -void -TC_LOG_group_commit::deinit() -{ - pthread_mutex_destroy(&LOCK_group_commit); -} - -int TC_LOG_group_commit::log_and_order(THD *thd, my_xid xid, bool all, - bool need_prepare_ordered, - bool need_commit_ordered) -{ - IF_DBUG(int err;) - int cookie; - struct TC_group_commit_entry entry; - bool is_group_commit_leader; - - thd->clear_wakeup_ready(); - entry.thd= 
thd; - entry.all= all; - entry.xid_error= 0; - - pthread_mutex_lock(&LOCK_prepare_ordered); - TC_group_commit_entry *previous_queue= group_commit_queue; - entry.next= previous_queue; - group_commit_queue= &entry; - - DEBUG_SYNC(thd, "commit_before_prepare_ordered"); - run_prepare_ordered(thd, all); - DEBUG_SYNC(thd, "commit_after_prepare_ordered"); - pthread_mutex_unlock(&LOCK_prepare_ordered); - - is_group_commit_leader= (previous_queue == NULL); - - if (is_group_commit_leader) - { - TC_group_commit_entry *current; - - pthread_mutex_lock(&LOCK_group_commit); - DEBUG_SYNC(thd, "commit_after_get_LOCK_group_commit"); - - pthread_mutex_lock(&LOCK_prepare_ordered); - TC_group_commit_entry *queue= group_commit_queue; - group_commit_queue= NULL; - pthread_mutex_unlock(&LOCK_prepare_ordered); - - /* - Since we enqueue at the head, the queue is actually in reverse order. - So reverse it back into correct commit order before returning. - */ - queue= reverse_queue(queue); - - /* The first in the queue is the leader. */ - DBUG_ASSERT(queue == &entry && queue->thd == thd); - - DEBUG_SYNC(thd, "commit_before_group_log_xid"); - /* This will set individual error codes in each thd->xid_error. */ - group_log_xid(queue); - DEBUG_SYNC(thd, "commit_after_group_log_xid"); - - /* - Call commit_ordered methods for all transactions in the queue - (that did not get an error in group_log_xid()). - - We do this under an additional global LOCK_commit_ordered; this is - so that transactions that do not need 2-phase commit do not have - to wait for the potentially long duration of LOCK_group_commit. - */ - current= queue; - - DEBUG_SYNC(thd, "commit_before_get_LOCK_commit_ordered"); - pthread_mutex_lock(&LOCK_commit_ordered); - /* - We cannot unlock LOCK_group_commit until we have locked - LOCK_commit_ordered; otherwise scheduling could allow the next - group commit to run ahead of us, messing up the order of - commit_ordered() calls. 
But as soon as LOCK_commit_ordered is - obtained, we can let the next group commit start. - */ - pthread_mutex_unlock(&LOCK_group_commit); - DEBUG_SYNC(thd, "commit_after_release_LOCK_group_commit"); - - ++num_group_commits; - do - { - DEBUG_SYNC(thd, "commit_loop_entry_commit_ordered"); - ++num_commits; - if (!current->xid_error) - run_commit_ordered(current->thd, current->all); - - /* - Careful not to access current->next_commit_ordered after waking up - the other thread! As it may change immediately after wakeup. - */ - TC_group_commit_entry *next= current->next; - if (current != &entry) // Don't wake up ourself - current->thd->signal_wakeup_ready(); - current= next; - } while (current != NULL); - DEBUG_SYNC(thd, "commit_after_group_run_commit_ordered"); - - pthread_mutex_unlock(&LOCK_commit_ordered); - } - else - { - /* If not leader, just wait until leader wakes us up. */ - thd->wait_for_wakeup_ready(); - } - - /* - Now that we're back in our own thread context, do any delayed processing - and error reporting. - */ - IF_DBUG(err= entry.xid_error;) - cookie= xid_log_after(&entry); - /* The cookie must be non-zero in the non-error case. */ - DBUG_ASSERT(err || cookie); - - return cookie; -} - - /********* transaction coordinator log for 2pc - mmap() based solution *******/ /* @@ -6567,7 +6267,6 @@ int TC_LOG_BINLOG::open(const char *opt_name) DBUG_ASSERT(total_ha_2pc > 1); DBUG_ASSERT(opt_name && opt_name[0]); - TC_LOG_group_commit::init(); pthread_mutex_init(&LOCK_prep_xids, MY_MUTEX_INIT_FAST); pthread_cond_init (&COND_prep_xids, 0); @@ -6651,36 +6350,33 @@ void TC_LOG_BINLOG::close() DBUG_ASSERT(prepared_xids==0); pthread_mutex_destroy(&LOCK_prep_xids); pthread_cond_destroy (&COND_prep_xids); - TC_LOG_group_commit::deinit(); } /* Do a binlog log_xid() for a group of transactions, linked through thd->next_commit_ordered. 
*/ -void -TC_LOG_BINLOG::group_log_xid(TC_group_commit_entry *first) -{ - DBUG_ENTER("TC_LOG_BINLOG::group_log_xid"); - trx_group_commit_leader(first); - for (TC_group_commit_entry *entry= first; entry; entry= entry->next) - { - binlog_trx_data *const trx_data= - (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton); - entry->xid_error= trx_data->error; - } - DBUG_VOID_RETURN; -} - int -TC_LOG_BINLOG::xid_log_after(TC_group_commit_entry *entry) +TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered __attribute__((unused)), + bool need_commit_ordered __attribute__((unused))) { + int err; + DBUG_ENTER("TC_LOG_BINLOG::log_and_order"); + binlog_trx_data *const trx_data= - (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton); - if (trx_group_commit_finish(trx_data)) - return 0; // Returning zero cookie signals error + (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton); + + trx_data->using_xa= TRUE; + if (xid) + { + Xid_log_event xid_event(thd, xid); + err= binlog_flush_trx_cache(thd, trx_data, &xid_event, all); + } else - return 1; + err= binlog_flush_trx_cache(thd, trx_data, NULL, all); + + DBUG_RETURN(!err); } /* diff --git a/sql/log.h b/sql/log.h index 37f5462f198..c5a2a72647b 100644 --- a/sql/log.h +++ b/sql/log.h @@ -73,101 +73,6 @@ extern pthread_mutex_t LOCK_commit_ordered; extern void TC_init(); extern void TC_destroy(); -/* - Base class for two TC implementations TC_LOG_unordered and - TC_LOG_group_commit that both use a queue of threads waiting for group - commit. -*/ -class TC_LOG_queued: public TC_LOG -{ -protected: - TC_LOG_queued(); - ~TC_LOG_queued(); - - /* Structure used to link list of THDs waiting for group commit. */ - struct TC_group_commit_entry - { - struct TC_group_commit_entry *next; - THD *thd; - /* This is the `all' parameter for ha_commit_trans() etc. */ - bool all; - /* - Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and - cookie. 
- */ - int xid_error; - }; - - TC_group_commit_entry * reverse_queue(TC_group_commit_entry *queue); - - /* - This is a queue of threads waiting for being allowed to commit. - Access to the queue must be protected by LOCK_prepare_ordered. - */ - TC_group_commit_entry *group_commit_queue; -}; - -class TC_LOG_group_commit: public TC_LOG_queued -{ -public: - TC_LOG_group_commit(); - ~TC_LOG_group_commit(); - void init(); - void deinit(); - - int log_and_order(THD *thd, my_xid xid, bool all, - bool need_prepare_ordered, bool need_commit_ordered); - -protected: - /* Total number of committed transactions. */ - ulonglong num_commits; - /* Number of group commits done. */ - ulonglong num_group_commits; - - /* - When using this class, this method is used instead of log_xid() to do - logging of a group of transactions all at once. - - The transactions will be linked through THD::next_commit_ordered. - - Additionally, when this method is used instead of log_xid(), the order in - which handler->prepare_ordered() and handler->commit_ordered() are called - is guaranteed to be the same as the order of calls and THD list elements - for group_log_xid(). - - This can be used to efficiently implement group commit that at the same - time preserves the order of commits among handlers and TC (eg. to get same - commit order in InnoDB and binary log). - - For TCs that do not need this, it can be preferable to use plain log_xid() - with class TC_LOG_unordered instead, as it allows threads to run log_xid() - in parallel with each other. In contrast, group_log_xid() runs under a - global mutex, so it is guaranteed that only once call into it will be - active at once. - - Since this call handles multiple threads/THDs at once, my_error() (and - other code that relies on thread local storage) cannot be used in this - method. Instead, the implementation must record any error and report it as - the return value from xid_log_after(), which will be invoked individually - for each thread. 
- - In the success case, this method must set thd->xid_cookie for each thread - to the cookie that is normally returned from log_xid() (which must be - non-zero in the non-error case). - */ - virtual void group_log_xid(TC_group_commit_entry *first) = 0; - /* - Called for each transaction (in corrent thread context) after - group_log_xid() has finished, but with no guarantee on ordering among - threads. - Can be used to do error reporting etc. */ - virtual int xid_log_after(TC_group_commit_entry *entry) = 0; - -private: - /* Mutex used to serialise calls to group_log_xid(). */ - pthread_mutex_t LOCK_group_commit; -}; - class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging { public: @@ -398,17 +303,33 @@ private: }; class binlog_trx_data; -class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG +class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG { private: + struct group_commit_entry + { + struct group_commit_entry *next; + THD *thd; + binlog_trx_data *trx_data; + /* + Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be + written during group commit. The incident_event is only valid if + trx_data->has_incident() is true. + */ + Log_event *begin_event; + Log_event *end_event; + Log_event *incident_event; + /* Set during group commit to record any per-thread error. */ + int error; + int commit_errno; + /* This is the `all' parameter for ha_commit_ordered(). */ + bool all; + /* True if we come in through XA log_and_order(), false otherwise. */ + }; + /* LOCK_log and LOCK_index are inited by init_pthread_objects() */ pthread_mutex_t LOCK_index; pthread_mutex_t LOCK_prep_xids; - /* - Mutex to protect the queue of non-transactional binlog writes waiting to - participate in group commit. 
- */ - pthread_mutex_t LOCK_queue; pthread_cond_t COND_prep_xids; pthread_cond_t update_cond; @@ -449,7 +370,11 @@ class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG */ bool no_auto_events; /* Queue of transactions queued up to participate in group commit. */ - binlog_trx_data *group_commit_queue; + group_commit_entry *group_commit_queue; + /* Total number of committed transactions. */ + ulonglong num_commits; + /* Number of group commits done. */ + ulonglong num_group_commits; int write_to_file(IO_CACHE *cache); /* @@ -459,10 +384,9 @@ class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG */ void new_file_without_locking(); void new_file_impl(bool need_lock); - int write_transaction(binlog_trx_data *trx_data); - bool write_transaction_to_binlog_events(binlog_trx_data *trx_data); - void trx_group_commit_participant(binlog_trx_data *trx_data); - void trx_group_commit_leader(TC_group_commit_entry *first); + int write_transaction(group_commit_entry *entry); + bool write_transaction_to_binlog_events(group_commit_entry *entry); + void trx_group_commit_leader(group_commit_entry *leader); void mark_xid_done(); void mark_xids_active(uint xid_count); @@ -493,8 +417,8 @@ public: int open(const char *opt_name); void close(); - void group_log_xid(TC_group_commit_entry *first); - int xid_log_after(TC_group_commit_entry *entry); + int log_and_order(THD *thd, my_xid xid, bool all, + bool need_prepare_ordered, bool need_commit_ordered); void unlog(ulong cookie, my_xid xid); int recover(IO_CACHE *log, Format_description_log_event *fdle); #if !defined(MYSQL_CLIENT) @@ -540,8 +464,7 @@ public: void reset_gathered_updates(THD *thd); bool write(Log_event* event_info); // binary log write bool write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data, - Log_event *end_ev); - bool trx_group_commit_finish(binlog_trx_data *trx_data); + Log_event *end_ev, bool all); bool write_incident(THD *thd); int write_cache(IO_CACHE *cache); From 
a786357be1c09c1ee64d954dda1ddf5f8b641533 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 1 Nov 2010 16:01:25 +0100 Subject: [PATCH 11/13] Minor whitespace/comment cleanup. --- sql/handler.h | 10 ++++------ sql/log.h | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sql/handler.h b/sql/handler.h index 6e7174bcc60..e0a89f76271 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -737,12 +737,10 @@ struct handlerton cannot invoke code that relies on thread local storage, in particular it cannot call my_error(). - When prepare_ordered() is called, the transaction coordinator has already - decided to commit (not rollback) the transaction. So prepare_ordered() - cannot cause a rollback by returning an error, all possible errors must - be handled in prepare() (the prepare_ordered() method returns void). In - case of some fatal error, a record of the error must be made internally - by the engine and returned from commit() later. + prepare_ordered() cannot cause a rollback by returning an error, all + possible errors must be handled in prepare() (the prepare_ordered() + method returns void). In case of some fatal error, a record of the error + must be made internally by the engine and returned from commit() later. Note that for user-level XA SQL commands, no consistent ordering among prepare_ordered() and commit_ordered() is guaranteed (as that would diff --git a/sql/log.h b/sql/log.h index c5a2a72647b..fcc9d5a711b 100644 --- a/sql/log.h +++ b/sql/log.h @@ -330,7 +330,6 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG /* LOCK_log and LOCK_index are inited by init_pthread_objects() */ pthread_mutex_t LOCK_index; pthread_mutex_t LOCK_prep_xids; - pthread_cond_t COND_prep_xids; pthread_cond_t update_cond; ulonglong bytes_written; From 805f009360288ac6a98572218f2f8cb2402fcf08 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 2 Nov 2010 08:40:27 +0100 Subject: [PATCH 12/13] Fix error insert to have deterministic errno (fixes test failure on mac). 
--- mysql-test/suite/binlog/r/binlog_ioerr.result | 4 ++-- sql/log.cc | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mysql-test/suite/binlog/r/binlog_ioerr.result b/mysql-test/suite/binlog/r/binlog_ioerr.result index 2300f3c5f82..04ac0340746 100644 --- a/mysql-test/suite/binlog/r/binlog_ioerr.result +++ b/mysql-test/suite/binlog/r/binlog_ioerr.result @@ -4,9 +4,9 @@ CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; INSERT INTO t1 VALUES(0); SET SESSION debug='+d,fail_binlog_write_1'; INSERT INTO t1 VALUES(1); -ERROR HY000: Error writing file 'master-bin' (errno: 22) +ERROR HY000: Error writing file 'master-bin' (errno: 28) INSERT INTO t1 VALUES(2); -ERROR HY000: Error writing file 'master-bin' (errno: 22) +ERROR HY000: Error writing file 'master-bin' (errno: 28) SET SESSION debug=''; INSERT INTO t1 VALUES(3); SELECT * FROM t1; diff --git a/sql/log.cc b/sql/log.cc index 0d1d7ae7535..d18ded1f24e 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -4772,7 +4772,8 @@ int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache) } /* Write data to the binary log file */ - DBUG_EXECUTE_IF("fail_binlog_write_1", return ER_ERROR_ON_WRITE;); + DBUG_EXECUTE_IF("fail_binlog_write_1", + errno= 28; return ER_ERROR_ON_WRITE;); if (my_b_write(&log_file, cache->read_pos, length)) return ER_ERROR_ON_WRITE; cache->read_pos=cache->read_end; // Mark buffer used up From a2d921be3634ceff4ab4c67f57b27a481d4a28df Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 3 Nov 2010 16:54:38 +0100 Subject: [PATCH 13/13] MWL#116: Add two more test cases: - Test internal 2-phase commit when binlog is enabled globally, but disabled in the session - Test crashing at various points during commit --- mysql-test/r/group_commit_crash.result | 120 ++++++++++++++++++ mysql-test/suite/pbxt/r/pbxt_xa_binlog.result | 31 +++++ mysql-test/suite/pbxt/t/pbxt_xa_binlog.test | 31 +++++ mysql-test/t/group_commit_crash-master.opt | 1 + mysql-test/t/group_commit_crash.test | 80 ++++++++++++ 5 files changed, 263 
insertions(+) create mode 100644 mysql-test/r/group_commit_crash.result create mode 100644 mysql-test/suite/pbxt/r/pbxt_xa_binlog.result create mode 100644 mysql-test/suite/pbxt/t/pbxt_xa_binlog.test create mode 100644 mysql-test/t/group_commit_crash-master.opt create mode 100644 mysql-test/t/group_commit_crash.test diff --git a/mysql-test/r/group_commit_crash.result b/mysql-test/r/group_commit_crash.result new file mode 100644 index 00000000000..044161695e3 --- /dev/null +++ b/mysql-test/r/group_commit_crash.result @@ -0,0 +1,120 @@ +CREATE TABLE t1(a CHAR(255), +b CHAR(255), +c CHAR(255), +d CHAR(255), +id INT AUTO_INCREMENT, +PRIMARY KEY(id)) ENGINE=InnoDB; +create table t2 like t1; +create procedure setcrash(IN i INT) +begin +CASE i +WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare"; +WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log"; +WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog"; +WHEN 4 THEN SET SESSION debug="d,crash_commit_after"; +WHEN 5 THEN SET SESSION debug="d,crash_commit_before"; +ELSE BEGIN END; +END CASE; +end // +FLUSH TABLES; +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +RESET MASTER; +START TRANSACTION; +insert into t1 select * from t2; +call setcrash(5); +COMMIT; +Got one of the listed errors +SELECT * FROM t1 ORDER BY id; +a b c d id +SHOW BINLOG EVENTS LIMIT 2,1; +Log_name Pos Event_type Server_id End_log_pos Info +delete from t1; +RESET MASTER; +START TRANSACTION; +insert into t1 select * from 
t2; +call setcrash(4); +COMMIT; +Got one of the listed errors +SELECT * FROM t1 ORDER BY id; +a b c d id +a b c d 1 +a b c d 2 +a b c d 3 +a b c d 4 +a b c d 5 +a b c d 6 +a b c d 7 +a b c d 8 +a b c d 9 +a b c d 10 +SHOW BINLOG EVENTS LIMIT 2,1; +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 174 Query 1 268 use `test`; insert into t1 select * from t2 +delete from t1; +RESET MASTER; +START TRANSACTION; +insert into t1 select * from t2; +call setcrash(3); +COMMIT; +Got one of the listed errors +SELECT * FROM t1 ORDER BY id; +a b c d id +a b c d 1 +a b c d 2 +a b c d 3 +a b c d 4 +a b c d 5 +a b c d 6 +a b c d 7 +a b c d 8 +a b c d 9 +a b c d 10 +SHOW BINLOG EVENTS LIMIT 2,1; +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 174 Query 1 268 use `test`; insert into t1 select * from t2 +delete from t1; +RESET MASTER; +START TRANSACTION; +insert into t1 select * from t2; +call setcrash(2); +COMMIT; +Got one of the listed errors +SELECT * FROM t1 ORDER BY id; +a b c d id +a b c d 1 +a b c d 2 +a b c d 3 +a b c d 4 +a b c d 5 +a b c d 6 +a b c d 7 +a b c d 8 +a b c d 9 +a b c d 10 +SHOW BINLOG EVENTS LIMIT 2,1; +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 174 Query 1 268 use `test`; insert into t1 select * from t2 +delete from t1; +RESET MASTER; +START TRANSACTION; +insert into t1 select * from t2; +call setcrash(1); +COMMIT; +Got one of the listed errors +SELECT * FROM t1 ORDER BY id; +a b c d id +SHOW BINLOG EVENTS LIMIT 2,1; +Log_name Pos Event_type Server_id End_log_pos Info +delete from t1; +DROP TABLE t1; +DROP TABLE t2; +DROP PROCEDURE setcrash; diff --git a/mysql-test/suite/pbxt/r/pbxt_xa_binlog.result b/mysql-test/suite/pbxt/r/pbxt_xa_binlog.result new file mode 100644 index 00000000000..ec12a8a48ae --- /dev/null +++ b/mysql-test/suite/pbxt/r/pbxt_xa_binlog.result @@ -0,0 +1,31 @@ +drop table if exists t1, t2; +SET binlog_format = 'mixed'; +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; 
+CREATE TABLE t2 (b INT PRIMARY KEY) ENGINE=pbxt; +BEGIN; +SELECT @@log_bin; +@@log_bin +1 +INSERT INTO t1 VALUES (1); +INSERT INTO t2 VALUES (2); +COMMIT; +select * from t1; +a +1 +select * from t2; +b +2 +SET sql_log_bin = 0; +INSERT INTO t1 VALUES (3); +INSERT INTO t2 VALUES (4); +COMMIT; +select * from t1 order by a; +a +1 +3 +select * from t2 order by b; +b +2 +4 +drop table t1, t2; +drop database pbxt; diff --git a/mysql-test/suite/pbxt/t/pbxt_xa_binlog.test b/mysql-test/suite/pbxt/t/pbxt_xa_binlog.test new file mode 100644 index 00000000000..e327c5a7b40 --- /dev/null +++ b/mysql-test/suite/pbxt/t/pbxt_xa_binlog.test @@ -0,0 +1,31 @@ +--source include/have_innodb.inc +--source include/have_log_bin.inc + +--disable_warnings +drop table if exists t1, t2; +--enable_warnings + +SET binlog_format = 'mixed'; + +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb; +CREATE TABLE t2 (b INT PRIMARY KEY) ENGINE=pbxt; +BEGIN; +# verify that binlog is on +SELECT @@log_bin; +INSERT INTO t1 VALUES (1); +INSERT INTO t2 VALUES (2); +COMMIT; +select * from t1; +select * from t2; + +# Test 2-phase commit when we disable binlogging. +SET sql_log_bin = 0; +INSERT INTO t1 VALUES (3); +INSERT INTO t2 VALUES (4); +COMMIT; +select * from t1 order by a; +select * from t2 order by b; + +drop table t1, t2; +drop database pbxt; + diff --git a/mysql-test/t/group_commit_crash-master.opt b/mysql-test/t/group_commit_crash-master.opt new file mode 100644 index 00000000000..425fda95086 --- /dev/null +++ b/mysql-test/t/group_commit_crash-master.opt @@ -0,0 +1 @@ +--skip-stack-trace --skip-core-file diff --git a/mysql-test/t/group_commit_crash.test b/mysql-test/t/group_commit_crash.test new file mode 100644 index 00000000000..273cd6230eb --- /dev/null +++ b/mysql-test/t/group_commit_crash.test @@ -0,0 +1,80 @@ +# Testing group commit by crashing a few times. 
+# Test adapted from the Facebook patch: lp:mysqlatfacebook +--source include/not_embedded.inc +# Don't test this under valgrind, memory leaks will occur +--source include/not_valgrind.inc + +# Binary must be compiled with debug for crash to occur +--source include/have_debug.inc +--source include/have_innodb.inc +--source include/have_log_bin.inc + +let $file_format_check=`SELECT @@innodb_file_format_check`; +CREATE TABLE t1(a CHAR(255), + b CHAR(255), + c CHAR(255), + d CHAR(255), + id INT AUTO_INCREMENT, + PRIMARY KEY(id)) ENGINE=InnoDB; +create table t2 like t1; +delimiter //; +create procedure setcrash(IN i INT) +begin + CASE i + WHEN 1 THEN SET SESSION debug="d,crash_commit_after_prepare"; + WHEN 2 THEN SET SESSION debug="d,crash_commit_after_log"; + WHEN 3 THEN SET SESSION debug="d,crash_commit_before_unlog"; + WHEN 4 THEN SET SESSION debug="d,crash_commit_after"; + WHEN 5 THEN SET SESSION debug="d,crash_commit_before"; + ELSE BEGIN END; + END CASE; +end // +delimiter ;// +# Avoid getting a crashed mysql.proc table. +FLUSH TABLES; + +let $numtests = 5; + +let $numinserts = 10; +while ($numinserts) +{ + dec $numinserts; + INSERT INTO t2(a, b, c, d) VALUES ('a', 'b', 'c', 'd'); +} + +--enable_reconnect + +while ($numtests) +{ + RESET MASTER; + + START TRANSACTION; + insert into t1 select * from t2; + # Write file to make mysql-test-run.pl expect crash + --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect + + eval call setcrash($numtests); + + # Run the crashing query + --error 2006,2013 + COMMIT; + + # Poll the server waiting for it to be back online again. + --source include/wait_until_connected_again.inc + + # table and binlog should be in sync. + SELECT * FROM t1 ORDER BY id; + SHOW BINLOG EVENTS LIMIT 2,1; + + delete from t1; + + dec $numtests; +} + +# final cleanup +DROP TABLE t1; +DROP TABLE t2; +DROP PROCEDURE setcrash; +--disable_query_log +eval SET GLOBAL innodb_file_format_check=$file_format_check; +--enable_query_log