
MWL#116: Efficient group commit for binary log

Preliminary commit for testing
Author: unknown
Date: 2010-09-30 15:20:15 +02:00
parent e432151e9c
commit 0394cf2030
16 changed files with 1821 additions and 467 deletions


@ -0,0 +1,63 @@
CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
SELECT variable_value INTO @commits FROM information_schema.global_status
WHERE variable_name = 'binlog_commits';
SELECT variable_value INTO @group_commits FROM information_schema.global_status
WHERE variable_name = 'binlog_group_commits';
SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued";
INSERT INTO t1 VALUES ("con1");
set DEBUG_SYNC= "now WAIT_FOR group1_running";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2";
SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed";
SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
INSERT INTO t1 VALUES ("con2");
SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
INSERT INTO t1 VALUES ("con3");
SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4";
INSERT INTO t1 VALUES ("con4");
SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
SELECT * FROM t1 ORDER BY a;
a
SET DEBUG_SYNC= "now SIGNAL group2_queued";
SELECT * FROM t1 ORDER BY a;
a
con1
SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued";
INSERT INTO t1 VALUES ("con5");
SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued";
INSERT INTO t1 VALUES ("con6");
SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
SELECT * FROM t1 ORDER BY a;
a
con1
SET DEBUG_SYNC= "now SIGNAL group3_committed";
SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
SELECT * FROM t1 ORDER BY a;
a
con1
con2
con3
con4
SET DEBUG_SYNC= "now SIGNAL group2_checked";
SELECT * FROM t1 ORDER BY a;
a
con1
con2
con3
con4
con5
con6
SELECT variable_value - @commits FROM information_schema.global_status
WHERE variable_name = 'binlog_commits';
variable_value - @commits
6
SELECT variable_value - @group_commits FROM information_schema.global_status
WHERE variable_name = 'binlog_group_commits';
variable_value - @group_commits
3
SET DEBUG_SYNC= 'RESET';
DROP TABLE t1;


@ -0,0 +1,28 @@
CALL mtr.add_suppression("Error writing file 'master-bin'");
RESET MASTER;
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
INSERT INTO t1 VALUES(0);
SET SESSION debug='+d,fail_binlog_write_1';
INSERT INTO t1 VALUES(1);
ERROR HY000: Error writing file 'master-bin' (errno: 22)
INSERT INTO t1 VALUES(2);
ERROR HY000: Error writing file 'master-bin' (errno: 22)
SET SESSION debug='';
INSERT INTO t1 VALUES(3);
SELECT * FROM t1;
a
0
3
SHOW BINLOG EVENTS;
Log_name Pos Event_type Server_id End_log_pos Info
BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: #
BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0)
BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3)
BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
DROP TABLE t1;


@ -0,0 +1,29 @@
source include/have_debug.inc;
source include/have_innodb.inc;
source include/have_log_bin.inc;
source include/have_binlog_format_mixed_or_statement.inc;
CALL mtr.add_suppression("Error writing file 'master-bin'");
RESET MASTER;
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
INSERT INTO t1 VALUES(0);
SET SESSION debug='+d,fail_binlog_write_1';
--error ER_ERROR_ON_WRITE
INSERT INTO t1 VALUES(1);
--error ER_ERROR_ON_WRITE
INSERT INTO t1 VALUES(2);
SET SESSION debug='';
INSERT INTO t1 VALUES(3);
SELECT * FROM t1;
# The output from this currently shows a bug:
# the injected IO error leaves partially written transactions in the binlog in
# the form of stray "BEGIN" events.
# These should disappear from the output once binlog error handling is improved.
# (A sketch of how this kind of error-injection point is typically placed in
# the server is shown after the test.)
--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
--replace_column 1 BINLOG 2 POS 5 ENDPOS
SHOW BINLOG EVENTS;
DROP TABLE t1;
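
A hedged sketch of how an error-injection point of this kind is typically
placed in the server's binlog write path, using the standard DBUG_EXECUTE_IF
mechanism. The helper function and its call site are hypothetical; only the
"fail_binlog_write_1" keyword comes from the test above (the real injection
site is presumably in sql/log.cc, whose diff is suppressed further down).

#include <errno.h>
#include "mysql_priv.h"   /* DBUG_EXECUTE_IF, IO_CACHE, my_b_write (assumed available) */

/* Hypothetical helper in the binlog write path. */
static bool write_event_data(IO_CACHE *file, const uchar *buf, size_t len)
{
  /* With SET SESSION debug='+d,fail_binlog_write_1', pretend the write failed.
     EINVAL (22) matches the errno shown in the result file above. */
  DBUG_EXECUTE_IF("fail_binlog_write_1", { errno= EINVAL; return TRUE; });
  return my_b_write(file, buf, len) != 0;   /* FALSE on success, TRUE on error */
}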


@ -0,0 +1,115 @@
--source include/have_debug_sync.inc
--source include/have_innodb.inc
--source include/have_log_bin.inc
# Test some group commit code paths by using debug_sync to do controlled
# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
# group.
#
# Group 3 is allowed to race as far ahead as possible before group 2 finishes,
# to check an edge case in the concurrency control.
# (A sketch of how the named sync points used here are placed in server code
# is shown after the test.)
CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
SELECT variable_value INTO @commits FROM information_schema.global_status
WHERE variable_name = 'binlog_commits';
SELECT variable_value INTO @group_commits FROM information_schema.global_status
WHERE variable_name = 'binlog_group_commits';
connect(con1,localhost,root,,);
connect(con2,localhost,root,,);
connect(con3,localhost,root,,);
connect(con4,localhost,root,,);
connect(con5,localhost,root,,);
connect(con6,localhost,root,,);
# Start group1 (with one thread) doing commit, waiting for
# group2 to queue up before finishing.
connection con1;
SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued";
send INSERT INTO t1 VALUES ("con1");
# Make group2 (with three threads) queue up.
# Make sure con2 is the group commit leader for group2.
# Make group2 wait with running commit_ordered() until group3 has committed.
connection con2;
set DEBUG_SYNC= "now WAIT_FOR group1_running";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2";
SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed";
SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
send INSERT INTO t1 VALUES ("con2");
connection con3;
SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
send INSERT INTO t1 VALUES ("con3");
connection con4;
SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4";
send INSERT INTO t1 VALUES ("con4");
# When group2 is queued, let group1 continue and queue group3.
connection default;
SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
# At this point, transaction 1 is still not visible, as commit_ordered() has
# not been called yet.
SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
SELECT * FROM t1 ORDER BY a;
SET DEBUG_SYNC= "now SIGNAL group2_queued";
connection con1;
reap;
# Now transaction 1 is visible.
connection default;
SELECT * FROM t1 ORDER BY a;
connection con5;
SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued";
send INSERT INTO t1 VALUES ("con5");
connection con6;
SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued";
send INSERT INTO t1 VALUES ("con6");
connection default;
SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
# Still only transaction 1 is visible, as group2 has not yet run commit_ordered().
SELECT * FROM t1 ORDER BY a;
SET DEBUG_SYNC= "now SIGNAL group3_committed";
SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
# Now transactions 1-4 visible.
SELECT * FROM t1 ORDER BY a;
SET DEBUG_SYNC= "now SIGNAL group2_checked";
connection con2;
reap;
connection con3;
reap;
connection con4;
reap;
connection con5;
reap;
connection con6;
reap;
connection default;
# Check all transactions finally visible.
SELECT * FROM t1 ORDER BY a;
SELECT variable_value - @commits FROM information_schema.global_status
WHERE variable_name = 'binlog_commits';
SELECT variable_value - @group_commits FROM information_schema.global_status
WHERE variable_name = 'binlog_group_commits';
SET DEBUG_SYNC= 'RESET';
DROP TABLE t1;
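
A hedged sketch of how a named sync point like those used above is placed in
server code. The surrounding function is hypothetical; the DEBUG_SYNC macro is
the standard one from debug_sync.h and expands to nothing in builds without
debug sync support.

#include "mysql_priv.h"
#include "debug_sync.h"   /* DEBUG_SYNC(thd, name) */

/* Hypothetical hook in the commit path, right after prepare_ordered() ran. */
static void example_after_prepare_ordered(THD *thd)
{
  /*
    A test can attach actions to this name, e.g.
      SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
    Without an active action the thread passes straight through.
  */
  DEBUG_SYNC(thd, "commit_after_prepare_ordered");
}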


@ -76,6 +76,8 @@ TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
uint known_extensions_id= 0;
static int commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans,
bool is_real_trans);
static plugin_ref ha_default_plugin(THD *thd)
@ -1070,7 +1072,7 @@ ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
*/
int ha_commit_trans(THD *thd, bool all)
{
int error= 0, cookie= 0;
int error= 0, cookie;
/*
'all' means that this is either an explicit commit issued by
user, or an implicit commit issued by a DDL.
@ -1085,7 +1087,8 @@ int ha_commit_trans(THD *thd, bool all)
*/
bool is_real_trans= all || thd->transaction.all.ha_list == 0;
Ha_trx_info *ha_info= trans->ha_list;
my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
bool need_prepare_ordered, need_commit_ordered;
my_xid xid;
DBUG_ENTER("ha_commit_trans");
/*
@ -1118,85 +1121,112 @@ int ha_commit_trans(THD *thd, bool all)
DBUG_RETURN(2);
}
#ifdef USING_TRANSACTIONS
if (ha_info)
if (!ha_info)
{
uint rw_ha_count;
bool rw_trans;
DBUG_EXECUTE_IF("crash_commit_before", abort(););
/* Close all cursors that can not survive COMMIT */
if (is_real_trans) /* not a statement commit */
thd->stmt_map.close_transient_cursors();
rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
/* rw_trans is TRUE when we are in a transaction changing data */
rw_trans= is_real_trans && (rw_ha_count > 0);
if (rw_trans &&
wait_if_global_read_lock(thd, 0, 0))
{
ha_rollback_trans(thd, all);
DBUG_RETURN(1);
}
if (rw_trans &&
opt_readonly &&
!(thd->security_ctx->master_access & SUPER_ACL) &&
!thd->slave_thread)
{
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
ha_rollback_trans(thd, all);
error= 1;
goto end;
}
if (!trans->no_2pc && (rw_ha_count > 1))
{
for (; ha_info && !error; ha_info= ha_info->next())
{
int err;
handlerton *ht= ha_info->ht();
/*
Do not call two-phase commit if this particular
transaction is read-only. This allows for simpler
implementation in engines that are always read-only.
*/
if (! ha_info->is_trx_read_write())
continue;
/*
Sic: we know that prepare() is not NULL since otherwise
trans->no_2pc would have been set.
*/
if ((err= ht->prepare(ht, thd, all)))
{
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
error= 1;
}
status_var_increment(thd->status_var.ha_prepare_count);
}
DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
if (error || (is_real_trans && xid &&
(error= !(cookie= tc_log->log_xid(thd, xid)))))
{
ha_rollback_trans(thd, all);
error= 1;
goto end;
}
DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
}
error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
if (cookie)
tc_log->unlog(cookie, xid);
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
end:
if (rw_trans)
start_waiting_global_read_lock(thd);
/* Free resources and perform other cleanup even for 'empty' transactions. */
if (is_real_trans)
thd->transaction.cleanup();
DBUG_RETURN(0);
}
/* Free resources and perform other cleanup even for 'empty' transactions. */
else if (is_real_trans)
thd->transaction.cleanup();
DBUG_EXECUTE_IF("crash_commit_before", abort(););
/* Close all cursors that can not survive COMMIT */
if (is_real_trans) /* not a statement commit */
thd->stmt_map.close_transient_cursors();
uint rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
/* rw_trans is TRUE when we are in a transaction changing data */
bool rw_trans= is_real_trans && (rw_ha_count > 0);
if (rw_trans &&
wait_if_global_read_lock(thd, 0, 0))
{
ha_rollback_trans(thd, all);
DBUG_RETURN(1);
}
if (rw_trans &&
opt_readonly &&
!(thd->security_ctx->master_access & SUPER_ACL) &&
!thd->slave_thread)
{
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
goto err;
}
if (trans->no_2pc || (rw_ha_count <= 1))
{
error= ha_commit_one_phase(thd, all);
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
goto end;
}
need_prepare_ordered= FALSE;
need_commit_ordered= FALSE;
xid= thd->transaction.xid_state.xid.get_my_xid();
for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
{
int err;
handlerton *ht= hi->ht();
/*
Do not call two-phase commit if this particular
transaction is read-only. This allows for simpler
implementation in engines that are always read-only.
*/
if (! hi->is_trx_read_write())
continue;
/*
Sic: we know that prepare() is not NULL since otherwise
trans->no_2pc would have been set.
*/
if ((err= ht->prepare(ht, thd, all)))
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
status_var_increment(thd->status_var.ha_prepare_count);
if (err)
goto err;
if (ht->prepare_ordered)
need_prepare_ordered= TRUE;
if (ht->commit_ordered)
need_commit_ordered= TRUE;
}
DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
if (!is_real_trans)
{
error= commit_one_phase_2(thd, all, trans, is_real_trans);
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
goto end;
}
cookie= tc_log->log_and_order(thd, xid, all, need_prepare_ordered,
need_commit_ordered);
if (!cookie)
goto err;
DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0;
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
tc_log->unlog(cookie, xid);
DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
goto end;
/* Come here if error and we need to rollback. */
err:
if (!error)
error= 1;
ha_rollback_trans(thd, all);
end:
if (rw_trans)
start_waiting_global_read_lock(thd);
#endif /* USING_TRANSACTIONS */
DBUG_RETURN(error);
}
@ -1207,7 +1237,6 @@ end:
*/
int ha_commit_one_phase(THD *thd, bool all)
{
int error=0;
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
/*
"real" is a nick name for a transaction for which a commit will
@ -1217,8 +1246,41 @@ int ha_commit_one_phase(THD *thd, bool all)
enclosing 'all' transaction is rolled back.
*/
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
Ha_trx_info *ha_info= trans->ha_list;
DBUG_ENTER("ha_commit_one_phase");
#ifdef USING_TRANSACTIONS
if (ha_info)
{
if (is_real_trans)
{
bool locked= false;
for (; ha_info; ha_info= ha_info->next())
{
handlerton *ht= ha_info->ht();
if (ht->commit_ordered)
{
if (ha_info->is_trx_read_write() && !locked)
{
pthread_mutex_lock(&LOCK_commit_ordered);
locked= 1;
}
ht->commit_ordered(ht, thd, all);
}
}
if (locked)
pthread_mutex_unlock(&LOCK_commit_ordered);
}
}
#endif /* USING_TRANSACTIONS */
DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans));
}
static int
commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
{
int error= 0;
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
DBUG_ENTER("commit_one_phase_2");
#ifdef USING_TRANSACTIONS
if (ha_info)
{


@ -656,9 +656,96 @@ struct handlerton
NOTE 'all' is also false in auto-commit mode where 'end of statement'
and 'real commit' mean the same event.
*/
int (*commit)(handlerton *hton, THD *thd, bool all);
int (*commit)(handlerton *hton, THD *thd, bool all);
/*
The commit_ordered() method is called prior to the commit() method, after
the transaction manager has decided to commit (not rollback) the
transaction. Unlike commit(), commit_ordered() is called only when the
full transaction is committed, not for each commit of statement
transaction in a multi-statement transaction.
The calls to commit_ordered() in multiple parallel transactions are
guaranteed to happen in the same order in every participating
handler. This can be used to ensure the same commit order among multiple
handlers (eg. in a table handler and the binlog). So if transaction T1 calls
into commit_ordered() of handler A before T2 does, then T1 will also call
commit_ordered() of handler B before T2.
Engines that implement this method should, during this call, make the
transaction visible to other transactions, so that the order of
transaction commits is defined by the order of commit_ordered() calls.
The intention is that commit_ordered() should do only the minimal amount of
work that needs to happen in consistent commit order among handlers. To
preserve ordering, calls need to be serialised on a global mutex, so
doing any time-consuming or blocking operations in commit_ordered() will
limit scalability.
Handlers can rely on commit_ordered() calls for transactions that updated
data to be serialised (no two calls can run in parallel, so no extra
locking on the handler part is required to ensure this). However, calls
for SELECT-only transactions are not serialised, so can occur in parallel
with each other and with at most one write-transaction.
Note that commit_ordered() can be called from a different thread than the
one handling the transaction! So it can not do anything that depends on
thread local storage, in particular it can not call my_error() and
friends (instead it can store the error code and delay the call of
my_error() to the commit() method).
Similarly, since commit_ordered() returns void, any error code must be
saved and returned from the commit() method instead.
The commit_ordered method is optional, and can be left unset if not
needed in a particular handler (see the sketch after this header excerpt
for how an engine registers these hooks).
*/
void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
int (*rollback)(handlerton *hton, THD *thd, bool all);
int (*prepare)(handlerton *hton, THD *thd, bool all);
/*
The prepare_ordered method is optional. If set, it will be called after
successful prepare() in all handlers participating in 2-phase
commit. Like commit_ordered(), it is called only when the full
transaction is committed, not for each commit of statement transaction.
The calls to prepare_ordered() among multiple parallel transactions are
ordered consistently with calls to commit_ordered(). This means that
calls to prepare_ordered() effectively define the commit order, and that
each handler will see the same sequence of transactions calling into
prepare_ordered() and commit_ordered().
Thus, prepare_ordered() can be used to define commit order for handlers
that need to do this in the prepare step (like the binlog). It can also be
used to release a transaction's locks early, in an order consistent with
the order in which transactions will eventually be committed.
Like commit_ordered(), prepare_ordered() calls are serialised to maintain
ordering, so the intention is that they should execute fast, with only
the minimal amount of work needed to define commit order. Handlers can
rely on this serialisation, and do not need to do any extra locking to
avoid two prepare_ordered() calls running in parallel.
Like commit_ordered(), prepare_ordered() is not guaranteed to be called
in the context of the thread handling the rest of the transaction. So it
cannot invoke code that relies on thread local storage, in particular it
cannot call my_error().
When prepare_ordered() is called, the transaction coordinator has already
decided to commit (not rollback) the transaction. So prepare_ordered()
cannot cause a rollback by returning an error, all possible errors must
be handled in prepare() (the prepare_ordered() method returns void). In
case of some fatal error, a record of the error must be made internally
by the engine and returned from commit() later.
Note that for user-level XA SQL commands, no consistent ordering between
prepare_ordered() and commit_ordered() is guaranteed (as that would
require blocking all other commits for an indefinite time).
When 2-phase commit is not used (eg. only one engine and no binlog
participate in the transaction), prepare() is not called, and in that case
prepare_ordered() is not called either.
*/
void (*prepare_ordered)(handlerton *hton, THD *thd, bool all);
int (*recover)(handlerton *hton, XID *xid_list, uint len);
int (*commit_by_xid)(handlerton *hton, XID *xid);
int (*rollback_by_xid)(handlerton *hton, XID *xid);
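
As a hedged illustration of the contract documented above, a storage engine
that wants ordered commits registers the two optional hooks at plugin
initialisation and keeps their bodies minimal. The engine name and functions
below are invented; only the hook signatures come from the declarations above.

/* Hypothetical engine wiring up the new hooks. */
static void toyengine_prepare_ordered(handlerton *hton, THD *thd, bool all)
{
  /* Runs serialised under a global mutex: only record the position this
     transaction gets in the commit order; nothing slow or blocking here. */
}

static void toyengine_commit_ordered(handlerton *hton, THD *thd, bool all)
{
  /* Make the transaction visible to others. May run in a different thread
     than the one owning the transaction, so no my_error() here; any error
     must be remembered and reported later from commit(). */
}

static int toyengine_init(void *p)
{
  handlerton *hton= (handlerton *)p;
  hton->prepare_ordered= toyengine_prepare_ordered;
  hton->commit_ordered=  toyengine_commit_ordered;
  /* prepare(), commit(), rollback() etc. are set up as before. */
  return 0;
}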

sql/log.cc (1292 changed lines): diff suppressed because it is too large.

sql/log.h (209 changed lines):

@ -33,11 +33,173 @@ class TC_LOG
virtual int open(const char *opt_name)=0;
virtual void close()=0;
virtual int log_xid(THD *thd, my_xid xid)=0;
virtual int log_and_order(THD *thd, my_xid xid, bool all,
bool need_prepare_ordered,
bool need_commit_ordered) = 0;
virtual void unlog(ulong cookie, my_xid xid)=0;
protected:
/*
These methods are meant to be invoked from log_and_order() implementations
to run any prepare_ordered() respectively commit_ordered() methods in
participating handlers (see the sketch after this class).
They must be called using suitable thread synchronisation to ensure that
they are each called in the correct commit order among all
transactions. However, it is only necessary to call them if the
corresponding flag passed to log_and_order() is set (it is safe, but not
required, to call them when the flag is false).
The caller must be holding LOCK_prepare_ordered respectively
LOCK_commit_ordered when calling these methods.
*/
void run_prepare_ordered(THD *thd, bool all);
void run_commit_ordered(THD *thd, bool all);
};
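
A hedged sketch of how a log_and_order() implementation is expected to drive
the helpers above; it illustrates the calling convention only (flags checked,
locks held). My_tc (a TC_LOG subclass) and its do_log_xid() member are
invented; LOCK_prepare_ordered, LOCK_commit_ordered, run_prepare_ordered()
and run_commit_ordered() are the ones declared in this header.

int My_tc::log_and_order(THD *thd, my_xid xid, bool all,
                         bool need_prepare_ordered, bool need_commit_ordered)
{
  if (need_prepare_ordered)
  {
    pthread_mutex_lock(&LOCK_prepare_ordered);
    run_prepare_ordered(thd, all);        /* fixes this trx's commit order */
    pthread_mutex_unlock(&LOCK_prepare_ordered);
  }
  int cookie= do_log_xid(thd, xid);       /* hypothetical logging step;
                                             non-zero cookie means success */
  if (cookie && need_commit_ordered)
  {
    pthread_mutex_lock(&LOCK_commit_ordered);
    run_commit_ordered(thd, all);         /* makes the transaction visible */
    pthread_mutex_unlock(&LOCK_commit_ordered);
  }
  return cookie;                          /* 0 tells ha_commit_trans() to roll back */
}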
class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
/*
Locks used to ensure serialised execution of TC_LOG::run_prepare_ordered()
and TC_LOG::run_commit_ordered(), or any other code that calls handler
prepare_ordered() or commit_ordered() methods.
*/
extern pthread_mutex_t LOCK_prepare_ordered;
extern pthread_mutex_t LOCK_commit_ordered;
extern void TC_init();
extern void TC_destroy();
/*
Base class for two TC implementations TC_LOG_unordered and
TC_LOG_group_commit that both use a queue of threads waiting for group
commit.
*/
class TC_LOG_queued: public TC_LOG
{
protected:
TC_LOG_queued();
~TC_LOG_queued();
/* Structure used to link list of THDs waiting for group commit. */
struct TC_group_commit_entry
{
struct TC_group_commit_entry *next;
THD *thd;
/* This is the `all' parameter for ha_commit_trans() etc. */
bool all;
/*
Flag set true when it is time for this thread to wake up after group
commit. Used with THD::LOCK_commit_ordered and THD::COND_commit_ordered.
*/
bool group_commit_ready;
/*
Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and
cookie.
*/
int xid_error;
};
TC_group_commit_entry * reverse_queue(TC_group_commit_entry *queue);
void group_commit_wait_for_wakeup(TC_group_commit_entry *entry);
void group_commit_wakeup_other(TC_group_commit_entry *other);
/*
This is a queue of threads waiting to be allowed to commit.
Access to the queue must be protected by LOCK_prepare_ordered.
*/
TC_group_commit_entry *group_commit_queue;
};
class TC_LOG_unordered: public TC_LOG_queued
{
public:
TC_LOG_unordered();
~TC_LOG_unordered();
int log_and_order(THD *thd, my_xid xid, bool all,
bool need_prepare_ordered, bool need_commit_ordered);
protected:
virtual int log_xid(THD *thd, my_xid xid)=0;
private:
/*
This flag and condition are used to reserve the queue while the threads in
it each run the commit_ordered() methods one after the other. Only once the
last commit_ordered() in the queue is done can we start on a new queue
run (see the generic sketch after this class).
Since we start this process in the first thread in the queue and finish in
the last (and possibly different) thread, we need a condition variable for
this (we cannot unlock a mutex in a different thread than the one that
locked it).
The condition is used together with the LOCK_prepare_ordered mutex.
*/
my_bool group_commit_queue_busy;
pthread_cond_t COND_queue_busy;
};
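
A hedged, generic sketch of the "queue stays busy until the last participant
is done" pattern described in the class above. Plain pthread code with
invented names; it is not the actual implementation in sql/log.cc (whose diff
is suppressed above).

#include <pthread.h>

static pthread_mutex_t lock_queue=      PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond_queue_busy= PTHREAD_COND_INITIALIZER;
static bool            queue_busy= false;

/* Called by the first thread of a new group before it starts a queue run. */
static void reserve_queue()
{
  pthread_mutex_lock(&lock_queue);
  while (queue_busy)                      /* previous group still draining */
    pthread_cond_wait(&cond_queue_busy, &lock_queue);
  queue_busy= true;
  pthread_mutex_unlock(&lock_queue);
}

/* Called by the last thread of the group, possibly a different thread than
   the one that reserved the queue. */
static void release_queue()
{
  pthread_mutex_lock(&lock_queue);
  queue_busy= false;
  pthread_cond_signal(&cond_queue_busy);
  pthread_mutex_unlock(&lock_queue);
}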
class TC_LOG_group_commit: public TC_LOG_queued
{
public:
TC_LOG_group_commit();
~TC_LOG_group_commit();
int log_and_order(THD *thd, my_xid xid, bool all,
bool need_prepare_ordered, bool need_commit_ordered);
protected:
/* Total number of committed transactions. */
ulonglong num_commits;
/* Number of group commits done. */
ulonglong num_group_commits;
/*
When using this class, this method is used instead of log_xid() to do
logging of a group of transactions all at once.
The transactions will be linked through THD::next_commit_ordered.
Additionally, when this method is used instead of log_xid(), the order in
which handler->prepare_ordered() and handler->commit_ordered() are called
is guaranteed to be the same as the order of calls and THD list elements
for group_log_xid().
This can be used to efficiently implement group commit that at the same
time preserves the order of commits among handlers and the TC (eg. to get
the same commit order in InnoDB and the binary log).
For TCs that do not need this, it can be preferable to use plain log_xid()
with class TC_LOG_unordered instead, as it allows threads to run log_xid()
in parallel with each other. In contrast, group_log_xid() runs under a
global mutex, so it is guaranteed that only one call into it will be
active at a time (see the sketch after this class).
Since this call handles multiple threads/THDs at once, my_error() (and
other code that relies on thread local storage) cannot be used in this
method. Instead, the implementation must record any error and report it as
the return value from xid_log_after(), which will be invoked individually
for each thread.
In the success case, this method must set thd->xid_cookie for each thread
to the cookie that is normally returned from log_xid() (which must be
non-zero in the non-error case).
*/
virtual void group_log_xid(TC_group_commit_entry *first) = 0;
/*
Called for each transaction (in the correct thread context) after
group_log_xid() has finished, but with no guarantee on ordering among
threads.
Can be used to do error reporting etc. */
virtual int xid_log_after(TC_group_commit_entry *entry) = 0;
private:
/* Mutex used to serialise calls to group_log_xid(). */
pthread_mutex_t LOCK_group_commit;
};
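
To make the group_log_xid() contract above concrete, a hedged sketch of a
subclass that logs each queued transaction in order. My_group_tc and its
write_one_xid() member are invented; the entry list, xid_error,
thd->xid_cookie and xid_log_after() follow the contract documented above,
and the base class's open()/close()/unlog() are omitted here.

void My_group_tc::group_log_xid(TC_group_commit_entry *first)
{
  /* Runs under LOCK_group_commit: log the whole group in queue order. */
  for (TC_group_commit_entry *e= first; e; e= e->next)
  {
    int err= write_one_xid(e->thd, e->all);   /* hypothetical logging step */
    e->xid_error= err;
    if (!err)
      e->thd->xid_cookie= 1;   /* any non-zero cookie signals success; a real
                                  TC would store something meaningful here */
  }
  /* A single flush/fsync for the whole group would go here. */
}

int My_group_tc::xid_log_after(TC_group_commit_entry *entry)
{
  /* Runs in each transaction's own thread, so my_error() is safe here. */
  if (entry->xid_error)
    my_error(ER_ERROR_DURING_COMMIT, MYF(0), entry->xid_error);
  return entry->xid_error;
}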
class TC_LOG_DUMMY: public TC_LOG_unordered // use it to disable the logging
{
public:
TC_LOG_DUMMY() {}
@ -48,7 +210,7 @@ public:
};
#ifdef HAVE_MMAP
class TC_LOG_MMAP: public TC_LOG
class TC_LOG_MMAP: public TC_LOG_unordered
{
public: // only to keep Sun Forte on sol9x86 happy
typedef enum {
@ -227,12 +389,19 @@ private:
time_t last_time;
};
class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
class binlog_trx_data;
class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG
{
private:
/* LOCK_log and LOCK_index are inited by init_pthread_objects() */
pthread_mutex_t LOCK_index;
pthread_mutex_t LOCK_prep_xids;
/*
Mutex to protect the queue of transactions waiting to participate in group
commit. (Only used on platforms without native atomic operations).
*/
pthread_mutex_t LOCK_queue;
pthread_cond_t COND_prep_xids;
pthread_cond_t update_cond;
ulonglong bytes_written;
@ -271,8 +440,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
In 5.0 it's 0 for relay logs too!
*/
bool no_auto_events;
ulonglong m_table_map_version;
/* Queue of transactions queued up to participate in group commit. */
binlog_trx_data *group_commit_queue;
int write_to_file(IO_CACHE *cache);
/*
@ -282,6 +451,14 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
*/
void new_file_without_locking();
void new_file_impl(bool need_lock);
int write_transaction(binlog_trx_data *trx_data);
bool write_transaction_to_binlog_events(binlog_trx_data *trx_data);
void trx_group_commit_participant(binlog_trx_data *trx_data);
void trx_group_commit_leader(TC_group_commit_entry *first);
binlog_trx_data *atomic_enqueue_trx(binlog_trx_data *trx_data);
binlog_trx_data *atomic_grab_trx_queue();
void mark_xid_done();
void mark_xids_active(uint xid_count);
public:
MYSQL_LOG::generate_name;
@ -310,18 +487,11 @@ public:
int open(const char *opt_name);
void close();
int log_xid(THD *thd, my_xid xid);
void group_log_xid(TC_group_commit_entry *first);
int xid_log_after(TC_group_commit_entry *entry);
void unlog(ulong cookie, my_xid xid);
int recover(IO_CACHE *log, Format_description_log_event *fdle);
#if !defined(MYSQL_CLIENT)
bool is_table_mapped(TABLE *table) const
{
return table->s->table_map_version == table_map_version();
}
ulonglong table_map_version() const { return m_table_map_version; }
void update_table_map_version() { ++m_table_map_version; }
int flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event);
int remove_pending_rows_event(THD *thd);
@ -362,10 +532,12 @@ public:
void new_file();
bool write(Log_event* event_info); // binary log write
bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
bool write_incident(THD *thd, bool lock);
bool write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data,
Log_event *end_ev);
bool trx_group_commit_finish(binlog_trx_data *trx_data);
bool write_incident(THD *thd);
int write_cache(IO_CACHE *cache, bool lock_log, bool flush_and_sync);
int write_cache(IO_CACHE *cache);
void set_write_error(THD *thd);
bool check_write_error(THD *thd);
@ -420,6 +592,7 @@ public:
inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);}
inline IO_CACHE *get_index_file() { return &index_file;}
inline uint32 get_open_count() { return open_count; }
void set_status_variables();
};
class Log_event_handler


@ -463,10 +463,9 @@ struct sql_ex_info
#define LOG_EVENT_SUPPRESS_USE_F 0x8
/*
The table map version internal to the log should be increased after
the event has been written to the binary log.
This used to be LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F, but is now unused.
*/
#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x10
#define LOG_EVENT_UNUSED1_F 0x10
/**
@def LOG_EVENT_ARTIFICIAL_F


@ -1333,6 +1333,7 @@ void clean_up(bool print_message)
ha_end();
if (tc_log)
tc_log->close();
TC_destroy();
xid_cache_free();
wt_end();
delete_elements(&key_caches, (void (*)(const char*, uchar*)) free_key_cache);
@ -4124,6 +4125,8 @@ a file name for --log-bin-index option", opt_binlog_index_name);
if (!errmesg[0][0])
unireg_abort(1);
TC_init();
/* We have to initialize the storage engines before CSV logging */
if (ha_init())
{


@ -673,6 +673,8 @@ THD::THD()
active_vio = 0;
#endif
pthread_mutex_init(&LOCK_thd_data, MY_MUTEX_INIT_FAST);
pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_FAST);
pthread_cond_init(&COND_commit_ordered, 0);
/* Variables with default values */
proc_info="login";
@ -999,6 +1001,8 @@ THD::~THD()
free_root(&transaction.mem_root,MYF(0));
#endif
mysys_var=0; // Safety (shouldn't be needed)
pthread_cond_destroy(&COND_commit_ordered);
pthread_mutex_destroy(&LOCK_commit_ordered);
pthread_mutex_destroy(&LOCK_thd_data);
#ifndef DBUG_OFF
dbug_sentry= THD_SENTRY_GONE;
@ -3773,7 +3777,6 @@ int THD::binlog_flush_pending_rows_event(bool stmt_end)
if (stmt_end)
{
pending->set_flags(Rows_log_event::STMT_END_F);
pending->flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
binlog_table_maps= 0;
}
@ -3901,7 +3904,6 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
{
Query_log_event qinfo(this, query_arg, query_len, is_trans, suppress_use,
errcode);
qinfo.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
/*
Binlog table maps will be irrelevant after a Query_log_event
(they are just removed on the slave side) so after the query


@ -1438,6 +1438,10 @@ public:
/* container for handler's private per-connection data */
Ha_data ha_data[MAX_HA];
/* Mutex and condition for waking up threads after group commit. */
pthread_mutex_t LOCK_commit_ordered;
pthread_cond_t COND_commit_ordered;
#ifndef MYSQL_CLIENT
int binlog_setup_trx_data();


@ -516,7 +516,6 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
else
{
Delete_file_log_event d(thd, db, transactional_table);
d.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
(void) mysql_bin_log.write(&d);
}
}
@ -698,7 +697,6 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
(duplicates == DUP_REPLACE) ? LOAD_DUP_REPLACE :
(ignore ? LOAD_DUP_IGNORE : LOAD_DUP_ERROR),
transactional_table, FALSE, errcode);
e.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
return mysql_bin_log.write(&e);
}


@ -296,13 +296,6 @@ TABLE_SHARE *alloc_table_share(TABLE_LIST *table_list, char *key,
share->version= refresh_version;
/*
This constant is used to mark that no table map version has been
assigned. No arithmetic is done on the value: it will be
overwritten with a value taken from MYSQL_BIN_LOG.
*/
share->table_map_version= ~(ulonglong)0;
/*
Since alloc_table_share() can be called without any locking (for
example, ha_create_table... functions), we do not assign a table
@ -367,10 +360,9 @@ void init_tmp_table_share(THD *thd, TABLE_SHARE *share, const char *key,
share->frm_version= FRM_VER_TRUE_VARCHAR;
/*
Temporary tables are not replicated, but we set up these fields
Temporary tables are not replicated, but we set up this field
anyway to be able to catch errors.
*/
share->table_map_version= ~(ulonglong)0;
share->cached_row_logging_check= -1;
/*


@ -433,7 +433,6 @@ typedef struct st_table_share
bool waiting_on_cond; /* Protection against free */
bool deleting; /* going to delete this table */
ulong table_map_id; /* for row-based replication */
ulonglong table_map_version;
/*
Cache for row-based replication table share checks that does not


@ -138,8 +138,6 @@ bool check_global_access(THD *thd, ulong want_access);
/** to protect innobase_open_files */
static pthread_mutex_t innobase_share_mutex;
/** to force correct commit order in binlog */
static pthread_mutex_t prepare_commit_mutex;
static ulong commit_threads = 0;
static pthread_mutex_t commit_threads_m;
static pthread_cond_t commit_cond;
@ -239,6 +237,7 @@ static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
static INNOBASE_SHARE *get_share(const char *table_name);
static void free_share(INNOBASE_SHARE *share);
static int innobase_close_connection(handlerton *hton, THD* thd);
static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
static int innobase_commit(handlerton *hton, THD* thd, bool all);
static int innobase_rollback(handlerton *hton, THD* thd, bool all);
static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
@ -1356,7 +1355,6 @@ innobase_trx_init(
trx_t* trx) /*!< in/out: InnoDB transaction handle */
{
DBUG_ENTER("innobase_trx_init");
DBUG_ASSERT(EQ_CURRENT_THD(thd));
DBUG_ASSERT(thd == trx->mysql_thd);
trx->check_foreigns = !thd_test_options(
@ -1416,8 +1414,6 @@ check_trx_exists(
{
trx_t*& trx = thd_to_trx(thd);
ut_ad(EQ_CURRENT_THD(thd));
if (trx == NULL) {
trx = innobase_trx_allocate(thd);
} else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
@ -2024,6 +2020,7 @@ innobase_init(
innobase_hton->savepoint_set=innobase_savepoint;
innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
innobase_hton->savepoint_release=innobase_release_savepoint;
innobase_hton->commit_ordered=innobase_commit_ordered;
innobase_hton->commit=innobase_commit;
innobase_hton->rollback=innobase_rollback;
innobase_hton->prepare=innobase_xa_prepare;
@ -2492,7 +2489,6 @@ skip_overwrite:
innobase_open_tables = hash_create(200);
pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST);
pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST);
pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST);
pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST);
pthread_mutex_init(&analyze_mutex, MY_MUTEX_INIT_FAST);
@ -2547,7 +2543,6 @@ innobase_end(
my_free(internal_innobase_data_file_path,
MYF(MY_ALLOW_ZERO_PTR));
pthread_mutex_destroy(&innobase_share_mutex);
pthread_mutex_destroy(&prepare_commit_mutex);
pthread_mutex_destroy(&commit_threads_m);
pthread_mutex_destroy(&commit_cond_m);
pthread_mutex_destroy(&analyze_mutex);
@ -2680,6 +2675,101 @@ innobase_start_trx_and_assign_read_view(
DBUG_RETURN(0);
}
/*****************************************************************//**
Perform the first, fast part of InnoDB commit.
Doing it in this call ensures that we get the same commit order here
as in the binlog and any other participating transactional storage engines.
Note that we want to do as little as is really needed here, as we run
under a global mutex. The expensive fsync() is done later, in
innobase_commit(), without holding any lock, so that group commit can take place.
Note also that this method can be called from a different thread than
the one handling the rest of the transaction. */
static
void
innobase_commit_ordered(
/*============*/
handlerton *hton, /*!< in: Innodb handlerton */
THD* thd, /*!< in: MySQL thread handle of the user for whom
the transaction should be committed */
bool all) /*!< in: TRUE - commit transaction
FALSE - the current SQL statement ended */
{
trx_t* trx;
DBUG_ENTER("innobase_commit_ordered");
DBUG_ASSERT(hton == innodb_hton_ptr);
trx = check_trx_exists(thd);
if (trx->active_trans == 0
&& trx->conc_state != TRX_NOT_STARTED) {
/* We cannot throw error here; instead we will catch this error
again in innobase_commit() and report it from there. */
DBUG_VOID_RETURN;
}
/* Since we will reserve the kernel mutex, we have to release
the search system latch first to obey the latching order. */
if (trx->has_search_latch) {
trx_search_latch_release_if_reserved(trx);
}
/* commit_ordered is only called when committing the whole transaction
(or an SQL statement when autocommit is on). */
DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
/* We need the current binlog position for ibbackup to work.
Note, the position is current because commit_ordered() is guaranteed
to be called in the same sequence as writing to the binlog. */
retry:
if (innobase_commit_concurrency > 0) {
pthread_mutex_lock(&commit_cond_m);
commit_threads++;
if (commit_threads > innobase_commit_concurrency) {
commit_threads--;
pthread_cond_wait(&commit_cond,
&commit_cond_m);
pthread_mutex_unlock(&commit_cond_m);
goto retry;
}
else {
pthread_mutex_unlock(&commit_cond_m);
}
}
/* The following calls to read the MySQL binary log
file name and the position return consistent results:
1) We use commit_ordered() to get the same commit order
in InnoDB as in the binary log.
2) A MySQL log file rotation cannot happen because
MySQL protects against this by having a counter of
transactions in prepared state and it only allows
a rotation when the counter drops to zero. See
LOCK_prep_xids and COND_prep_xids in log.cc. */
trx->mysql_log_file_name = mysql_bin_log_file_name();
trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos();
/* Don't do write + flush right now. For group commit
to work we want to do the flush in the innobase_commit()
method, which runs without holding any locks. */
trx->flush_log_later = TRUE;
innobase_commit_low(trx);
trx->flush_log_later = FALSE;
if (innobase_commit_concurrency > 0) {
pthread_mutex_lock(&commit_cond_m);
commit_threads--;
pthread_cond_signal(&commit_cond);
pthread_mutex_unlock(&commit_cond_m);
}
DBUG_VOID_RETURN;
}
/*****************************************************************//**
Commits a transaction in an InnoDB database or marks an SQL statement
ended.
@ -2702,13 +2792,6 @@ innobase_commit(
trx = check_trx_exists(thd);
/* Since we will reserve the kernel mutex, we have to release
the search system latch first to obey the latching order. */
if (trx->has_search_latch) {
trx_search_latch_release_if_reserved(trx);
}
/* The flag trx->active_trans is set to 1 in
1. ::external_lock(),
@ -2736,62 +2819,8 @@ innobase_commit(
/* We were instructed to commit the whole transaction, or
this is an SQL statement end and autocommit is on */
/* We need current binlog position for ibbackup to work.
Note, the position is current because of
prepare_commit_mutex */
retry:
if (innobase_commit_concurrency > 0) {
pthread_mutex_lock(&commit_cond_m);
commit_threads++;
if (commit_threads > innobase_commit_concurrency) {
commit_threads--;
pthread_cond_wait(&commit_cond,
&commit_cond_m);
pthread_mutex_unlock(&commit_cond_m);
goto retry;
}
else {
pthread_mutex_unlock(&commit_cond_m);
}
}
/* The following calls to read the MySQL binary log
file name and the position return consistent results:
1) Other InnoDB transactions cannot intervene between
these calls as we are holding prepare_commit_mutex.
2) Binary logging of other engines is not relevant
to InnoDB as all InnoDB requires is that committing
InnoDB transactions appear in the same order in the
MySQL binary log as they appear in InnoDB logs.
3) A MySQL log file rotation cannot happen because
MySQL protects against this by having a counter of
transactions in prepared state and it only allows
a rotation when the counter drops to zero. See
LOCK_prep_xids and COND_prep_xids in log.cc. */
trx->mysql_log_file_name = mysql_bin_log_file_name();
trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos();
/* Don't do write + flush right now. For group commit
to work we want to do the flush after releasing the
prepare_commit_mutex. */
trx->flush_log_later = TRUE;
innobase_commit_low(trx);
trx->flush_log_later = FALSE;
if (innobase_commit_concurrency > 0) {
pthread_mutex_lock(&commit_cond_m);
commit_threads--;
pthread_cond_signal(&commit_cond);
pthread_mutex_unlock(&commit_cond_m);
}
if (trx->active_trans == 2) {
pthread_mutex_unlock(&prepare_commit_mutex);
}
/* Now do a write + flush of logs. */
/* We did the first part already in innobase_commit_ordered();
now finish by doing a write + flush of logs. */
trx_commit_complete_for_mysql(trx);
trx->active_trans = 0;
@ -4621,6 +4650,7 @@ no_commit:
no need to re-acquire locks on it. */
/* Altering to InnoDB format */
innobase_commit_ordered(ht, user_thd, 1);
innobase_commit(ht, user_thd, 1);
/* Note that this transaction is still active. */
prebuilt->trx->active_trans = 1;
@ -4637,6 +4667,7 @@ no_commit:
/* Commit the transaction. This will release the table
locks, so they have to be acquired again. */
innobase_commit_ordered(ht, user_thd, 1);
innobase_commit(ht, user_thd, 1);
/* Note that this transaction is still active. */
prebuilt->trx->active_trans = 1;
@ -8339,6 +8370,7 @@ ha_innobase::external_lock(
if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
if (trx->active_trans != 0) {
innobase_commit_ordered(ht, thd, TRUE);
innobase_commit(ht, thd, TRUE);
}
} else {
@ -9448,36 +9480,6 @@ innobase_xa_prepare(
srv_active_wake_master_thread();
if (thd_sql_command(thd) != SQLCOM_XA_PREPARE &&
(all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
{
if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) {
/* choose group commit rather than binlog order */
return(error);
}
/* For ibbackup to work the order of transactions in binlog
and InnoDB must be the same. Consider the situation
thread1> prepare; write to binlog; ...
<context switch>
thread2> prepare; write to binlog; commit
thread1> ... commit
To ensure this will not happen we're taking the mutex on
prepare, and releasing it on commit.
Note: only do it for normal commits, done via ha_commit_trans.
If 2pc protocol is executed by external transaction
coordinator, it will be just a regular MySQL client
executing XA PREPARE and XA COMMIT commands.
In this case we cannot know how many minutes or hours
will be between XA PREPARE and XA COMMIT, and we don't want
to block for undefined period of time. */
pthread_mutex_lock(&prepare_commit_mutex);
trx->active_trans = 2;
}
return(error);
}
@ -10669,11 +10671,6 @@ static MYSQL_SYSVAR_ENUM(adaptive_checkpoint, srv_adaptive_checkpoint,
"Enable/Disable flushing along modified age. (none, reflex, [estimate])",
NULL, innodb_adaptive_checkpoint_update, 2, &adaptive_checkpoint_typelib);
static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit,
PLUGIN_VAR_RQCMDARG,
"Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
NULL, NULL, 0, 0, 1, 0);
static MYSQL_SYSVAR_ULONG(expand_import, srv_expand_import,
PLUGIN_VAR_RQCMDARG,
"Enable/Disable converting automatically *.ibd files when import tablespace.",
@ -10763,7 +10760,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(flush_neighbor_pages),
MYSQL_SYSVAR(read_ahead),
MYSQL_SYSVAR(adaptive_checkpoint),
MYSQL_SYSVAR(enable_unsafe_group_commit),
MYSQL_SYSVAR(expand_import),
MYSQL_SYSVAR(extra_rsegments),
MYSQL_SYSVAR(dict_size_limit),