From 8bc445360ee9f21560b5613e0bdc8b363f34d5b6 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 4 Oct 2010 20:40:31 +0200 Subject: [PATCH] MWL#116: Efficient group commit Tweak the commit_ordered() semantics. Now it is only called for transactions that go through 2-phase commit. This avoids forcing engines to make commits visible before they are durable. Also take LOCK_commit_ordered() around START TRANSACTION WITH CONSISTENT SNAPSHOT, to get a truly consistent snapshot. --- sql/handler.cc | 31 ++---- sql/handler.h | 16 +-- storage/xtradb/handler/ha_innodb.cc | 157 ++++++++++++++++------------ storage/xtradb/include/trx0trx.h | 11 +- 4 files changed, 117 insertions(+), 98 deletions(-) diff --git a/sql/handler.cc b/sql/handler.cc index 4975b4a1230..6503516a2f8 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1251,32 +1251,7 @@ int ha_commit_one_phase(THD *thd, bool all) enclosing 'all' transaction is rolled back. */ bool is_real_trans=all || thd->transaction.all.ha_list == 0; - Ha_trx_info *ha_info= trans->ha_list; DBUG_ENTER("ha_commit_one_phase"); -#ifdef USING_TRANSACTIONS - if (ha_info) - { - if (is_real_trans) - { - bool locked= false; - for (; ha_info; ha_info= ha_info->next()) - { - handlerton *ht= ha_info->ht(); - if (ht->commit_ordered) - { - if (ha_info->is_trx_read_write() && !locked) - { - pthread_mutex_lock(&LOCK_commit_ordered); - locked= 1; - } - ht->commit_ordered(ht, thd, all); - } - } - if (locked) - pthread_mutex_unlock(&LOCK_commit_ordered); - } - } -#endif /* USING_TRANSACTIONS */ DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans)); } @@ -1901,7 +1876,13 @@ int ha_start_consistent_snapshot(THD *thd) { bool warn= true; + /* + Holding the LOCK_commit_ordered mutex ensures that for any transaction + we either see it committed in all engines, or in none. 
+ */ + pthread_mutex_lock(&LOCK_commit_ordered); plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn); + pthread_mutex_unlock(&LOCK_commit_ordered); /* Same idea as when one wants to CREATE TABLE in one engine which does not diff --git a/sql/handler.h b/sql/handler.h index a5c4af533be..82926dc08c0 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -667,6 +667,11 @@ struct handlerton full transaction is committed, not for each commit of statement transaction in a multi-statement transaction. + Note that, like prepare(), commit_ordered() is only called when 2-phase + commit takes place. I.e. when there is no binary log and only a single engine + participates in a transaction, only commit() is called, not + commit_ordered(). So engines must be prepared for this. + The calls to commit_ordered() in multiple parallel transactions is guaranteed to happen in the same order in every participating handler. This can be used to ensure the same commit order among multiple @@ -684,11 +689,9 @@ struct handlerton doing any time-consuming or blocking operations in commit_ordered() will limit scalability. - Handlers can rely on commit_ordered() calls for transactions that updated - data to be serialised (no two calls can run in parallel, so no extra - locking on the handler part is required to ensure this). However, calls - for SELECT-only transactions are not serialised, so can occur in parallel - with each other and with at most one write-transaction. + Handlers can rely on commit_ordered() calls to be serialised (no two + calls can run in parallel, so no extra locking on the handler part is + required to ensure this). Note that commit_ordered() can be called from a different thread than the one handling the transaction! So it can not do anything that depends on @@ -700,7 +703,8 @@ struct handlerton must be saved and returned from the commit() method instead. The commit_ordered method is optional, and can be left unset if not - needed in a particular handler. 
+ needed in a particular handler (then there will be no ordering guarantees + wrt. other engines and binary log). */ void (*commit_ordered)(handlerton *hton, THD *thd, bool all); int (*rollback)(handlerton *hton, THD *thd, bool all); diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 7b1f3bc01dc..d9abb5ae032 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -1700,10 +1700,10 @@ innobase_query_caching_of_table_permitted( /* The call of row_search_.. will start a new transaction if it is not yet started */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(innodb_hton_ptr, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } if (row_search_check_if_query_cache_permitted(trx, norm_name)) { @@ -1973,11 +1973,11 @@ ha_innobase::init_table_handle_for_HANDLER(void) /* Set the MySQL flag to mark that there is an active transaction */ - if (prebuilt->trx->active_trans == 0) { + if ((prebuilt->trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, user_thd); - prebuilt->trx->active_trans = 1; + prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } /* We did the necessary inits in this function, no need to repeat them @@ -2704,58 +2704,21 @@ innobase_start_trx_and_assign_read_view( /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(hton, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } DBUG_RETURN(0); } -/*****************************************************************//** -Perform the first, fast part of InnoDB commit. - -Doing it in this call ensures that we get the same commit order here -as in binlog and any other participating transactional storage engines. 
- -Note that we want to do as little as really needed here, as we run -under a global mutex. The expensive fsync() is done later, in -innobase_commit(), without a lock so group commit can take place. - -Note also that this method can be called from a different thread than -the one handling the rest of the transaction. */ static void -innobase_commit_ordered( +innobase_commit_ordered_2( /*============*/ - handlerton *hton, /*!< in: Innodb handlerton */ - THD* thd, /*!< in: MySQL thread handle of the user for whom - the transaction should be committed */ - bool all) /*!< in: TRUE - commit transaction - FALSE - the current SQL statement ended */ + trx_t* trx) /*!< in: Innodb transaction */ { - trx_t* trx; DBUG_ENTER("innobase_commit_ordered"); - DBUG_ASSERT(hton == innodb_hton_ptr); - - trx = check_trx_exists(thd); - - if (trx->active_trans == 0 - && trx->conc_state != TRX_NOT_STARTED) { - /* We cannot throw error here; instead we will catch this error - again in innobase_commit() and report it from there. */ - DBUG_VOID_RETURN; - } - /* Since we will reserve the kernel mutex, we have to release - the search system latch first to obey the latching order. */ - - if (trx->has_search_latch) { - trx_search_latch_release_if_reserved(trx); - } - - /* commit_ordered is only called when committing the whole transaction - (or an SQL statement when autocommit is on). */ - DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); /* We need current binlog position for ibbackup to work. Note, the position is current because commit_ordered is guaranteed @@ -2807,6 +2770,60 @@ retry: DBUG_VOID_RETURN; } +/*****************************************************************//** +Perform the first, fast part of InnoDB commit. + +Doing it in this call ensures that we get the same commit order here +as in binlog and any other participating transactional storage engines. + +Note that we want to do as little as really needed here, as we run +under a global mutex. 
The expensive fsync() is done later, in +innobase_commit(), without a lock so group commit can take place. + +Note also that this method can be called from a different thread than +the one handling the rest of the transaction. */ +static +void +innobase_commit_ordered( +/*============*/ + handlerton *hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the user for whom + the transaction should be committed */ + bool all) /*!< in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + trx_t* trx; + DBUG_ENTER("innobase_commit_ordered"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + /* Since we will reserve the kernel mutex, we have to release + the search system latch first to obey the latching order. */ + + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 + && trx->conc_state != TRX_NOT_STARTED) { + /* We cannot throw error here; instead we will catch this error + again in innobase_commit() and report it from there. */ + DBUG_VOID_RETURN; + } + + /* commit_ordered is only called when committing the whole transaction + (or an SQL statement when autocommit is on). */ + DBUG_ASSERT(all || + (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); + + innobase_commit_ordered_2(trx); + + trx->active_trans |= TRX_ACTIVE_COMMIT_ORDERED; + + DBUG_VOID_RETURN; +} + /*****************************************************************//** Commits a transaction in an InnoDB database or marks an SQL statement ended. @@ -2829,7 +2846,15 @@ innobase_commit( trx = check_trx_exists(thd); - /* The flag trx->active_trans is set to 1 in + /* Since we will reserve the kernel mutex, we have to release + the search system latch first to obey the latching order. 
*/ + + if (trx->has_search_latch && + (trx->active_trans & TRX_ACTIVE_COMMIT_ORDERED) == 0) { + trx_search_latch_release_if_reserved(trx); + } + + /* The flag TRX_ACTIVE_IN_MYSQL in trx->active_trans is set in 1. ::external_lock(), 2. ::start_stmt(), @@ -2839,20 +2864,26 @@ innobase_commit( 6. innobase_start_trx_and_assign_read_view(), 7. ::transactional_table_lock() - and it is only set to 0 in a commit or a rollback. If it is 0 we know + and it is only cleared in a commit or a rollback. If it is unset we know there cannot be resources to be freed and we could return immediately. For the time being, we play safe and do the cleanup though there should be nothing to clean up. */ - if (trx->active_trans == 0 + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 && trx->conc_state != TRX_NOT_STARTED) { sql_print_error("trx->active_trans == 0, but" " trx->conc_state != TRX_NOT_STARTED"); } + if (all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + /* Run the fast part of commit if we did not already. */ + if ((trx->active_trans & TRX_ACTIVE_COMMIT_ORDERED) == 0) { + innobase_commit_ordered_2(trx); + } + /* We were instructed to commit the whole transaction, or this is an SQL statement end and autocommit is on */ @@ -3076,7 +3107,7 @@ innobase_savepoint( innobase_release_stat_resources(trx); /* cannot happen outside of transaction */ - DBUG_ASSERT(trx->active_trans); + DBUG_ASSERT(trx->active_trans & TRX_ACTIVE_IN_MYSQL); /* TODO: use provided savepoint data area to store savepoint data */ char name[64]; @@ -3106,7 +3137,7 @@ innobase_close_connection( ut_a(trx); - if (trx->active_trans == 0 + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 && trx->conc_state != TRX_NOT_STARTED) { sql_print_error("trx->active_trans == 0, but" @@ -5021,10 +5052,9 @@ no_commit: no need to re-acquire locks on it. 
*/ /* Altering to InnoDB format */ - innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ - prebuilt->trx->active_trans = 1; + prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL; /* We will need an IX lock on the destination table. */ prebuilt->sql_stat_start = TRUE; } else { @@ -5038,10 +5068,9 @@ no_commit: /* Commit the transaction. This will release the table locks, so they have to be acquired again. */ - innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ - prebuilt->trx->active_trans = 1; + prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL; /* Re-acquire the table lock on the source table. */ row_lock_table_for_mysql(prebuilt, src_table, mode); /* We will need an IX lock on the destination table. */ @@ -8929,10 +8958,10 @@ ha_innobase::start_stmt( trx->detailed_error[0] = '\0'; /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } else { innobase_register_stmt(ht, thd); } @@ -9030,10 +9059,10 @@ ha_innobase::external_lock( /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } else if (trx->n_mysql_tables_in_use == 0) { innobase_register_stmt(ht, thd); } @@ -9131,8 +9160,7 @@ ha_innobase::external_lock( prebuilt->used_in_HANDLER = FALSE; if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { - if (trx->active_trans != 0) { - innobase_commit_ordered(ht, thd, TRUE); + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) != 0) { innobase_commit(ht, thd, TRUE); } } else { @@ -9217,10 
+9245,10 @@ ha_innobase::transactional_table_lock( /* MySQL is setting a new transactional table lock */ /* Set the MySQL flag to mark that there is an active transaction */ - if (trx->active_trans == 0) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) { innobase_register_trx_and_stmt(ht, thd); - trx->active_trans = 1; + trx->active_trans |= TRX_ACTIVE_IN_MYSQL; } if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) { @@ -10272,7 +10300,8 @@ innobase_xa_prepare( innobase_release_stat_resources(trx); - if (trx->active_trans == 0 && trx->conc_state != TRX_NOT_STARTED) { + if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 && + trx->conc_state != TRX_NOT_STARTED) { sql_print_error("trx->active_trans == 0, but trx->conc_state != " "TRX_NOT_STARTED"); @@ -10284,7 +10313,7 @@ innobase_xa_prepare( /* We were instructed to prepare the whole transaction, or this is an SQL statement end and autocommit is on */ - ut_ad(trx->active_trans); + ut_ad(trx->active_trans & TRX_ACTIVE_IN_MYSQL); error = (int) trx_prepare_for_mysql(trx); } else { diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h index 4c0ce392bcd..36f9c464c2b 100644 --- a/storage/xtradb/include/trx0trx.h +++ b/storage/xtradb/include/trx0trx.h @@ -511,9 +511,10 @@ struct trx_struct{ in that case we must flush the log in trx_commit_complete_for_mysql() */ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ - ulint active_trans; /*!< 1 - if a transaction in MySQL - is active. 2 - if prepare_commit_mutex - was taken */ + ulint active_trans; /*!< TRX_ACTIVE_IN_MYSQL - set if a + transaction in MySQL is active. + TRX_ACTIVE_COMMIT_ORDERED - set if + innobase_commit_ordered has run */ ulint has_search_latch; /* TRUE if this trx has latched the search system latch in S-mode */ @@ -824,6 +825,10 @@ Multiple flags can be combined with bitwise OR. 
*/ #define TRX_SIG_OTHER_SESS 1 /* sent by another session (which must hold rights to this) */ +/* Flag bits for trx_struct.active_trans */ +#define TRX_ACTIVE_IN_MYSQL (1<<0) +#define TRX_ACTIVE_COMMIT_ORDERED (1<<1) + /** Commit node states */ enum commit_node_state { COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to