diff --git a/mysql-test/suite/innodb/r/purge_pessimistic.result b/mysql-test/suite/innodb/r/purge_pessimistic.result new file mode 100644 index 00000000000..a7ce4e13d60 --- /dev/null +++ b/mysql-test/suite/innodb/r/purge_pessimistic.result @@ -0,0 +1,36 @@ +# +# MDEV-35508: Race condition between purge and secondary index INSERT or UPDATE +# +SET @old_debug_dbug = @@global.debug_dbug; +CREATE TABLE t1(col1 INT PRIMARY KEY, col2 int, KEY k1(col2)) ENGINE=Innodb; +INSERT INTO t1 VALUES(1, 100); +CREATE TABLE t2(col1 INT PRIMARY KEY) Engine=Innodb; +InnoDB 0 transactions not purged +START TRANSACTION; +INSERT INTO t2 VALUES(10); +SET DEBUG_SYNC='RESET'; +SET GLOBAL debug_dbug= "+d,btr_force_pessimistic_delete"; +SET GLOBAL debug_dbug= "+d,enable_row_purge_sec_tree_sync"; +connect con1,localhost,root; +UPDATE t1 SET col2 = 200 WHERE col1 = 1; +connection default; +SET DEBUG_SYNC= 'now WAIT_FOR purge_sec_tree_begin'; +SET GLOBAL debug_dbug= "-d,enable_row_purge_sec_tree_sync"; +UPDATE t1 SET col2 = 100 WHERE col1 = 1; +SET DEBUG_SYNC= 'now SIGNAL purge_sec_tree_execute'; +COMMIT; +InnoDB 0 transactions not purged +disconnect con1; +SELECT * FROM t1; +col1 col2 +1 100 +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +DROP TABLE t1; +SELECT * FROM t2; +col1 +10 +DROP TABLE t2; +SET @@GLOBAL.debug_dbug = @old_debug_dbug; +SET DEBUG_SYNC='RESET'; diff --git a/mysql-test/suite/innodb/t/purge_pessimistic.opt b/mysql-test/suite/innodb/t/purge_pessimistic.opt new file mode 100644 index 00000000000..a39e5228c9d --- /dev/null +++ b/mysql-test/suite/innodb/t/purge_pessimistic.opt @@ -0,0 +1 @@ +--innodb_purge_threads=1 diff --git a/mysql-test/suite/innodb/t/purge_pessimistic.test b/mysql-test/suite/innodb/t/purge_pessimistic.test new file mode 100644 index 00000000000..88be65a6be5 --- /dev/null +++ b/mysql-test/suite/innodb/t/purge_pessimistic.test @@ -0,0 +1,51 @@ +--source include/have_innodb.inc +--source include/count_sessions.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/not_embedded.inc + +--echo # +--echo # MDEV-35508: Race condition between purge and secondary index INSERT or UPDATE +--echo # + +SET @old_debug_dbug = @@global.debug_dbug; + +CREATE TABLE t1(col1 INT PRIMARY KEY, col2 int, KEY k1(col2)) ENGINE=Innodb; +INSERT INTO t1 VALUES(1, 100); + +CREATE TABLE t2(col1 INT PRIMARY KEY) Engine=Innodb; +--source include/wait_all_purged.inc + +START TRANSACTION; +INSERT INTO t2 VALUES(10); + +SET DEBUG_SYNC='RESET'; + +SET GLOBAL debug_dbug= "+d,btr_force_pessimistic_delete"; +SET GLOBAL debug_dbug= "+d,enable_row_purge_sec_tree_sync"; + +--connect (con1,localhost,root) +UPDATE t1 SET col2 = 200 WHERE col1 = 1; + +--connection default +SET DEBUG_SYNC= 'now WAIT_FOR purge_sec_tree_begin'; +SET GLOBAL debug_dbug= "-d,enable_row_purge_sec_tree_sync"; + +UPDATE t1 SET col2 = 100 WHERE col1 = 1; +SET DEBUG_SYNC= 'now SIGNAL purge_sec_tree_execute'; + +COMMIT; +--source include/wait_all_purged.inc + +--disconnect con1 +--source include/wait_until_count_sessions.inc + +SELECT * FROM t1; +CHECK TABLE t1; +DROP TABLE t1; + +SELECT * FROM t2; +DROP TABLE t2; + +SET @@GLOBAL.debug_dbug = @old_debug_dbug; +SET DEBUG_SYNC='RESET'; diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 1d7c67d73cf..a5ffc825c20 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -4632,6 +4632,9 @@ btr_cur_optimistic_delete( ULINT_UNDEFINED, &heap); dberr_t err = DB_SUCCESS; + DBUG_EXECUTE_IF("btr_force_pessimistic_delete", + err = DB_FAIL; goto func_exit;); + if (rec_offs_any_extern(offsets) || !btr_cur_can_delete_without_compress(cursor, rec_offs_size(offsets), diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 72ba73007cf..f76bdea33a4 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -946,16 +946,58 @@ public: @return whether any transaction not newer than id might be active */ - bool find_same_or_older(trx_t *trx, trx_id_t id) + bool find_same_or_older_low(trx_t *trx, trx_id_t id) noexcept; + + /** + Determine if the specified transaction or any older one might be active. + + @param trx transaction whose max_inactive_id will be consulted + @param id identifier of another transaction + @return whether any transaction not newer than id might be active + */ + + bool find_same_or_older(trx_t *trx, trx_id_t id) noexcept { if (trx->max_inactive_id >= id) return false; - bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id); + const bool found{find_same_or_older_low(trx, id)}; if (!found) trx->max_inactive_id= id; return found; } + /** + Determine if the specified transaction or any older one might be active. + + @param trx purge_sys.query->trx (may be used by multiple threads) + @param id transaction identifier to check + @return whether any transaction not newer than id might be active + */ + + bool find_same_or_older_in_purge(trx_t *trx, trx_id_t id) noexcept + { +#if SIZEOF_SIZE_T < 8 && !defined __i386__ + /* On systems that lack native 64-bit loads and stores, + it should be more efficient to acquire a futex-backed mutex + earlier than to invoke a loop or a complex library function. + + Our IA-32 target is not "i386" but at least "i686", that is, at least + Pentium MMX, which has a 64-bit data bus and 64-bit XMM registers. */ + trx->mutex_lock(); + trx_id_t &max_inactive_id= trx->max_inactive_id; + const bool hot{max_inactive_id < id && find_same_or_older(trx, id)}; +#else + Atomic_relaxed &max_inactive_id= trx->max_inactive_id_atomic; + if (max_inactive_id >= id) + return false; + trx->mutex_lock(); + const bool hot{find_same_or_older(trx, id)}; +#endif + if (hot) + max_inactive_id= id; + trx->mutex_unlock(); + return hot; + } /** Determines the maximum transaction id. @@ -1193,12 +1235,7 @@ public: inline void undo_truncate_start(fil_space_t &space); private: - static my_bool find_same_or_older_callback(void *el, void *i) - { - auto element= static_cast(el); - auto id= static_cast(i); - return element->id <= *id; - } + static my_bool find_same_or_older_callback(void *el, void *i) noexcept; struct snapshot_ids_arg diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index afa44d01485..79d5b7be5da 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -598,10 +598,25 @@ public: Cleared in commit_in_memory() after commit_state(), trx_sys_t::deregister_rw(), release_locks(). */ trx_id_t id; - /** The largest encountered transaction identifier for which no - transaction was observed to be active. This is a cache to speed up - trx_sys_t::find_same_or_older(). */ - trx_id_t max_inactive_id; + union + { + /** The largest encountered transaction identifier for which no + transaction was observed to be active. This is a cache to speed up + trx_sys_t::find_same_or_older(). + + This will be zero-initialized in Pool::Pool() and not initialized + when a transaction object in the pool is freed and reused. The + idea is that new transactions can reuse the result of + an expensive trx_sys_t::find_same_or_older_low() invocation that + was performed in an earlier transaction that used the same + memory area. */ + trx_id_t max_inactive_id; + /** Same as max_inactive_id, for purge_sys.query->trx which may be + accessed by multiple concurrent threads in in + trx_sys_t::find_same_or_older_in_purge(). Writes are protected by + trx_t::mutex. */ + Atomic_relaxed max_inactive_id_atomic; + }; private: /** mutex protecting state and some of lock diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h index 63628cc169f..410f7875d72 100644 --- a/storage/innobase/include/ut0pool.h +++ b/storage/innobase/include/ut0pool.h @@ -68,6 +68,15 @@ struct Pool { aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE)); memset_aligned( m_start, 0, m_size); + /* Note: The above would zero-initialize some + std::atomic data members in trx_t, such as + trx_t::lock, which will not be initialized further in + TrxFactory::init(). It may be implementation defined + whether such zero initialization works. On some + hypothetical platform (not one that seems to be + supported by a mainstream C++ compiler), std::atomic + might wrap the data member as well as a + non-zero-initialized mutex. */ m_last = m_start; diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index 0d9614222f7..44e79d17e97 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -746,6 +746,22 @@ bool row_purge_poss_sec(purge_node_t *node, dict_index_t *index, return can_delete; } +/** Report an error about not delete-marked secondary index record +that was about to be purged. +@param cur cursor on the secondary index record +@param entry search key */ +ATTRIBUTE_COLD ATTRIBUTE_NOINLINE +static void row_purge_del_mark_error(const btr_cur_t &cursor, + const dtuple_t &entry) +{ + const dict_index_t *index= cursor.index(); + ib::error() << "tried to purge non-delete-marked record in index " + << index->name << " of table " << index->table->name + << ": tuple: " << entry + << ", record: " << rec_index_print(cursor.page_cur.rec, index); + ut_ad(0); +} + __attribute__((nonnull, warn_unused_result)) /** Remove a secondary index entry if possible, by modifying the index tree. @param node purge node @@ -765,6 +781,16 @@ static bool row_purge_remove_sec_if_poss_tree(purge_node_t *node, mtr_t mtr; log_free_check(); +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("enable_row_purge_sec_tree_sync", + debug_sync_set_action(current_thd, STRING_WITH_LEN( + "now SIGNAL " + "purge_sec_tree_begin")); + debug_sync_set_action(current_thd, STRING_WITH_LEN( + "now WAIT_FOR " + "purge_sec_tree_execute")); + ); +#endif mtr.start(); index->set_modified(mtr); pcur.btr_cur.page_cur.index = index; @@ -812,26 +838,13 @@ found: /* Remove the index record, which should have been marked for deletion. */ - if (!rec_get_deleted_flag(btr_cur_get_rec( - btr_pcur_get_btr_cur(&pcur)), - dict_table_is_comp(index->table))) { - ib::error() - << "tried to purge non-delete-marked record" - " in index " << index->name - << " of table " << index->table->name - << ": tuple: " << *entry - << ", record: " << rec_index_print( - btr_cur_get_rec( - btr_pcur_get_btr_cur(&pcur)), - index); - - ut_ad(0); - + if (!rec_get_deleted_flag(btr_pcur_get_rec(&pcur), + index->table->not_redundant())) { + row_purge_del_mark_error(pcur.btr_cur, *entry); goto func_exit; } - btr_cur_pessimistic_delete(&err, FALSE, - btr_pcur_get_btr_cur(&pcur), + btr_cur_pessimistic_delete(&err, FALSE, &pcur.btr_cur, 0, false, &mtr); switch (UNIV_EXPECT(err, DB_SUCCESS)) { case DB_SUCCESS: @@ -850,12 +863,34 @@ func_exit: return success; } +/** Compute a nonzero return value of row_purge_remove_sec_if_poss_leaf(). +@param page latched secondary index page +@return PAGE_MAX_TRX_ID for row_purge_remove_sec_if_poss_tree() +@retval 1 if a further row_purge_poss_sec() check is necessary */ +ATTRIBUTE_NOINLINE ATTRIBUTE_COLD +static trx_id_t row_purge_check(const page_t *page) noexcept +{ + trx_id_t id= page_get_max_trx_id(page); + ut_ad(id); + if (trx_sys.find_same_or_older_in_purge(purge_sys.query->trx, id)) + /* Because an active transaction may modify the secondary index + but not PAGE_MAX_TRX_ID, row_purge_poss_sec() must be invoked + again after re-latching the page. Let us return a bogus ID. Yes, + an actual transaction with ID 1 would create the InnoDB dictionary + tables in dict_sys_t::create_or_check_sys_tables(), but it would + exclusively write TRX_UNDO_INSERT_REC records. Purging those + records never involves row_purge_remove_sec_if_poss_tree(). */ + id= 1; + return id; +} + __attribute__((nonnull, warn_unused_result)) /** Remove a secondary index entry if possible, without modifying the tree. @param node purge node @param index secondary index @param entry index entry @return PAGE_MAX_TRX_ID for row_purge_remove_sec_if_poss_tree() +@retval 1 if a further row_purge_poss_sec() check is necessary @retval 0 if success or if not found */ static trx_id_t row_purge_remove_sec_if_poss_leaf(purge_node_t *node, dict_index_t *index, @@ -895,39 +930,28 @@ found: /* Before attempting to purge a record, check if it is safe to do so. */ if (row_purge_poss_sec(node, index, entry, &mtr)) { - btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); - /* Only delete-marked records should be purged. */ - if (!rec_get_deleted_flag( - btr_cur_get_rec(btr_cur), - dict_table_is_comp(index->table))) { - - ib::error() - << "tried to purge non-delete-marked" - " record" " in index " << index->name - << " of table " << index->table->name - << ": tuple: " << *entry - << ", record: " - << rec_index_print( - btr_cur_get_rec(btr_cur), - index); + if (!rec_get_deleted_flag(btr_pcur_get_rec(&pcur), + index->table + ->not_redundant())) { + row_purge_del_mark_error(pcur.btr_cur, *entry); mtr.commit(); dict_set_corrupted(index, "purge"); goto cleanup; } if (index->is_spatial()) { - const buf_block_t* block = btr_cur_get_block( - btr_cur); + const buf_block_t* block = btr_pcur_get_block( + &pcur); if (block->page.id().page_no() != index->page && page_get_n_recs(block->page.frame) < 2 && !lock_test_prdt_page_lock( - btr_cur->rtr_info - && btr_cur->rtr_info->thr + pcur.btr_cur.rtr_info + && pcur.btr_cur.rtr_info->thr ? thr_get_trx( - btr_cur->rtr_info->thr) + pcur.btr_cur.rtr_info->thr) : nullptr, block->page.id())) { /* this is the last record on page, @@ -942,11 +966,11 @@ found: } } - if (btr_cur_optimistic_delete(btr_cur, 0, &mtr) - == DB_FAIL) { - page_max_trx_id = page_get_max_trx_id( - btr_cur_get_page(btr_cur)); - } + if (btr_cur_optimistic_delete(&pcur.btr_cur, 0, &mtr) + == DB_FAIL) { + page_max_trx_id = row_purge_check( + btr_pcur_get_page(&pcur)); + } } /* (The index entry is still needed, diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 76f3ba79baf..614e34e5a8a 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -212,6 +212,19 @@ TPOOL_SUPPRESS_TSAN size_t trx_sys_t::history_size_approx() const return size; } +my_bool trx_sys_t::find_same_or_older_callback(void *el, void *i) noexcept +{ + auto element= static_cast(el); + auto id= static_cast(i); + return element->id <= *id; +} + + +bool trx_sys_t::find_same_or_older_low(trx_t *trx, trx_id_t id) noexcept +{ + return rw_trx_hash.iterate(trx, find_same_or_older_callback, &id); +} + /** Create a persistent rollback segment. @param space_id system or undo tablespace id @return pointer to new rollback segment