From 19acb0257eec0956e3a0dd3b11bbb701fa7ea648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Fri, 29 Nov 2024 10:44:38 +0200 Subject: [PATCH] MDEV-35508 Race condition between purge and secondary index INSERT or UPDATE row_purge_remove_sec_if_poss_leaf(): If there is an active transaction that is not newer than PAGE_MAX_TRX_ID, return the bogus value 1 so that row_purge_remove_sec_if_poss_tree() is guaranteed to recheck if the record needs to be purged. It could be the case that an active transaction would insert this record between the time this check completed and row_purge_remove_sec_if_poss_tree() acquired a latch on the secondary index leaf page again. row_purge_del_mark_error(), row_purge_check(): Some unlikely code refactored into separate non-inline functions. trx_sys_t::find_same_or_older_low(): Move the unlikely and bulky part of trx_sys_t::find_same_or_older() to a non-inline function. trx_sys_t::find_same_or_older_in_purge(): A variant of trx_sys_t::find_same_or_older() for use in the purge subsystem, with potential concurrent access of the same trx_t object from multiple threads. trx_t::max_inactive_id_atomic: An Atomic_relaxed alias of the regular data field trx_t::max_inactive_id, which we use on systems that have native 64-bit loads or stores. On any 64-bit system that seems to be supported by GCC, Clang or MSVC, relaxed atomic loads and stores use the regular load and store instructions. On -march=i686 the 64-bit atomic loads and stores would use an XMM register. This fixes a regression that had been introduced in commit b7b9f3ce825a08f16ad851c16f603365ae473195 (MDEV-34515). There would be messages [ERROR] InnoDB: tried to purge non-delete-marked record in index in the server error log, and an assertion ut_ad(0) would cause a crash of debug instrumented builds. This could also cause incorrect results for MVCC reads and corrupted secondary indexes. The debug instrumented test case was written by Debarun Banerjee. Reviewed by: Debarun Banerjee --- .../suite/innodb/r/purge_pessimistic.result | 36 ++++++ .../suite/innodb/t/purge_pessimistic.opt | 1 + .../suite/innodb/t/purge_pessimistic.test | 51 +++++++++ storage/innobase/btr/btr0cur.cc | 3 + storage/innobase/include/trx0sys.h | 53 +++++++-- storage/innobase/include/trx0trx.h | 23 +++- storage/innobase/include/ut0pool.h | 9 ++ storage/innobase/row/row0purge.cc | 108 +++++++++++------- storage/innobase/trx/trx0sys.cc | 13 +++ 9 files changed, 243 insertions(+), 54 deletions(-) create mode 100644 mysql-test/suite/innodb/r/purge_pessimistic.result create mode 100644 mysql-test/suite/innodb/t/purge_pessimistic.opt create mode 100644 mysql-test/suite/innodb/t/purge_pessimistic.test diff --git a/mysql-test/suite/innodb/r/purge_pessimistic.result b/mysql-test/suite/innodb/r/purge_pessimistic.result new file mode 100644 index 00000000000..a7ce4e13d60 --- /dev/null +++ b/mysql-test/suite/innodb/r/purge_pessimistic.result @@ -0,0 +1,36 @@ +# +# MDEV-35508: Race condition between purge and secondary index INSERT or UPDATE +# +SET @old_debug_dbug = @@global.debug_dbug; +CREATE TABLE t1(col1 INT PRIMARY KEY, col2 int, KEY k1(col2)) ENGINE=Innodb; +INSERT INTO t1 VALUES(1, 100); +CREATE TABLE t2(col1 INT PRIMARY KEY) Engine=Innodb; +InnoDB 0 transactions not purged +START TRANSACTION; +INSERT INTO t2 VALUES(10); +SET DEBUG_SYNC='RESET'; +SET GLOBAL debug_dbug= "+d,btr_force_pessimistic_delete"; +SET GLOBAL debug_dbug= "+d,enable_row_purge_sec_tree_sync"; +connect con1,localhost,root; +UPDATE t1 SET col2 = 200 WHERE col1 = 1; +connection default; +SET DEBUG_SYNC= 'now WAIT_FOR purge_sec_tree_begin'; +SET GLOBAL debug_dbug= "-d,enable_row_purge_sec_tree_sync"; +UPDATE t1 SET col2 = 100 WHERE col1 = 1; +SET DEBUG_SYNC= 'now SIGNAL purge_sec_tree_execute'; +COMMIT; +InnoDB 0 transactions not purged +disconnect con1; +SELECT * FROM t1; +col1 col2 +1 100 +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +DROP TABLE t1; +SELECT * FROM t2; +col1 +10 +DROP TABLE t2; +SET @@GLOBAL.debug_dbug = @old_debug_dbug; +SET DEBUG_SYNC='RESET'; diff --git a/mysql-test/suite/innodb/t/purge_pessimistic.opt b/mysql-test/suite/innodb/t/purge_pessimistic.opt new file mode 100644 index 00000000000..a39e5228c9d --- /dev/null +++ b/mysql-test/suite/innodb/t/purge_pessimistic.opt @@ -0,0 +1 @@ +--innodb_purge_threads=1 diff --git a/mysql-test/suite/innodb/t/purge_pessimistic.test b/mysql-test/suite/innodb/t/purge_pessimistic.test new file mode 100644 index 00000000000..88be65a6be5 --- /dev/null +++ b/mysql-test/suite/innodb/t/purge_pessimistic.test @@ -0,0 +1,51 @@ +--source include/have_innodb.inc +--source include/count_sessions.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/not_embedded.inc + +--echo # +--echo # MDEV-35508: Race condition between purge and secondary index INSERT or UPDATE +--echo # + +SET @old_debug_dbug = @@global.debug_dbug; + +CREATE TABLE t1(col1 INT PRIMARY KEY, col2 int, KEY k1(col2)) ENGINE=Innodb; +INSERT INTO t1 VALUES(1, 100); + +CREATE TABLE t2(col1 INT PRIMARY KEY) Engine=Innodb; +--source include/wait_all_purged.inc + +START TRANSACTION; +INSERT INTO t2 VALUES(10); + +SET DEBUG_SYNC='RESET'; + +SET GLOBAL debug_dbug= "+d,btr_force_pessimistic_delete"; +SET GLOBAL debug_dbug= "+d,enable_row_purge_sec_tree_sync"; + +--connect (con1,localhost,root) +UPDATE t1 SET col2 = 200 WHERE col1 = 1; + +--connection default +SET DEBUG_SYNC= 'now WAIT_FOR purge_sec_tree_begin'; +SET GLOBAL debug_dbug= "-d,enable_row_purge_sec_tree_sync"; + +UPDATE t1 SET col2 = 100 WHERE col1 = 1; +SET DEBUG_SYNC= 'now SIGNAL purge_sec_tree_execute'; + +COMMIT; +--source include/wait_all_purged.inc + +--disconnect con1 +--source include/wait_until_count_sessions.inc + +SELECT * FROM t1; +CHECK TABLE t1; +DROP TABLE t1; + +SELECT * FROM t2; +DROP TABLE t2; + +SET @@GLOBAL.debug_dbug = @old_debug_dbug; +SET DEBUG_SYNC='RESET'; diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 1d7c67d73cf..a5ffc825c20 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -4632,6 +4632,9 @@ btr_cur_optimistic_delete( ULINT_UNDEFINED, &heap); dberr_t err = DB_SUCCESS; + DBUG_EXECUTE_IF("btr_force_pessimistic_delete", + err = DB_FAIL; goto func_exit;); + if (rec_offs_any_extern(offsets) || !btr_cur_can_delete_without_compress(cursor, rec_offs_size(offsets), diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 72ba73007cf..f76bdea33a4 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -946,16 +946,58 @@ public: @return whether any transaction not newer than id might be active */ - bool find_same_or_older(trx_t *trx, trx_id_t id) + bool find_same_or_older_low(trx_t *trx, trx_id_t id) noexcept; + + /** + Determine if the specified transaction or any older one might be active. + + @param trx transaction whose max_inactive_id will be consulted + @param id identifier of another transaction + @return whether any transaction not newer than id might be active + */ + + bool find_same_or_older(trx_t *trx, trx_id_t id) noexcept { if (trx->max_inactive_id >= id) return false; - bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id); + const bool found{find_same_or_older_low(trx, id)}; if (!found) trx->max_inactive_id= id; return found; } + /** + Determine if the specified transaction or any older one might be active. + + @param trx purge_sys.query->trx (may be used by multiple threads) + @param id transaction identifier to check + @return whether any transaction not newer than id might be active + */ + + bool find_same_or_older_in_purge(trx_t *trx, trx_id_t id) noexcept + { +#if SIZEOF_SIZE_T < 8 && !defined __i386__ + /* On systems that lack native 64-bit loads and stores, + it should be more efficient to acquire a futex-backed mutex + earlier than to invoke a loop or a complex library function. + + Our IA-32 target is not "i386" but at least "i686", that is, at least + Pentium MMX, which has a 64-bit data bus and 64-bit XMM registers. */ + trx->mutex_lock(); + trx_id_t &max_inactive_id= trx->max_inactive_id; + const bool hot{max_inactive_id < id && find_same_or_older(trx, id)}; +#else + Atomic_relaxed &max_inactive_id= trx->max_inactive_id_atomic; + if (max_inactive_id >= id) + return false; + trx->mutex_lock(); + const bool hot{find_same_or_older(trx, id)}; +#endif + if (hot) + max_inactive_id= id; + trx->mutex_unlock(); + return hot; + } /** Determines the maximum transaction id. @@ -1193,12 +1235,7 @@ public: inline void undo_truncate_start(fil_space_t &space); private: - static my_bool find_same_or_older_callback(void *el, void *i) - { - auto element= static_cast(el); - auto id= static_cast(i); - return element->id <= *id; - } + static my_bool find_same_or_older_callback(void *el, void *i) noexcept; struct snapshot_ids_arg diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index afa44d01485..79d5b7be5da 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -598,10 +598,25 @@ public: Cleared in commit_in_memory() after commit_state(), trx_sys_t::deregister_rw(), release_locks(). */ trx_id_t id; - /** The largest encountered transaction identifier for which no - transaction was observed to be active. This is a cache to speed up - trx_sys_t::find_same_or_older(). */ - trx_id_t max_inactive_id; + union + { + /** The largest encountered transaction identifier for which no + transaction was observed to be active. This is a cache to speed up + trx_sys_t::find_same_or_older(). + + This will be zero-initialized in Pool::Pool() and not initialized + when a transaction object in the pool is freed and reused. The + idea is that new transactions can reuse the result of + an expensive trx_sys_t::find_same_or_older_low() invocation that + was performed in an earlier transaction that used the same + memory area. */ + trx_id_t max_inactive_id; + /** Same as max_inactive_id, for purge_sys.query->trx which may be + accessed by multiple concurrent threads in in + trx_sys_t::find_same_or_older_in_purge(). Writes are protected by + trx_t::mutex. */ + Atomic_relaxed max_inactive_id_atomic; + }; private: /** mutex protecting state and some of lock diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h index 63628cc169f..410f7875d72 100644 --- a/storage/innobase/include/ut0pool.h +++ b/storage/innobase/include/ut0pool.h @@ -68,6 +68,15 @@ struct Pool { aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE)); memset_aligned( m_start, 0, m_size); + /* Note: The above would zero-initialize some + std::atomic data members in trx_t, such as + trx_t::lock, which will not be initialized further in + TrxFactory::init(). It may be implementation defined + whether such zero initialization works. On some + hypothetical platform (not one that seems to be + supported by a mainstream C++ compiler), std::atomic + might wrap the data member as well as a + non-zero-initialized mutex. */ m_last = m_start; diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index 0d9614222f7..44e79d17e97 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -746,6 +746,22 @@ bool row_purge_poss_sec(purge_node_t *node, dict_index_t *index, return can_delete; } +/** Report an error about not delete-marked secondary index record +that was about to be purged. +@param cur cursor on the secondary index record +@param entry search key */ +ATTRIBUTE_COLD ATTRIBUTE_NOINLINE +static void row_purge_del_mark_error(const btr_cur_t &cursor, + const dtuple_t &entry) +{ + const dict_index_t *index= cursor.index(); + ib::error() << "tried to purge non-delete-marked record in index " + << index->name << " of table " << index->table->name + << ": tuple: " << entry + << ", record: " << rec_index_print(cursor.page_cur.rec, index); + ut_ad(0); +} + __attribute__((nonnull, warn_unused_result)) /** Remove a secondary index entry if possible, by modifying the index tree. @param node purge node @@ -765,6 +781,16 @@ static bool row_purge_remove_sec_if_poss_tree(purge_node_t *node, mtr_t mtr; log_free_check(); +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("enable_row_purge_sec_tree_sync", + debug_sync_set_action(current_thd, STRING_WITH_LEN( + "now SIGNAL " + "purge_sec_tree_begin")); + debug_sync_set_action(current_thd, STRING_WITH_LEN( + "now WAIT_FOR " + "purge_sec_tree_execute")); + ); +#endif mtr.start(); index->set_modified(mtr); pcur.btr_cur.page_cur.index = index; @@ -812,26 +838,13 @@ found: /* Remove the index record, which should have been marked for deletion. */ - if (!rec_get_deleted_flag(btr_cur_get_rec( - btr_pcur_get_btr_cur(&pcur)), - dict_table_is_comp(index->table))) { - ib::error() - << "tried to purge non-delete-marked record" - " in index " << index->name - << " of table " << index->table->name - << ": tuple: " << *entry - << ", record: " << rec_index_print( - btr_cur_get_rec( - btr_pcur_get_btr_cur(&pcur)), - index); - - ut_ad(0); - + if (!rec_get_deleted_flag(btr_pcur_get_rec(&pcur), + index->table->not_redundant())) { + row_purge_del_mark_error(pcur.btr_cur, *entry); goto func_exit; } - btr_cur_pessimistic_delete(&err, FALSE, - btr_pcur_get_btr_cur(&pcur), + btr_cur_pessimistic_delete(&err, FALSE, &pcur.btr_cur, 0, false, &mtr); switch (UNIV_EXPECT(err, DB_SUCCESS)) { case DB_SUCCESS: @@ -850,12 +863,34 @@ func_exit: return success; } +/** Compute a nonzero return value of row_purge_remove_sec_if_poss_leaf(). +@param page latched secondary index page +@return PAGE_MAX_TRX_ID for row_purge_remove_sec_if_poss_tree() +@retval 1 if a further row_purge_poss_sec() check is necessary */ +ATTRIBUTE_NOINLINE ATTRIBUTE_COLD +static trx_id_t row_purge_check(const page_t *page) noexcept +{ + trx_id_t id= page_get_max_trx_id(page); + ut_ad(id); + if (trx_sys.find_same_or_older_in_purge(purge_sys.query->trx, id)) + /* Because an active transaction may modify the secondary index + but not PAGE_MAX_TRX_ID, row_purge_poss_sec() must be invoked + again after re-latching the page. Let us return a bogus ID. Yes, + an actual transaction with ID 1 would create the InnoDB dictionary + tables in dict_sys_t::create_or_check_sys_tables(), but it would + exclusively write TRX_UNDO_INSERT_REC records. Purging those + records never involves row_purge_remove_sec_if_poss_tree(). */ + id= 1; + return id; +} + __attribute__((nonnull, warn_unused_result)) /** Remove a secondary index entry if possible, without modifying the tree. @param node purge node @param index secondary index @param entry index entry @return PAGE_MAX_TRX_ID for row_purge_remove_sec_if_poss_tree() +@retval 1 if a further row_purge_poss_sec() check is necessary @retval 0 if success or if not found */ static trx_id_t row_purge_remove_sec_if_poss_leaf(purge_node_t *node, dict_index_t *index, @@ -895,39 +930,28 @@ found: /* Before attempting to purge a record, check if it is safe to do so. */ if (row_purge_poss_sec(node, index, entry, &mtr)) { - btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); - /* Only delete-marked records should be purged. */ - if (!rec_get_deleted_flag( - btr_cur_get_rec(btr_cur), - dict_table_is_comp(index->table))) { - - ib::error() - << "tried to purge non-delete-marked" - " record" " in index " << index->name - << " of table " << index->table->name - << ": tuple: " << *entry - << ", record: " - << rec_index_print( - btr_cur_get_rec(btr_cur), - index); + if (!rec_get_deleted_flag(btr_pcur_get_rec(&pcur), + index->table + ->not_redundant())) { + row_purge_del_mark_error(pcur.btr_cur, *entry); mtr.commit(); dict_set_corrupted(index, "purge"); goto cleanup; } if (index->is_spatial()) { - const buf_block_t* block = btr_cur_get_block( - btr_cur); + const buf_block_t* block = btr_pcur_get_block( + &pcur); if (block->page.id().page_no() != index->page && page_get_n_recs(block->page.frame) < 2 && !lock_test_prdt_page_lock( - btr_cur->rtr_info - && btr_cur->rtr_info->thr + pcur.btr_cur.rtr_info + && pcur.btr_cur.rtr_info->thr ? thr_get_trx( - btr_cur->rtr_info->thr) + pcur.btr_cur.rtr_info->thr) : nullptr, block->page.id())) { /* this is the last record on page, @@ -942,11 +966,11 @@ found: } } - if (btr_cur_optimistic_delete(btr_cur, 0, &mtr) - == DB_FAIL) { - page_max_trx_id = page_get_max_trx_id( - btr_cur_get_page(btr_cur)); - } + if (btr_cur_optimistic_delete(&pcur.btr_cur, 0, &mtr) + == DB_FAIL) { + page_max_trx_id = row_purge_check( + btr_pcur_get_page(&pcur)); + } } /* (The index entry is still needed, diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 76f3ba79baf..614e34e5a8a 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -212,6 +212,19 @@ TPOOL_SUPPRESS_TSAN size_t trx_sys_t::history_size_approx() const return size; } +my_bool trx_sys_t::find_same_or_older_callback(void *el, void *i) noexcept +{ + auto element= static_cast(el); + auto id= static_cast(i); + return element->id <= *id; +} + + +bool trx_sys_t::find_same_or_older_low(trx_t *trx, trx_id_t id) noexcept +{ + return rw_trx_hash.iterate(trx, find_same_or_older_callback, &id); +} + /** Create a persistent rollback segment. @param space_id system or undo tablespace id @return pointer to new rollback segment