From b12c14e3b42c44ff3a48e82679c2fab8148b372f Mon Sep 17 00:00:00 2001 From: mariadb-DebarunBanerjee Date: Thu, 30 May 2024 17:14:01 +0530 Subject: [PATCH] MDEV-34265 Possible hang during IO burst with innodb_flush_sync enabled When checkpoint age goes beyond the sync flush threshold and buf_flush_sync_lsn is set, page cleaner enters into "furious flush" stage to aggressively flush dirty pages from flush list and pull checkpoint LSN above safe margin. In this stage, page cleaner skips doing LRU flush and eviction. In 10.6, all other threads entirely rely on page cleaner to generate free pages. If free pages get over while page cleaner is busy in "furious flush" stage, a session thread could wait for free page in the middle of a min-transaction(mtr) while holding latches on other pages. It, in turn, can prevent page cleaner to flush such pages preventing checkpoint LSN to move forward creating a deadlock situation. Even otherwise, it could create a stall and hang like situation for large BP with plenty of dirty pages to flush before the stage could finish. Fix: During furious flush, check and evict LRU pages after each flush iteration. --- .../suite/innodb/r/insert_into_empty.result | 13 ++ .../suite/innodb/t/insert_into_empty.test | 30 +++- storage/innobase/buf/buf0flu.cc | 136 +++++++++++------- storage/innobase/buf/buf0lru.cc | 15 ++ 4 files changed, 138 insertions(+), 56 deletions(-) diff --git a/mysql-test/suite/innodb/r/insert_into_empty.result b/mysql-test/suite/innodb/r/insert_into_empty.result index 2fdea1bce38..42385a833ce 100644 --- a/mysql-test/suite/innodb/r/insert_into_empty.result +++ b/mysql-test/suite/innodb/r/insert_into_empty.result @@ -268,4 +268,17 @@ PARTITION BY KEY(a) PARTITIONS 16; INSERT INTO t1 VALUES(1); UPDATE t1 SET a = 2 WHERE a = 1; DROP TABLE t1; +# +# MDEV-34265 Possible hang during IO burst with innodb_flush_sync enabled +# +CREATE TABLE t1(f1 MEDIUMTEXT)ENGINE=InnoDB; +SET @save_dbug=@@GLOBAL.debug_dbug; +SET @@GLOBAL.debug_dbug='+d,ib_page_cleaner_sleep'; +SET STATEMENT debug_dbug='+d,ib_free_page_sleep' FOR +INSERT INTO t1 VALUES(REPEAT(1, 8459264)); +SET @@GLOBAL.debug_dbug=@save_dbug; +SELECT length(f1) FROM t1; +length(f1) +8459264 +DROP TABLE t1; # End of 10.6 tests diff --git a/mysql-test/suite/innodb/t/insert_into_empty.test b/mysql-test/suite/innodb/t/insert_into_empty.test index aa9ef08b30d..d7f884ecc4d 100644 --- a/mysql-test/suite/innodb/t/insert_into_empty.test +++ b/mysql-test/suite/innodb/t/insert_into_empty.test @@ -163,9 +163,9 @@ DROP TABLE t1,t2; CREATE TABLE t1 (a INT KEY) ENGINE=InnoDB; ---error 0,1193 +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE SET @save_limit = @@GLOBAL.innodb_limit_optimistic_insert_debug; ---error 0,1193 +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE SET GLOBAL innodb_limit_optimistic_insert_debug = 2; BEGIN; @@ -289,4 +289,30 @@ CREATE TABLE t1(a INT KEY)ENGINE=InnoDB INSERT INTO t1 VALUES(1); UPDATE t1 SET a = 2 WHERE a = 1; DROP TABLE t1; + +--echo # +--echo # MDEV-34265 Possible hang during IO burst with innodb_flush_sync enabled +--echo # +CREATE TABLE t1(f1 MEDIUMTEXT)ENGINE=InnoDB; + +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @save_dbug=@@GLOBAL.debug_dbug; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @@GLOBAL.debug_dbug='+d,ib_page_cleaner_sleep'; + +if ($have_debug) { + SET STATEMENT debug_dbug='+d,ib_free_page_sleep' FOR + INSERT INTO t1 VALUES(REPEAT(1, 8459264)); +} +if (!$have_debug) { + --echo SET STATEMENT debug_dbug='+d,ib_free_page_sleep' FOR + INSERT INTO t1 VALUES(REPEAT(1, 8459264)); +} + +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @@GLOBAL.debug_dbug=@save_dbug; + +SELECT length(f1) FROM t1; +DROP TABLE t1; + --echo # End of 10.6 tests diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index fd1f2643324..46129a037b0 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -2025,70 +2025,94 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) /** Conduct checkpoint-related flushing for innodb_flush_sync=ON, and try to initiate checkpoints until the target is met. @param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */ -ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) +ATTRIBUTE_COLD ATTRIBUTE_NOINLINE +static void buf_flush_sync_for_checkpoint(lsn_t lsn) { ut_ad(!srv_read_only_mode); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - for (;;) + /* During furious flush, we need to keep generating free pages. Otherwise + concurrent mtrs could be blocked holding latches for the pages to be flushed + causing deadlock in rare occasion. + + Ideally we should be acquiring buffer pool mutex for the check but it is more + expensive and we are not using the mutex while calling need_LRU_eviction() as + of today. It is a quick and dirty read of the LRU and free list length. + Atomic read of try_LRU_scan should eventually let us do the eviction. + Correcting the inaccuracy would need more consideration to avoid any possible + performance regression. */ + if (buf_pool.need_LRU_eviction()) { - if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn)) - { - MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, - MONITOR_FLUSH_SYNC_COUNT, - MONITOR_FLUSH_SYNC_PAGES, n_flushed); - } - - switch (srv_file_flush_method) { - case SRV_NOSYNC: - case SRV_O_DIRECT_NO_FSYNC: - break; - default: - fil_flush_file_spaces(); - } - - mysql_mutex_lock(&log_sys.mutex); - const lsn_t newest_lsn= log_sys.get_lsn(); - mysql_mutex_lock(&log_sys.flush_order_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex); - lsn_t measure= buf_pool.get_oldest_modification(0); - mysql_mutex_unlock(&log_sys.flush_order_mutex); - const lsn_t checkpoint_lsn= measure ? measure : newest_lsn; - - if (!recv_recovery_is_on() && - checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) - { - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - log_checkpoint_low(checkpoint_lsn, newest_lsn); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - measure= buf_pool.get_oldest_modification(LSN_MAX); - } - else - { - mysql_mutex_unlock(&log_sys.mutex); - if (!measure) - measure= LSN_MAX; - } - - mysql_mutex_assert_not_owner(&log_sys.mutex); - - /* After attempting log checkpoint, check if we have reached our target. */ - const lsn_t target= buf_flush_sync_lsn; - - if (measure >= target) - buf_flush_sync_lsn= 0; - else if (measure >= buf_flush_async_lsn) - buf_flush_async_lsn= 0; - - /* wake up buf_flush_wait() */ - pthread_cond_broadcast(&buf_pool.done_flush_list); + buf_pool.page_cleaner_set_idle(false); + buf_pool.n_flush_inc(); mysql_mutex_unlock(&buf_pool.flush_list_mutex); - lsn= std::max(lsn, target); + mysql_mutex_lock(&buf_pool.mutex); + /* Confirm that eviction is needed after acquiring buffer pool mutex. */ + if (buf_pool.need_LRU_eviction()) + /* We intend to only evict pages keeping maximum flush bandwidth for + flush list pages advancing checkpoint. However, if the LRU tail is full + of dirty pages, we might need some flushing. */ + std::ignore= buf_flush_LRU(srv_io_capacity); + mysql_mutex_unlock(&buf_pool.mutex); - if (measure >= lsn) - return; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_dec(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); } + + if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn)) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_flushed); + } + + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } + + mysql_mutex_lock(&log_sys.mutex); + const lsn_t newest_lsn= log_sys.get_lsn(); + mysql_mutex_lock(&log_sys.flush_order_mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t measure= buf_pool.get_oldest_modification(0); + mysql_mutex_unlock(&log_sys.flush_order_mutex); + const lsn_t checkpoint_lsn= measure ? measure : newest_lsn; + + if (!recv_recovery_is_on() && + checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_checkpoint_low(checkpoint_lsn, newest_lsn); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + measure= buf_pool.get_oldest_modification(LSN_MAX); + } + else + { + mysql_mutex_unlock(&log_sys.mutex); + if (!measure) + measure= LSN_MAX; + } + + mysql_mutex_assert_not_owner(&log_sys.mutex); + + /* After attempting log checkpoint, check if we have reached our target. */ + const lsn_t target= buf_flush_sync_lsn; + + if (measure >= target) + buf_flush_sync_lsn= 0; + else if (measure >= buf_flush_async_lsn) + buf_flush_async_lsn= 0; + + /* wake up buf_flush_wait() */ + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); } /** Check if the adpative flushing threshold is recommended based on @@ -2298,6 +2322,10 @@ static void buf_flush_page_cleaner() for (;;) { + DBUG_EXECUTE_IF("ib_page_cleaner_sleep", + { + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); lsn_limit= buf_flush_sync_lsn; if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync)) diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 737df0a6913..87a071d1eda 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -406,17 +406,32 @@ got_block: ut_ad(LRU_size <= BUF_LRU_MIN_LEN || available >= scan_depth || buf_pool.need_LRU_eviction()); + ut_d(bool signalled = false); + if (UNIV_UNLIKELY(available < scan_depth) && LRU_size > BUF_LRU_MIN_LEN) { mysql_mutex_lock(&buf_pool.flush_list_mutex); if (!buf_pool.page_cleaner_active()) + { buf_pool.page_cleaner_wakeup(true); + ut_d(signalled = true); + } mysql_mutex_unlock(&buf_pool.flush_list_mutex); } if (!have_mutex) mysql_mutex_unlock(&buf_pool.mutex); + DBUG_EXECUTE_IF("ib_free_page_sleep", + { + static bool do_sleep = true; + if (do_sleep && signalled) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + do_sleep = false; + } + }); + block->page.zip.clear(); return block; }