From 77bebe9eb08035cf2ea8796c80b66cfa85fcf5d5 Mon Sep 17 00:00:00 2001 From: mariadb-DebarunBanerjee Date: Mon, 10 Mar 2025 12:39:56 +0530 Subject: [PATCH] MDEV-36226 Stall and crash when page cleaner fails to generate free pages during Async flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During regular iteration the page cleaner does flush from flush list with some flush target and then goes for generating free pages from LRU tail. When asynchronous flush is triggered i.e. when 7/8 th of the LSN margin is filled in the redo log, the flush target for flush list is set to innodb_io_capacity_max. If it could flush all, the flush bandwidth for LRU flush is currently set to zero. If the LRU tail has dirty pages, page cleaner ends up freeing no pages in one iteration. The scenario could repeat across multiple iterations till async flush target is reached. During this time the DB system is starved of free pages resulting in apparent stall and in some cases dict_sys latch fatal error. Fix: In page cleaner iteration, before LRU flush, ensure we provide enough flush limit so that freeing pages is no blocked by dirty pages in LRU tail. Log IO and flush state if double write flush wait is long. Reviewed by: Marko Mäkelä --- mysql-test/mariadb-test-run.pl | 1 + storage/innobase/buf/buf0dblwr.cc | 53 +++++++++++++++++-- storage/innobase/buf/buf0flu.cc | 78 ++++++++++++++++++++++++++-- storage/innobase/dict/dict0dict.cc | 4 ++ storage/innobase/include/buf0buf.h | 3 ++ storage/innobase/include/buf0dblwr.h | 3 ++ storage/innobase/include/os0file.h | 2 + storage/innobase/os/os0file.cc | 6 +++ storage/innobase/srv/srv0srv.cc | 1 + 9 files changed, 143 insertions(+), 8 deletions(-) diff --git a/mysql-test/mariadb-test-run.pl b/mysql-test/mariadb-test-run.pl index a47be67a050..3694ed31c01 100755 --- a/mysql-test/mariadb-test-run.pl +++ b/mysql-test/mariadb-test-run.pl @@ -4464,6 +4464,7 @@ sub extract_warning_lines ($$) { qr/InnoDB: innodb_open_files .* should not be greater than/, qr/InnoDB: Trying to delete tablespace.*but there are.*pending/, qr/InnoDB: Tablespace 1[0-9]* was not found at .*, and innodb_force_recovery was set/, + qr/InnoDB: Long wait \([0-9]+ seconds\) for double-write buffer flush/, qr/Slave: Unknown table 't1' .* 1051/, qr/Slave SQL:.*(Internal MariaDB error code: [[:digit:]]+|Query:.*)/, qr/slave SQL thread aborted/, diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index b04460a020f..8cc44eb7972 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -603,20 +603,67 @@ static void buf_dblwr_check_block(const buf_page_t *bpage) noexcept } #endif /* UNIV_DEBUG */ +ATTRIBUTE_COLD void buf_dblwr_t::print_info() const noexcept +{ + mysql_mutex_assert_owner(&mutex); + const slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + + sql_print_information("InnoDB: Double Write State\n" + "-------------------\n" + "Batch running : %s\n" + "Active Slot - first_free: %zu reserved: %zu\n" + "Flush Slot - first_free: %zu reserved: %zu\n" + "-------------------", + (batch_running ? "true" : "false"), + active_slot->first_free, active_slot->reserved, + flush_slot->first_free, flush_slot->reserved); +} + bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept { mysql_mutex_assert_owner(&mutex); ut_ad(size == block_size()); - for (;;) + const size_t max_count= 60 * 60; + const size_t first_log_count= 30; + const size_t fatal_threshold= + static_cast(srv_fatal_semaphore_wait_threshold); + size_t log_count= first_log_count; + + for (ulong count= 0;;) { if (!active_slot->first_free) return false; if (!batch_running) break; - my_cond_wait(&cond, &mutex.m_mutex); - } + timespec abstime; + set_timespec(abstime, 1); + my_cond_timedwait(&cond, &mutex.m_mutex, &abstime); + + if (count > fatal_threshold) + { + buf_pool.print_flush_info(); + print_info(); + ib::fatal() << "InnoDB: Long wait (" << count + << " seconds) for double-write buffer flush."; + } + else if (++count < first_log_count && !(count % 5)) + { + sql_print_information("InnoDB: Long wait (%zu seconds) for double-write" + " buffer flush.", count); + buf_pool.print_flush_info(); + print_info(); + } + else if (!(count % log_count)) + { + sql_print_warning("InnoDB: Long wait (%zu seconds) for double-write" + " buffer flush.", count); + buf_pool.print_flush_info(); + print_info(); + log_count= log_count >= max_count ? max_count : log_count * 2; + } + } ut_ad(active_slot->reserved == active_slot->first_free); ut_ad(!flushing_buffered_writes); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index edeb210f6f7..71ee98b18be 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1374,7 +1374,10 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) noexcept break; } - if (neighbors && space->is_rotational()) + if (neighbors && space->is_rotational() && + /* Skip neighbourhood flush from LRU list if we haven't yet reached + half of the free page target. */ + UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit) n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, neighbors == 1, n->flushed, max); @@ -1763,8 +1766,16 @@ static ulint buf_flush_LRU(ulint max_n) noexcept buf_do_LRU_batch(max_n, &n); ulint pages= n.flushed; + ulint evicted= n.evicted; - if (n.evicted) + /* If we have exhausted flush quota, it is likely we exited before + generating enough free pages. Call once more with 0 flush to generate + free pages immediately as required. */ + if (pages >= max_n) + buf_do_LRU_batch(0, &n); + + evicted+= n.evicted; + if (evicted) { buf_pool.try_LRU_scan= true; pthread_cond_broadcast(&buf_pool.done_free); @@ -2339,6 +2350,11 @@ static void buf_flush_page_cleaner() noexcept DBUG_EXECUTE_IF("ib_page_cleaner_sleep", { std::this_thread::sleep_for(std::chrono::seconds(1)); + /* Cover the logging code in debug mode. */ + buf_pool.print_flush_info(); + buf_dblwr.lock(); + buf_dblwr.print_info(); + buf_dblwr.unlock(); }); lsn_limit= buf_flush_sync_lsn; @@ -2543,6 +2559,10 @@ static void buf_flush_page_cleaner() noexcept n= srv_max_io_capacity; n= n >= n_flushed ? n - n_flushed : 0; + /* It is critical to generate free pages to keep the system alive. Make + sure we are not hindered by dirty pages in LRU tail. */ + n= std::max(n, std::min(srv_max_io_capacity, + buf_pool.LRU_scan_depth)); goto LRU_flush; } @@ -2581,10 +2601,11 @@ ATTRIBUTE_COLD void buf_pool_t::LRU_warn() noexcept mysql_mutex_assert_owner(&mutex); try_LRU_scan= false; if (!LRU_warned.test_and_set(std::memory_order_acquire)) + { sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!" - " %zu blocks are in use and %zu free." - " Consider increasing innodb_buffer_pool_size.", - UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free)); + " Consider increasing innodb_buffer_pool_size."); + buf_pool.print_flush_info(); + } } /** Initialize page_cleaner. */ @@ -2664,6 +2685,53 @@ void buf_flush_sync() noexcept thd_wait_end(nullptr); } +ATTRIBUTE_COLD void buf_pool_t::print_flush_info() const noexcept +{ + /* We do dirty read of UT_LIST count variable. */ + size_t lru_size= UT_LIST_GET_LEN(LRU); + size_t dirty_size= UT_LIST_GET_LEN(flush_list); + size_t free_size= UT_LIST_GET_LEN(free); + size_t dirty_pct= lru_size ? dirty_size * 100 / (lru_size + free_size) : 0; + sql_print_information("InnoDB: Buffer Pool pages\n" + "-------------------\n" + "LRU Pages : %zu\n" + "Free Pages : %zu\n" + "Dirty Pages: %zu : %zu%%\n" + "-------------------", + lru_size, free_size, dirty_size, dirty_pct); + + lsn_t lsn= log_sys.get_lsn(); + lsn_t clsn= log_sys.last_checkpoint_lsn; + sql_print_information("InnoDB: LSN flush parameters\n" + "-------------------\n" + "System LSN : %" PRIu64 "\n" + "Checkpoint LSN: %" PRIu64 "\n" + "Flush ASync LSN: %" PRIu64 "\n" + "Flush Sync LSN: %" PRIu64 "\n" + "-------------------", + lsn, clsn, buf_flush_async_lsn.load(), buf_flush_sync_lsn.load()); + + lsn_t age= lsn - clsn; + lsn_t age_pct= log_sys.max_checkpoint_age + ? age * 100 / log_sys.max_checkpoint_age : 0; + sql_print_information("InnoDB: LSN age parameters\n" + "-------------------\n" + "Current Age : %" PRIu64 " : %" PRIu64 "%%\n" + "Max Age(Async): %" PRIu64 "\n" + "Max Age(Sync) : %" PRIu64 "\n" + "Capacity : %" PRIu64 "\n" + "-------------------", + age, age_pct, log_sys.max_modified_age_async, log_sys.max_checkpoint_age, + log_sys.log_capacity); + + sql_print_information("InnoDB: Pending IO count\n" + "-------------------\n" + "Pending Read : %zu\n" + "Pending Write: %zu\n" + "-------------------", + os_aio_pending_reads_approx(), os_aio_pending_writes_approx()); +} + #ifdef UNIV_DEBUG /** Functor to validate the flush list. */ struct Check { diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index a08b8812e6d..4f293c41854 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -44,6 +44,7 @@ Created 1/8/1996 Heikki Tuuri #include "btr0cur.h" #include "btr0sea.h" #include "buf0buf.h" +#include "buf0flu.h" #include "data0type.h" #include "dict0boot.h" #include "dict0load.h" @@ -1024,7 +1025,10 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line)) noexc const ulong threshold= srv_fatal_semaphore_wait_threshold; if (waited >= threshold) + { + buf_pool.print_flush_info(); ib::fatal() << fatal_msg; + } if (waited > threshold / 4) ib::warn() << "A long wait (" << waited diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index eed41113659..605ce772b37 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1951,6 +1951,9 @@ public: /** Issue a warning that we could not free up buffer pool pages. */ ATTRIBUTE_COLD void LRU_warn() noexcept; + /** Print buffer pool flush state information. */ + ATTRIBUTE_COLD void print_flush_info() const noexcept; + private: /** Temporary memory for page_compressed and encrypted I/O */ struct io_buf_t diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index 2737622a4d4..cef5d91214a 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -159,6 +159,9 @@ public: my_cond_wait(&cond, &mutex.m_mutex); mysql_mutex_unlock(&mutex); } + + /** Print double write state information. */ + ATTRIBUTE_COLD void print_info() const noexcept; }; /** The doublewrite buffer */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 837d22f2d3b..ab25ba28bd4 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1021,6 +1021,8 @@ size_t os_aio_pending_reads() noexcept; size_t os_aio_pending_reads_approx() noexcept; /** @return number of pending writes */ size_t os_aio_pending_writes() noexcept; +/** @return approximate number of pending writes */ +size_t os_aio_pending_writes_approx() noexcept; /** Wait until there are no pending asynchronous writes. @param declare whether the wait will be declared in tpool */ diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index abfc0d939eb..416a3f8f130 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -3481,6 +3481,12 @@ size_t os_aio_pending_writes() noexcept return pending; } +/** @return approximate number of pending writes */ +size_t os_aio_pending_writes_approx() noexcept +{ + return write_slots->pending_io_count(); +} + /** Wait until all pending asynchronous reads have completed. @param declare whether the wait will be declared in tpool */ void os_aio_wait_until_no_pending_reads(bool declare) noexcept diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index cc35a659a71..a6a588ab729 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1220,6 +1220,7 @@ void srv_monitor_task(void*) now -= start; ulong waited = static_cast(now / 1000000); if (waited >= threshold) { + buf_pool.print_flush_info(); ib::fatal() << dict_sys.fatal_msg; }