1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

MDEV-36226 Stall and crash when page cleaner fails to generate free pages during Async flush

During regular iteration the page cleaner does flush from flush list
with some flush target and then goes for generating free pages from LRU
tail. When asynchronous flush is triggered i.e. when 7/8 th of the LSN
margin is filled in the redo log, the flush target for flush list is
set to innodb_io_capacity_max. If it could flush all, the flush
bandwidth for LRU flush is currently set to zero. If the LRU tail has
dirty pages, page cleaner ends up freeing no pages in one iteration.
The scenario could repeat across multiple iterations till async flush
target is reached. During this time the DB system is starved of free
pages resulting in apparent stall and in some cases dict_sys latch
fatal error.

Fix: In page cleaner iteration, before LRU flush, ensure we provide
enough flush limit so that freeing pages is no blocked by dirty pages
in LRU tail. Log IO and flush state if double write flush wait is long.

Reviewed by: Marko Mäkelä
This commit is contained in:
mariadb-DebarunBanerjee
2025-03-10 12:39:56 +05:30
parent 51cc77f212
commit 77bebe9eb0
9 changed files with 143 additions and 8 deletions

View File

@@ -4464,6 +4464,7 @@ sub extract_warning_lines ($$) {
qr/InnoDB: innodb_open_files .* should not be greater than/,
qr/InnoDB: Trying to delete tablespace.*but there are.*pending/,
qr/InnoDB: Tablespace 1[0-9]* was not found at .*, and innodb_force_recovery was set/,
qr/InnoDB: Long wait \([0-9]+ seconds\) for double-write buffer flush/,
qr/Slave: Unknown table 't1' .* 1051/,
qr/Slave SQL:.*(Internal MariaDB error code: [[:digit:]]+|Query:.*)/,
qr/slave SQL thread aborted/,

View File

@@ -603,20 +603,67 @@ static void buf_dblwr_check_block(const buf_page_t *bpage) noexcept
}
#endif /* UNIV_DEBUG */
ATTRIBUTE_COLD void buf_dblwr_t::print_info() const noexcept
{
mysql_mutex_assert_owner(&mutex);
const slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
sql_print_information("InnoDB: Double Write State\n"
"-------------------\n"
"Batch running : %s\n"
"Active Slot - first_free: %zu reserved: %zu\n"
"Flush Slot - first_free: %zu reserved: %zu\n"
"-------------------",
(batch_running ? "true" : "false"),
active_slot->first_free, active_slot->reserved,
flush_slot->first_free, flush_slot->reserved);
}
bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept
{
mysql_mutex_assert_owner(&mutex);
ut_ad(size == block_size());
for (;;)
const size_t max_count= 60 * 60;
const size_t first_log_count= 30;
const size_t fatal_threshold=
static_cast<size_t>(srv_fatal_semaphore_wait_threshold);
size_t log_count= first_log_count;
for (ulong count= 0;;)
{
if (!active_slot->first_free)
return false;
if (!batch_running)
break;
my_cond_wait(&cond, &mutex.m_mutex);
}
timespec abstime;
set_timespec(abstime, 1);
my_cond_timedwait(&cond, &mutex.m_mutex, &abstime);
if (count > fatal_threshold)
{
buf_pool.print_flush_info();
print_info();
ib::fatal() << "InnoDB: Long wait (" << count
<< " seconds) for double-write buffer flush.";
}
else if (++count < first_log_count && !(count % 5))
{
sql_print_information("InnoDB: Long wait (%zu seconds) for double-write"
" buffer flush.", count);
buf_pool.print_flush_info();
print_info();
}
else if (!(count % log_count))
{
sql_print_warning("InnoDB: Long wait (%zu seconds) for double-write"
" buffer flush.", count);
buf_pool.print_flush_info();
print_info();
log_count= log_count >= max_count ? max_count : log_count * 2;
}
}
ut_ad(active_slot->reserved == active_slot->first_free);
ut_ad(!flushing_buffered_writes);

View File

@@ -1374,7 +1374,10 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) noexcept
break;
}
if (neighbors && space->is_rotational())
if (neighbors && space->is_rotational() &&
/* Skip neighbourhood flush from LRU list if we haven't yet reached
half of the free page target. */
UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit)
n->flushed+= buf_flush_try_neighbors(space, page_id, bpage,
neighbors == 1,
n->flushed, max);
@@ -1763,8 +1766,16 @@ static ulint buf_flush_LRU(ulint max_n) noexcept
buf_do_LRU_batch(max_n, &n);
ulint pages= n.flushed;
ulint evicted= n.evicted;
if (n.evicted)
/* If we have exhausted flush quota, it is likely we exited before
generating enough free pages. Call once more with 0 flush to generate
free pages immediately as required. */
if (pages >= max_n)
buf_do_LRU_batch(0, &n);
evicted+= n.evicted;
if (evicted)
{
buf_pool.try_LRU_scan= true;
pthread_cond_broadcast(&buf_pool.done_free);
@@ -2339,6 +2350,11 @@ static void buf_flush_page_cleaner() noexcept
DBUG_EXECUTE_IF("ib_page_cleaner_sleep",
{
std::this_thread::sleep_for(std::chrono::seconds(1));
/* Cover the logging code in debug mode. */
buf_pool.print_flush_info();
buf_dblwr.lock();
buf_dblwr.print_info();
buf_dblwr.unlock();
});
lsn_limit= buf_flush_sync_lsn;
@@ -2543,6 +2559,10 @@ static void buf_flush_page_cleaner() noexcept
n= srv_max_io_capacity;
n= n >= n_flushed ? n - n_flushed : 0;
/* It is critical to generate free pages to keep the system alive. Make
sure we are not hindered by dirty pages in LRU tail. */
n= std::max<ulint>(n, std::min<ulint>(srv_max_io_capacity,
buf_pool.LRU_scan_depth));
goto LRU_flush;
}
@@ -2581,10 +2601,11 @@ ATTRIBUTE_COLD void buf_pool_t::LRU_warn() noexcept
mysql_mutex_assert_owner(&mutex);
try_LRU_scan= false;
if (!LRU_warned.test_and_set(std::memory_order_acquire))
{
sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!"
" %zu blocks are in use and %zu free."
" Consider increasing innodb_buffer_pool_size.",
UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free));
" Consider increasing innodb_buffer_pool_size.");
buf_pool.print_flush_info();
}
}
/** Initialize page_cleaner. */
@@ -2664,6 +2685,53 @@ void buf_flush_sync() noexcept
thd_wait_end(nullptr);
}
ATTRIBUTE_COLD void buf_pool_t::print_flush_info() const noexcept
{
/* We do dirty read of UT_LIST count variable. */
size_t lru_size= UT_LIST_GET_LEN(LRU);
size_t dirty_size= UT_LIST_GET_LEN(flush_list);
size_t free_size= UT_LIST_GET_LEN(free);
size_t dirty_pct= lru_size ? dirty_size * 100 / (lru_size + free_size) : 0;
sql_print_information("InnoDB: Buffer Pool pages\n"
"-------------------\n"
"LRU Pages : %zu\n"
"Free Pages : %zu\n"
"Dirty Pages: %zu : %zu%%\n"
"-------------------",
lru_size, free_size, dirty_size, dirty_pct);
lsn_t lsn= log_sys.get_lsn();
lsn_t clsn= log_sys.last_checkpoint_lsn;
sql_print_information("InnoDB: LSN flush parameters\n"
"-------------------\n"
"System LSN : %" PRIu64 "\n"
"Checkpoint LSN: %" PRIu64 "\n"
"Flush ASync LSN: %" PRIu64 "\n"
"Flush Sync LSN: %" PRIu64 "\n"
"-------------------",
lsn, clsn, buf_flush_async_lsn.load(), buf_flush_sync_lsn.load());
lsn_t age= lsn - clsn;
lsn_t age_pct= log_sys.max_checkpoint_age
? age * 100 / log_sys.max_checkpoint_age : 0;
sql_print_information("InnoDB: LSN age parameters\n"
"-------------------\n"
"Current Age : %" PRIu64 " : %" PRIu64 "%%\n"
"Max Age(Async): %" PRIu64 "\n"
"Max Age(Sync) : %" PRIu64 "\n"
"Capacity : %" PRIu64 "\n"
"-------------------",
age, age_pct, log_sys.max_modified_age_async, log_sys.max_checkpoint_age,
log_sys.log_capacity);
sql_print_information("InnoDB: Pending IO count\n"
"-------------------\n"
"Pending Read : %zu\n"
"Pending Write: %zu\n"
"-------------------",
os_aio_pending_reads_approx(), os_aio_pending_writes_approx());
}
#ifdef UNIV_DEBUG
/** Functor to validate the flush list. */
struct Check {

View File

@@ -44,6 +44,7 @@ Created 1/8/1996 Heikki Tuuri
#include "btr0cur.h"
#include "btr0sea.h"
#include "buf0buf.h"
#include "buf0flu.h"
#include "data0type.h"
#include "dict0boot.h"
#include "dict0load.h"
@@ -1024,7 +1025,10 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line)) noexc
const ulong threshold= srv_fatal_semaphore_wait_threshold;
if (waited >= threshold)
{
buf_pool.print_flush_info();
ib::fatal() << fatal_msg;
}
if (waited > threshold / 4)
ib::warn() << "A long wait (" << waited

View File

@@ -1951,6 +1951,9 @@ public:
/** Issue a warning that we could not free up buffer pool pages. */
ATTRIBUTE_COLD void LRU_warn() noexcept;
/** Print buffer pool flush state information. */
ATTRIBUTE_COLD void print_flush_info() const noexcept;
private:
/** Temporary memory for page_compressed and encrypted I/O */
struct io_buf_t

View File

@@ -159,6 +159,9 @@ public:
my_cond_wait(&cond, &mutex.m_mutex);
mysql_mutex_unlock(&mutex);
}
/** Print double write state information. */
ATTRIBUTE_COLD void print_info() const noexcept;
};
/** The doublewrite buffer */

View File

@@ -1021,6 +1021,8 @@ size_t os_aio_pending_reads() noexcept;
size_t os_aio_pending_reads_approx() noexcept;
/** @return number of pending writes */
size_t os_aio_pending_writes() noexcept;
/** @return approximate number of pending writes */
size_t os_aio_pending_writes_approx() noexcept;
/** Wait until there are no pending asynchronous writes.
@param declare whether the wait will be declared in tpool */

View File

@@ -3481,6 +3481,12 @@ size_t os_aio_pending_writes() noexcept
return pending;
}
/** @return approximate number of pending writes */
size_t os_aio_pending_writes_approx() noexcept
{
return write_slots->pending_io_count();
}
/** Wait until all pending asynchronous reads have completed.
@param declare whether the wait will be declared in tpool */
void os_aio_wait_until_no_pending_reads(bool declare) noexcept

View File

@@ -1220,6 +1220,7 @@ void srv_monitor_task(void*)
now -= start;
ulong waited = static_cast<ulong>(now / 1000000);
if (waited >= threshold) {
buf_pool.print_flush_info();
ib::fatal() << dict_sys.fatal_msg;
}