From 984606d74705e1a4c67945531edeff81692f403d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 29 Aug 2024 14:53:08 +0300 Subject: [PATCH] MDEV-34750 SET GLOBAL innodb_log_file_size is not crash safe The recent commit 4ca355d863e3b6a14439eebbb2958afccb3548e3 (MDEV-33894) caused a serious regression for online InnoDB ib_logfile0 resizing, breaking crash-safety unless the memory-mapped log file interface is being used. However, the log resizing was broken also before this. To prevent such regressions in the future, we extend the test innodb.log_file_size_online with a kill and restart of the server and with some writes running concurrently with the log size change. When run enough many times, this test revealed all the bugs that are being fixed by the code changes. log_t::resize_start(): Do not allow the resized log to start before the current log sequence number. In this way, there is no need to copy anything to the first block of resize_buf. The previous logic regarding that was incorrect in two ways. First, we would have to copy from the last written buffer (buf or flush_buf). Second, we failed to ensure that the mini-transaction end marker bytes would be 1 in the buffer. If the source ib_logfile0 had wrapped around an odd number of times, the end marker would be 0. This was occasionally observed when running the test innodb.log_file_size_online. log_t::resize_write_buf(): To adjust for the resize_start() change, do not write anything that would be before the resize_lsn. Take the buffer (resize_buf or resize_flush_buf) as a parameter. Starting with commit 4ca355d863e3b6a14439eebbb2958afccb3548e3 we no longer swap buffers when rewriting the last log block. log_t::append(): Define as a static function; only some debug assertions need to refer to the log_sys object. innodb_log_file_size_update(): Wake up the buf_flush_page_cleaner() if needed, and wait for it to complete a batch while waiting for the log resizing to be completed. If the current LSN is behind the resize target LSN, we will write redundant FILE_CHECKPOINT records to ensure that the log resizing completes. If the buf_pool.flush_list is empty or the buf_flush_page_cleaner() is stuck for some reason, our wait will time out in 5 seconds, so that we can periodically check if the execution of SET GLOBAL innodb_log_file_size was aborted. Previously, we could get into a busy loop here while the buf_flush_page_cleaner() would remain idle. --- .../innodb/r/log_file_size_online.result | 11 +++- .../suite/innodb/t/log_file_size_online.test | 15 +++++- storage/innobase/handler/ha_innodb.cc | 26 ++++++++-- storage/innobase/include/log0log.h | 18 +++---- storage/innobase/log/log0log.cc | 51 ++++++++++--------- storage/innobase/mtr/mtr0mtr.cc | 9 ++++ 6 files changed, 89 insertions(+), 41 deletions(-) diff --git a/mysql-test/suite/innodb/r/log_file_size_online.result b/mysql-test/suite/innodb/r/log_file_size_online.result index 1db2fdde576..e4c4e899995 100644 --- a/mysql-test/suite/innodb/r/log_file_size_online.result +++ b/mysql-test/suite/innodb/r/log_file_size_online.result @@ -19,8 +19,10 @@ SHOW VARIABLES LIKE 'innodb_log_file_size'; Variable_name Value innodb_log_file_size 4194304 FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err -UPDATE t SET b='' WHERE a<10; SET GLOBAL innodb_log_file_size=5242880; +connect con1,localhost,root; +UPDATE t SET b='' WHERE a<10; +connection default; SHOW VARIABLES LIKE 'innodb_log_file_size'; Variable_name Value innodb_log_file_size 5242880 @@ -28,6 +30,9 @@ SELECT global_value FROM information_schema.system_variables WHERE variable_name = 'innodb_log_file_size'; global_value 5242880 +connection con1; +disconnect con1; +connection default; # restart SELECT * FROM t WHERE a<10; a b @@ -40,6 +45,10 @@ a b 7 8 9 +SELECT COUNT(*),LENGTH(b) FROM t GROUP BY b; +COUNT(*) LENGTH(b) +9 0 +19991 255 SHOW VARIABLES LIKE 'innodb_log_file_size'; Variable_name Value innodb_log_file_size 5242880 diff --git a/mysql-test/suite/innodb/t/log_file_size_online.test b/mysql-test/suite/innodb/t/log_file_size_online.test index 65551f13dbc..3b56144ca43 100644 --- a/mysql-test/suite/innodb/t/log_file_size_online.test +++ b/mysql-test/suite/innodb/t/log_file_size_online.test @@ -25,17 +25,28 @@ SHOW VARIABLES LIKE 'innodb_log_file_size'; let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB; --source include/search_pattern_in_file.inc -UPDATE t SET b='' WHERE a<10; +send SET GLOBAL innodb_log_file_size=5242880; -SET GLOBAL innodb_log_file_size=5242880; +--connect con1,localhost,root +send UPDATE t SET b='' WHERE a<10; + +--connection default +reap; SHOW VARIABLES LIKE 'innodb_log_file_size'; SELECT global_value FROM information_schema.system_variables WHERE variable_name = 'innodb_log_file_size'; +--connection con1 +reap; +--disconnect con1 +--connection default + +--let $shutdown_timeout=0 --let $restart_parameters= --source include/restart_mysqld.inc SELECT * FROM t WHERE a<10; +SELECT COUNT(*),LENGTH(b) FROM t GROUP BY b; SHOW VARIABLES LIKE 'innodb_log_file_size'; let SEARCH_PATTERN = InnoDB: Resized log to 5\\.000MiB; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 798caa5db61..ce2d3958f9c 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -18551,6 +18551,7 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_CANT_CREATE_HANDLER_FILE); break; case log_t::RESIZE_STARTED: + const lsn_t start{log_sys.resize_in_progress()}; for (timespec abstime;;) { if (thd_kill_level(thd)) @@ -18561,13 +18562,30 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, set_timespec(abstime, 5); mysql_mutex_lock(&buf_pool.flush_list_mutex); - const bool in_progress(buf_pool.get_oldest_modification(LSN_MAX) < - log_sys.resize_in_progress()); - if (in_progress) + lsn_t resizing= log_sys.resize_in_progress(); + if (resizing > buf_pool.get_oldest_modification(0)) + { + buf_pool.page_cleaner_wakeup(true); my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex, &abstime); + resizing= log_sys.resize_in_progress(); + } mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (!log_sys.resize_in_progress()) + if (start > log_sys.get_lsn()) + { + ut_ad(!log_sys.is_pmem()); + /* The server is almost idle. Write dummy FILE_CHECKPOINT records + to ensure that the log resizing will complete. */ + log_sys.latch.wr_lock(SRW_LOCK_CALL); + while (start > log_sys.get_lsn()) + { + mtr_t mtr; + mtr.start(); + mtr.commit_files(log_sys.last_checkpoint_lsn); + } + log_sys.latch.wr_unlock(); + } + if (!resizing || resizing > start /* only wait for our resize */) break; } } diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index b9e2171f596..17743baceb0 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -224,7 +224,7 @@ public: /** exclusive latch for checkpoint, shared for mtr_t::commit() to buf */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock latch; - /** number of std::swap(buf, flush_buf) and writes from buf to log; + /** number of writes from buf or flush_buf to log; protected by latch.wr_lock() */ ulint write_to_log; @@ -232,8 +232,9 @@ public: lsn_t write_lsn; /** buffer for writing data to ib_logfile0, or nullptr if is_pmem() - In write_buf(), buf and flush_buf are swapped */ + In write_buf(), buf and flush_buf may be swapped */ byte *flush_buf; + /** set when there may be need to initiate a log checkpoint. This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */ std::atomic need_checkpoint; @@ -370,9 +371,10 @@ public: private: /** Write resize_buf to resize_log. - @param length the used length of resize_buf */ + @param b resize_buf or resize_flush_buf + @param length the used length of b */ ATTRIBUTE_COLD ATTRIBUTE_NOINLINE - void resize_write_buf(size_t length) noexcept; + void resize_write_buf(const byte *b, size_t length) noexcept; public: /** Rename a log file after resizing. @@ -502,13 +504,7 @@ public: @param d destination @param s string of bytes @param size length of str, in bytes */ - void append(byte *&d, const void *s, size_t size) noexcept - { - ut_ad(latch_have_any()); - ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size)); - memcpy(d, s, size); - d+= size; - } + static inline void append(byte *&d, const void *s, size_t size) noexcept; /** Set the log file format. */ void set_latest_format(bool encrypted) noexcept diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 93f8db6bfc0..d7aae556ce0 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -516,17 +516,14 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept resize_target= size; resize_buf= static_cast(ptr); resize_flush_buf= static_cast(ptr2); + start_lsn= get_lsn(); + if (is_pmem()) - { resize_log.close(); - start_lsn= get_lsn(); - } else - { - memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15); start_lsn= first_lsn + - (~lsn_t{write_size - 1} & (write_lsn - first_lsn)); - } + (~lsn_t{write_size - 1} & + (lsn_t{write_size - 1} + start_lsn - first_lsn)); } resize_lsn.store(start_lsn, std::memory_order_relaxed); status= success ? RESIZE_STARTED : RESIZE_FAILED; @@ -780,19 +777,26 @@ inline void log_t::persist(lsn_t lsn) noexcept #endif ATTRIBUTE_COLD ATTRIBUTE_NOINLINE -/** Write resize_buf to resize_log. -@param length the used length of resize_buf */ -void log_t::resize_write_buf(size_t length) noexcept +void log_t::resize_write_buf(const byte *b, size_t length) noexcept { const size_t block_size_1= write_size - 1; + ut_ad(b == resize_buf || b == resize_flush_buf); ut_ad(!(resize_target & block_size_1)); ut_ad(!(length & block_size_1)); ut_ad(length > block_size_1); ut_ad(length <= resize_target); - const lsn_t resizing{resize_in_progress()}; - ut_ad(resizing <= write_lsn); - lsn_t offset= START_OFFSET + - ((write_lsn - resizing) & ~lsn_t{block_size_1}) % + + int64_t d= int64_t(write_lsn - resize_in_progress()); + if (UNIV_UNLIKELY(d <= 0)) + { + d&= ~int64_t(block_size_1); + if (int64_t(d + length) <= 0) + return; + length+= d; + b-= d; + d= 0; + } + lsn_t offset= START_OFFSET + (lsn_t(d) & ~lsn_t{block_size_1}) % (resize_target - START_OFFSET); if (UNIV_UNLIKELY(offset + length > resize_target)) @@ -804,7 +808,7 @@ void log_t::resize_write_buf(size_t length) noexcept } ut_a(os_file_write_func(IORequestWrite, "ib_logfile101", resize_log.m_file, - buf, offset, length) == DB_SUCCESS); + b, offset, length) == DB_SUCCESS); } /** Write buf to ib_logfile0. @@ -838,6 +842,7 @@ template inline lsn_t log_t::write_buf() noexcept ut_ad(write_size_1 >= 511); const byte *const write_buf{buf}; + const byte *const re_write_buf{resize_buf}; offset&= ~lsn_t{write_size_1}; if (length <= write_size_1) @@ -851,8 +856,8 @@ template inline lsn_t log_t::write_buf() noexcept #else # ifdef HAVE_valgrind MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - length); - if (UNIV_LIKELY_NULL(resize_buf)) - MEM_MAKE_DEFINED(resize_buf + length, (write_size_1 + 1) - length); + if (UNIV_LIKELY_NULL(re_write_buf)) + MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - length); # endif buf[length]= 0; /* allow recovery to catch EOF faster */ #endif @@ -872,15 +877,15 @@ template inline lsn_t log_t::write_buf() noexcept the current LSN are generated. */ #ifdef HAVE_valgrind MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - new_buf_free); - if (UNIV_LIKELY_NULL(resize_buf)) - MEM_MAKE_DEFINED(resize_buf + length, (write_size_1 + 1) - + if (UNIV_LIKELY_NULL(re_write_buf)) + MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - new_buf_free); #endif buf[length]= 0; /* allow recovery to catch EOF faster */ length&= ~write_size_1; memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15); - if (UNIV_LIKELY_NULL(resize_buf)) - memcpy_aligned<16>(resize_flush_buf, resize_buf + length, + if (UNIV_LIKELY_NULL(re_write_buf)) + memcpy_aligned<16>(resize_flush_buf, re_write_buf + length, (new_buf_free + 15) & ~15); length+= write_size_1 + 1; } @@ -899,8 +904,8 @@ template inline lsn_t log_t::write_buf() noexcept /* Do the write to the log file */ log_write_buf(write_buf, length, offset); - if (UNIV_LIKELY_NULL(resize_buf)) - resize_write_buf(length); + if (UNIV_LIKELY_NULL(re_write_buf)) + resize_write_buf(re_write_buf, length); write_lsn= lsn; if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 0bc531855b9..fe9a882a36c 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -1310,6 +1310,15 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, } } +inline void log_t::append(byte *&d, const void *s, size_t size) noexcept +{ + ut_ad(log_sys.latch_have_any()); + ut_ad(d + size <= log_sys.buf + + (log_sys.is_pmem() ? log_sys.file_size : log_sys.buf_size)); + memcpy(d, s, size); + d+= size; +} + template std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len)