diff --git a/extra/mariabackup/CMakeLists.txt b/extra/mariabackup/CMakeLists.txt index afb172dbbaa..66293dac31b 100644 --- a/extra/mariabackup/CMakeLists.txt +++ b/extra/mariabackup/CMakeLists.txt @@ -48,6 +48,11 @@ ADD_DEFINITIONS(-UMYSQL_SERVER) ADD_DEFINITIONS(-DPCRE_STATIC=1) ADD_DEFINITIONS(${SSL_DEFINES}) + +IF(PMEM_FOUND) + ADD_COMPILE_FLAGS(xtrabackup.cc COMPILE_FLAGS "-DHAVE_PMEM") +ENDIF() + MYSQL_ADD_EXECUTABLE(mariadb-backup xtrabackup.cc innobackupex.cc diff --git a/extra/mariabackup/backup_copy.cc b/extra/mariabackup/backup_copy.cc index 83be07fd90c..ea2a5b3532a 100644 --- a/extra/mariabackup/backup_copy.cc +++ b/extra/mariabackup/backup_copy.cc @@ -1818,26 +1818,11 @@ copy_back() dst_dir = (srv_log_group_home_dir && *srv_log_group_home_dir) ? srv_log_group_home_dir : mysql_data_home; - /* --backup generates a single LOG_FILE_NAME, which we must copy - if it exists. */ + /* --backup generates a single ib_logfile0, which we must copy. */ ds_data = ds_create(dst_dir, DS_TYPE_LOCAL); - MY_STAT stat_arg; - if (!my_stat(LOG_FILE_NAME, &stat_arg, MYF(0)) || !stat_arg.st_size) { - /* After completed --prepare, redo log files are redundant. - We must delete any redo logs at the destination, so that - the database will not jump to a different log sequence number - (LSN). 
*/ - - char filename[FN_REFLEN]; - snprintf(filename, sizeof filename, "%s/%s0", dst_dir, - LOG_FILE_NAME_PREFIX); - unlink(filename); - snprintf(filename, sizeof filename, "%s/%s101", dst_dir, - LOG_FILE_NAME_PREFIX); - unlink(filename); - } else if (!(ret = copy_or_move_file(LOG_FILE_NAME, LOG_FILE_NAME, - dst_dir, 1))) { + if (!(ret = copy_or_move_file(LOG_FILE_NAME, LOG_FILE_NAME, + dst_dir, 1))) { goto cleanup; } ds_destroy(ds_data); diff --git a/extra/mariabackup/changed_page_bitmap.cc b/extra/mariabackup/changed_page_bitmap.cc index ac67bb9b924..993288b79e1 100644 --- a/extra/mariabackup/changed_page_bitmap.cc +++ b/extra/mariabackup/changed_page_bitmap.cc @@ -589,7 +589,7 @@ xb_find_lsn_in_bitmap_file( /****************************************************************//** Read the disk bitmap and build the changed page bitmap tree for the -LSN interval incremental_lsn to checkpoint_lsn_start. +LSN interval incremental_lsn to log_sys.next_checkpoint_lsn. @return the built bitmap tree or NULL if unable to read the full interval for any reason. */ @@ -599,7 +599,7 @@ xb_page_bitmap_init(void) { log_online_bitmap_file_t bitmap_file; lsn_t bmp_start_lsn = incremental_lsn; - lsn_t bmp_end_lsn = checkpoint_lsn_start; + const lsn_t bmp_end_lsn{log_sys.next_checkpoint_lsn}; byte page[MODIFIED_PAGE_BLOCK_SIZE]; lsn_t current_page_end_lsn; xb_page_bitmap *result; diff --git a/extra/mariabackup/changed_page_bitmap.h b/extra/mariabackup/changed_page_bitmap.h index 1a0e2ec37f0..8d5043596bf 100644 --- a/extra/mariabackup/changed_page_bitmap.h +++ b/extra/mariabackup/changed_page_bitmap.h @@ -38,7 +38,7 @@ typedef struct xb_page_bitmap_range_struct xb_page_bitmap_range; /****************************************************************//** Read the disk bitmap and build the changed page bitmap tree for the -LSN interval incremental_lsn to checkpoint_lsn_start. +LSN interval incremental_lsn to log_sys.next_checkpoint_lsn. 
@return the built bitmap tree */ xb_page_bitmap* diff --git a/extra/mariabackup/xb_plugin.cc b/extra/mariabackup/xb_plugin.cc index 3384e1c07e5..7470d376eaa 100644 --- a/extra/mariabackup/xb_plugin.cc +++ b/extra/mariabackup/xb_plugin.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2017, MariaDB Corporation. +/* Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,7 +23,7 @@ #include #include #include -#include +#include extern struct st_maria_plugin *mysql_optional_plugins[]; diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 32583211085..a96fdf36132 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -186,9 +186,6 @@ struct xb_filter_entry_t{ xb_filter_entry_t *name_hash; }; -lsn_t checkpoint_lsn_start; -lsn_t checkpoint_no_start; -static lsn_t log_copy_scanned_lsn; static bool log_copying_running; int xtrabackup_parallel; @@ -346,7 +343,6 @@ const char *opt_history = NULL; char mariabackup_exe[FN_REFLEN]; char orig_argv1[FN_REFLEN]; -pthread_mutex_t backup_mutex; pthread_cond_t scanned_lsn_cond; typedef std::map space_id_to_name_t; @@ -783,7 +779,7 @@ static void backup_file_op(uint32_t space_id, bool create, ut_ad(name); ut_ad(len); ut_ad(!new_name == !new_len); - pthread_mutex_lock(&backup_mutex); + mysql_mutex_assert_owner(&recv_sys.mutex); if (create) { ddl_tracker.id_to_name[space_id] = filename_to_spacename(name, len); @@ -797,7 +793,6 @@ static void backup_file_op(uint32_t space_id, bool create, ddl_tracker.drops.insert(space_id); msg("DDL tracking : delete %u \"%.*s\"", space_id, int(len), name); } - pthread_mutex_unlock(&backup_mutex); } @@ -1494,9 +1489,10 @@ struct my_option xb_server_options[] = IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), 0, 0, 0, 0, 0}, {"innodb_log_buffer_size", OPT_INNODB_LOG_BUFFER_SIZE, - "The size of the buffer which InnoDB uses to 
write log to the log files on disk.", - (G_PTR*) &srv_log_buffer_size, (G_PTR*) &srv_log_buffer_size, 0, - GET_ULONG, REQUIRED_ARG, 1024*1024L, 256*1024L, LONG_MAX, 0, 1024, 0}, + "Redo log buffer size in bytes.", + (G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0, + IF_WIN(GET_ULL,GET_ULONG), REQUIRED_ARG, 2U << 20, + 2U << 20, SIZE_T_MAX, 0, 4096, 0}, {"innodb_log_file_size", OPT_INNODB_LOG_FILE_SIZE, "Ignored for mysqld option compatibility", (G_PTR*) &srv_log_file_size, (G_PTR*) &srv_log_file_size, 0, @@ -2140,30 +2136,92 @@ error: return true; } +static byte log_hdr_buf[log_t::START_OFFSET + SIZE_OF_FILE_CHECKPOINT]; + +/** Initialize an InnoDB log file header in log_hdr_buf[] */ +static void log_hdr_init() +{ + memset(log_hdr_buf, 0, sizeof log_hdr_buf); + mach_write_to_4(LOG_HEADER_FORMAT + log_hdr_buf, log_t::FORMAT_10_8); + mach_write_to_8(LOG_HEADER_START_LSN + log_hdr_buf, + log_sys.next_checkpoint_lsn); + snprintf(reinterpret_cast(LOG_HEADER_CREATOR + log_hdr_buf), + 16, "Backup %u.%u.%u", + MYSQL_VERSION_ID / 10000, MYSQL_VERSION_ID / 100 % 100, + MYSQL_VERSION_ID % 100); + if (log_sys.is_encrypted()) + log_crypt_write_header(log_hdr_buf + LOG_HEADER_CREATOR_END); + mach_write_to_4(508 + log_hdr_buf, my_crc32c(0, log_hdr_buf, 508)); + mach_write_to_8(log_hdr_buf + 0x1000, log_sys.next_checkpoint_lsn); + mach_write_to_8(log_hdr_buf + 0x1008, recv_sys.lsn); + mach_write_to_4(log_hdr_buf + 0x103c, + my_crc32c(0, log_hdr_buf + 0x1000, 60)); +} + static bool innodb_init() { - bool create_new_db = false; + bool create_new_db= false; - if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) { - /* Avoid overflow. */ - srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT; - } else { - srv_max_io_capacity = std::max(2 * srv_io_capacity, 2000UL); - } + srv_max_io_capacity= srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2 + ? SRV_MAX_IO_CAPACITY_LIMIT : std::max(2 * srv_io_capacity, 2000UL); - /* Check if the data files exist or not. 
*/ - dberr_t err = srv_sys_space.check_file_spec(&create_new_db, 5U << 20); + /* Check if the data files exist or not. */ + dberr_t err= srv_sys_space.check_file_spec(&create_new_db, 5U << 20); - if (err == DB_SUCCESS) { - err = srv_start(create_new_db); - } + if (err == DB_SUCCESS) + err= srv_start(create_new_db); - if (err != DB_SUCCESS) { - die("mariabackup: innodb_init() returned %d (%s).", - err, ut_strerr(err)); - } + if (err != DB_SUCCESS) + { + msg("mariadb-backup: srv_start() returned %d (%s).", err, ut_strerr(err)); + return true; + } - return(FALSE); + ut_ad(srv_force_recovery <= SRV_FORCE_IGNORE_CORRUPT); + ut_ad(recv_no_log_write); + buf_flush_sync(); + DBUG_ASSERT(!buf_pool.any_io_pending()); + log_sys.close_file(); + + if (xtrabackup_incremental) + /* Reset the ib_logfile0 in --target-dir, not --incremental-dir. */ + srv_log_group_home_dir= xtrabackup_target_dir; + + bool ret; + const std::string ib_logfile0{get_log_file_path()}; + os_file_delete_if_exists(innodb_log_file_key, ib_logfile0.c_str(), nullptr); + pfs_os_file_t file= os_file_create(innodb_log_file_key, ib_logfile0.c_str(), + OS_FILE_CREATE, OS_FILE_NORMAL, + OS_DATA_FILE_NO_O_DIRECT, false, &ret); + if (!ret) + { + invalid_log: + msg("mariadb-backup: Cannot create %s", ib_logfile0.c_str()); + return true; + } + + recv_sys.lsn= log_sys.next_checkpoint_lsn= + log_sys.get_lsn() - SIZE_OF_FILE_CHECKPOINT; + log_sys.set_latest_format(false); // not encrypted + log_hdr_init(); + byte *b= &log_hdr_buf[log_t::START_OFFSET]; + b[0]= FILE_CHECKPOINT | 10; + mach_write_to_8(b + 3, recv_sys.lsn); + b[11]= 1; + mach_write_to_4(b + 12, my_crc32c(0, b, 11)); + static_assert(12 + 4 == SIZE_OF_FILE_CHECKPOINT, "compatibility"); + +#ifdef _WIN32 + DWORD len; + ret= WriteFile(os_file_t{file}, log_hdr_buf, sizeof log_hdr_buf, + &len, nullptr) && len == sizeof log_hdr_buf; +#else + ret= sizeof log_hdr_buf == write(os_file_t{file}, log_hdr_buf, + sizeof log_hdr_buf); +#endif + if (!os_file_close(file) || !ret) 
+ goto invalid_log; + return false; } /* ================= common ================= */ @@ -2711,9 +2769,9 @@ static my_bool xtrabackup_copy_datafile(fil_node_t *node, uint thread_n, memset(&write_filt_ctxt, 0, sizeof(xb_write_filt_ctxt_t)); bool was_dropped; - pthread_mutex_lock(&backup_mutex); + mysql_mutex_lock(&recv_sys.mutex); was_dropped = (ddl_tracker.drops.find(node->space->id) != ddl_tracker.drops.end()); - pthread_mutex_unlock(&backup_mutex); + mysql_mutex_unlock(&recv_sys.mutex); if (was_dropped) { if (node->is_open()) { mysql_mutex_lock(&fil_system.mutex); @@ -2783,11 +2841,11 @@ static my_bool xtrabackup_copy_datafile(fil_node_t *node, uint thread_n, } else { const fil_space_t::name_type name = node->space->name(); - pthread_mutex_lock(&backup_mutex); + mysql_mutex_lock(&recv_sys.mutex); ddl_tracker.tables_in_backup.emplace(node->space->id, std::string(name.data(), name.size())); - pthread_mutex_unlock(&backup_mutex); + mysql_mutex_unlock(&recv_sys.mutex); } /* close */ @@ -2824,169 +2882,224 @@ skip: return(FALSE); } -/** Copy redo log blocks to the data sink. 
-@param start_lsn buffer start LSN -@param end_lsn buffer end LSN -@param last whether we are copying the final part of the log -@return last scanned LSN -@retval 0 on failure */ -static lsn_t xtrabackup_copy_log(lsn_t start_lsn, lsn_t end_lsn, bool last) -{ - lsn_t scanned_lsn = start_lsn; - const byte* log_block = log_sys.buf; - bool more_data = false; - - for (ulint scanned_checkpoint = 0; - scanned_lsn < end_lsn; - log_block += OS_FILE_LOG_BLOCK_SIZE) { - ulint checkpoint = log_block_get_checkpoint_no(log_block); - - if (scanned_checkpoint > checkpoint - && scanned_checkpoint - checkpoint >= 0x80000000UL) { - /* Garbage from a log buffer flush which was made - before the most recent database recovery */ - msg(0,"checkpoint wrap: " LSN_PF ",%zx,%zx", - scanned_lsn, scanned_checkpoint, checkpoint); - break; - } - - scanned_checkpoint = checkpoint; - - ulint data_len = log_block_get_data_len(log_block); - - more_data = recv_sys_add_to_parsing_buf( - log_block, - scanned_lsn + data_len); - - recv_sys.scanned_lsn = scanned_lsn + data_len; - - if (data_len == OS_FILE_LOG_BLOCK_SIZE) { - /* We got a full log block. */ - scanned_lsn += data_len; - } else if (data_len >= log_sys.trailer_offset() - || data_len < LOG_BLOCK_HDR_SIZE) { - /* We got a garbage block (abrupt end of the log). */ - msg(0,"garbage block: " LSN_PF ",%zu",scanned_lsn, data_len); - break; - } else { - /* We got a partial block (abrupt end of the log). */ - scanned_lsn += data_len; - break; - } - } - - store_t store= STORE_NO; - if (more_data && recv_sys.parse(0, &store, false)) { - msg("Error: copying the log failed"); - return(0); - } - - recv_sys_justify_left_parsing_buf(); - - log_sys.log.scanned_lsn = scanned_lsn; - - end_lsn = last - ? 
ut_uint64_align_up(scanned_lsn, OS_FILE_LOG_BLOCK_SIZE) - : scanned_lsn & ~lsn_t(OS_FILE_LOG_BLOCK_SIZE - 1); - - if (ulint write_size = ulint(end_lsn - start_lsn)) { - if (srv_encrypt_log) { - log_crypt(log_sys.buf, start_lsn, write_size); - } - - if (ds_write(dst_log_file, log_sys.buf, write_size)) { - msg("Error: write to logfile failed"); - return(0); - } - } - - return(scanned_lsn); -} - /** Copy redo log until the current end of the log is reached -@param last whether we are copying the final part of the log @return whether the operation failed */ -static bool xtrabackup_copy_logfile(bool last = false) +static bool xtrabackup_copy_logfile() { - mysql_mutex_assert_owner(&log_sys.mutex); + mysql_mutex_assert_owner(&log_sys.mutex); + DBUG_EXECUTE_IF("log_checksum_mismatch", return false;); - ut_a(dst_log_file != NULL); - ut_ad(recv_sys.is_initialised()); + ut_a(dst_log_file); + ut_ad(recv_sys.is_initialised()); + const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U}; + const size_t block_size_1{log_sys.get_block_size() - 1}; - bool overwritten_block = false; - lsn_t start_lsn; - lsn_t end_lsn; + mysql_mutex_lock(&recv_sys.mutex); +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + { + recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); + recv_sys.len= size_t(log_sys.file_size); + } + else +#endif + { + recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & + block_size_1; + recv_sys.len= 0; + } - recv_sys.parse_start_lsn = log_copy_scanned_lsn; - recv_sys.scanned_lsn = log_copy_scanned_lsn; + for (unsigned retry_count{0};;) + { + recv_sys_t::parse_mtr_result r; + size_t start_offset{recv_sys.offset}; - start_lsn = ut_uint64_align_down(log_copy_scanned_lsn, - OS_FILE_LOG_BLOCK_SIZE); - do { - end_lsn = start_lsn + RECV_SCAN_SIZE; +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + { + if ((ut_d(r=) recv_sys.parse_pmem(STORE_IF_EXISTS)) != recv_sys_t::OK) + { + ut_ad(r == recv_sys_t::GOT_EOF); + goto retry; + } - if (xtrabackup_throttle && 
(io_ticket--) < 0) { - mysql_cond_wait(&wait_throttle, &log_sys.mutex); - } + retry_count= 0; - lsn_t lsn= start_lsn; - for (int retries= 0; retries < 100; retries++) { - if (log_sys.log.read_log_seg(&lsn, end_lsn) - || lsn != start_lsn) { - break; - } - msg("Retrying read of log at LSN=" LSN_PF, lsn); - my_sleep(1000); - } + do + { + const byte seq{log_sys.get_sequence_bit(recv_sys.lsn - + sequence_offset)}; + ut_ad(recv_sys.offset >= log_sys.START_OFFSET); + ut_ad(recv_sys.offset < recv_sys.len); + ut_ad(log_sys.buf[recv_sys.offset + >= log_sys.START_OFFSET + sequence_offset + ? recv_sys.offset - sequence_offset + : recv_sys.len - sequence_offset + + recv_sys.offset - log_sys.START_OFFSET] == + seq); + static const byte seq_1{1}; + if (UNIV_UNLIKELY(start_offset > recv_sys.offset)) + { + const ssize_t so(recv_sys.offset - (log_sys.START_OFFSET + + sequence_offset)); + if (so <= 0) + { + if (ds_write(dst_log_file, log_sys.buf + start_offset, + recv_sys.len - start_offset + so) || + ds_write(dst_log_file, &seq_1, 1)) + goto write_error; + if (so < -1 && + ds_write(dst_log_file, log_sys.buf + recv_sys.len - (1 - so), + 1 - so)) + goto write_error; + if (ds_write(dst_log_file, log_sys.buf + log_sys.START_OFFSET, + recv_sys.offset - log_sys.START_OFFSET)) + goto write_error; + } + else + { + if (ds_write(dst_log_file, log_sys.buf + start_offset, + recv_sys.len - start_offset)) + goto write_error; + if (ds_write(dst_log_file, log_sys.buf + log_sys.START_OFFSET, so)) + goto write_error; + if (ds_write(dst_log_file, &seq_1, 1)) + goto write_error; + if (so > 1 && + ds_write(dst_log_file, log_sys.buf + recv_sys.offset - + (so - 1), so - 1)) + goto write_error; + } + } + else if (seq == 1) + { + if (ds_write(dst_log_file, log_sys.buf + start_offset, + recv_sys.offset - start_offset)) + goto write_error; + } + else if (ds_write(dst_log_file, log_sys.buf + start_offset, + recv_sys.offset - start_offset - sequence_offset) || + ds_write(dst_log_file, &seq_1, 1) || + 
ds_write(dst_log_file, log_sys.buf + + recv_sys.offset - sequence_offset + 1, + sequence_offset - 1)) + goto write_error; - if (lsn == start_lsn) { - overwritten_block= !recv_sys.is_corrupt_log() - && log_block_calc_checksum_crc32(log_sys.buf) == - log_block_get_checksum(log_sys.buf) - && log_block_get_hdr_no(log_sys.buf) > - log_block_convert_lsn_to_no(start_lsn); - start_lsn = 0; - } else { - mysql_mutex_lock(&recv_sys.mutex); - start_lsn = xtrabackup_copy_log(start_lsn, lsn, last); - mysql_mutex_unlock(&recv_sys.mutex); - } + start_offset= recv_sys.offset; + } + while ((ut_d(r=)recv_sys.parse_pmem(STORE_IF_EXISTS)) == recv_sys_t::OK); - if (!start_lsn) { - const char *reason = recv_sys.is_corrupt_log() - ? "corrupt log." - : (overwritten_block - ? "redo log block is overwritten, please increase redo log size with innodb_log_file_size parameter." - : "redo log block checksum does not match."); + ut_ad(r == recv_sys_t::GOT_EOF); + pthread_cond_broadcast(&scanned_lsn_cond); + break; + } + else +#endif + { + { + auto source_offset= + log_sys.calc_lsn_offset(recv_sys.lsn + recv_sys.len - + recv_sys.offset); + source_offset&= ~block_size_1; + size_t size{log_sys.buf_size - recv_sys.len}; + if (source_offset + size > log_sys.file_size) + size= static_cast(log_sys.file_size - source_offset); + ut_ad(size <= log_sys.buf_size); + log_sys.log.read(source_offset, {log_sys.buf, size}); + recv_sys.len= size; + } - die("xtrabackup_copy_logfile() failed: %s", reason); - return true; - } - } while (start_lsn == end_lsn); + if (log_sys.buf[recv_sys.offset] <= 1) + break; - ut_ad(start_lsn == log_sys.log.scanned_lsn); + if (recv_sys.parse_mtr(STORE_IF_EXISTS) == recv_sys_t::OK) + { + do + { + /* Set the sequence bit (the backed-up log will not wrap around) */ + byte *seq= &log_sys.buf[recv_sys.offset - sequence_offset]; + ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - + sequence_offset)); + *seq= 1; + } + while ((r= recv_sys.parse_mtr(STORE_IF_EXISTS)) == recv_sys_t::OK); - 
msg(">> log scanned up to (" LSN_PF ")", start_lsn); + if (ds_write(dst_log_file, log_sys.buf + start_offset, + recv_sys.offset - start_offset)) + { +#ifdef HAVE_PMEM + write_error: +#endif + mysql_mutex_unlock(&recv_sys.mutex); + msg("Error: write to ib_logfile0 failed"); + return true; + } + else + { + const auto ofs= recv_sys.offset & ~block_size_1; + memmove_aligned<64>(log_sys.buf, log_sys.buf + ofs, + recv_sys.len - ofs); + recv_sys.len-= ofs; + recv_sys.offset&= block_size_1; + } - /* update global variable*/ - pthread_mutex_lock(&backup_mutex); - log_copy_scanned_lsn = start_lsn; - pthread_cond_broadcast(&scanned_lsn_cond); - pthread_mutex_unlock(&backup_mutex); - return(false); + pthread_cond_broadcast(&scanned_lsn_cond); + + if (r == recv_sys_t::GOT_EOF) + break; + + if (recv_sys.offset < log_sys.get_block_size()) + break; + + mysql_mutex_unlock(&recv_sys.mutex); + + if (xtrabackup_throttle && io_ticket-- < 0) + mysql_cond_wait(&wait_throttle, &log_sys.mutex); + + retry_count= 0; + } + else + { + recv_sys.len= recv_sys.offset & ~block_size_1; +#ifdef HAVE_PMEM + retry: +#endif + if (retry_count == 100) + break; + + mysql_mutex_unlock(&recv_sys.mutex); + if (!retry_count++) + msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn); + my_sleep(1000); + } + } + mysql_mutex_lock(&recv_sys.mutex); + } + + mysql_mutex_unlock(&recv_sys.mutex); + msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn); + return false; } /** Wait until redo log copying thread processes given lsn */ -void backup_wait_for_lsn(lsn_t lsn) { - bool completed = false; - pthread_mutex_lock(&backup_mutex); - do { - pthread_cond_wait(&scanned_lsn_cond, &backup_mutex); - completed = log_copy_scanned_lsn >= lsn; - } while (!completed); - pthread_mutex_unlock(&backup_mutex); +void backup_wait_for_lsn(lsn_t lsn) +{ + mysql_mutex_lock(&recv_sys.mutex); + for (lsn_t last_lsn{recv_sys.lsn}; last_lsn < lsn; ) + { + timespec abstime; + set_timespec(abstime, 5); + if 
(my_cond_timedwait(&scanned_lsn_cond, &recv_sys.mutex.m_mutex, + &abstime) && + last_lsn == recv_sys.lsn) + die("Was only able to copy log from " LSN_PF " to " LSN_PF + ", not " LSN_PF "; try increasing innodb_log_file_size", + log_sys.next_checkpoint_lsn, last_lsn, lsn); + last_lsn= recv_sys.lsn; + } + mysql_mutex_unlock(&recv_sys.mutex); } extern lsn_t server_lsn_after_lock; @@ -2996,7 +3109,7 @@ static void log_copying_thread() my_thread_init(); mysql_mutex_lock(&log_sys.mutex); while (!xtrabackup_copy_logfile() && - (!metadata_to_lsn || metadata_to_lsn > log_copy_scanned_lsn)) + (!metadata_to_lsn || metadata_to_lsn > recv_sys.lsn)) { timespec abstime; set_timespec_nsec(abstime, 1000ULL * xtrabackup_log_copy_interval); @@ -3117,7 +3230,7 @@ Initialize the appropriate datasink(s). Both local backups and streaming in the Otherwise (i.e. when streaming in the 'tar' format) we need 2 separate datasinks for the data stream (and don't allow parallel data copying) and for metainfo -files (including LOG_FILE_NAME). The second datasink writes to temporary +files (including ib_logfile0). The second datasink writes to temporary files first, and then streams them in a serialized way when closed. */ static void xtrabackup_init_datasinks(void) @@ -3251,7 +3364,6 @@ static void xb_load_single_table_tablespace(const char *dirname, size_t dirlen = dirname == NULL ? 0 : strlen(dirname); size_t namelen = strlen(filname); ulint pathlen = dirname == NULL ? 
namelen + 1: dirlen + namelen + 2; - lsn_t flush_lsn; dberr_t err; fil_space_t *space; bool defer = false; @@ -3286,7 +3398,7 @@ static void xb_load_single_table_tablespace(const char *dirname, for (int i = 0; i < 10; i++) { file->m_defer = false; - err = file->validate_first_page(&flush_lsn); + err = file->validate_first_page(); if (file->m_defer) { if (defer_space_id) { @@ -3805,7 +3917,6 @@ xb_load_tablespaces() bool create_new_db; dberr_t err; ulint sum_of_new_sizes; - lsn_t flush_lsn; ut_ad(srv_operation == SRV_OPERATION_BACKUP || srv_operation == SRV_OPERATION_RESTORE_DELTA); @@ -3819,8 +3930,7 @@ xb_load_tablespaces() } for (int i= 0; i < 10; i++) { - err = srv_sys_space.open_or_create(false, false, &sum_of_new_sizes, - &flush_lsn); + err = srv_sys_space.open_or_create(false, false, &sum_of_new_sizes); if (err == DB_PAGE_CORRUPTED || err == DB_CORRUPTION) { my_sleep(1000); } @@ -4305,24 +4415,18 @@ static bool xtrabackup_backup_low() /* read the latest checkpoint lsn */ { - ulint max_cp_field; - - if (recv_find_max_checkpoint(&max_cp_field) == DB_SUCCESS - && log_sys.log.format != 0) { - if (max_cp_field == LOG_CHECKPOINT_1) { - log_sys.log.read(max_cp_field, - {log_sys.checkpoint_buf, - OS_FILE_LOG_BLOCK_SIZE}); - } - metadata_to_lsn = mach_read_from_8( - log_sys.checkpoint_buf + LOG_CHECKPOINT_LSN); + const lsn_t lsn = recv_sys.lsn; + if (recv_sys.find_checkpoint() == DB_SUCCESS + && log_sys.is_latest()) { + metadata_to_lsn = log_sys.next_checkpoint_lsn; msg("mariabackup: The latest check point" " (for incremental): '" LSN_PF "'", metadata_to_lsn); } else { - msg("Error: recv_find_max_checkpoint() failed."); + msg("Error: recv_sys.find_checkpoint() failed."); } + recv_sys.lsn = lsn; mysql_cond_broadcast(&log_copying_stop); const bool running= log_copying_running; mysql_mutex_unlock(&log_sys.mutex); @@ -4330,12 +4434,13 @@ static bool xtrabackup_backup_low() mysql_mutex_lock(&log_sys.mutex); } - if (metadata_to_lsn && xtrabackup_copy_logfile(true)) { + if 
(metadata_to_lsn && xtrabackup_copy_logfile()) { mysql_mutex_unlock(&log_sys.mutex); ds_close(dst_log_file); dst_log_file = NULL; return false; } + mysql_mutex_unlock(&log_sys.mutex); if (ds_close(dst_log_file) || !metadata_to_lsn) { @@ -4352,7 +4457,7 @@ static bool xtrabackup_backup_low() strcpy(metadata_type, "incremental"); metadata_from_lsn = incremental_lsn; } - metadata_last_lsn = log_copy_scanned_lsn; + metadata_last_lsn = recv_sys.lsn; if (!xtrabackup_stream_metadata(ds_meta)) { msg("Error: failed to stream metadata."); @@ -4390,7 +4495,6 @@ static bool xtrabackup_backup_func() pthread_mutex_t count_mutex; CorruptedPages corrupted_pages; data_thread_ctxt_t *data_threads; - pthread_mutex_init(&backup_mutex, NULL); pthread_cond_init(&scanned_lsn_cond, NULL); #ifdef USE_POSIX_FADVISE @@ -4474,28 +4578,6 @@ fail: goto fail; } - log_sys.create(); - log_sys.log.create(); - log_sys.log.open_file(get_log_file_path()); - - /* create extra LSN dir if it does not exist. */ - if (xtrabackup_extra_lsndir - &&!my_stat(xtrabackup_extra_lsndir,&stat_info,MYF(0)) - && (my_mkdir(xtrabackup_extra_lsndir,0777,MYF(0)) < 0)) { - msg("Error: cannot mkdir %d: %s\n", - my_errno, xtrabackup_extra_lsndir); - goto fail; - } - - /* create target dir if not exist */ - if (!xtrabackup_stream_str && !my_stat(xtrabackup_target_dir,&stat_info,MYF(0)) - && (my_mkdir(xtrabackup_target_dir,0777,MYF(0)) < 0)){ - msg("Error: cannot mkdir %d: %s\n", - my_errno, xtrabackup_target_dir); - goto fail; - } - - if (auto b = aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) { field_ref_zero = static_cast( memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); @@ -4503,19 +4585,12 @@ fail: goto fail; } - { - /* definition from recv_recovery_from_checkpoint_start() */ - ulint max_cp_field; - + log_sys.create(); /* get current checkpoint_lsn */ - /* Look for the latest checkpoint from any of the log groups */ mysql_mutex_lock(&log_sys.mutex); -reread_log_header: - dberr_t err = recv_find_max_checkpoint(&max_cp_field); 
- - if (err != DB_SUCCESS) { + if (recv_sys.find_checkpoint() != DB_SUCCESS) { msg("Error: cannot read redo log header"); unlock_and_fail: mysql_mutex_unlock(&log_sys.mutex); @@ -4525,26 +4600,31 @@ free_and_fail: goto fail; } - if (log_sys.log.format == 0) { - msg("Error: cannot process redo log before MariaDB 10.2.2"); + if (!log_sys.is_latest()) { + msg("Error: cannot process redo log before MariaDB 10.8"); goto unlock_and_fail; } - byte* buf = log_sys.checkpoint_buf; - checkpoint_lsn_start = log_sys.log.get_lsn(); - checkpoint_no_start = log_sys.next_checkpoint_no; - - log_sys.log.read(max_cp_field, {buf, OS_FILE_LOG_BLOCK_SIZE}); - - if (checkpoint_no_start != mach_read_from_8(buf + LOG_CHECKPOINT_NO) - || checkpoint_lsn_start - != mach_read_from_8(buf + LOG_CHECKPOINT_LSN) - || log_sys.log.get_lsn_offset() - != mach_read_from_8(buf + LOG_CHECKPOINT_OFFSET)) - goto reread_log_header; - + recv_needed_recovery = true; mysql_mutex_unlock(&log_sys.mutex); + /* create extra LSN dir if it does not exist. 
*/ + if (xtrabackup_extra_lsndir + &&!my_stat(xtrabackup_extra_lsndir,&stat_info,MYF(0)) + && (my_mkdir(xtrabackup_extra_lsndir,0777,MYF(0)) < 0)) { + msg("Error: cannot mkdir %d: %s\n", + my_errno, xtrabackup_extra_lsndir); + goto free_and_fail; + } + + /* create target dir if not exist */ + if (!xtrabackup_stream_str && !my_stat(xtrabackup_target_dir,&stat_info,MYF(0)) + && (my_mkdir(xtrabackup_target_dir,0777,MYF(0)) < 0)){ + msg("Error: cannot mkdir %d: %s\n", + my_errno, xtrabackup_target_dir); + goto free_and_fail; + } + xtrabackup_init_datasinks(); if (!select_history()) { @@ -4561,44 +4641,13 @@ free_and_fail: } /* label it */ - byte* log_hdr_buf = static_cast( - aligned_malloc(LOG_FILE_HDR_SIZE, OS_FILE_LOG_BLOCK_SIZE)); - memset(log_hdr_buf, 0, LOG_FILE_HDR_SIZE); - - byte *log_hdr_field = log_hdr_buf; - mach_write_to_4(LOG_HEADER_FORMAT + log_hdr_field, log_sys.log.format); - mach_write_to_4(LOG_HEADER_SUBFORMAT + log_hdr_field, log_sys.log.subformat); - mach_write_to_8(LOG_HEADER_START_LSN + log_hdr_field, checkpoint_lsn_start); - strcpy(reinterpret_cast(LOG_HEADER_CREATOR + log_hdr_field), - "Backup " MYSQL_SERVER_VERSION); - log_block_set_checksum(log_hdr_field, - log_block_calc_checksum_crc32(log_hdr_field)); - - /* copied from log_group_checkpoint() */ - log_hdr_field += - (log_sys.next_checkpoint_no & 1) ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1; - /* The least significant bits of LOG_CHECKPOINT_OFFSET must be - stored correctly in the copy of the LOG_FILE_NAME. The most significant - bits, which identify the start offset of the log block in the file, - we did choose freely, as LOG_FILE_HDR_SIZE. */ - ut_ad(!((log_sys.log.get_lsn() ^ checkpoint_lsn_start) - & (OS_FILE_LOG_BLOCK_SIZE - 1))); - /* Adjust the checkpoint page. 
*/ - memcpy(log_hdr_field, log_sys.checkpoint_buf, OS_FILE_LOG_BLOCK_SIZE); - mach_write_to_8(log_hdr_field + LOG_CHECKPOINT_OFFSET, - (checkpoint_lsn_start & (OS_FILE_LOG_BLOCK_SIZE - 1)) - | LOG_FILE_HDR_SIZE); - log_block_set_checksum(log_hdr_field, - log_block_calc_checksum_crc32(log_hdr_field)); - + recv_sys.file_checkpoint = log_sys.next_checkpoint_lsn; + log_hdr_init(); /* Write log header*/ - if (ds_write(dst_log_file, log_hdr_buf, LOG_FILE_HDR_SIZE)) { + if (ds_write(dst_log_file, log_hdr_buf, 12288)) { msg("error: write to logfile failed"); - aligned_free(log_hdr_buf); goto free_and_fail; } - - aligned_free(log_hdr_buf); log_copying_running = true; /* start io throttle */ if(xtrabackup_throttle) { @@ -4610,27 +4659,26 @@ free_and_fail: } /* Populate fil_system with tablespaces to copy */ - err = xb_load_tablespaces(); - if (err != DB_SUCCESS) { + if (dberr_t err = xb_load_tablespaces()) { msg("merror: xb_load_tablespaces() failed with" " error %s.", ut_strerr(err)); -fail_before_log_copying_thread_start: log_copying_running = false; goto free_and_fail; } /* copy log file by current position */ - log_copy_scanned_lsn = checkpoint_lsn_start; - recv_sys.recovered_lsn = log_copy_scanned_lsn; mysql_mutex_lock(&log_sys.mutex); + recv_sys.lsn = log_sys.next_checkpoint_lsn; const bool log_copy_failed = xtrabackup_copy_logfile(); mysql_mutex_unlock(&log_sys.mutex); - if (log_copy_failed) - goto fail_before_log_copying_thread_start; + if (log_copy_failed) { + log_copying_running = false; + goto free_and_fail; + } DBUG_MARIABACKUP_EVENT("before_innodb_log_copy_thread_started", {}); @@ -4687,7 +4735,6 @@ fail_before_log_copying_thread_start: pthread_mutex_destroy(&count_mutex); free(data_threads); - } bool ok = backup_start(corrupted_pages); @@ -4719,23 +4766,22 @@ fail_before_log_copying_thread_start: } xtrabackup_destroy_datasinks(); - msg("Redo log (from LSN " LSN_PF " to " LSN_PF - ") was copied.", checkpoint_lsn_start, log_copy_scanned_lsn); + msg("Redo log 
(from LSN " LSN_PF " to " LSN_PF ") was copied.", + log_sys.next_checkpoint_lsn, recv_sys.lsn); xb_filters_free(); xb_data_files_close(); /* Make sure that the latest checkpoint was included */ - if (metadata_to_lsn > log_copy_scanned_lsn) { + if (metadata_to_lsn > recv_sys.lsn) { msg("Error: failed to copy enough redo log (" "LSN=" LSN_PF "; checkpoint LSN=" LSN_PF ").", - log_copy_scanned_lsn, metadata_to_lsn); + recv_sys.lsn, metadata_to_lsn); goto fail; } innodb_shutdown(); log_file_op = NULL; - pthread_mutex_destroy(&backup_mutex); pthread_cond_destroy(&scanned_lsn_cond); if (!corrupted_pages.empty()) { ut_ad(opt_log_innodb_page_corruption); @@ -4771,9 +4817,9 @@ void backup_fix_ddl(CorruptedPages &corrupted_pages) space_id_to_name_t new_tables; /* Disable further DDL on backed up tables (only needed for --no-lock).*/ - pthread_mutex_lock(&backup_mutex); + mysql_mutex_lock(&recv_sys.mutex); log_file_op = backup_file_op_fail; - pthread_mutex_unlock(&backup_mutex); + mysql_mutex_unlock(&recv_sys.mutex); DBUG_MARIABACKUP_EVENT("backup_fix_ddl", {}); @@ -5842,7 +5888,9 @@ static bool xtrabackup_prepare_func(char** argv) srv_operation = SRV_OPERATION_RESTORE_DELTA; if (innodb_init_param()) { - goto error_cleanup; +error: + ok = false; + goto cleanup; } recv_sys.create(); @@ -5853,7 +5901,7 @@ static bool xtrabackup_prepare_func(char** argv) if (dberr_t err = xb_load_tablespaces()) { msg("mariabackup: error: xb_data_files_init() failed " "with error %s\n", ut_strerr(err)); - goto error_cleanup; + goto error; } ok = fil_system.sys_space->open(false) @@ -5873,14 +5921,14 @@ static bool xtrabackup_prepare_func(char** argv) fil_system.close(); innodb_free_param(); log_sys.close(); - if (!ok) goto error_cleanup; + if (!ok) goto cleanup; } srv_operation = xtrabackup_export ? 
SRV_OPERATION_RESTORE_EXPORT : SRV_OPERATION_RESTORE; if (innodb_init_param()) { - goto error_cleanup; + goto error; } fil_system.freeze_space_list = 0; @@ -5903,7 +5951,7 @@ static bool xtrabackup_prepare_func(char** argv) } if (innodb_init()) { - goto error_cleanup; + goto error; } ut_ad(!fil_system.freeze_space_list); @@ -5939,11 +5987,10 @@ static bool xtrabackup_prepare_func(char** argv) } /* Check whether the log is applied enough or not. */ - if (recv_sys.recovered_lsn && recv_sys.recovered_lsn < target_lsn) { + if (recv_sys.lsn && recv_sys.lsn < target_lsn) { msg("mariabackup: error: " "The log was only applied up to LSN " LSN_PF - ", instead of " LSN_PF, - recv_sys.recovered_lsn, target_lsn); + ", instead of " LSN_PF, recv_sys.lsn, target_lsn); ok = false; } #ifdef WITH_WSREP @@ -5988,7 +6035,7 @@ static bool xtrabackup_prepare_func(char** argv) if (ok && xtrabackup_export) ok= (prepare_export() == 0); -error_cleanup: +cleanup: xb_filters_free(); return ok && !ib::error::was_logged() && corrupted_pages.empty(); } diff --git a/extra/mariabackup/xtrabackup.h b/extra/mariabackup/xtrabackup.h index f2d76ff4103..11de9eeaf48 100644 --- a/extra/mariabackup/xtrabackup.h +++ b/extra/mariabackup/xtrabackup.h @@ -79,9 +79,6 @@ extern uint opt_protocol; extern ds_ctxt_t *ds_meta; extern ds_ctxt_t *ds_data; -/* The last checkpoint LSN at the backup startup time */ -extern lsn_t checkpoint_lsn_start; - extern xb_page_bitmap *changed_page_bitmap; extern char *xtrabackup_incremental; diff --git a/mysql-test/include/default_mysqld.cnf b/mysql-test/include/default_mysqld.cnf index 49fb03ecc57..89d0d779627 100644 --- a/mysql-test/include/default_mysqld.cnf +++ b/mysql-test/include/default_mysqld.cnf @@ -52,7 +52,7 @@ loose-innodb_buffer_pool_size= 8M loose-innodb_lru_scan_depth= 100 loose-innodb_write_io_threads= 2 loose-innodb_read_io_threads= 2 -loose-innodb_log_buffer_size= 1M +loose-innodb_log_buffer_size= 2M loose-innodb_log_file_size= 10M slave-net-timeout=120 diff 
--git a/mysql-test/main/drop_table_force.result b/mysql-test/main/drop_table_force.result index 404d8be8b21..1dd0f1f9dab 100644 --- a/mysql-test/main/drop_table_force.result +++ b/mysql-test/main/drop_table_force.result @@ -1,6 +1,4 @@ -CALL mtr.add_suppression("Operating system error number"); -CALL mtr.add_suppression("The error means the system cannot"); -CALL mtr.add_suppression("returned OS error 71"); +CALL mtr.add_suppression("InnoDB: File .*test/t1\\.ibd was not found"); #Test1: table with missing .ibd can be dropped directly create table t1(a int)engine=innodb; drop table t1; diff --git a/mysql-test/main/drop_table_force.test b/mysql-test/main/drop_table_force.test index f3073e3b67d..04ebb997b80 100644 --- a/mysql-test/main/drop_table_force.test +++ b/mysql-test/main/drop_table_force.test @@ -10,9 +10,7 @@ # the new one, we have left some references to the original test case # -CALL mtr.add_suppression("Operating system error number"); -CALL mtr.add_suppression("The error means the system cannot"); -CALL mtr.add_suppression("returned OS error 71"); +CALL mtr.add_suppression("InnoDB: File .*test/t1\\.ibd was not found"); let $DATADIR= `select @@datadir`; diff --git a/mysql-test/suite/binlog/t/binlog_innodb.test b/mysql-test/suite/binlog/t/binlog_innodb.test index 153dcdd155a..5f372e6d5fd 100644 --- a/mysql-test/suite/binlog/t/binlog_innodb.test +++ b/mysql-test/suite/binlog/t/binlog_innodb.test @@ -180,7 +180,7 @@ CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; SET @old_flush = @@GLOBAL.innodb_flush_log_at_trx_commit; SET GLOBAL innodb_flush_log_at_trx_commit=1; ---let $syncs1 = query_get_value(SHOW STATUS LIKE 'Innodb_os_log_fsyncs', Value, 1) +--let $syncs1 = query_get_value(SHOW STATUS LIKE 'Innodb_data_fsyncs', Value, 1) --let $ROWS = 100 --disable_query_log let $count = $ROWS; @@ -188,7 +188,7 @@ while ($count) { eval INSERT INTO t1 VALUES ($count); dec $count; } ---let $syncs2 = query_get_value(SHOW STATUS LIKE 'Innodb_os_log_fsyncs', Value, 1) 
+--let $syncs2 = query_get_value(SHOW STATUS LIKE 'Innodb_data_fsyncs', Value, 1) eval SET @num_sync = $syncs2 - $syncs1; --enable_query_log diff --git a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result index 8d1eb447b03..c1771fe534b 100644 --- a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result +++ b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result @@ -12,14 +12,13 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err -FOUND 2 /Plugin 'InnoDB' registration as a STORAGE ENGINE failed/ in mysqld.1.err # redo log from before MariaDB 10.2.2, with corrupted log block # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err +FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err # empty redo log from before MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES @@ -35,15 +34,15 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 0 -FOUND 2 /InnoDB: Upgrade after a crash is not supported. 
This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err -# Empty multi-file redo log from before MariaDB 10.2.2 +FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err +# Empty multi-file redo log (wrong offset) from before MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) -1 -FOUND 2 /InnoDB: Upgrading redo log:/ in mysqld.1.err +0 +FOUND 3 /Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err # Multi-file redo log with size mismatch from after MariaDB 10.2.2 # Corrupted multi-file redo log from after MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m @@ -52,30 +51,30 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 0 -FOUND 1 /InnoDB: Log file .*ib_logfile1 is of different size 1048576 bytes than other log files 2097152 bytes!/ in mysqld.1.err -FOUND 1 /InnoDB: Upgrade after a crash is not supported\. The redo log was created with BogoDB 1\.2\.3\.4, and it appears corrupted\./ in mysqld.1.err +FOUND 3 /Upgrade after a crash is not supported. 
This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err +FOUND 1 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 0 -FOUND 2 /InnoDB: Upgrade after a crash is not supported\. The redo log was created with BogoDB 1\.2\.3\.4, and it appears corrupted\./ in mysqld.1.err +FOUND 2 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err # Empty multi-file redo log from after MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) -1 -FOUND 3 /InnoDB: Upgrading redo log:/ in mysqld.1.err +0 +FOUND 3 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err # redo log from "after" MariaDB 10.2.2, but with invalid header checksum # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Invalid redo log header checksum/ in mysqld.1.err +FOUND 1 /InnoDB: Invalid log header checksum/ in mysqld.1.err # distant future redo log format, with valid header checksum # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption 
--innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -89,14 +88,14 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: No valid checkpoint found .corrupted redo log/ in mysqld.1.err +FOUND 1 /InnoDB: No valid checkpoint was found; the log was created with malicious intentions, or perhaps\./ in mysqld.1.err # valid header, valid checkpoint 1, all-zero (invalid) checkpoint 2, invalid block checksum # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Invalid log block checksum. block: 2372 checkpoint no: 1 expected: 3362026715 found: 144444122/ in mysqld.1.err +FOUND 2 /InnoDB: Invalid log header checksum/ in mysqld.1.err FOUND 1 /InnoDB: Upgrade after a crash is not supported\. The redo log was created with malicious intentions, or perhaps, and it appears corrupted\./ in mysqld.1.err # same, but with current-version header # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 @@ -104,7 +103,7 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 2 /InnoDB: Invalid log block checksum. 
block: 2372 checkpoint no: 1 expected: 3362026715 found: 144444122/ in mysqld.1.err +FOUND 3 /InnoDB: Invalid log header checksum/ in mysqld.1.err # --innodb-force-recovery=6 (skip the entire redo log) # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=6 SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -112,7 +111,7 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS InnoDB YES Supports transactions, row-level locking, foreign keys and encryption for tables YES YES YES -FOUND 1 /\[Note\] InnoDB: .* started; log sequence number 0/ in mysqld.1.err +FOUND 1 /\[Note\] InnoDB: log sequence number 0.*; transaction id 0/ in mysqld.1.err # valid header, valid checkpoint 1, all-zero (invalid) checkpoint 2, invalid block number # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -141,8 +140,9 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Invalid log block checksum. block: 2372 checkpoint no: 1 expected: 2454333373 found: 150151/ in mysqld.1.err -FOUND 3 /\[ERROR\] InnoDB: Upgrade after a crash is not supported\. The redo log was created with MariaDB 10\.3\.1, and it appears corrupted\./ in mysqld.1.err +NOT FOUND /InnoDB: Invalid log header checksum +--source include/search_pattern_in_file.inc +let SEARCH_PATTERN=\[ERROR\] InnoDB: Upgrade after a crash is not supported\. 
The redo log was created with MariaDB 10\.3\.1, and it appears corrupted\./ in mysqld.1.err # valid header, invalid checkpoint 1, valid checkpoint 2, invalid log record # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -171,7 +171,7 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 1 -FOUND 1 /InnoDB: .* started; log sequence number 1213964; transaction id 0/ in mysqld.1.err +FOUND 1 /InnoDB: log sequence number 1213964\b.*; transaction id 0/ in mysqld.1.err # Empty 10.2 redo log # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES @@ -179,7 +179,15 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 1 -FOUND 5 /InnoDB: Upgrading redo log:/ in mysqld.1.err +FOUND 3 /InnoDB: Upgrading redo log:/ in mysqld.1.err +# Empty 10.5 redo log +# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m +SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES +WHERE engine = 'innodb' +AND support IN ('YES', 'DEFAULT', 'ENABLED'); +COUNT(*) +1 +FOUND 4 /InnoDB: Upgrading redo log:/ in mysqld.1.err # Minimal MariaDB 10.1.21 encrypted redo log # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 SELECT COUNT(*) `1` FROM INFORMATION_SCHEMA.ENGINES WHERE engine='innodb' diff --git a/mysql-test/suite/innodb/include/no_checkpoint_end.inc b/mysql-test/suite/innodb/include/no_checkpoint_end.inc index 4a00dadfd6e..61721650f32 100644 --- a/mysql-test/suite/innodb/include/no_checkpoint_end.inc +++ 
b/mysql-test/suite/innodb/include/no_checkpoint_end.inc @@ -10,12 +10,12 @@ my $cp = $ENV{CHECKPOINT_LSN}; $cp =~ s/^InnoDB\t\t//; my $log = "$ENV{MYSQLD_DATADIR}ib_logfile0"; open(LOG, "<$log") || die "Unable to open $log"; -seek(LOG, 512, 0) || die "Unable to seek $log"; -die unless read(LOG, $_, 16) == 16; -my ($no1hi,$no1lo,$cp1hi,$cp1lo) = unpack("N*", $_); -seek(LOG, 3 * 512, 0) || die "Unable to seek $log"; -die unless read(LOG, $_, 16) == 16; -my ($no2hi,$no2lo,$cp2hi,$cp2lo) = unpack("N*", $_); +seek(LOG, 4096, 0) || die "Unable to seek $log"; +die unless read(LOG, $_, 8) == 8; +my ($cp1hi,$cp1lo) = unpack("NN", $_); +seek(LOG, 8192, 0) || die "Unable to seek $log"; +die unless read(LOG, $_, 8) == 8; +my ($cp2hi,$cp2lo) = unpack("NN", $_); close(LOG); my $cp1 = $cp1hi << 32 | $cp1lo; @@ -27,8 +27,7 @@ if ($cp1 > $cp || $cp2 > $cp) { print OUT "--source include/start_mysqld.inc\n" unless $ENV{no_checkpoint_kill}; print OUT "$ENV{CLEANUP_IF_CHECKPOINT}\n"; - print OUT "--skip Extra checkpoint 1 after $cp"; - print OUT " ($no1hi:$no1lo=$cp1,$no2hi:$no2lo=$cp2)\n"; + print OUT "--skip Extra checkpoint 1 after $cp ($cp1,$cp2)\n"; } close(OUT); diff --git a/mysql-test/suite/innodb/r/innodb-wl5522-debug.result b/mysql-test/suite/innodb/r/innodb-wl5522-debug.result index 2973e5de550..4c1b35ac1e4 100644 --- a/mysql-test/suite/innodb/r/innodb-wl5522-debug.result +++ b/mysql-test/suite/innodb/r/innodb-wl5522-debug.result @@ -1,5 +1,5 @@ call mtr.add_suppression("InnoDB: Operating system error number .* in a file operation."); -call mtr.add_suppression("InnoDB: The error means the system cannot find the path specified."); +call mtr.add_suppression("InnoDB: Error number \\d+ means"); call mtr.add_suppression("InnoDB: Cannot open datafile for read-only: .*"); call mtr.add_suppression("InnoDB: Ignoring tablespace .* because it could not be opened."); call mtr.add_suppression("InnoDB: Tablespace for table .* is set as discarded."); diff --git 
a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result index 49fe8e629e3..2bbe990d7f1 100644 --- a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result +++ b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result @@ -156,9 +156,6 @@ os_data_fsyncs os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status os_pending_reads os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of reads pending os_pending_writes os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of writes pending os_log_bytes_written os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Bytes of log written (innodb_os_log_written) -os_log_fsyncs os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of fsync log writes (innodb_os_log_fsyncs) -os_log_pending_fsyncs os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of pending fsync write (innodb_os_log_pending_fsyncs) -os_log_pending_writes os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of pending log file writes (innodb_os_log_pending_writes) trx_rw_commits transaction 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of read-write transactions committed trx_ro_commits transaction 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of read-only transactions committed trx_nl_ro_commits transaction 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of non-locking auto-commit read-only transactions committed @@ -176,20 +173,17 @@ purge_undo_log_pages purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL purge_dml_delay_usec purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Microseconds DML to be delayed due to purge lagging purge_stop_count purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of times 
purge was stopped purge_resume_count purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of times purge was resumed -log_checkpoints recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of checkpoints +log_checkpoints recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of checkpoints log_lsn_last_flush recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value LSN of Last flush log_lsn_last_checkpoint recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value LSN at last checkpoint log_lsn_current recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Current LSN value log_lsn_checkpoint_age recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Current LSN value minus LSN at last checkpoint log_lsn_buf_pool_oldest recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value The oldest modified block LSN in the buffer pool log_max_modified_age_async recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Maximum LSN difference; when exceeded, start asynchronous preflush -log_pending_log_flushes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Pending log flushes -log_pending_checkpoint_writes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Pending checkpoints log_num_log_io recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of log I/Os log_waits recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of log waits due to small log buffer (innodb_log_waits) log_write_requests recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of log write requests (innodb_log_write_requests) log_writes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of log writes (innodb_log_writes) -log_padded recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL 
NULL 0 status_counter Bytes of log padded for log write ahead compress_pages_compressed compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages compressed compress_pages_decompressed compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages decompressed compression_pad_increments compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of times padding is incremented to avoid compression failures diff --git a/mysql-test/suite/innodb/r/innodb_status_variables.result b/mysql-test/suite/innodb/r/innodb_status_variables.result index da5020bbe08..02f73f37d66 100644 --- a/mysql-test/suite/innodb/r/innodb_status_variables.result +++ b/mysql-test/suite/innodb/r/innodb_status_variables.result @@ -64,9 +64,6 @@ INNODB_MASTER_THREAD_ACTIVE_LOOPS INNODB_MASTER_THREAD_IDLE_LOOPS INNODB_MAX_TRX_ID INNODB_MEM_DICTIONARY -INNODB_OS_LOG_FSYNCS -INNODB_OS_LOG_PENDING_FSYNCS -INNODB_OS_LOG_PENDING_WRITES INNODB_OS_LOG_WRITTEN INNODB_PAGE_SIZE INNODB_PAGES_CREATED diff --git a/mysql-test/suite/innodb/r/instant_alter_import.result b/mysql-test/suite/innodb/r/instant_alter_import.result index c569c65d4ce..ed3c87b4e41 100644 --- a/mysql-test/suite/innodb/r/instant_alter_import.result +++ b/mysql-test/suite/innodb/r/instant_alter_import.result @@ -1,6 +1,6 @@ call mtr.add_suppression("Operating system error number .* in a file operation."); call mtr.add_suppression("The error means the system cannot find the path specified."); -call mtr.add_suppression("File ./test/t1.ibd: 'delete' returned OS error"); +call mtr.add_suppression("File ./test/t1.ibd was not found"); set default_storage_engine=innodb; # # MDEV-18295 IMPORT TABLESPACE fails with instant-altered tables diff --git a/mysql-test/suite/innodb/r/log_corruption.result b/mysql-test/suite/innodb/r/log_corruption.result index bf92f77d30c..12e9d340bbd 100644 --- a/mysql-test/suite/innodb/r/log_corruption.result +++ 
b/mysql-test/suite/innodb/r/log_corruption.result @@ -12,14 +12,13 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err -FOUND 2 /Plugin 'InnoDB' registration as a STORAGE ENGINE failed/ in mysqld.1.err # redo log from before MariaDB 10.2.2, with corrupted log block # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err +FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err # empty redo log from before MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES @@ -35,15 +34,15 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 0 -FOUND 2 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err -# Empty multi-file redo log from before MariaDB 10.2.2 +FOUND 1 /InnoDB: Upgrade after a crash is not supported. 
This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err +# Empty multi-file redo log (wrong offset) from before MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) -1 -FOUND 2 /InnoDB: Upgrading redo log:/ in mysqld.1.err +0 +FOUND 3 /Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err # Multi-file redo log with size mismatch from after MariaDB 10.2.2 # Corrupted multi-file redo log from after MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m @@ -52,30 +51,30 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 0 -FOUND 1 /InnoDB: Log file .*ib_logfile1 is of different size 1048576 bytes than other log files 2097152 bytes!/ in mysqld.1.err -FOUND 1 /InnoDB: Upgrade after a crash is not supported\. The redo log was created with BogoDB 1\.2\.3\.4, and it appears corrupted\./ in mysqld.1.err +FOUND 3 /Upgrade after a crash is not supported. 
This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err +FOUND 1 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 0 -FOUND 2 /InnoDB: Upgrade after a crash is not supported\. The redo log was created with BogoDB 1\.2\.3\.4, and it appears corrupted\./ in mysqld.1.err +FOUND 2 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err # Empty multi-file redo log from after MariaDB 10.2.2 # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) -1 -FOUND 3 /InnoDB: Upgrading redo log:/ in mysqld.1.err +0 +FOUND 3 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err # redo log from "after" MariaDB 10.2.2, but with invalid header checksum # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Invalid redo log header checksum/ in mysqld.1.err +FOUND 1 /InnoDB: Invalid log header checksum/ in mysqld.1.err # distant future redo log format, with valid header checksum # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption 
--innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -89,14 +88,14 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: No valid checkpoint found .corrupted redo log/ in mysqld.1.err +FOUND 1 /InnoDB: No valid checkpoint was found; the log was created with malicious intentions, or perhaps\./ in mysqld.1.err # valid header, valid checkpoint 1, all-zero (invalid) checkpoint 2, invalid block checksum # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Invalid log block checksum. block: 2372 checkpoint no: 1 expected: 3362026715 found: 144444122/ in mysqld.1.err +FOUND 2 /InnoDB: Invalid log header checksum/ in mysqld.1.err FOUND 1 /InnoDB: Upgrade after a crash is not supported\. The redo log was created with malicious intentions, or perhaps, and it appears corrupted\./ in mysqld.1.err # same, but with current-version header # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 @@ -104,7 +103,7 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 2 /InnoDB: Invalid log block checksum. 
block: 2372 checkpoint no: 1 expected: 3362026715 found: 144444122/ in mysqld.1.err +FOUND 3 /InnoDB: Invalid log header checksum/ in mysqld.1.err # --innodb-force-recovery=6 (skip the entire redo log) # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=6 SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -112,7 +111,7 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS InnoDB YES Supports transactions, row-level locking, foreign keys and encryption for tables YES YES YES -FOUND 1 /\[Note\] InnoDB: .* started; log sequence number 0/ in mysqld.1.err +FOUND 1 /\[Note\] InnoDB: log sequence number 0.*; transaction id 0/ in mysqld.1.err # valid header, valid checkpoint 1, all-zero (invalid) checkpoint 2, invalid block number # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -141,8 +140,9 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /InnoDB: Invalid log block checksum. block: 2372 checkpoint no: 1 expected: 2454333373 found: 150151/ in mysqld.1.err -FOUND 3 /\[ERROR\] InnoDB: Upgrade after a crash is not supported\. The redo log was created with MariaDB 10\.3\.1, and it appears corrupted\./ in mysqld.1.err +NOT FOUND /InnoDB: Invalid log header checksum +--source include/search_pattern_in_file.inc +let SEARCH_PATTERN=\[ERROR\] InnoDB: Upgrade after a crash is not supported\. 
The redo log was created with MariaDB 10\.3\.1, and it appears corrupted\./ in mysqld.1.err # valid header, invalid checkpoint 1, valid checkpoint 2, invalid log record # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption SELECT * FROM INFORMATION_SCHEMA.ENGINES @@ -171,7 +171,7 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 1 -FOUND 1 /InnoDB: .* started; log sequence number 1213964; transaction id 0/ in mysqld.1.err +FOUND 1 /InnoDB: log sequence number 1213964\b.*; transaction id 0/ in mysqld.1.err # Empty 10.2 redo log # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES @@ -179,7 +179,15 @@ WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); COUNT(*) 1 -FOUND 5 /InnoDB: Upgrading redo log:/ in mysqld.1.err +FOUND 3 /InnoDB: Upgrading redo log:/ in mysqld.1.err +# Empty 10.5 redo log +# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m +SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES +WHERE engine = 'innodb' +AND support IN ('YES', 'DEFAULT', 'ENABLED'); +COUNT(*) +1 +FOUND 4 /InnoDB: Upgrading redo log:/ in mysqld.1.err # Minimal MariaDB 10.1.21 encrypted redo log # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 SELECT * FROM INFORMATION_SCHEMA.ENGINES diff --git a/mysql-test/suite/innodb/r/log_file.result b/mysql-test/suite/innodb/r/log_file.result index d2070d23a9a..734e9b07687 100644 --- a/mysql-test/suite/innodb/r/log_file.result +++ b/mysql-test/suite/innodb/r/log_file.result @@ -15,7 +15,7 @@ SELECT * 
FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -FOUND 1 /File .path.to.non-existent.*ib_logfile101: 'create' returned OS error \d+/ in mysqld.1.err +FOUND 1 /Cannot create /path/to/non-existent/ib_logfile101/ in mysqld.1.err # Successfully let InnoDB create tablespaces # restart: --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_file --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_file --innodb-undo-directory=MYSQLTEST_VARDIR/tmp/log_file --innodb-undo-logs=20 --innodb-undo-tablespaces=3 --innodb-data-file-path=ibdata1:16M;ibdata2:10M:autoextend SELECT COUNT(*) `1` FROM INFORMATION_SCHEMA.ENGINES @@ -255,7 +255,6 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS -InnoDB YES Supports transactions, row-level locking, foreign keys and encryption for tables YES YES YES bak_ib_logfile0 bak_ibdata1 bak_ibdata2 @@ -263,7 +262,6 @@ bak_undo001 bak_undo002 bak_undo003 ib_buffer_pool -ib_logfile0 ibdata1 ibdata2 undo001 diff --git a/mysql-test/suite/innodb/r/log_file_name_debug.result b/mysql-test/suite/innodb/r/log_file_name_debug.result index cb2ee68fc98..0058a305185 100644 --- a/mysql-test/suite/innodb/r/log_file_name_debug.result +++ b/mysql-test/suite/innodb/r/log_file_name_debug.result @@ -9,7 +9,7 @@ CREATE TABLE t1(a INT PRIMARY KEY) ENGINE=InnoDB; SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' FOUND 1 /InnoDB: Tablespace 4294967280 was not found at .*, but there were no modifications either/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_3,ib_log --innodb-log-file-size=4194304 +# restart: --debug=d,innodb_log_abort_5,ib_log --innodb-log-file-size=4194304 SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' FOUND 1 /ib_log: FILE_CHECKPOINT.* written/ in mysqld.1.err diff --git 
a/mysql-test/suite/innodb/r/log_file_size.result b/mysql-test/suite/innodb/r/log_file_size.result index 1c98dc4bbf2..47ca20af2f4 100644 --- a/mysql-test/suite/innodb/r/log_file_size.result +++ b/mysql-test/suite/innodb/r/log_file_size.result @@ -35,61 +35,32 @@ FOUND 1 /syntax error in innodb_log_group_home_dir/ in mysqld.1.err SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' FOUND 1 /InnoDB: Starting crash recovery from checkpoint LSN=.*/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_3 -SELECT * FROM t1; -ERROR 42000: Unknown storage engine 'InnoDB' # restart: --innodb-read-only SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' FOUND 1 /InnoDB: innodb_read_only prevents crash recovery/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_4 -SELECT * FROM t1; -ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 5 /redo log from [1-9][0-9.]+[KMGT]iB to [1-9][0-9.]+[KMGT]iB/ in mysqld.1.err # restart: --debug=d,innodb_log_abort_5 SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 6 /redo log from [1-9][0-9.]+[KMGT]iB to [1-9][0-9.]+[KMGT]iB/ in mysqld.1.err +FOUND 1 /redo log from 5\.000MiB to [0-9.]*[KMGT]iB/ in mysqld.1.err # restart: --innodb-read-only SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' FOUND 2 /InnoDB: innodb_read_only prevents crash recovery/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_6 +# restart SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 7 /redo log from [1-9][0-9.]+[KMGT]iB to [1-9][0-9.]+[KMGT]iB/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_7 +FOUND 1 /InnoDB: File .*ib_logfile0 is too small/ in mysqld.1.err +# restart SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' -# restart: --innodb-read-only -SELECT * FROM t1; -ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 1 /InnoDB: Cannot create log file in read-only mode/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_8 -SELECT * FROM t1; -ERROR 42000: 
Unknown storage engine 'InnoDB' -FOUND 1 /InnoDB: Setting log file .*ib_logfile[0-9]+ size to/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_9 -SELECT * FROM t1; -ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 1 /InnoDB: Setting log file .*ib_logfile[0-9]+ size to/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_9 -SELECT * FROM t1; -ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 1 /InnoDB: Log file .*ib_logfile0 size 7 is not a multiple of 512 bytes/ in mysqld.1.err -# restart: --debug=d,innodb_log_abort_9 -SELECT * FROM t1; -a -42 -123 -# restart: --debug=d,innodb_log_abort_10 -SELECT * FROM t1; -ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 1 /InnoDB: Setting log file .*ib_logfile[0-9]+ size to/ in mysqld.1.err -FOUND 1 /InnoDB: Renaming log file .*ib_logfile101 to .*ib_logfile0/ in mysqld.1.err +FOUND 1 /InnoDB: Expecting only ib_logfile0/ in mysqld.1.err +# restart +FOUND 1 /InnoDB: File .*ib_logfile0 was not found/ in mysqld.1.err # restart SELECT * FROM t1; a 42 123 DROP TABLE t1; +FOUND 2 /InnoDB: Resizing redo log from 5\.000MiB to [0-9.]*[KMGT]iB; LSN=\d+\b/ in mysqld.1.err diff --git a/mysql-test/suite/innodb/r/monitor.result b/mysql-test/suite/innodb/r/monitor.result index 2e5e5e2241a..0424ea51238 100644 --- a/mysql-test/suite/innodb/r/monitor.result +++ b/mysql-test/suite/innodb/r/monitor.result @@ -122,9 +122,6 @@ os_data_fsyncs disabled os_pending_reads disabled os_pending_writes disabled os_log_bytes_written disabled -os_log_fsyncs disabled -os_log_pending_fsyncs disabled -os_log_pending_writes disabled trx_rw_commits disabled trx_ro_commits disabled trx_nl_ro_commits disabled @@ -149,13 +146,10 @@ log_lsn_current disabled log_lsn_checkpoint_age disabled log_lsn_buf_pool_oldest disabled log_max_modified_age_async disabled -log_pending_log_flushes disabled -log_pending_checkpoint_writes disabled log_num_log_io disabled log_waits disabled log_write_requests disabled log_writes disabled -log_padded disabled 
compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled @@ -284,9 +278,6 @@ os_data_fsyncs enabled os_pending_reads enabled os_pending_writes enabled os_log_bytes_written disabled -os_log_fsyncs disabled -os_log_pending_fsyncs enabled -os_log_pending_writes enabled set global innodb_monitor_enable=""; ERROR 42000: Variable 'innodb_monitor_enable' can't be set to the value of '' set global innodb_monitor_enable="_"; diff --git a/mysql-test/suite/innodb/r/rename_table.result b/mysql-test/suite/innodb/r/rename_table.result index 8c3722c7940..0ed56005e21 100644 --- a/mysql-test/suite/innodb/r/rename_table.result +++ b/mysql-test/suite/innodb/r/rename_table.result @@ -21,7 +21,7 @@ path DROP DATABASE abc_def; # restart DROP DATABASE abc_def2; -call mtr.add_suppression("InnoDB: (Operating system error|The error means|Cannot rename file)"); +call mtr.add_suppression("InnoDB: (Operating system error|Error number \\d+ means|Cannot rename file)"); CREATE TABLE t1 (a INT) ENGINE=InnoDB; RENAME TABLE t1 TO non_existing_db.t1; ERROR HY000: Error on rename of './test/t1' to './non_existing_db/t1' (errno: 168 "Unknown (generic) error from engine") diff --git a/mysql-test/suite/innodb/r/truncate_missing.result b/mysql-test/suite/innodb/r/truncate_missing.result index 1cc654f0d7e..b7e514b172b 100644 --- a/mysql-test/suite/innodb/r/truncate_missing.result +++ b/mysql-test/suite/innodb/r/truncate_missing.result @@ -1,5 +1,5 @@ call mtr.add_suppression("InnoDB: Operating system error number "); -call mtr.add_suppression("InnoDB: (The error means|If you are|Cannot open datafile) "); +call mtr.add_suppression("InnoDB: (Error number \\d+ means|If you are|Cannot open datafile) "); call mtr.add_suppression("InnoDB: Ignoring tablespace for test/t "); call mtr.add_suppression("InnoDB: Table test/t .* does not exist"); CREATE TABLE t (a SERIAL) ENGINE=InnoDB; diff --git a/mysql-test/suite/innodb/t/alter_missing_tablespace.test 
b/mysql-test/suite/innodb/t/alter_missing_tablespace.test index bf7111509bd..5c7f63eb813 100644 --- a/mysql-test/suite/innodb/t/alter_missing_tablespace.test +++ b/mysql-test/suite/innodb/t/alter_missing_tablespace.test @@ -9,7 +9,7 @@ --disable_query_log call mtr.add_suppression("InnoDB: Cannot open datafile for read-only: "); call mtr.add_suppression("InnoDB: Operating system error number .* in a file operation"); -call mtr.add_suppression("InnoDB: The error means the system cannot find the path specified"); +call mtr.add_suppression("InnoDB: Error number \\d+ means"); call mtr.add_suppression("InnoDB: Ignoring tablespace for test/\(t\|x@002e@002ed\) because it could not be opened"); call mtr.add_suppression("InnoDB: Cannot calculate statistics for table .* because the .ibd file is missing"); call mtr.add_suppression("Could not find a valid tablespace file for"); diff --git a/mysql-test/suite/innodb/t/innodb-index-online.opt b/mysql-test/suite/innodb/t/innodb-index-online.opt index ff20edbe2f7..1837463f07a 100644 --- a/mysql-test/suite/innodb/t/innodb-index-online.opt +++ b/mysql-test/suite/innodb/t/innodb-index-online.opt @@ -1,6 +1,5 @@ --loose-innodb-sort-buffer-size=64k --loose-innodb-online-alter-log-max-size=128k --loose-innodb-buffer-pool-size=5M ---loose-innodb-log-buffer-size=256k --loose-innodb-sys-indexes --loose-innodb-sys-fields diff --git a/mysql-test/suite/innodb/t/innodb-table-online-master.opt b/mysql-test/suite/innodb/t/innodb-table-online-master.opt index 92eea2b0d2e..1eafb5ac188 100644 --- a/mysql-test/suite/innodb/t/innodb-table-online-master.opt +++ b/mysql-test/suite/innodb/t/innodb-table-online-master.opt @@ -1 +1 @@ ---innodb-sort-buffer-size=64k --innodb-online-alter-log-max-size=512k --innodb-buffer-pool-size=5M --innodb-log-buffer-size=256k +--innodb-sort-buffer-size=64k --innodb-online-alter-log-max-size=512k --innodb-buffer-pool-size=5M diff --git a/mysql-test/suite/innodb/t/innodb-wl5522-debug.test 
b/mysql-test/suite/innodb/t/innodb-wl5522-debug.test index b460cba9322..e7a39f23e11 100644 --- a/mysql-test/suite/innodb/t/innodb-wl5522-debug.test +++ b/mysql-test/suite/innodb/t/innodb-wl5522-debug.test @@ -19,7 +19,7 @@ let $restart_noprint=2; call mtr.add_suppression("InnoDB: Operating system error number .* in a file operation."); -call mtr.add_suppression("InnoDB: The error means the system cannot find the path specified."); +call mtr.add_suppression("InnoDB: Error number \\d+ means"); call mtr.add_suppression("InnoDB: Cannot open datafile for read-only: .*"); call mtr.add_suppression("InnoDB: Ignoring tablespace .* because it could not be opened."); call mtr.add_suppression("InnoDB: Tablespace for table .* is set as discarded."); diff --git a/mysql-test/suite/innodb/t/instant_alter_import.test b/mysql-test/suite/innodb/t/instant_alter_import.test index 99ae48ba815..854b5b3a953 100644 --- a/mysql-test/suite/innodb/t/instant_alter_import.test +++ b/mysql-test/suite/innodb/t/instant_alter_import.test @@ -4,7 +4,7 @@ call mtr.add_suppression("Operating system error number .* in a file operation."); call mtr.add_suppression("The error means the system cannot find the path specified."); -call mtr.add_suppression("File ./test/t1.ibd: 'delete' returned OS error"); +call mtr.add_suppression("File ./test/t1.ibd was not found"); set default_storage_engine=innodb; diff --git a/mysql-test/suite/innodb/t/log_corruption.test b/mysql-test/suite/innodb/t/log_corruption.test index 4e9ea9fa698..dad5f67afca 100644 --- a/mysql-test/suite/innodb/t/log_corruption.test +++ b/mysql-test/suite/innodb/t/log_corruption.test @@ -7,8 +7,8 @@ call mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); call mtr.add_suppression("InnoDB: Unsupported redo log format"); -call mtr.add_suppression("InnoDB: No valid checkpoint 
found"); -call mtr.add_suppression("InnoDB: Invalid (log block|redo log header) checksum"); +call mtr.add_suppression("InnoDB: No valid checkpoint was found"); +call mtr.add_suppression("InnoDB: Invalid log header checksum"); call mtr.add_suppression("InnoDB: Missing MLOG_CHECKPOINT"); call mtr.add_suppression("InnoDB: MLOG_FILE_NAME incorrect"); call mtr.add_suppression("InnoDB: ############### CORRUPT LOG RECORD FOUND"); @@ -16,8 +16,7 @@ call mtr.add_suppression("InnoDB: Log scan aborted at LSN"); call mtr.add_suppression("InnoDB: Missing MLOG_FILE_NAME or MLOG_FILE_DELETE before MLOG_CHECKPOINT for tablespace 42\\r?$"); call mtr.add_suppression("InnoDB: Obtaining redo log encryption key version 1 failed"); call mtr.add_suppression("InnoDB: Decrypting checkpoint failed"); -call mtr.add_suppression("InnoDB: Are you sure you are using the right ib_logfile0 to start up the database\\? The checkpoint is 1213964,"); -call mtr.add_suppression("InnoDB: Log file .*ib_logfile1 is of different size 1048576 bytes than other log files 2097152 bytes!"); +call mtr.add_suppression("InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 1048576 bytes!"); --enable_query_log let bugdir= $MYSQLTEST_VARDIR/tmp/log_corruption; @@ -134,8 +133,6 @@ eval $check_no_innodb; let SEARCH_PATTERN=InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\\.2\\.2, and we did not find a valid checkpoint; --source include/search_pattern_in_file.inc -let SEARCH_PATTERN=Plugin 'InnoDB' registration as a STORAGE ENGINE failed; ---source include/search_pattern_in_file.inc --echo # redo log from before MariaDB 10.2.2, with corrupted log block --remove_file $bugdir/ib_logfile0 @@ -154,7 +151,6 @@ EOF --source include/start_mysqld.inc eval $check_no_innodb; --source include/shutdown_mysqld.inc -let SEARCH_PATTERN=InnoDB: Upgrade after a crash is not supported. 
This redo log was created before MariaDB 10\\.2\\.2, and it appears corrupted; --source include/search_pattern_in_file.inc --echo # empty redo log from before MariaDB 10.2.2 @@ -204,7 +200,7 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED'); let SEARCH_PATTERN=InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\\.2\\.2, and it appears corrupted; --source include/search_pattern_in_file.inc ---echo # Empty multi-file redo log from before MariaDB 10.2.2 +--echo # Empty multi-file redo log (wrong offset) from before MariaDB 10.2.2 perl; die unless open OUT, "+<", "$ENV{bugdir}/ib_logfile1"; binmode OUT; @@ -220,7 +216,7 @@ SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); --source include/shutdown_mysqld.inc ---let SEARCH_PATTERN= InnoDB: Upgrading redo log: +--let SEARCH_PATTERN= Upgrade after a crash is not supported. This redo log was created before MariaDB 10\\.2\\.2, and we did not find a valid checkpoint\\. --source include/search_pattern_in_file.inc --echo # Multi-file redo log with size mismatch from after MariaDB 10.2.2 @@ -251,9 +247,8 @@ SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); --source include/shutdown_mysqld.inc ---let SEARCH_PATTERN=InnoDB: Log file .*ib_logfile1 is of different size 1048576 bytes than other log files 2097152 bytes! --source include/search_pattern_in_file.inc ---let SEARCH_PATTERN=InnoDB: Upgrade after a crash is not supported\\. The redo log was created with BogoDB 1\\.2\\.3\\.4, and it appears corrupted\\. +--let SEARCH_PATTERN=InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\\.2\\.3\\.4\\. 
--source include/search_pattern_in_file.inc perl; @@ -270,7 +265,8 @@ SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); --source include/shutdown_mysqld.inc ---let SEARCH_PATTERN=InnoDB: Upgrade after a crash is not supported\\. The redo log was created with BogoDB 1\\.2\\.3\\.4, and it appears corrupted\\. +--let SEARCH_PATTERN=InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 1048576 bytes! +--let SEARCH_PATTERN=InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\\.2\\.3\\.4\\. --source include/search_pattern_in_file.inc --echo # Empty multi-file redo log from after MariaDB 10.2.2 @@ -292,7 +288,6 @@ SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); --source include/shutdown_mysqld.inc ---let SEARCH_PATTERN= InnoDB: Upgrading redo log: --source include/search_pattern_in_file.inc --let $restart_parameters= $dirs @@ -309,7 +304,7 @@ EOF --source include/start_mysqld.inc eval $check_no_innodb; --source include/shutdown_mysqld.inc -let SEARCH_PATTERN=InnoDB: Invalid redo log header checksum; +let SEARCH_PATTERN=InnoDB: Invalid log header checksum; --source include/search_pattern_in_file.inc --echo # distant future redo log format, with valid header checksum @@ -345,7 +340,7 @@ EOF --source include/start_mysqld.inc eval $check_no_innodb; --source include/shutdown_mysqld.inc -let SEARCH_PATTERN=InnoDB: No valid checkpoint found .corrupted redo log; +--let SEARCH_PATTERN=InnoDB: No valid checkpoint was found; the log was created with malicious intentions, or perhaps\\. --source include/search_pattern_in_file.inc --echo # valid header, valid checkpoint 1, all-zero (invalid) checkpoint 2, invalid block checksum @@ -363,7 +358,7 @@ EOF --source include/start_mysqld.inc eval $check_no_innodb; --source include/shutdown_mysqld.inc -let SEARCH_PATTERN=InnoDB: Invalid log block checksum. 
block: 2372 checkpoint no: 1 expected: 3362026715 found: 144444122; +let SEARCH_PATTERN=InnoDB: Invalid log header checksum; --source include/search_pattern_in_file.inc let SEARCH_PATTERN=InnoDB: Upgrade after a crash is not supported\. The redo log was created with malicious intentions, or perhaps, and it appears corrupted\.; --source include/search_pattern_in_file.inc @@ -380,14 +375,14 @@ EOF eval $check_no_innodb; --source include/shutdown_mysqld.inc -let SEARCH_PATTERN=InnoDB: Invalid log block checksum. block: 2372 checkpoint no: 1 expected: 3362026715 found: 144444122; +let SEARCH_PATTERN=InnoDB: Invalid log header checksum; --source include/search_pattern_in_file.inc --echo # --innodb-force-recovery=6 (skip the entire redo log) --let $restart_parameters= $dirs --innodb-force-recovery=6 --source include/start_mysqld.inc eval $check_no_innodb; --source include/shutdown_mysqld.inc ---let SEARCH_PATTERN=\\[Note\\] InnoDB: .* started; log sequence number 0 +--let SEARCH_PATTERN=\\[Note\\] InnoDB: log sequence number 0.*; transaction id 0 --source include/search_pattern_in_file.inc --echo # valid header, valid checkpoint 1, all-zero (invalid) checkpoint 2, invalid block number @@ -464,7 +459,7 @@ EOF --source include/start_mysqld.inc eval $check_no_innodb; --source include/shutdown_mysqld.inc -let SEARCH_PATTERN=InnoDB: Invalid log block checksum. block: 2372 checkpoint no: 1 expected: 2454333373 found: 150151; +let SEARCH_PATTERN=InnoDB: Invalid log header checksum --source include/search_pattern_in_file.inc let SEARCH_PATTERN=\\[ERROR\\] InnoDB: Upgrade after a crash is not supported\. 
The redo log was created with MariaDB 10\.3\.1, and it appears corrupted\.; --source include/search_pattern_in_file.inc @@ -544,7 +539,7 @@ SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); --source include/shutdown_mysqld.inc ---let SEARCH_PATTERN= InnoDB: .* started; log sequence number 1213964; transaction id 0 +--let SEARCH_PATTERN= InnoDB: log sequence number 1213964\\b.*; transaction id 0 --source include/search_pattern_in_file.inc --echo # Empty 10.2 redo log @@ -577,6 +572,33 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED'); --let SEARCH_PATTERN= InnoDB: Upgrading redo log: --source include/search_pattern_in_file.inc +--echo # Empty 10.5 redo log +perl; +die unless open OUT, "+<", "$ENV{bugdir}/ib_logfile0"; +binmode OUT; +# header block +print OUT pack("Nx[5]nx[5]", 0x50485953, 0x1286); +print OUT "ibbackup was here!!!1!"; +print OUT pack("x[470]N", 0x677700cf); +# invalid (all-zero) checkpoint page 1 and an empty log page +print OUT chr(0) x 1024; +# valid checkpoint block 2 +print OUT pack("x[12]NNNx[264]", 0x12860c, 0, 0x80c); +# pointer to the FILE_CHECKPOINT record, and checkpoint page checksum +print OUT pack("H*x[204]NNN", "590DBAACFE922582", 0x128612, 0, 0x101741b); +# log page +print OUT pack("NnnNx[496]N", 0x80000944, 12, 12, 1, 0x46c8a2a2); +close OUT or die; +EOF + +--source include/start_mysqld.inc +SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES +WHERE engine = 'innodb' +AND support IN ('YES', 'DEFAULT', 'ENABLED'); +--source include/shutdown_mysqld.inc +--let SEARCH_PATTERN= InnoDB: Upgrading redo log: +--source include/search_pattern_in_file.inc + --echo # Minimal MariaDB 10.1.21 encrypted redo log perl; die unless open OUT, "+<", "$ENV{bugdir}/ib_logfile0"; diff --git a/mysql-test/suite/innodb/t/log_file.test b/mysql-test/suite/innodb/t/log_file.test index b33c680f02f..0f26622e2b6 100644 --- a/mysql-test/suite/innodb/t/log_file.test +++ 
b/mysql-test/suite/innodb/t/log_file.test @@ -11,7 +11,7 @@ call mtr.add_suppression("Plugin 'InnoDB' init function returned error"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); call mtr.add_suppression("InnoDB: Operating system error number \d+ in a file operation"); call mtr.add_suppression("InnoDB: The error means the system cannot find the path specified"); -call mtr.add_suppression("InnoDB: File .path.to.non-existent.ib_logfile101: 'create' returned OS error \d+"); +call mtr.add_suppression("InnoDB: File /path/to/non-existent/ib_logfile101 was not found"); call mtr.add_suppression("InnoDB: Cannot create .path.to.non-existent.ib_logfile101"); call mtr.add_suppression("InnoDB: The data file '.*ibdata1' was not found but one of the other data files '.*ibdata2' exists"); call mtr.add_suppression("InnoDB: Tablespace size stored in header is \d+ pages, but the sum of data file sizes is \d+ pages"); @@ -61,7 +61,7 @@ let SEARCH_PATTERN=\[ERROR\] InnoDB: Could not create undo tablespace '.*undo002 --source include/start_mysqld.inc eval $check_no_innodb; --source include/shutdown_mysqld.inc -let SEARCH_PATTERN=File .path.to.non-existent.*ib_logfile101: 'create' returned OS error \d+; +let SEARCH_PATTERN=Cannot create /path/to/non-existent/ib_logfile101; --source include/search_pattern_in_file.inc --list_files $bugdir diff --git a/mysql-test/suite/innodb/t/log_file_name.test b/mysql-test/suite/innodb/t/log_file_name.test index 895e945f4e3..b0935c90ca3 100644 --- a/mysql-test/suite/innodb/t/log_file_name.test +++ b/mysql-test/suite/innodb/t/log_file_name.test @@ -146,7 +146,7 @@ DROP TABLE t0; --disable_query_log # The following are for the orphan file t0.ibd or for the directory t2.ibd: call mtr.add_suppression("InnoDB: Operating system error number [0-9]* in a file operation"); -call mtr.add_suppression("InnoDB: Error number [0-9]* means '(File exists|Is a directory)'"); +call mtr.add_suppression("InnoDB: Error number \\d+ 
means"); call mtr.add_suppression("InnoDB: Cannot create file '.*t0.ibd'"); call mtr.add_suppression("InnoDB: The file '.*t0\.ibd' already exists"); call mtr.add_suppression("InnoDB: Cannot open datafile for read-write: '.*t2\.ibd'"); diff --git a/mysql-test/suite/innodb/t/log_file_name_debug.test b/mysql-test/suite/innodb/t/log_file_name_debug.test index d90be6d8916..9ef3c9ff4dc 100644 --- a/mysql-test/suite/innodb/t/log_file_name_debug.test +++ b/mysql-test/suite/innodb/t/log_file_name_debug.test @@ -35,7 +35,7 @@ SELECT * FROM t1; --let SEARCH_PATTERN = InnoDB: Tablespace 4294967280 was not found at .*, but there were no modifications either --source include/search_pattern_in_file.inc ---let $restart_parameters= --debug=d,innodb_log_abort_3,ib_log $resize +--let $restart_parameters= --debug=d,innodb_log_abort_5,ib_log $resize --source include/restart_mysqld.inc --error ER_UNKNOWN_STORAGE_ENGINE SELECT * FROM t1; diff --git a/mysql-test/suite/innodb/t/log_file_size.test b/mysql-test/suite/innodb/t/log_file_size.test index a8bec8a7c21..22838475c60 100644 --- a/mysql-test/suite/innodb/t/log_file_size.test +++ b/mysql-test/suite/innodb/t/log_file_size.test @@ -7,31 +7,50 @@ # This test is slow on buildbot. --source include/big_test.inc -if (`SELECT @@innodb_log_file_size = 1048576`) { - --skip Test requires innodb_log_file_size>1M. 
-} - --disable_query_log -call mtr.add_suppression("InnoDB: The log sequence numbers [0-9]+ and [0-9]+ in ibdata file do not match the log sequence number [0-9]+ in the ib_logfile"); call mtr.add_suppression("syntax error in innodb_log_group_home_dir"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); call mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("InnoDB: innodb_read_only prevents crash recovery"); -call mtr.add_suppression("InnoDB: Are you sure you are using the right ib_logfile"); -call mtr.add_suppression("InnoDB: Cannot (create|resize) log file in read-only mode"); -call mtr.add_suppression("InnoDB: Can't initiate database recovery, running in read-only-mode"); -call mtr.add_suppression("InnoDB: Log file .*ib_logfile0.* size"); -call mtr.add_suppression("InnoDB: Unable to open .*ib_logfile0. to check native AIO read support"); +call mtr.add_suppression("InnoDB: Log file .*ib_logfile1.* size"); +call mtr.add_suppression("InnoDB: File .*ib_logfile0 (is too small|was not found)"); +call mtr.add_suppression("InnoDB: Expecting only ib_logfile0"); FLUSH TABLES; --enable_query_log let MYSQLD_DATADIR= `select @@datadir`; CREATE TABLE t1(a INT PRIMARY KEY) ENGINE=InnoDB; --source include/shutdown_mysqld.inc ---move_file $MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile.old -write_file $MYSQLD_DATADIR/ib_logfile0; +perl; +do "$ENV{MTR_SUITE_DIR}/include/crc32.pl"; +my $file = "$ENV{MYSQLD_DATADIR}ib_logfile0"; +open(FILE, "<$file") || die "Unable to open $file\n"; +seek(FILE, 4096, 0) || die "Unable to seek $file\n"; +die unless read(FILE, $_, 8) == 8; +my ($lsn_hi,$lsn_lo) = unpack("NN", $_); +seek(FILE, 8192, 0) || die "Unable to seek $file\n"; +die unless read(FILE, $_, 8) == 8; +my ($cp2hi,$cp2lo) = unpack("NN", $_); +if ($cp2hi < $lsn_hi) {} +elsif ($cp2hi > $lsn_hi || $cp2lo > $lsn_lo) +{ 
$lsn_hi=$cp2hi;$lsn_lo=$cp2lo; } +close(FILE); +open(FILE, ">", $file) or die "Unable to open $file\n"; +binmode FILE; +my $polynomial = 0x82f63b78; # CRC-32C +my ($header, $checkpoint, $log); +$header = "Phys" . pack("x[4]NN", $lsn_hi, $lsn_lo) . + "some Perl code" . pack("x[478]"); +$header .= pack("Nx[3584]", mycrc32($header, 0, $polynomial)); +$checkpoint = pack("NNNNx[44]", $lsn_hi, $lsn_lo, $lsn_hi, $lsn_lo); +$checkpoint .= pack("Nx[8128]", mycrc32($checkpoint, 0, $polynomial)); +$log = pack("CxxNN", 0xfa, $lsn_hi, $lsn_lo); +$log .= pack("CN", 1, mycrc32($log, 0, $polynomial)); +print FILE $header, $checkpoint, $log; +close(FILE) or die "Unable to close $file\n"; EOF + let $check_no_innodb=SELECT * FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); @@ -95,11 +114,6 @@ SELECT * FROM t1; let SEARCH_PATTERN= InnoDB: Starting crash recovery from checkpoint LSN=.*; --source include/search_pattern_in_file.inc ---let $restart_parameters= --debug=d,innodb_log_abort_3 ---source include/restart_mysqld.inc ---error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; - --let $restart_parameters= --innodb-read-only --source include/restart_mysqld.inc @@ -108,17 +122,11 @@ SELECT * FROM t1; let SEARCH_PATTERN= InnoDB: innodb_read_only prevents crash recovery; --source include/search_pattern_in_file.inc ---let $restart_parameters= --debug=d,innodb_log_abort_4 ---source include/restart_mysqld.inc ---error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; -let SEARCH_PATTERN= redo log from [1-9][0-9.]+[KMGT]iB to [1-9][0-9.]+[KMGT]iB; ---source include/search_pattern_in_file.inc - --let $restart_parameters= --debug=d,innodb_log_abort_5 --source include/restart_mysqld.inc --error ER_UNKNOWN_STORAGE_ENGINE SELECT * FROM t1; +let SEARCH_PATTERN= redo log from 5\\.000MiB to [0-9.]*[KMGT]iB; --source include/search_pattern_in_file.inc --let $restart_parameters= --innodb-read-only @@ -128,87 +136,40 @@ SELECT * FROM t1; let SEARCH_PATTERN= 
InnoDB: innodb_read_only prevents crash recovery; --source include/search_pattern_in_file.inc ---let $restart_parameters= --debug=d,innodb_log_abort_6 ---source include/restart_mysqld.inc ---error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; - -let SEARCH_PATTERN= redo log from [1-9][0-9.]+[KMGT]iB to [1-9][0-9.]+[KMGT]iB; ---source include/search_pattern_in_file.inc - ---let $restart_parameters= --debug=d,innodb_log_abort_7 ---source include/restart_mysqld.inc ---error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; - -# this aborts right after deleting all log files - ---let $restart_parameters= --innodb-read-only ---source include/restart_mysqld.inc ---error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; - -let SEARCH_PATTERN= InnoDB: Cannot create log file in read-only mode; ---source include/search_pattern_in_file.inc - ---let $restart_parameters= --debug=d,innodb_log_abort_8 ---source include/restart_mysqld.inc ---error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; - -let SEARCH_PATTERN= InnoDB: Setting log file .*ib_logfile[0-9]+ size to; ---source include/search_pattern_in_file.inc - ---let $restart_parameters= --debug=d,innodb_log_abort_9 ---source include/restart_mysqld.inc ---error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; - -let SEARCH_PATTERN= InnoDB: Setting log file .*ib_logfile[0-9]+ size to; ---source include/search_pattern_in_file.inc ---source include/shutdown_mysqld.inc - -# We should have perfectly synced files here. # Trigger an error in recovery. 
-perl; -die unless open(FILE, ">$ENV{MYSQLD_DATADIR}/ib_logfile0"); -print FILE "garbage"; -close(FILE); +--move_file $MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile101 +--write_file $MYSQLD_DATADIR/ib_logfile0 +garbage EOF ---source include/start_mysqld.inc +--let $restart_parameters= +--source include/restart_mysqld.inc --error ER_UNKNOWN_STORAGE_ENGINE SELECT * FROM t1; -let SEARCH_PATTERN= InnoDB: Log file .*ib_logfile0 size 7 is not a multiple of 512 bytes; +let SEARCH_PATTERN= InnoDB: File .*ib_logfile0 is too small; --source include/search_pattern_in_file.inc ---remove_file $MYSQLD_DATADIR/ib_logfile0 +--move_file $MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile1 --move_file $MYSQLD_DATADIR/ib_logfile101 $MYSQLD_DATADIR/ib_logfile0 -perl; -die unless open(FILE, ">$ENV{MYSQLD_DATADIR}/ib_logfile1"); -print FILE "junkfill" x 131072; -close(FILE); -EOF - --source include/restart_mysqld.inc +--error ER_UNKNOWN_STORAGE_ENGINE SELECT * FROM t1; +let SEARCH_PATTERN= InnoDB: Expecting only ib_logfile0; +--source include/search_pattern_in_file.inc --remove_file $MYSQLD_DATADIR/ib_logfile1 --move_file $MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile101 ---let $restart_parameters= --debug=d,innodb_log_abort_10 --source include/restart_mysqld.inc --error ER_UNKNOWN_STORAGE_ENGINE -SELECT * FROM t1; - -let SEARCH_PATTERN= InnoDB: Setting log file .*ib_logfile[0-9]+ size to; ---source include/search_pattern_in_file.inc -let SEARCH_PATTERN= InnoDB: Renaming log file .*ib_logfile101 to .*ib_logfile0; +let SEARCH_PATTERN= InnoDB: File .*ib_logfile0 was not found; --source include/search_pattern_in_file.inc +--move_file $MYSQLD_DATADIR/ib_logfile101 $MYSQLD_DATADIR/ib_logfile0 ---let $restart_parameters= --source include/restart_mysqld.inc - SELECT * FROM t1; DROP TABLE t1; + +--let SEARCH_PATTERN= InnoDB: Resizing redo log from 5\\.000MiB to [0-9.]*[KMGT]iB; LSN=\\d+\\b +--source include/search_pattern_in_file.inc diff --git 
a/mysql-test/suite/innodb/t/missing_tablespaces.test b/mysql-test/suite/innodb/t/missing_tablespaces.test index 8dc325b3356..9f970ca2dd2 100644 --- a/mysql-test/suite/innodb/t/missing_tablespaces.test +++ b/mysql-test/suite/innodb/t/missing_tablespaces.test @@ -25,7 +25,7 @@ let $restart_noprint=2; --disable_query_log call mtr.add_suppression("\\[ERROR\\] InnoDB: Operating system error number 2 in a file operation."); -call mtr.add_suppression("\\[ERROR\\] InnoDB: The error means the system cannot find the path specified."); +call mtr.add_suppression("\\[ERROR\\] InnoDB: Error number \\d+ means"); call mtr.add_suppression("\\[ERROR\\] InnoDB: Cannot open datafile for read-only"); call mtr.add_suppression("\\[Warning\\] InnoDB: Ignoring tablespace .* because it could not be opened"); --enable_query_log diff --git a/mysql-test/suite/innodb/t/rename_table.test b/mysql-test/suite/innodb/t/rename_table.test index 35421f0ce7a..654f8809b22 100644 --- a/mysql-test/suite/innodb/t/rename_table.test +++ b/mysql-test/suite/innodb/t/rename_table.test @@ -32,7 +32,7 @@ DROP DATABASE abc_def; DROP DATABASE abc_def2; -call mtr.add_suppression("InnoDB: (Operating system error|The error means|Cannot rename file)"); +call mtr.add_suppression("InnoDB: (Operating system error|Error number \\d+ means|Cannot rename file)"); CREATE TABLE t1 (a INT) ENGINE=InnoDB; --replace_result "\\" "/" diff --git a/mysql-test/suite/innodb/t/truncate_missing.test b/mysql-test/suite/innodb/t/truncate_missing.test index fb6bd678237..d36a2de5cd9 100644 --- a/mysql-test/suite/innodb/t/truncate_missing.test +++ b/mysql-test/suite/innodb/t/truncate_missing.test @@ -2,7 +2,7 @@ --source include/not_embedded.inc call mtr.add_suppression("InnoDB: Operating system error number "); -call mtr.add_suppression("InnoDB: (The error means|If you are|Cannot open datafile) "); +call mtr.add_suppression("InnoDB: (Error number \\d+ means|If you are|Cannot open datafile) "); call mtr.add_suppression("InnoDB: Ignoring 
tablespace for test/t "); call mtr.add_suppression("InnoDB: Table test/t .* does not exist"); diff --git a/mysql-test/suite/innodb_zip/r/wl5522_debug_zip.result b/mysql-test/suite/innodb_zip/r/wl5522_debug_zip.result index c69c30c5b25..86e56b4904b 100644 --- a/mysql-test/suite/innodb_zip/r/wl5522_debug_zip.result +++ b/mysql-test/suite/innodb_zip/r/wl5522_debug_zip.result @@ -5,7 +5,7 @@ call mtr.add_suppression("InnoDB: Page for tablespace "); call mtr.add_suppression("InnoDB: Invalid FSP_SPACE_FLAGS=0x"); call mtr.add_suppression("InnoDB: Unknown index id .* on page"); call mtr.add_suppression("InnoDB: Operating system error number"); -call mtr.add_suppression("InnoDB: The error means"); +call mtr.add_suppression("InnoDB: Error number \\d+ means"); call mtr.add_suppression("InnoDB: Cannot open datafile .*t1\\.ibd"); call mtr.add_suppression("InnoDB: Ignoring tablespace for test/t1 "); call mtr.add_suppression("InnoDB: Cannot save statistics for table `test`\\.`t1` because the \\.ibd file is missing"); diff --git a/mysql-test/suite/innodb_zip/t/wl5522_debug_zip.test b/mysql-test/suite/innodb_zip/t/wl5522_debug_zip.test index 8d328dea576..1f436fafeb5 100644 --- a/mysql-test/suite/innodb_zip/t/wl5522_debug_zip.test +++ b/mysql-test/suite/innodb_zip/t/wl5522_debug_zip.test @@ -21,7 +21,7 @@ call mtr.add_suppression("InnoDB: Page for tablespace "); call mtr.add_suppression("InnoDB: Invalid FSP_SPACE_FLAGS=0x"); call mtr.add_suppression("InnoDB: Unknown index id .* on page"); call mtr.add_suppression("InnoDB: Operating system error number"); -call mtr.add_suppression("InnoDB: The error means"); +call mtr.add_suppression("InnoDB: Error number \\d+ means"); call mtr.add_suppression("InnoDB: Cannot open datafile .*t1\\.ibd"); call mtr.add_suppression("InnoDB: Ignoring tablespace for test/t1 "); call mtr.add_suppression("InnoDB: Cannot save statistics for table `test`\\.`t1` because the \\.ibd file is missing"); diff --git 
a/mysql-test/suite/mariabackup/huge_lsn,strict_crc32.rdiff b/mysql-test/suite/mariabackup/huge_lsn,strict_crc32.rdiff new file mode 100644 index 00000000000..9e516cf2ef1 --- /dev/null +++ b/mysql-test/suite/mariabackup/huge_lsn,strict_crc32.rdiff @@ -0,0 +1,9 @@ +@@ -2,7 +2,7 @@ + # MDEV-13416 mariabackup fails with EFAULT "Bad Address" + # + # restart +-FOUND 1 /redo log from 2\.012MiB to [0-9.]*[KMGT]iB; LSN=17596481011216\b/ in mysqld.1.err ++FOUND 1 /redo log: [0-9.]*[KMGT]iB; LSN=17596481010687\b/ in mysqld.1.err + CREATE TABLE t(i INT) ENGINE INNODB; + INSERT INTO t VALUES(1); + # xtrabackup backup diff --git a/mysql-test/suite/mariabackup/huge_lsn.result b/mysql-test/suite/mariabackup/huge_lsn.result index b24c1af964c..22cd346bbb5 100644 --- a/mysql-test/suite/mariabackup/huge_lsn.result +++ b/mysql-test/suite/mariabackup/huge_lsn.result @@ -2,7 +2,7 @@ # MDEV-13416 mariabackup fails with EFAULT "Bad Address" # # restart -FOUND 1 /InnoDB: New log file created, LSN=175964\d{8}/ in mysqld.1.err +FOUND 1 /redo log from 2\.012MiB to [0-9.]*[KMGT]iB; LSN=17596481011216\b/ in mysqld.1.err CREATE TABLE t(i INT) ENGINE INNODB; INSERT INTO t VALUES(1); # xtrabackup backup diff --git a/mysql-test/suite/mariabackup/huge_lsn.test b/mysql-test/suite/mariabackup/huge_lsn.test index 0af66b761ec..00c7e66516c 100644 --- a/mysql-test/suite/mariabackup/huge_lsn.test +++ b/mysql-test/suite/mariabackup/huge_lsn.test @@ -14,8 +14,8 @@ exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir= --enable_result_log --source include/shutdown_mysqld.inc +if ($MTR_COMBINATION_STRICT_CRC32) { perl; -do "$ENV{MTR_SUITE_DIR}/../innodb/include/crc32.pl"; my $file= "$ENV{MYSQLD_DATADIR}/ibdata1"; open(FILE, "+<", $file) or die "Unable to open $file\n"; binmode FILE; @@ -23,23 +23,48 @@ my $ps= $ENV{INNODB_PAGE_SIZE}; my $page; die "Unable to read $file" unless sysread(FILE, $page, $ps) == $ps; substr($page,26,8) = pack("NN", 4096, ~1024); -my $polynomial = 0x82f63b78; 
# CRC-32C -my $full_crc32 = unpack("N",substr($page,54,4)) & 0x10; # FIL_SPACE_FLAGS -if ($full_crc32) -{ - my $ck = mycrc32(substr($page, 0, $ps-4), 0, $polynomial); - substr($page, $ps-4, 4) = pack("N", $ck); -} sysseek(FILE, 0, 0) || die "Unable to rewind $file\n"; syswrite(FILE, $page, $ps)==$ps || die "Unable to write $file\n"; close(FILE) || die "Unable to close $file\n"; -EOF ---remove_files_wildcard $MYSQLD_DATADIR ib_logfile* +$file= "$ENV{MYSQLD_DATADIR}/ib_logfile0"; +open(FILE, ">", $file) || die "Unable to truncate $file\n"; +close(FILE) || die "Unable to close $file\n"; +EOF +--let SEARCH_PATTERN= redo log: [0-9.]*[KMGT]iB; LSN=17596481010687\\b +} + +if (!$MTR_COMBINATION_STRICT_CRC32) { +perl; +do "$ENV{MTR_SUITE_DIR}/../innodb/include/crc32.pl"; +my $file= "$ENV{MYSQLD_DATADIR}/ib_logfile0"; +open(FILE, ">", $file) or die "Unable to open $file\n"; +binmode FILE; +# the desired log sequence number, plus 16 +my $extra_repeat = 139820; +my $lsn_hi=4096,$lsn_lo=0xfffffe00 - $extra_repeat * 15; +my $polynomial = 0x82f63b78; # CRC-32C +my ($header, $checkpoint, $log); +$header = "Phys" . pack("x[4]NN", $lsn_hi, $lsn_lo) . + "some Perl code" . pack("x[478]"); +$header .= pack("Nx[3584]", mycrc32($header, 0, $polynomial)); +$checkpoint = pack("NNNNx[44]", $lsn_hi, $lsn_lo, $lsn_hi, $lsn_lo); +$checkpoint .= pack("Nx[8128]", mycrc32($checkpoint, 0, $polynomial)); +$log = pack("CxxNN", 0xfa, $lsn_hi, $lsn_lo); +$log .= pack("CN", 1, mycrc32($log, 0, $polynomial)); + +# Write more than 2MiB of FILE_MODIFY mini-transactions to exercise the parser. 
+my $extra = pack("CCxa*", 0xb9, 127, "a/b.ibd"); +$extra .= pack("CN", 1, mycrc32($extra, 0, $polynomial)); + +print FILE $header, $checkpoint, $extra x $extra_repeat, $log; +close(FILE) or die "Unable to close $file\n"; +EOF +--let SEARCH_PATTERN= redo log from 2\\.012MiB to [0-9.]*[KMGT]iB; LSN=17596481011216\\b +} --source include/start_mysqld.inc let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; ---let SEARCH_PATTERN= InnoDB: New log file created, LSN=175964\d{8} --source include/search_pattern_in_file.inc CREATE TABLE t(i INT) ENGINE INNODB; diff --git a/mysql-test/suite/mariabackup/innodb_redo_overwrite.result b/mysql-test/suite/mariabackup/innodb_redo_overwrite.result index 9076dbaa57a..abc0c57bcce 100644 --- a/mysql-test/suite/mariabackup/innodb_redo_overwrite.result +++ b/mysql-test/suite/mariabackup/innodb_redo_overwrite.result @@ -1,27 +1,6 @@ CREATE TABLE t(i INT) ENGINE=INNODB; -INSERT INTO t VALUES -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), -(0), (1), (2), (3), (4), (5), (6), (7), (8), (9); -# Generate enough data to overwrite innodb redo log -# on the next "INSERT INTO t SELECT * FROM t" execution. 
-INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; -INSERT INTO t SELECT * FROM t; +INSERT INTO t SELECT seq%10 FROM seq_0_to_51199; # xtrabackup backup -FOUND 1 /failed: redo log block is overwritten/ in backup.log -FOUND 1 /failed: redo log block checksum does not match/ in backup.log +FOUND 1 /Was only able to copy log from \d+ to \d+, not \d+; try increasing innodb_log_file_size\b/ in backup.log +NOT FOUND /failed: redo log block checksum does not match/ in backup.log DROP TABLE t; diff --git a/mysql-test/suite/mariabackup/innodb_redo_overwrite.test b/mysql-test/suite/mariabackup/innodb_redo_overwrite.test index e27229c5f33..7e0c1cef54f 100644 --- a/mysql-test/suite/mariabackup/innodb_redo_overwrite.test +++ b/mysql-test/suite/mariabackup/innodb_redo_overwrite.test @@ -1,26 +1,10 @@ --source include/have_innodb.inc --source include/have_debug_sync.inc +--source include/have_debug.inc +--source include/have_sequence.inc CREATE TABLE t(i INT) ENGINE=INNODB; - -INSERT INTO t VALUES - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), - (0), (1), (2), (3), (4), (5), (6), (7), (8), (9); ---echo # Generate enough data to overwrite innodb redo log ---echo # on the next "INSERT INTO t SELECT * FROM t" execution. 
---let $i = 0 -while ($i < 9) { -INSERT INTO t SELECT * FROM t; ---inc $i -} +INSERT INTO t SELECT seq%10 FROM seq_0_to_51199; --echo # xtrabackup backup --let $targetdir=$MYSQLTEST_VARDIR/tmp/backup @@ -33,13 +17,13 @@ INSERT INTO t SELECT * FROM t; --exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir --dbug=+d,mariabackup_events > $backuplog --enable_result_log ---let SEARCH_PATTERN=failed: redo log block is overwritten +--let SEARCH_PATTERN=Was only able to copy log from \\d+ to \\d+, not \\d+; try increasing innodb_log_file_size\\b --let SEARCH_FILE=$backuplog --source include/search_pattern_in_file.inc --remove_file $backuplog --rmdir $targetdir ---let before_innodb_log_copy_thread_started=INSERT INTO test.t VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9) +--let before_innodb_log_copy_thread_started=INSERT INTO test.t SELECT seq FROM seq_0_to_9 --disable_result_log --error 1 diff --git a/mysql-test/suite/mariabackup/missing_ibd.test b/mysql-test/suite/mariabackup/missing_ibd.test index dc1406039e7..f406a555b4a 100644 --- a/mysql-test/suite/mariabackup/missing_ibd.test +++ b/mysql-test/suite/mariabackup/missing_ibd.test @@ -9,7 +9,7 @@ let MYSQLD_DATADIR=`select @@datadir`; --disable_query_log call mtr.add_suppression("InnoDB: Cannot open datafile for read-only: '.*test.t1\.ibd'"); -call mtr.add_suppression('InnoDB: Operating system error number'); +call mtr.add_suppression('InnoDB: (Operating system error number|Error number \\d+ means)'); call mtr.add_suppression('InnoDB: The error means the system cannot find the path specified\.'); call mtr.add_suppression('InnoDB: Table test/t1 in the InnoDB data dictionary has tablespace id .*, but tablespace with that id or name does not exist'); call mtr.add_suppression('InnoDB: Ignoring tablespace for test/t1 because it could not be opened\.'); diff --git a/mysql-test/suite/mariabackup/xb_file_key_management.result 
b/mysql-test/suite/mariabackup/xb_file_key_management.result index 6cedfd2213b..cf8edb310b8 100644 --- a/mysql-test/suite/mariabackup/xb_file_key_management.result +++ b/mysql-test/suite/mariabackup/xb_file_key_management.result @@ -1,5 +1,7 @@ -CREATE TABLE t(c VARCHAR(10)) ENGINE INNODB encrypted=yes; +CREATE TABLE t(c TEXT) ENGINE INNODB encrypted=yes; +INSERT INTO t VALUES(REPEAT('fubar',100)); INSERT INTO t VALUES('foobar1'); +DELETE FROM t LIMIT 1; # xtrabackup backup NOT FOUND /foobar1/ in ib_logfile0 # expect NOT FOUND diff --git a/mysql-test/suite/mariabackup/xb_file_key_management.test b/mysql-test/suite/mariabackup/xb_file_key_management.test index 2a176952053..4d27b2dfa95 100644 --- a/mysql-test/suite/mariabackup/xb_file_key_management.test +++ b/mysql-test/suite/mariabackup/xb_file_key_management.test @@ -1,8 +1,10 @@ #--source include/innodb_page_size.inc --source include/have_file_key_management.inc -CREATE TABLE t(c VARCHAR(10)) ENGINE INNODB encrypted=yes; +CREATE TABLE t(c TEXT) ENGINE INNODB encrypted=yes; +INSERT INTO t VALUES(REPEAT('fubar',100)); INSERT INTO t VALUES('foobar1'); +DELETE FROM t LIMIT 1; echo # xtrabackup backup; let $targetdir=$MYSQLTEST_VARDIR/tmp/backup; --disable_result_log @@ -24,7 +26,7 @@ exec $XTRABACKUP --prepare --target-dir=$targetdir; --enable_result_log --list_files $targetdir ib_logfile* ---cat_file $targetdir/ib_logfile0 +--remove_file $targetdir/ib_logfile0 SELECT * FROM t; DROP TABLE t; diff --git a/mysql-test/suite/perfschema/include/default_mysqld_autosize.cnf b/mysql-test/suite/perfschema/include/default_mysqld_autosize.cnf index eee52ede869..6bcf7a09401 100644 --- a/mysql-test/suite/perfschema/include/default_mysqld_autosize.cnf +++ b/mysql-test/suite/perfschema/include/default_mysqld_autosize.cnf @@ -19,9 +19,8 @@ loose-innodb_buffer_pool_size= 8M loose-innodb_lru_scan_depth= 100 loose-innodb_write_io_threads= 2 loose-innodb_read_io_threads= 2 -loose-innodb_log_buffer_size= 1M -loose-innodb_log_file_size= 
5M -loose-innodb_log_files_in_group= 2 +loose-innodb_log_buffer_size= 2M +loose-innodb_log_file_size= 10M slave-net-timeout=120 diff --git a/mysql-test/suite/sys_vars/r/innodb_log_write_ahead_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_log_write_ahead_size_basic.result deleted file mode 100644 index 5c9eb69de50..00000000000 --- a/mysql-test/suite/sys_vars/r/innodb_log_write_ahead_size_basic.result +++ /dev/null @@ -1,88 +0,0 @@ -SET @start_global_value = @@global.innodb_log_write_ahead_size; -SET global innodb_log_write_ahead_size=4096; -Valid values are positive number -SELECT @@global.innodb_log_write_ahead_size >= 512; -@@global.innodb_log_write_ahead_size >= 512 -1 -SELECT @@global.innodb_log_write_ahead_size <= 16*1024; -@@global.innodb_log_write_ahead_size <= 16*1024 -1 -SELECT @@session.innodb_log_write_ahead_size; -ERROR HY000: Variable 'innodb_log_write_ahead_size' is a GLOBAL variable -SHOW global variables LIKE 'innodb_log_write_ahead_size'; -Variable_name Value -innodb_log_write_ahead_size 4096 -SHOW session variables LIKE 'innodb_log_write_ahead_size'; -Variable_name Value -innodb_log_write_ahead_size 4096 -SELECT * FROM information_schema.global_variables -WHERE variable_name='innodb_log_write_ahead_size'; -VARIABLE_NAME VARIABLE_VALUE -INNODB_LOG_WRITE_AHEAD_SIZE 4096 -SELECT * FROM information_schema.session_variables -WHERE variable_name='innodb_log_write_ahead_size'; -VARIABLE_NAME VARIABLE_VALUE -INNODB_LOG_WRITE_AHEAD_SIZE 4096 -SET global innodb_log_write_ahead_size=1024; -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -1024 -SELECT * FROM information_schema.global_variables -WHERE variable_name='innodb_log_write_ahead_size'; -VARIABLE_NAME VARIABLE_VALUE -INNODB_LOG_WRITE_AHEAD_SIZE 1024 -SELECT * FROM information_schema.session_variables -WHERE variable_name='innodb_log_write_ahead_size'; -VARIABLE_NAME VARIABLE_VALUE -INNODB_LOG_WRITE_AHEAD_SIZE 1024 -SET session 
innodb_log_write_ahead_size=2048; -ERROR HY000: Variable 'innodb_log_write_ahead_size' is a GLOBAL variable and should be set with SET GLOBAL -SET global innodb_log_write_ahead_size=512; -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -512 -SET global innodb_log_write_ahead_size=2048; -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -2048 -SET global innodb_log_write_ahead_size=4096; -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -4096 -SET global innodb_log_write_ahead_size=0; -Warnings: -Warning 1292 Truncated incorrect innodb_log_write_ahead_size value: '0' -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -512 -SET global innodb_log_write_ahead_size=-1024; -Warnings: -Warning 1292 Truncated incorrect innodb_log_write_ahead_size value: '-1024' -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -512 -SET global innodb_log_write_ahead_size=3000; -Warnings: -Warning 1292 Truncated incorrect innodb_log_write_ahead_size value: '3000' -Warning 1210 innodb_log_write_ahead_size should be set 2^n value and larger than 512. 
-Warning 1210 Setting innodb_log_write_ahead_size to 4096 -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -4096 -SET global innodb_log_write_ahead_size=1.1; -ERROR 42000: Incorrect argument type to variable 'innodb_log_write_ahead_size' -SET global innodb_log_write_ahead_size=1e1; -ERROR 42000: Incorrect argument type to variable 'innodb_log_write_ahead_size' -SET global innodb_log_write_ahead_size="foo"; -ERROR 42000: Incorrect argument type to variable 'innodb_log_write_ahead_size' -SET global innodb_log_write_ahead_size=-7; -Warnings: -Warning 1292 Truncated incorrect innodb_log_write_ahead_size value: '-7' -SELECT @@global.innodb_log_write_ahead_size; -@@global.innodb_log_write_ahead_size -512 -SELECT * FROM information_schema.global_variables -WHERE variable_name='innodb_log_write_ahead_size'; -VARIABLE_NAME VARIABLE_VALUE -INNODB_LOG_WRITE_AHEAD_SIZE 512 -SET @@global.innodb_log_write_ahead_size = @start_global_value; diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff index 803fbba6200..67d3897375b 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff @@ -213,23 +213,14 @@ VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED - VARIABLE_COMMENT The size of the buffer which InnoDB uses to write log to the log files on disk. - NUMERIC_MIN_VALUE 262144 --NUMERIC_MAX_VALUE 9223372036854775807 -+NUMERIC_MAX_VALUE 2147483647 - NUMERIC_BLOCK_SIZE 1024 + VARIABLE_COMMENT Redo log buffer size in bytes. 
+ NUMERIC_MIN_VALUE 2097152 +-NUMERIC_MAX_VALUE 18446744073709551615 ++NUMERIC_MAX_VALUE 4294967295 + NUMERIC_BLOCK_SIZE 4096 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -1033,7 +1033,7 @@ - SESSION_VALUE NULL - DEFAULT_VALUE 8192 - VARIABLE_SCOPE GLOBAL --VARIABLE_TYPE BIGINT UNSIGNED -+VARIABLE_TYPE INT UNSIGNED - VARIABLE_COMMENT Redo log write ahead unit size to avoid read-on-write, it should match the OS cache block IO size - NUMERIC_MIN_VALUE 512 - NUMERIC_MAX_VALUE 16384 -@@ -1045,10 +1045,10 @@ +@@ -1033,10 +1033,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 32 VARIABLE_SCOPE GLOBAL @@ -242,7 +233,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1057,10 +1057,10 @@ +@@ -1045,10 +1045,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 1536 VARIABLE_SCOPE GLOBAL @@ -255,7 +246,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1093,10 +1093,10 @@ +@@ -1081,10 +1081,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -268,7 +259,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1105,7 +1105,7 @@ +@@ -1093,7 +1093,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -277,7 +268,7 @@ VARIABLE_COMMENT Maximum delay of user threads in micro-seconds NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 10000000 -@@ -1237,10 +1237,10 @@ +@@ -1225,10 +1225,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -290,7 +281,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -1261,7 +1261,7 @@ +@@ -1249,7 +1249,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 16384 VARIABLE_SCOPE GLOBAL @@ -299,7 +290,7 @@ VARIABLE_COMMENT Page size to use for all InnoDB tablespaces. NUMERIC_MIN_VALUE 4096 NUMERIC_MAX_VALUE 65536 -@@ -1297,7 +1297,7 @@ +@@ -1285,7 +1285,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 300 VARIABLE_SCOPE GLOBAL @@ -308,7 +299,7 @@ VARIABLE_COMMENT Number of UNDO log pages to purge in one batch from the history list. 
NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 5000 -@@ -1309,7 +1309,7 @@ +@@ -1297,7 +1297,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 128 VARIABLE_SCOPE GLOBAL @@ -317,7 +308,7 @@ VARIABLE_COMMENT Dictates rate at which UNDO records are purged. Value N means purge rollback segment(s) on every Nth iteration of purge invocation NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 128 -@@ -1345,7 +1345,7 @@ +@@ -1333,7 +1333,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 56 VARIABLE_SCOPE GLOBAL @@ -326,7 +317,7 @@ VARIABLE_COMMENT Number of pages that must be accessed sequentially for InnoDB to trigger a readahead. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 64 -@@ -1417,7 +1417,7 @@ +@@ -1405,7 +1405,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1048576 VARIABLE_SCOPE GLOBAL @@ -335,7 +326,7 @@ VARIABLE_COMMENT Memory buffer size for index creation NUMERIC_MIN_VALUE 65536 NUMERIC_MAX_VALUE 67108864 -@@ -1585,10 +1585,10 @@ +@@ -1573,10 +1573,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 324433acbe3..40e5596e5e2 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -986,10 +986,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 16777216 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT The size of the buffer which InnoDB uses to write log to the log files on disk. -NUMERIC_MIN_VALUE 262144 -NUMERIC_MAX_VALUE 9223372036854775807 -NUMERIC_BLOCK_SIZE 1024 +VARIABLE_COMMENT Redo log buffer size in bytes. +NUMERIC_MIN_VALUE 2097152 +NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_BLOCK_SIZE 4096 ENUM_VALUE_LIST NULL READ_ONLY YES COMMAND_LINE_ARGUMENT REQUIRED @@ -1010,10 +1010,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 100663296 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT Size of each log file in a log group. +VARIABLE_COMMENT Redo log size in bytes. 
NUMERIC_MIN_VALUE 1048576 NUMERIC_MAX_VALUE 18446744073709551615 -NUMERIC_BLOCK_SIZE 65536 +NUMERIC_BLOCK_SIZE 4096 ENUM_VALUE_LIST NULL READ_ONLY YES COMMAND_LINE_ARGUMENT REQUIRED @@ -1029,18 +1029,6 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST NULL READ_ONLY YES COMMAND_LINE_ARGUMENT REQUIRED -VARIABLE_NAME INNODB_LOG_WRITE_AHEAD_SIZE -SESSION_VALUE NULL -DEFAULT_VALUE 8192 -VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT Redo log write ahead unit size to avoid read-on-write, it should match the OS cache block IO size -NUMERIC_MIN_VALUE 512 -NUMERIC_MAX_VALUE 16384 -NUMERIC_BLOCK_SIZE 512 -ENUM_VALUE_LIST NULL -READ_ONLY NO -COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_LRU_FLUSH_SIZE SESSION_VALUE NULL DEFAULT_VALUE 32 diff --git a/mysql-test/suite/sys_vars/t/innodb_log_write_ahead_size_basic.test b/mysql-test/suite/sys_vars/t/innodb_log_write_ahead_size_basic.test deleted file mode 100644 index 8693c6a7b1b..00000000000 --- a/mysql-test/suite/sys_vars/t/innodb_log_write_ahead_size_basic.test +++ /dev/null @@ -1,93 +0,0 @@ ---source include/have_innodb.inc - -SET @start_global_value = @@global.innodb_log_write_ahead_size; - -# default value is limited by innodb_page_size and varying along with the page size. 
-#SELECT @start_global_value; - -#set common valid value -SET global innodb_log_write_ahead_size=4096; - -# -# exists as global only -# ---echo Valid values are positive number -SELECT @@global.innodb_log_write_ahead_size >= 512; -SELECT @@global.innodb_log_write_ahead_size <= 16*1024; - ---error ER_INCORRECT_GLOBAL_LOCAL_VAR -SELECT @@session.innodb_log_write_ahead_size; -SHOW global variables LIKE 'innodb_log_write_ahead_size'; -SHOW session variables LIKE 'innodb_log_write_ahead_size'; ---disable_warnings -SELECT * FROM information_schema.global_variables -WHERE variable_name='innodb_log_write_ahead_size'; -SELECT * FROM information_schema.session_variables -WHERE variable_name='innodb_log_write_ahead_size'; ---enable_warnings - -# -# show that it's writable -# -SET global innodb_log_write_ahead_size=1024; -SELECT @@global.innodb_log_write_ahead_size; ---disable_warnings -SELECT * FROM information_schema.global_variables -WHERE variable_name='innodb_log_write_ahead_size'; -SELECT * FROM information_schema.session_variables -WHERE variable_name='innodb_log_write_ahead_size'; ---enable_warnings ---error ER_GLOBAL_VARIABLE -SET session innodb_log_write_ahead_size=2048; - -# -# Valid values -# -SET global innodb_log_write_ahead_size=512; -SELECT @@global.innodb_log_write_ahead_size; -SET global innodb_log_write_ahead_size=2048; -SELECT @@global.innodb_log_write_ahead_size; -SET global innodb_log_write_ahead_size=4096; -SELECT @@global.innodb_log_write_ahead_size; - -# limited by innodb_page_size, and the followings are occationally invalid -#SET global innodb_log_write_ahead_size=8192; -#SELECT @@global.innodb_log_write_ahead_size; -#SET global innodb_log_write_ahead_size=16384; -#SELECT @@global.innodb_log_write_ahead_size; - -# -# Invalid values -# -SET global innodb_log_write_ahead_size=0; -SELECT @@global.innodb_log_write_ahead_size; -SET global innodb_log_write_ahead_size=-1024; -SELECT @@global.innodb_log_write_ahead_size; -SET global 
innodb_log_write_ahead_size=3000; -SELECT @@global.innodb_log_write_ahead_size; - -# limited by innodb_page_size, and the followings result occationally different -#SET global innodb_log_write_ahead_size=32768; -#SELECT @@global.innodb_log_write_ahead_size; - -# -# incorrect types -# ---error ER_WRONG_TYPE_FOR_VAR -SET global innodb_log_write_ahead_size=1.1; ---error ER_WRONG_TYPE_FOR_VAR -SET global innodb_log_write_ahead_size=1e1; ---error ER_WRONG_TYPE_FOR_VAR -SET global innodb_log_write_ahead_size="foo"; -SET global innodb_log_write_ahead_size=-7; -SELECT @@global.innodb_log_write_ahead_size; ---disable_warnings -SELECT * FROM information_schema.global_variables -WHERE variable_name='innodb_log_write_ahead_size'; ---enable_warnings - -# -# cleanup -# - -SET @@global.innodb_log_write_ahead_size = @start_global_value; diff --git a/scripts/wsrep_sst_mariabackup.sh b/scripts/wsrep_sst_mariabackup.sh index aa9442b0601..dc9adb4ff22 100644 --- a/scripts/wsrep_sst_mariabackup.sh +++ b/scripts/wsrep_sst_mariabackup.sh @@ -1,5 +1,5 @@ #!/bin/bash -ue -# Copyright (C) 2017-2021 MariaDB +# Copyright (C) 2017-2022 MariaDB # Copyright (C) 2013 Percona Inc # # This program is free software; you can redistribute it and/or modify @@ -1146,8 +1146,6 @@ then # May need xtrabackup_checkpoints later on [ -f "$DATA/xtrabackup_binary" ] && rm -f "$DATA/xtrabackup_binary" [ -f "$DATA/xtrabackup_galera_info" ] && rm -f "$DATA/xtrabackup_galera_info" - [ -f "$DATA/ib_logfile0" ] && rm -f "$DATA/ib_logfile0" - ADDR="$WSREP_SST_OPT_ADDR" if [ "${tmode#VERIFY}" != "$tmode" ]; then diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index 28dfed18218..0d08968709f 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -1,6 +1,6 @@ #!/bin/bash -ue -# Copyright (C) 2017-2021 MariaDB +# Copyright (C) 2017-2022 MariaDB # Copyright (C) 2010-2014 Codership Oy # # This program is free software; you can redistribute it and/or modify @@ -445,7 +445,7 @@ 
FILTER="-f '- /lost+found' rsync ${STUNNEL:+--rsh="$STUNNEL"} \ --owner --group --perms --links --specials \ --ignore-times --inplace --dirs --delete --quiet \ - $WHOLE_FILE_OPT -f '+ /ib_logfile[0-9]*' -f '+ /aria_log.*' \ + $WHOLE_FILE_OPT -f '+ /ib_logfile0' -f '+ /aria_log.*' \ -f '+ /aria_log_control' -f '- **' "$WSREP_LOG_DIR/" \ "rsync://$WSREP_SST_OPT_ADDR-log_dir" >&2 || RC=$? @@ -603,8 +603,6 @@ $SILENT path = $INNODB_DATA_HOME_DIR EOF -# rm -rf "$DATA/ib_logfile"* # we don't want old logs around - # If the IP is local, listen only on it: if is_local_ip "$RSYNC_ADDR_UNESCAPED" then diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 74c32082629..f23f485b9a6 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -5182,6 +5182,7 @@ static int init_server_components() MARIADB_REMOVED_OPTION("innodb-log-compressed-pages"), MARIADB_REMOVED_OPTION("innodb-log-files-in-group"), MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"), + MARIADB_REMOVED_OPTION("innodb-log-write-ahead-size"), MARIADB_REMOVED_OPTION("innodb-page-cleaners"), MARIADB_REMOVED_OPTION("innodb-replication-delay"), MARIADB_REMOVED_OPTION("innodb-scrub-log"), diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 5c138d41f58..a2ea7c24541 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2014, 2021, MariaDB Corporation. +# Copyright (c) 2014, 2022, MariaDB Corporation. 
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -173,7 +173,6 @@ SET(INNOBASE_SOURCES include/lock0types.h include/log0crypt.h include/log0log.h - include/log0log.ic include/log0recv.h include/log0types.h include/mach0data.h @@ -338,7 +337,9 @@ OPTION(WITH_PMEM "Support redo log in persistent memory" OFF) FIND_PACKAGE(PMEM) IF(PMEM_FOUND) INCLUDE_DIRECTORIES(${PMEM_INCLUDES}) - ADD_COMPILE_FLAGS(log/log0log.cc COMPILE_FLAGS "-DHAVE_PMEM") + ADD_COMPILE_FLAGS(log/log0log.cc log/log0recv.cc + buf/buf0flu.cc mtr/mtr0mtr.cc trx/trx0trx.cc srv/srv0start.cc + COMPILE_FLAGS "-DHAVE_PMEM") SET(PMEM_LIBRARY ${PMEM_LIBRARIES}) ELSE() IF(WITH_PMEM) diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index cb58ead5830..1fc903eb616 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -373,8 +373,8 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, FIL_PAGE_FCRC32_CHECKSUM); - memcpy_aligned(src_frame, tmp_frame, - srv_page_size); + memcpy_aligned(src_frame, tmp_frame, + srv_page_size); srv_stats.pages_decrypted.inc(); srv_stats.n_temp_blocks_decrypted.inc(); @@ -799,17 +799,9 @@ buf_madvise_do_dump() /* mirrors allocation in log_t::create() */ if (log_sys.buf) { - ret += madvise(log_sys.buf, - srv_log_buffer_size, + ret += madvise(log_sys.buf, log_sys.buf_size, MADV_DODUMP); + ret += madvise(log_sys.flush_buf, log_sys.buf_size, MADV_DODUMP); - ret += madvise(log_sys.flush_buf, - srv_log_buffer_size, - MADV_DODUMP); - } - /* mirrors recv_sys_t::create() */ - if (recv_sys.buf) - { - ret+= madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP); } mysql_mutex_lock(&buf_pool.mutex); @@ -1090,7 +1082,7 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const { /* The page cleaner is disabled in read-only mode. 
No pages can be dirtied, so all of them must be clean. */ - ut_ad(lsn == 0 || lsn == recv_sys.recovered_lsn || + ut_ad(lsn == 0 || lsn == recv_sys.lsn || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); break; } @@ -1326,7 +1318,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block) hash_lock.lock(); if (block->page.can_relocate()) { - memcpy_aligned( + memcpy_aligned( new_block->page.frame, block->page.frame, srv_page_size); mysql_mutex_lock(&buf_pool.flush_list_mutex); diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 9c78c5c563e..533e69a05d0 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -218,8 +218,6 @@ too_small: /* Remove doublewrite pages from LRU */ buf_pool_invalidate(); - - ib::info() << "Doublewrite buffer created"; goto start_again; } @@ -331,7 +329,7 @@ func_exit: /** Process and remove the double write buffer pages for all tablespaces. */ void buf_dblwr_t::recover() { - ut_ad(recv_sys.parse_start_lsn); + ut_ad(log_sys.last_checkpoint_lsn); if (!is_initialised()) return; @@ -349,13 +347,13 @@ void buf_dblwr_t::recover() continue; const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN); - if (recv_sys.parse_start_lsn > lsn) + if (log_sys.last_checkpoint_lsn > lsn) /* Pages written before the checkpoint are not useful for recovery. 
*/ continue; const uint32_t space_id= page_get_space_id(page); const page_id_t page_id(space_id, page_no); - if (recv_sys.scanned_lsn < lsn) + if (recv_sys.lsn < lsn) { ib::info() << "Ignoring a doublewrite copy of page " << page_id << " with future log sequence number " << lsn; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 28f03eedac9..f2c86e2cc24 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -175,8 +175,12 @@ inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage) void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn) { mysql_mutex_assert_not_owner(&mutex); - mysql_mutex_assert_owner(&log_sys.flush_order_mutex); +#ifdef SAFE_MUTEX + if (!recv_recovery_is_on()) + mysql_mutex_assert_owner(&log_sys.flush_order_mutex); +#endif /* SAFE_MUTEX */ ut_ad(lsn > 2); + static_assert(log_t::FIRST_LSN >= 2, "compatibility"); ut_ad(!fsp_is_system_temporary(block->page.id().space())); mysql_mutex_lock(&flush_list_mutex); @@ -424,7 +428,7 @@ void buf_flush_assign_full_crc32_checksum(byte* page) ut_ad(!corrupted); ut_ad(size == uint(srv_page_size)); const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; - mach_write_to_4(page + payload, ut_crc32(page, payload)); + mach_write_to_4(page + payload, my_crc32c(0, page, payload)); } /** Initialize a page for writing to the tablespace. 
@@ -605,7 +609,7 @@ static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d) return NULL; const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; - mach_write_to_4(d + payload, ut_crc32(d, payload)); + mach_write_to_4(d + payload, my_crc32c(0, d, payload)); srv_stats.pages_encrypted.inc(); srv_stats.n_temp_blocks_encrypted.inc(); @@ -732,7 +736,7 @@ not_compressed: if (full_crc32) { static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); - mach_write_to_4(tmp + len - 4, ut_crc32(tmp, len - 4)); + mach_write_to_4(tmp + len - 4, my_crc32c(0, tmp, len - 4)); ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); } @@ -899,8 +903,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) (write_frame ? write_frame : frame))); ut_ad(lsn >= oldest_modification()); - if (lsn > log_sys.get_flushed_lsn()) - log_write_up_to(lsn, true); + log_write_up_to(lsn, true); } space->io(IORequest{type, this, slot}, physical_offset(), size, write_frame, this); @@ -1663,6 +1666,59 @@ ulint buf_flush_LRU(ulint max_n) return n_flushed; } +#ifdef HAVE_PMEM +# include +#endif + +/** Write checkpoint information to the log header and release mutex. +@param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ +inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept +{ + ut_ad(!srv_read_only_mode); + ut_ad(end_lsn >= next_checkpoint_lsn); + ut_ad(end_lsn <= get_lsn()); + ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= get_lsn() || + srv_shutdown_state > SRV_SHUTDOWN_INITIATED); + + DBUG_PRINT("ib_log", + ("checkpoint at " LSN_PF " written", next_checkpoint_lsn)); + + auto n= next_checkpoint_no; + const size_t offset{(n & 1) ? CHECKPOINT_2 : CHECKPOINT_1}; + static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency"); + static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility"); + byte* c= my_assume_aligned + (is_pmem() ? 
buf + offset : checkpoint_buf); + memset_aligned(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); + mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); + mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); + mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60)); +#ifdef HAVE_PMEM + if (is_pmem()) + pmem_persist(c, 64); + else +#endif + { + n_pending_checkpoint_writes++; + mysql_mutex_unlock(&mutex); + /* FIXME: issue an asynchronous write */ + log.write(offset, {c, get_block_size()}); + if (srv_file_flush_method != SRV_O_DSYNC) + ut_a(log.flush()); + mysql_mutex_lock(&mutex); + n_pending_checkpoint_writes--; + } + + ut_ad(!n_pending_checkpoint_writes); + next_checkpoint_no++; + last_checkpoint_lsn= next_checkpoint_lsn; + + DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF, + next_checkpoint_lsn, get_flushed_lsn())); + + mysql_mutex_unlock(&mutex); +} + /** Initiate a log checkpoint, discarding the start of the log. @param oldest_lsn the checkpoint LSN @param end_lsn log_sys.get_lsn() @@ -1675,22 +1731,19 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) ut_ad(end_lsn == log_sys.get_lsn()); ut_ad(!recv_no_log_write); - ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); - - if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) - /* Some log has been written since the previous checkpoint. */; - else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) - /* MariaDB startup expects the redo log file to be logically empty - (not even containing a FILE_CHECKPOINT record) after a clean shutdown. - Perform an extra checkpoint at shutdown. */; - else + if (oldest_lsn == log_sys.last_checkpoint_lsn || + (oldest_lsn == end_lsn && oldest_lsn == log_sys.last_checkpoint_lsn + + (log_sys.is_encrypted() + ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT))) { /* Do nothing, because nothing was logged (other than a FILE_CHECKPOINT record) since the previous checkpoint. 
*/ + do_nothing: mysql_mutex_unlock(&log_sys.mutex); return true; } + ut_ad(oldest_lsn > log_sys.last_checkpoint_lsn); /* Repeat the FILE_MODIFY records after the checkpoint, in case some log records between the checkpoint and log_sys.lsn need them. Finally, write a FILE_CHECKPOINT record. Redo log apply expects to @@ -1701,24 +1754,13 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) dirty pages are flushed to the tablespace files. At this point, because we hold log_sys.mutex, mtr_t::commit() in other threads will be blocked, and no pages can be added to the flush lists. */ - lsn_t flush_lsn= oldest_lsn; - - if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn || - srv_shutdown_state <= SRV_SHUTDOWN_INITIATED)) - { - flush_lsn= log_sys.get_lsn(); - ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); - mysql_mutex_unlock(&log_sys.mutex); - log_write_up_to(flush_lsn, true, true); - mysql_mutex_lock(&log_sys.mutex); - if (log_sys.last_checkpoint_lsn >= oldest_lsn) - { - mysql_mutex_unlock(&log_sys.mutex); - return true; - } - } - else - ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); + const lsn_t flush_lsn{fil_names_clear(oldest_lsn)}; + ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); + mysql_mutex_unlock(&log_sys.mutex); + log_write_up_to(flush_lsn, true); + mysql_mutex_lock(&log_sys.mutex); + if (log_sys.last_checkpoint_lsn >= oldest_lsn) + goto do_nothing; ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); @@ -1730,7 +1772,7 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) } log_sys.next_checkpoint_lsn= oldest_lsn; - log_write_checkpoint_info(end_lsn); + log_sys.write_checkpoint(end_lsn); mysql_mutex_assert_not_owner(&log_sys.mutex); return true; @@ -1760,8 +1802,8 @@ static bool log_checkpoint() mysql_mutex_lock(&log_sys.flush_order_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex); const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); 
mysql_mutex_unlock(&log_sys.flush_order_mutex); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); return log_checkpoint_low(oldest_lsn, end_lsn); } @@ -1845,8 +1887,7 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) to happen until now. There could be an outstanding FILE_CHECKPOINT record from a previous fil_names_clear() call, which we must write out before we can advance the checkpoint. */ - if (sync_lsn > log_sys.get_flushed_lsn()) - log_write_up_to(sync_lsn, true); + log_write_up_to(sync_lsn, true); log_checkpoint(); } } @@ -1919,8 +1960,8 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) const lsn_t newest_lsn= log_sys.get_lsn(); mysql_mutex_lock(&log_sys.flush_order_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex); - lsn_t measure= buf_pool.get_oldest_modification(0); mysql_mutex_unlock(&log_sys.flush_order_mutex); + lsn_t measure= buf_pool.get_oldest_modification(0); const lsn_t checkpoint_lsn= measure ? measure : newest_lsn; if (!recv_recovery_is_on() && @@ -2439,7 +2480,11 @@ NOTE: The calling thread is not allowed to hold any buffer page latches! */ void buf_flush_sync() { if (recv_recovery_is_on()) + { + mysql_mutex_lock(&recv_sys.mutex); recv_sys.apply(true); + mysql_mutex_unlock(&recv_sys.mutex); + } thd_wait_begin(nullptr, THD_WAIT_DISKIO); tpool::tpool_wait_begin(); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 9846b4a6af5..5469ac49b3a 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -460,9 +460,7 @@ not_found: << flush_failures << " failed attempts to" " flush a page!" " Consider increasing innodb_buffer_pool_size." - " Pending flushes (fsync) log: " - << log_sys.get_pending_flushes() - << "; buffer pool: " + " Pending flushes (fsync): " << fil_n_pending_tablespace_flushes << ". " << os_n_file_reads << " OS file reads, " << os_n_file_writes << " OS file writes, " diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc index 5a792a59664..85490372180 100644 --- a/storage/innobase/fil/fil0crypt.cc +++ b/storage/innobase/fil/fil0crypt.cc @@ -1,6 +1,6 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (c) 2014, 2021, MariaDB Corporation. +Copyright (c) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -533,7 +533,7 @@ static byte* fil_encrypt_buf_for_full_crc32( ut_a(dstlen == srclen); const ulint payload = size - FIL_PAGE_FCRC32_CHECKSUM; - mach_write_to_4(dst_frame + payload, ut_crc32(dst_frame, payload)); + mach_write_to_4(dst_frame + payload, my_crc32c(0, dst_frame, payload)); /* Clean the rest of the buffer. FIXME: Punch holes when writing! */ memset(dst_frame + (payload + 4), 0, srv_page_size - (payload + 4)); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 0007fde5f43..5a592c3a768 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2021, MariaDB Corporation. 
+Copyright (c) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1496,7 +1496,7 @@ inline void mtr_t::log_file_op(mfile_type_t type, uint32_t space_id, { ut_ad(strchr(new_path, '/')); m_log.push(reinterpret_cast(path), uint32_t(len + 1)); - m_log.push(reinterpret_cast(new_path), uint32_t(new_len)); + m_log.push(reinterpret_cast(new_path), uint32_t(new_len - 1)); } else m_log.push(reinterpret_cast(path), uint32_t(len)); @@ -1514,6 +1514,14 @@ static void fil_name_write_rename_low(uint32_t space_id, const char *old_name, mtr->log_file_op(FILE_RENAME, space_id, old_name, new_name); } +static void fil_name_commit_durable(mtr_t *mtr) +{ + mysql_mutex_lock(&log_sys.mutex); + auto lsn= mtr->commit_files(); + mysql_mutex_unlock(&log_sys.mutex); + log_write_up_to(lsn, true); +} + /** Write redo log for renaming a file. @param[in] space_id tablespace id @param[in] old_name tablespace file name @@ -1524,8 +1532,7 @@ static void fil_name_write_rename(uint32_t space_id, mtr_t mtr; mtr.start(); fil_name_write_rename_low(space_id, old_name, new_name, &mtr); - mtr.commit(); - log_write_up_to(mtr.commit_lsn(), true); + fil_name_commit_durable(&mtr); } /** Write FILE_MODIFY for a file. @@ -1655,8 +1662,7 @@ pfs_os_file_t fil_delete_tablespace(uint32_t id) mtr_t mtr; mtr.start(); mtr.log_file_op(FILE_DELETE, id, space->chain.start->name); - mtr.commit(); - log_write_up_to(mtr.commit_lsn(), true); + fil_name_commit_durable(&mtr); /* Remove any additional files. 
*/ if (char *cfg_name= fil_make_filepath(space->chain.start->name, @@ -1969,8 +1975,7 @@ fil_ibd_create( mtr.start(); mtr.log_file_op(FILE_CREATE, space_id, path); - mtr.commit(); - log_write_up_to(mtr.commit_lsn(), true); + fil_name_commit_durable(&mtr); ulint type; static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, @@ -2760,8 +2765,8 @@ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, void *buf, buf_page_t *bpage) { ut_ad(referenced()); - ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_ad((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + ut_ad(offset % UNIV_ZIP_SIZE_MIN == 0); + ut_ad(len % 512 == 0); /* page_compressed */ ut_ad(fil_validate_skip()); ut_ad(type.is_read() || type.is_write()); ut_ad(type.type != IORequest::DBLWR_BATCH); @@ -3057,19 +3062,6 @@ fil_space_validate_for_mtr_commit( } #endif /* UNIV_DEBUG */ -/** Write a FILE_MODIFY record for a persistent tablespace. -@param[in] space tablespace -@param[in,out] mtr mini-transaction */ -static -void -fil_names_write( - const fil_space_t* space, - mtr_t* mtr) -{ - ut_ad(UT_LIST_GET_LEN(space->chain) == 1); - fil_name_write(space->id, UT_LIST_GET_FIRST(space->chain)->name, mtr); -} - /** Note that a non-predefined persistent tablespace has been modified by redo log. @param[in,out] space tablespace */ @@ -3087,55 +3079,48 @@ fil_names_dirty( space->max_lsn = log_sys.get_lsn(); } -/** Write FILE_MODIFY records when a non-predefined persistent -tablespace was modified for the first time since the latest -fil_names_clear(). -@param[in,out] space tablespace */ -void fil_names_dirty_and_write(fil_space_t* space) +/** Write a FILE_MODIFY record when a non-predefined persistent +tablespace was modified for the first time since fil_names_clear(). 
*/ +ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write() { - mysql_mutex_assert_owner(&log_sys.mutex); - ut_d(fil_space_validate_for_mtr_commit(space)); - ut_ad(space->max_lsn == log_sys.get_lsn()); + mysql_mutex_assert_owner(&log_sys.mutex); + ut_d(fil_space_validate_for_mtr_commit(m_user_space)); + ut_ad(!m_user_space->max_lsn); + m_user_space->max_lsn= log_sys.get_lsn(); - fil_system.named_spaces.push_back(*space); - mtr_t mtr; - mtr.start(); - fil_names_write(space, &mtr); + fil_system.named_spaces.push_back(*m_user_space); + ut_ad(UT_LIST_GET_LEN(m_user_space->chain) == 1); - DBUG_EXECUTE_IF("fil_names_write_bogus", - { - char bogus_name[] = "./test/bogus file.ibd"; - fil_name_write( - SRV_SPACE_ID_UPPER_BOUND, - bogus_name, &mtr); - }); + mtr_t mtr; + mtr.start(); + fil_name_write(m_user_space->id, + UT_LIST_GET_FIRST(m_user_space->chain)->name, + &mtr); - mtr.commit_files(); + DBUG_EXECUTE_IF("fil_names_write_bogus", + {fil_name_write(SRV_SPACE_ID_UPPER_BOUND, + "./test/bogus file.ibd", &mtr);}); + mtr.commit_files(); } /** On a log checkpoint, reset fil_names_dirty_and_write() flags -and write out FILE_MODIFY and FILE_CHECKPOINT if needed. -@param[in] lsn checkpoint LSN -@param[in] do_write whether to always write FILE_CHECKPOINT -@return whether anything was written to the redo log -@retval false if no flags were set and nothing written -@retval true if anything was written to the redo log */ -bool -fil_names_clear( - lsn_t lsn, - bool do_write) +and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT. 
+@param lsn checkpoint LSN +@return current LSN */ +lsn_t fil_names_clear(lsn_t lsn) { mtr_t mtr; mysql_mutex_assert_owner(&log_sys.mutex); ut_ad(lsn); + ut_ad(log_sys.is_latest()); mtr.start(); for (auto it = fil_system.named_spaces.begin(); it != fil_system.named_spaces.end(); ) { if (mtr.get_log()->size() + strlen(it->chain.start->name) - >= RECV_SCAN_SIZE - (3 + 5)) { + >= recv_sys.MTR_SIZE_MAX - (3 + 5)) { /* Prevent log parse buffer overflow */ mtr.commit_files(); mtr.start(); @@ -3158,20 +3143,13 @@ fil_names_clear( was called. If we kept track of "min_lsn" (the first LSN where max_lsn turned nonzero), we could avoid the fil_names_write() call if min_lsn > lsn. */ - - fil_names_write(&*it, &mtr); - do_write = true; - + ut_ad(UT_LIST_GET_LEN((*it).chain) == 1); + fil_name_write((*it).id, UT_LIST_GET_FIRST((*it).chain)->name, + &mtr); it = next; } - if (do_write) { - mtr.commit_files(lsn); - } else { - ut_ad(!mtr.has_modifications()); - } - - return(do_write); + return mtr.commit_files(lsn); } /* Unit Tests */ diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc index 1767e0dc8e3..c32f9d296c9 100644 --- a/storage/innobase/fsp/fsp0file.cc +++ b/storage/innobase/fsp/fsp0file.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -353,7 +353,7 @@ dberr_t Datafile::validate_to_dd(uint32_t space_id, uint32_t flags) /* Validate this single-table-tablespace with the data dictionary, but do not compare the DATA_DIR flag, in case the tablespace was remotely located. 
*/ - err = validate_first_page(0); + err = validate_first_page(); if (err != DB_SUCCESS) { return(err); } @@ -396,7 +396,7 @@ Datafile::validate_for_recovery() ut_ad(is_open()); ut_ad(!srv_read_only_mode); - err = validate_first_page(0); + err = validate_first_page(); switch (err) { case DB_TABLESPACE_EXISTS: @@ -443,7 +443,7 @@ Datafile::validate_for_recovery() /* Free the previously read first page and then re-validate. */ free_first_page(); m_defer = false; - err = validate_first_page(0); + err = validate_first_page(); } return(err); @@ -453,11 +453,10 @@ Datafile::validate_for_recovery() tablespace is opened. This occurs before the fil_space_t is created so the Space ID found here must not already be open. m_is_valid is set true on success, else false. -@param[out] flush_lsn contents of FIL_PAGE_FILE_FLUSH_LSN @retval DB_SUCCESS on if the datafile is valid @retval DB_CORRUPTION if the datafile is not readable @retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */ -dberr_t Datafile::validate_first_page(lsn_t *flush_lsn) +dberr_t Datafile::validate_first_page() { const char* error_txt = NULL; @@ -467,14 +466,6 @@ dberr_t Datafile::validate_first_page(lsn_t *flush_lsn) && read_first_page(srv_read_only_mode) != DB_SUCCESS) { error_txt = "Cannot read first page"; - } else { - ut_ad(m_first_page); - - if (flush_lsn != NULL) { - - *flush_lsn = mach_read_from_8( - m_first_page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); - } } if (error_txt != NULL) { diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 60e62a4f922..77b14edaf55 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -477,8 +477,7 @@ void fsp_apply_init_file_page(buf_block_t *block) const page_id_t id(block->page.id()); mach_write_to_4(block->page.frame + FIL_PAGE_OFFSET, id.page_no()); - if (log_sys.is_physical()) - memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8); + memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8); mach_write_to_4(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id.space()); if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) @@ -488,8 +487,7 @@ void fsp_apply_init_file_page(buf_block_t *block) static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET, block->page.frame + FIL_PAGE_OFFSET, 4); - if (log_sys.is_physical()) - memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8); + memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8); static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, "not perfect alignment"); memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc index 4c3507538c7..9b91b94cd4f 100644 --- a/storage/innobase/fsp/fsp0sysspace.cc +++ b/storage/innobase/fsp/fsp0sysspace.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2021, MariaDB Corporation. +Copyright (c) 2016, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -525,16 +525,11 @@ SysTablespace::open_file( } /** Check the tablespace header for this tablespace. 
-@param[out] flushed_lsn the value of FIL_PAGE_FILE_FLUSH_LSN @return DB_SUCCESS or error code */ -dberr_t -SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn) +inline dberr_t SysTablespace::read_lsn_and_check_flags() { dberr_t err; - /* Only relevant for the system tablespace. */ - ut_ad(space_id() == TRX_SYS_SPACE); - files_t::iterator it = m_files.begin(); ut_a(it->m_exists); @@ -566,7 +561,7 @@ SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn) first datafile. */ for (int retry = 0; retry < 2; ++retry) { - err = it->validate_first_page(flushed_lsn); + err = it->validate_first_page(); if (err != DB_SUCCESS && (retry == 1 @@ -593,6 +588,13 @@ SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn) return(err); } + if (srv_operation == SRV_OPERATION_NORMAL) { + /* Prepare for possible upgrade from 0-sized ib_logfile0. */ + ut_ad(!log_sys.next_checkpoint_lsn); + log_sys.next_checkpoint_lsn = mach_read_from_8( + it->m_first_page + 26/*FIL_PAGE_FILE_FLUSH_LSN*/); + } + it->close(); return(DB_SUCCESS); @@ -830,14 +832,12 @@ SysTablespace::check_file_spec( @param[in] is_temp whether this is a temporary tablespace @param[in] create_new_db whether we are creating a new database @param[out] sum_new_sizes sum of sizes of the new files added -@param[out] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first file @return DB_SUCCESS or error code */ dberr_t SysTablespace::open_or_create( bool is_temp, bool create_new_db, - ulint* sum_new_sizes, - lsn_t* flush_lsn) + ulint* sum_new_sizes) { dberr_t err = DB_SUCCESS; fil_space_t* space = NULL; @@ -886,10 +886,9 @@ SysTablespace::open_or_create( } - if (!create_new_db && flush_lsn) { - /* Validate the header page in the first datafile - and read LSNs fom the others. */ - err = read_lsn_and_check_flags(flush_lsn); + if (!create_new_db && space_id() == TRX_SYS_SPACE) { + /* Validate the header page in the first datafile. 
*/ + err = read_lsn_and_check_flags(); if (err != DB_SUCCESS) { return(err); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 46b716dd31a..ac8f8e37901 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -953,8 +953,9 @@ static SHOW_VAR innodb_status_variables[]= { &export_vars.innodb_buffer_pool_write_requests, SHOW_SIZE_T}, {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T}, {"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T}, - {"data_fsyncs", &export_vars.innodb_data_fsyncs, SHOW_SIZE_T}, - {"data_pending_fsyncs", &export_vars.innodb_data_pending_fsyncs,SHOW_SIZE_T}, + {"data_fsyncs", (size_t*) &os_n_fsyncs, SHOW_SIZE_T}, + {"data_pending_fsyncs", + (size_t*) &fil_n_pending_tablespace_flushes, SHOW_SIZE_T}, {"data_pending_reads", &export_vars.innodb_data_pending_reads, SHOW_SIZE_T}, {"data_pending_writes", &export_vars.innodb_data_pending_writes,SHOW_SIZE_T}, {"data_read", &export_vars.innodb_data_read, SHOW_SIZE_T}, @@ -979,9 +980,9 @@ static SHOW_VAR innodb_status_variables[]= { {"ibuf_merges", &ibuf.n_merges, SHOW_SIZE_T}, {"ibuf_segment_size", &ibuf.seg_size, SHOW_SIZE_T}, {"ibuf_size", &ibuf.size, SHOW_SIZE_T}, - {"log_waits", &export_vars.innodb_log_waits, SHOW_SIZE_T}, - {"log_write_requests", &export_vars.innodb_log_write_requests, SHOW_SIZE_T}, - {"log_writes", &export_vars.innodb_log_writes, SHOW_SIZE_T}, + {"log_waits", &log_sys.waits, SHOW_SIZE_T}, + {"log_write_requests", &log_sys.write_to_buf, SHOW_SIZE_T}, + {"log_writes", &log_sys.write_to_log, SHOW_SIZE_T}, {"lsn_current", &export_vars.innodb_lsn_current, SHOW_ULONGLONG}, {"lsn_flushed", &export_vars.innodb_lsn_flushed, SHOW_ULONGLONG}, {"lsn_last_checkpoint", &export_vars.innodb_lsn_last_checkpoint, @@ -993,11 +994,6 @@ static SHOW_VAR innodb_status_variables[]= { {"mem_adaptive_hash", &export_vars.innodb_mem_adaptive_hash, SHOW_SIZE_T}, #endif 
{"mem_dictionary", &export_vars.innodb_mem_dictionary, SHOW_SIZE_T}, - {"os_log_fsyncs", &export_vars.innodb_os_log_fsyncs, SHOW_SIZE_T}, - {"os_log_pending_fsyncs", &export_vars.innodb_os_log_pending_fsyncs, - SHOW_SIZE_T}, - {"os_log_pending_writes", &export_vars.innodb_os_log_pending_writes, - SHOW_SIZE_T}, {"os_log_written", &export_vars.innodb_os_log_written, SHOW_SIZE_T}, {"page_size", &srv_page_size, SHOW_ULONG}, {"pages_created", &buf_pool.stat.n_pages_created, SHOW_SIZE_T}, @@ -1621,54 +1617,14 @@ innobase_start_trx_and_assign_read_view( be committed */ /** Flush InnoDB redo logs to the file system. -@param[in] hton InnoDB handlerton -@param[in] binlog_group_flush true if we got invoked by binlog -group commit during flush stage, false in other cases. @return false */ -static -bool -innobase_flush_logs( - handlerton* hton, - bool binlog_group_flush) +static bool innobase_flush_logs(handlerton*) { - DBUG_ENTER("innobase_flush_logs"); - DBUG_ASSERT(hton == innodb_hton_ptr); - - if (srv_read_only_mode) { - DBUG_RETURN(false); - } - - /* If !binlog_group_flush, we got invoked by FLUSH LOGS or similar. - Else, we got invoked by binlog group commit during flush stage. */ - - if (binlog_group_flush && srv_flush_log_at_trx_commit == 0) { - /* innodb_flush_log_at_trx_commit=0 - (write and sync once per second). - Do not flush the redo log during binlog group commit. */ - DBUG_RETURN(false); - } - - /* Flush the redo log buffer to the redo log file. - Sync it to disc if we are in FLUSH LOGS, or if - innodb_flush_log_at_trx_commit=1 - (write and sync at each commit). */ - log_buffer_flush_to_disk(!binlog_group_flush - || srv_flush_log_at_trx_commit == 1); - - DBUG_RETURN(false); -} - -/** Flush InnoDB redo logs to the file system. -@param[in] hton InnoDB handlerton -@param[in] binlog_group_flush true if we got invoked by binlog -group commit during flush stage, false in other cases. 
-@return false */ -static -bool -innobase_flush_logs( - handlerton* hton) -{ - return innobase_flush_logs(hton, true); + if (!srv_read_only_mode && srv_flush_log_at_trx_commit) + /* Write any outstanding redo log. Durably if + innodb_flush_log_at_trx_commit=1. */ + log_buffer_flush_to_disk(srv_flush_log_at_trx_commit == 1); + return false; } /************************************************************************//** @@ -3964,23 +3920,6 @@ static int innodb_init_params() << srv_page_size_shift); } - if (srv_log_write_ahead_size > srv_page_size) { - srv_log_write_ahead_size = srv_page_size; - } else { - ulong srv_log_write_ahead_size_tmp = OS_FILE_LOG_BLOCK_SIZE; - - while (srv_log_write_ahead_size_tmp - < srv_log_write_ahead_size) { - srv_log_write_ahead_size_tmp - = srv_log_write_ahead_size_tmp * 2; - } - if (srv_log_write_ahead_size_tmp - != srv_log_write_ahead_size) { - srv_log_write_ahead_size - = srv_log_write_ahead_size_tmp / 2; - } - } - srv_buf_pool_size = ulint(innobase_buffer_pool_size); if (innobase_open_files < 10) { @@ -18275,31 +18214,25 @@ static my_bool innodb_buf_flush_list_now = TRUE; static uint innodb_merge_threshold_set_all_debug = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; -/****************************************************************//** -Force innodb to checkpoint. */ +/** Force an InnoDB log checkpoint. */ static void -checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save) +checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void *save) { - if (*(my_bool*) save) { - mysql_mutex_unlock(&LOCK_global_system_variables); + if (!*static_cast(save)) + return; + const auto size= log_sys.is_encrypted() + ? 
SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT; + mysql_mutex_unlock(&LOCK_global_system_variables); + lsn_t lsn; + while (log_sys.last_checkpoint_lsn.load(std::memory_order_acquire) + size < + (lsn= log_sys.get_lsn(std::memory_order_acquire))) + log_make_checkpoint(); - lsn_t lsn; + if (dberr_t err= fil_write_flushed_lsn(lsn)) + sql_print_warning("innodb_checkpoint_now_set failed: %d", err); - while (log_sys.last_checkpoint_lsn.load( - std::memory_order_acquire) - + SIZE_OF_FILE_CHECKPOINT - < (lsn= log_sys.get_lsn(std::memory_order_acquire))) { - log_make_checkpoint(); - log_sys.log.flush(); - } - - if (dberr_t err = fil_write_flushed_lsn(lsn)) { - ib::warn() << "Checkpoint set failed " << err; - } - - mysql_mutex_lock(&LOCK_global_system_variables); - } + mysql_mutex_lock(&LOCK_global_system_variables); } /****************************************************************//** @@ -18446,51 +18379,6 @@ buffer_pool_load_abort( } } -/****************************************************************//** -Update the system variable innodb_log_write_ahead_size using the "saved" -value. This function is registered as a callback with MySQL. 
*/ -static -void -innodb_log_write_ahead_size_update( -/*===============================*/ - THD* thd, /*!< in: thread handle */ - st_mysql_sys_var*, void*, - const void* save) /*!< in: immediate result - from check function */ -{ - ulong val = OS_FILE_LOG_BLOCK_SIZE; - ulong in_val = *static_cast(save); - - while (val < in_val) { - val = val * 2; - } - - if (val > srv_page_size) { - val = srv_page_size; - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "innodb_log_write_ahead_size cannot" - " be set higher than innodb_page_size."); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "Setting innodb_log_write_ahead_size" - " to %lu", - srv_page_size); - } else if (val != in_val) { - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "innodb_log_write_ahead_size should be" - " set 2^n value and larger than 512."); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "Setting innodb_log_write_ahead_size" - " to %lu", - val); - } - - srv_log_write_ahead_size = val; -} - /** Update innodb_status_output or innodb_status_output_locks, which control InnoDB "status monitor" output to the error log. 
@param[out] var current value @@ -18724,7 +18612,7 @@ innobase_wsrep_set_checkpoint( if (wsrep_is_wsrep_xid(xid)) { trx_rseg_update_wsrep_checkpoint(xid); - innobase_flush_logs(hton, false); + log_buffer_flush_to_disk(srv_flush_log_at_trx_commit == 1); return 0; } else { return 1; @@ -19299,23 +19187,15 @@ static MYSQL_SYSVAR_ULONG(page_size, srv_page_size, NULL, NULL, UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0); -static MYSQL_SYSVAR_ULONG(log_buffer_size, srv_log_buffer_size, +static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "The size of the buffer which InnoDB uses to write log to the log files on disk.", - NULL, NULL, 16L << 20, 256L << 10, LONG_MAX, 1024); + "Redo log buffer size in bytes.", + NULL, NULL, 16U << 20, 2U << 20, SIZE_T_MAX, 4096); static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Size of each log file in a log group.", - NULL, NULL, 96 << 20, 1 << 20, std::numeric_limits::max(), - UNIV_PAGE_SIZE_MAX); - -static MYSQL_SYSVAR_ULONG(log_write_ahead_size, srv_log_write_ahead_size, - PLUGIN_VAR_RQCMDARG, - "Redo log write ahead unit size to avoid read-on-write," - " it should match the OS cache block IO size", - NULL, innodb_log_write_ahead_size_update, - 8*1024L, OS_FILE_LOG_BLOCK_SIZE, UNIV_PAGE_SIZE_DEF, OS_FILE_LOG_BLOCK_SIZE); + "Redo log size in bytes.", + NULL, NULL, 96 << 20, 1U << 20, std::numeric_limits::max(), 4096); static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct, PLUGIN_VAR_RQCMDARG, @@ -19791,7 +19671,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(page_size), MYSQL_SYSVAR(log_buffer_size), MYSQL_SYSVAR(log_file_size), - MYSQL_SYSVAR(log_write_ahead_size), MYSQL_SYSVAR(log_group_home_dir), MYSQL_SYSVAR(max_dirty_pages_pct), MYSQL_SYSVAR(max_dirty_pages_pct_lwm), diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index 
e666c285bf0..f41e3f659fc 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2021, MariaDB Corporation. +Copyright (c) 2016, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2597,7 +2597,7 @@ static bool ibuf_get_volume_buffered_hash(const rec_t *rec, ulint *hash, ut_ad(rec_get_n_fields_old(rec) > IBUF_REC_FIELD_USER); const ulint start= rec_get_field_start_offs(rec, IBUF_REC_FIELD_USER); const ulint len= rec_get_data_size_old(rec) - start; - const uint32_t fold= ut_crc32(rec + start, len); + const uint32_t fold= my_crc32c(0, rec + start, len); hash+= (fold / (CHAR_BIT * sizeof *hash)) % size; ulint bitmask= static_cast(1) << (fold % (CHAR_BIT * sizeof(*hash))); diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 6cdb954ec29..cf6eed7f103 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1773,50 +1773,14 @@ void fil_names_dirty( fil_space_t* space); -/** Write FILE_MODIFY records when a non-predefined persistent -tablespace was modified for the first time since the latest -fil_names_clear(). 
-@param[in,out] space tablespace */ -void fil_names_dirty_and_write(fil_space_t* space); - -/** Write FILE_MODIFY records if a persistent tablespace was modified -for the first time since the latest fil_names_clear(). -@param[in,out] space tablespace -@param[in,out] mtr mini-transaction -@return whether any FILE_MODIFY record was written */ -inline bool fil_names_write_if_was_clean(fil_space_t* space) -{ - mysql_mutex_assert_owner(&log_sys.mutex); - - if (space == NULL) { - return(false); - } - - const bool was_clean = space->max_lsn == 0; - ut_ad(space->max_lsn <= log_sys.get_lsn()); - space->max_lsn = log_sys.get_lsn(); - - if (was_clean) { - fil_names_dirty_and_write(space); - } - - return(was_clean); -} - bool fil_comp_algo_loaded(ulint comp_algo); /** On a log checkpoint, reset fil_names_dirty_and_write() flags -and write out FILE_MODIFY and FILE_CHECKPOINT if needed. -@param[in] lsn checkpoint LSN -@param[in] do_write whether to always write FILE_CHECKPOINT -@return whether anything was written to the redo log -@retval false if no flags were set and nothing written -@retval true if anything was written to the redo log */ -bool -fil_names_clear( - lsn_t lsn, - bool do_write); +and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT. +@param lsn checkpoint LSN +@return current LSN */ +lsn_t fil_names_clear(lsn_t lsn); #ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH void test_make_filepath(); diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h index 9dfb3cc7561..ce11b868bd1 100644 --- a/storage/innobase/include/fsp0file.h +++ b/storage/innobase/include/fsp0file.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2021, MariaDB Corporation. +Copyright (c) 2018, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -216,11 +216,10 @@ public: tablespace is opened. This occurs before the fil_space_t is created so the Space ID found here must not already be open. m_is_valid is set true on success, else false. - @param[out] flush_lsn contents of FIL_PAGE_FILE_FLUSH_LSN @retval DB_SUCCESS on if the datafile is valid @retval DB_CORRUPTION if the datafile is not readable @retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */ - dberr_t validate_first_page(lsn_t* flush_lsn) + dberr_t validate_first_page() MY_ATTRIBUTE((warn_unused_result)); /** Get Datafile::m_filepath. diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h index b6bdadd3501..514f3fdbf25 100644 --- a/storage/innobase/include/fsp0sysspace.h +++ b/storage/innobase/include/fsp0sysspace.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2021, MariaDB Corporation. +Copyright (c) 2016, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -151,20 +151,17 @@ public: @param[in] is_temp whether this is a temporary tablespace @param[in] create_new_db whether we are creating a new database @param[out] sum_new_sizes sum of sizes of the new files added - @param[out] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first file @return DB_SUCCESS or error code */ dberr_t open_or_create( bool is_temp, bool create_new_db, - ulint* sum_new_sizes, - lsn_t* flush_lsn) + ulint* sum_new_sizes) MY_ATTRIBUTE((warn_unused_result)); private: /** Check the tablespace header for this tablespace. 
- @param[out] flushed_lsn the value of FIL_PAGE_FILE_FLUSH_LSN @return DB_SUCCESS or error code */ - dberr_t read_lsn_and_check_flags(lsn_t* flushed_lsn); + inline dberr_t read_lsn_and_check_flags(); /** @return true if the last file size is valid. */ diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h index b9390927ece..22c0c9636bf 100644 --- a/storage/innobase/include/log0crypt.h +++ b/storage/innobase/include/log0crypt.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (C) 2014, 2021, MariaDB Corporation. +Copyright (C) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,27 +24,26 @@ Created 11/25/2013 Minli Zhu Modified Jan Lindström jan.lindstrom@mariadb.com MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation. *******************************************************/ -#ifndef log0crypt_h -#define log0crypt_h +#pragma once #include "log0log.h" -/** innodb_encrypt_log: whether to encrypt the redo log */ -extern my_bool srv_encrypt_log; - /** Initialize the redo log encryption key and random parameters when creating a new redo log. -The random parameters will be persisted in the log checkpoint pages. -@see log_crypt_write_checkpoint_buf() -@see log_crypt_read_checkpoint_buf() +The random parameters will be persisted in the log header. +@see log_crypt_write_header() +@see log_crypt_read_header() @return whether the operation succeeded */ bool log_crypt_init(); -/*********************************************************************//** -Writes the crypto (version, msg and iv) info, which has been used for -log blocks with lsn <= this checkpoint's lsn, to a log header's -checkpoint buf. 
*/ -void log_crypt_write_checkpoint_buf(byte *buf); +/** Add the encryption information to the log header buffer. +@param buf part of log header buffer */ +void log_crypt_write_header(byte *buf); + +/** Read the encryption information from the log header buffer. +@param buf part of log header buffer +@return whether the operation was successful */ +bool log_crypt_read_header(const byte *buf); /** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info. @param[in] buf checkpoint buffer @@ -60,25 +59,28 @@ ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn); /** Read the checkpoint crypto (version, msg and iv) info. @param[in] buf checkpoint buffer @return whether the operation was successful */ -bool log_crypt_read_checkpoint_buf(const byte* buf); +ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf); -/** log_crypt() operation code */ -enum log_crypt_t { - /** encrypt a log block without rotating key */ - LOG_ENCRYPT, - /** decrypt a log block */ - LOG_DECRYPT, - /** attempt to rotate the key, and encrypt a log block */ - LOG_ENCRYPT_ROTATE_KEY -}; - -/** Encrypt or decrypt log blocks. -@param[in,out] buf log blocks to encrypt or decrypt +/** Decrypt log blocks. +@param[in,out] buf log blocks to decrypt @param[in] lsn log sequence number of the start of the buffer @param[in] size size of the buffer, in bytes -@param[in] op whether to decrypt, encrypt, or rotate key and encrypt -@return whether the operation succeeded (encrypt always does) */ -bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT); +@return whether the operation succeeded */ +ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size); + +/** Decrypt part of a log record. 
+@param iv initialization vector +@param buf buffer for the decrypted data +@param data the encrypted data +@param len length of the data, in bytes +@return buf */ +byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len); + +/** Decrypt a log snippet. +@param iv initialization vector +@param buf buffer to be replaced with decrypted contents +@param end pointer past the end of buf */ +void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end); /** Encrypt or decrypt a temporary file block. @param[in] src block to encrypt or decrypt @@ -111,7 +113,3 @@ log_tmp_block_decrypt( { return(log_tmp_block_encrypt(src, size, dst, offs, false)); } - -/** @return whether temporary files are encrypted */ -inline bool log_tmp_is_encrypted() { return srv_encrypt_log; } -#endif // log0crypt.h diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 76deace8fde..618bda3b87a 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -31,14 +31,12 @@ Database log Created 12/9/1995 Heikki Tuuri *******************************************************/ -#ifndef log0log_h -#define log0log_h +#pragma once #include "log0types.h" #include "os0file.h" #include "span.h" #include "my_atomic_wrapper.h" -#include #include using st_::span; @@ -51,9 +49,6 @@ static const char LOG_FILE_NAME[] = "ib_logfile0"; @return path with log file name*/ std::string get_log_file_path(const char *filename= LOG_FILE_NAME); -/** Returns paths for all existing log files */ -std::vector get_existing_log_files_paths(); - /** Delete log file. 
@param[in] suffix suffix of the file name */ static inline void delete_log_file(const char* suffix) @@ -62,31 +57,6 @@ static inline void delete_log_file(const char* suffix) os_file_delete_if_exists(innodb_log_file_key, path.c_str(), nullptr); } -/** Append a string to the log. -@param[in] str string -@param[in] len string length -@param[out] start_lsn start LSN of the log record -@return end lsn of the log record, zero if did not succeed */ -UNIV_INLINE -lsn_t -log_reserve_and_write_fast( - const void* str, - ulint len, - lsn_t* start_lsn); -/***********************************************************************//** -Checks if there is need for a log buffer flush or a new checkpoint, and does -this if yes. Any database operation should call this when it has modified -more than about 4 pages. NOTE that this function may only be called when the -OS thread owns no synchronization objects except dict_sys.latch. */ -UNIV_INLINE -void -log_free_check(void); -/*================*/ - -/** Extends the log buffer. -@param[in] len requested minimum size in bytes */ -void log_buffer_extend(ulong len); - /** Calculate the recommended highest values for lsn - last_checkpoint_lsn and lsn - buf_pool.get_oldest_modification(). @param[in] file_size requested innodb_log_file_size @@ -97,25 +67,20 @@ bool log_set_capacity(ulonglong file_size) MY_ATTRIBUTE((warn_unused_result)); -/** -Ensure that the log has been written to the log file up to a given +struct completion_callback; + +/** Ensure that the log has been written to the log file up to a given log entry (such as that of a transaction commit). Start a new write, or wait and check if an already running write is covering the request. -@param[in] lsn log sequence number that should be -included in the redo log file write -@param[in] flush_to_disk whether the written log should also -be flushed to the file system -@param[in] rotate_key whether to rotate the encryption key -@param[in] cb completion callback. 
If not NULL, the callback will be called - whenever lsn is written or flushed. -*/ -struct completion_callback; -void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false, - const completion_callback* cb=nullptr); +@param lsn log sequence number that should be included in the file write +@param durable whether the write needs to be durable +@param callback log write completion callback */ +void log_write_up_to(lsn_t lsn, bool durable, + const completion_callback *callback= nullptr); /** Write to the log file up to the last log entry. -@param sync whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool sync= true); +@param durable whether to wait for a durable write to complete */ +void log_buffer_flush_to_disk(bool durable= true); /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */ @@ -130,10 +95,6 @@ ATTRIBUTE_COLD void log_make_checkpoint(); /** Make a checkpoint at the latest lsn on shutdown. */ ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown(); -/** Write checkpoint info to the log header and release log_sys.mutex. -@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ -ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn); - /** Checks that there is enough free space in the log to start a new query step. Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this @@ -141,100 +102,6 @@ function may only be called if the calling thread owns no synchronization objects! */ ATTRIBUTE_COLD void log_check_margins(); -/************************************************************//** -Gets a log block flush bit. -@return TRUE if this block was the first to be written in a log flush */ -UNIV_INLINE -ibool -log_block_get_flush_bit( -/*====================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Gets a log block number stored in the header. 
-@return log block number stored in the block header */ -UNIV_INLINE -ulint -log_block_get_hdr_no( -/*=================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Gets a log block data length. -@return log block data length measured as a byte offset from the block start */ -UNIV_INLINE -ulint -log_block_get_data_len( -/*===================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Sets the log block data length. */ -UNIV_INLINE -void -log_block_set_data_len( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint len); /*!< in: data length */ -/** Calculate the CRC-32C checksum of a log block. -@param[in] block log block -@return checksum */ -inline ulint log_block_calc_checksum_crc32(const byte* block); - -/************************************************************//** -Gets a log block checksum field value. -@return checksum */ -UNIV_INLINE -ulint -log_block_get_checksum( -/*===================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Sets a log block checksum field value. */ -UNIV_INLINE -void -log_block_set_checksum( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint checksum); /*!< in: checksum */ -/************************************************************//** -Gets a log block first mtr log record group offset. -@return first mtr log record group byte offset from the block start, 0 -if none */ -UNIV_INLINE -ulint -log_block_get_first_rec_group( -/*==========================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Sets the log block first mtr log record group offset. 
*/ -UNIV_INLINE -void -log_block_set_first_rec_group( -/*==========================*/ - byte* log_block, /*!< in/out: log block */ - ulint offset); /*!< in: offset, 0 if none */ -/************************************************************//** -Gets a log block checkpoint number field (4 lowest bytes). -@return checkpoint no (4 lowest bytes) */ -UNIV_INLINE -ulint -log_block_get_checkpoint_no( -/*========================*/ - const byte* log_block); /*!< in: log block */ -/************************************************************//** -Initializes a log block in the log buffer. */ -UNIV_INLINE -void -log_block_init( -/*===========*/ - byte* log_block, /*!< in: pointer to the log buffer */ - lsn_t lsn); /*!< in: lsn within the log block */ -/************************************************************//** -Converts a lsn to a log block number. -@return log block number, it is > 0 and <= 1G */ -UNIV_INLINE -ulint -log_block_convert_lsn_to_no( -/*========================*/ - lsn_t lsn); /*!< in: lsn of a byte within the block */ /******************************************************//** Prints info of the log. 
*/ void @@ -247,82 +114,12 @@ void log_refresh_stats(void); /*===================*/ -/* The counting of lsn's starts from this value: this must be non-zero */ -#define LOG_START_LSN ((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE)) - -/* Offsets of a log block header */ -#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and - is allowed to wrap around at 2G; the - highest bit is set to 1 if this is the - first log block in a log flush write - segment */ -#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL - /* mask used to get the highest bit in - the preceding field */ -#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to - this block */ -#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an - mtr log record group in this log block, - 0 if none; if the value is the same - as LOG_BLOCK_HDR_DATA_LEN, it means - that the first rec group has not yet - been catenated to this log block, but - if it will, it will start at this - offset; an archive recovery can - start parsing the log records starting - from this offset in this log block, - if value not 0 */ -#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of - log_sys.next_checkpoint_no when the - log block was last written to: if the - block has not yet been written full, - this value is only updated before a - log buffer flush */ -#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in - bytes */ - -#define LOG_BLOCK_KEY 4 /* encryption key version - before LOG_BLOCK_CHECKSUM; - after log_t::FORMAT_ENC_10_4 only */ -#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block - contents; in InnoDB versions - < 3.23.52 this did not contain the - checksum but the same value as - LOG_BLOCK_HDR_NO */ - -/** Offsets inside the checkpoint pages (redo log format version 1) @{ */ -/** Checkpoint number */ -#define LOG_CHECKPOINT_NO 0 -/** Log sequence number up to which all changes have been flushed */ -#define LOG_CHECKPOINT_LSN 8 -/** Byte offset of the log record 
corresponding to LOG_CHECKPOINT_LSN */ -#define LOG_CHECKPOINT_OFFSET 16 -/** srv_log_buffer_size at the time of the checkpoint (not used) */ -#define LOG_CHECKPOINT_LOG_BUF_SIZE 24 -/** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/ -#define LOG_CHECKPOINT_CRYPT_KEY 32 -/** MariaDB 10.2.5 encrypted redo log random nonce (32 bits) */ -#define LOG_CHECKPOINT_CRYPT_NONCE 36 -/** MariaDB 10.2.5 encrypted redo log random message (MY_AES_BLOCK_SIZE) */ -#define LOG_CHECKPOINT_CRYPT_MESSAGE 40 -/** start LSN of the MLOG_CHECKPOINT mini-transaction corresponding -to this checkpoint, or 0 if the information has not been written */ -#define LOG_CHECKPOINT_END_LSN OS_FILE_LOG_BLOCK_SIZE - 16 - -/* @} */ - /** Offsets of a log file header */ /* @{ */ /** Log file header format identifier (32-bit unsigned big-endian integer). This used to be called LOG_GROUP_ID and always written as 0, because InnoDB never supported more than one copy of the redo log. */ #define LOG_HEADER_FORMAT 0 -/** Redo log subformat (originally 0). In format version 0, the -LOG_FILE_START_LSN started here, 4 bytes earlier than LOG_HEADER_START_LSN, -which the LOG_FILE_START_LSN was renamed to. -Subformat 1 is for the fully redo-logged TRUNCATE -(no MLOG_TRUNCATE records or extra log checkpoints or log file) */ -#define LOG_HEADER_SUBFORMAT 4 /** LSN of the start of data in this log file (with format version 1; in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */ #define LOG_HEADER_START_LSN 8 @@ -331,123 +128,69 @@ and the creation time if the log file was created by mysqlbackup --restore, or the MySQL version that created the redo log file. */ #define LOG_HEADER_CREATOR 16 /** End of the log file creator field. */ -#define LOG_HEADER_CREATOR_END (LOG_HEADER_CREATOR + 32) -/** Contents of the LOG_HEADER_CREATOR field */ -#define LOG_HEADER_CREATOR_CURRENT \ - "MariaDB " \ - IB_TO_STR(MYSQL_VERSION_MAJOR) "." \ - IB_TO_STR(MYSQL_VERSION_MINOR) "." 
\ - IB_TO_STR(MYSQL_VERSION_PATCH) - +#define LOG_HEADER_CREATOR_END 48 /* @} */ -#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE - /* first checkpoint field in the log - header; we write alternately to the - checkpoint fields when we make new - checkpoints; this field is only defined - in the first log file of a log */ -#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE) - /* second checkpoint field in the log - header */ -#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE) +struct log_t; -/** Abstraction for reading, writing and flushing file cache to disk */ -class file_io -{ -public: - file_io(bool durable_writes= false) : m_durable_writes(durable_writes) {} - virtual ~file_io() noexcept {}; - virtual dberr_t open(const char *path, bool read_only) noexcept= 0; - virtual dberr_t rename(const char *old_path, - const char *new_path) noexcept= 0; - virtual dberr_t close() noexcept= 0; - virtual dberr_t read(os_offset_t offset, span buf) noexcept= 0; - virtual dberr_t write(const char *path, os_offset_t offset, - span buf) noexcept= 0; - virtual dberr_t flush() noexcept= 0; - - /** Durable writes doesn't require calling flush() */ - bool writes_are_durable() const noexcept { return m_durable_writes; } - -protected: - bool m_durable_writes; -}; - -class file_os_io final: public file_io -{ -public: - file_os_io()= default; - file_os_io(const file_os_io &)= delete; - file_os_io &operator=(const file_os_io &)= delete; - file_os_io(file_os_io &&rhs); - file_os_io &operator=(file_os_io &&rhs); - ~file_os_io() noexcept; - - dberr_t open(const char *path, bool read_only) noexcept final; - bool is_opened() const noexcept { return m_fd != OS_FILE_CLOSED; } - dberr_t rename(const char *old_path, const char *new_path) noexcept final; - dberr_t close() noexcept final; - dberr_t read(os_offset_t offset, span buf) noexcept final; - dberr_t write(const char *path, os_offset_t offset, - span buf) noexcept final; - dberr_t flush() noexcept final; - -private: - pfs_os_file_t 
m_fd{OS_FILE_CLOSED}; -}; - -/** File abstraction + path */ +/** File abstraction */ class log_file_t { + friend log_t; + pfs_os_file_t m_file; public: - log_file_t(std::string path= "") noexcept : m_path{std::move(path)} {} + log_file_t()= default; + log_file_t(pfs_os_file_t file) noexcept : m_file(file) {} - dberr_t open(bool read_only) noexcept; - bool is_opened() const noexcept; + /** Open a file + @return file size in bytes + @retval 0 if not readable */ + os_offset_t open(bool read_only) noexcept; + bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; } - const std::string &get_path() const noexcept { return m_path; } - - dberr_t rename(std::string new_path) noexcept; dberr_t close() noexcept; dberr_t read(os_offset_t offset, span buf) noexcept; - bool writes_are_durable() const noexcept; dberr_t write(os_offset_t offset, span buf) noexcept; - dberr_t flush() noexcept; - void free() - { - m_path.clear(); - m_path.shrink_to_fit(); - } - -private: - std::unique_ptr m_file; - std::string m_path; + bool flush() const noexcept { return os_file_flush(m_file); } +#ifdef HAVE_PMEM + byte *mmap(bool read_only, const struct stat &st) noexcept; +#endif }; /** Redo log buffer */ -struct log_t{ +struct log_t +{ /** The original (not version-tagged) InnoDB redo log format */ - static constexpr uint32_t FORMAT_3_23 = 0; + static constexpr uint32_t FORMAT_3_23= 0; /** The MySQL 5.7.9/MariaDB 10.2.2 log format */ - static constexpr uint32_t FORMAT_10_2 = 1; - /** The MariaDB 10.3.2 log format. - To prevent crash-downgrade to earlier 10.2 due to the inability to - roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record, - MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT - 1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2 - (MDEV-13564 backup-friendly TRUNCATE). */ - static constexpr uint32_t FORMAT_10_3 = 103; + static constexpr uint32_t FORMAT_10_2= 1; + /** The MariaDB 10.3.2 log format. 
*/ + static constexpr uint32_t FORMAT_10_3= 103; /** The MariaDB 10.4.0 log format. */ - static constexpr uint32_t FORMAT_10_4 = 104; + static constexpr uint32_t FORMAT_10_4= 104; /** Encrypted MariaDB redo log */ - static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31; + static constexpr uint32_t FORMAT_ENCRYPTED= 1U << 31; /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */ - static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED; - /** The MariaDB 10.5 physical redo log format */ - static constexpr uint32_t FORMAT_10_5 = 0x50485953; - /** The MariaDB 10.5 physical format (only with innodb_encrypt_log=ON) */ - static constexpr uint32_t FORMAT_ENC_10_5 = FORMAT_10_5 | FORMAT_ENCRYPTED; + static constexpr uint32_t FORMAT_ENC_10_4= FORMAT_10_4 | FORMAT_ENCRYPTED; + /** The MariaDB 10.5.1 physical redo log format */ + static constexpr uint32_t FORMAT_10_5= 0x50485953; + /** The MariaDB 10.5.1 physical format (only with innodb_encrypt_log=ON) */ + static constexpr uint32_t FORMAT_ENC_10_5= FORMAT_10_5 | FORMAT_ENCRYPTED; + /** The MariaDB 10.8.0 variable-block-size redo log format */ + static constexpr uint32_t FORMAT_10_8= 0x50687973; + /** The MariaDB 10.8.0 format with innodb_encrypt_log=ON */ + static constexpr uint32_t FORMAT_ENC_10_8= FORMAT_10_8 | FORMAT_ENCRYPTED; + + /** Location of the first checkpoint block */ + static constexpr size_t CHECKPOINT_1= 4096; + /** Location of the second checkpoint block */ + static constexpr size_t CHECKPOINT_2= 8192; + /** Start of record payload */ + static constexpr lsn_t START_OFFSET= 12288; + + /** smallest possible log sequence number in the current format + (used to be 2048 before FORMAT_10_8). 
*/ + static constexpr lsn_t FIRST_LSN= START_OFFSET; private: /** The log sequence number of the last change of durable InnoDB files */ @@ -462,117 +205,49 @@ private: public: /** mutex protecting the log */ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; +private: + /** Last written LSN */ + lsn_t write_lsn; +public: /** first free offset within the log buffer in use */ size_t buf_free; /** recommended maximum size of buf, after which the buffer is flushed */ size_t max_buf_free; - /** mutex to serialize access to the flush list when we are putting - dirty blocks in the list. The idea behind this mutex is to be able - to release log_sys.mutex during mtr_commit and still ensure that - insertions in the flush_list happen in the LSN order. */ + /** mutex that ensures that inserts into buf_pool.flush_list are in + LSN order; allows mtr_t::commit() to release log_sys.mutex earlier */ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex; - /** log_buffer, append data here */ + /** log record buffer, written to by mtr_t::commit() */ byte *buf; - /** log_buffer, writing data to file from this buffer. - Before flushing write_buf is swapped with flush_buf */ + /** buffer for writing data to ib_logfile0, or nullptr if is_pmem() + In write_buf(), buf and flush_buf are swapped */ byte *flush_buf; - /** Log file stuff. Protected by mutex. 
*/ - struct file { - /** format of the redo log: e.g., FORMAT_10_5 */ - uint32_t format; - /** redo log subformat: 0 with separately logged TRUNCATE, - 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */ - uint32_t subformat; - /** individual log file size in bytes, including the header */ - lsn_t file_size; - private: - /** lsn used to fix coordinates within the log group */ - lsn_t lsn; - /** the byte offset of the above lsn */ - lsn_t lsn_offset; - /** log file */ - log_file_t fd; + /** number of write requests (to buf); protected by mutex */ + ulint write_to_buf; + /** number of std::swap(buf, flush_buf) and writes from buf to log; + protected by mutex */ + ulint write_to_log; + /** number of waits in append_prepare() */ + ulint waits; + /** innodb_log_buffer_size (size of buf and flush_buf, in bytes) */ + size_t buf_size; - public: - /** used only in recovery: recovery scan succeeded up to this - lsn in this log group */ - lsn_t scanned_lsn; - - /** opens log file which must be closed prior this call */ - void open_file(std::string path); - /** writes header */ - void write_header_durable(lsn_t lsn); - /** opens log file which must be closed prior this call */ - dberr_t rename(std::string path) { return fd.rename(path); } - /** reads buffer from log file - @param[in] offset offset in log file - @param[in] buf buffer where to read */ - void read(os_offset_t offset, span buf); - /** Tells whether writes require calling flush() */ - bool writes_are_durable() const noexcept; - /** writes buffer to log file - @param[in] offset offset in log file - @param[in] buf buffer from which to write */ - void write(os_offset_t offset, span buf); - /** flushes OS page cache (excluding metadata!) 
for log file */ - void flush(); - /** closes log file */ - void close_file(); - - /** @return whether the redo log is encrypted */ - bool is_encrypted() const { return format & FORMAT_ENCRYPTED; } - /** @return whether the redo log is in the physical format */ - bool is_physical() const - { return (format & ~FORMAT_ENCRYPTED) == FORMAT_10_5; } - /** @return capacity in bytes */ - lsn_t capacity() const{ return file_size - LOG_FILE_HDR_SIZE; } - /** Calculate the offset of a log sequence number. - @param[in] lsn log sequence number - @return offset within the log */ - inline lsn_t calc_lsn_offset(lsn_t lsn) const; - inline lsn_t calc_lsn_offset_old(lsn_t lsn) const; - - /** Set the field values to correspond to a given lsn. */ - void set_fields(lsn_t lsn) - { - lsn_t c_lsn_offset = calc_lsn_offset(lsn); - set_lsn(lsn); - set_lsn_offset(c_lsn_offset); - } - - /** Read a log segment to log_sys.buf. - @param[in,out] start_lsn in: read area start, - out: the last read valid lsn - @param[in] end_lsn read area end - @return whether no invalid blocks (e.g checksum mismatch) were found */ - bool read_log_seg(lsn_t* start_lsn, lsn_t end_lsn); - - /** Initialize the redo log buffer. */ - void create(); - - /** Close the redo log buffer. 
*/ - void close() { close_file(); } - void set_lsn(lsn_t a_lsn); - lsn_t get_lsn() const { return lsn; } - void set_lsn_offset(lsn_t a_lsn); - lsn_t get_lsn_offset() const { return lsn_offset; } - } log; + /** log file size in bytes, including the header */ + lsn_t file_size; +private: + /** the log sequence number at the start of the log file */ + lsn_t first_lsn; +#if defined __linux__ || defined _WIN32 + /** The physical block size of the storage */ + uint32_t block_size; +#endif +public: + /** format of the redo log: e.g., FORMAT_10_8 */ + uint32_t format; + /** Log file */ + log_file_t log; /** The fields involved in the log buffer flush @{ */ - size_t buf_next_to_write;/*!< first offset in the log buffer - where the byte content may not exist - written to file, e.g., the start - offset of a log record catenated - later; this is advanced when a flush - operation is completed to all the log - groups */ - lsn_t write_lsn; /*!< last written lsn */ - lsn_t current_flush_lsn;/*!< end lsn for the current running - write + flush operation */ - std::atomic pending_flushes; /*!< system calls in progress */ - std::atomic flushes; /*!< system calls counter */ - ulint n_log_ios; /*!< number of log i/os initiated thus far */ ulint n_log_ios_old; /*!< number of log i/o's at the @@ -597,46 +272,65 @@ public: /*!< this is the maximum allowed value for lsn - last_checkpoint_lsn when a new query step is started */ - ib_uint64_t next_checkpoint_no; - /*!< next checkpoint number */ /** latest completed checkpoint (protected by log_sys.mutex) */ Atomic_relaxed last_checkpoint_lsn; lsn_t next_checkpoint_lsn; /*!< next checkpoint lsn */ - ulint n_pending_checkpoint_writes; - /*!< number of currently pending - checkpoint writes */ + /** next checkpoint number (protected by mutex) */ + ulint next_checkpoint_no; + /** number of pending checkpoint writes */ + ulint n_pending_checkpoint_writes; /** buffer for checkpoint header */ byte *checkpoint_buf; /* @} */ -private: - bool 
m_initialised; -public: - /** - Constructor. + bool is_initialised() const noexcept { return max_buf_free != 0; } - Some members may require late initialisation, thus we just mark object as - uninitialised. Real initialisation happens in create(). - */ - log_t(): m_initialised(false) {} +#ifdef HAVE_PMEM + bool is_pmem() const noexcept { return !flush_buf; } +#else + static constexpr bool is_pmem() { return false; } +#endif - /** @return whether the redo log is encrypted */ - bool is_encrypted() const { return(log.is_encrypted()); } - /** @return whether the redo log is in the physical format */ - bool is_physical() const { return log.is_physical(); } + bool is_opened() const noexcept { return log.is_opened(); } - bool is_initialised() const { return m_initialised; } + void attach(log_file_t file, os_offset_t size); + + void close_file(); lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const { return lsn.load(order); } void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); } - lsn_t get_flushed_lsn() const - { return flushed_to_disk_lsn.load(std::memory_order_acquire); } - void set_flushed_lsn(lsn_t lsn) - { flushed_to_disk_lsn.store(lsn, std::memory_order_release); } + lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) + const noexcept + { return flushed_to_disk_lsn.load(order); } + + /** Initialize the LSN on initial log file creation. */ + lsn_t init_lsn() noexcept + { + mysql_mutex_lock(&mutex); + const lsn_t lsn{get_lsn()}; + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; + mysql_mutex_unlock(&mutex); + return lsn; + } + + void set_recovered_lsn(lsn_t lsn) noexcept + { + mysql_mutex_assert_owner(&mutex); + write_lsn= lsn; + this->lsn.store(lsn, std::memory_order_relaxed); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + } + +#ifdef HAVE_PMEM + /** Persist the log. 
+ @param lsn desired new value of flushed_to_disk_lsn */ + inline void persist(lsn_t lsn) noexcept; +#endif bool check_flush_or_checkpoint() const { @@ -646,96 +340,93 @@ public: void set_check_flush_or_checkpoint(bool flag= true) { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); } - bool has_encryption_key_rotation() const { - return log.format == FORMAT_ENC_10_4 || log.format == FORMAT_ENC_10_5; - } - - /** @return the log block header + trailer size */ - unsigned framing_size() const - { - return has_encryption_key_rotation() - ? LOG_BLOCK_HDR_SIZE + LOG_BLOCK_KEY + LOG_BLOCK_CHECKSUM - : LOG_BLOCK_HDR_SIZE + LOG_BLOCK_CHECKSUM; - } - /** @return the log block payload size */ - unsigned payload_size() const - { - return has_encryption_key_rotation() - ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM - - LOG_BLOCK_KEY - : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM; - } - /** @return the log block trailer offset */ - unsigned trailer_offset() const - { - return has_encryption_key_rotation() - ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY - : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM; - } - - size_t get_pending_flushes() const - { - return pending_flushes.load(std::memory_order_relaxed); - } - - size_t get_flushes() const - { - return flushes.load(std::memory_order_relaxed); - } + /** Make previous write_buf() durable and update flushed_to_disk_lsn. */ + inline bool flush(lsn_t lsn) noexcept; /** Initialise the redo log subsystem. */ void create(); /** Shut down the redo log subsystem. */ void close(); + +#if defined __linux__ || defined _WIN32 + /** @return the physical block size of the storage */ + size_t get_block_size() const noexcept + { ut_ad(block_size); return block_size; } + /** Set the log block size for file I/O. 
*/ + void set_block_size(uint32_t size) noexcept { block_size= size; } +#else + /** @return the physical block size of the storage */ + static size_t get_block_size() { return 512; } +#endif + + /** Reserve space in the log buffer for appending data. + @param size upper limit of the length of the data to append(), in bytes + @return the current LSN */ + inline lsn_t append_prepare(size_t size) noexcept; + + /** Append a string of bytes to the redo log. + @param s string of bytes + @param size length of s, in bytes */ + void append(const void *s, size_t size) noexcept + { + mysql_mutex_assert_owner(&mutex); + ut_ad(buf_free + size <= (is_pmem() ? file_size : buf_size)); + memcpy(buf + buf_free, s, size); + buf_free+= size; + } + + /** Set the log file format. */ + void set_latest_format(bool encrypted) noexcept + { format= encrypted ? FORMAT_ENC_10_8 : FORMAT_10_8; } + /** @return whether the redo log is encrypted */ + bool is_encrypted() const noexcept { return format & FORMAT_ENCRYPTED; } + /** @return whether the redo log is in the latest format */ + bool is_latest() const noexcept + { return (~FORMAT_ENCRYPTED & format) == FORMAT_10_8; } + + /** @return capacity in bytes */ + lsn_t capacity() const noexcept { return file_size - START_OFFSET; } + + /** Set the LSN of the log file at file creation. */ + void set_first_lsn(lsn_t lsn) noexcept { write_lsn= first_lsn= lsn; } + /** @return the first LSN of the log file */ + lsn_t get_first_lsn() const noexcept { return first_lsn; } + + /** Determine the sequence bit at a log sequence number */ + byte get_sequence_bit(lsn_t lsn) const noexcept + { + ut_ad(lsn >= first_lsn); + return !(((lsn - first_lsn) / capacity()) & 1); + } + + /** Calculate the offset of a log sequence number. 
+ @param lsn log sequence number + @return byte offset within ib_logfile0 */ + lsn_t calc_lsn_offset(lsn_t lsn) const noexcept + { + ut_ad(lsn >= first_lsn); + return START_OFFSET + (lsn - first_lsn) % capacity(); + } + + /** Write checkpoint information to the log header and release mutex. + @param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ + inline void write_checkpoint(lsn_t end_lsn) noexcept; + + /** Write buf to ib_logfile0 and release mutex. + @return new write target + @retval 0 if everything was written */ + inline lsn_t write_buf() noexcept; + + /** Create the log. */ + void create(lsn_t lsn) noexcept; }; /** Redo log system */ extern log_t log_sys; -#ifdef UNIV_DEBUG -extern bool log_write_lock_own(); -#endif -/** Calculate the offset of a log sequence number. -@param[in] lsn log sequence number -@return offset within the log */ -inline lsn_t log_t::file::calc_lsn_offset(lsn_t lsn) const +inline void log_free_check() { - ut_ad(this == &log_sys.log); - /* The lsn parameters are updated while holding both the mutexes - and it is ok to have either of them while reading */ -#ifdef SAFE_MUTEX - ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); -#endif /* SAFE_MUTEX */ - const lsn_t size = capacity(); - lsn_t l= lsn - this->lsn; - if (longlong(l) < 0) { - l = lsn_t(-longlong(l)) % size; - l = size - l; - } - - l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size); - l %= size; - return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE)); + if (log_sys.check_flush_or_checkpoint()) + log_check_margins(); } - -inline void log_t::file::set_lsn(lsn_t a_lsn) -{ -#ifdef SAFE_MUTEX - ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); -#endif /* SAFE_MUTEX */ - lsn= a_lsn; -} - -inline void log_t::file::set_lsn_offset(lsn_t a_lsn) -{ -#ifdef SAFE_MUTEX - ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own()); -#endif /* SAFE_MUTEX */ - ut_ad((lsn % OS_FILE_LOG_BLOCK_SIZE) == 
(a_lsn % OS_FILE_LOG_BLOCK_SIZE)); - lsn_offset= a_lsn; -} - -#include "log0log.ic" - -#endif diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic deleted file mode 100644 index 73434737925..00000000000 --- a/storage/innobase/include/log0log.ic +++ /dev/null @@ -1,311 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/log0log.ic -Database log - -Created 12/9/1995 Heikki Tuuri -*******************************************************/ - -#include "mach0data.h" -#include "assume_aligned.h" -#include "ut0crc32.h" - -extern ulong srv_log_buffer_size; - -/************************************************************//** -Gets a log block flush bit. 
-@return TRUE if this block was the first to be written in a log flush */ -UNIV_INLINE -ibool -log_block_get_flush_bit( -/*====================*/ - const byte* log_block) /*!< in: log block */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility"); - - return *log_block & 0x80; -} - -/************************************************************//** -Sets the log block flush bit. */ -UNIV_INLINE -void -log_block_set_flush_bit( -/*====================*/ - byte* log_block, /*!< in/out: log block */ - ibool val) /*!< in: value to set */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility"); - - if (val) - *log_block|= 0x80; - else - *log_block&= 0x7f; -} - -/************************************************************//** -Gets a log block number stored in the header. -@return log block number stored in the block header */ -UNIV_INLINE -ulint -log_block_get_hdr_no( -/*=================*/ - const byte* log_block) /*!< in: log block */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - return mach_read_from_4(my_assume_aligned<4>(log_block)) & - ~LOG_BLOCK_FLUSH_BIT_MASK; -} - -/************************************************************//** -Sets the log block number stored in the header; NOTE that this must be set -before the flush bit! */ -UNIV_INLINE -void -log_block_set_hdr_no( -/*=================*/ - byte* log_block, /*!< in/out: log block */ - ulint n) /*!< in: log block number: must be > 0 and - < LOG_BLOCK_FLUSH_BIT_MASK */ -{ - static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); - ut_ad(n > 0); - ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK); - - mach_write_to_4(my_assume_aligned<4>(log_block), n); -} - -/************************************************************//** -Gets a log block data length. 
-@return log block data length measured as a byte offset from the block start */ -UNIV_INLINE -ulint -log_block_get_data_len( -/*===================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_2(my_assume_aligned<2> - (log_block + LOG_BLOCK_HDR_DATA_LEN)); -} - -/************************************************************//** -Sets the log block data length. */ -UNIV_INLINE -void -log_block_set_data_len( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint len) /*!< in: data length */ -{ - mach_write_to_2(my_assume_aligned<2>(log_block + LOG_BLOCK_HDR_DATA_LEN), - len); -} - -/************************************************************//** -Gets a log block first mtr log record group offset. -@return first mtr log record group byte offset from the block start, 0 -if none */ -UNIV_INLINE -ulint -log_block_get_first_rec_group( -/*==========================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_2(my_assume_aligned<2> - (log_block + LOG_BLOCK_FIRST_REC_GROUP)); -} - -/************************************************************//** -Sets the log block first mtr log record group offset. */ -UNIV_INLINE -void -log_block_set_first_rec_group( -/*==========================*/ - byte* log_block, /*!< in/out: log block */ - ulint offset) /*!< in: offset, 0 if none */ -{ - mach_write_to_2(my_assume_aligned<2> - (log_block + LOG_BLOCK_FIRST_REC_GROUP), offset); -} - -/************************************************************//** -Gets a log block checkpoint number field (4 lowest bytes). -@return checkpoint no (4 lowest bytes) */ -UNIV_INLINE -ulint -log_block_get_checkpoint_no( -/*========================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_4(my_assume_aligned<4> - (log_block + LOG_BLOCK_CHECKPOINT_NO)); -} - -/************************************************************//** -Sets a log block checkpoint number field (4 lowest bytes). 
*/ -UNIV_INLINE -void -log_block_set_checkpoint_no( -/*========================*/ - byte* log_block, /*!< in/out: log block */ - ib_uint64_t no) /*!< in: checkpoint no */ -{ - mach_write_to_4(my_assume_aligned<4>(log_block + LOG_BLOCK_CHECKPOINT_NO), - static_cast(no)); -} - -/************************************************************//** -Converts a lsn to a log block number. -@return log block number, it is > 0 and <= 1G */ -UNIV_INLINE -ulint -log_block_convert_lsn_to_no( -/*========================*/ - lsn_t lsn) /*!< in: lsn of a byte within the block */ -{ - return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & - (DBUG_IF("innodb_small_log_block_no_limit") - ? 0xFUL : 0x3FFFFFFFUL)) + 1); -} - -/** Calculate the CRC-32C checksum of a log block. -@param[in] block log block -@return checksum */ -inline ulint log_block_calc_checksum_crc32(const byte* block) -{ - return ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM); -} - -/************************************************************//** -Gets a log block checksum field value. -@return checksum */ -UNIV_INLINE -ulint -log_block_get_checksum( -/*===================*/ - const byte* log_block) /*!< in: log block */ -{ - return mach_read_from_4(my_assume_aligned<4> - (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM + - log_block)); -} - -/************************************************************//** -Sets a log block checksum field value. */ -UNIV_INLINE -void -log_block_set_checksum( -/*===================*/ - byte* log_block, /*!< in/out: log block */ - ulint checksum) /*!< in: checksum */ -{ - mach_write_to_4(my_assume_aligned<4> - (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM + - log_block), checksum); -} - -/************************************************************//** -Initializes a log block in the log buffer. 
*/ -UNIV_INLINE -void -log_block_init( -/*===========*/ - byte* log_block, /*!< in: pointer to the log buffer */ - lsn_t lsn) /*!< in: lsn within the log block */ -{ - ulint no; - - no = log_block_convert_lsn_to_no(lsn); - - log_block_set_hdr_no(log_block, no); - - log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); - log_block_set_first_rec_group(log_block, 0); -} - -/** Append a string to the log. -@param[in] str string -@param[in] len string length -@param[out] start_lsn start LSN of the log record -@return end lsn of the log record, zero if did not succeed */ -UNIV_INLINE -lsn_t -log_reserve_and_write_fast( - const void* str, - ulint len, - lsn_t* start_lsn) -{ - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(len > 0); - - const ulint data_len = len - + log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; - - if (data_len >= log_sys.trailer_offset()) { - - /* The string does not fit within the current log block - or the log block would become full */ - - return(0); - } - - lsn_t lsn = log_sys.get_lsn(); - *start_lsn = lsn; - - memcpy(log_sys.buf + log_sys.buf_free, str, len); - - log_block_set_data_len( - reinterpret_cast(ut_align_down( - log_sys.buf + log_sys.buf_free, - OS_FILE_LOG_BLOCK_SIZE)), - data_len); - - log_sys.buf_free += len; - - ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size}); - - lsn += len; - log_sys.set_lsn(lsn); - - return lsn; -} - -/***********************************************************************//** -Checks if there is need for a log buffer flush or a new checkpoint, and does -this if yes. Any database operation should call this when it has modified -more than about 4 pages. NOTE that this function may only be called when the -OS thread owns no synchronization objects except dict_sys.latch. */ -UNIV_INLINE -void -log_free_check(void) -/*================*/ -{ - /* During row_log_table_apply(), this function will be called while we - are holding some latches. This is OK, as long as we are not holding - any latches on buffer blocks. 
*/ - - if (log_sys.check_flush_or_checkpoint()) { - - log_check_margins(); - } -} diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 04b59f33f4e..1c56a192595 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -37,13 +37,6 @@ Created 9/20/1997 Heikki Tuuri /** @return whether recovery is currently running. */ #define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on) -/** Find the latest checkpoint in the log header. -@param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 -@return error code or DB_SUCCESS */ -dberr_t -recv_find_max_checkpoint(ulint* max_field) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - /** Apply any buffered redo log to a page that was just read from a data file. @param[in,out] space tablespace @param[in,out] bpage buffer pool page */ @@ -51,12 +44,9 @@ ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage) MY_ATTRIBUTE((nonnull)); /** Start recovering from a redo log checkpoint. -@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first system tablespace page @return error code or DB_SUCCESS */ -dberr_t -recv_recovery_from_checkpoint_start( - lsn_t flush_lsn); +dberr_t recv_recovery_from_checkpoint_start(); /** Whether to store redo log records in recv_sys.pages */ enum store_t { @@ -69,17 +59,6 @@ enum store_t { }; -/** Adds data from a new log block to the parsing buffer of recv_sys if -recv_sys.parse_start_lsn is non-zero. 
-@param[in] log_block log block to add -@param[in] scanned_lsn lsn of how far we were able to find - data in this log block -@return true if more data added */ -bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn); - -/** Moves the parsing buffer data left to the buffer start */ -void recv_sys_justify_left_parsing_buf(); - /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier @param[in] create whether the file is being created @@ -223,35 +202,24 @@ private: during log scan or apply */ bool found_corrupt_fs; public: + /** @return maximum guaranteed size of a mini-transaction on recovery */ + static constexpr size_t MTR_SIZE_MAX{1U << 20}; + /** whether we are applying redo log records during crash recovery */ bool recovery_on; /** whether recv_recover_page(), invoked from buf_page_t::read_complete(), should apply log records*/ bool apply_log_recs; - byte* buf; /*!< buffer for parsing log records */ - ulint len; /*!< amount of data in buf */ - lsn_t parse_start_lsn; - /*!< this is the lsn from which we were able to - start parsing log records and adding them to - pages; zero if a suitable - start point not found yet */ - lsn_t scanned_lsn; - /*!< the log data has been scanned up to this - lsn */ - ulint scanned_checkpoint_no; - /*!< the log data has been scanned up to this - checkpoint number (lowest 4 bytes) */ - ulint recovered_offset; - /*!< start offset of non-parsed log records in - buf */ - lsn_t recovered_lsn; - /*!< the log records have been parsed up to - this lsn */ - lsn_t mlog_checkpoint_lsn; - /*!< the LSN of a FILE_CHECKPOINT - record, or 0 if none was parsed */ - /** the time when progress was last reported */ - time_t progress_time; + /** number of bytes in log_sys.buf */ + size_t len; + /** start offset of non-parsed log records in log_sys.buf */ + size_t offset; + /** log sequence number of the first non-parsed record */ + lsn_t lsn; + /** log sequence number at the 
end of the FILE_CHECKPOINT record, or 0 */ + lsn_t file_checkpoint; + /** the time when progress was last reported */ + time_t progress_time; using map = std::map, @@ -279,10 +247,10 @@ public: /** The contents of the doublewrite buffer */ recv_dblwr_t dblwr; - /** Last added LSN to pages. */ + /** Last added LSN to pages, before switching to STORE_NO */ lsn_t last_stored_lsn= 0; - void read(os_offset_t offset, span buf); + inline void read(os_offset_t offset, span buf); inline size_t files_size(); void close_files() { files.clear(); files.shrink_to_fit(); } @@ -305,17 +273,13 @@ private: from before MariaDB Server 10.5.1) */ std::vector files; - void open_log_files_if_needed(); - /** Base node of the redo block list. List elements are linked via buf_block_t::unzip_LRU. */ UT_LIST_BASE_NODE_T(buf_block_t) blocks; public: /** Check whether the number of read redo log blocks exceeds the maximum. - Store last_stored_lsn if the recovery is not in the last phase. - @param[in,out] store whether to store page operations @return whether the memory is exhausted */ - inline bool is_memory_exhausted(store_t *store); + inline bool is_memory_exhausted(); /** Apply buffered log to persistent data pages. @param last_batch whether it is possible to write more redo log */ void apply(bool last_batch); @@ -335,22 +299,42 @@ public: bool is_initialised() const { return last_stored_lsn != 0; } + /** Find the latest checkpoint. + @return error code or DB_SUCCESS */ + dberr_t find_checkpoint(); + /** Register a redo log snippet for a page. @param it page iterator @param start_lsn start LSN of the mini-transaction @param lsn @see mtr_t::commit_lsn() - @param l redo log snippet @see log_t::FORMAT_10_5 + @param l redo log snippet @param len length of l, in bytes */ inline void add(map::iterator it, lsn_t start_lsn, lsn_t lsn, const byte *l, size_t len); - /** Parse and register one mini-transaction in log_t::FORMAT_10_5. 
- @param checkpoint_lsn the log sequence number of the latest checkpoint - @param store whether to store the records - @param apply whether to apply file-level log records - @return whether FILE_CHECKPOINT record was seen the first time, - or corruption was noticed */ - bool parse(lsn_t checkpoint_lsn, store_t *store, bool apply); + enum parse_mtr_result { OK, PREMATURE_EOF, GOT_EOF }; + +private: + /** Parse and register one log_t::FORMAT_10_8 mini-transaction. + @param store whether to store the records + @param l log data source */ + template<typename source> + inline parse_mtr_result parse(store_t store, source& l) noexcept; +public: + /** Parse and register one log_t::FORMAT_10_8 mini-transaction, + handling log_sys.is_pmem() buffer wrap-around. + @param store whether to store the records */ + static parse_mtr_result parse_mtr(store_t store) noexcept; + + /** Parse and register one log_t::FORMAT_10_8 mini-transaction, + handling log_sys.is_pmem() buffer wrap-around. + @param store whether to store the records */ + static parse_mtr_result parse_pmem(store_t store) noexcept +#ifdef HAVE_PMEM + ; +#else + { return parse_mtr(store); } +#endif /** Clear a fully processed set of stored redo log records. */ inline void clear(); @@ -441,11 +425,3 @@ extern bool recv_no_log_write; number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by recv_recovery_from_checkpoint_start(). */ extern bool recv_lsn_checks_on; - -/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many -times! 
*/ -#define RECV_PARSING_BUF_SIZE (2U << 20) - -/** Size of block reads when the log groups are scanned forward to do a -roll-forward */ -#define RECV_SCAN_SIZE (4U << srv_page_size_shift) diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index 82fbca10721..c955500ff7a 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2019, 2021, MariaDB Corporation. +Copyright (c) 2019, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -50,7 +50,8 @@ inline uint8_t mlog_decode_varint_length(byte first) @param log redo log record buffer @return the decoded integer @retval MLOG_DECODE_ERROR on error */ -inline uint32_t mlog_decode_varint(const byte* log) +template +inline uint32_t mlog_decode_varint(const byte_pointer log) { uint32_t i= *log; if (i < MIN_2BYTE) diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 0b04c0729eb..b79cf7e91c7 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -109,8 +109,9 @@ struct mtr_t { FILE_MODIFY records and an optional FILE_CHECKPOINT marker. The caller must hold log_sys.mutex. This is to be used at log_checkpoint(). - @param checkpoint_lsn the log sequence number of a checkpoint, or 0 */ - void commit_files(lsn_t checkpoint_lsn= 0); + @param checkpoint_lsn the log sequence number of a checkpoint, or 0 + @return current LSN */ + lsn_t commit_files(lsn_t checkpoint_lsn= 0); /** @return mini-transaction savepoint (current size of m_memo) */ ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); } @@ -625,12 +626,19 @@ private: /** Prepare to write the mini-transaction log to the redo log buffer. 
@return number of bytes to write in finish_write() */ - inline ulint prepare_write(); + inline size_t prepare_write(); + + /** Write a FILE_MODIFY record when a non-predefined persistent + tablespace was modified for the first time since fil_names_clear(). */ + ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void name_write(); + + /** Encrypt the log */ + ATTRIBUTE_NOINLINE void encrypt(); /** Append the redo log records to the redo log buffer. @param len number of bytes to write @return {start_lsn,flush_ahead} */ - inline std::pair finish_write(ulint len); + std::pair finish_write(size_t len); /** Release the resources */ inline void release_resources(); @@ -676,6 +684,9 @@ private: /** whether the pages has been trimmed */ uint16_t m_trim_pages:1; + /** CRC-32C of m_log */ + uint32_t m_crc; + #ifdef UNIV_DEBUG /** Persistent user tablespace associated with the mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */ diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h index 9ee7810fa7b..3db88988924 100644 --- a/storage/innobase/include/mtr0types.h +++ b/storage/innobase/include/mtr0types.h @@ -47,17 +47,17 @@ enum mtr_log_t { /* A mini-transaction is a stream of records that is always terminated by -a NUL byte. The first byte of a mini-transaction record is never NUL, -but NUL bytes can occur within mini-transaction records. The first -bytes of each record will explicitly encode the length of the record. -NUL bytes also acts as padding in log blocks, that is, there can be -multiple sucessive NUL bytes between mini-transactions in a redo log -block. +a byte 0x00 or 0x01. The first byte of a mini-transaction record is +never one of these bytes, but these bytes can occur within mini-transaction +records. The first byte of the record would contain a record type, flags, and a part of length. The optional second byte of the record will contain more length. (Not needed for short records.) 
+For example, because the length of an INIT_PAGE record is 3 to 11 bytes, +the first byte will be 0x02 to 0x0a, indicating the number of subsequent bytes. + Bit 7 of the first byte of a redo log record is the same_page flag. If same_page=1, the record is referring to the same page as the previous record. Records that do not refer to data pages but to file @@ -186,8 +186,11 @@ A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23 0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78. -The end of the mini-transaction would be indicated by a NUL byte. -*/ +The end of the mini-transaction would be indicated by the end byte +0x00 or 0x01; @see log_sys.get_sequence_bit(). +If log_sys.is_encrypted(), that is followed by 8 bytes of nonce +(part of initialization vector). That will be followed by 4 bytes +of CRC-32C of the entire mini-transaction, excluding the end byte. */ /** Redo log record types. These bit patterns (3 bits) will be written to the redo log file, so the existing codes or their interpretation on @@ -297,14 +300,16 @@ enum mfile_type_t FILE_RENAME = 0xa0, /** Modify a file. Followed by tablespace ID and the file name. */ FILE_MODIFY = 0xb0, - /** End-of-checkpoint marker. Followed by 2 dummy bytes of page identifier, - 8 bytes of LSN, and padded with a NUL; @see SIZE_OF_FILE_CHECKPOINT. */ + /** End-of-checkpoint marker, at the end of a mini-transaction. + Followed by 2 NUL bytes of page identifier and 8 bytes of LSN; + @see SIZE_OF_FILE_CHECKPOINT. + When all bytes are NUL, this is a dummy padding record. */ FILE_CHECKPOINT = 0xf0 }; /** Size of a FILE_CHECKPOINT record, including the trailing byte to -terminate the mini-transaction. */ -constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1; +terminate the mini-transaction and the CRC-32C. 
*/ +constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1 + 4; #ifndef UNIV_INNOCHECKSUM /** Types for the mlock objects to store in the mtr_t::m_memo */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index ce26a0187a9..1407edbb806 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -107,15 +107,6 @@ struct pfs_os_file_t #endif }; -/** The next value should be smaller or equal to the smallest sector size used -on any disk. A log block is required to be a portion of disk which is written -so that if the start and the end of a block get written to disk, then the -whole block gets written. This should be true even in most cases of a crash: -if this fails for a log block, then it is equivalent to a media failure in the -log. */ - -#define OS_FILE_LOG_BLOCK_SIZE 512U - /** Options for os_file_create_func @{ */ enum os_file_create_t { OS_FILE_OPEN = 51, /*!< to open an existing file (if diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 32da0e4e2b4..a355c65fe6b 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -2,7 +2,7 @@ Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -271,9 +271,6 @@ enum monitor_id_t { MONITOR_OS_PENDING_READS, MONITOR_OS_PENDING_WRITES, MONITOR_OVLD_OS_LOG_WRITTEN, - MONITOR_OVLD_OS_LOG_FSYNC, - MONITOR_OVLD_OS_LOG_PENDING_FSYNC, - MONITOR_OVLD_OS_LOG_PENDING_WRITES, /* Transaction related counters */ MONITOR_MODULE_TRX, @@ -300,20 +297,17 @@ enum monitor_id_t { /* Recovery related counters */ MONITOR_MODULE_RECOVERY, - MONITOR_NUM_CHECKPOINT, + MONITOR_OVLD_CHECKPOINTS, MONITOR_OVLD_LSN_FLUSHDISK, MONITOR_OVLD_LSN_CHECKPOINT, MONITOR_OVLD_LSN_CURRENT, MONITOR_LSN_CHECKPOINT_AGE, MONITOR_OVLD_BUF_OLDEST_LSN, MONITOR_OVLD_MAX_AGE_ASYNC, - MONITOR_PENDING_LOG_FLUSH, - MONITOR_PENDING_CHECKPOINT_WRITE, MONITOR_LOG_IO, MONITOR_OVLD_LOG_WAITS, MONITOR_OVLD_LOG_WRITE_REQUEST, MONITOR_OVLD_LOG_WRITES, - MONITOR_OVLD_LOG_PADDED, /* Page Manager related counters */ MONITOR_MODULE_PAGE, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 8cb834e1cc6..fde39d998b6 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -3,7 +3,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009, Google Inc. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -86,25 +86,6 @@ struct srv_stats_t /** Count the amount of data written in total (in bytes) */ ulint_ctr_1_t data_written; - /** Number of the log write requests done */ - ulint_ctr_1_t log_write_requests; - - /** Number of physical writes to the log performed */ - ulint_ctr_1_t log_writes; - - /** Amount of data padded for log write ahead */ - ulint_ctr_1_t log_padded; - - /** Amount of data written to the log files in bytes */ - lsn_ctr_1_t os_log_written; - - /** Number of writes being done to the log files */ - ulint_ctr_1_t os_log_pending_writes; - - /** We increase this counter, when we don't have enough - space in the log buffer and have to flush it */ - ulint_ctr_1_t log_waits; - /** Store the number of write requests issued */ ulint_ctr_1_t buf_pool_write_requests; @@ -293,10 +274,8 @@ extern char* srv_log_group_home_dir; /** The InnoDB redo log file size, or 0 when changing the redo log format at startup (while disallowing writes to the redo log). 
*/ extern ulonglong srv_log_file_size; -extern ulong srv_log_buffer_size; extern ulong srv_flush_log_at_trx_commit; extern uint srv_flush_log_at_timeout; -extern ulong srv_log_write_ahead_size; extern my_bool srv_adaptive_flushing; extern my_bool srv_flush_sync; @@ -473,9 +452,13 @@ extern my_bool srv_print_all_deadlocks; extern my_bool srv_cmp_per_index_enabled; +/** innodb_encrypt_log */ +extern my_bool srv_encrypt_log; + /* is encryption enabled */ extern ulong srv_encrypt_tables; + /** Status variables to be passed to MySQL */ extern struct export_var_t export_vars; @@ -710,8 +693,6 @@ struct export_var_t{ ulint innodb_checkpoint_max_age; ulint innodb_data_pending_reads; /*!< Pending reads */ ulint innodb_data_pending_writes; /*!< Pending writes */ - ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */ - ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */ ulint innodb_data_read; /*!< Data bytes read */ ulint innodb_data_writes; /*!< I/O write requests */ ulint innodb_data_written; /*!< Data bytes written */ @@ -720,9 +701,6 @@ struct export_var_t{ ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */ ulint innodb_deadlocks; ulint innodb_history_list_length; - ulint innodb_log_waits; /*!< srv_log_waits */ - ulint innodb_log_write_requests; /*!< srv_log_write_requests */ - ulint innodb_log_writes; /*!< srv_log_writes */ lsn_t innodb_lsn_current; lsn_t innodb_lsn_flushed; lsn_t innodb_lsn_last_checkpoint; @@ -731,10 +709,8 @@ struct export_var_t{ ulint innodb_mem_adaptive_hash; #endif ulint innodb_mem_dictionary; - lsn_t innodb_os_log_written; /*!< srv_os_log_written */ - ulint innodb_os_log_fsyncs; /*!< n_log_flushes */ - ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */ - ulint innodb_os_log_pending_fsyncs; /*!< n_pending_log_flushes */ + /** log_sys.get_lsn() - recv_sys.lsn */ + lsn_t innodb_os_log_written; ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ ulint innodb_row_lock_current_waits; /*!< 
srv_n_lock_wait_current_count */ int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc index d035808c6b9..8a7714101ba 100644 --- a/storage/innobase/log/log0crypt.cc +++ b/storage/innobase/log/log0crypt.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. -Copyright (C) 2014, 2021, MariaDB Corporation. +Copyright (C) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -31,16 +31,14 @@ MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation. #include "log0crypt.h" #include "log0recv.h" // for recv_sys - -/** innodb_encrypt_log: whether to encrypt the redo log */ -my_bool srv_encrypt_log; +#include "mach0data.h" /** Redo log encryption key ID */ #define LOG_DEFAULT_ENCRYPTION_KEY 1 struct crypt_info_t { - ulint checkpoint_no; /*!< checkpoint no; 32 bits */ - uint key_version; /*!< mysqld key version */ + uint32_t checkpoint_no; /*!< checkpoint no; 32 bits */ + uint32_t key_version; /*!< key version */ /** random string for encrypting the key */ alignas(8) byte crypt_msg[MY_AES_BLOCK_SIZE]; /** the secret key */ @@ -60,6 +58,40 @@ static crypt_info_t infos[5 * 2]; /** First unused slot in infos[] */ static size_t infos_used; +/* Offsets of a log block header */ +#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and + is allowed to wrap around at 2G; the + highest bit is set to 1 if this is the + first log block in a log flush write + segment */ +#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL + /* mask used to get the highest bit in + the preceding field */ +#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to + this block */ +#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an + 
mtr log record group in this log block, + 0 if none; if the value is the same + as LOG_BLOCK_HDR_DATA_LEN, it means + that the first rec group has not yet + been catenated to this log block, but + if it will, it will start at this + offset; an archive recovery can + start parsing the log records starting + from this offset in this log block, + if value not 0 */ +#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in + bytes */ + +#define LOG_BLOCK_KEY 4 /* encryption key version + before LOG_BLOCK_CHECKSUM; + after log_t::FORMAT_ENC_10_4 only */ +#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block + contents; in InnoDB versions + < 3.23.52 this did not contain the + checksum but the same value as + LOG_BLOCK_HDR_NO */ + /*********************************************************************//** Get a log block's start lsn. @return a log block's start lsn */ @@ -123,26 +155,36 @@ static bool init_crypt_key(crypt_info_t* info, bool upgrade = false) return true; } -/** Encrypt or decrypt log blocks. -@param[in,out] buf log blocks to encrypt or decrypt +static ulint log_block_get_hdr_no(const byte *log_block) +{ + static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility"); + return mach_read_from_4(my_assume_aligned<4>(log_block)) & + ~LOG_BLOCK_FLUSH_BIT_MASK; +} + +/** Decrypt log blocks. 
+@param[in,out] buf log blocks to decrypt @param[in] lsn log sequence number of the start of the buffer @param[in] size size of the buffer, in bytes -@param[in] op whether to decrypt, encrypt, or rotate key and encrypt -@return whether the operation succeeded (encrypt always does) */ -bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op) +@return whether the operation succeeded */ +ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size) { - ut_ad(size % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_ad(ulint(buf) % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(!(size & 511)); + ut_ad(!(ulint(buf) & 511)); ut_a(info.key_version); alignas(8) byte aes_ctr_iv[MY_AES_BLOCK_SIZE]; #define LOG_CRYPT_HDR_SIZE 4 - lsn &= ~lsn_t(OS_FILE_LOG_BLOCK_SIZE - 1); + lsn &= ~lsn_t{511}; + + const bool has_encryption_key_rotation + = log_sys.format == log_t::FORMAT_ENC_10_4 + || log_sys.format == log_t::FORMAT_ENC_10_5; for (const byte* const end = buf + size; buf != end; - buf += OS_FILE_LOG_BLOCK_SIZE, lsn += OS_FILE_LOG_BLOCK_SIZE) { - alignas(4) byte dst[OS_FILE_LOG_BLOCK_SIZE - LOG_CRYPT_HDR_SIZE + buf += 512, lsn += 512) { + alignas(4) byte dst[512 - LOG_CRYPT_HDR_SIZE - LOG_BLOCK_CHECKSUM]; /* The log block number is not encrypted. */ @@ -156,45 +198,28 @@ bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op) ut_ad(log_block_get_start_lsn(lsn, log_block_get_hdr_no(buf)) == lsn); - byte* key_ver = &buf[OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_KEY - - LOG_BLOCK_CHECKSUM]; - const size_t dst_size - = log_sys.has_encryption_key_rotation() + byte* key_ver = &buf[512 - LOG_BLOCK_KEY - LOG_BLOCK_CHECKSUM]; + + const size_t dst_size = has_encryption_key_rotation ? 
sizeof dst - LOG_BLOCK_KEY : sizeof dst; - if (log_sys.has_encryption_key_rotation()) { - const uint key_version = info.key_version; - switch (op) { - case LOG_ENCRYPT_ROTATE_KEY: - info.key_version - = encryption_key_get_latest_version( - LOG_DEFAULT_ENCRYPTION_KEY); - if (key_version != info.key_version - && !init_crypt_key(&info)) { - info.key_version = key_version; - } - /* fall through */ - case LOG_ENCRYPT: - mach_write_to_4(key_ver, info.key_version); - break; - case LOG_DECRYPT: - info.key_version = mach_read_from_4(key_ver); - if (key_version != info.key_version - && !init_crypt_key(&info)) { - return false; - } - } + if (has_encryption_key_rotation) { + const auto key_version = info.key_version; + info.key_version = mach_read_from_4(key_ver); + if (key_version == info.key_version) { + } else if (!init_crypt_key(&info)) { + return false; #ifndef DBUG_OFF - if (key_version != info.key_version) { + } else { DBUG_PRINT("ib_log", ("key_version: %x -> %x", key_version, info.key_version)); - } #endif /* !DBUG_OFF */ + } } ut_ad(LOG_CRYPT_HDR_SIZE + dst_size - == log_sys.trailer_offset()); + == 512 - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY); uint dst_len; int rc = encryption_crypt( @@ -203,9 +228,7 @@ bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op) const_cast(info.crypt_key), MY_AES_BLOCK_SIZE, aes_ctr_iv, sizeof aes_ctr_iv, - op == LOG_DECRYPT - ? ENCRYPTION_FLAG_DECRYPT | ENCRYPTION_FLAG_NOPAD - : ENCRYPTION_FLAG_ENCRYPT | ENCRYPTION_FLAG_NOPAD, + ENCRYPTION_FLAG_DECRYPT | ENCRYPTION_FLAG_NOPAD, LOG_DEFAULT_ENCRYPTION_KEY, info.key_version); ut_a(rc == MY_AES_OK); @@ -219,8 +242,8 @@ bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op) /** Initialize the redo log encryption key and random parameters when creating a new redo log. The random parameters will be persisted in the log checkpoint pages. 
-@see log_crypt_write_checkpoint_buf() -@see log_crypt_read_checkpoint_buf() +@see log_crypt_write_header() +@see log_crypt_read_header() @return whether the operation succeeded */ bool log_crypt_init() { @@ -287,8 +310,7 @@ next_slot: @return whether the decryption was successful */ ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn) { - const uint32_t checkpoint_no - = uint32_t(log_block_get_checkpoint_no(buf)); + const uint32_t checkpoint_no = mach_read_from_4(buf + 8); const crypt_info_t* info = infos; for (const crypt_info_t* const end = info + infos_used; info < end; info++) { @@ -309,16 +331,16 @@ ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn) return false; } found: - byte dst[OS_FILE_LOG_BLOCK_SIZE]; + byte dst[512]; uint dst_len; byte aes_ctr_iv[MY_AES_BLOCK_SIZE]; - const uint src_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE; + const uint src_len = 512 - LOG_BLOCK_HDR_SIZE; ulint log_block_no = log_block_get_hdr_no(buf); /* The log block header is not encrypted. */ - memcpy(dst, buf, LOG_BLOCK_HDR_SIZE); + memcpy(dst, buf, 512); memcpy(aes_ctr_iv, info->crypt_nonce, 3); mach_write_to_8(aes_ctr_iv + 3, @@ -345,30 +367,47 @@ found: return true; } -/** Add the encryption information to a redo log checkpoint buffer. 
-@param[in,out] buf checkpoint buffer */ -void log_crypt_write_checkpoint_buf(byte *buf) -{ - ut_ad(info.key_version); - compile_time_assert(16 == sizeof info.crypt_msg); - compile_time_assert(16 == MY_AES_BLOCK_SIZE); - compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE - - LOG_CHECKPOINT_CRYPT_NONCE - == sizeof info.crypt_nonce); +/** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/ +constexpr size_t LOG_CHECKPOINT_CRYPT_KEY= 32; +/** MariaDB 10.2.5 encrypted redo log random nonce (32 bits) */ +constexpr size_t LOG_CHECKPOINT_CRYPT_NONCE= 36; +/** MariaDB 10.2.5 encrypted redo log random message (MY_AES_BLOCK_SIZE) */ +constexpr size_t LOG_CHECKPOINT_CRYPT_MESSAGE= 40; - memcpy(buf + LOG_CHECKPOINT_CRYPT_MESSAGE, info.crypt_msg, - MY_AES_BLOCK_SIZE); - memcpy(buf + LOG_CHECKPOINT_CRYPT_NONCE, info.crypt_nonce, - sizeof info.crypt_nonce); - mach_write_to_4(buf + LOG_CHECKPOINT_CRYPT_KEY, info.key_version); +/** Add the encryption information to the log header buffer. +@param buf part of log header buffer */ +void log_crypt_write_header(byte *buf) +{ + ut_ad(info.key_version); + mach_write_to_4(my_assume_aligned<4>(buf), LOG_DEFAULT_ENCRYPTION_KEY); + mach_write_to_4(my_assume_aligned<4>(buf + 4), info.key_version); + memcpy_aligned<8>(buf + 8, info.crypt_msg, MY_AES_BLOCK_SIZE); + static_assert(MY_AES_BLOCK_SIZE == 16, "compatibility"); + memcpy_aligned<4>(buf + 24, info.crypt_nonce, sizeof info.crypt_nonce); +} + +/** Read the encryption information from a log header buffer. 
+@param buf part of log header buffer +@return whether the operation was successful */ +bool log_crypt_read_header(const byte *buf) +{ + MEM_UNDEFINED(&info.checkpoint_no, sizeof info.checkpoint_no); + MEM_NOACCESS(&info.checkpoint_no, sizeof info.checkpoint_no); + if (mach_read_from_4(my_assume_aligned<4>(buf)) != + LOG_DEFAULT_ENCRYPTION_KEY) + return false; + info.key_version= mach_read_from_4(my_assume_aligned<4>(buf + 4)); + memcpy_aligned<8>(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE); + memcpy_aligned<4>(info.crypt_nonce, buf + 24, sizeof info.crypt_nonce); + return init_crypt_key(&info); } /** Read the checkpoint crypto (version, msg and iv) info. @param[in] buf checkpoint buffer @return whether the operation was successful */ -bool log_crypt_read_checkpoint_buf(const byte* buf) +ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf) { - info.checkpoint_no = mach_read_from_4(buf + (LOG_CHECKPOINT_NO + 4)); + info.checkpoint_no = mach_read_from_4(buf + 4); info.key_version = mach_read_from_4(buf + LOG_CHECKPOINT_CRYPT_KEY); #if MY_AES_BLOCK_SIZE != 16 @@ -423,3 +462,180 @@ bool log_tmp_block_encrypt( return rc == MY_AES_OK; } + +/** Decrypt part of a log record. 
+@param iv initialization vector +@param buf buffer for the decrypted data +@param data the encrypted data +@param len length of the data, in bytes +@return buf */ +byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len) +{ + ut_a(MY_AES_OK == encryption_crypt(data, len, buf, &len, + info.crypt_key, MY_AES_BLOCK_SIZE, + iv, MY_AES_BLOCK_SIZE, + ENCRYPTION_FLAG_DECRYPT | + ENCRYPTION_FLAG_NOPAD, + LOG_DEFAULT_ENCRYPTION_KEY, + info.key_version)); + return buf; +} + +#include "mtr0log.h" + +/** Encrypt a log snippet +@param iv initialization vector +@param tmp temporary buffer +@param buf buffer to be replaced with encrypted contents +@param end pointer past the end of buf +@return encrypted data bytes that follow */ +static size_t log_encrypt_buf(byte iv[MY_AES_BLOCK_SIZE], + byte *&tmp, byte *buf, const byte *const end) +{ + for (byte *l= buf; l != end; ) + { + const byte b= *l++; + size_t rlen= b & 0xf; + if (!rlen) + { + const size_t lenlen= mlog_decode_varint_length(*l); + const uint32_t addlen= mlog_decode_varint(l); + ut_ad(addlen != MLOG_DECODE_ERROR); + rlen= addlen + 15 - lenlen; + l+= lenlen; + } + + if (b < 0x80) + { + /* Add the page identifier to the initialization vector. */ + size_t idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen < rlen); + mach_write_to_4(my_assume_aligned<4>(iv + 8), mlog_decode_varint(l)); + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen <= rlen); + mach_write_to_4(my_assume_aligned<4>(iv + 12), mlog_decode_varint(l)); + l+= idlen; + rlen-= idlen; + } + + uint len; + + if (l + rlen > end) + { + if (size_t len= end - l) + { + /* Only WRITE or EXTENDED records may comprise multiple segments. 
*/ + static_assert((EXTENDED | 0x10) == WRITE, "compatibility"); + ut_ad((b & 0x60) == EXTENDED); + ut_ad(l < end); + memcpy(tmp, l, len); + tmp+= len; + rlen-= len; + } + return rlen; + } + + if (!rlen) + continue; /* FREE_PAGE and INIT_PAGE have no payload. */ + + len= static_cast(rlen); + ut_a(MY_AES_OK == encryption_crypt(l, len, tmp, &len, + info.crypt_key, MY_AES_BLOCK_SIZE, + iv, MY_AES_BLOCK_SIZE, + ENCRYPTION_FLAG_ENCRYPT | + ENCRYPTION_FLAG_NOPAD, + LOG_DEFAULT_ENCRYPTION_KEY, + info.key_version)); + ut_ad(len == rlen); + memcpy(l, tmp, rlen); + l+= rlen; + } + + return 0; +} + +/** Encrypt the log */ +ATTRIBUTE_NOINLINE void mtr_t::encrypt() +{ + ut_ad(log_sys.format == log_t::FORMAT_ENC_10_8); + ut_ad(m_log.size()); + + alignas(8) byte iv[MY_AES_BLOCK_SIZE]; + + m_commit_lsn= log_sys.get_lsn(); + ut_ad(m_commit_lsn); + byte *tmp= static_cast(alloca(srv_page_size)), *t= tmp; + byte *dst= static_cast(alloca(srv_page_size)); + mach_write_to_8(iv, m_commit_lsn); + mtr_buf_t::block_t *start= nullptr; + size_t size= 0, start_size= 0; + m_crc= 0; + + m_log.for_each_block([&](mtr_buf_t::block_t *b) + { + ut_ad(t - tmp + size <= srv_page_size); + byte *buf= b->begin(); + if (!start) + { + parse: + ut_ad(t == tmp); + size= log_encrypt_buf(iv, t, buf, b->end()); + if (!size) + { + ut_ad(t == tmp); + start_size= 0; + } + else + { + start= b; + start_size= t - tmp; + } + m_crc= my_crc32c(m_crc, buf, b->end() - buf - start_size); + } + else if (size > b->used()) + { + ::memcpy(t, buf, b->used()); + t+= b->used(); + size-= b->used(); + } + else + { + ::memcpy(t, buf, size); + t+= size; + buf+= size; + uint len= static_cast(t - tmp); + ut_a(MY_AES_OK == encryption_crypt(tmp, len, dst, &len, + info.crypt_key, MY_AES_BLOCK_SIZE, + iv, MY_AES_BLOCK_SIZE, + ENCRYPTION_FLAG_ENCRYPT | + ENCRYPTION_FLAG_NOPAD, + LOG_DEFAULT_ENCRYPTION_KEY, + info.key_version)); + ut_ad(tmp + len == t); + m_crc= my_crc32c(m_crc, dst, len); + /* Copy the encrypted data back to the log snippets. 
*/ + ::memcpy(start->end() - start_size, dst, start_size); + t= dst + start_size; + for (ilist::iterator i(start); &*++i != b;) + { + const size_t l{i->used()}; + ::memcpy(i->begin(), t, l); + t+= l; + } + ::memcpy(b->begin(), t, size); + ut_ad(t + size == dst + len); + t= tmp; + start= nullptr; + goto parse; + } + return true; + }); + + ut_ad(t == tmp); + ut_ad(!start); + ut_ad(!size); +} diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 349ce4cd7d0..3b4af4f99b6 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Google Inc. -Copyright (c) 2014, 2021, MariaDB Corporation. +Copyright (c) 2014, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -52,6 +52,7 @@ Created 12/9/1995 Heikki Tuuri #include "srv0mon.h" #include "buf0dump.h" #include "log0sync.h" +#include "log.h" /* General philosophy of InnoDB redo-logs: @@ -60,62 +61,19 @@ Every change to a contents of a data page must be done through mtr_t, and mtr_t::commit() will write log records to the InnoDB redo log. */ +MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) +static group_commit_lock flush_lock; +MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) +static group_commit_lock write_lock; + /** Redo log system */ log_t log_sys; -/* A margin for free space in the log buffer before a log entry is catenated */ -#define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE) - /* Margins for free space in the log buffer after a log entry is catenated */ #define LOG_BUF_FLUSH_RATIO 2 -#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN \ +#define LOG_BUF_FLUSH_MARGIN ((4 * 4096) /* cf. log_t::append_prepare() */ \ + (4U << srv_page_size_shift)) -/** Extends the log buffer. 
-@param[in] len requested minimum size in bytes */ -void log_buffer_extend(ulong len) -{ - const size_t new_buf_size = ut_calc_align(len, srv_page_size); - byte* new_buf = static_cast - (ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME)); - byte* new_flush_buf = static_cast - (ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME)); - - mysql_mutex_lock(&log_sys.mutex); - - if (len <= srv_log_buffer_size) { - /* Already extended enough by the others */ - mysql_mutex_unlock(&log_sys.mutex); - ut_free_dodump(new_buf, new_buf_size); - ut_free_dodump(new_flush_buf, new_buf_size); - return; - } - - ib::warn() << "The redo log transaction size " << len << - " exceeds innodb_log_buffer_size=" - << srv_log_buffer_size << " / 2). Trying to extend it."; - - byte* old_buf = log_sys.buf; - byte* old_flush_buf = log_sys.flush_buf; - const ulong old_buf_size = srv_log_buffer_size; - srv_log_buffer_size = static_cast(new_buf_size); - log_sys.buf = new_buf; - log_sys.flush_buf = new_flush_buf; - memcpy_aligned(new_buf, old_buf, - log_sys.buf_free); - - log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO - - LOG_BUF_FLUSH_MARGIN; - - mysql_mutex_unlock(&log_sys.mutex); - - ut_free_dodump(old_buf, old_buf_size); - ut_free_dodump(old_flush_buf, old_buf_size); - - ib::info() << "innodb_log_buffer_size was extended to " - << new_buf_size << "."; -} - /** Calculate the recommended highest values for lsn - last_checkpoint_lsn and lsn - buf_pool.get_oldest_modification(). 
@param[in] file_size requested innodb_log_file_size @@ -125,6 +83,8 @@ accommodate the number of OS threads in the database server */ bool log_set_capacity(ulonglong file_size) { + mysql_mutex_assert_owner(&log_sys.mutex); + /* Margin for the free space in the smallest log, before a new query step which modifies the database, is started */ const size_t LOG_CHECKPOINT_FREE_PER_THREAD = 4U @@ -134,7 +94,7 @@ log_set_capacity(ulonglong file_size) lsn_t margin; ulint free; - lsn_t smallest_capacity = file_size - LOG_FILE_HDR_SIZE; + lsn_t smallest_capacity = file_size - log_t::START_OFFSET; /* Add extra safety */ smallest_capacity -= smallest_capacity / 10; @@ -146,23 +106,19 @@ log_set_capacity(ulonglong file_size) free = LOG_CHECKPOINT_FREE_PER_THREAD * 10 + LOG_CHECKPOINT_EXTRA_FREE; if (free >= smallest_capacity / 2) { - ib::error() << "innodb_log_file_size is too small. " - << INNODB_PARAMETERS_MSG; + sql_print_error("InnoDB: innodb_log_file_size is too small." + " %s", INNODB_PARAMETERS_MSG); return false; } margin = smallest_capacity - free; margin = margin - margin / 10; /* Add still some extra safety */ - mysql_mutex_lock(&log_sys.mutex); - log_sys.log_capacity = smallest_capacity; log_sys.max_modified_age_async = margin - margin / 8; log_sys.max_checkpoint_age = margin; - mysql_mutex_unlock(&log_sys.mutex); - return(true); } @@ -171,7 +127,6 @@ void log_t::create() { ut_ad(this == &log_sys); ut_ad(!is_initialised()); - m_initialised= true; #if defined(__aarch64__) mysql_mutex_init(log_sys_mutex_key, &mutex, MY_MUTEX_INIT_FAST); @@ -182,699 +137,593 @@ void log_t::create() mysql_mutex_init(log_flush_order_mutex_key, &flush_order_mutex, nullptr); #endif - /* Start the lsn from one log block from zero: this way every - log record has a non-zero start lsn, a fact which we will use */ + /* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */ + lsn.store(FIRST_LSN, std::memory_order_relaxed); + flushed_to_disk_lsn.store(FIRST_LSN, 
std::memory_order_relaxed); + write_lsn= FIRST_LSN; - set_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE); - set_flushed_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE); - - ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE); - ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift); - - buf= static_cast(ut_malloc_dontdump(srv_log_buffer_size, - PSI_INSTRUMENT_ME)); - TRASH_ALLOC(buf, srv_log_buffer_size); - flush_buf= static_cast(ut_malloc_dontdump(srv_log_buffer_size, +#ifndef HAVE_PMEM + buf= static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); + TRASH_ALLOC(buf, buf_size); + flush_buf= static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); - TRASH_ALLOC(flush_buf, srv_log_buffer_size); + TRASH_ALLOC(flush_buf, buf_size); + checkpoint_buf= static_cast(aligned_malloc(4096, 4096)); + memset_aligned<4096>(checkpoint_buf, 0, 4096); +#else + ut_ad(!checkpoint_buf); + ut_ad(!buf); + ut_ad(!flush_buf); +#endif - max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO - - LOG_BUF_FLUSH_MARGIN; + max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; set_check_flush_or_checkpoint(); n_log_ios_old= n_log_ios; last_printout_time= time(NULL); - buf_next_to_write= 0; - last_checkpoint_lsn= write_lsn= LOG_START_LSN; + last_checkpoint_lsn= FIRST_LSN; n_log_ios= 0; n_log_ios_old= 0; log_capacity= 0; max_modified_age_async= 0; max_checkpoint_age= 0; - next_checkpoint_no= 0; next_checkpoint_lsn= 0; n_pending_checkpoint_writes= 0; - log_block_init(buf, LOG_START_LSN); - log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE); + buf_free= 0; - buf_free= LOG_BLOCK_HDR_SIZE; - checkpoint_buf= static_cast - (aligned_malloc(OS_FILE_LOG_BLOCK_SIZE, OS_FILE_LOG_BLOCK_SIZE)); -} - -file_os_io::file_os_io(file_os_io &&rhs) : m_fd(rhs.m_fd) -{ - rhs.m_fd= OS_FILE_CLOSED; -} - -file_os_io &file_os_io::operator=(file_os_io &&rhs) -{ - std::swap(m_fd, rhs.m_fd); - return *this; -} - -file_os_io::~file_os_io() noexcept -{ - if (is_opened()) - close(); -} - -dberr_t 
file_os_io::open(const char *path, bool read_only) noexcept -{ - ut_ad(!is_opened()); - - bool success; - auto tmp_fd= os_file_create( - innodb_log_file_key, path, OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, - OS_FILE_NORMAL, OS_LOG_FILE, read_only, &success); - if (!success) - return DB_ERROR; - - m_durable_writes= srv_file_flush_method == SRV_O_DSYNC; - m_fd= tmp_fd; - return success ? DB_SUCCESS : DB_ERROR; -} - -dberr_t file_os_io::rename(const char *old_path, const char *new_path) noexcept -{ - return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS - : DB_ERROR; -} - -dberr_t file_os_io::close() noexcept -{ - if (!os_file_close(m_fd)) - return DB_ERROR; - - m_fd= OS_FILE_CLOSED; - return DB_SUCCESS; -} - -dberr_t file_os_io::read(os_offset_t offset, span buf) noexcept -{ - return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size()); -} - -dberr_t file_os_io::write(const char *path, os_offset_t offset, - span buf) noexcept -{ - return os_file_write(IORequestWrite, path, m_fd, buf.data(), offset, - buf.size()); -} - -dberr_t file_os_io::flush() noexcept -{ - return os_file_flush(m_fd) ? DB_SUCCESS : DB_ERROR; -} - -#ifdef HAVE_PMEM - -#include - -/** Memory mapped file */ -class mapped_file_t -{ -public: - mapped_file_t()= default; - mapped_file_t(const mapped_file_t &)= delete; - mapped_file_t &operator=(const mapped_file_t &)= delete; - mapped_file_t(mapped_file_t &&)= delete; - mapped_file_t &operator=(mapped_file_t &&)= delete; - ~mapped_file_t() noexcept; - - dberr_t map(const char *path, bool read_only= false, - bool nvme= false) noexcept; - dberr_t unmap() noexcept; - byte *data() noexcept { return m_area.data(); } - -private: - span m_area; -}; - -mapped_file_t::~mapped_file_t() noexcept -{ - if (!m_area.empty()) - unmap(); -} - -dberr_t mapped_file_t::map(const char *path, bool read_only, - bool nvme) noexcept -{ - auto fd= mysql_file_open(innodb_log_file_key, path, - read_only ? 
O_RDONLY : O_RDWR, MYF(MY_WME)); - if (fd == -1) - return DB_ERROR; - - const auto file_size= size_t{os_file_get_size(path).m_total_size}; - - const int nvme_flag= nvme ? MAP_SYNC : 0; - void *ptr= - my_mmap(0, file_size, read_only ? PROT_READ : PROT_READ | PROT_WRITE, - MAP_SHARED_VALIDATE | nvme_flag, fd, 0); - mysql_file_close(fd, MYF(MY_WME)); - - if (ptr == MAP_FAILED) - return DB_ERROR; - - m_area= {static_cast(ptr), file_size}; - return DB_SUCCESS; -} - -dberr_t mapped_file_t::unmap() noexcept -{ - ut_ad(!m_area.empty()); - - if (my_munmap(m_area.data(), m_area.size())) - return DB_ERROR; - - m_area= {}; - return DB_SUCCESS; -} - -static bool is_pmem(const char *path) noexcept -{ - mapped_file_t mf; - return mf.map(path, true, true) == DB_SUCCESS ? true : false; -} - -class file_pmem_io final : public file_io -{ -public: - file_pmem_io() noexcept : file_io(true) {} - - dberr_t open(const char *path, bool read_only) noexcept final - { - return m_file.map(path, read_only, true); - } - dberr_t rename(const char *old_path, const char *new_path) noexcept final - { - return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS - : DB_ERROR; - } - dberr_t close() noexcept final { return m_file.unmap(); } - dberr_t read(os_offset_t offset, span buf) noexcept final - { - memcpy(buf.data(), m_file.data() + offset, buf.size()); - return DB_SUCCESS; - } - dberr_t write(const char *, os_offset_t offset, - span buf) noexcept final - { - pmem_memcpy_persist(m_file.data() + offset, buf.data(), buf.size()); - return DB_SUCCESS; - } - dberr_t flush() noexcept final - { - ut_ad(0); - return DB_SUCCESS; - } - -private: - mapped_file_t m_file; -}; -#endif - -dberr_t log_file_t::open(bool read_only) noexcept -{ - ut_a(!is_opened()); - -#ifdef HAVE_PMEM - auto ptr= is_pmem(m_path.c_str()) - ? 
std::unique_ptr(new file_pmem_io) - : std::unique_ptr(new file_os_io); -#else - auto ptr= std::unique_ptr(new file_os_io); -#endif - - if (dberr_t err= ptr->open(m_path.c_str(), read_only)) - return err; - - m_file= std::move(ptr); - return DB_SUCCESS; -} - -bool log_file_t::is_opened() const noexcept -{ - return static_cast(m_file); -} - -dberr_t log_file_t::rename(std::string new_path) noexcept -{ - if (dberr_t err= m_file->rename(m_path.c_str(), new_path.c_str())) - return err; - - m_path = std::move(new_path); - return DB_SUCCESS; + ut_ad(is_initialised()); } dberr_t log_file_t::close() noexcept { ut_a(is_opened()); - if (dberr_t err= m_file->close()) - return err; + if (!os_file_close(m_file)) + return DB_ERROR; - m_file.reset(); + m_file= OS_FILE_CLOSED; return DB_SUCCESS; } dberr_t log_file_t::read(os_offset_t offset, span buf) noexcept { ut_ad(is_opened()); - return m_file->read(offset, buf); -} - -bool log_file_t::writes_are_durable() const noexcept -{ - return m_file->writes_are_durable(); + return os_file_read(IORequestRead, m_file, buf.data(), offset, buf.size()); } dberr_t log_file_t::write(os_offset_t offset, span buf) noexcept { ut_ad(is_opened()); - return m_file->write(m_path.c_str(), offset, buf); + return os_file_write(IORequestWrite, "ib_logfile0", m_file, + buf.data(), offset, buf.size()); } -dberr_t log_file_t::flush() noexcept +#ifdef HAVE_PMEM +# include +#endif + +void log_t::attach(log_file_t file, os_offset_t size) { - ut_ad(is_opened()); - return m_file->flush(); + log= file; + ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT); + file_size= size; + +#ifdef HAVE_PMEM + ut_ad(!buf); + ut_ad(!flush_buf); + if (size && !(size_t(size) & 4095)) + { + void *ptr= + my_mmap(0, size_t(size), + srv_read_only_mode ? 
PROT_READ : PROT_READ | PROT_WRITE, + MAP_SHARED_VALIDATE | MAP_SYNC, log.m_file, 0); +#ifdef __linux__ + if (ptr == MAP_FAILED) + { + struct stat st; + if (!fstat(log.m_file, &st)) + { + const auto st_dev= st.st_dev; + if (!stat("/dev/shm", &st) && st.st_dev == st_dev) + ptr= my_mmap(0, size_t(size), + srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE, + MAP_SHARED, log.m_file, 0); + } + } +#endif /* __linux__ */ + if (ptr != MAP_FAILED) + { + log.close(); + mprotect(ptr, size_t(size), PROT_READ); + buf= static_cast(ptr); +#if defined __linux__ || defined _WIN32 + set_block_size(CPU_LEVEL1_DCACHE_LINESIZE); +#endif + return; + } + } + buf= static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); + TRASH_ALLOC(buf, buf_size); + flush_buf= static_cast(ut_malloc_dontdump(buf_size, + PSI_INSTRUMENT_ME)); + TRASH_ALLOC(flush_buf, buf_size); +#endif + +#if defined __linux__ || defined _WIN32 + if (!block_size) + set_block_size(512); +# ifdef __linux__ + else if (srv_file_flush_method != SRV_O_DSYNC) + sql_print_information("InnoDB: Buffered log writes (block size=%u bytes)", + block_size); +#endif + else + sql_print_information("InnoDB: File system buffers for log" + " disabled (block size=%u bytes)", block_size); +#endif + +#ifdef HAVE_PMEM + checkpoint_buf= static_cast(aligned_malloc(block_size, block_size)); + memset_aligned<64>(checkpoint_buf, 0, block_size); +#endif } -void log_t::file::open_file(std::string path) +void log_t::create(lsn_t lsn) noexcept { - fd= log_file_t(std::move(path)); - if (const dberr_t err= fd.open(srv_read_only_mode)) - ib::fatal() << "open(" << fd.get_path() << ") returned " << err; -} - -/** Update the log block checksum. 
*/ -static void log_block_store_checksum(byte* block) -{ - log_block_set_checksum(block, log_block_calc_checksum_crc32(block)); -} - -void log_t::file::write_header_durable(lsn_t lsn) -{ - ut_ad(lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + mysql_mutex_assert_owner(&mutex); ut_ad(!recv_no_log_write); - ut_ad(log_sys.log.format == log_t::FORMAT_10_5 || - log_sys.log.format == log_t::FORMAT_ENC_10_5); + ut_ad(is_latest()); + ut_ad(this == &log_sys); - byte *buf= log_sys.checkpoint_buf; - memset_aligned(buf, 0, OS_FILE_LOG_BLOCK_SIZE); + this->lsn.store(lsn, std::memory_order_relaxed); + this->flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + first_lsn= lsn; + write_lsn= lsn; - mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format); - mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat); + last_checkpoint_lsn= 0; + +#ifdef HAVE_PMEM + if (is_pmem()) + { + mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); + memset_aligned<4096>(buf, 0, 4096); + buf_free= START_OFFSET; + } + else +#endif + { + buf_free= 0; + memset_aligned<4096>(flush_buf, 0, buf_size); + memset_aligned<4096>(buf, 0, buf_size); + } + + mach_write_to_4(buf + LOG_HEADER_FORMAT, FORMAT_10_8); mach_write_to_8(buf + LOG_HEADER_START_LSN, lsn); + static constexpr const char LOG_HEADER_CREATOR_CURRENT[]= + "MariaDB " + IB_TO_STR(MYSQL_VERSION_MAJOR) "." + IB_TO_STR(MYSQL_VERSION_MINOR) "." 
+ IB_TO_STR(MYSQL_VERSION_PATCH); + strcpy(reinterpret_cast(buf) + LOG_HEADER_CREATOR, LOG_HEADER_CREATOR_CURRENT); - ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR >= - sizeof LOG_HEADER_CREATOR_CURRENT); - log_block_store_checksum(buf); + static_assert(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR >= + sizeof LOG_HEADER_CREATOR_CURRENT, "compatibility"); + if (is_encrypted()) + log_crypt_write_header(buf + LOG_HEADER_CREATOR_END); + mach_write_to_4(my_assume_aligned<4>(508 + buf), my_crc32c(0, buf, 508)); - DBUG_PRINT("ib_log", ("write " LSN_PF, lsn)); + DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn)); - log_sys.log.write(0, {buf, OS_FILE_LOG_BLOCK_SIZE}); - if (!log_sys.log.writes_are_durable()) - log_sys.log.flush(); -} - -void log_t::file::read(os_offset_t offset, span buf) -{ - if (const dberr_t err= fd.read(offset, buf)) - ib::fatal() << "read(" << fd.get_path() << ") returned "<< err; -} - -bool log_t::file::writes_are_durable() const noexcept -{ - return fd.writes_are_durable(); -} - -void log_t::file::write(os_offset_t offset, span buf) -{ - srv_stats.os_log_pending_writes.inc(); - if (const dberr_t err= fd.write(offset, buf)) - ib::fatal() << "write(" << fd.get_path() << ") returned " << err; - srv_stats.os_log_pending_writes.dec(); - srv_stats.os_log_written.add(buf.size()); - srv_stats.log_writes.inc(); - log_sys.n_log_ios++; -} - -void log_t::file::flush() -{ - log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire); - if (const dberr_t err= fd.flush()) - ib::fatal() << "flush(" << fd.get_path() << ") returned " << err; - log_sys.pending_flushes.fetch_sub(1, std::memory_order_release); - log_sys.flushes.fetch_add(1, std::memory_order_release); -} - -void log_t::file::close_file() -{ - if (fd.is_opened()) - { - if (const dberr_t err= fd.close()) - ib::fatal() << "close(" << fd.get_path() << ") returned " << err; - } - fd.free(); // Free path -} - -/** Initialize the redo log. 
*/ -void log_t::file::create() -{ - ut_ad(this == &log_sys.log); - ut_ad(log_sys.is_initialised()); - - format= srv_encrypt_log ? log_t::FORMAT_ENC_10_5 : log_t::FORMAT_10_5; - subformat= 2; - file_size= srv_log_file_size; - lsn= LOG_START_LSN; - lsn_offset= LOG_FILE_HDR_SIZE; -} - -/******************************************************//** -Writes a buffer to a log file. */ -static -void -log_write_buf( - byte* buf, /*!< in: buffer */ - ulint len, /*!< in: buffer len; must be divisible - by OS_FILE_LOG_BLOCK_SIZE */ -#ifdef UNIV_DEBUG - ulint pad_len, /*!< in: pad len in the buffer len */ -#endif /* UNIV_DEBUG */ - lsn_t start_lsn, /*!< in: start lsn of the buffer; must - be divisible by - OS_FILE_LOG_BLOCK_SIZE */ - ulint new_data_offset)/*!< in: start offset of new data in - buf: this parameter is used to decide - if we have to write a new log file - header */ -{ - ulint write_len; - lsn_t next_offset; - ulint i; - - ut_ad(log_write_lock_own()); - ut_ad(!recv_no_log_write); - ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); - -loop: - if (len == 0) { - - return; - } - - next_offset = log_sys.log.calc_lsn_offset(start_lsn); - - if ((next_offset % log_sys.log.file_size) + len - > log_sys.log.file_size) { - /* if the above condition holds, then the below expression - is < len which is ulint, so the typecast is ok */ - write_len = ulint(log_sys.log.file_size - - (next_offset % log_sys.log.file_size)); - } else { - write_len = len; - } - - DBUG_PRINT("ib_log", - ("write " LSN_PF " to " LSN_PF - ": len " ULINTPF - " blocks " ULINTPF ".." 
ULINTPF, - start_lsn, next_offset, - write_len, - log_block_get_hdr_no(buf), - log_block_get_hdr_no( - buf + write_len - - OS_FILE_LOG_BLOCK_SIZE))); - - ut_ad(pad_len >= len - || log_block_get_hdr_no(buf) - == log_block_convert_lsn_to_no(start_lsn)); - - /* Calculate the checksums for each log block and write them to - the trailer fields of the log blocks */ - - for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) { -#ifdef UNIV_DEBUG - ulint hdr_no_2 = log_block_get_hdr_no(buf) + i; - DBUG_EXECUTE_IF("innodb_small_log_block_no_limit", - hdr_no_2 = ((hdr_no_2 - 1) & 0xFUL) + 1;); +#ifdef HAVE_PMEM + if (is_pmem()) + pmem_persist(buf, 512); + else #endif - ut_ad(pad_len >= len - || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len - || log_block_get_hdr_no(buf + i * OS_FILE_LOG_BLOCK_SIZE) == hdr_no_2); - log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE); - } - - log_sys.log.write(next_offset, {buf, write_len}); - - if (write_len < len) { - start_lsn += write_len; - len -= write_len; - buf += write_len; - goto loop; - } + { + log.write(0, {buf, 4096}); + memset_aligned<512>(buf, 0, 512); + } } -/** Flush the recently written changes to the log file. -and invoke mysql_mutex_lock(&log_sys.mutex). 
*/ -static void log_write_flush_to_disk_low(lsn_t lsn) +void log_t::close_file() { - if (!log_sys.log.writes_are_durable()) - log_sys.log.flush(); - ut_a(lsn >= log_sys.get_flushed_lsn()); - log_sys.set_flushed_lsn(lsn); +#ifdef HAVE_PMEM + if (is_pmem()) + { + ut_ad(!is_opened()); + ut_ad(!checkpoint_buf); + if (buf) + { + my_munmap(buf, file_size); + buf= nullptr; + } + return; + } + + ut_free_dodump(buf, buf_size); + buf= nullptr; + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + aligned_free(checkpoint_buf); + checkpoint_buf= nullptr; +#endif + if (is_opened()) + if (const dberr_t err= log.close()) + ib::fatal() << "closing ib_logfile0 failed: " << err; } -/** Swap log buffers, and copy the content of last block -from old buf to the head of the new buf. Thus, buf_free and -buf_next_to_write would be changed accordingly */ -static inline -void -log_buffer_switch() +/** Write an aligned buffer to ib_logfile0. +@param buf buffer to be written +@param len length of data to be written +@param offset log file offset */ +static void log_write_buf(const byte *buf, size_t len, lsn_t offset) { - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(log_write_lock_own()); + ut_ad(write_lock.is_owner()); + ut_ad(!recv_no_log_write); + ut_d(const size_t block_size_1= log_sys.get_block_size() - 1); + ut_ad(!(offset & block_size_1)); + ut_ad(!(len & block_size_1)); + ut_ad(!(size_t(buf) & block_size_1)); + ut_ad(len); - size_t area_end = ut_calc_align( - log_sys.buf_free, OS_FILE_LOG_BLOCK_SIZE); + if (UNIV_LIKELY(offset + len <= log_sys.file_size)) + { +write: + log_sys.log.write(offset, {buf, len}); + return; + } - /* Copy the last block to new buf */ - memcpy_aligned( - log_sys.flush_buf, - log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE, - OS_FILE_LOG_BLOCK_SIZE); - - std::swap(log_sys.buf, log_sys.flush_buf); - - log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE; - log_sys.buf_next_to_write = log_sys.buf_free; + const size_t write_len= size_t(log_sys.file_size - 
offset); + log_sys.log.write(offset, {buf, write_len}); + len-= write_len; + buf+= write_len; + ut_ad(log_sys.START_OFFSET + len < offset); + offset= log_sys.START_OFFSET; + goto write; } /** Invoke commit_checkpoint_notify_ha() to notify that outstanding log writes have been completed. */ void log_flush_notify(lsn_t flush_lsn); -/** -Writes log buffer to disk -which is the "write" part of log_write_up_to(). +#if 0 // Currently we overwrite the last log block until it is complete. +/** CRC-32C of pad messages using between 1 and 15 bytes of NUL bytes +in the payload */ +static const unsigned char pad_crc[15][4]= { + {0xA6,0x59,0xC1,0xDB}, {0xF2,0xAF,0x80,0x73}, {0xED,0x02,0xF1,0x90}, + {0x68,0x4E,0xA3,0xF3}, {0x5D,0x1B,0xEA,0x6A}, {0xE0,0x01,0x86,0xB9}, + {0xD1,0x06,0x86,0xF5}, {0xEB,0x20,0x12,0x33}, {0xBA,0x73,0xB2,0xA3}, + {0x5F,0xA2,0x08,0x03}, {0x70,0x03,0xD6,0x9D}, {0xED,0xB3,0x49,0x78}, + {0xFD,0xD6,0xB9,0x9C}, {0x25,0xF8,0xB1,0x2C}, {0xCD,0xAA,0xE7,0x10} +}; -This function does not flush anything. - -Note : the caller must have log_sys.mutex locked, and this -mutex is released in the function. - -*/ -static void log_write(bool rotate_key) +/** Pad the log with some dummy bytes +@param lsn desired log sequence number +@param pad number of bytes to append to the log +@param begin buffer to write 'pad' bytes to +@param extra buffer for additional pad bytes (up to 15 bytes) +@return additional bytes used in extra[] */ +ATTRIBUTE_NOINLINE +static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) { - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(!recv_no_log_write); - lsn_t write_lsn; - if (log_sys.buf_free == log_sys.buf_next_to_write) { - /* Nothing to write */ - mysql_mutex_unlock(&log_sys.mutex); - return; - } + ut_ad(!(size_t(begin + pad) & (log_sys.get_block_size() - 1))); + byte *b= begin; + const byte seq{log_sys.get_sequence_bit(lsn)}; + /* The caller should never request padding such that the + file would wrap around to the beginning. 
That is, the sequence + bit must be the same for all records. */ + ut_ad(seq == log_sys.get_sequence_bit(lsn + pad)); - ulint start_offset; - ulint end_offset; - ulint area_start; - ulint area_end; - ulong write_ahead_size = srv_log_write_ahead_size; - ulint pad_size; + if (log_sys.is_encrypted()) + { + /* The lengths of our pad messages vary between 15 and 29 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum, 8 NUL bytes nonce). */ + if (pad < 15) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memset(extra + 7, 0, 8); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 15 - pad); + return 15 - pad; + } - DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF, - log_sys.write_lsn, - log_sys.get_lsn())); + /* Pad first with 29-byte messages until the remaining size is + less than 29+15 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (29 + 15) < end; b+= 29) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + memset(b + 21, 0, 8); + } + if (b + 29 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + memset(b + 7, 0, 8); + b+= 15; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 15); + ut_ad(last_pad <= 29); + b[0]= FILE_CHECKPOINT | byte(last_pad - 14); + memset(b + 1, 0, last_pad - 14); + b[last_pad - 13]= seq; + memcpy(b + last_pad - 12, pad_crc[last_pad - 15], 4); + memset(b + last_pad - 8, 0, 8); + } + else + { + /* The lengths of our pad messages vary between 7 and 21 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum). 
*/ + if (pad < 7) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 7 - pad); + return 7 - pad; + } + /* Pad first with 21-byte messages until the remaining size is + less than 21+7 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (21 + 7) < end; b+= 21) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + } + if (b + 21 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + b+= 7; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 7); + ut_ad(last_pad <= 21); + b[0]= FILE_CHECKPOINT | byte(last_pad - 6); + memset(b + 1, 0, last_pad - 6); + b[last_pad - 5]= seq; + memcpy(b + last_pad - 4, pad_crc[last_pad - 7], 4); + } - start_offset = log_sys.buf_next_to_write; - end_offset = log_sys.buf_free; - - area_start = ut_2pow_round(start_offset, - ulint(OS_FILE_LOG_BLOCK_SIZE)); - area_end = ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE)); - - ut_ad(area_end - area_start > 0); - - log_block_set_flush_bit(log_sys.buf + area_start, TRUE); - log_block_set_checkpoint_no( - log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE, - log_sys.next_checkpoint_no); - - write_lsn = log_sys.get_lsn(); - byte *write_buf = log_sys.buf; - - log_buffer_switch(); - - log_sys.log.set_fields(log_sys.write_lsn); - - mysql_mutex_unlock(&log_sys.mutex); - /* Erase the end of the last log block. */ - memset(write_buf + end_offset, 0, - ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1)); - - /* Calculate pad_size if needed. 
*/ - pad_size = 0; - if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) { - ulint end_offset_in_unit; - lsn_t end_offset = log_sys.log.calc_lsn_offset( - ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE)); - end_offset_in_unit = (ulint) (end_offset % write_ahead_size); - - if (end_offset_in_unit > 0 - && (area_end - area_start) > end_offset_in_unit) { - /* The first block in the unit was initialized - after the last writing. - Needs to be written padded data once. */ - pad_size = std::min( - ulint(write_ahead_size) - end_offset_in_unit, - srv_log_buffer_size - area_end); - ::memset(write_buf + area_end, 0, pad_size); - } - } - - if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) { - service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, - "InnoDB log write: " - LSN_PF, log_sys.write_lsn); - } - - if (log_sys.is_encrypted()) { - log_crypt(write_buf + area_start, log_sys.write_lsn, - area_end - area_start, - rotate_key ? LOG_ENCRYPT_ROTATE_KEY : LOG_ENCRYPT); - } - - /* Do the write to the log file */ - log_write_buf( - write_buf + area_start, area_end - area_start + pad_size, -#ifdef UNIV_DEBUG - pad_size, -#endif /* UNIV_DEBUG */ - ut_uint64_align_down(log_sys.write_lsn, - OS_FILE_LOG_BLOCK_SIZE), - start_offset - area_start); - srv_stats.log_padded.add(pad_size); - log_sys.write_lsn = write_lsn; - if (log_sys.log.writes_are_durable()) { - log_sys.set_flushed_lsn(write_lsn); - log_flush_notify(write_lsn); - } - return; -} - -static group_commit_lock write_lock; -static group_commit_lock flush_lock; - -#ifdef UNIV_DEBUG -bool log_write_lock_own() -{ - return write_lock.is_owner(); + return 0; } #endif +#ifdef HAVE_PMEM +/** Persist the log. 
+@param lsn desired new value of flushed_to_disk_lsn */ +inline void log_t::persist(lsn_t lsn) noexcept +{ + ut_ad(is_pmem()); + mysql_mutex_assert_not_owner(&mutex); + ut_ad(!write_lock.is_owner()); + ut_ad(!flush_lock.is_owner()); + + lsn_t old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old >= lsn) + return; + + const size_t start(calc_lsn_offset(old)); + const size_t end(calc_lsn_offset(lsn)); + if (UNIV_UNLIKELY(end < start)) + { + pmem_persist(log_sys.buf + start, log_sys.file_size - start); + pmem_persist(log_sys.buf + log_sys.START_OFFSET, + end - log_sys.START_OFFSET); + } + else + pmem_persist(log_sys.buf + start, end - start); + + old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old >= lsn) + return; + + while (!flushed_to_disk_lsn.compare_exchange_weak + (old, lsn, std::memory_order_release, std::memory_order_relaxed)) + if (old >= lsn) + break; + + log_flush_notify(lsn); + DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); +} +#endif + +/** Write buf to ib_logfile0 and release mutex. 
+@return new write target +@retval 0 if everything was written */ +inline lsn_t log_t::write_buf() noexcept +{ + mysql_mutex_assert_owner(&mutex); + + ut_ad(!srv_read_only_mode); + ut_ad(!is_pmem()); + + const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; + + if (write_lsn >= lsn) + { + mysql_mutex_unlock(&mutex); + ut_ad(write_lsn == lsn); + } + else + { + ut_ad(!recv_no_log_write); + write_lock.set_pending(lsn); + ut_ad(write_lsn >= get_flushed_lsn()); + const size_t block_size_1{get_block_size() - 1}; + const lsn_t offset{calc_lsn_offset(write_lsn) & ~block_size_1}; + + DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF, + write_lsn, lsn, offset)); + const byte *write_buf{buf}; + size_t length{buf_free}; + ut_ad(length >= (calc_lsn_offset(write_lsn) & block_size_1)); + buf_free&= block_size_1; + ut_ad(buf_free == ((lsn - first_lsn) & block_size_1)); + + if (buf_free) + { +#if 0 /* TODO: Pad the last log block with dummy records. */ + buf_free= log_pad(lsn, get_block_size() - buf_free, + buf + buf_free, flush_buf); + ... /* TODO: Update the LSN and adjust other code. */ +#else + /* The rest of the block will be written as garbage. + (We want to avoid memset() while holding mutex.) + This block will be overwritten later, once records beyond + the current LSN are generated. 
*/ + MEM_MAKE_DEFINED(buf + length, get_block_size() - buf_free); + buf[length]= 0; /* allow recovery to catch EOF faster */ + length&= ~block_size_1; + memcpy_aligned<16>(flush_buf, buf + length, (buf_free + 15) & ~15); + length+= get_block_size(); +#endif + } + + std::swap(buf, flush_buf); + write_to_log++; + mysql_mutex_unlock(&mutex); + + if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + { + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "InnoDB log write: " LSN_PF, write_lsn); + } + + /* Do the write to the log file */ + log_write_buf(write_buf, length, offset); + write_lsn= lsn; + if (srv_file_flush_method == SRV_O_DSYNC) + { + flushed_to_disk_lsn.store(lsn, std::memory_order_release); + log_flush_notify(lsn); + } + } + + return write_lock.release(lsn); +} + +inline bool log_t::flush(lsn_t lsn) noexcept +{ + ut_ad(lsn >= get_flushed_lsn()); + flush_lock.set_pending(lsn); + const bool success{log.flush()}; + if (UNIV_LIKELY(success)) + { + flushed_to_disk_lsn.store(lsn, std::memory_order_release); + log_flush_notify(lsn); + } + return success; +} + +/** Ensure that previous log writes are durable. +@param lsn previously written LSN +@return new durable lsn target +@retval 0 if everything was adequately written */ +static lsn_t log_flush(lsn_t lsn) +{ + ut_ad(!log_sys.is_pmem()); + + if (srv_file_flush_method != SRV_O_DSYNC) + ut_a(log_sys.flush(lsn)); + + DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); + return flush_lock.release(lsn); +} + +static const completion_callback dummy_callback{[](void *) {},nullptr}; /** Ensure that the log has been written to the log file up to a given log entry (such as that of a transaction commit). Start a new write, or wait and check if an already running write is covering the request. 
-@param[in] lsn log sequence number that should be -included in the redo log file write -@param[in] flush_to_disk whether the written log should also -be flushed to the file system -@param[in] rotate_key whether to rotate the encryption key */ -void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key, +@param lsn log sequence number that should be included in the file write +@param durable whether the write needs to be durable +@param callback log write completion callback */ +void log_write_up_to(lsn_t lsn, bool durable, const completion_callback *callback) { ut_ad(!srv_read_only_mode); - ut_ad(!rotate_key || flush_to_disk); ut_ad(lsn != LSN_MAX); - if (recv_no_ibuf_operations) + if (UNIV_UNLIKELY(recv_no_ibuf_operations)) { - /* Recovery is running and no operations on the log files are - allowed yet (the variable name .._no_ibuf_.. is misleading) */ + /* A non-final batch of recovery is active; no writes to the log + are allowed yet. */ ut_a(!callback); return; } -repeat: - lsn_t ret_lsn1= 0, ret_lsn2= 0; + ut_ad(lsn <= log_sys.get_lsn()); - if (flush_to_disk && +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + { + ut_ad(!callback); + if (durable) + log_sys.persist(lsn); + return; + } +#endif + +repeat: + if (durable && flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) return; - if (write_lock.acquire(lsn, flush_to_disk ? nullptr : callback) == + lsn_t write_lsn; + + if (write_lock.acquire(lsn, durable ? 
nullptr : callback) == group_commit_lock::ACQUIRED) { mysql_mutex_lock(&log_sys.mutex); - lsn_t write_lsn= log_sys.get_lsn(); - write_lock.set_pending(write_lsn); - - log_write(rotate_key); - - ut_a(log_sys.write_lsn == write_lsn); - ret_lsn1= write_lock.release(write_lsn); + write_lsn= log_sys.write_buf(); } + else + write_lsn= 0; - if (flush_to_disk) + if (durable) { - /* Flush the highest written lsn.*/ - auto flush_lsn = write_lock.value(); - flush_lock.set_pending(flush_lsn); - log_write_flush_to_disk_low(flush_lsn); - ret_lsn2= flush_lock.release(flush_lsn); - - log_flush_notify(flush_lsn); - DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); - } - - if (ret_lsn1 || ret_lsn2) - { - /* - There is no new group commit lead, some async waiters could stall. - Rerun log_write_up_to(), to prevent that. - */ - lsn= std::max(ret_lsn1, ret_lsn2); - static const completion_callback dummy{[](void *) {},nullptr}; - callback= &dummy; - goto repeat; + lsn= log_flush(write_lock.value()); + if (lsn || write_lsn) + { + /* There is no new group commit lead; some async waiters could stall. */ + callback= &dummy_callback; + if (write_lsn > lsn) + lsn= write_lsn; + goto repeat; + } } } /** Write to the log file up to the last log entry. -@param sync whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool sync) +@param durable whether to wait for a durable write to complete */ +void log_buffer_flush_to_disk(bool durable) { ut_ad(!srv_read_only_mode); - log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), sync); + log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable); } /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. 
*/ @@ -882,6 +731,9 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare() { mysql_mutex_assert_not_owner(&log_sys.mutex); + if (log_sys.is_pmem()) + return; + while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) != @@ -892,16 +744,20 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare() ATTRIBUTE_COLD void log_write_and_flush() { ut_ad(!srv_read_only_mode); - auto lsn= log_sys.get_lsn(); - write_lock.set_pending(lsn); - log_write(false); - ut_a(log_sys.write_lsn == lsn); - write_lock.release(lsn); - - lsn= write_lock.value(); - flush_lock.set_pending(lsn); - log_write_flush_to_disk_low(lsn); - flush_lock.release(lsn); + if (!log_sys.is_pmem()) + { + const lsn_t write_lsn{log_sys.write_buf()}; + const lsn_t flush_lsn{log_flush(write_lock.value())}; + if (write_lsn || flush_lsn) + log_write_up_to(std::max(write_lsn, flush_lsn), true, &dummy_callback); + } +#ifdef HAVE_PMEM + else + { + mysql_mutex_unlock(&log_sys.mutex); + log_sys.persist(log_sys.get_lsn()); + } +#endif } /******************************************************************** @@ -910,89 +766,12 @@ Tries to establish a big enough margin of free space in the log buffer, such that a new log entry can be catenated without an immediate need for a flush. */ ATTRIBUTE_COLD static void log_flush_margin() { - lsn_t lsn = 0; + mysql_mutex_lock(&log_sys.mutex); + const bool flush{log_sys.buf_free > log_sys.max_buf_free}; + mysql_mutex_unlock(&log_sys.mutex); - mysql_mutex_lock(&log_sys.mutex); - - if (log_sys.buf_free > log_sys.max_buf_free) { - /* We can write during flush */ - lsn = log_sys.get_lsn(); - } - - mysql_mutex_unlock(&log_sys.mutex); - - if (lsn) { - log_write_up_to(lsn, false); - } -} - -/** Write checkpoint info to the log header and release log_sys.mutex. 
-@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ -ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn) -{ - ut_ad(!srv_read_only_mode); - ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn); - ut_ad(end_lsn <= log_sys.get_lsn()); - ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= log_sys.get_lsn() - || srv_shutdown_state > SRV_SHUTDOWN_INITIATED); - - DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF - " written", - log_sys.next_checkpoint_no, - log_sys.next_checkpoint_lsn)); - - byte* buf = log_sys.checkpoint_buf; - memset_aligned(buf, 0, OS_FILE_LOG_BLOCK_SIZE); - - mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no); - mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn); - - if (log_sys.is_encrypted()) { - log_crypt_write_checkpoint_buf(buf); - } - - lsn_t lsn_offset - = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn); - mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset); - mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, - srv_log_buffer_size); - mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn); - - log_block_store_checksum(buf); - - ut_ad(LOG_CHECKPOINT_1 < srv_page_size); - ut_ad(LOG_CHECKPOINT_2 < srv_page_size); - - ++log_sys.n_pending_checkpoint_writes; - - mysql_mutex_unlock(&log_sys.mutex); - - /* Note: We alternate the physical place of the checkpoint info. - See the (next_checkpoint_no & 1) below. */ - - log_sys.log.write((log_sys.next_checkpoint_no & 1) ? 
LOG_CHECKPOINT_2 - : LOG_CHECKPOINT_1, - {buf, OS_FILE_LOG_BLOCK_SIZE}); - - log_sys.log.flush(); - - mysql_mutex_lock(&log_sys.mutex); - - --log_sys.n_pending_checkpoint_writes; - ut_ad(log_sys.n_pending_checkpoint_writes == 0); - - log_sys.next_checkpoint_no++; - - log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; - - DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF - ", flushed to " LSN_PF, - lsn_t{log_sys.last_checkpoint_lsn}, - log_sys.get_flushed_lsn())); - - MONITOR_INC(MONITOR_NUM_CHECKPOINT); - - mysql_mutex_unlock(&log_sys.mutex); + if (flush) + log_buffer_flush_to_disk(false); } /****************************************************************//** @@ -1175,15 +954,13 @@ wait_suspend_loop: if (log_sys.is_initialised()) { mysql_mutex_lock(&log_sys.mutex); const ulint n_write = log_sys.n_pending_checkpoint_writes; - const ulint n_flush = log_sys.pending_flushes; mysql_mutex_unlock(&log_sys.mutex); - if (n_write || n_flush) { + if (n_write) { if (srv_print_verbose_log && count > 600) { - ib::info() << "Pending checkpoint_writes: " - << n_write - << ". Pending log flush writes: " - << n_flush; + sql_print_information( + "InnoDB: Pending checkpoint writes: " + ULINTPF, n_write); count = 0; } goto loop; @@ -1214,13 +991,16 @@ wait_suspend_loop: "ensuring dirty buffer pool are written to log"); log_make_checkpoint(); + const auto sizeof_cp = log_sys.is_encrypted() + ? 
SIZE_OF_FILE_CHECKPOINT + 8 + : SIZE_OF_FILE_CHECKPOINT; + mysql_mutex_lock(&log_sys.mutex); lsn = log_sys.get_lsn(); const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn - && lsn != log_sys.last_checkpoint_lsn - + SIZE_OF_FILE_CHECKPOINT; + && lsn != log_sys.last_checkpoint_lsn + sizeof_cp; ut_ad(lsn >= log_sys.last_checkpoint_lsn); mysql_mutex_unlock(&log_sys.mutex); @@ -1228,10 +1008,8 @@ wait_suspend_loop: if (lsn_changed) { goto loop; } - - log_sys.log.flush(); } else { - lsn = recv_sys.recovered_lsn; + lsn = recv_sys.lsn; } srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; @@ -1246,10 +1024,10 @@ wait_suspend_loop: ut_a(lsn == log_sys.get_lsn() || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); - if (UNIV_UNLIKELY(lsn < recv_sys.recovered_lsn)) { - ib::error() << "Shutdown LSN=" << lsn - << " is less than start LSN=" - << recv_sys.recovered_lsn; + if (UNIV_UNLIKELY(lsn < recv_sys.lsn)) { + sql_print_error("InnoDB: Shutdown LSN=" LSN_PF + " is less than start LSN=" LSN_PF, + lsn, recv_sys.lsn); } srv_shutdown_lsn = lsn; @@ -1307,10 +1085,8 @@ log_print( } fprintf(file, - ULINTPF " pending log flushes, " ULINTPF " pending chkp writes\n" ULINTPF " log i/o's done, %.2f log i/o's/second\n", - log_sys.pending_flushes.load(), log_sys.n_pending_checkpoint_writes, log_sys.n_log_ios, static_cast( @@ -1338,21 +1114,27 @@ void log_t::close() { ut_ad(this == &log_sys); if (!is_initialised()) return; - m_initialised= false; - log.close(); + close_file(); - ut_free_dodump(buf, srv_log_buffer_size); +#ifndef HAVE_PMEM + ut_free_dodump(buf, buf_size); buf= nullptr; - ut_free_dodump(flush_buf, srv_log_buffer_size); + ut_free_dodump(flush_buf, buf_size); flush_buf= nullptr; + aligned_free(checkpoint_buf); + checkpoint_buf= nullptr; +#else + ut_ad(!checkpoint_buf); + ut_ad(!buf); + ut_ad(!flush_buf); +#endif mysql_mutex_destroy(&mutex); mysql_mutex_destroy(&flush_order_mutex); recv_sys.close(); - aligned_free(checkpoint_buf); - checkpoint_buf= nullptr; + max_buf_free= 0; } 
std::string get_log_file_path(const char *filename) @@ -1376,23 +1158,3 @@ std::string get_log_file_path(const char *filename) return path; } - -std::vector get_existing_log_files_paths() { - std::vector result; - - for (int i= 0; i < 101; i++) { - auto path= get_log_file_path(LOG_FILE_NAME_PREFIX) - .append(std::to_string(i)); - os_file_stat_t stat; - dberr_t err= os_file_get_status(path.c_str(), &stat, false, true); - if (err) - break; - - if (stat.type != OS_FILE_TYPE_FILE) - break; - - result.push_back(std::move(path)); - } - - return result; -} diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 5a7a6076322..530779a7e07 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -87,7 +87,7 @@ is bigger than the lsn we are able to scan up to, that is an indication that the recovery failed and the database may be corrupt. */ static lsn_t recv_max_page_lsn; -/** Stored physical log record with logical LSN (@see log_t::FORMAT_10_5) */ +/** Stored physical log record with logical LSN */ struct log_phys_t : public log_rec_t { /** start LSN of the mini-transaction (not necessarily of this record) */ @@ -248,19 +248,19 @@ public: memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8); mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id().space()); last_offset= FIL_PAGE_TYPE; - next_after_applying: + next_after_applying: if (applied == APPLIED_NO) applied= APPLIED_YES; } else { - record_corrupted: + record_corrupted: if (!srv_force_recovery) { recv_sys.set_corrupt_log(); return applied; } - next_not_same_page: + next_not_same_page: last_offset= 1; /* the next record must not be same_page */ } next: @@ -307,7 +307,7 @@ public: goto record_corrupted; if (undo_append(block, ++l, --rlen) && !srv_force_recovery) { -page_corrupted: + page_corrupted: sql_print_error("InnoDB: Set innodb_force_recovery=1" " to ignore corruption."); recv_sys.set_corrupt_log(); @@ -1112,36 +1112,22 @@ inline void 
recv_sys_t::trim(const page_id_t page_id, lsn_t lsn) DBUG_VOID_RETURN; } -void recv_sys_t::open_log_files_if_needed() +inline void recv_sys_t::read(os_offset_t total_offset, span buf) { - if (!recv_sys.files.empty()) - return; - - for (auto &&path : get_existing_log_files_paths()) - { - recv_sys.files.emplace_back(std::move(path)); - ut_a(recv_sys.files.back().open(true) == DB_SUCCESS); - } -} - -void recv_sys_t::read(os_offset_t total_offset, span buf) -{ - open_log_files_if_needed(); - - size_t file_idx= static_cast(total_offset / log_sys.log.file_size); - os_offset_t offset= total_offset % log_sys.log.file_size; + size_t file_idx= static_cast(total_offset / log_sys.file_size); + os_offset_t offset= total_offset % log_sys.file_size; dberr_t err= recv_sys.files[file_idx].read(offset, buf); ut_a(err == DB_SUCCESS); } inline size_t recv_sys_t::files_size() { - open_log_files_if_needed(); + ut_ad(!files.empty()); return files.size(); } /** Process a file name from a FILE_* record. -@param[in,out] name file name +@param[in] name file name @param[in] len length of the file name @param[in] space_id the tablespace ID @param[in] deleted whether this is a FILE_DELETE record @@ -1150,8 +1136,8 @@ inline size_t recv_sys_t::files_size() stored */ static void -fil_name_process(char* name, ulint len, uint32_t space_id, - bool deleted, lsn_t lsn, store_t *store) +fil_name_process(const char* name, ulint len, uint32_t space_id, + bool deleted, lsn_t lsn, store_t store) { if (srv_operation == SRV_OPERATION_BACKUP) { return; @@ -1196,7 +1182,7 @@ fil_name_process(char* name, ulint len, uint32_t space_id, the space_id. If not, ignore the file after displaying a note. Abort if there are multiple files with the same space_id. 
*/ - switch (fil_ibd_load(space_id, name, space)) { + switch (fil_ibd_load(space_id, fname.name.c_str(), space)) { case FIL_LOAD_OK: ut_ad(space != NULL); @@ -1219,12 +1205,14 @@ same_space: sql_print_error("InnoDB: Tablespace " UINT32PF " has been found" " in two places:" - " '%.*s' and '%s'." + " '%.*s' and '%.*s'." " You must delete" " one of them.", space_id, int(f.name.size()), - f.name.data(), name); + f.name.data(), + int(fname.name.size()), + fname.name.data()); recv_sys.set_corrupt_fs(); } break; @@ -1249,20 +1237,21 @@ same_space: sql_print_information( "InnoDB: At LSN: " LSN_PF - ": unable to open file %s" + ": unable to open file %.*s" " for tablespace " UINT32PF, - recv_sys.recovered_lsn, - name, space_id); + recv_sys.lsn, + int(fname.name.size()), + fname.name.data(), space_id); } break; case FIL_LOAD_DEFER: /** Skip the deferred spaces when lsn is already processed */ - if (*store != store_t::STORE_IF_EXISTS) { + if (store != store_t::STORE_IF_EXISTS) { deferred_spaces.add( static_cast(space_id), - name, lsn); + fname.name.c_str(), lsn); } break; case FIL_LOAD_INVALID: @@ -1319,12 +1308,6 @@ void recv_sys_t::close() deferred_spaces.clear(); ut_d(mysql_mutex_unlock(&mutex)); - if (buf) - { - ut_free_dodump(buf, RECV_PARSING_BUF_SIZE); - buf= nullptr; - } - last_stored_lsn= 0; mysql_mutex_destroy(&mutex); pthread_cond_destroy(&cond); @@ -1347,17 +1330,12 @@ void recv_sys_t::create() apply_log_recs = false; apply_batch_on = false; - buf = static_cast(ut_malloc_dontdump(RECV_PARSING_BUF_SIZE, - PSI_INSTRUMENT_ME)); len = 0; - parse_start_lsn = 0; - scanned_lsn = 0; - scanned_checkpoint_no = 0; - recovered_offset = 0; - recovered_lsn = 0; + offset = 0; + lsn = 0; found_corrupt_log = false; found_corrupt_fs = false; - mlog_checkpoint_lsn = 0; + file_checkpoint = 0; progress_time = time(NULL); recv_max_page_lsn = 0; @@ -1398,9 +1376,6 @@ void recv_sys_t::debug_free() recovery_on= false; pages.clear(); - ut_free_dodump(buf, RECV_PARSING_BUF_SIZE); - - buf= 
nullptr; mysql_mutex_unlock(&mutex); } @@ -1484,176 +1459,11 @@ inline void recv_sys_t::free(const void *data) } -/** Read a log segment to log_sys.buf. -@param[in,out] start_lsn in: read area start, -out: the last read valid lsn -@param[in] end_lsn read area end -@return whether no invalid blocks (e.g checksum mismatch) were found */ -bool log_t::file::read_log_seg(lsn_t* start_lsn, lsn_t end_lsn) +/** @return whether a log_t::FORMAT_10_5 log block checksum matches */ +static bool recv_check_log_block(const byte *buf) { - ulint len; - bool success = true; - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(!(*start_lsn % OS_FILE_LOG_BLOCK_SIZE)); - ut_ad(!(end_lsn % OS_FILE_LOG_BLOCK_SIZE)); - byte* buf = log_sys.buf; -loop: - lsn_t source_offset = calc_lsn_offset_old(*start_lsn); - - ut_a(end_lsn - *start_lsn <= ULINT_MAX); - len = (ulint) (end_lsn - *start_lsn); - - ut_ad(len != 0); - - const bool at_eof = (source_offset % file_size) + len > file_size; - if (at_eof) { - /* If the above condition is true then len (which is ulint) - is > the expression below, so the typecast is ok */ - len = ulint(file_size - (source_offset % file_size)); - } - - log_sys.n_log_ios++; - - ut_a((source_offset >> srv_page_size_shift) <= ULINT_MAX); - - recv_sys.read(source_offset, {buf, len}); - - for (ulint l = 0; l < len; l += OS_FILE_LOG_BLOCK_SIZE, - buf += OS_FILE_LOG_BLOCK_SIZE, - (*start_lsn) += OS_FILE_LOG_BLOCK_SIZE) { - const ulint block_number = log_block_get_hdr_no(buf); - - if (block_number != log_block_convert_lsn_to_no(*start_lsn)) { - /* Garbage or an incompletely written log block. - We will not report any error, because this can - happen when InnoDB was killed while it was - writing redo log. We simply treat this as an - abrupt end of the redo log. 
*/ -fail: - end_lsn = *start_lsn; - success = false; - break; - } - - ulint crc = log_block_calc_checksum_crc32(buf); - ulint cksum = log_block_get_checksum(buf); - - DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch", { - static int block_counter; - if (block_counter++ == 0) { - cksum = crc + 1; - } - }); - - DBUG_EXECUTE_IF("log_checksum_mismatch", { cksum = crc + 1; }); - - if (UNIV_UNLIKELY(crc != cksum)) { - ib::error_or_warn(srv_operation!=SRV_OPERATION_BACKUP) - << "Invalid log block checksum. block: " - << block_number - << " checkpoint no: " - << log_block_get_checkpoint_no(buf) - << " expected: " << crc - << " found: " << cksum; - goto fail; - } - - if (is_encrypted() - && !log_crypt(buf, *start_lsn, - OS_FILE_LOG_BLOCK_SIZE, - LOG_DECRYPT)) { - goto fail; - } - - ulint dl = log_block_get_data_len(buf); - if (dl < LOG_BLOCK_HDR_SIZE - || (dl != OS_FILE_LOG_BLOCK_SIZE - && dl > log_sys.trailer_offset())) { - recv_sys.set_corrupt_log(); - goto fail; - } - } - - if (recv_sys.report(time(NULL))) { - sql_print_information("InnoDB: Read redo log up to LSN=" - LSN_PF, *start_lsn); - service_manager_extend_timeout( - INNODB_EXTEND_TIMEOUT_INTERVAL, - "Read redo log up to LSN=" LSN_PF, *start_lsn); - } - - if (*start_lsn != end_lsn) { - goto loop; - } - - return(success); -} - - - -/********************************************************//** -Copies a log segment from the most up-to-date log group to the other log -groups, so that they all contain the latest log data. Also writes the info -about the latest checkpoint to the groups, and inits the fields in the group -memory structs to up-to-date values. 
*/ -static -void -recv_synchronize_groups() -{ - const lsn_t recovered_lsn = recv_sys.recovered_lsn; - - /* Read the last recovered log block to the recovery system buffer: - the block is always incomplete */ - - lsn_t start_lsn = ut_uint64_align_down(recovered_lsn, - OS_FILE_LOG_BLOCK_SIZE); - log_sys.log.read_log_seg(&start_lsn, - start_lsn + OS_FILE_LOG_BLOCK_SIZE); - log_sys.log.set_fields(recovered_lsn); - - /* Copy the checkpoint info to the log; remember that we have - incremented checkpoint_no by one, and the info will not be written - over the max checkpoint info, thus making the preservation of max - checkpoint info on disk certain */ - - if (!srv_read_only_mode) { - log_write_checkpoint_info(0); - mysql_mutex_lock(&log_sys.mutex); - } -} - -/** Check the consistency of a log header block. -@param[in] log header block -@return true if ok */ -static -bool -recv_check_log_header_checksum( - const byte* buf) -{ - return(log_block_get_checksum(buf) - == log_block_calc_checksum_crc32(buf)); -} - -static bool redo_file_sizes_are_correct() -{ - auto paths= get_existing_log_files_paths(); - auto get_size= [](const std::string &path) { - return os_file_get_size(path.c_str()).m_total_size; - }; - os_offset_t size= get_size(paths[0]); - - auto it= - std::find_if(paths.begin(), paths.end(), [&](const std::string &path) { - return get_size(path) != size; - }); - - if (it == paths.end()) - return true; - - sql_print_error("InnoDB: Log file %.*s is of different size " UINT64PF - " bytes than other log files " UINT64PF " bytes!", - int(it->size()), it->data(), get_size(*it), size); - return false; + return mach_read_from_4(my_assume_aligned<4>(508 + buf)) == + my_crc32c(0, buf, 508); } /** Calculate the checksum for a log block using the pre-10.2.2 algorithm. 
*/ @@ -1682,12 +1492,8 @@ inline uint32_t log_block_calc_checksum_format_0(const byte *b) ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2() { uint64_t max_no= 0; - byte *buf= log_sys.buf; - ut_ad(log_sys.log.format == 0); - - if (!redo_file_sizes_are_correct()) - return DB_CORRUPTION; + ut_ad(log_sys.format == 0); /** Offset of the first checkpoint checksum */ constexpr uint CHECKSUM_1= 288; @@ -1700,22 +1506,21 @@ ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2() /** Least significant bits of the checkpoint offset */ constexpr uint OFFS_LO= 16; - lsn_t lsn= 0; - - for (ulint field= LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2; - field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) + lsn_t source_offset= 0; + const lsn_t log_size{(log_sys.file_size - 2048) * recv_sys.files_size()}; + for (size_t field= 512; field < 2048; field+= 1024) { - log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE}); + const byte *buf= log_sys.buf + field; if (static_cast(ut_fold_binary(buf, CHECKSUM_1)) != mach_read_from_4(buf + CHECKSUM_1) || static_cast(ut_fold_binary(buf + CHECKPOINT_LSN, CHECKSUM_2 - CHECKPOINT_LSN)) != mach_read_from_4(buf + CHECKSUM_2)) - { - DBUG_LOG("ib_log", "invalid pre-10.2.2 checkpoint " << field); - continue; - } + { + DBUG_PRINT("ib_log", ("invalid pre-10.2.2 checkpoint %zu", field)); + continue; + } if (!log_crypt_101_read_checkpoint(buf)) { @@ -1729,287 +1534,342 @@ ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2() checkpoint_no, mach_read_from_8(buf + CHECKPOINT_LSN))); - if (checkpoint_no >= max_no) + if (checkpoint_no < max_no) + continue; + + const lsn_t o= lsn_t{mach_read_from_4(buf + OFFS_HI)} << 32 | + mach_read_from_4(buf + OFFS_LO); + if (o >= 0x80c && (o & ~511) + 512 < log_size) { max_no= checkpoint_no; - lsn= mach_read_from_8(buf + CHECKPOINT_LSN); - log_sys.log.set_lsn(lsn); - log_sys.log.set_lsn_offset(lsn_t{mach_read_from_4(buf + OFFS_HI)} << 32 | - mach_read_from_4(buf + OFFS_LO)); + log_sys.next_checkpoint_lsn= 
mach_read_from_8(buf + CHECKPOINT_LSN); + source_offset= o; } } - if (!lsn) + const char *uag= srv_operation == SRV_OPERATION_NORMAL + ? "InnoDB: Upgrade after a crash is not supported." + : "mariadb-backup --prepare is not possible."; + + if (!log_sys.next_checkpoint_lsn) { - sql_print_error("InnoDB: Upgrade after a crash is not supported." + sql_print_error("%s" " This redo log was created before MariaDB 10.2.2," " and we did not find a valid checkpoint." " Please follow the instructions at" - " https://mariadb.com/kb/en/library/upgrading/"); + " https://mariadb.com/kb/en/library/upgrading/", uag); return DB_ERROR; } - log_sys.set_lsn(lsn); - log_sys.set_flushed_lsn(lsn); - const lsn_t source_offset= log_sys.log.calc_lsn_offset_old(lsn); - - static const char NO_UPGRADE_RECOVERY_MSG[]= - "Upgrade after a crash is not supported." + static const char pre_10_2[]= " This redo log was created before MariaDB 10.2.2"; - recv_sys.read(source_offset & ~511, {buf, 512}); + byte *buf= const_cast(field_ref_zero); - if (log_block_calc_checksum_format_0(buf) != log_block_get_checksum(buf) && - !log_crypt_101_read_block(buf, lsn)) + if (source_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + memcpy_aligned<512>(buf, &log_sys.buf[source_offset & ~511], 512); + else + recv_sys.read(source_offset & ~511, {buf, 512}); + + if (log_block_calc_checksum_format_0(buf) != + mach_read_from_4(my_assume_aligned<4>(buf + 508)) && + !log_crypt_101_read_block(buf, log_sys.next_checkpoint_lsn)) { - sql_print_error("InnoDB: %s, and it appears corrupted.", - NO_UPGRADE_RECOVERY_MSG); + sql_print_error("%s%s, and it appears corrupted.", uag, pre_10_2); return DB_CORRUPTION; } if (mach_read_from_2(buf + 4) == (source_offset & 511)) - { - /* Mark the redo log for upgrading. 
*/ - srv_log_file_size= 0; - recv_sys.parse_start_lsn= recv_sys.recovered_lsn= recv_sys.scanned_lsn= - recv_sys.mlog_checkpoint_lsn = lsn; - log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn= - log_sys.write_lsn= log_sys.current_flush_lsn= lsn; - log_sys.next_checkpoint_no= 0; return DB_SUCCESS; - } if (buf[20 + 32 * 9] == 2) sql_print_error("InnoDB: Cannot decrypt log for upgrading." " The encrypted log was created before MariaDB 10.2.2."); else - sql_print_error("InnoDB: %s.", NO_UPGRADE_RECOVERY_MSG); + sql_print_error("%s%s.", uag, pre_10_2); return DB_ERROR; } -/** Calculate the offset of a log sequence number -in an old redo log file (during upgrade check). -@param[in] lsn log sequence number -@return byte offset within the log */ -inline lsn_t log_t::file::calc_lsn_offset_old(lsn_t lsn) const -{ - const lsn_t size= capacity() * recv_sys.files_size(); - lsn_t l= lsn - this->lsn; - if (longlong(l) < 0) - { - l= lsn_t(-longlong(l)) % size; - l= size - l; - } - - l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size); - l%= size; - return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE)); -} - -/** Determine if a redo log from MariaDB 10.2.2+, 10.3, or 10.4 is clean. +/** Determine if a redo log from MariaDB 10.2.2, 10.3, 10.4, or 10.5 is clean. +@param lsn_offset checkpoint LSN offset @return error code @retval DB_SUCCESS if the redo log is clean @retval DB_CORRUPTION if the redo log is corrupted @retval DB_ERROR if the redo log is not empty */ -static dberr_t recv_log_recover_10_4() +static dberr_t recv_log_recover_10_5(lsn_t lsn_offset) { - const lsn_t lsn = log_sys.log.get_lsn(); - const lsn_t source_offset = log_sys.log.calc_lsn_offset_old(lsn); - byte* buf = log_sys.buf; + byte *buf= const_cast(field_ref_zero); - if (!redo_file_sizes_are_correct()) { - return DB_CORRUPTION; - } + if (lsn_offset < (log_sys.is_pmem() ? 
log_sys.file_size : 4096)) + memcpy_aligned<512>(buf, &log_sys.buf[lsn_offset & ~511], 512); + else + recv_sys.read(lsn_offset & ~511, {buf, 512}); - recv_sys.read(source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1), - {buf, OS_FILE_LOG_BLOCK_SIZE}); + if (!recv_check_log_block(buf)) + { + sql_print_error("InnoDB: Invalid log header checksum"); + return DB_CORRUPTION; + } - ulint crc = log_block_calc_checksum_crc32(buf); - ulint cksum = log_block_get_checksum(buf); + if (log_sys.is_encrypted() && + !log_decrypt(buf, log_sys.next_checkpoint_lsn & ~511, 512)) + return DB_ERROR; - if (UNIV_UNLIKELY(crc != cksum)) { - sql_print_error("InnoDB: Invalid log block checksum." - " block: " ULINTPF " checkpoint no: " ULINTPF - " expected: " ULINTPF " found: " ULINTPF, - log_block_get_hdr_no(buf), - log_block_get_checkpoint_no(buf), crc, cksum); - return DB_CORRUPTION; - } + /* On a clean shutdown, the redo log will be logically empty + after the checkpoint lsn. */ - if (log_sys.log.is_encrypted() - && !log_crypt(buf, lsn & ~511, 512, LOG_DECRYPT)) { - return DB_ERROR; - } + if (mach_read_from_2(my_assume_aligned<2>(buf + 4)) != (lsn_offset & 511)) + return DB_ERROR; - /* On a clean shutdown, the redo log will be logically empty - after the checkpoint lsn. */ - - if (log_block_get_data_len(buf) - != (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) { - return DB_ERROR; - } - - /* Mark the redo log for upgrading. */ - srv_log_file_size = 0; - recv_sys.parse_start_lsn = recv_sys.recovered_lsn - = recv_sys.scanned_lsn - = recv_sys.mlog_checkpoint_lsn = lsn; - log_sys.set_lsn(lsn); - log_sys.set_flushed_lsn(lsn); - log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn - = log_sys.write_lsn = log_sys.current_flush_lsn = lsn; - log_sys.next_checkpoint_no = 0; - return DB_SUCCESS; + return DB_SUCCESS; } -/** Find the latest checkpoint in the log header. 
-@param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 -@return error code or DB_SUCCESS */ -dberr_t -recv_find_max_checkpoint(ulint* max_field) +dberr_t recv_sys_t::find_checkpoint() { - ib_uint64_t max_no; - ib_uint64_t checkpoint_no; - ulint field; - byte* buf; + bool wrong_size= false; + byte *buf; - max_no = 0; - *max_field = 0; + if (files.empty()) + { + file_checkpoint= 0; + std::string path{get_log_file_path()}; + bool success; + pfs_os_file_t file= os_file_create(innodb_log_file_key, path.c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, + srv_read_only_mode, &success); + if (file == OS_FILE_CLOSED) + return DB_ERROR; + const os_offset_t size{os_file_get_size(file)}; + if (!size) + { + if (srv_operation != SRV_OPERATION_NORMAL) + goto too_small; + } + else if (size < log_t::START_OFFSET + SIZE_OF_FILE_CHECKPOINT) + { + too_small: + os_file_close(file); + sql_print_error("InnoDB: File %.*s is too small", + int(path.size()), path.data()); + return DB_ERROR; + } - buf = log_sys.checkpoint_buf; + log_sys.attach(file, size); + recv_sys.files.emplace_back(file); + for (int i= 1; i < 101; i++) + { + path= get_log_file_path(LOG_FILE_NAME_PREFIX).append(std::to_string(i)); + file= os_file_create(innodb_log_file_key, path.c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT | + OS_FILE_ON_ERROR_SILENT, + OS_FILE_NORMAL, OS_LOG_FILE, true, &success); + if (file == OS_FILE_CLOSED) + break; + const os_offset_t sz{os_file_get_size(file)}; + if (size != sz) + { + sql_print_error("InnoDB: Log file %.*s is of different size " UINT64PF + " bytes than other log files " UINT64PF " bytes!", + int(path.size()), path.data(), sz, size); + wrong_size= true; + } + recv_sys.files.emplace_back(file); + } - log_sys.log.read(0, {buf, OS_FILE_LOG_BLOCK_SIZE}); - /* Check the header page checksum. There was no - checksum in the first redo log format (version 0). 
*/ - log_sys.log.format = mach_read_from_4(buf + LOG_HEADER_FORMAT); - log_sys.log.subformat = log_sys.log.format != log_t::FORMAT_3_23 - ? mach_read_from_4(buf + LOG_HEADER_SUBFORMAT) - : 0; - if (log_sys.log.format != log_t::FORMAT_3_23 - && !recv_check_log_header_checksum(buf)) { - sql_print_error("InnoDB: Invalid redo log header checksum."); - return(DB_CORRUPTION); - } + if (!size) + { + if (wrong_size) + return DB_CORRUPTION; + if (log_sys.next_checkpoint_lsn < 8204) + { + /* Before MDEV-14425, InnoDB had a minimum LSN of 8192+12=8204. + Likewise, mariadb-backup --prepare would create an empty + ib_logfile0 after applying the log. We will allow an upgrade + from such an empty log. - char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1]; + If a user replaces the redo log with an empty file and the + FIL_PAGE_FILE_FLUSH_LSN field was zero in the system + tablespace (see SysTablespace::read_lsn_and_check_flags()) we + must refuse to start up. */ + sql_print_error("InnoDB: ib_logfile0 is empty, and LSN is unknown."); + return DB_CORRUPTION; + } + lsn= log_sys.next_checkpoint_lsn; + log_sys.format= log_t::FORMAT_3_23; + goto upgrade; + } + } + else + ut_ad(srv_operation == SRV_OPERATION_BACKUP); + log_sys.next_checkpoint_lsn= 0; + lsn= 0; + buf= my_assume_aligned<4096>(log_sys.buf); + if (!log_sys.is_pmem()) + log_sys.log.read(0, {buf, 4096}); + /* Check the header page checksum. There was no + checksum in the first redo log format (version 0). */ + log_sys.format= mach_read_from_4(buf + LOG_HEADER_FORMAT); + if (log_sys.format == log_t::FORMAT_3_23) + { + if (wrong_size) + return DB_CORRUPTION; + if (dberr_t err= recv_log_recover_pre_10_2()) + return err; + upgrade: + memset_aligned<512>(const_cast(field_ref_zero), 0, 512); + /* Mark the redo log for upgrading. 
*/ + log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn; + log_sys.set_recovered_lsn(log_sys.next_checkpoint_lsn); + lsn= file_checkpoint= log_sys.next_checkpoint_lsn; + log_sys.next_checkpoint_no= 0; + return DB_SUCCESS; + } - memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator); - /* Ensure that the string is NUL-terminated. */ - creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR] = 0; + if (!recv_check_log_block(buf)) + { + sql_print_error("InnoDB: Invalid log header checksum"); + return DB_CORRUPTION; + } - switch (log_sys.log.format) { - case log_t::FORMAT_3_23: - return recv_log_recover_pre_10_2(); - case log_t::FORMAT_10_2: - case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED: - case log_t::FORMAT_10_3: - case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED: - case log_t::FORMAT_10_4: - case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED: - case log_t::FORMAT_10_5: - case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED: - break; - default: - sql_print_error("InnoDB: Unsupported redo log format." - " The redo log was created with %s.", creator); - return(DB_ERROR); - } + const lsn_t first_lsn{mach_read_from_8(buf + LOG_HEADER_START_LSN)}; + log_sys.set_first_lsn(first_lsn); + char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1]; + memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator); + /* Ensure that the string is NUL-terminated. */ + creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR]= 0; - for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2; - field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) { - log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE}); + lsn_t lsn_offset= 0; - const ulint crc32 = log_block_calc_checksum_crc32(buf); - const ulint cksum = log_block_get_checksum(buf); + switch (log_sys.format) { + default: + sql_print_error("InnoDB: Unsupported redo log format." 
+ " The redo log was created with %s.", creator); + return DB_ERROR; + case log_t::FORMAT_10_8: + if (files.size() != 1) + { + sql_print_error("InnoDB: Expecting only ib_logfile0"); + return DB_CORRUPTION; + } - if (crc32 != cksum) { - DBUG_PRINT("ib_log", - ("invalid checkpoint," - " at " ULINTPF - ", checksum " ULINTPFx - " expected " ULINTPFx, - field, cksum, crc32)); - continue; - } + if (*reinterpret_cast(buf + LOG_HEADER_FORMAT + 4) || + first_lsn < log_t::FIRST_LSN) + { + sql_print_error("InnoDB: Invalid ib_logfile0 header block;" + " the log was created with %s.", creator); + return DB_CORRUPTION; + } - if (log_sys.is_encrypted() - && !log_crypt_read_checkpoint_buf(buf)) { - sql_print_error("InnoDB: Reading checkpoint" - " encryption info failed."); - continue; - } + if (!mach_read_from_4(buf + LOG_HEADER_CREATOR_END)); + else if (!log_crypt_read_header(buf + LOG_HEADER_CREATOR_END)) + { + sql_print_error("InnoDB: Reading log encryption info failed;" + " the log was created with %s.", creator); + return DB_ERROR; + } + else + log_sys.format= log_t::FORMAT_ENC_10_8; - checkpoint_no = mach_read_from_8( - buf + LOG_CHECKPOINT_NO); + for (size_t field= log_t::CHECKPOINT_1; field <= log_t::CHECKPOINT_2; + field+= log_t::CHECKPOINT_2 - log_t::CHECKPOINT_1) + { + if (log_sys.is_pmem()) + buf= log_sys.buf + field; + else + log_sys.log.read(field, {buf, log_sys.get_block_size()}); + const lsn_t checkpoint_lsn{mach_read_from_8(buf)}; + const lsn_t end_lsn{mach_read_from_8(buf + 8)}; + if (checkpoint_lsn < first_lsn || end_lsn < checkpoint_lsn || + memcmp(buf + 16, field_ref_zero, 60 - 16) || + my_crc32c(0, buf, 60) != mach_read_from_4(buf + 60)) + { + DBUG_PRINT("ib_log", ("invalid checkpoint at %zu", field)); + continue; + } - DBUG_PRINT("ib_log", - ("checkpoint " UINT64PF " at " LSN_PF " found", - checkpoint_no, mach_read_from_8( - buf + LOG_CHECKPOINT_LSN))); + if (checkpoint_lsn >= log_sys.next_checkpoint_lsn) + { + log_sys.next_checkpoint_lsn= checkpoint_lsn; + 
log_sys.next_checkpoint_no= field == log_t::CHECKPOINT_1; + lsn= end_lsn; + } + } + if (!log_sys.next_checkpoint_lsn) + goto got_no_checkpoint; + if (!memcmp(creator, "Backup ", 7)) + srv_start_after_restore= true; + return DB_SUCCESS; + case log_t::FORMAT_10_5: + case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED: + if (files.size() != 1) + { + sql_print_error("InnoDB: Expecting only ib_logfile0"); + return DB_CORRUPTION; + } + /* fall through */ + case log_t::FORMAT_10_2: + case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_3: + case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_4: + case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED: + uint64_t max_no= 0; + const lsn_t log_size{(log_sys.file_size - 2048) * files.size()}; + for (size_t field= 512; field < 2048; field += 1024) + { + const byte *b = buf + field; - if (checkpoint_no >= max_no) { - *max_field = field; - max_no = checkpoint_no; - log_sys.log.set_lsn(mach_read_from_8( - buf + LOG_CHECKPOINT_LSN)); - log_sys.log.set_lsn_offset(mach_read_from_8( - buf + LOG_CHECKPOINT_OFFSET)); - log_sys.next_checkpoint_no = checkpoint_no; - } - } + if (!recv_check_log_block(b)) + { + DBUG_PRINT("ib_log", ("invalid checkpoint checksum at %zu", field)); + continue; + } - if (*max_field == 0) { - /* Before 10.2.2, we could get here during database - initialization if we created an ib_logfile0 file that - was filled with zeroes, and were killed. After - 10.2.2, we would reject such a file already earlier, - when checking the file header. */ - sql_print_error("InnoDB: No valid checkpoint found" - " (corrupted redo log)." 
- " You can try --innodb-force-recovery=6" - " as a last resort."); - return(DB_ERROR); - } + if (log_sys.is_encrypted() && !log_crypt_read_checkpoint_buf(b)) + { + sql_print_error("InnoDB: Reading checkpoint encryption info failed."); + continue; + } - switch (log_sys.log.format) { - case log_t::FORMAT_10_5: - case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED: - break; - default: - if (dberr_t err = recv_log_recover_10_4()) { - sql_print_error("InnoDB: Upgrade after a crash" - " is not supported." - " The redo log was created with %s%s.", - creator, - (err == DB_ERROR - ? "" : ", and it appears corrupted")); - return err; - } - } + const uint64_t checkpoint_no= mach_read_from_8(b); + const lsn_t checkpoint_lsn= mach_read_from_8(b + 8); + DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found", + checkpoint_no, checkpoint_lsn)); + const lsn_t o{mach_read_from_8(b + 16)}; + if (checkpoint_no >= max_no && o >= 0x80c && (o & ~511) + 512 < log_size) + { + max_no= checkpoint_no; + log_sys.next_checkpoint_lsn= checkpoint_lsn; + log_sys.next_checkpoint_no= field == 512; + lsn_offset= mach_read_from_8(b + 16); + } + } + } - return(DB_SUCCESS); -} + if (!log_sys.next_checkpoint_lsn) + { + got_no_checkpoint: + sql_print_error("InnoDB: No valid checkpoint was found;" + " the log was created with %s.", creator); + return DB_ERROR; + } -/*******************************************************//** -Calculates the new value for lsn when more data is added to the log. 
*/ -static -lsn_t -recv_calc_lsn_on_data_add( -/*======================*/ - lsn_t lsn, /*!< in: old lsn */ - ib_uint64_t len) /*!< in: this many bytes of data is - added, log block headers not included */ -{ - unsigned frag_len = static_cast(lsn % OS_FILE_LOG_BLOCK_SIZE) - - LOG_BLOCK_HDR_SIZE; - unsigned payload_size = log_sys.payload_size(); - ut_ad(frag_len < payload_size); - lsn_t lsn_len = len; - lsn_len += (lsn_len + frag_len) / payload_size - * (OS_FILE_LOG_BLOCK_SIZE - payload_size); + if (wrong_size) + return DB_CORRUPTION; - return(lsn + lsn_len); + if (dberr_t err= recv_log_recover_10_5(lsn_offset)) + { + sql_print_error("%s The redo log was created with %s%s", + srv_operation == SRV_OPERATION_NORMAL + ? "InnoDB: Upgrade after a crash is not supported." + : "mariadb-backup --prepare is not possible", creator, + (err == DB_ERROR ? "." : ", and it appears corrupted.")); + return err; + } + + goto upgrade; } /** Trim old log records for a page. @@ -2056,7 +1916,7 @@ inline void page_recv_t::will_not_read() @param it page iterator @param start_lsn start LSN of the mini-transaction @param lsn @see mtr_t::commit_lsn() -@param recs redo log snippet @see log_t::FORMAT_10_5 +@param l redo log snippet @param len length of l, in bytes */ inline void recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn, const byte *l, size_t len) @@ -2105,7 +1965,7 @@ append: goto append; } recs.log.append(new (alloc(log_phys_t::alloc_size(len))) - log_phys_t(start_lsn, lsn, l, len)); + log_phys_t{start_lsn, lsn, l, len}); } /** Store/remove the freed pages in fil_name_t of recv_spaces. @@ -2139,83 +1999,319 @@ static void store_freed_or_init_rec(page_id_t page_id, bool freed) } } -/** Parse and register one mini-transaction in log_t::FORMAT_10_5. 
-@param checkpoint_lsn the log sequence number of the latest checkpoint -@param store whether to store the records -@param apply whether to apply file-level log records -@return whether FILE_CHECKPOINT record was seen the first time, -or corruption was noticed */ -bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t *store, bool apply) +/** Wrapper for log_sys.buf[] between recv_sys.offset and recv_sys.len */ +struct recv_buf +{ + const byte *ptr; + + constexpr recv_buf(const byte *ptr) : ptr(ptr) {} + constexpr bool operator==(const recv_buf other) const + { return ptr == other.ptr; } + + static const byte *end() { return &log_sys.buf[recv_sys.len]; } + + const char *get_filename(byte*, size_t) const noexcept + { return reinterpret_cast(ptr); } + + bool is_eof(size_t len= 0) const noexcept { return ptr + len >= end(); } + + byte operator*() const noexcept + { + ut_ad(ptr >= log_sys.buf); + ut_ad(ptr < end()); + return *ptr; + } + byte operator[](size_t size) const noexcept { return *(*this + size); } + recv_buf operator+(size_t len) const noexcept + { recv_buf r{*this}; return r+= len; } + recv_buf &operator++() noexcept { return *this+= 1; } + recv_buf &operator+=(size_t len) noexcept { ptr+= len; return *this; } + + size_t operator-(const recv_buf start) const noexcept + { + ut_ad(ptr >= start.ptr); + return size_t(ptr - start.ptr); + } + + uint32_t crc32c(const recv_buf start) const noexcept + { + return my_crc32c(0, start.ptr, ptr - start.ptr); + } + + void *memcpy(void *buf, size_t size) const noexcept + { + ut_ad(size); + ut_ad(!is_eof(size - 1)); + return ::memcpy(buf, ptr, size); + } + + bool is_zero(size_t size) const noexcept + { + ut_ad(!is_eof(size)); + return !memcmp(ptr, field_ref_zero, size); + } + + uint64_t read8() const noexcept + { ut_ad(!is_eof(7)); return mach_read_from_8(ptr); } + uint32_t read4() const noexcept + { ut_ad(!is_eof(3)); return mach_read_from_4(ptr); } + + /** Update the pointer if the new pointer is within the buffer. 
*/ + bool set_if_contains(const byte *pos) noexcept + { + if (pos > end() || pos < ptr) + return false; + ptr= pos; + return true; + } + + /** Get the contiguous, unencrypted buffer. + @param buf return value of copy_if_needed() + @param start start of the mini-transaction + @param decrypt_buf possibly, a copy of the mini-transaction + @return contiguous, non-encrypted buffer */ + const byte *get_buf(const byte *buf, const recv_buf start, + const byte *decrypt_buf) const noexcept + { return ptr == buf ? start.ptr : decrypt_buf; } + + /** Copy and decrypt a log record if needed. + @param iv initialization vector + @param tmp buffer for the decrypted log record + @param start un-encrypted start of the log record + @param len length of the possibly encrypted part, in bytes */ + const byte *copy_if_needed(const byte *iv, byte *tmp, recv_buf start, + size_t len) + { + ut_ad(*this - start + len <= srv_page_size); + if (!len || !log_sys.is_encrypted()) + return ptr; + const size_t s(*this - start); + start.memcpy(tmp, s); + return log_decrypt_buf(iv, tmp + s, ptr, static_cast(len)); + } +}; + +#ifdef HAVE_PMEM +/** Ring buffer wrapper for log_sys.buf[]; recv_sys.len == log_sys.file_size */ +struct recv_ring : public recv_buf +{ + constexpr recv_ring(const byte *ptr) : recv_buf(ptr) {} + + constexpr static bool is_eof() { return false; } + constexpr static bool is_eof(size_t) { return false; } + + byte operator*() const noexcept + { + ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]); + ut_ad(ptr < end()); + return *ptr; + } + byte operator[](size_t size) const noexcept { return *(*this + size); } + recv_ring operator+(size_t len) const noexcept + { recv_ring r{*this}; return r+= len; } + recv_ring &operator++() noexcept { return *this+= 1; } + recv_ring &operator+=(size_t len) noexcept + { + ut_ad(ptr < end()); + ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]); + ut_ad(len < recv_sys.MTR_SIZE_MAX * 2); + ptr+= len; + if (ptr >= end()) + { + ptr-= recv_sys.len - 
log_sys.START_OFFSET; + ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]); + ut_ad(ptr < end()); + } + return *this; + } + size_t operator-(const recv_ring start) const noexcept + { + auto s= ptr - start.ptr; + return s >= 0 + ? size_t(s) + : size_t(s + recv_sys.len - log_sys.START_OFFSET); + } + + uint32_t crc32c(const recv_ring start) const noexcept + { + return ptr >= start.ptr + ? my_crc32c(0, start.ptr, ptr - start.ptr) + : my_crc32c(my_crc32c(0, start.ptr, end() - start.ptr), + &log_sys.buf[log_sys.START_OFFSET], + ptr - &log_sys.buf[log_sys.START_OFFSET]); + } + + void *memcpy(void *buf, size_t size) const noexcept + { + ut_ad(size); + ut_ad(size < srv_page_size); + + auto s= ptr + size - end(); + if (s <= 0) + return ::memcpy(buf, ptr, size); + ::memcpy(buf, ptr, size - s); + ::memcpy(static_cast(buf) + size - s, + &log_sys.buf[log_sys.START_OFFSET], s); + return buf; + } + + bool is_zero(size_t size) const noexcept + { + auto s= ptr + size - end(); + if (s <= 0) + return !memcmp(ptr, field_ref_zero, size); + return !memcmp(ptr, field_ref_zero, size - s) && + !memcmp(&log_sys.buf[log_sys.START_OFFSET], field_ref_zero, s); + } + + uint64_t read8() const noexcept + { + if (UNIV_LIKELY(ptr + 8 <= end())) + return mach_read_from_8(ptr); + byte b[8]; + return mach_read_from_8(static_cast(memcpy(b, 8))); + } + uint32_t read4() const noexcept + { + if (UNIV_LIKELY(ptr + 4 <= end())) + return mach_read_from_4(ptr); + byte b[4]; + return mach_read_from_4(static_cast(memcpy(b, 4))); + } + + /** Get the contiguous, unencrypted buffer. + @param buf return value of copy_if_needed() + @param start start of the mini-transaction + @param decrypt_buf possibly, a copy of the mini-transaction + @return contiguous, non-encrypted buffer */ + const byte *get_buf(const byte *buf, const recv_ring start, + const byte *decrypt_buf) const noexcept + { return ptr == buf && start.ptr < ptr ? 
start.ptr : decrypt_buf; } + + const char *get_filename(byte* buf, size_t rlen) const noexcept + { + return UNIV_LIKELY(ptr + rlen <= end()) + ? reinterpret_cast(ptr) + : static_cast(memcpy(buf, rlen)); + } + + /** Copy and decrypt a log record if needed. + @param iv initialization vector + @param tmp buffer for the decrypted log record + @param start un-encrypted start of the log record + @param len length of the possibly encrypted part, in bytes */ + const byte *copy_if_needed(const byte *iv, byte *tmp, recv_ring start, + size_t len) + { + if (!len) + return ptr; + const size_t s(*this - start); + ut_ad(s + len <= srv_page_size); + if (!log_sys.is_encrypted()) + { + if (start.ptr + s == ptr && ptr + len <= end()) + return ptr; + start.memcpy(tmp, s + len); + return tmp + s; + } + + start.memcpy(tmp, s); + + const byte *b= ptr; + if (ptr + len > end()) + b= static_cast(memcpy(alloca(len), len)); + return log_decrypt_buf(iv, tmp + s, b, static_cast(len)); + } +}; +#endif + +/** Parse and register one log_t::FORMAT_10_8 mini-transaction. 
+@param store whether to store the records +@param l log data source */ +template +inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) + noexcept { mysql_mutex_assert_owner(&log_sys.mutex); mysql_mutex_assert_owner(&mutex); - ut_ad(parse_start_lsn); - ut_ad(log_sys.is_physical()); + ut_ad(log_sys.next_checkpoint_lsn); + ut_ad(log_sys.is_latest()); - bool last_phase= (*store == STORE_IF_EXISTS); - const byte *const end= buf + len; -loop: - const byte *const log= buf + recovered_offset; - const lsn_t start_lsn= recovered_lsn; - map::iterator cached_pages_it = pages.end(); + alignas(8) byte iv[MY_AES_BLOCK_SIZE]; + byte *decrypt_buf= static_cast(alloca(srv_page_size)); + + const lsn_t start_lsn{lsn}; + map::iterator cached_pages_it{pages.end()}; /* Check that the entire mini-transaction is included within the buffer */ - const byte *l; + if (l.is_eof(0)) + return PREMATURE_EOF; + + if (*l <= 1) + return GOT_EOF; /* We should never write an empty mini-transaction. */ + + const source begin{l}; uint32_t rlen; - for (l= log; l < end; l+= rlen) + for (uint32_t total_len= 0; !l.is_eof(); l+= rlen, total_len+= rlen) { - if (!*l) + if (total_len >= MTR_SIZE_MAX) + return GOT_EOF; + if (*l <= 1) goto eom_found; - if (UNIV_LIKELY((*l & 0x70) != RESERVED)); - else if (srv_force_recovery) - sql_print_warning("InnoDB: Ignoring unknown log record at LSN " LSN_PF, - recovered_lsn); - else - { -malformed: - sql_print_error("InnoDB: Malformed log record;" - " set innodb_force_recovery=1 to ignore."); -corrupted: - const size_t trailing_bytes= std::min(100, size_t(end - l)); - sql_print_information("InnoDB: Dump from the start of the" - " mini-transaction (LSN=" LSN_PF ") to %zu" - " bytes after the record:", - start_lsn, trailing_bytes); - ut_print_buf(stderr, log, l - log + trailing_bytes); - putc('\n', stderr); - found_corrupt_log= true; - return true; - } - rlen= *l++ & 0xf; - if (l + (rlen ? 
rlen : 16) >= end) - break; + rlen= *l & 0xf; + ++l; if (!rlen) { + if (l.is_eof(0)) + break; rlen= mlog_decode_varint_length(*l); - if (l + rlen >= end) + if (l.is_eof(rlen)) break; const uint32_t addlen= mlog_decode_varint(l); - if (UNIV_UNLIKELY(addlen == MLOG_DECODE_ERROR)) - { - sql_print_error("InnoDB: Corrupted record length"); - goto corrupted; - } + if (UNIV_UNLIKELY(addlen >= MTR_SIZE_MAX)) + return GOT_EOF; rlen= addlen + 15; } } /* Not the entire mini-transaction was present. */ - return false; + return PREMATURE_EOF; -eom_found: - ut_ad(!*l); - ut_d(const byte *const el= l + 1); + eom_found: + if (*l != log_sys.get_sequence_bit((l - begin) + lsn)) + return GOT_EOF; - const lsn_t end_lsn= recv_calc_lsn_on_data_add(start_lsn, l + 1 - log); - if (UNIV_UNLIKELY(end_lsn > scanned_lsn)) - /* The log record filled a log block, and we require that also the - next log block should have been scanned in */ - return false; + if (l.is_eof(4)) + return PREMATURE_EOF; + + uint32_t crc{l.crc32c(begin)}; + + if (log_sys.is_encrypted()) + { + if (l.is_eof(8 + 4)) + return PREMATURE_EOF; + (l + 1).memcpy(iv, 8); + l+= 8; + crc= my_crc32c(crc, iv, 8); + } + + DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch", + { + static int c; + if (!c++) + { + sql_print_information("Invalid log block checksum"); + return GOT_EOF; + } + }); + + if (crc != (l + 1).read4()) + return GOT_EOF; + + l+= 5; + ut_d(const source el{l}); + lsn+= l - begin; + offset= l.ptr - log_sys.buf; ut_d(std::set freed); #if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */ @@ -2230,58 +2326,83 @@ eom_found: uint32_t space_id= 0, page_no= 0, last_offset= 0; bool got_page_op= false; - for (l= log; l < end; l+= rlen) - { - const byte *const recs= l; - const byte b= *l++; - if (!b) + for (l= begin;; l+= rlen) + { + const source recs{l}; + ++l; + const byte b= *recs; + + if (b <= 1) break; - ut_ad(UNIV_LIKELY(b & 0x70) != RESERVED || srv_force_recovery); + + if (UNIV_LIKELY((b & 0x70) != RESERVED)); 
+ else if (srv_force_recovery) + sql_print_warning("InnoDB: Ignoring unknown log record at LSN " LSN_PF, + lsn); + else + { + sql_print_error("InnoDB: Unknown log record at LSN " LSN_PF, lsn); + corrupted: + found_corrupt_log= true; + pthread_cond_broadcast(&cond); + return GOT_EOF; + } + rlen= b & 0xf; - ut_ad(l + rlen < end); - ut_ad(rlen || l + 16 < end); if (!rlen) { const uint32_t lenlen= mlog_decode_varint_length(*l); - ut_ad(l + lenlen < end); const uint32_t addlen= mlog_decode_varint(l); ut_ad(addlen != MLOG_DECODE_ERROR); rlen= addlen + 15 - lenlen; l+= lenlen; } - ut_ad(l + rlen < end); + ut_ad(!l.is_eof(rlen)); + uint32_t idlen; if ((b & 0x80) && got_page_op) { /* This record is for the same page as the previous one. */ if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE)) { -record_corrupted: + record_corrupted: /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */ if (!srv_force_recovery) - goto malformed; + { + malformed: + sql_print_error("InnoDB: Malformed log record at LSN " LSN_PF + "; set innodb_force_recovery=1 to ignore.", lsn); + goto corrupted; + } sql_print_warning("InnoDB: Ignoring malformed log record at LSN " - LSN_PF, recovered_lsn); + LSN_PF, lsn); last_offset= 1; /* the next record must not be same_page */ continue; } + if (srv_operation == SRV_OPERATION_BACKUP) + continue; + DBUG_PRINT("ib_log", + ("scan " LSN_PF ": rec %x len %zu page %u:%u", + lsn, b, l - recs + rlen, space_id, page_no)); goto same_page; } last_offset= 0; idlen= mlog_decode_varint_length(*l); if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen)) { -page_id_corrupted: + if (!*l && b == FILE_CHECKPOINT + 1) + continue; + page_id_corrupted: if (!srv_force_recovery) { sql_print_error("InnoDB: Corrupted page identifier at " LSN_PF "; set innodb_force_recovery=1 to ignore the record.", - recovered_lsn); + lsn); goto corrupted; } sql_print_warning("InnoDB: Ignoring corrupted page identifier at LSN " - LSN_PF, recovered_lsn); + LSN_PF, lsn); continue; } space_id= mlog_decode_varint(l); @@ 
-2297,12 +2418,17 @@ page_id_corrupted: goto page_id_corrupted; l+= idlen; rlen-= idlen; - got_page_op = !(b & 0x80); - if (got_page_op && apply && !is_predefined_tablespace(space_id)) + mach_write_to_4(iv + 8, space_id); + mach_write_to_4(iv + 12, page_no); + got_page_op= !(b & 0x80); + if (!got_page_op); + else if (srv_operation == SRV_OPERATION_BACKUP) + continue; + else if (file_checkpoint && !is_predefined_tablespace(space_id)) { recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id); if (i != recv_spaces.end() && i->first == space_id); - else if (recovered_lsn < mlog_checkpoint_lsn) + else if (lsn < file_checkpoint) /* We have not seen all records between the checkpoint and FILE_CHECKPOINT. There should be a FILE_DELETE for this tablespace later. */ @@ -2313,23 +2439,25 @@ page_id_corrupted: if (!srv_force_recovery) { ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id - << " at " << recovered_lsn + << " at " << lsn << "; set innodb_force_recovery=1 to ignore the record."; goto corrupted; } - ib::warn() << "Ignoring record for " << id << " at " << recovered_lsn; + ib::warn() << "Ignoring record for " << id << " at " << lsn; continue; } } -same_page: DBUG_PRINT("ib_log", ("scan " LSN_PF ": rec %x len %zu page %u:%u", - recovered_lsn, b, static_cast(l + rlen - recs), - space_id, page_no)); - + lsn, b, l - recs + rlen, space_id, page_no)); if (got_page_op) { - const page_id_t id(space_id, page_no); + same_page: + const byte *cl= l.ptr; + if (!rlen); + else if (UNIV_UNLIKELY(l - recs + rlen > srv_page_size)) + goto record_corrupted; + const page_id_t id{space_id, page_no}; ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id)); ut_ad(freed.find(id) == freed.end()); switch (b & 0x70) { @@ -2347,7 +2475,8 @@ same_page: case EXTENDED: if (UNIV_UNLIKELY(!rlen)) goto record_corrupted; - if (rlen == 1 && *l == TRIM_PAGES) + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + if (rlen == 1 && *cl == TRIM_PAGES) { #if 0 /* For now, we can only truncate an 
undo log tablespace */ if (UNIV_UNLIKELY(!space_id || !page_no)) @@ -2359,7 +2488,7 @@ same_page: static_assert(UT_ARR_SIZE(truncated_undo_spaces) == TRX_SYS_MAX_UNDO_SPACES, "compatibility"); truncated_undo_spaces[space_id - srv_undo_space_id_start]= - { recovered_lsn, page_no }; + { lsn, page_no }; #endif last_offset= 1; /* the next record must not be same_page */ continue; @@ -2374,10 +2503,12 @@ same_page: case MEMSET: if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1)) goto record_corrupted; - const uint32_t olen= mlog_decode_varint_length(*l); + ut_d(const source payload{l}); + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + const uint32_t olen= mlog_decode_varint_length(*cl); if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) goto record_corrupted; - const uint32_t offset= mlog_decode_varint(l); + const uint32_t offset= mlog_decode_varint(cl); ut_ad(offset != MLOG_DECODE_ERROR); static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); if (UNIV_UNLIKELY(offset >= srv_page_size)) @@ -2385,13 +2516,13 @@ same_page: last_offset+= offset; if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size)) goto record_corrupted; - l+= olen; + cl+= olen; rlen-= olen; if ((b & 0x70) == WRITE) { if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size)) goto record_corrupted; - if (UNIV_UNLIKELY(!page_no) && apply) + if (UNIV_UNLIKELY(!page_no) && file_checkpoint) { const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE && last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4; @@ -2402,15 +2533,15 @@ same_page: { recv_spaces_t::iterator it= recv_spaces.find(space_id); const uint32_t size= has_size - ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + l - + ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + cl - last_offset) : 0; const uint32_t flags= has_flags - ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + l - + ? 
mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + cl - last_offset) : file_name_t::initial_flags; if (it == recv_spaces.end()) - ut_ad(!mlog_checkpoint_lsn || space_id == TRX_SYS_SPACE || + ut_ad(!file_checkpoint || space_id == TRX_SYS_SPACE || srv_is_undo_tablespace(space_id)); else if (!it->second.space) { @@ -2422,30 +2553,33 @@ same_page: fil_space_set_recv_size_and_flags(space_id, size, flags); } } + parsed_ok: last_offset+= rlen; + ut_ad(l == payload); + if (!l.set_if_contains(cl)) + (l= recs)+= cl - decrypt_buf; break; } - uint32_t llen= mlog_decode_varint_length(*l); + uint32_t llen= mlog_decode_varint_length(*cl); if (UNIV_UNLIKELY(llen > rlen || llen > 3)) goto record_corrupted; - const uint32_t len= mlog_decode_varint(l); + const uint32_t len= mlog_decode_varint(cl); ut_ad(len != MLOG_DECODE_ERROR); if (UNIV_UNLIKELY(last_offset + len > srv_page_size)) goto record_corrupted; - l+= llen; + cl+= llen; rlen-= llen; llen= len; if ((b & 0x70) == MEMSET) { if (UNIV_UNLIKELY(rlen > llen)) goto record_corrupted; - last_offset+= llen; - break; + goto parsed_ok; } - const uint32_t slen= mlog_decode_varint_length(*l); + const uint32_t slen= mlog_decode_varint_length(*cl); if (UNIV_UNLIKELY(slen != rlen || slen > 3)) goto record_corrupted; - uint32_t s= mlog_decode_varint(l); + uint32_t s= mlog_decode_varint(cl); ut_ad(slen != MLOG_DECODE_ERROR); if (s & 1) s= last_offset - (s >> 1) - 1; @@ -2453,8 +2587,7 @@ same_page: s= last_offset + (s >> 1) + 1; if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size)) goto record_corrupted; - last_offset+= llen; - break; + goto parsed_ok; } #if 0 && defined UNIV_DEBUG switch (b & 0x70) { @@ -2469,7 +2602,7 @@ same_page: } #endif const bool is_init= (b & 0x70) <= INIT_PAGE; - switch (*store) { + switch (store) { case STORE_IF_EXISTS: if (fil_space_t *space= fil_space_t::get(space_id)) { @@ -2484,10 +2617,11 @@ same_page: case STORE_YES: if (!mlog_init.will_avoid_read(id, start_lsn)) { - if (cached_pages_it == pages.end() || 
cached_pages_it->first != id) - cached_pages_it= pages.emplace(id, page_recv_t()).first; - add(cached_pages_it, start_lsn, end_lsn, recs, - static_cast(l + rlen - recs)); + if (cached_pages_it == pages.end() || + cached_pages_it->first != id) + cached_pages_it= pages.emplace(id, page_recv_t{}).first; + add(cached_pages_it, start_lsn, lsn, + l.get_buf(cl, recs, decrypt_buf), l - recs + rlen); } continue; case STORE_NO: @@ -2505,41 +2639,42 @@ same_page: { switch (b & 0xf0) { case FILE_CHECKPOINT: - if (space_id == 0 && page_no == 0 && rlen == 8) + if (space_id || page_no || l[rlen] > 1); + else if (rlen != 8) + { + if (rlen < UNIV_PAGE_SIZE_MAX && !l.is_zero(rlen)) + continue; + } + else if (const lsn_t c= l.read8()) { - const lsn_t lsn= mach_read_from_8(l); - if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n", - lsn, lsn != checkpoint_lsn - ? "ignored" - : mlog_checkpoint_lsn ? "reread" : "read", - recovered_lsn); + c, c != log_sys.next_checkpoint_lsn + ? "ignored" : file_checkpoint ? "reread" : "read", lsn); - DBUG_PRINT("ib_log", ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF, - lsn, lsn != checkpoint_lsn - ? "ignored" - : mlog_checkpoint_lsn ? "reread" : "read", - recovered_lsn)); + DBUG_PRINT("ib_log", + ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF, + c, c != log_sys.next_checkpoint_lsn + ? "ignored" : file_checkpoint ? "reread" : "read", lsn)); - if (lsn == checkpoint_lsn) + if (c == log_sys.next_checkpoint_lsn) { /* There can be multiple FILE_CHECKPOINT for the same LSN. 
*/ - if (mlog_checkpoint_lsn) + if (file_checkpoint) continue; - mlog_checkpoint_lsn= recovered_lsn; - l+= 8; - recovered_offset= l - buf; - return true; + file_checkpoint= lsn; + return GOT_EOF; } continue; } + else + continue; /* fall through */ default: if (!srv_force_recovery) goto malformed; sql_print_warning("InnoDB: Ignoring malformed log record at LSN " - LSN_PF, recovered_lsn); + LSN_PF, lsn); continue; case FILE_DELETE: case FILE_MODIFY: @@ -2555,7 +2690,7 @@ same_page: } sql_print_warning("InnoDB: Ignoring corrupted file-level record" - " at LSN " LSN_PF, recovered_lsn); + " at LSN " LSN_PF, lsn); continue; } /* fall through */ @@ -2564,7 +2699,8 @@ same_page: goto file_rec_error; /* There is no terminating NUL character. Names must end in .ibd. For FILE_RENAME, there is a NUL between the two file names. */ - const char * const fn= reinterpret_cast(l); + + const char * const fn= l.get_filename(decrypt_buf, rlen); const char *fn2= static_cast(memchr(fn, 0, rlen)); if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME))) @@ -2587,22 +2723,23 @@ same_page: if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4)) goto file_rec_error; - const char saved_end= fn[rlen]; - const_cast(fn[rlen])= '\0'; - fil_name_process(const_cast(fn), fnend - fn, space_id, + if (UNIV_UNLIKELY(!recv_needed_recovery && srv_read_only_mode)) + continue; + + fil_name_process(fn, fnend - fn, space_id, (b & 0xf0) == FILE_DELETE, start_lsn, store); if (fn2) - fil_name_process(const_cast(fn2), fn2end - fn2, space_id, + fil_name_process(fn2, fn2end - fn2, space_id, false, start_lsn, store); if ((b & 0xf0) < FILE_MODIFY && log_file_op) log_file_op(space_id, (b & 0xf0) == FILE_CREATE, - l, static_cast(fnend - fn), + reinterpret_cast(fn), + static_cast(fnend - fn), reinterpret_cast(fn2), fn2 ? 
static_cast(fn2end - fn2) : 0); - const_cast(fn[rlen])= saved_end; - if (fn2 && apply) + if (fn2 && file_checkpoint) { const size_t len= fn2end - fn2; auto r= renamed_spaces.emplace(space_id, std::string{fn2, len}); @@ -2610,21 +2747,43 @@ same_page: r.first->second= std::string{fn2, len}; } if (is_corrupt_fs()) - return true; + return GOT_EOF; } } + else if (b == FILE_CHECKPOINT + 2 && !space_id && !page_no); else goto malformed; } + l+= log_sys.is_encrypted() ? 4U + 8U : 4U; ut_ad(l == el); - recovered_offset= l - buf; - recovered_lsn= end_lsn; - if (is_memory_exhausted(store) && last_phase) - return false; - goto loop; + return OK; } +ATTRIBUTE_NOINLINE +recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(store_t store) noexcept +{ + recv_buf s{&log_sys.buf[recv_sys.offset]}; + return recv_sys.parse(store, s); +} + +#ifdef HAVE_PMEM +recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(store_t store) noexcept +{ + recv_sys_t::parse_mtr_result r{parse_mtr(store)}; + if (r != PREMATURE_EOF || !log_sys.is_pmem()) + return r; + ut_ad(recv_sys.len == log_sys.file_size); + ut_ad(recv_sys.offset >= log_sys.START_OFFSET); + ut_ad(recv_sys.offset <= recv_sys.len); + recv_ring s + {recv_sys.offset == recv_sys.len + ? &log_sys.buf[log_sys.START_OFFSET] + : &log_sys.buf[recv_sys.offset]}; + return recv_sys.parse(store, s); +} +#endif + /** Apply the hashed log records to the page, if the page lsn is less than the lsn of a log record. 
@param[in,out] block buffer pool page @@ -2645,7 +2804,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, ut_ad(block->page.id() == p->first); ut_ad(!p->second.is_being_processed()); ut_ad(!space || space->id == block->page.id().space()); - ut_ad(log_sys.is_physical()); + ut_ad(log_sys.is_latest()); if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { ib::info() << "Applying log to page " << block->page.id(); @@ -2676,7 +2835,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, const log_phys_t* l = static_cast(recv); ut_ad(l->lsn); ut_ad(end_lsn <= l->lsn); - ut_ad(l->lsn <= log_sys.log.scanned_lsn); + ut_ad(l->lsn <= log_sys.get_lsn()); ut_ad(l->start_lsn); ut_ad(recv_start_lsn <= l->start_lsn); @@ -2814,9 +2973,7 @@ set_start_lsn: } buf_block_modify_clock_inc(block); - mysql_mutex_lock(&log_sys.flush_order_mutex); buf_flush_note_modification(block, start_lsn, end_lsn); - mysql_mutex_unlock(&log_sys.flush_order_mutex); } else if (free_page && init) { /* There have been no operations that modify the page. Any buffered changes must not be merged. 
A subsequent @@ -2983,7 +3140,7 @@ inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id, ut_ad(recs.state == page_recv_t::RECV_WILL_NOT_READ); buf_block_t* block= nullptr; mlog_init_t::init &i= mlog_init.last(page_id); - const lsn_t end_lsn = recs.log.last()->lsn; + const lsn_t end_lsn= recs.log.last()->lsn; bool first_page= page_id.page_no() == 0; if (end_lsn < i.lsn) DBUG_LOG("ib_log", "skip log for page " << page_id @@ -3112,17 +3269,14 @@ void recv_sys_t::apply(bool last_batch) #ifdef SAFE_MUTEX DBUG_ASSERT(!last_batch == mysql_mutex_is_owner(&log_sys.mutex)); #endif /* SAFE_MUTEX */ - mysql_mutex_lock(&mutex); + mysql_mutex_assert_owner(&mutex); timespec abstime; while (apply_batch_on) { if (is_corrupt_log()) - { - mysql_mutex_unlock(&mutex); return; - } if (last_batch) { mysql_mutex_assert_not_owner(&log_sys.mutex); @@ -3146,11 +3300,11 @@ void recv_sys_t::apply(bool last_batch) if (!pages.empty()) { const char *msg= last_batch - ? "Starting final batch to recover " - : "Starting a batch to recover "; + ? 
"Starting final batch to recover" + : "Starting a batch to recover"; const size_t n= pages.size(); sql_print_information("InnoDB: %s %zu pages from redo log.", msg, n); - sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log", msg, n); + sd_notifyf(0, "STATUS=%s %zu pages from redo log", msg, n); apply_log_recs= true; apply_batch_on= true; @@ -3231,9 +3385,17 @@ next_page: else { mtr.commit(); + if (!last_batch) + { + const auto it= recv_spaces.find(space_id); + if (it != recv_spaces.end() && + it->second.status == file_name_t::DELETED) + goto erase_page; + } recv_read_in_area(page_id); break; } + erase_page: map::iterator r= p++; r->second.log.clear(); pages.erase(r); @@ -3280,7 +3442,6 @@ next_page: if (is_corrupt_fs() && !srv_force_recovery) sql_print_information("InnoDB: Set innodb_force_recovery=1" " to ignore corrupted pages."); - mysql_mutex_unlock(&mutex); return; } } @@ -3304,7 +3465,7 @@ next_page: { /* Instead of flushing, last_batch could sort the buf_pool.flush_list in ascending order of buf_page_t::oldest_modification. */ - buf_flush_sync_batch(recovered_lsn); + buf_flush_sync_batch(lsn); } if (!last_batch) @@ -3312,384 +3473,206 @@ next_page: buf_pool_invalidate(); mysql_mutex_lock(&log_sys.mutex); } +#ifdef HAVE_PMEM + else if (log_sys.is_pmem()) + mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE); +#endif mysql_mutex_lock(&mutex); ut_d(after_apply= true); clear(); - mysql_mutex_unlock(&mutex); } /** Check whether the number of read redo log blocks exceeds the maximum. -Store last_stored_lsn if the recovery is not in the last phase. 
-@param[in,out] store whether to store page operations @return whether the memory is exhausted */ -inline bool recv_sys_t::is_memory_exhausted(store_t *store) +inline bool recv_sys_t::is_memory_exhausted() { - if (*store == STORE_NO || - UT_LIST_GET_LEN(blocks) * 3 < buf_pool.get_n_pages()) + if (UT_LIST_GET_LEN(blocks) * 3 < buf_pool.get_n_pages()) return false; - if (*store == STORE_YES) - last_stored_lsn= recovered_lsn; - *store= STORE_NO; DBUG_PRINT("ib_log",("Ran out of memory and last stored lsn " LSN_PF - " last stored offset " ULINTPF "\n", - recovered_lsn, recovered_offset)); + " last stored offset %zu\n", lsn, offset)); return true; } -/** Adds data from a new log block to the parsing buffer of recv_sys if -recv_sys.parse_start_lsn is non-zero. -@param[in] log_block log block to add -@param[in] scanned_lsn lsn of how far we were able to find - data in this log block -@return true if more data added */ -bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn) -{ - ulint more_len; - ulint data_len; - ulint start_offset; - ulint end_offset; - - ut_ad(scanned_lsn >= recv_sys.scanned_lsn); - - if (!recv_sys.parse_start_lsn) { - /* Cannot start parsing yet because no start point for - it found */ - return(false); - } - - data_len = log_block_get_data_len(log_block); - - if (recv_sys.parse_start_lsn >= scanned_lsn) { - - return(false); - - } else if (recv_sys.scanned_lsn >= scanned_lsn) { - - return(false); - - } else if (recv_sys.parse_start_lsn > recv_sys.scanned_lsn) { - more_len = (ulint) (scanned_lsn - recv_sys.parse_start_lsn); - } else { - more_len = (ulint) (scanned_lsn - recv_sys.scanned_lsn); - } - - if (more_len == 0) { - return(false); - } - - ut_ad(data_len >= more_len); - - start_offset = data_len - more_len; - - if (start_offset < LOG_BLOCK_HDR_SIZE) { - start_offset = LOG_BLOCK_HDR_SIZE; - } - - end_offset = std::min(data_len, log_sys.trailer_offset()); - - ut_ad(start_offset <= end_offset); - - if (start_offset < end_offset) { 
- memcpy(recv_sys.buf + recv_sys.len, - log_block + start_offset, end_offset - start_offset); - - recv_sys.len += end_offset - start_offset; - - ut_a(recv_sys.len <= RECV_PARSING_BUF_SIZE); - } - - return(true); -} - -/** Moves the parsing buffer data left to the buffer start. */ -void recv_sys_justify_left_parsing_buf() -{ - memmove(recv_sys.buf, recv_sys.buf + recv_sys.recovered_offset, - recv_sys.len - recv_sys.recovered_offset); - - recv_sys.len -= recv_sys.recovered_offset; - - recv_sys.recovered_offset = 0; -} - -/** Scan redo log from a buffer and stores new log data to the parsing buffer. -Parse and hash the log records if new data found. -Apply log records automatically when the hash table becomes full. -@param[in,out] store whether the records should be - stored into recv_sys.pages; this is - reset if just debug checking is - needed, or when the num_max_blocks in - recv_sys runs out -@param[in] log_block log segment -@param[in] checkpoint_lsn latest checkpoint LSN -@param[in] start_lsn buffer start LSN -@param[in] end_lsn buffer end LSN -@param[in,out] contiguous_lsn it is known that all groups contain - contiguous log data upto this lsn -@param[out] group_scanned_lsn scanning succeeded upto this lsn -@return true if not able to scan any more in this log group */ -static bool recv_scan_log_recs( - store_t* store, - const byte* log_block, - lsn_t checkpoint_lsn, - lsn_t start_lsn, - lsn_t end_lsn, - lsn_t* contiguous_lsn, - lsn_t* group_scanned_lsn) -{ - lsn_t scanned_lsn = start_lsn; - bool finished = false; - ulint data_len; - bool more_data = false; - bool apply = recv_sys.mlog_checkpoint_lsn != 0; - ulint recv_parsing_buf_size = RECV_PARSING_BUF_SIZE; - const bool last_phase = (*store == STORE_IF_EXISTS); - ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_ad(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_ad(end_lsn >= start_lsn + OS_FILE_LOG_BLOCK_SIZE); - ut_ad(log_sys.is_physical()); - - const byte* const log_end = log_block - + ulint(end_lsn - 
start_lsn); - constexpr ulint sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT; - - do { - ut_ad(!finished); - - if (log_block_get_flush_bit(log_block)) { - /* This block was a start of a log flush operation: - we know that the previous flush operation must have - been completed for all log groups before this block - can have been flushed to any of the groups. Therefore, - we know that log data is contiguous up to scanned_lsn - in all non-corrupt log groups. */ - - if (scanned_lsn > *contiguous_lsn) { - *contiguous_lsn = scanned_lsn; - } - } - - data_len = log_block_get_data_len(log_block); - - if (scanned_lsn + data_len > recv_sys.scanned_lsn - && log_block_get_checkpoint_no(log_block) - < recv_sys.scanned_checkpoint_no - && (recv_sys.scanned_checkpoint_no - - log_block_get_checkpoint_no(log_block) - > 0x80000000UL)) { - - /* Garbage from a log buffer flush which was made - before the most recent database recovery */ - finished = true; - break; - } - - if (!recv_sys.parse_start_lsn - && (log_block_get_first_rec_group(log_block) > 0)) { - - /* We found a point from which to start the parsing - of log records */ - - recv_sys.parse_start_lsn = scanned_lsn - + log_block_get_first_rec_group(log_block); - recv_sys.scanned_lsn = recv_sys.parse_start_lsn; - recv_sys.recovered_lsn = recv_sys.parse_start_lsn; - } - - scanned_lsn += data_len; - - if (data_len == LOG_BLOCK_HDR_SIZE + sizeof_checkpoint - && scanned_lsn == checkpoint_lsn + sizeof_checkpoint - && log_block[LOG_BLOCK_HDR_SIZE] - == (FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2)) - && checkpoint_lsn == mach_read_from_8( - (LOG_BLOCK_HDR_SIZE + 1 + 2) - + log_block)) { - /* The redo log is logically empty. 
*/ - ut_ad(recv_sys.mlog_checkpoint_lsn == 0 - || recv_sys.mlog_checkpoint_lsn - == checkpoint_lsn); - recv_sys.mlog_checkpoint_lsn = checkpoint_lsn; - DBUG_PRINT("ib_log", ("found empty log; LSN=" LSN_PF, - scanned_lsn)); - finished = true; - break; - } - - if (scanned_lsn > recv_sys.scanned_lsn) { - ut_ad(!srv_log_file_created); - if (!recv_needed_recovery) { - recv_needed_recovery = true; - - if (srv_read_only_mode) { - sql_print_warning( - "InnoDB: innodb_read_only" - " prevents crash recovery"); - return(true); - } - - sql_print_information("InnoDB: Starting" - " crash recovery from" - " checkpoint LSN=" LSN_PF - "," LSN_PF, - checkpoint_lsn, - recv_sys.scanned_lsn); - } - - /* We were able to find more log data: add it to the - parsing buffer if parse_start_lsn is already - non-zero */ - - DBUG_EXECUTE_IF( - "reduce_recv_parsing_buf", - recv_parsing_buf_size = RECV_SCAN_SIZE * 2; - ); - - if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE - >= recv_parsing_buf_size) { - sql_print_error("InnoDB: Log parsing buffer" - " overflow." 
- " Recovery may have failed!"); - - recv_sys.set_corrupt_log(); - - if (!srv_force_recovery) { - sql_print_information( - "InnoDB: Set" - " innodb_force_recovery" - " to ignore this error."); - return(true); - } - } else if (!recv_sys.is_corrupt_log()) { - more_data = recv_sys_add_to_parsing_buf( - log_block, scanned_lsn); - } - - recv_sys.scanned_lsn = scanned_lsn; - recv_sys.scanned_checkpoint_no - = log_block_get_checkpoint_no(log_block); - } - - /* During last phase of scanning, there can be redo logs - left in recv_sys.buf to parse & store it in recv_sys.heap */ - if (last_phase - && recv_sys.recovered_lsn < recv_sys.scanned_lsn) { - more_data = true; - } - - if (data_len < OS_FILE_LOG_BLOCK_SIZE) { - /* Log data for this group ends here */ - finished = true; - break; - } else { - log_block += OS_FILE_LOG_BLOCK_SIZE; - } - } while (log_block < log_end); - - *group_scanned_lsn = scanned_lsn; - - mysql_mutex_lock(&recv_sys.mutex); - - if (more_data && !recv_sys.is_corrupt_log()) { - /* Try to parse more log records */ - if (recv_sys.parse(checkpoint_lsn, store, apply)) { - ut_ad(recv_sys.is_corrupt_log() - || recv_sys.is_corrupt_fs() - || recv_sys.mlog_checkpoint_lsn - == recv_sys.recovered_lsn); - finished = true; - goto func_exit; - } - - recv_sys.is_memory_exhausted(store); - - if (recv_sys.recovered_offset > recv_parsing_buf_size / 4 - || (recv_sys.recovered_offset - && recv_sys.len - >= recv_parsing_buf_size - RECV_SCAN_SIZE)) { - /* Move parsing buffer data to the buffer start */ - recv_sys_justify_left_parsing_buf(); - } - - /* Need to re-parse the redo log which're stored - in recv_sys.buf */ - if (last_phase && *store == STORE_NO) { - finished = false; - } - } - -func_exit: - recv_sys.maybe_finish_batch(); - mysql_mutex_unlock(&recv_sys.mutex); - return(finished); -} - -/** Scans log from a buffer and stores new log data to the parsing buffer. -Parses and hashes the log records if new data found. 
-@param[in] checkpoint_lsn latest checkpoint log sequence number -@param[in,out] contiguous_lsn log sequence number -until which all redo log has been scanned -@param[in] last_phase whether changes -can be applied to the tablespaces +/** Scan log_t::FORMAT_10_8 log store records to the parsing buffer. +@param last_phase whether changes can be applied to the tablespaces @return whether rescan is needed (not everything was stored) */ -static -bool -recv_group_scan_log_recs( - lsn_t checkpoint_lsn, - lsn_t* contiguous_lsn, - bool last_phase) +static bool recv_scan_log(bool last_phase) { - DBUG_ENTER("recv_group_scan_log_recs"); - DBUG_ASSERT(!last_phase || recv_sys.mlog_checkpoint_lsn > 0); + DBUG_ENTER("recv_scan_log"); + DBUG_ASSERT(!last_phase || recv_sys.file_checkpoint); - mysql_mutex_lock(&recv_sys.mutex); - recv_sys.len = 0; - recv_sys.recovered_offset = 0; - recv_sys.clear(); - recv_sys.parse_start_lsn = *contiguous_lsn; - recv_sys.scanned_lsn = *contiguous_lsn; - recv_sys.recovered_lsn = *contiguous_lsn; - recv_sys.scanned_checkpoint_no = 0; - ut_ad(recv_max_page_lsn == 0); - mysql_mutex_unlock(&recv_sys.mutex); + ut_ad(log_sys.is_latest()); + const size_t block_size_1{log_sys.get_block_size() - 1}; - lsn_t start_lsn; - lsn_t end_lsn; - store_t store = recv_sys.mlog_checkpoint_lsn == 0 - ? STORE_NO : (last_phase ? STORE_IF_EXISTS : STORE_YES); + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.clear(); + ut_d(recv_sys.after_apply= last_phase); + ut_ad(!last_phase || recv_sys.file_checkpoint); - log_sys.log.scanned_lsn = end_lsn = *contiguous_lsn = - ut_uint64_align_down(*contiguous_lsn, OS_FILE_LOG_BLOCK_SIZE); - ut_d(recv_sys.after_apply = last_phase); + store_t store= last_phase + ? STORE_IF_EXISTS : recv_sys.file_checkpoint ? 
STORE_YES : STORE_NO; + size_t buf_size= log_sys.buf_size; +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + { + recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); + buf_size= size_t(log_sys.file_size); + recv_sys.len= size_t(log_sys.file_size); + } + else +#endif + { + recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & + block_size_1; + recv_sys.len= 0; + } - do { - if (last_phase && store == STORE_NO) { - store = STORE_IF_EXISTS; - recv_sys.apply(false); - /* Rescan the redo logs from last stored lsn */ - end_lsn = recv_sys.recovered_lsn; - } + for (ut_d(lsn_t source_offset= 0);;) + { + mysql_mutex_assert_owner(&log_sys.mutex); +#ifdef UNIV_DEBUG + const bool wrap{source_offset + recv_sys.len == log_sys.file_size}; +#endif + if (size_t size= buf_size - recv_sys.len) + { +#ifndef UNIV_DEBUG + lsn_t +#endif + source_offset= + log_sys.calc_lsn_offset(recv_sys.lsn + recv_sys.len - recv_sys.offset); + ut_ad(!wrap || source_offset == log_t::START_OFFSET); + source_offset&= ~block_size_1; - start_lsn = ut_uint64_align_down(end_lsn, - OS_FILE_LOG_BLOCK_SIZE); - end_lsn = start_lsn; - log_sys.log.read_log_seg(&end_lsn, start_lsn + RECV_SCAN_SIZE); - } while (end_lsn != start_lsn - && !recv_scan_log_recs(&store, log_sys.buf, checkpoint_lsn, - start_lsn, end_lsn, contiguous_lsn, - &log_sys.log.scanned_lsn)); + if (source_offset + size > log_sys.file_size) + size= static_cast(log_sys.file_size - source_offset); - if (recv_sys.is_corrupt_log() || recv_sys.is_corrupt_fs()) { - DBUG_RETURN(false); - } + log_sys.n_log_ios++; + log_sys.log.read(source_offset, {log_sys.buf + recv_sys.len, size}); + recv_sys.len+= size; + } - DBUG_PRINT("ib_log", ("%s " LSN_PF " completed", - last_phase ? 
"rescan" : "scan", - log_sys.log.scanned_lsn)); + if (recv_sys.report(time(nullptr))) + { + sql_print_information("InnoDB: Read redo log up to LSN=" LSN_PF, + recv_sys.lsn); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Read redo log up to LSN=" LSN_PF, + recv_sys.lsn); + } - DBUG_RETURN(store == STORE_NO); + recv_sys_t::parse_mtr_result r; + + if (UNIV_UNLIKELY(!recv_needed_recovery)) + { + ut_ad(store == (recv_sys.file_checkpoint ? STORE_YES : STORE_NO)); + ut_ad(recv_sys.lsn >= log_sys.next_checkpoint_lsn); + + for (;;) + { + const byte b{log_sys.buf[recv_sys.offset]}; + r= recv_sys.parse_pmem(store); + if (r == recv_sys_t::OK) + { + if (store == STORE_NO && + (b == FILE_CHECKPOINT + 2 + 8 || (b & 0xf0) == FILE_MODIFY)) + continue; + } + else if (r == recv_sys_t::PREMATURE_EOF) + goto read_more; + else if (store != STORE_NO) + break; + + if (store == STORE_NO) + { + const lsn_t end{recv_sys.file_checkpoint}; + mysql_mutex_unlock(&recv_sys.mutex); + + if (!end) + { + recv_sys.set_corrupt_log(); + sql_print_error("InnoDB: Missing FILE_CHECKPOINT(" LSN_PF + ") at " LSN_PF, log_sys.next_checkpoint_lsn, + recv_sys.lsn); + } + else + ut_ad(end == recv_sys.lsn); + DBUG_RETURN(true); + } + + recv_needed_recovery= true; + if (srv_read_only_mode) + { + mysql_mutex_unlock(&recv_sys.mutex); + DBUG_RETURN(false); + } + sql_print_information("InnoDB: Starting crash recovery from" + " checkpoint LSN=" LSN_PF, + log_sys.next_checkpoint_lsn); + break; + } + } + + while ((r= recv_sys.parse_pmem(store)) == recv_sys_t::OK) + { + if (store != STORE_NO && recv_sys.is_memory_exhausted()) + { + ut_ad(last_phase == (store == STORE_IF_EXISTS)); + if (store == STORE_YES) + { + store= STORE_NO; + recv_sys.last_stored_lsn= recv_sys.lsn; + } + else + { + ut_ad(store == STORE_IF_EXISTS); + log_sys.set_recovered_lsn(recv_sys.lsn); + recv_sys.apply(false); + } + } + } + + if (r != recv_sys_t::PREMATURE_EOF) + { + ut_ad(r == recv_sys_t::GOT_EOF); + break; + } + + read_more: 
+#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + break; +#endif + if (recv_sys.offset < log_sys.get_block_size()) + break; + + if (recv_sys.offset > buf_size / 4 || + (recv_sys.offset > block_size_1 && + recv_sys.len >= buf_size - recv_sys.MTR_SIZE_MAX)) + { + const size_t ofs{recv_sys.offset & ~block_size_1}; + memmove_aligned<64>(log_sys.buf, log_sys.buf + ofs, recv_sys.len - ofs); + recv_sys.len-= ofs; + recv_sys.offset&= block_size_1; + } + } + + const bool corrupt= recv_sys.is_corrupt_log() || recv_sys.is_corrupt_fs(); + recv_sys.maybe_finish_batch(); + if (last_phase) + log_sys.set_recovered_lsn(recv_sys.lsn); + mysql_mutex_unlock(&recv_sys.mutex); + + if (corrupt) + DBUG_RETURN(false); + + DBUG_PRINT("ib_log", + ("%s " LSN_PF " completed", last_phase ? "rescan" : "scan", + recv_sys.lsn)); + ut_ad(!last_phase || recv_sys.lsn >= recv_sys.file_checkpoint); + + DBUG_RETURN(store == STORE_NO); } /** Report a missing tablespace for which page-redo log exists. @@ -3978,19 +3961,11 @@ done: } /** Start recovering from a redo log checkpoint. 
-@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first system tablespace page @return error code or DB_SUCCESS */ -dberr_t -recv_recovery_from_checkpoint_start(lsn_t flush_lsn) +dberr_t recv_recovery_from_checkpoint_start() { - ulint max_cp_field; - lsn_t checkpoint_lsn; bool rescan = false; - ib_uint64_t checkpoint_no; - lsn_t contiguous_lsn; - byte* buf; - dberr_t err = DB_SUCCESS; ut_ad(srv_operation == SRV_OPERATION_NORMAL || srv_operation == SRV_OPERATION_RESTORE @@ -4010,146 +3985,67 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn) mysql_mutex_lock(&log_sys.mutex); - err = recv_find_max_checkpoint(&max_cp_field); - - if (err != DB_SUCCESS) { - recv_sys.recovered_lsn = log_sys.get_lsn(); + dberr_t err = recv_sys.find_checkpoint(); + if (err != DB_SUCCESS) { +early_exit: mysql_mutex_unlock(&log_sys.mutex); - return(err); + return err; } - buf = log_sys.checkpoint_buf; - log_sys.log.read(max_cp_field, {buf, OS_FILE_LOG_BLOCK_SIZE}); - - checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN); - checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO); + if (!log_set_capacity(srv_log_file_size)) { +err_exit: + err = DB_ERROR; + goto early_exit; + } /* Start reading the log from the checkpoint lsn. The variable contiguous_lsn contains an lsn up to which the log is known to be contiguously written. 
*/ - recv_sys.mlog_checkpoint_lsn = 0; - - ut_ad(RECV_SCAN_SIZE <= srv_log_buffer_size); - - const lsn_t end_lsn = mach_read_from_8( - buf + LOG_CHECKPOINT_END_LSN); ut_ad(recv_sys.pages.empty()); - contiguous_lsn = checkpoint_lsn; - switch (log_sys.log.format) { - case 0: - mysql_mutex_unlock(&log_sys.mutex); - return DB_SUCCESS; - default: - if (end_lsn == 0) { - break; + + if (log_sys.format == log_t::FORMAT_3_23) { + goto early_exit; + } + + if (log_sys.is_latest()) { + const bool rewind = recv_sys.lsn + != log_sys.next_checkpoint_lsn; + log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; + + recv_scan_log(false); + if (recv_needed_recovery) { +read_only_recovery: + sql_print_warning("InnoDB: innodb_read_only" + " prevents crash recovery"); + err = DB_READ_ONLY; + goto early_exit; } - if (end_lsn >= checkpoint_lsn) { - contiguous_lsn = end_lsn; - break; + if (recv_sys.is_corrupt_log()) { + sql_print_error("InnoDB: Log scan aborted at LSN " + LSN_PF, recv_sys.lsn); + goto err_exit; } - recv_sys.set_corrupt_log(); - mysql_mutex_unlock(&log_sys.mutex); - return(DB_ERROR); - } - - size_t sizeof_checkpoint; - - if (!log_sys.is_physical()) { - sizeof_checkpoint = 9/* size of MLOG_CHECKPOINT */; - goto completed; - } - - /* Look for FILE_CHECKPOINT. */ - recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false); - /* The first scan should not have stored or applied any records. 
*/ - ut_ad(recv_sys.pages.empty()); - ut_ad(!recv_sys.is_corrupt_fs()); - - if (srv_read_only_mode && recv_needed_recovery) { - mysql_mutex_unlock(&log_sys.mutex); - return(DB_READ_ONLY); - } - - if (recv_sys.is_corrupt_log() && !srv_force_recovery) { - mysql_mutex_unlock(&log_sys.mutex); - sql_print_warning("InnoDB: Log scan aborted at LSN " LSN_PF, - contiguous_lsn); - return(DB_ERROR); - } - - if (recv_sys.mlog_checkpoint_lsn == 0) { - lsn_t scan_lsn = log_sys.log.scanned_lsn; - if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) { - mysql_mutex_unlock(&log_sys.mutex); - sql_print_error("InnoDB: Missing FILE_CHECKPOINT" - " at " LSN_PF - " between the checkpoint " LSN_PF - " and the end " LSN_PF ".", - end_lsn, checkpoint_lsn, scan_lsn); - return(DB_ERROR); + ut_ad(recv_sys.file_checkpoint); + if (rewind) { + recv_sys.lsn = log_sys.next_checkpoint_lsn; + recv_sys.offset = 0; + recv_sys.len = 0; } + ut_ad(!recv_max_page_lsn); + rescan = recv_scan_log(false); - log_sys.log.scanned_lsn = checkpoint_lsn; - } else { - contiguous_lsn = checkpoint_lsn; - rescan = recv_group_scan_log_recs( - checkpoint_lsn, &contiguous_lsn, false); + if (srv_read_only_mode && recv_needed_recovery) { + goto read_only_recovery; + } if ((recv_sys.is_corrupt_log() && !srv_force_recovery) || recv_sys.is_corrupt_fs()) { - mysql_mutex_unlock(&log_sys.mutex); - return(DB_ERROR); + goto err_exit; } } - /* NOTE: we always do a 'recovery' at startup, but only if - there is something wrong we will print a message to the - user about recovery: */ - sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT; - -completed: - if (flush_lsn == checkpoint_lsn + sizeof_checkpoint - && recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) { - /* The redo log is logically empty. */ - } else if (checkpoint_lsn != flush_lsn) { - ut_ad(!srv_log_file_created); - - if (checkpoint_lsn + sizeof_checkpoint < flush_lsn) { - sql_print_warning("InnoDB: Are you sure you are using" - " the right ib_logfile0" - " to start up the database?" 
- " The checkpoint is " LSN_PF - ", less than the" - " log sequence number " LSN_PF - " in the system tablespace.", - checkpoint_lsn, flush_lsn); - } - - if (!recv_needed_recovery) { - sql_print_information( - "InnoDB: The log sequence number " LSN_PF - " in the system tablespace does not match" - " the log checkpoint " LSN_PF - " in ib_logfile0!", flush_lsn, checkpoint_lsn); - - if (srv_read_only_mode) { - sql_print_error("InnoDB: innodb_read_only" - " prevents crash recovery"); - mysql_mutex_unlock(&log_sys.mutex); - return(DB_READ_ONLY); - } - - recv_needed_recovery = true; - } - } - - log_sys.set_lsn(recv_sys.recovered_lsn); - if (UNIV_LIKELY(log_sys.get_flushed_lsn() < recv_sys.recovered_lsn)) { - /* This may already have been set by create_log_file() - if no logs existed when the server started up. */ - log_sys.set_flushed_lsn(recv_sys.recovered_lsn); - } + log_sys.set_recovered_lsn(recv_sys.lsn); if (recv_needed_recovery) { bool missing_tablespace = false; @@ -4158,8 +4054,7 @@ completed: rescan, missing_tablespace); if (err != DB_SUCCESS) { - mysql_mutex_unlock(&log_sys.mutex); - return(err); + goto early_exit; } /* If there is any missing tablespace and rescan is needed @@ -4169,34 +4064,30 @@ completed: ut_ad(rescan || !missing_tablespace); while (missing_tablespace) { + recv_sys.lsn = recv_sys.last_stored_lsn; DBUG_PRINT("ib_log", ("Rescan of redo log to validate " "the missing tablespace. Scan " "from last stored LSN " LSN_PF, - recv_sys.last_stored_lsn)); - - lsn_t recent_stored_lsn = recv_sys.last_stored_lsn; - rescan = recv_group_scan_log_recs( - checkpoint_lsn, &recent_stored_lsn, false); - + recv_sys.lsn)); + rescan = recv_scan_log(false); ut_ad(!recv_sys.is_corrupt_fs()); missing_tablespace = false; - err = recv_sys.is_corrupt_log() - ? 
DB_ERROR - : recv_validate_tablespace( - rescan, missing_tablespace); + if (recv_sys.is_corrupt_log()) { + goto err_exit; + } + + err = recv_validate_tablespace( + rescan, missing_tablespace); if (err != DB_SUCCESS) { - mysql_mutex_unlock(&log_sys.mutex); - return err; + goto early_exit; } rescan = true; } - recv_sys.parse_start_lsn = checkpoint_lsn; - if (srv_operation == SRV_OPERATION_NORMAL) { deferred_spaces.deferred_dblwr(); buf_dblwr.recover(); @@ -4205,69 +4096,63 @@ completed: ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN); if (rescan) { - contiguous_lsn = checkpoint_lsn; - - recv_group_scan_log_recs( - checkpoint_lsn, &contiguous_lsn, true); - + recv_sys.lsn = log_sys.next_checkpoint_lsn; + rescan = recv_scan_log(true); if ((recv_sys.is_corrupt_log() && !srv_force_recovery) || recv_sys.is_corrupt_fs()) { - mysql_mutex_unlock(&log_sys.mutex); - return(DB_ERROR); + goto err_exit; } } } else { - ut_ad(!rescan || recv_sys.pages.empty()); + ut_ad(recv_sys.pages.empty()); } - if (log_sys.is_physical() - && (log_sys.log.scanned_lsn < checkpoint_lsn - || log_sys.log.scanned_lsn < recv_max_page_lsn)) { + if (log_sys.is_latest() + && (recv_sys.lsn < log_sys.next_checkpoint_lsn + || recv_sys.lsn < recv_max_page_lsn)) { sql_print_error("InnoDB: We scanned the log up to " LSN_PF "." " A checkpoint was at " LSN_PF " and the maximum LSN on a database page was " LSN_PF ". 
It is possible that the" " database is now corrupt!", - log_sys.log.scanned_lsn, checkpoint_lsn, + recv_sys.lsn, + log_sys.next_checkpoint_lsn, recv_max_page_lsn); } - if (recv_sys.recovered_lsn < checkpoint_lsn) { - mysql_mutex_unlock(&log_sys.mutex); - sql_print_error("InnoDB: Recovered only to lsn: " LSN_PF - " checkpoint_lsn: " LSN_PF, - recv_sys.recovered_lsn, checkpoint_lsn); - return(DB_ERROR); + if (recv_sys.lsn < log_sys.next_checkpoint_lsn) { + goto err_exit; } - log_sys.next_checkpoint_lsn = checkpoint_lsn; - log_sys.next_checkpoint_no = checkpoint_no + 1; - - recv_synchronize_groups(); - - ut_ad(recv_needed_recovery - || checkpoint_lsn == recv_sys.recovered_lsn); - - log_sys.write_lsn = log_sys.get_lsn(); - log_sys.buf_free = log_sys.write_lsn % OS_FILE_LOG_BLOCK_SIZE; - log_sys.buf_next_to_write = log_sys.buf_free; - - log_sys.last_checkpoint_lsn = checkpoint_lsn; - - if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL - && (~log_t::FORMAT_ENCRYPTED & log_sys.log.format) - == log_t::FORMAT_10_5) { - /* Write a FILE_CHECKPOINT marker as the first thing, - before generating any other redo log. This ensures - that subsequent crash recovery will be possible even - if the server were killed soon after this. 
*/ - fil_names_clear(log_sys.last_checkpoint_lsn, true); + if (!srv_read_only_mode && log_sys.is_latest()) { + ut_ad(log_sys.get_flushed_lsn() == log_sys.get_lsn()); + ut_ad(recv_sys.lsn == log_sys.get_lsn()); + if (!log_sys.is_pmem()) { + const size_t bs_1{log_sys.get_block_size() - 1}; + const size_t ro{recv_sys.offset}; + recv_sys.offset &= bs_1; + memmove_aligned<64>(log_sys.buf, + log_sys.buf + (ro & ~bs_1), + log_sys.get_block_size()); +#ifdef HAVE_PMEM + } else { + mprotect(log_sys.buf, size_t(log_sys.file_size), + PROT_READ | PROT_WRITE); +#endif + } + log_sys.buf_free = recv_sys.offset; + if (recv_needed_recovery + && srv_operation == SRV_OPERATION_NORMAL) { + /* Write a FILE_CHECKPOINT marker as the first thing, + before generating any other redo log. This ensures + that subsequent crash recovery will be possible even + if the server were killed soon after this. */ + fil_names_clear(log_sys.next_checkpoint_lsn); + } } - log_sys.next_checkpoint_no = ++checkpoint_no; - mysql_mutex_lock(&recv_sys.mutex); recv_sys.apply_log_recs = true; recv_no_ibuf_operations = false; diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 03f133d9fc3..d483e3fb566 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -24,18 +24,15 @@ Mini-transaction buffer Created 11/26/1995 Heikki Tuuri *******************************************************/ -#include "mtr0mtr.h" - +#include "mtr0log.h" #include "buf0buf.h" #include "buf0flu.h" -#include "fsp0sysspace.h" #include "page0types.h" -#include "mtr0log.h" -#include "log0recv.h" -#include "my_cpu.h" +#include "log0crypt.h" #ifdef BTR_CUR_HASH_ADAPT # include "btr0sea.h" #endif +#include "log.h" /** Iterate over a memo block in reverse. 
*/ template @@ -404,7 +401,7 @@ void mtr_t::commit() std::pair lsns; - if (const ulint len= prepare_write()) + if (const auto len= prepare_write()) lsns= finish_write(len); else lsns= { m_commit_lsn, PAGE_FLUSH_NO }; @@ -412,9 +409,9 @@ void mtr_t::commit() if (m_made_dirty) mysql_mutex_lock(&log_sys.flush_order_mutex); - /* It is now safe to release the log mutex because the - flush_order mutex will ensure that we are the first one - to insert into the flush list. */ + /* It is now safe to release log_sys.mutex because the + buf_pool.flush_order_mutex will ensure that we are the first one + to insert into buf_pool.flush_list. */ mysql_mutex_unlock(&log_sys.mutex); if (m_freed_pages) @@ -449,9 +446,6 @@ void mtr_t::commit() if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); - - if (m_made_dirty) - srv_stats.log_write_requests.inc(); } else m_memo.for_each_block_in_reverse(CIterate()); @@ -566,7 +560,6 @@ void mtr_t::commit_shrink(fil_space_t &space) mysql_mutex_unlock(&fil_system.mutex); m_memo.for_each_block_in_reverse(CIterate()); - srv_stats.log_write_requests.inc(); release_resources(); } @@ -576,39 +569,53 @@ but generated some redo log on a higher level, such as FILE_MODIFY records and an optional FILE_CHECKPOINT marker. The caller must hold log_sys.mutex. This is to be used at log_checkpoint(). 
-@param[in] checkpoint_lsn log checkpoint LSN, or 0 */ -void mtr_t::commit_files(lsn_t checkpoint_lsn) +@param checkpoint_lsn the log sequence number of a checkpoint, or 0 +@return current LSN */ +lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn) { - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(is_active()); - ut_ad(!is_inside_ibuf()); - ut_ad(m_log_mode == MTR_LOG_ALL); - ut_ad(!m_made_dirty); - ut_ad(m_memo.size() == 0); - ut_ad(!srv_read_only_mode); - ut_ad(!m_freed_space); - ut_ad(!m_freed_pages); + mysql_mutex_assert_owner(&log_sys.mutex); + ut_ad(is_active()); + ut_ad(!is_inside_ibuf()); + ut_ad(m_log_mode == MTR_LOG_ALL); + ut_ad(!m_made_dirty); + ut_ad(m_memo.size() == 0); + ut_ad(!srv_read_only_mode); + ut_ad(!m_freed_space); + ut_ad(!m_freed_pages); + ut_ad(!m_user_space); - if (checkpoint_lsn) { - byte* ptr = m_log.push(SIZE_OF_FILE_CHECKPOINT); - compile_time_assert(SIZE_OF_FILE_CHECKPOINT == 3 + 8 + 1); - *ptr = FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2); - ::memset(ptr + 1, 0, 2); - mach_write_to_8(ptr + 3, checkpoint_lsn); - ptr[3 + 8] = 0; - } else { - *m_log.push(1) = 0; - } + if (checkpoint_lsn) + { + byte *ptr= m_log.push(3 + 8); + *ptr= FILE_CHECKPOINT | (2 + 8); + ::memset(ptr + 1, 0, 2); + mach_write_to_8(ptr + 3, checkpoint_lsn); + } - finish_write(m_log.size()); - srv_stats.log_write_requests.inc(); - release_resources(); + size_t size= m_log.size() + 5; - if (checkpoint_lsn) { - DBUG_PRINT("ib_log", - ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF, - checkpoint_lsn, log_sys.get_lsn())); - } + if (log_sys.is_encrypted()) + { + /* We will not encrypt any FILE_ records, but we will reserve + a nonce at the end. 
*/ + size+= 8; + m_commit_lsn= log_sys.get_lsn(); + } + else + m_commit_lsn= 0; + + m_crc= 0; + m_log.for_each_block([this](const mtr_buf_t::block_t *b) + { m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; }); + finish_write(size); + release_resources(); + + if (checkpoint_lsn) + DBUG_PRINT("ib_log", + ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF, + checkpoint_lsn, m_commit_lsn)); + + return m_commit_lsn; } #ifdef UNIV_DEBUG @@ -742,32 +749,32 @@ static time_t log_margin_warn_time; static bool log_close_warned; static time_t log_close_warn_time; -/** Check margin not to overwrite transaction log from the last checkpoint. -If would estimate the log write to exceed the log_capacity, -waits for the checkpoint is done enough. -@param len length of the data to be written */ -static void log_margin_checkpoint_age(ulint len) +/** Display a warning that the log tail is overwriting the head, +making the server crash-unsafe. */ +ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t age, lsn_t capacity) { - const ulint framing_size= log_sys.framing_size(); - /* actual length stored per block */ - const ulint len_per_blk= OS_FILE_LOG_BLOCK_SIZE - framing_size; + time_t t= time(nullptr); + if (!log_close_warned || difftime(t, log_close_warn_time) > 15) + { + log_close_warned= true; + log_close_warn_time= t; - /* actual data length in last block already written */ - ulint extra_len= log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; + sql_print_error("InnoDB: The age of the last checkpoint is " LSN_PF + ", which exceeds the log capacity " LSN_PF ".", + age, capacity); + } +} - ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE); - extra_len-= LOG_BLOCK_HDR_SIZE; +/** Reserve space in the log buffer for appending data. 
+@param size upper limit of the length of the data to append(), in bytes +@return the current LSN */ +inline lsn_t log_t::append_prepare(size_t size) noexcept +{ + mysql_mutex_assert_owner(&mutex); - /* total extra length for block header and trailer */ - extra_len= ((len + extra_len) / len_per_blk) * framing_size; + lsn_t lsn= get_lsn(); - const ulint margin= len + extra_len; - - mysql_mutex_assert_owner(&log_sys.mutex); - - const lsn_t lsn= log_sys.get_lsn(); - - if (UNIV_UNLIKELY(margin > log_sys.log_capacity)) + if (UNIV_UNLIKELY(size > log_capacity)) { time_t t= time(nullptr); @@ -777,143 +784,63 @@ static void log_margin_checkpoint_age(ulint len) log_margin_warned= true; log_margin_warn_time= t; - ib::error() << "innodb_log_file_size is too small " - "for mini-transaction size " << len; + sql_print_error("InnoDB: innodb_log_file_size is too small " + "for mini-transaction size %zu", size); } + goto throttle; } - else if (UNIV_LIKELY(lsn + margin <= log_sys.last_checkpoint_lsn + - log_sys.log_capacity)) - return; + else if (UNIV_UNLIKELY(lsn + size > last_checkpoint_lsn + log_capacity)) + throttle: + set_check_flush_or_checkpoint(); - log_sys.set_check_flush_or_checkpoint(); -} - - -/** Open the log for log_write_low(). The log must be closed with log_close(). -@param len length of the data to be written -@return start lsn of the log record */ -static lsn_t log_reserve_and_open(size_t len) -{ - for (ut_d(ulint count= 0);;) + if (is_pmem()) { - mysql_mutex_assert_owner(&log_sys.mutex); + for (ut_d(int count= 50); capacity() - size < + size_t(lsn - flushed_to_disk_lsn.load(std::memory_order_relaxed)); ) + { + waits++; + mysql_mutex_unlock(&mutex); + DEBUG_SYNC_C("log_buf_size_exceeded"); + log_write_up_to(lsn, true); + ut_ad(count--); + mysql_mutex_lock(&mutex); + lsn= get_lsn(); + } + return lsn; + } - /* Calculate an upper limit for the space the string may take in - the log buffer */ + /* Calculate the amount of free space needed. 
*/ + size= (4 * 4096) - size + log_sys.buf_size; - size_t len_upper_limit= (4 * OS_FILE_LOG_BLOCK_SIZE) + - srv_log_write_ahead_size + (5 * len) / 4; - - if (log_sys.buf_free + len_upper_limit <= srv_log_buffer_size) - break; - - mysql_mutex_unlock(&log_sys.mutex); + for (ut_d(int count= 50); UNIV_UNLIKELY(buf_free > size); ) + { + waits++; + mysql_mutex_unlock(&mutex); DEBUG_SYNC_C("log_buf_size_exceeded"); - - /* Not enough free space, do a write of the log buffer */ - log_write_up_to(log_sys.get_lsn(), false); - - srv_stats.log_waits.inc(); - - ut_ad(++count < 50); - - mysql_mutex_lock(&log_sys.mutex); + log_write_up_to(lsn, false); + ut_ad(count--); + mysql_mutex_lock(&mutex); + lsn= get_lsn(); } - return log_sys.get_lsn(); + return lsn; } -/** Append data to the log buffer. */ -static void log_write_low(const void *str, size_t size) +/** Finish appending data to the log. +@param lsn the end LSN of the log record +@return whether buf_flush_ahead() will have to be invoked */ +static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept { mysql_mutex_assert_owner(&log_sys.mutex); - const ulint trailer_offset= log_sys.trailer_offset(); - - do - { - /* Calculate a part length */ - size_t len= size; - size_t data_len= (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + size; - - if (data_len > trailer_offset) - { - data_len= trailer_offset; - len= trailer_offset - log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE; - } - - memcpy(log_sys.buf + log_sys.buf_free, str, len); - - size-= len; - str= static_cast(str) + len; - - byte *log_block= static_cast(ut_align_down(log_sys.buf + - log_sys.buf_free, - OS_FILE_LOG_BLOCK_SIZE)); - - log_block_set_data_len(log_block, data_len); - lsn_t lsn= log_sys.get_lsn(); - - if (data_len == trailer_offset) - { - /* This block became full */ - log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); - log_block_set_checkpoint_no(log_block, log_sys.next_checkpoint_no); - len+= log_sys.framing_size(); - lsn+= len; - /* Initialize the next block 
header */ - log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, lsn); - } - else - lsn+= len; - - log_sys.set_lsn(lsn); - log_sys.buf_free+= len; - - ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size}); - } - while (size); -} - -/** Close the log at mini-transaction commit. -@return whether buffer pool flushing is needed */ -static mtr_t::page_flush_ahead log_close(lsn_t lsn) -{ - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(lsn == log_sys.get_lsn()); - - byte *log_block= static_cast(ut_align_down(log_sys.buf + - log_sys.buf_free, - OS_FILE_LOG_BLOCK_SIZE)); - - if (!log_block_get_first_rec_group(log_block)) - { - /* We initialized a new log block which was not written - full by the current mtr: the next mtr log record group - will start within this block at the offset data_len */ - log_block_set_first_rec_group(log_block, - log_block_get_data_len(log_block)); - } - - if (log_sys.buf_free > log_sys.max_buf_free) - log_sys.set_check_flush_or_checkpoint(); + log_sys.write_to_buf++; + log_sys.set_lsn(lsn); const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn; if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity) && /* silence message on create_log_file() after the log had been deleted */ checkpoint_age != lsn) - { - time_t t= time(nullptr); - if (!log_close_warned || difftime(t, log_close_warn_time) > 15) - { - log_close_warned= true; - log_close_warn_time= t; - - ib::error() << "The age of the last checkpoint is " << checkpoint_age - << ", which exceeds the log capacity " - << log_sys.log_capacity << "."; - } - } + log_overwrite_warning(checkpoint_age, log_sys.log_capacity); else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_modified_age_async)) return mtr_t::PAGE_FLUSH_NO; else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age)) @@ -923,99 +850,132 @@ static mtr_t::page_flush_ahead log_close(lsn_t lsn) return mtr_t::PAGE_FLUSH_SYNC; } -/** Write the block contents to the REDO log */ -struct mtr_write_log +inline size_t 
mtr_t::prepare_write() { - /** Append a block to the redo log buffer. - @return whether the appending should continue */ - bool operator()(const mtr_buf_t::block_t *block) const + ut_ad(!recv_no_log_write); + if (UNIV_UNLIKELY(m_log_mode != MTR_LOG_ALL)) { - log_write_low(block->begin(), block->used()); - return true; + ut_ad(m_log_mode == MTR_LOG_NO_REDO); + ut_ad(m_log.size() == 0); + mysql_mutex_lock(&log_sys.mutex); + m_commit_lsn= log_sys.get_lsn(); + return 0; } -}; -/** Prepare to write the mini-transaction log to the redo log buffer. -@return number of bytes to write in finish_write() */ -inline ulint mtr_t::prepare_write() -{ - ut_ad(!recv_no_log_write); + size_t len= m_log.size() + 5; + ut_ad(len > 5); + if (log_sys.is_encrypted()) + { + len+= 8; + encrypt(); + } + else + { + m_crc= 0; + m_commit_lsn= 0; + m_log.for_each_block([this](const mtr_buf_t::block_t *b) + { m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; }); + } - if (UNIV_UNLIKELY(m_log_mode != MTR_LOG_ALL)) { - ut_ad(m_log_mode == MTR_LOG_NO_REDO); - ut_ad(m_log.size() == 0); - mysql_mutex_lock(&log_sys.mutex); - m_commit_lsn = log_sys.get_lsn(); - return 0; - } + mysql_mutex_lock(&log_sys.mutex); - ulint len = m_log.size(); - ut_ad(len > 0); + if (m_user_space && !is_predefined_tablespace(m_user_space->id) && + !m_user_space->max_lsn) + name_write(); - if (len > srv_log_buffer_size / 2) { - log_buffer_extend(ulong((len + 1) * 2)); - } - - fil_space_t* space = m_user_space; - - if (space != NULL && is_predefined_tablespace(space->id)) { - /* Omit FILE_MODIFY for predefined tablespaces. */ - space = NULL; - } - - mysql_mutex_lock(&log_sys.mutex); - - if (fil_names_write_if_was_clean(space)) { - len = m_log.size(); - } else { - /* This was not the first time of dirtying a - tablespace since the latest checkpoint. 
*/ - ut_ad(len == m_log.size()); - } - - *m_log.push(1) = 0; - len++; - - /* check and attempt a checkpoint if exceeding capacity */ - log_margin_checkpoint_age(len); - - return(len); + return len; } -/** Append the redo log records to the redo log buffer. -@param len number of bytes to write +/** Write the mini-transaction log to the redo log buffer. @return {start_lsn,flush_ahead} */ -inline std::pair mtr_t::finish_write(ulint len) +std::pair mtr_t::finish_write(size_t len) { - ut_ad(m_log_mode == MTR_LOG_ALL); - mysql_mutex_assert_owner(&log_sys.mutex); - ut_ad(m_log.size() == len); - ut_ad(len > 0); + ut_ad(!recv_no_log_write); + ut_ad(m_log_mode == MTR_LOG_ALL); - lsn_t start_lsn; + const lsn_t start_lsn= log_sys.append_prepare(len); + const size_t size{m_commit_lsn ? 5U + 8U : 5U}; - if (m_log.is_small()) { - const mtr_buf_t::block_t* front = m_log.front(); - ut_ad(len <= front->used()); + if (!log_sys.is_pmem()) + { + m_log.for_each_block([](const mtr_buf_t::block_t *b) + { log_sys.append(b->begin(), b->used()); return true; }); - m_commit_lsn = log_reserve_and_write_fast(front->begin(), len, - &start_lsn); + if (log_sys.buf_free >= log_sys.max_buf_free) + log_sys.set_check_flush_or_checkpoint(); - if (!m_commit_lsn) { - goto piecewise; - } - } else { -piecewise: - /* Open the database log for log_write_low */ - start_lsn = log_reserve_and_open(len); - mtr_write_log write_log; - m_log.for_each_block(write_log); - m_commit_lsn = log_sys.get_lsn(); - } - page_flush_ahead flush= log_close(m_commit_lsn); - DBUG_EXECUTE_IF("ib_log_flush_ahead", flush = PAGE_FLUSH_SYNC;); +#ifdef HAVE_PMEM + write_trailer: +#endif + log_sys.buf[log_sys.buf_free]= + log_sys.get_sequence_bit(start_lsn + len - size); + if (m_commit_lsn) + { + byte *nonce= log_sys.buf + log_sys.buf_free + 1; + mach_write_to_8(nonce, m_commit_lsn); + m_crc= my_crc32c(m_crc, nonce, 8); + mach_write_to_4(&log_sys.buf[log_sys.buf_free + 9], m_crc); + log_sys.buf_free+= 8 + 5; + } + else + { + 
mach_write_to_4(&log_sys.buf[log_sys.buf_free + 1], m_crc); + log_sys.buf_free+= 5; + } + } +#ifdef HAVE_PMEM + else if (UNIV_LIKELY(log_sys.buf_free + len < log_sys.file_size)) + { + m_log.for_each_block([](const mtr_buf_t::block_t *b) + { log_sys.append(b->begin(), b->used()); return true; }); + goto write_trailer; + } + else + { + m_log.for_each_block([](const mtr_buf_t::block_t *b) + { + size_t size{b->used()}; + const size_t size_left{log_sys.file_size - log_sys.buf_free}; + const byte *src= b->begin(); + if (size <= size_left) + { + ::memcpy(log_sys.buf + log_sys.buf_free, src, size); + log_sys.buf_free+= size; + } + else + { + size-= size_left; + ::memcpy(log_sys.buf + log_sys.buf_free, src, size_left); + ::memcpy(log_sys.buf + log_sys.START_OFFSET, src + size_left, size); + log_sys.buf_free= log_sys.START_OFFSET + size; + } + return true; + }); + const size_t size_left{log_sys.file_size - log_sys.buf_free}; + if (size_left > size) + goto write_trailer; - return std::make_pair(start_lsn, flush); + byte tail[5 + 8]; + tail[0]= log_sys.get_sequence_bit(start_lsn + len - size); + + if (m_commit_lsn) + { + mach_write_to_8(tail + 1, m_commit_lsn); + m_crc= my_crc32c(m_crc, tail + 1, 8); + mach_write_to_4(tail + 9, m_crc); + } + else + mach_write_to_4(tail + 1, m_crc); + + ::memcpy(log_sys.buf + log_sys.buf_free, tail, size_left); + ::memcpy(log_sys.buf + log_sys.START_OFFSET, tail + size_left, + size - size_left); + log_sys.buf_free= log_sys.START_OFFSET + (size - size_left); + } +#endif + + m_commit_lsn= start_lsn + len; + return {start_lsn, log_close(m_commit_lsn)}; } /** Find out whether a block was not X-latched by the mini-transaction */ @@ -1376,7 +1336,8 @@ void mtr_t::modify(const buf_block_t &block) } Iterate iteration((FindModified(block))); - if (UNIV_UNLIKELY(m_memo.for_each_block(iteration))) + m_memo.for_each_block(iteration); + if (UNIV_UNLIKELY(!iteration.functor.found)) { ut_ad("modifying an unlatched page" == 0); return; diff --git 
a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index fea1eda17e9..6d3c61e7221 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -33,13 +33,14 @@ The interface to the operating system file i/o primitives Created 10/21/1995 Heikki Tuuri *******************************************************/ -#ifndef UNIV_INNOCHECKSUM #include "os0file.h" #include "sql_const.h" +#include "log.h" -#ifdef UNIV_LINUX +#ifdef __linux__ # include # include +# include #endif #include "srv0mon.h" @@ -64,13 +65,6 @@ Created 10/21/1995 Heikki Tuuri # include #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ -#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) -# include -# ifndef DFS_IOCTL_ATOMIC_WRITE_SET -# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) -# endif -#endif - #ifdef _WIN32 #include #endif @@ -817,18 +811,15 @@ os_file_get_last_error_low( } if (report_all_errors - || (err != ENOSPC && err != EEXIST && !on_error_silent)) { + || (err != ENOSPC && err != EEXIST && err != ENOENT + && !on_error_silent)) { ib::error() << "Operating system error number " << err << " in a file operation."; - if (err == ENOENT) { - ib::error() - << "The error means the system" - " cannot find the path specified."; - } else if (err == EACCES) { + if (err == EACCES) { ib::error() << "The error means mariadbd does not have" @@ -1113,7 +1104,14 @@ os_file_create_simple_func( OS caching (O_DIRECT) here as we do in os_file_create_func(), so we open the same file in the same mode, see man page of open(2). 
*/ if (!srv_read_only_mode && *success) { - os_file_set_nocache(file, name, mode_str); + switch (srv_file_flush_method) { + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: + os_file_set_nocache(file, name, mode_str); + break; + default: + break; + } } #ifdef USE_FILE_LOCK @@ -1287,22 +1285,81 @@ os_file_create_func( } while (retry); - /* We disable OS caching (O_DIRECT) only on data files */ - if (!read_only - && *success - && type != OS_LOG_FILE - && type != OS_DATA_FILE_NO_O_DIRECT) { - os_file_set_nocache(file, name, mode_str); + if (!*success) { + return file; } +#if (defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)) || defined O_DIRECT + if (type == OS_DATA_FILE) { + switch (srv_file_flush_method) { + case SRV_O_DSYNC: + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: +# ifdef __linux__ +use_o_direct: +# endif + os_file_set_nocache(file, name, mode_str); + break; + default: + break; + } +# ifdef __linux__ + } else if (type == OS_LOG_FILE && !log_sys.is_opened()) { + struct stat st; + char b[20 + sizeof "/sys/dev/block/" ":" + "/../queue/physical_block_size"]; + int f; + if (fstat(file, &st) || st.st_size & 4095) { + goto skip_o_direct; + } + if (snprintf(b, sizeof b, + "/sys/dev/block/%u:%u/queue/physical_block_size", + major(st.st_dev), minor(st.st_dev)) + >= static_cast(sizeof b)) { + goto skip_o_direct; + } + if ((f = open(b, O_RDONLY)) == -1) { + if (snprintf(b, sizeof b, + "/sys/dev/block/%u:%u/../queue/" + "physical_block_size", + major(st.st_dev), minor(st.st_dev)) + >= static_cast(sizeof b)) { + goto skip_o_direct; + } + f = open(b, O_RDONLY); + } + if (f != -1) { + ssize_t l = read(f, b, sizeof b); + unsigned long s = 0; + + if (l > 0 && static_cast(l) < sizeof b + && b[l - 1] == '\n') { + char* end = b; + s = strtoul(b, &end, 10); + if (b == end || *end != '\n') { + s = 0; + } + } + close(f); + if (s > 4096 || s < 64 || !ut_is_2pow(s)) { + goto skip_o_direct; + } + log_sys.set_block_size(uint32_t(s)); + if (srv_file_flush_method == SRV_O_DSYNC) { + 
goto use_o_direct; + } + } else { +skip_o_direct: + log_sys.set_block_size(0); + } + } +# endif +#endif + #ifdef USE_FILE_LOCK if (!read_only - && *success - && create_mode != OS_FILE_OPEN_RAW - && os_file_lock(file, name)) { - + && create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) { if (create_mode == OS_FILE_OPEN_RETRY) { - ib::info() << "Retrying to lock the first data file"; @@ -1823,29 +1880,26 @@ os_file_get_last_error_low( if (report_all_errors || (!on_error_silent && err != ERROR_DISK_FULL + && err != ERROR_FILE_NOT_FOUND && err != ERROR_FILE_EXISTS)) { ib::error() << "Operating system error number " << err << " in a file operation."; - if (err == ERROR_PATH_NOT_FOUND) { - ib::error() - << "The error means the system" - " cannot find the path specified."; - - } else if (err == ERROR_ACCESS_DENIED) { - + switch (err) { + case ERROR_PATH_NOT_FOUND: + break; + case ERROR_ACCESS_DENIED: ib::error() << "The error means mariadbd does not have" " the access rights to" " the directory. It may also be" " you have created a subdirectory" " of the same name as a data file."; - - } else if (err == ERROR_SHARING_VIOLATION - || err == ERROR_LOCK_VIOLATION) { - + break; + case ERROR_SHARING_VIOLATION: + case ERROR_LOCK_VIOLATION: ib::error() << "The error means that another program" " is using InnoDB's files." @@ -1853,29 +1907,23 @@ os_file_get_last_error_low( " software or another instance" " of MariaDB." " Please close it to get rid of this error."; - - } else if (err == ERROR_WORKING_SET_QUOTA - || err == ERROR_NO_SYSTEM_RESOURCES) { - + break; + case ERROR_WORKING_SET_QUOTA: + case ERROR_NO_SYSTEM_RESOURCES: ib::error() << "The error means that there are no" " sufficient system resources or quota to" " complete the operation."; - - } else if (err == ERROR_OPERATION_ABORTED) { - + break; + case ERROR_OPERATION_ABORTED: ib::error() << "The error means that the I/O" " operation has been aborted" " because of either a thread exit" " or an application request." 
" Retry attempt is made."; - } else if (err == ERROR_PATH_NOT_FOUND) { - ib::error() - << "This error means that directory did not exist" - " during file creation."; - } else { - + break; + default: ib::info() << OPERATING_SYSTEM_ERROR_MSG; } } @@ -2059,31 +2107,6 @@ os_file_create_directory( return(true); } -/** Check that IO of specific size is possible for the file -opened with FILE_FLAG_NO_BUFFERING. - -The requirement is that IO is multiple of the disk sector size. - -@param[in] file file handle -@param[in] io_size expected io size -@return true - unbuffered io of requested size is possible, false otherwise. - -@note: this function only works correctly with Windows 8 or later, -(GetFileInformationByHandleEx with FileStorageInfo is only supported there). -It will return true on earlier Windows version. - */ -static bool unbuffered_io_possible(HANDLE file, size_t io_size) -{ - FILE_STORAGE_INFO info; - if (GetFileInformationByHandleEx( - file, FileStorageInfo, &info, sizeof(info))) { - ULONG sector_size = info.LogicalBytesPerSector; - if (sector_size) - return io_size % sector_size == 0; - } - return true; -} - /** NOTE! Use the corresponding macro os_file_create(), not directly this function! @@ -2178,54 +2201,22 @@ os_file_create_func( return(OS_FILE_CLOSED); } - DWORD attributes = 0; - - if (purpose == OS_FILE_AIO) { - -#ifdef WIN_ASYNC_IO - /* If specified, use asynchronous (overlapped) io and no - buffering of writes in the OS */ - - if (srv_use_native_aio) { - attributes |= FILE_FLAG_OVERLAPPED; - } -#endif /* WIN_ASYNC_IO */ - - } else if (purpose == OS_FILE_NORMAL) { - - /* Use default setting. */ - - } else { - - ib::error() - << "Unknown purpose flag (" << purpose << ") " - << "while opening file '" << name << "'"; - - return(OS_FILE_CLOSED); - } + DWORD attributes = (purpose == OS_FILE_AIO && srv_use_native_aio) + ? 
FILE_FLAG_OVERLAPPED : 0; if (type == OS_LOG_FILE) { - /* There is not reason to use buffered write to logs.*/ attributes |= FILE_FLAG_NO_BUFFERING; } - switch (srv_file_flush_method) - { + switch (srv_file_flush_method) { case SRV_O_DSYNC: if (type == OS_LOG_FILE) { - /* Map O_DSYNC to FILE_WRITE_THROUGH */ attributes |= FILE_FLAG_WRITE_THROUGH; } - break; - + /* fall through */ case SRV_O_DIRECT_NO_FSYNC: case SRV_O_DIRECT: - if (type != OS_DATA_FILE) { - break; - } - /* fall through */ case SRV_ALL_O_DIRECT_FSYNC: - /*Traditional Windows behavior, no buffering for any files.*/ if (type != OS_DATA_FILE_NO_O_DIRECT) { attributes |= FILE_FLAG_NO_BUFFERING; } @@ -2233,28 +2224,10 @@ os_file_create_func( case SRV_FSYNC: case SRV_LITTLESYNC: - break; - case SRV_NOSYNC: - /* Let Windows cache manager handle all writes.*/ - attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING); break; - - default: - ut_a(false); /* unknown flush mode.*/ } - - // TODO: Create a bug, this looks wrong. The flush log - // parameter is dynamic. - if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { - /* Do not use unbuffered i/o for the log files because - value 2 denotes that we do not flush the log at every - commit, but only once per second */ - attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING); - } - - DWORD access = GENERIC_READ; if (!read_only) { @@ -2269,16 +2242,42 @@ os_file_create_func( name, access, share_mode, my_win_file_secattr(), create_flag, attributes, NULL); - /* If FILE_FLAG_NO_BUFFERING was set, check if this can work at all, - for expected IO sizes. 
Reopen without the unbuffered flag, if it is won't work*/ - if ((file != INVALID_HANDLE_VALUE) - && (attributes & FILE_FLAG_NO_BUFFERING) - && (type == OS_LOG_FILE) - && !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) { + if (file != INVALID_HANDLE_VALUE && type == OS_LOG_FILE + && (attributes & FILE_FLAG_NO_BUFFERING)) { + if (log_sys.is_opened()) { + /* If we are upgrading from multiple log files, + never disable buffering on other than the + first file. We only keep track of the block + size of the first file. */ + no_o_direct: ut_a(CloseHandle(file)); attributes &= ~FILE_FLAG_NO_BUFFERING; create_flag = OPEN_ALWAYS; continue; + } + + /* If FILE_FLAG_NO_BUFFERING was set on the log file, + check if this can work at all, for the expected sizes. + Reopen without the flag, if it won't work. */ + DWORD high, low= GetFileSize(file, &high); + if (low & 4095) { + /* mariadb-backup creates odd-sized files that + will be resized before the log is being + written to */ + skip_o_direct: + log_sys.set_block_size(0); + goto no_o_direct; + } + FILE_STORAGE_INFO i; + if (!GetFileInformationByHandleEx(file, FileStorageInfo, + &i, sizeof i)) { + goto skip_o_direct; + } + const ULONG s = i.PhysicalBytesPerSectorForPerformance; + if (s > 4096 || s < 64 || !ut_is_2pow(s)) { + goto skip_o_direct; + } + log_sys.set_block_size(uint32_t(s)); } *success = (file != INVALID_HANDLE_VALUE); @@ -3142,9 +3141,14 @@ os_file_handle_error_cond_exit( case OS_FILE_PATH_ERROR: case OS_FILE_ALREADY_EXISTS: case OS_FILE_ACCESS_VIOLATION: - return(false); + case OS_FILE_NOT_FOUND: + if (!on_error_silent) { + sql_print_error("InnoDB: File %s was not found", name); + } + return false; + case OS_FILE_SHARING_VIOLATION: std::this_thread::sleep_for(std::chrono::seconds(10)); @@ -3191,15 +3195,6 @@ os_file_set_nocache( const char* file_name MY_ATTRIBUTE((unused)), const char* operation_name MY_ATTRIBUTE((unused))) { - const auto innodb_flush_method = srv_file_flush_method; - switch 
(innodb_flush_method) { - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: - break; - default: - return; - } - /* some versions of Solaris may not have DIRECTIO_ON */ #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) if (directio(fd, DIRECTIO_ON) == -1) { @@ -3842,8 +3837,9 @@ void os_aio_wait_until_no_pending_reads() dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) { ut_ad(n > 0); - ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0); - ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0); + ut_ad(!(n & 511)); /* payload of page_compressed tables */ + ut_ad((offset % UNIV_ZIP_SIZE_MIN) == 0); + ut_ad((reinterpret_cast(buf) % UNIV_ZIP_SIZE_MIN) == 0); ut_ad(type.is_read() || type.is_write()); ut_ad(type.node); ut_ad(type.node->is_open()); @@ -3895,11 +3891,6 @@ func_exit: cb->m_opcode = type.is_read() ? tpool::aio_opcode::AIO_PREAD : tpool::aio_opcode::AIO_PWRITE; new (cb->m_userdata) IORequest{type}; - ut_a(reinterpret_cast(cb->m_buffer) % OS_FILE_LOG_BLOCK_SIZE - == 0); - ut_a(cb->m_len % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_a(cb->m_offset % OS_FILE_LOG_BLOCK_SIZE == 0); - if (srv_thread_pool->submit_io(cb)) { slots->release(cb); os_file_handle_error(type.node->name, type.is_read() @@ -3922,10 +3913,8 @@ os_aio_print(FILE* file) time_elapsed = 0.001 + difftime(current_time, os_last_printout); fprintf(file, - "Pending flushes (fsync) log: " ULINTPF - "; buffer pool: " ULINTPF "\n" + "Pending flushes (fsync): " ULINTPF "\n" ULINTPF " OS file reads, %zu OS file writes, %zu OS fsyncs\n", - log_sys.get_pending_flushes(), ulint{fil_n_pending_tablespace_flushes}, ulint{os_n_file_reads}, static_cast(os_n_file_writes), @@ -4300,5 +4289,3 @@ invalid: space->set_sizes(this->size); return true; } - -#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index 6bf22efda9a..cc3f19bafc8 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -1,7 +1,7 @@ 
/***************************************************************************** Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2021, MariaDB Corporation. +Copyright (c) 2015, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -188,7 +188,6 @@ row_fts_psort_info_init( fts_psort_t* merge_info = NULL; ulint block_size; ibool ret = TRUE; - bool encrypted = false; ut_ad(ut_is_2pow(old_zip_size)); block_size = 3 * srv_sort_buf_size; @@ -219,10 +218,6 @@ row_fts_psort_info_init( pthread_cond_init(&common_info->sort_cond, nullptr); common_info->opt_doc_id_size = opt_doc_id_size; - if (log_tmp_is_encrypted()) { - encrypted = true; - } - ut_ad(trx->mysql_thd != NULL); const char* path = thd_innodb_tmpdir(trx->mysql_thd); /* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for @@ -264,7 +259,7 @@ row_fts_psort_info_init( /* If tablespace is encrypted, allocate additional buffer for encryption/decryption. */ - if (encrypted) { + if (srv_encrypt_log) { /* Need to align memory for O_DIRECT write */ psort_info[j].crypt_block[i] = static_cast( diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index 0382b4abf4a..ff8aa86d71d 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2021, MariaDB Corporation. +Copyright (c) 2015, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -4097,7 +4097,7 @@ page_corrupted: ut_ad(!comp == (size == srv_page_size)); ut_ad(!corrupt); mach_write_to_4(dest + (size - 4), - ut_crc32(dest, size - 4)); + my_crc32c(0, dest, size - 4)); } } diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index 610ac1ad2ae..7fbcdf60094 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -470,7 +470,7 @@ row_log_online_op( /* If encryption is enabled encrypt buffer before writing it to file system. */ - if (log_tmp_is_encrypted()) { + if (srv_encrypt_log) { if (!log_tmp_block_encrypt( buf, srv_sort_buf_size, log->crypt_tail, byte_offset)) { @@ -607,7 +607,7 @@ row_log_table_close_func( /* If encryption is enabled encrypt buffer before writing it to file system. 
*/ - if (log_tmp_is_encrypted()) { + if (srv_encrypt_log) { if (!log_tmp_block_encrypt( log->tail.block, srv_sort_buf_size, log->crypt_tail, byte_offset, @@ -2912,7 +2912,7 @@ all_done: goto corruption; } - if (log_tmp_is_encrypted()) { + if (srv_encrypt_log) { if (!log_tmp_block_decrypt( buf, srv_sort_buf_size, index->online_log->crypt_head, ofs)) { @@ -3266,7 +3266,7 @@ row_log_allocate( dict_index_set_online_status(index, ONLINE_INDEX_CREATION); - if (log_tmp_is_encrypted()) { + if (srv_encrypt_log) { log->crypt_head_size = log->crypt_tail_size = srv_sort_buf_size; log->crypt_head = static_cast( my_large_malloc(&log->crypt_head_size, MYF(MY_WME))); @@ -3823,7 +3823,7 @@ all_done: goto corruption; } - if (log_tmp_is_encrypted()) { + if (srv_encrypt_log) { if (!log_tmp_block_decrypt( buf, srv_sort_buf_size, index->online_log->crypt_head, ofs)) { diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 44ab435c8e8..7b1e1b5597f 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -1258,7 +1258,7 @@ row_merge_read( IORequestRead, fd, buf, ofs, srv_sort_buf_size, 0); /* If encryption is enabled decrypt buffer */ - if (success && log_tmp_is_encrypted()) { + if (success && srv_encrypt_log) { if (!log_tmp_block_decrypt(buf, srv_sort_buf_size, crypt_buf, ofs)) { return (FALSE); @@ -1303,7 +1303,7 @@ row_merge_write( DBUG_EXECUTE_IF("row_merge_write_failure", DBUG_RETURN(FALSE);); /* For encrypted tables, encrypt data before writing */ - if (log_tmp_is_encrypted()) { + if (srv_encrypt_log) { if (!log_tmp_block_encrypt(static_cast(buf), buf_len, static_cast(crypt_buf), @@ -4658,7 +4658,7 @@ row_merge_build_indexes( crypt_pfx.m_size = 0; /* silence bogus -Wmaybe-uninitialized */ TRASH_ALLOC(&crypt_pfx, sizeof crypt_pfx); - if (log_tmp_is_encrypted()) { + if (srv_encrypt_log) { crypt_block = static_cast( alloc.allocate_large(block_size, &crypt_pfx)); diff --git a/storage/innobase/srv/srv0mon.cc 
b/storage/innobase/srv/srv0mon.cc index d377d2d7b28..90d71556a5b 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -2,7 +2,7 @@ Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -663,24 +663,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_WRITTEN}, - {"os_log_fsyncs", "os", - "Number of fsync log writes (innodb_os_log_fsyncs)", - static_cast( - MONITOR_EXISTING | MONITOR_DEFAULT_ON), - MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_FSYNC}, - - {"os_log_pending_fsyncs", "os", - "Number of pending fsync write (innodb_os_log_pending_fsyncs)", - static_cast( - MONITOR_EXISTING | MONITOR_DEFAULT_ON), - MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_FSYNC}, - - {"os_log_pending_writes", "os", - "Number of pending log file writes (innodb_os_log_pending_writes)", - static_cast( - MONITOR_EXISTING | MONITOR_DEFAULT_ON), - MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_WRITES}, - /* ========== Counters for Transaction Module ========== */ {"module_trx", "transaction", "Transaction Manager", MONITOR_MODULE, @@ -781,8 +763,9 @@ static monitor_info_t innodb_counter_info[] = MONITOR_DEFAULT_START, MONITOR_MODULE_RECOVERY}, {"log_checkpoints", "recovery", "Number of checkpoints", - MONITOR_NONE, - MONITOR_DEFAULT_START, MONITOR_NUM_CHECKPOINT}, + static_cast( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_CHECKPOINTS}, {"log_lsn_last_flush", "recovery", "LSN of Last flush", static_cast( @@ -817,16 +800,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), 
MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC}, - {"log_pending_log_flushes", "recovery", "Pending log flushes", - static_cast( - MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), - MONITOR_DEFAULT_START, MONITOR_PENDING_LOG_FLUSH}, - - {"log_pending_checkpoint_writes", "recovery", "Pending checkpoints", - static_cast( - MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), - MONITOR_DEFAULT_START, MONITOR_PENDING_CHECKPOINT_WRITE}, - {"log_num_log_io", "recovery", "Number of log I/Os", static_cast( MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), @@ -850,12 +823,6 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITES}, - {"log_padded", "recovery", - "Bytes of log padded for log write ahead", - static_cast( - MONITOR_EXISTING | MONITOR_DEFAULT_ON), - MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_PADDED}, - /* ========== Counters for Page Compression ========== */ {"module_compress", "compression", "Page Compression Info", MONITOR_MODULE, @@ -1565,43 +1532,22 @@ srv_mon_process_existing_counter( /* innodb_os_log_written */ case MONITOR_OVLD_OS_LOG_WRITTEN: - value = (mon_type_t) srv_stats.os_log_written; - break; - - /* innodb_os_log_fsyncs */ - case MONITOR_OVLD_OS_LOG_FSYNC: - value = log_sys.get_flushes(); - break; - - /* innodb_os_log_pending_fsyncs */ - case MONITOR_OVLD_OS_LOG_PENDING_FSYNC: - value = log_sys.get_pending_flushes(); - update_min = TRUE; - break; - - /* innodb_os_log_pending_writes */ - case MONITOR_OVLD_OS_LOG_PENDING_WRITES: - value = srv_stats.os_log_pending_writes; - update_min = TRUE; + value = log_sys.get_lsn() - recv_sys.lsn; break; /* innodb_log_waits */ case MONITOR_OVLD_LOG_WAITS: - value = srv_stats.log_waits; + value = log_sys.waits; break; /* innodb_log_write_requests */ case MONITOR_OVLD_LOG_WRITE_REQUEST: - value = srv_stats.log_write_requests; + value = log_sys.write_to_buf; break; /* innodb_log_writes */ case MONITOR_OVLD_LOG_WRITES: - value = srv_stats.log_writes; - 
break; - - case MONITOR_OVLD_LOG_PADDED: - value = srv_stats.log_padded; + value = log_sys.write_to_log; break; /* innodb_dblwr_writes */ @@ -1759,22 +1705,12 @@ srv_mon_process_existing_counter( value = log_sys.get_lsn(); break; - case MONITOR_PENDING_LOG_FLUSH: - value = static_cast(log_sys.pending_flushes); - - break; - - case MONITOR_PENDING_CHECKPOINT_WRITE: - mysql_mutex_lock(&log_sys.mutex); - value = static_cast( - log_sys.n_pending_checkpoint_writes); - mysql_mutex_unlock(&log_sys.mutex); - break; - case MONITOR_LOG_IO: - mysql_mutex_lock(&log_sys.mutex); - value = static_cast(log_sys.n_log_ios); - mysql_mutex_unlock(&log_sys.mutex); + value = log_sys.n_log_ios; + break; + + case MONITOR_OVLD_CHECKPOINTS: + value = log_sys.next_checkpoint_no; break; case MONITOR_LSN_CHECKPOINT_AGE: diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 5c2945c603d..8bdad052973 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -3,7 +3,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2022, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -153,8 +153,6 @@ char* srv_log_group_home_dir; /** The InnoDB redo log file size, or 0 when changing the redo log format at startup (while disallowing writes to the redo log). 
*/ ulonglong srv_log_file_size; -/** innodb_log_buffer_size, in bytes */ -ulong srv_log_buffer_size; /** innodb_flush_log_at_trx_commit */ ulong srv_flush_log_at_trx_commit; /** innodb_flush_log_at_timeout */ @@ -163,8 +161,6 @@ uint srv_flush_log_at_timeout; ulong srv_page_size; /** log2 of innodb_page_size; @see innodb_init_params() */ uint32_t srv_page_size_shift; -/** innodb_log_write_ahead_size */ -ulong srv_log_write_ahead_size; /** innodb_adaptive_flushing; try to flush dirty pages so as to avoid IO bursts at the checkpoints. */ @@ -1011,12 +1007,6 @@ srv_export_innodb_status(void) export_vars.innodb_data_pending_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES)); - export_vars.innodb_data_pending_fsyncs = - log_sys.get_pending_flushes() - + fil_n_pending_tablespace_flushes; - - export_vars.innodb_data_fsyncs = os_n_fsyncs; - export_vars.innodb_data_read = srv_stats.data_read; export_vars.innodb_data_reads = os_n_file_reads; @@ -1090,22 +1080,6 @@ srv_export_innodb_status(void) export_vars.innodb_max_trx_id = trx_sys.get_max_trx_id(); export_vars.innodb_history_list_length = trx_sys.history_size(); - export_vars.innodb_log_waits = srv_stats.log_waits; - - export_vars.innodb_os_log_written = srv_stats.os_log_written; - - export_vars.innodb_os_log_fsyncs = log_sys.get_flushes(); - - export_vars.innodb_os_log_pending_fsyncs - = log_sys.get_pending_flushes(); - - export_vars.innodb_os_log_pending_writes = - srv_stats.os_log_pending_writes; - - export_vars.innodb_log_write_requests = srv_stats.log_write_requests; - - export_vars.innodb_log_writes = srv_stats.log_writes; - mysql_mutex_lock(&lock_sys.wait_mutex); export_vars.innodb_row_lock_waits = lock_sys.get_wait_cumulative(); @@ -1204,6 +1178,8 @@ srv_export_innodb_status(void) export_vars.innodb_checkpoint_max_age = static_cast( log_sys.max_checkpoint_age); mysql_mutex_unlock(&log_sys.mutex); + export_vars.innodb_os_log_written = export_vars.innodb_lsn_current + - recv_sys.lsn; 
export_vars.innodb_checkpoint_age = static_cast( export_vars.innodb_lsn_current @@ -1281,7 +1257,7 @@ static void srv_monitor() void srv_monitor_task(void*) { /* number of successive fatal timeouts observed */ - static lsn_t old_lsn = recv_sys.recovered_lsn; + static lsn_t old_lsn = recv_sys.lsn; ut_ad(!srv_read_only_mode); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 3e39d59c576..e515b76a980 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -98,6 +98,7 @@ Created 2/16/1996 Heikki Tuuri #include "btr0pcur.h" #include "zlib.h" #include "ut0crc32.h" +#include "log.h" /** We are prepared for a situation that we have this many threads waiting for a transactional lock inside InnoDB. srv_start() sets the value. */ @@ -122,14 +123,15 @@ bool srv_startup_is_before_trx_rollback_phase; bool srv_is_being_started; /** TRUE if the server was successfully started */ bool srv_was_started; -/** The original value of srv_log_file_size (innodb_log_file_size) */ -static ulonglong srv_log_file_size_requested; /** whether srv_start() has been called */ static bool srv_start_has_been_called; /** Whether any undo log records can be generated */ bool srv_undo_sources; +/** innodb_encrypt_log */ +my_bool srv_encrypt_log; + #ifdef UNIV_DEBUG /** InnoDB system tablespace to set during recovery */ uint srv_sys_space_size_debug; @@ -171,95 +173,38 @@ static PSI_stage_info* srv_stages[] = }; #endif /* HAVE_PSI_STAGE_INTERFACE */ -/*********************************************************************//** -Check if a file can be opened in read-write mode. -@return true if it doesn't exist or can be opened in rw mode. 
*/ -static -bool -srv_file_check_mode( -/*================*/ - const char* name) /*!< in: filename to check */ -{ - os_file_stat_t stat; - - memset(&stat, 0x0, sizeof(stat)); - - dberr_t err = os_file_get_status( - name, &stat, true, srv_read_only_mode); - - if (err == DB_FAIL) { - ib::error() << "os_file_get_status() failed on '" << name - << "'. Can't determine file permissions."; - return(false); - - } else if (err == DB_SUCCESS) { - - /* Note: stat.rw_perm is only valid of files */ - - if (stat.type == OS_FILE_TYPE_FILE) { - - if (!stat.rw_perm) { - const char* mode = srv_read_only_mode - ? "read" : "read-write"; - ib::error() << name << " can't be opened in " - << mode << " mode."; - return(false); - } - } else { - /* Not a regular file, bail out. */ - ib::error() << "'" << name << "' not a regular file."; - - return(false); - } - } else { - - /* This is OK. If the file create fails on RO media, there - is nothing we can do. */ - - ut_a(err == DB_NOT_FOUND); - } - - return(true); -} - /** Initial number of the redo log file */ static const char INIT_LOG_FILE0[]= "101"; /** Creates log file. -@param[in] create_new_db whether the database is being initialized -@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value -@param[out] logfile0 name of the log file +@param create_new_db whether the database is being initialized +@param lsn log sequence number +@param logfile0 name of the log file @return DB_SUCCESS or error code */ static dberr_t create_log_file(bool create_new_db, lsn_t lsn, - std::string& logfile0) + std::string& logfile0) { - if (srv_read_only_mode) { - ib::error() << "Cannot create log file in read-only mode"; - return DB_READ_ONLY; - } + ut_ad(!srv_read_only_mode); - if (!log_set_capacity(srv_log_file_size_requested)) { - return(DB_ERROR); - } + /* We will retain ib_logfile0 until we have written a new logically + empty log as ib_logfile101 and atomically renamed it to + ib_logfile0 in create_log_file_rename(). 
*/ - /* Crashing after deleting the first file should be - recoverable. The buffer pool was clean, and we can simply - create log file from the scratch. */ - DBUG_EXECUTE_IF("innodb_log_abort_6", delete_log_file("0"); - return DB_ERROR;); - - for (size_t i = 0; i < 102; i++) { + for (size_t i = 1; i < 102; i++) { delete_log_file(std::to_string(i).c_str()); } - DBUG_PRINT("ib_log", ("After innodb_log_abort_6")); DBUG_ASSERT(!buf_pool.any_io_pending()); - DBUG_EXECUTE_IF("innodb_log_abort_7", return DB_ERROR;); - DBUG_PRINT("ib_log", ("After innodb_log_abort_7")); + mysql_mutex_lock(&log_sys.mutex); + if (!log_set_capacity(srv_log_file_size)) { +err_exit: + mysql_mutex_unlock(&log_sys.mutex); + return DB_ERROR; + } logfile0 = get_log_file_path(LOG_FILE_NAME_PREFIX) - .append(INIT_LOG_FILE0); + .append(INIT_LOG_FILE0); bool ret; pfs_os_file_t file = os_file_create( @@ -268,63 +213,30 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn, OS_LOG_FILE, srv_read_only_mode, &ret); if (!ret) { - ib::error() << "Cannot create " << logfile0; - return DB_ERROR; + sql_print_error("InnoDB: Cannot create %s", logfile0.c_str()); + goto err_exit; } - ib::info() << "Setting log file " << logfile0 << " size to " - << ib::bytes_iec{srv_log_file_size}; - ret = os_file_set_size(logfile0.c_str(), file, srv_log_file_size); if (!ret) { os_file_close(file); ib::error() << "Cannot set log file " << logfile0 << " size to " << ib::bytes_iec{srv_log_file_size}; - return DB_ERROR; + goto err_exit; } - ret = os_file_close(file); - ut_a(ret); - - DBUG_EXECUTE_IF("innodb_log_abort_8", return(DB_ERROR);); - DBUG_PRINT("ib_log", ("After innodb_log_abort_8")); - - /* We did not create the first log file initially as LOG_FILE_NAME, so - that crash recovery cannot find it until it has been completed and - renamed. 
*/ - - log_sys.log.create(); - - log_sys.log.open_file(logfile0); + log_sys.set_latest_format(srv_encrypt_log); + log_sys.attach(file, srv_log_file_size); if (!fil_system.sys_space->open(create_new_db)) { - return DB_ERROR; + goto err_exit; } /* Create a log checkpoint. */ - mysql_mutex_lock(&log_sys.mutex); if (log_sys.is_encrypted() && !log_crypt_init()) { - return DB_ERROR; + goto err_exit; } ut_d(recv_no_log_write = false); - lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE); - log_sys.set_lsn(lsn + LOG_BLOCK_HDR_SIZE); - log_sys.log.set_lsn(lsn); - log_sys.log.set_lsn_offset(LOG_FILE_HDR_SIZE); - - log_sys.buf_next_to_write = 0; - log_sys.write_lsn = lsn; - - log_sys.next_checkpoint_no = 0; - log_sys.last_checkpoint_lsn = 0; - - memset(log_sys.buf, 0, srv_log_buffer_size); - log_block_init(log_sys.buf, lsn); - log_block_set_first_rec_group(log_sys.buf, LOG_BLOCK_HDR_SIZE); - memset(log_sys.flush_buf, 0, srv_log_buffer_size); - - log_sys.buf_free = LOG_BLOCK_HDR_SIZE; - - log_sys.log.write_header_durable(lsn); + log_sys.create(lsn); ut_ad(srv_startup_is_before_trx_rollback_phase); if (create_new_db) { @@ -342,41 +254,34 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn, } /** Rename the first redo log file. -@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value -@param[in,out] logfile0 name of the first log file -@return error code -@retval DB_SUCCESS on successful operation */ +@param lsn log sequence number +@param logfile0 name of the log file +@return error code +@retval DB_SUCCESS on successful operation */ MY_ATTRIBUTE((warn_unused_result)) static dberr_t create_log_file_rename(lsn_t lsn, std::string &logfile0) { ut_ad(!srv_log_file_created); ut_d(srv_log_file_created= true); - DBUG_EXECUTE_IF("innodb_log_abort_9", return (DB_ERROR);); - DBUG_PRINT("ib_log", ("After innodb_log_abort_9")); - - /* Rename the first log file, now that a log checkpoint has been created. 
*/ - auto new_name = get_log_file_path(); - - ib::info() << "Renaming log file " << logfile0 << " to " << new_name; - - mysql_mutex_lock(&log_sys.mutex); + std::string new_name{get_log_file_path()}; ut_ad(logfile0.size() == 2 + new_name.size()); + + if (IF_WIN(!MoveFileEx(logfile0.c_str(), new_name.c_str(), + MOVEFILE_REPLACE_EXISTING), + rename(logfile0.c_str(), new_name.c_str()))) + { + sql_print_error("InnoDB: Failed to rename log from %s to %s", + logfile0.c_str(), new_name.c_str()); + return DB_ERROR; + } + logfile0= new_name; - dberr_t err= log_sys.log.rename(std::move(new_name)); - - mysql_mutex_unlock(&log_sys.mutex); - - DBUG_EXECUTE_IF("innodb_log_abort_10", err= DB_ERROR;); - - if (err == DB_SUCCESS) - ib::info() << "New log file created, LSN=" << lsn; - - return err; + return DB_SUCCESS; } /** Create an undo tablespace file -@param[in] name file name +@param[in] name file name @return DB_SUCCESS or error code */ static dberr_t srv_undo_tablespace_create(const char* name) { @@ -445,8 +350,8 @@ static dberr_t srv_validate_undo_tablespaces() if (srv_undo_tablespaces > srv_undo_tablespaces_open) { ib::error() << "Expected to open innodb_undo_tablespaces=" - << srv_undo_tablespaces - << " but was able to find only " + << srv_undo_tablespaces + << " but was able to find only " << srv_undo_tablespaces_open; return DB_ERROR; @@ -454,11 +359,11 @@ static dberr_t srv_validate_undo_tablespaces() else if (srv_undo_tablespaces_open > 0) { ib::info() << "Opened " << srv_undo_tablespaces_open - << " undo tablespaces"; + << " undo tablespaces"; if (srv_undo_tablespaces == 0) ib::warn() << "innodb_undo_tablespaces=0 disables" - " dedicated undo log tablespaces"; + " dedicated undo log tablespaces"; } return DB_SUCCESS; } @@ -473,8 +378,8 @@ static uint32_t trx_rseg_get_n_undo_tablespaces() if (const buf_block_t *sys_header= trx_sysf_get(&mtr, false)) for (ulint rseg_id= 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) if (trx_sysf_rseg_get_page_no(sys_header, rseg_id) != 
FIL_NULL) - if (uint32_t space= trx_sysf_rseg_get_space(sys_header, rseg_id)) - space_ids.insert(space); + if (uint32_t space= trx_sysf_rseg_get_space(sys_header, rseg_id)) + space_ids.insert(space); mtr.commit(); return static_cast(space_ids.size()); } @@ -561,7 +466,7 @@ err_exit: fil_set_max_space_id_if_bigger(space_id); fil_space_t *space= fil_space_t::create(space_id, fsp_flags, - FIL_TYPE_TABLESPACE, NULL); + FIL_TYPE_TABLESPACE, NULL); ut_a(fil_validate()); ut_a(space); @@ -647,7 +552,7 @@ srv_check_undo_redo_logs_exists() } static dberr_t srv_all_undo_tablespaces_open(bool create_new_db, - uint32_t n_undo) + uint32_t n_undo) { /* Open all the undo tablespaces that are currently in use. If we fail to open any of these it is a fatal error. The tablespace ids @@ -664,7 +569,7 @@ static dberr_t srv_all_undo_tablespaces_open(bool create_new_db, if (!space_id) { if (!create_new_db) - break; + break; ib::error() << "Unable to open create tablespace '" << name << "'."; return DB_ERROR; } @@ -725,8 +630,8 @@ dberr_t srv_undo_tablespaces_init(bool create_new_db) snprintf(name, sizeof name, "%s/undo%03zu", srv_undo_dir, i + 1); if (dberr_t err= srv_undo_tablespace_create(name)) { - ib::error() << "Could not create undo tablespace '" << name << "'."; - return err; + ib::error() << "Could not create undo tablespace '" << name << "'."; + return err; } } } @@ -738,7 +643,7 @@ dberr_t srv_undo_tablespaces_init(bool create_new_db) srv_undo_tablespaces_active= srv_undo_tablespaces; uint32_t n_undo= (create_new_db || srv_operation == SRV_OPERATION_BACKUP || - srv_operation == SRV_OPERATION_RESTORE_DELTA) + srv_operation == SRV_OPERATION_RESTORE_DELTA) ? 
srv_undo_tablespaces : TRX_SYS_N_RSEGS; if (dberr_t err= srv_all_undo_tablespaces_open(create_new_db, n_undo)) @@ -756,7 +661,7 @@ dberr_t srv_undo_tablespaces_init(bool create_new_db) { mtr.start(); fsp_header_init(fil_space_get(srv_undo_space_id_start + i), - SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr); + SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr); mtr.commit(); } } @@ -779,8 +684,6 @@ srv_open_tmp_tablespace(bool create_new_db) srv_tmp_space.delete_files(); srv_tmp_space.set_ignore_read_only(true); - ib::info() << "Creating shared tablespace for temporary tables"; - bool create_new_temp_space; srv_tmp_space.set_space_id(SRV_TMP_SPACE_ID); @@ -795,7 +698,7 @@ srv_open_tmp_tablespace(bool create_new_db) } else if (err != DB_SUCCESS) { ib::error() << "Could not create the shared innodb_temporary."; } else if ((err = srv_tmp_space.open_or_create( - true, create_new_db, &sum_of_new_sizes, NULL)) + true, create_new_db, &sum_of_new_sizes)) != DB_SUCCESS) { ib::error() << "Unable to create the shared innodb_temporary"; } else if (fil_system.temp_space->open(true)) { @@ -869,7 +772,7 @@ srv_init_abort_low( " with error " << err << ". You may need" " to delete the ibdata1 file before trying to start" " up again."; - } else { + } else if (srv_operation == SRV_OPERATION_NORMAL) { ib::error() << "Plugin initialization aborted" #ifdef UNIV_DEBUG " at " << innobase_basename(file) << "[" << line << "]" @@ -884,9 +787,8 @@ srv_init_abort_low( /** Prepare to delete the redo log file. Flush the dirty pages from all the buffer pools. Flush the redo log buffer to the redo log file. -@param[in] old_exists old redo log file exists @return lsn upto which data pages have been flushed. */ -static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists) +static lsn_t srv_prepare_to_delete_redo_log_file() { DBUG_ENTER("srv_prepare_to_delete_redo_log_file"); @@ -897,33 +799,33 @@ static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists) /* Clean the buffer pool. 
*/ buf_flush_sync(); - if (log_sys.log.subformat != 2) - srv_log_file_size= 0; - DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0);); DBUG_PRINT("ib_log", ("After innodb_log_abort_1")); mysql_mutex_lock(&log_sys.mutex); - const bool latest_format= (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED) == - log_t::FORMAT_10_5; - lsn_t flushed_lsn= log_sys.get_lsn(); + const bool latest_format{log_sys.is_latest()}; + lsn_t flushed_lsn{log_sys.get_lsn()}; - if (latest_format) + if (latest_format && !(log_sys.file_size & 4095) && + flushed_lsn != log_sys.next_checkpoint_lsn + + (log_sys.is_encrypted() + ? SIZE_OF_FILE_CHECKPOINT + 8 + : SIZE_OF_FILE_CHECKPOINT)) { - fil_names_clear(flushed_lsn, false); + fil_names_clear(flushed_lsn); flushed_lsn= log_sys.get_lsn(); } { const char *msg; - if (!latest_format || srv_log_file_size == 0) + if (!latest_format) { msg= "Upgrading redo log: "; same_size: - ib::info() << msg << ib::bytes_iec(srv_log_file_size_requested) + ib::info() << msg << ib::bytes_iec(srv_log_file_size) << "; LSN=" << flushed_lsn; } - else if (old_exists && srv_log_file_size == srv_log_file_size_requested) + else if (srv_log_file_size == log_sys.file_size) { msg= srv_encrypt_log ? 
"Encrypting redo log: " : "Removing redo log encryption: "; @@ -939,19 +841,15 @@ same_size: : "Removing encryption and resizing"; ib::info() << msg << " redo log from " - << ib::bytes_iec{srv_log_file_size} << " to " - << ib::bytes_iec{srv_log_file_size_requested} + << ib::bytes_iec{log_sys.file_size} << " to " + << ib::bytes_iec{srv_log_file_size} << "; LSN=" << flushed_lsn; } } mysql_mutex_unlock(&log_sys.mutex); - if (flushed_lsn != log_sys.get_flushed_lsn()) - { - log_write_up_to(flushed_lsn, false); - log_sys.log.flush(); - } + log_write_up_to(flushed_lsn, false); ut_ad(flushed_lsn == log_sys.get_lsn()); ut_ad(!buf_pool.any_io_pending()); @@ -959,88 +857,17 @@ same_size: DBUG_RETURN(flushed_lsn); } -/** Tries to locate LOG_FILE_NAME and check it's size, etc -@param[out] log_file_found returns true here if correct file was found -@return dberr_t with DB_SUCCESS or some error */ -static dberr_t find_and_check_log_file(bool &log_file_found) -{ - log_file_found= false; - - auto logfile0= get_log_file_path(); - os_file_stat_t stat_info; - const dberr_t err= os_file_get_status(logfile0.c_str(), &stat_info, false, - srv_read_only_mode); - - auto is_operation_restore= []() -> bool { - return srv_operation == SRV_OPERATION_RESTORE || - srv_operation == SRV_OPERATION_RESTORE_EXPORT; - }; - - if (err == DB_NOT_FOUND) - { - if (is_operation_restore()) - return DB_NOT_FOUND; - - /* This might be first start after mariabackup - copy-back or move-back. 
*/ - srv_start_after_restore= true; - return DB_SUCCESS; - } - - if (stat_info.type != OS_FILE_TYPE_FILE) - return DB_SUCCESS; - - if (!srv_file_check_mode(logfile0.c_str())) - return DB_ERROR; - - const os_offset_t size= stat_info.size; - ut_a(size != (os_offset_t) -1); - - if (size % OS_FILE_LOG_BLOCK_SIZE) - { - ib::error() << "Log file " << logfile0 << " size " << size - << " is not a multiple of " << OS_FILE_LOG_BLOCK_SIZE - << " bytes"; - return DB_ERROR; - } - - if (size == 0 && is_operation_restore()) - { - /* Tolerate an empty LOG_FILE_NAME from a previous run of - mariabackup --prepare. */ - return DB_NOT_FOUND; - } - /* The first log file must consist of at least the following 512-byte pages: - header, checkpoint page 1, empty, checkpoint page 2, redo log page(s). - - Mariabackup --prepare would create an empty LOG_FILE_NAME. Tolerate it. */ - if (size == 0) - srv_start_after_restore= true; - else if (size <= OS_FILE_LOG_BLOCK_SIZE * 4) - { - ib::error() << "Log file " << logfile0 << " size " << size - << " is too small"; - return DB_ERROR; - } - srv_log_file_size= size; - - log_file_found= true; - return DB_SUCCESS; -} - static tpool::task_group rollback_all_recovered_group(1); static tpool::task rollback_all_recovered_task(trx_rollback_all_recovered, - nullptr, - &rollback_all_recovered_group); + nullptr, + &rollback_all_recovered_group); /** Start InnoDB. 
@param[in] create_new_db whether to create a new database @return DB_SUCCESS or error code */ dberr_t srv_start(bool create_new_db) { - lsn_t flushed_lsn; dberr_t err = DB_SUCCESS; - bool srv_log_file_found = true; mtr_t mtr; ut_ad(srv_operation == SRV_OPERATION_NORMAL @@ -1227,7 +1054,6 @@ dberr_t srv_start(bool create_new_db) recv_sys.create(); lock_sys.create(srv_lock_table_size); - if (!srv_read_only_mode) { buf_flush_page_cleaner_init(); ut_ad(buf_page_cleaner_is_active); @@ -1249,7 +1075,7 @@ dberr_t srv_start(bool create_new_db) ulint sum_of_new_sizes; err = srv_sys_space.open_or_create( - false, create_new_db, &sum_of_new_sizes, &flushed_lsn); + false, create_new_db, &sum_of_new_sizes); switch (err) { case DB_SUCCESS: @@ -1271,87 +1097,27 @@ dberr_t srv_start(bool create_new_db) return(srv_init_abort(err)); } - srv_log_file_size_requested = srv_log_file_size; - if (innodb_encrypt_temporary_tables && !log_crypt_init()) { return srv_init_abort(DB_ERROR); } std::string logfile0; - bool create_new_log = create_new_db; if (create_new_db) { - flushed_lsn = log_sys.get_lsn(); - log_sys.set_flushed_lsn(flushed_lsn); + lsn_t flushed_lsn = log_sys.init_lsn(); err = create_log_file(true, flushed_lsn, logfile0); if (err != DB_SUCCESS) { - for (Tablespace::const_iterator - i = srv_sys_space.begin(); - i != srv_sys_space.end(); i++) { + for (const Datafile &file: srv_sys_space) { os_file_delete(innodb_data_file_key, - i->filepath()); - } - return(srv_init_abort(err)); - } - } else { - srv_log_file_size = 0; - - bool log_file_found; - if (dberr_t err = find_and_check_log_file(log_file_found)) { - if (err == DB_NOT_FOUND) { - return DB_SUCCESS; + file.filepath()); } return srv_init_abort(err); } - - create_new_log = srv_log_file_size == 0; - if (create_new_log) { - if (flushed_lsn < lsn_t(1000)) { - ib::error() - << "Cannot create log file because" - " data files are corrupt or the" - " database was not shut down cleanly" - " after creating the data files."; - return 
srv_init_abort(DB_ERROR); - } - - srv_log_file_size = srv_log_file_size_requested; - - err = create_log_file(false, flushed_lsn, logfile0); - - if (err == DB_SUCCESS) { - err = create_log_file_rename(flushed_lsn, - logfile0); - } - - if (err != DB_SUCCESS) { - return(srv_init_abort(err)); - } - - /* Suppress the message about - crash recovery. */ - flushed_lsn = log_sys.get_lsn(); - goto file_checked; - } - - srv_log_file_found = log_file_found; - - log_sys.log.open_file(get_log_file_path()); - - log_sys.log.create(); - - if (!log_set_capacity(srv_log_file_size_requested)) { - return(srv_init_abort(DB_ERROR)); - } - - /* Enable checkpoints in the page cleaner. */ - recv_sys.recovery_on = false; } -file_checked: /* Open log file and data files in the systemtablespace: we keep - them open until database shutdown */ + them open until database shutdown */ ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug); err = fil_system.sys_space->open(create_new_db) @@ -1416,12 +1182,10 @@ file_checked: buf_flush_sync(); - flushed_lsn = log_sys.get_lsn(); - - err = fil_write_flushed_lsn(flushed_lsn); - + const lsn_t lsn{log_sys.get_lsn()}; + err = fil_write_flushed_lsn(lsn); if (err == DB_SUCCESS) { - err = create_log_file_rename(flushed_lsn, logfile0); + err = create_log_file_rename(lsn, logfile0); } if (err != DB_SUCCESS) { @@ -1436,9 +1200,7 @@ file_checked: /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ - err = create_new_log - ? DB_SUCCESS - : recv_recovery_from_checkpoint_start(flushed_lsn); + err = recv_recovery_from_checkpoint_start(); recv_sys.close_files(); recv_sys.dblwr.pages.clear(); @@ -1483,7 +1245,9 @@ file_checked: respective file pages, for the last batch of recv_group_scan_log_recs(). 
*/ + mysql_mutex_lock(&recv_sys.mutex); recv_sys.apply(true); + mysql_mutex_unlock(&recv_sys.mutex); if (recv_sys.is_corrupt_log() || recv_sys.is_corrupt_fs()) { @@ -1582,29 +1346,9 @@ file_checked: recv_sys.debug_free(); - if (srv_operation == SRV_OPERATION_RESTORE - || srv_operation == SRV_OPERATION_RESTORE_EXPORT) { - /* After applying the redo log from - SRV_OPERATION_BACKUP, flush the changes - to the data files and truncate or delete the log. - Unless --export is specified, no further change to - InnoDB files is needed. */ - ut_ad(srv_force_recovery <= SRV_FORCE_IGNORE_CORRUPT); - ut_ad(recv_no_log_write); - err = fil_write_flushed_lsn(log_sys.get_lsn()); - DBUG_ASSERT(!buf_pool.any_io_pending()); - log_sys.log.close_file(); - if (err == DB_SUCCESS) { - bool trunc = srv_operation - == SRV_OPERATION_RESTORE; - if (!trunc) { - delete_log_file("0"); - } else { - auto logfile0 = get_log_file_path(); - /* Truncate the first log file. */ - fclose(fopen(logfile0.c_str(), "w")); - } - } + if (srv_operation != SRV_OPERATION_NORMAL) { + ut_ad(srv_operation == SRV_OPERATION_RESTORE_EXPORT + || srv_operation == SRV_OPERATION_RESTORE); return(err); } @@ -1616,19 +1360,16 @@ file_checked: /* Completely ignore the redo log. */ } else if (srv_read_only_mode) { /* Leave the redo log alone. */ - } else if (srv_log_file_size_requested == srv_log_file_size - && srv_log_file_found - && log_sys.log.format + } else if (log_sys.file_size == srv_log_file_size + && log_sys.format == (srv_encrypt_log - ? log_t::FORMAT_ENC_10_5 - : log_t::FORMAT_10_5) - && log_sys.log.subformat == 2) { + ? log_t::FORMAT_ENC_10_8 + : log_t::FORMAT_10_8)) { /* No need to add or remove encryption, - upgrade, downgrade, or resize. */ + upgrade, or resize. 
*/ } else { /* Prepare to delete the old redo log file */ - flushed_lsn = srv_prepare_to_delete_redo_log_file( - srv_log_file_found); + const lsn_t lsn{srv_prepare_to_delete_redo_log_file()}; DBUG_EXECUTE_IF("innodb_log_abort_1", return(srv_init_abort(DB_ERROR));); @@ -1638,37 +1379,21 @@ file_checked: ut_d(recv_no_log_write = true); DBUG_ASSERT(!buf_pool.any_io_pending()); - DBUG_EXECUTE_IF("innodb_log_abort_3", - return(srv_init_abort(DB_ERROR));); - DBUG_PRINT("ib_log", ("After innodb_log_abort_3")); - - /* Stamp the LSN to the data files. */ - err = fil_write_flushed_lsn(flushed_lsn); - - DBUG_EXECUTE_IF("innodb_log_abort_4", err = DB_ERROR;); - DBUG_PRINT("ib_log", ("After innodb_log_abort_4")); - - if (err != DB_SUCCESS) { - return(srv_init_abort(err)); - } - /* Close the redo log file, so that we can replace it */ - log_sys.log.close_file(); + log_sys.close_file(); + + err = fil_write_flushed_lsn(lsn); DBUG_EXECUTE_IF("innodb_log_abort_5", return(srv_init_abort(DB_ERROR));); DBUG_PRINT("ib_log", ("After innodb_log_abort_5")); - ib::info() - << "Starting to delete and rewrite log file."; - - srv_log_file_size = srv_log_file_size_requested; - - err = create_log_file(false, flushed_lsn, logfile0); + if (err == DB_SUCCESS) { + err = create_log_file(false, lsn, logfile0); + } if (err == DB_SUCCESS) { - err = create_log_file_rename(flushed_lsn, - logfile0); + err = create_log_file_rename(lsn, logfile0); } if (err != DB_SUCCESS) { @@ -1844,10 +1569,18 @@ skip_monitors: srv_is_being_started = false; if (srv_print_verbose_log) { - ib::info() << INNODB_VERSION_STR - << " started; log sequence number " - << recv_sys.recovered_lsn - << "; transaction id " << trx_sys.get_max_trx_id(); + sql_print_information("InnoDB: " + "log sequence number " LSN_PF +#ifdef HAVE_PMEM + "%s" +#endif + "; transaction id " TRX_ID_FMT, + recv_sys.lsn, +#ifdef HAVE_PMEM + log_sys.is_pmem() + ? 
" (memory-mapped)" : "", +#endif + trx_sys.get_max_trx_id()); } if (srv_force_recovery == 0) { diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 350551a88b1..3137075e232 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2021, MariaDB Corporation. +Copyright (c) 2015, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1130,29 +1130,20 @@ static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx) if (!srv_flush_log_at_trx_commit) return; - if (log_sys.get_flushed_lsn() > lsn) + if (log_sys.get_flushed_lsn(std::memory_order_relaxed) >= lsn) return; - const bool flush= srv_file_flush_method != SRV_NOSYNC && - (srv_flush_log_at_trx_commit & 1); + completion_callback cb, *callback= nullptr; - if (trx->state == TRX_STATE_PREPARED) + if (trx->state != TRX_STATE_PREPARED && !log_sys.is_pmem() && + (cb.m_param= innodb_thd_increment_pending_ops(trx->mysql_thd))) { - /* XA, which is used with binlog as well. 
- Be conservative, use synchronous wait.*/ -sync: - log_write_up_to(lsn, flush); - return; + cb.m_callback= (void (*)(void *)) thd_decrement_pending_ops; + callback= &cb; } - completion_callback cb; - if ((cb.m_param = innodb_thd_increment_pending_ops(trx->mysql_thd))) - { - cb.m_callback = (void (*)(void *)) thd_decrement_pending_ops; - log_write_up_to(lsn, flush, false, &cb); - } - else - goto sync; + log_write_up_to(lsn, srv_file_flush_method != SRV_NOSYNC && + (srv_flush_log_at_trx_commit & 1), callback); } /**********************************************************************//** diff --git a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result index 8f0357a8954..505f394876c 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result @@ -138,9 +138,6 @@ os_data_fsyncs os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status os_pending_reads os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of reads pending os_pending_writes os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of writes pending os_log_bytes_written os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Bytes of log written (innodb_os_log_written) -os_log_fsyncs os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of fsync log writes (innodb_os_log_fsyncs) -os_log_pending_fsyncs os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of pending fsync write (innodb_os_log_pending_fsyncs) -os_log_pending_writes os 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of pending log file writes (innodb_os_log_pending_writes) trx_rw_commits transaction 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of read-write transactions committed 
trx_ro_commits transaction 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of read-only transactions committed trx_nl_ro_commits transaction 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of non-locking auto-commit read-only transactions committed @@ -158,20 +155,17 @@ purge_undo_log_pages purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL purge_dml_delay_usec purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Microseconds DML to be delayed due to purge lagging purge_stop_count purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of times purge was stopped purge_resume_count purge 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of times purge was resumed -log_checkpoints recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of checkpoints +log_checkpoints recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of checkpoints log_lsn_last_flush recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value LSN of Last flush log_lsn_last_checkpoint recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value LSN at last checkpoint log_lsn_current recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Current LSN value log_lsn_checkpoint_age recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Current LSN value minus LSN at last checkpoint log_lsn_buf_pool_oldest recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value The oldest modified block LSN in the buffer pool log_max_modified_age_async recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Maximum LSN difference; when exceeded, start asynchronous preflush -log_pending_log_flushes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Pending log flushes -log_pending_checkpoint_writes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value 
Pending checkpoints log_num_log_io recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 value Number of log I/Os log_waits recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of log waits due to small log buffer (innodb_log_waits) log_write_requests recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of log write requests (innodb_log_write_requests) log_writes recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of log writes (innodb_log_writes) -log_padded recovery 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Bytes of log padded for log write ahead compress_pages_compressed compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages compressed compress_pages_decompressed compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages decompressed compression_pad_increments compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of times padding is incremented to avoid compression failures