1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention

Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored
for system tablespace on SSD

When the maximum configured number of files is exceeded, InnoDB will
close data files. We used to maintain a fil_system.LRU list and
a counter fil_node_t::n_pending to achieve this, at the huge cost
of multiple fil_system.mutex operations per I/O operation.

fil_node_open_file_low(): Implement a FIFO replacement policy:
The last opened file will be moved to the end of fil_system.space_list,
and files will be closed from the start of the list. However, we will
not move tablespaces in fil_system.space_list while
i_s_tablespaces_encryption_fill_table() is executing
(producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION)
because it may cause information of some tablespaces to go missing.
We also avoid this in mariabackup --backup because datafiles_iter_next()
assumes that the ordering is not changed.

IORequest: Fold more parameters to IORequest::type.

fil_space_t::io(): Replaces fil_io().

fil_space_t::flush(): Replaces fil_flush().

OS_AIO_IBUF: Remove. We will always issue synchronous reads of the
change buffer pages in buf_read_page_low().

We will always ignore some errors for background reads.

This should reduce fil_system.mutex contention a little.

fil_node_t::complete_write(): Replaces fil_node_t::complete_io().
On both read and write completion, fil_space_t::release_for_io()
will have to be called.

fil_space_t::io(): Do not acquire fil_system.mutex in the normal
code path.

xb_delta_open_matching_space(): Do not try to open the system tablespace
which was already opened. This fixes a file sharing violation in
mariabackup --prepare --incremental.

Reviewed by: Vladislav Vaintroub
This commit is contained in:
Marko Mäkelä
2020-10-26 15:59:30 +02:00
parent 3a9a3be1c6
commit 45ed9dd957
39 changed files with 1301 additions and 1898 deletions

View File

@@ -93,7 +93,6 @@ xb_fil_node_close_file(
mutex_enter(&fil_system.mutex);
ut_ad(node);
ut_a(node->n_pending == 0);
ut_a(node->n_pending_flushes == 0);
ut_a(!node->being_extended);
@@ -108,20 +107,10 @@ xb_fil_node_close_file(
ut_a(ret);
node->handle = OS_FILE_CLOSED;
mutex_exit(&fil_system.mutex);
ut_a(fil_system.n_open > 0);
fil_system.n_open--;
if (node->space->purpose == FIL_TYPE_TABLESPACE &&
fil_is_user_tablespace_id(node->space->id)) {
ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0);
/* The node is in the LRU list, remove it */
UT_LIST_REMOVE(fil_system.LRU, node);
}
mutex_exit(&fil_system.mutex);
}
/************************************************************************
@@ -180,18 +169,8 @@ xb_fil_cur_open(
return(XB_FIL_CUR_SKIP);
}
mutex_enter(&fil_system.mutex);
fil_system.n_open++;
if (node->space->purpose == FIL_TYPE_TABLESPACE &&
fil_is_user_tablespace_id(node->space->id)) {
/* Put the node to the LRU list */
UT_LIST_ADD_FIRST(fil_system.LRU, node);
}
mutex_exit(&fil_system.mutex);
}
ut_ad(node->is_open());
@@ -427,7 +406,7 @@ xb_fil_cur_read(
retry_count = 10;
ret = XB_FIL_CUR_SUCCESS;
fil_space_t *space = fil_space_acquire_for_io(cursor->space_id);
fil_space_t *space = fil_space_t::get_for_io(cursor->space_id);
if (!space) {
return XB_FIL_CUR_ERROR;

View File

@@ -3011,6 +3011,7 @@ void
xb_fil_io_init()
{
fil_system.create(srv_file_per_table ? 50000 : 5000);
fil_system.freeze_space_list = 1;
fil_system.space_id_reuse_warned = true;
}
@@ -3087,24 +3088,16 @@ xb_load_single_table_tablespace(
bool is_empty_file = file->exists() && file->is_empty_file();
if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) {
os_offset_t node_size = os_file_get_size(file->handle());
os_offset_t n_pages;
ut_a(node_size != (os_offset_t) -1);
n_pages = node_size / fil_space_t::physical_size(file->flags());
space = fil_space_create(
space = fil_space_t::create(
name, file->space_id(), file->flags(),
FIL_TYPE_TABLESPACE, NULL/* TODO: crypt_data */);
ut_a(space != NULL);
space->add(file->filepath(), OS_FILE_CLOSED, uint32_t(n_pages),
false, false);
space->add(file->filepath(), OS_FILE_CLOSED, 0, false, false);
/* by opening the tablespace we forcing node and space objects
in the cache to be populated with fields from space header */
space->open();
space->get_size();
if (srv_operation == SRV_OPERATION_RESTORE_DELTA
|| xb_close_files) {
@@ -3406,19 +3399,6 @@ xb_load_tablespaces()
return(DB_SUCCESS);
}
/************************************************************************
Initialize the tablespace memory cache and populate it by scanning for and
opening data files.
@returns DB_SUCCESS or error code.*/
static
dberr_t
xb_data_files_init()
{
xb_fil_io_init();
return(xb_load_tablespaces());
}
/** Destroy the tablespace memory cache. */
static void xb_data_files_close()
{
@@ -4607,6 +4587,22 @@ xb_delta_open_matching_space(
return file;
}
if (!info.space_id && fil_system.sys_space) {
fil_node_t *node
= UT_LIST_GET_FIRST(fil_system.sys_space->chain);
for (; node; node = UT_LIST_GET_NEXT(chain, node)) {
if (!strcmp(node->name, real_name)) {
break;
}
}
if (node && node->handle != OS_FILE_CLOSED) {
*success = true;
return node->handle;
}
msg("mariabackup: Cannot find file %s\n", real_name);
return OS_FILE_CLOSED;
}
log_mutex_enter();
if (!fil_is_user_tablespace_id(info.space_id)) {
found:
@@ -4704,7 +4700,7 @@ exit:
ut_ad(fil_space_t::zip_size(flags) == info.zip_size);
ut_ad(fil_space_t::physical_size(flags) == info.page_size);
if (fil_space_create(dest_space_name, info.space_id, flags,
if (fil_space_t::create(dest_space_name, info.space_id, flags,
FIL_TYPE_TABLESPACE, 0)) {
*success = xb_space_create_file(real_name, info.space_id,
flags, &file);
@@ -4925,7 +4921,7 @@ xtrabackup_apply_delta(
os_file_close(src_file);
os_file_delete(0,src_path);
}
if (dst_file != OS_FILE_CLOSED)
if (dst_file != OS_FILE_CLOSED && info.space_id)
os_file_close(dst_file);
return TRUE;
@@ -4933,7 +4929,7 @@ error:
aligned_free(incremental_buffer);
if (src_file != OS_FILE_CLOSED)
os_file_close(src_file);
if (dst_file != OS_FILE_CLOSED)
if (dst_file != OS_FILE_CLOSED && info.space_id)
os_file_close(dst_file);
msg("Error: xtrabackup_apply_delta(): "
"failed to apply %s to %s.\n", src_path, dst_path);
@@ -5387,8 +5383,8 @@ static bool xtrabackup_prepare_func(char** argv)
srv_allow_writes_event = os_event_create(0);
os_event_set(srv_allow_writes_event);
#endif
dberr_t err = xb_data_files_init();
if (err != DB_SUCCESS) {
xb_fil_io_init();
if (dberr_t err = xb_load_tablespaces()) {
msg("mariabackup: error: xb_data_files_init() failed "
"with error %s\n", ut_strerr(err));
goto error_cleanup;
@@ -5396,7 +5392,8 @@ static bool xtrabackup_prepare_func(char** argv)
inc_dir_tables_hash.create(1000);
ok = xtrabackup_apply_deltas();
ok = fil_system.sys_space->open(false)
&& xtrabackup_apply_deltas();
xb_data_files_close();
@@ -5426,6 +5423,8 @@ static bool xtrabackup_prepare_func(char** argv)
goto error_cleanup;
}
fil_system.freeze_space_list = 0;
/* increase IO threads */
if (srv_n_file_io_threads < 10) {
srv_n_read_io_threads = 4;
@@ -5447,6 +5446,8 @@ static bool xtrabackup_prepare_func(char** argv)
goto error_cleanup;
}
ut_ad(!fil_system.freeze_space_list);
if (ok) {
msg("Last binlog file %s, position %lld",
trx_sys.recovered_binlog_filename,

View File

@@ -29,6 +29,7 @@ create table t1(a int not null primary key, b char(200)) engine=innodb;
--source include/wait_condition.inc
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--echo # Success!
@@ -41,6 +42,7 @@ SET GLOBAL innodb_encrypt_tables = off;
--let $wait_condition=SELECT COUNT(*) = $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0;
--source include/wait_condition.inc
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
@@ -51,6 +53,7 @@ SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_
--let $restart_parameters=--skip-file-key-management --innodb-encrypt-tables=OFF --innodb-encryption-threads=0 --innodb-tablespaces-encryption
-- source include/restart_mysqld.inc
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

View File

@@ -26,6 +26,7 @@ let $restart_parameters= --innodb_encryption_threads=5 --innodb_encryption_rotat
--source include/wait_condition.inc
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--echo # Restart the server with innodb_encryption_rotate_key_age= 0
@@ -45,6 +46,7 @@ create table t4 (f1 int not null)engine=innodb encrypted=NO;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--echo # Disable encryption when innodb_encryption_rotate_key_age is 0
@@ -57,6 +59,7 @@ set global innodb_encrypt_tables = OFF;
--let $wait_condition=SELECT COUNT(*) >= $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0;
--source include/wait_condition.inc
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--echo # Display only encrypted create tables (t3)
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
@@ -73,11 +76,13 @@ set global innodb_encrypt_tables = ON;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--echo # Display only unencrypted create tables (t4)
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--let $restart_parameters=
-- source include/restart_mysqld.inc
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
DROP TABLE t4, t3, t2, t1;

View File

@@ -1,4 +1,4 @@
call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded");
call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded");
SET @save_tdc= @@GLOBAL.table_definition_cache;
SET @save_toc= @@GLOBAL.table_open_cache;
SET GLOBAL table_definition_cache= 400;

View File

@@ -32,18 +32,6 @@ commit;
set autocommit=1;
let $success= `SELECT variable_value FROM information_schema.global_status WHERE variable_name = 'innodb_num_page_compressed_trim_op'`;
if (!$success) {
--disable_query_log
--disable_result_log
DROP PROCEDURE innodb_insert_proc;
DROP TABLE innodb_page_compressed;
--enable_query_log
--enable_result_log
--skip "Test requires TRIM";
}
DROP PROCEDURE innodb_insert_proc;
DROP TABLE innodb_page_compressed;

View File

@@ -4,7 +4,7 @@
# This test is slow on buildbot.
--source include/big_test.inc
call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded");
call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded");
SET @save_tdc= @@GLOBAL.table_definition_cache;
SET @save_toc= @@GLOBAL.table_open_cache;

View File

@@ -1,3 +1,4 @@
# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2014, 2020, MariaDB Corporation.
#
@@ -186,7 +187,6 @@ SET(INNOBASE_SOURCES
include/mtr0mtr.h
include/mtr0mtr.ic
include/mtr0types.h
include/os0api.h
include/os0event.h
include/os0file.h
include/os0file.ic

View File

@@ -3304,22 +3304,35 @@ upd_sys:
/**
Prefetch siblings of the leaf for the pessimistic operation.
@param block leaf page */
static void btr_cur_prefetch_siblings(const buf_block_t* block)
@param block leaf page
@param index index of the page */
static void btr_cur_prefetch_siblings(const buf_block_t *block,
const dict_index_t *index)
{
const page_t *page= block->frame;
ut_ad(page_is_leaf(page));
ut_ad(page_is_leaf(block->frame));
if (index->is_ibuf())
return;
const page_t *page= block->frame;
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
if (prev != FIL_NULL)
buf_read_page_background(page_id_t(block->page.id().space(), prev),
{
ut_a(index->table->space->acquire_for_io());
buf_read_page_background(index->table->space,
page_id_t(block->page.id().space(), prev),
block->zip_size(), false);
}
if (next != FIL_NULL)
buf_read_page_background(page_id_t(block->page.id().space(), next),
{
ut_a(index->table->space->acquire_for_io());
buf_read_page_background(index->table->space,
page_id_t(block->page.id().space(), next),
block->zip_size(), false);
}
}
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
@@ -3436,8 +3449,8 @@ fail:
/* prefetch siblings of the leaf for the pessimistic
operation, if the page is leaf. */
if (page_is_leaf(page) && !index->is_ibuf()) {
btr_cur_prefetch_siblings(block);
if (page_is_leaf(page)) {
btr_cur_prefetch_siblings(block, index);
}
fail_err:
@@ -4575,7 +4588,7 @@ any_extern:
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, index);
return(DB_OVERFLOW);
}
@@ -4766,10 +4779,10 @@ func_exit:
}
}
if (err != DB_SUCCESS && !index->is_ibuf()) {
if (err != DB_SUCCESS) {
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, index);
}
return(err);
@@ -5481,7 +5494,7 @@ btr_cur_optimistic_delete_func(
if (!no_compress_needed) {
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, cursor->index);
goto func_exit;
}

View File

@@ -2768,7 +2768,7 @@ buf_zip_decompress(
ulint size = page_zip_get_size(&block->page.zip);
/* The tablespace will not be found if this function is called
during IMPORT. */
fil_space_t* space= fil_space_acquire_for_io(block->page.id().space());
fil_space_t* space= fil_space_t::get_for_io(block->page.id().space());
const unsigned key_version = mach_read_from_4(
frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
@@ -3034,10 +3034,9 @@ buf_page_get_low(
/* fall through */
case BUF_GET:
case BUF_GET_IF_IN_POOL_OR_WATCH:
fil_space_t* s = fil_space_acquire_for_io(page_id.space());
fil_space_t* s = fil_space_get(page_id.space());
ut_ad(s);
ut_ad(s->zip_size() == zip_size);
s->release_for_io();
}
#endif /* UNIV_DEBUG */
@@ -3107,7 +3106,7 @@ lookup:
}
/* The call path is buf_read_page() ->
buf_read_page_low() (fil_io()) ->
buf_read_page_low() (fil_space_t::io()) ->
buf_page_read_complete() ->
buf_decrypt_after_read(). Here fil_space_t* is used
and we decrypt -> buf_page_check_corrupt() where page
@@ -3161,8 +3160,7 @@ lookup:
asserting. */
if (page_id.space() == TRX_SYS_SPACE) {
} else if (page_id.space() == SRV_TMP_SPACE_ID) {
} else if (fil_space_t* space
= fil_space_acquire_for_io(
} else if (fil_space_t* space= fil_space_t::get_for_io(
page_id.space())) {
bool set = dict_set_corrupted_by_space(space);
space->release_for_io();
@@ -3376,8 +3374,8 @@ re_evict:
if (mode != BUF_GET_IF_IN_POOL
&& mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
} else if (!ibuf_debug) {
} else if (fil_space_t* space =
fil_space_acquire_for_io(page_id.space())) {
} else if (fil_space_t* space
= fil_space_t::get_for_io(page_id.space())) {
/* Try to evict the block from the buffer pool, to use the
insert buffer (change buffer) as much as possible. */
@@ -4869,17 +4867,4 @@ std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
<< ", page number=" << page_id.page_no() << "]";
return out;
}
/**
Calculate the length of trim (punch_hole) operation.
@param[in] bpage Page control block
@param[in] write_length Write length
@return length of the trim or zero. */
ulint
buf_page_get_trim_length(
const buf_page_t* bpage,
ulint write_length)
{
return bpage->physical_size() - write_length;
}
#endif /* !UNIV_INNOCHECKSUM */

View File

@@ -125,7 +125,8 @@ too_small:
byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
trx_sys_block->frame;
for (uint32_t prev_page_no= 0, i= 0; i < 2 * size + FSP_EXTENT_SIZE / 2; i++)
for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
i < 2 * size + extent_size / 2; i++)
{
buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
FSP_UP, &mtr);
@@ -362,15 +363,13 @@ void buf_dblwr_t::recover()
continue;
}
fil_space_t* space= fil_space_acquire_for_io(space_id);
fil_space_t *space= fil_space_t::get_for_io(space_id);
if (!space)
/* The tablespace that this page once belonged to does not exist */
continue;
fil_space_open_if_needed(space);
if (UNIV_UNLIKELY(page_no >= space->size))
if (UNIV_UNLIKELY(page_no >= space->get_size()))
{
/* Do not report the warning for undo tablespaces, because they
can be truncated in place. */
@@ -385,7 +384,6 @@ next_page:
}
const ulint physical_size= space->physical_size();
const ulint zip_size= space->zip_size();
ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
/* We want to ensure that for partial reads the unread portion of
@@ -393,18 +391,15 @@ next_page:
memset(read_buf, 0x0, physical_size);
/* Read in the actual page from the file */
fil_io_t fio= fil_io(IORequest(IORequest::READ | IORequest::DBLWR_RECOVER),
true, page_id, zip_size,
0, physical_size, read_buf, nullptr);
fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
os_offset_t{page_no} * physical_size,
physical_size, read_buf);
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
ib::warn() << "Double write buffer recovery: " << page_id
<< " (tablespace '" << space->name
<< "') read failed with error: " << fio.err;
if (fio.node)
fio.node->space->release_for_io();
if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
{
/* We will check if the copy in the doublewrite buffer is
@@ -425,18 +420,16 @@ next_page:
/* Write the good page from the doublewrite buffer to the intended
position. */
fio= fil_io(IORequestWrite, true, page_id, zip_size, 0, physical_size,
page, nullptr);
space->reacquire_for_io();
fio= space->io(IORequestWrite,
os_offset_t{page_id.page_no()} * physical_size,
physical_size, page);
if (fio.node)
{
ut_ad(fio.err == DB_SUCCESS);
if (fio.err == DB_SUCCESS)
ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
<< "' from the doublewrite buffer.";
fio.node->space->release_for_io();
goto next_page;
}
}
recv_sys.dblwr.pages.clear();
fil_flush_file_spaces();
@@ -513,7 +506,7 @@ static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
{
if (fil_space_t *space= fil_space_acquire_for_io(b.id().space()))
if (fil_space_t *space= fil_space_t::get_for_io(b.id().space()))
{
buf_dblwr_check_page_lsn(page, *space);
space->release_for_io();
@@ -577,7 +570,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
#ifdef UNIV_DEBUG
for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
{
buf_page_t *bpage= buf_block_arr[i].bpage;
buf_page_t *bpage= buf_block_arr[i].request.bpage;
if (bpage->zip.data)
/* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
@@ -590,18 +583,22 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
}
#endif /* UNIV_DEBUG */
/* Write out the first block of the doublewrite buffer */
fil_io_t fio= fil_io(IORequestWrite, true, block1, 0, 0,
std::min(size, old_first_free) << srv_page_size_shift,
write_buf, nullptr);
fio.node->space->release_for_io();
ut_a(fil_system.sys_space->acquire_for_io());
fil_system.sys_space->io(IORequestWrite,
os_offset_t{block1.page_no()} <<
srv_page_size_shift,
std::min(size, old_first_free) <<
srv_page_size_shift, write_buf);
if (old_first_free > size)
{
/* Write out the second block of the doublewrite buffer. */
fio= fil_io(IORequestWrite, true, block2, 0, 0,
ut_a(fil_system.sys_space->acquire_for_io());
fil_system.sys_space->io(IORequestWrite,
os_offset_t{block2.page_no()} <<
srv_page_size_shift,
(old_first_free - size) << srv_page_size_shift,
write_buf + (size << srv_page_size_shift), nullptr);
fio.node->space->release_for_io();
write_buf + (size << srv_page_size_shift));
}
/* increment the doublewrite flushed pages counter */
@@ -609,7 +606,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
srv_stats.dblwr_writes.inc();
/* Now flush the doublewrite buffer data to disk */
fil_flush(TRX_SYS_SPACE);
fil_system.sys_space->flush();
/* We know that the writes have been flushed to disk now
and in recovery we will find them in the doublewrite buffer
@@ -629,8 +626,8 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
for (ulint i= 0; i < old_first_free; i++)
{
auto e= buf_block_arr[i];
buf_page_t* bpage= e.bpage;
ut_a(bpage->in_file());
buf_page_t* bpage= e.request.bpage;
ut_ad(bpage->in_file());
/* We request frame here to get correct buffer in case of
encryption and/or page compression */
@@ -650,8 +647,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
}
fil_io(IORequest(IORequest::WRITE, bpage, e.lru), false,
bpage->id(), bpage->zip_size(), 0, e_size, frame, bpage);
e.space->io(e.request, bpage->physical_offset(), e_size, frame, bpage);
}
return true;
@@ -680,12 +676,20 @@ void buf_dblwr_t::flush_buffered_writes()
/** Schedule a page write. If the doublewrite memory buffer is full,
flush_buffered_writes() will be invoked to make space.
@param bpage buffer pool page to be written
@param lru true=buf_pool.LRU; false=buf_pool.flush_list
@param space tablespace
@param request asynchronous write request
@param size payload size in bytes */
void buf_dblwr_t::add_to_batch(buf_page_t *bpage, bool lru, size_t size)
void buf_dblwr_t::add_to_batch(fil_space_t *space, const IORequest &request,
size_t size)
{
ut_ad(bpage->in_file());
ut_ad(request.is_async());
ut_ad(request.is_write());
ut_ad(request.bpage);
ut_ad(request.bpage->in_file());
ut_ad(space->id == request.bpage->id().space());
ut_ad(space->pending_io());
ut_ad(!srv_read_only_mode);
const ulint buf_size= 2 * block_size();
mysql_mutex_lock(&mutex);
@@ -707,13 +711,13 @@ void buf_dblwr_t::add_to_batch(buf_page_t *bpage, bool lru, size_t size)
/* We request frame here to get correct buffer in case of
encryption and/or page compression */
void *frame= buf_page_get_frame(bpage);
void *frame= buf_page_get_frame(request.bpage);
memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(p, frame, size);
ut_ad(!bpage->zip_size() || bpage->zip_size() == size);
ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
ut_ad(reserved == first_free);
ut_ad(reserved < buf_size);
buf_block_arr[first_free++]= { bpage, lru, size };
new (buf_block_arr + first_free++) element{space, request, size};
reserved= first_free;
if (first_free != buf_size || !flush_buffered_writes(buf_size / 2))

View File

@@ -626,6 +626,14 @@ buf_load()
so all pages from a given tablespace are consecutive. */
ulint cur_space_id = dump[0].space();
fil_space_t* space = fil_space_acquire_silent(cur_space_id);
if (space) {
bool ok = space->acquire_for_io();
space->release();
if (!ok) {
space = nullptr;
}
}
ulint zip_size = space ? space->zip_size() : 0;
PSI_stage_progress* pfs_stage_progress __attribute__((unused))
@@ -644,22 +652,32 @@ buf_load()
}
if (this_space_id != cur_space_id) {
if (space != NULL) {
space->release();
if (space) {
space->release_for_io();
}
cur_space_id = this_space_id;
space = fil_space_acquire_silent(cur_space_id);
if (space != NULL) {
zip_size = space->zip_size();
if (!space) {
continue;
}
bool ok = space->acquire_for_io();
space->release();
if (!ok) {
space = nullptr;
continue;
}
zip_size = space->zip_size();
}
/* JAN: TODO: As we use background page read below,
if tablespace is encrypted we cant use it. */
if (space == NULL ||
(space && space->crypt_data &&
if (!space || dump[i].page_no() >= space->get_size() ||
(space->crypt_data &&
space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
continue;
@@ -671,11 +689,12 @@ buf_load()
continue;
}
buf_read_page_background(dump[i], zip_size, true);
space->reacquire_for_io();
buf_read_page_background(space, dump[i], zip_size, true);
if (buf_load_abort_flag) {
if (space != NULL) {
space->release();
if (space) {
space->release_for_io();
}
buf_load_abort_flag = false;
ut_free(dump);
@@ -702,8 +721,8 @@ buf_load()
#endif
}
if (space != NULL) {
space->release();
if (space) {
space->release_for_io();
}
ut_free(dump);

View File

@@ -782,6 +782,11 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
{
ut_ad(bpage->in_file());
ut_ad(bpage->ready_for_flush());
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space));
ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
space->atomic_write_supported);
ut_ad(space->pending_io());
rw_lock_t *rw_lock;
@@ -807,11 +812,6 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
the buffer pool or removed from flush_list or LRU_list. */
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space));
ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
space->atomic_write_supported);
DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
lru ? "LRU" : "flush_list",
bpage->id().space(), bpage->id().page_no()));
@@ -850,19 +850,22 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
}
}
if (status == buf_page_t::FREED)
buf_release_freed_page(&block->page);
else
{
space->reacquire_for_io();
ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
size_t size, orig_size;
ulint type= IORequest::WRITE;
IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
{
ut_ad(!space->full_crc32());
ut_ad(!space->is_compressed()); /* not page_compressed */
orig_size= size= bpage->zip_size();
if (status != buf_page_t::FREED)
{
buf_flush_update_zip_checksum(frame, orig_size);
buf_flush_update_zip_checksum(frame, size);
frame= buf_page_encrypt(space, bpage, frame, &size);
}
ut_ad(size == bpage->zip_size());
}
else
@@ -870,8 +873,7 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
byte *page= block->frame;
orig_size= size= block->physical_size();
if (status == buf_page_t::FREED);
else if (space->full_crc32())
if (space->full_crc32())
{
/* innodb_checksum_algorithm=full_crc32 is not implemented for
ROW_FORMAT=COMPRESSED pages. */
@@ -888,44 +890,26 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
if (size != orig_size && space->punch_hole)
type|= IORequest::PUNCH_HOLE;
type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
#else
DBUG_EXECUTE_IF("ignore_punch_hole",
if (size != orig_size && space->punch_hole)
type|= IORequest::PUNCH_HOLE;);
type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;);
#endif
frame=page;
}
IORequest request(type, bpage, lru);
ut_ad(status == bpage->status);
switch (status) {
default:
ut_ad(status == buf_page_t::FREED);
buf_release_freed_page(bpage);
break;
case buf_page_t::NORMAL:
if (space->use_doublewrite())
{
ut_ad(!srv_read_only_mode);
if (lru)
buf_pool.n_flush_LRU++;
else
buf_pool.n_flush_list++;
buf_dblwr.add_to_batch(bpage, lru, size);
break;
}
/* fall through */
case buf_page_t::INIT_ON_FLUSH:
if (lru)
buf_pool.n_flush_LRU++;
if (status != buf_page_t::NORMAL || !space->use_doublewrite())
space->io(IORequest(type, bpage),
bpage->physical_offset(), size, frame, bpage);
else
buf_pool.n_flush_list++;
/* FIXME: pass space to fil_io() */
fil_io(request, false, bpage->id(), bpage->zip_size(), 0,
bpage->physical_size(), frame, bpage);
buf_dblwr.add_to_batch(space, IORequest(type, bpage), size);
}
/* Increment the I/O operation count used for selecting LRU policy. */
@@ -973,8 +957,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
? static_cast<uint32_t>(s) : read_ahead;
page_id_t low= id - (id.page_no() % buf_flush_area);
page_id_t high= low + buf_flush_area;
high.set_page_no(std::min(high.page_no(),
static_cast<uint32_t>(space.committed_size - 1)));
high.set_page_no(std::min(high.page_no(), space.last_page_number()));
if (!contiguous)
{
@@ -1018,13 +1001,12 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
return i;
}
MY_ATTRIBUTE((nonnull))
/** Write punch-hole or zeroes of the freed ranges when
innodb_immediate_scrub_data_uncompressed from the freed ranges.
@param[in] space tablespace which contains freed ranges
@param[in] freed_ranges freed ranges of the page to be flushed */
@param space tablespace which may contain ranges of freed pages */
static void buf_flush_freed_pages(fil_space_t *space)
{
ut_ad(space != NULL);
const bool punch_hole= space->punch_hole;
if (!srv_immediate_scrub_data_uncompressed && !punch_hole)
return;
@@ -1043,27 +1025,24 @@ static void buf_flush_freed_pages(fil_space_t *space)
for (const auto &range : freed_ranges)
{
ulint page_size= space->zip_size();
if (!page_size)
page_size= srv_page_size;
const ulint physical_size= space->physical_size();
if (punch_hole)
{
const auto len= (range.last - range.first + 1) * page_size;
const page_id_t page_id(space->id, range.first);
fil_io_t fio= fil_io(IORequestWrite, true, page_id, space->zip_size(),
0, len, nullptr, nullptr, false, true);
if (fio.node)
fio.node->space->release_for_io();
space->reacquire_for_io();
space->io(IORequest(IORequest::PUNCH_RANGE),
os_offset_t{range.first} * physical_size,
(range.last - range.first + 1) * physical_size,
nullptr);
}
else if (srv_immediate_scrub_data_uncompressed)
{
for (auto i= range.first; i <= range.last; i++)
for (os_offset_t i= range.first; i <= range.last; i++)
{
const page_id_t page_id(space->id, i);
fil_io(IORequestWrite, false, page_id, space->zip_size(), 0,
space->zip_size() ? space->zip_size() : srv_page_size,
const_cast<byte*>(field_ref_zero), nullptr, false, false);
space->reacquire_for_io();
space->io(IORequest(IORequest::WRITE_ASYNC),
i * physical_size, physical_size,
const_cast<byte*>(field_ref_zero));
}
}
buf_pool.stat.n_pages_written+= (range.last - range.first + 1);
@@ -1093,7 +1072,8 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
ut_ad(page_id >= id);
ut_ad(page_id < high);
for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold)
for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
++id, ++id_fold)
{
if (count + n_flushed >= n_to_flush)
{
@@ -1190,7 +1170,7 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
@retval nullptr if the pages for this tablespace should be discarded */
static fil_space_t *buf_flush_space(const uint32_t id)
{
fil_space_t *space= fil_space_acquire_for_io(id);
fil_space_t *space= fil_space_t::get_for_io(id);
if (space)
buf_flush_freed_pages(space);
return space;
@@ -1204,6 +1184,37 @@ struct flush_counters_t
ulint evicted;
};
/** Try to discard a dirty page.
@param bpage dirty page whose tablespace is not accessible */
static void buf_flush_discard_page(buf_page_t *bpage)
{
/* Caller holds buf_pool.mutex but must not hold flush_list_mutex,
which we will acquire below. */
mysql_mutex_assert_owner(&buf_pool.mutex);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
ut_ad(bpage->in_file());
ut_ad(bpage->oldest_modification());
rw_lock_t *rw_lock;
if (bpage->state() != BUF_BLOCK_FILE_PAGE)
rw_lock= nullptr;
else
{
rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
/* Non-waiting latch acquisition: if another thread holds the block
latch, give up; a later scan may retry the discard. */
if (!rw_lock_sx_lock_nowait(rw_lock, 0))
return;
}
bpage->status= buf_page_t::NORMAL;
/* Detach the page from buf_pool.flush_list under its mutex. */
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_flush_remove(bpage);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (rw_lock)
rw_lock_sx_unlock(rw_lock);
/* Evict the block from the buffer pool without writing it back. */
buf_LRU_free_page(bpage, true);
}
/** Flush dirty blocks from the end of the LRU list.
@param max maximum number of blocks to make available in buf_pool.free
@param n counts of flushed and evicted pages */
@@ -1219,6 +1230,9 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
? 0 : srv_flush_neighbors;
fil_space_t *space= nullptr;
uint32_t last_space_id= FIL_NULL;
static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
bpage && n->flushed + n->evicted < max &&
@@ -1243,14 +1257,26 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
if (!space || space->id != space_id)
{
if (last_space_id != space_id)
{
if (space)
space->release_for_io();
space= buf_flush_space(space_id);
if (!space)
continue;
last_space_id= space_id;
}
if (neighbors && space->is_rotational())
else
ut_ad(!space);
}
else if (space->is_stopping())
{
space->release_for_io();
space= nullptr;
}
if (!space)
buf_flush_discard_page(bpage);
else if (neighbors && space->is_rotational())
{
mysql_mutex_unlock(&buf_pool.mutex);
n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
@@ -1328,6 +1354,9 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
? 0 : srv_flush_neighbors;
fil_space_t *space= nullptr;
uint32_t last_space_id= FIL_NULL;
static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
/* Start from the end of the list looking for a suitable block to be
flushed. */
@@ -1360,14 +1389,26 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
if (!space || space->id != space_id)
{
if (last_space_id != space_id)
{
if (space)
space->release_for_io();
space= buf_flush_space(space_id);
if (!space)
continue;
last_space_id= space_id;
}
if (neighbors && space->is_rotational())
else
ut_ad(!space);
}
else if (space->is_stopping())
{
space->release_for_io();
space= nullptr;
}
if (!space)
buf_flush_discard_page(bpage);
else if (neighbors && space->is_rotational())
{
mysql_mutex_unlock(&buf_pool.mutex);
count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
@@ -1476,10 +1517,9 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn)
while not holding buf_pool.flush_list_mutex */
if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
{
mysql_mutex_unlock(&buf_pool.mutex);
if (running)
return 0;
if (!running)
mysql_cond_broadcast(cond);
mysql_mutex_unlock(&buf_pool.mutex);
return 0;
}
n_flush++;

View File

@@ -261,26 +261,23 @@ flag is cleared and the x-lock released by an i/o-handler thread.
@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED
if we are trying
to read from a non-existent tablespace
@param[in,out] space tablespace
@param[in] sync true if synchronous aio is desired
@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] unzip true=request uncompressed page
@param[in] ignore whether to ignore out-of-bounds page_id
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
@return whether a read request was queued */
static
ulint
bool
buf_read_page_low(
dberr_t* err,
fil_space_t* space,
bool sync,
ulint mode,
const page_id_t page_id,
ulint zip_size,
bool unzip,
bool ignore = false)
bool unzip)
{
buf_page_t* bpage;
@@ -290,17 +287,22 @@ buf_read_page_low(
ib::error() << "Trying to read doublewrite buffer page "
<< page_id;
ut_ad(0);
return(0);
nothing_read:
space->release_for_io();
return false;
}
if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) {
if (sync) {
} else if (trx_sys_hdr_page(page_id)
|| ibuf_bitmap_page(page_id, zip_size)
|| (!recv_no_ibuf_operations
&& ibuf_page(page_id, zip_size, nullptr))) {
/* Trx sys header is so low in the latching order that we play
safe and do not leave the i/o-completion to an asynchronous
i/o-thread. Ibuf bitmap pages must always be read with
i/o-thread. Change buffer pages must always be read with
syncronous i/o, to make sure they do not get involved in
thread deadlocks. */
sync = true;
}
@@ -311,20 +313,19 @@ buf_read_page_low(
bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
if (bpage == NULL) {
goto nothing_read;
}
return(0);
ut_ad(bpage->in_file());
if (sync) {
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
}
DBUG_LOG("ib_buf",
"read page " << page_id << " zip_size=" << zip_size
<< " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
ut_ad(bpage->in_file());
if (sync) {
thd_wait_begin(NULL, THD_WAIT_DISKIO);
}
void* dst;
if (zip_size) {
@@ -335,20 +336,18 @@ buf_read_page_low(
dst = ((buf_block_t*) bpage)->frame;
}
fil_io_t fio = fil_io(
IORequestRead, sync, page_id, zip_size, 0,
zip_size ? zip_size : srv_page_size,
dst, bpage, ignore);
const ulint len = zip_size ? zip_size : srv_page_size;
auto fio = space->io(IORequest(sync
? IORequest::READ_SYNC
: IORequest::READ_ASYNC),
page_id.page_no() * len, len, dst, bpage);
*err= fio.err;
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
if (ignore || fio.err == DB_TABLESPACE_DELETED) {
if (!sync || fio.err == DB_TABLESPACE_DELETED) {
buf_pool.corrupted_evict(bpage);
if (sync && fio.node) {
fio.node->space->release_for_io();
}
return(0);
return false;
}
ut_error;
@@ -357,16 +356,16 @@ buf_read_page_low(
if (sync) {
thd_wait_end(NULL);
/* The i/o was already completed in fil_io() */
/* The i/o was already completed in space->io() */
*err = buf_page_read_complete(bpage, *fio.node);
fio.node->space->release_for_io();
space->release_for_io();
if (*err != DB_SUCCESS) {
return(0);
return false;
}
}
return(1);
return true;
}
/** Applies a random read-ahead in buf_pool if there are at least a threshold
@@ -411,7 +410,7 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
ulint count= 5 + buf_read_ahead_area / 8;
const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
page_id_t high= low + buf_read_ahead_area;
high.set_page_no(std::min(high.page_no(), space->committed_size - 1));
high.set_page_no(std::min(high.page_no(), space->last_page_number()));
/* Count how many blocks in the area have been recently accessed,
that is, reside near the start of the LRU list. */
@@ -427,10 +426,14 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
goto read_ahead;
}
no_read_ahead:
space->release();
return 0;
read_ahead:
if (!space->acquire_for_io())
goto no_read_ahead;
/* Read all the suitable blocks within the area */
const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
@@ -441,13 +444,16 @@ read_ahead:
if (space->is_stopping())
break;
dberr_t err;
count+= buf_read_page_low(&err, false, ibuf_mode, i, zip_size, false);
space->reacquire_for_io();
if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
count++;
}
if (count)
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
low.page_no()));
space->release_for_io();
space->release();
/* Read ahead is considered one I/O operation for the purpose of
@@ -472,41 +478,49 @@ after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
{
dberr_t err = DB_SUCCESS;
ulint count = buf_read_page_low(
&err, true, BUF_READ_ANY_PAGE, page_id, zip_size, false);
srv_stats.buf_pool_reads.add(count);
if (err == DB_TABLESPACE_DELETED) {
fil_space_t *space= fil_space_acquire(page_id.space());
if (!space)
{
ib::info() << "trying to read page " << page_id
<< " in nonexisting or being-dropped tablespace";
return DB_TABLESPACE_DELETED;
}
else if (!space->acquire_for_io())
{
ib::warn() << "unable to read " << page_id << " from tablespace "
<< space->name;
space->release();
return DB_PAGE_CORRUPTED;
}
/* Increment number of I/O operations used for LRU policy. */
buf_LRU_stat_inc_io();
space->release();
return(err);
dberr_t err;
if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE,
page_id, zip_size, false))
srv_stats.buf_pool_reads.add(1);
buf_LRU_stat_inc_io();
return err;
}
/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in,out] space tablespace
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] sync true if synchronous aio is desired */
void
buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size, bool sync)
{
ulint count;
dberr_t err;
count = buf_read_page_low(
&err, sync,
BUF_READ_ANY_PAGE,
page_id, zip_size, false, true);
if (buf_read_page_low(&err, space, sync, BUF_READ_ANY_PAGE,
page_id, zip_size, false)) {
srv_stats.buf_pool_reads.add(1);
}
switch (err) {
case DB_SUCCESS:
@@ -528,8 +542,6 @@ buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
<< page_id;
}
srv_stats.buf_pool_reads.add(count);
/* We do not increment number of I/O operations used for LRU policy
here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
about evicting uncompressed version of compressed pages from the
@@ -598,10 +610,19 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
fil_space_t *space= fil_space_acquire(page_id.space());
if (!space)
return 0;
if (high_1.page_no() >= space->committed_size)
else
{
bool ok= space->acquire_for_io();
space->release();
if (!ok)
return 0;
}
if (high_1.page_no() > space->last_page_number())
{
/* The area is not whole. */
space->release();
fail:
space->release_for_io();
return 0;
}
@@ -628,8 +649,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
{
hard_fail:
hash_lock->read_unlock();
space->release();
return 0;
goto fail;
}
const byte *f;
switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
@@ -661,7 +681,7 @@ hard_fail:
if (id != new_low && id != new_high_1)
/* This is not a border page of the area: return */
goto hard_fail;
if (new_high_1.page_no() >= space->committed_size)
if (new_high_1.page_no() > space->last_page_number())
/* The area is not whole */
goto hard_fail;
}
@@ -671,8 +691,7 @@ failed:
hash_lock->read_unlock();
if (--count)
continue;
space->release();
return 0;
goto fail;
}
const unsigned accessed= bpage->is_accessed();
@@ -702,7 +721,8 @@ failed:
if (space->is_stopping())
break;
dberr_t err;
count+= buf_read_page_low(&err, false, ibuf_mode, new_low, zip_size,
space->reacquire_for_io();
count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size,
false);
}
@@ -710,7 +730,7 @@ failed:
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
new_low.page_no()));
space->release();
space->release_for_io();
/* Read ahead is considered one I/O operation for the purpose of
LRU policy decision. */
@@ -721,24 +741,19 @@ failed:
}
/** Issues read requests for pages which recovery wants to read in.
@param[in] sync true if the caller wants this function to wait
for the highest address page to get read in, before this function returns
@param[in] space_id tablespace id
@param[in] page_nos array of page numbers to read, with the
highest page number the last in the array
@param[in] n number of page numbers in the array */
void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
ulint n)
void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
{
fil_space_t* space = fil_space_get(space_id);
fil_space_t* space = fil_space_t::get_for_io(space_id);
if (space == NULL) {
/* The tablespace is missing: do nothing */
if (!space) {
/* The tablespace is missing or unreadable: do nothing */
return;
}
fil_space_open_if_needed(space);
const ulint zip_size = space->zip_size();
for (ulint i = 0; i < n; i++) {
@@ -769,9 +784,10 @@ void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
}
dberr_t err;
buf_read_page_low(
&err, sync && i + 1 == n,
BUF_READ_ANY_PAGE, cur_page_id, zip_size, true);
space->reacquire_for_io();
buf_read_page_low(&err, space, false,
BUF_READ_ANY_PAGE, cur_page_id, zip_size,
true);
if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
ib::error() << "Recovery failed to read or decrypt "
@@ -779,5 +795,8 @@ void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
}
}
DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)", n));
DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n,
space->chain.start->name));
space->release_for_io();
}

View File

@@ -951,7 +951,7 @@ void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
if (fil_space_t* s = fil_space_acquire_silent(space_id)) {
/* Ensure that the tablespace file exists
in order to avoid a crash in buf_page_get_gen(). */
if (s->size || fil_space_get_size(space_id)) {
if (root_page_no < s->get_size()) {
btr_free_if_exists(page_id_t(space_id, root_page_no),
s->zip_size(),
mach_read_from_8(ptr), mtr);

View File

@@ -2975,15 +2975,15 @@ err_exit:
}
if (err == DB_SUCCESS && table->is_readable()) {
if (table->space && !fil_space_get_size(table->space_id)) {
const auto root = dict_table_get_first_index(table)->page;
if (root >= table->space->get_size()) {
corrupted:
table->corrupted = true;
table->file_unreadable = true;
err = DB_CORRUPTION;
} else {
const page_id_t page_id(
table->space->id,
dict_table_get_first_index(table)->page);
const page_id_t page_id(table->space->id, root);
mtr.start();
buf_block_t* block = buf_page_get(
page_id, table->space->zip_size(),

View File

@@ -975,8 +975,7 @@ static inline
void
fil_crypt_read_crypt_data(fil_space_t* space)
{
if (space->crypt_data || space->size
|| !fil_space_get_size(space->id)) {
if (space->crypt_data || space->size || !space->get_size()) {
/* The encryption metadata has already been read, or
the tablespace is not encrypted and the file has been
opened already, or the file cannot be accessed,
@@ -2246,16 +2245,10 @@ static void fil_crypt_rotation_list_fill()
}
/* Ensure that crypt_data has been initialized. */
if (!space->size) {
ut_d(const fil_space_t* s=)
fil_system.read_page0(space->id);
ut_ad(!s || s == space);
if (!space->size) {
/* Page 0 was not loaded.
Skip this tablespace. */
if (!space->get_size()) {
/* Page 0 was not loaded. Skip this tablespace. */
goto next;
}
}
/* Skip ENCRYPTION!=DEFAULT tablespaces. */
if (space->crypt_data

File diff suppressed because it is too large Load Diff

View File

@@ -296,8 +296,6 @@ Datafile::read_first_page(bool read_only_mode)
m_first_page = static_cast<byte*>(
aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size));
constexpr IORequest request(IORequest::READ |
IORequest::DISABLE_PARTIAL_IO_WARNINGS);
dberr_t err = DB_ERROR;
size_t page_size = UNIV_PAGE_SIZE_MAX;
@@ -308,7 +306,8 @@ Datafile::read_first_page(bool read_only_mode)
ulint n_read = 0;
err = os_file_read_no_error_handling(
request, m_handle, m_first_page, 0, page_size, &n_read);
IORequestReadPartial, m_handle, m_first_page, 0,
page_size, &n_read);
if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {

View File

@@ -130,7 +130,7 @@ Tablespace::open_or_create(bool is_temp)
fsp_flags = FSP_FLAGS_PAGE_SSIZE();
}
space = fil_space_create(
space = fil_space_t::create(
m_name, m_space_id, fsp_flags,
is_temp
? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,

View File

@@ -906,13 +906,10 @@ SysTablespace::open_or_create(
if (it != begin) {
} else if (is_temp) {
ut_ad(space_id() == SRV_TMP_SPACE_ID);
space = fil_space_create(
space = fil_space_t::create(
name(), SRV_TMP_SPACE_ID, flags(),
FIL_TYPE_TEMPORARY, NULL);
mutex_enter(&fil_system.mutex);
fil_system.temp_space = space;
mutex_exit(&fil_system.mutex);
ut_ad(space == fil_system.temp_space);
if (!space) {
return DB_ERROR;
}
@@ -920,12 +917,10 @@ SysTablespace::open_or_create(
ut_ad(space->full_crc32());
} else {
ut_ad(space_id() == TRX_SYS_SPACE);
space = fil_space_create(
space = fil_space_t::create(
name(), TRX_SYS_SPACE, it->flags(),
FIL_TYPE_TABLESPACE, NULL);
mutex_enter(&fil_system.mutex);
fil_system.sys_space = space;
mutex_exit(&fil_system.mutex);
ut_ad(space == fil_system.sys_space);
if (!space) {
return DB_ERROR;
}

View File

@@ -7044,6 +7044,7 @@ i_s_tablespaces_encryption_fill_table(
}
mutex_enter(&fil_system.mutex);
fil_system.freeze_space_list++;
for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
space; space = UT_LIST_GET_NEXT(space_list, space)) {
@@ -7060,6 +7061,7 @@ i_s_tablespaces_encryption_fill_table(
}
}
fil_system.freeze_space_list--;
mutex_exit(&fil_system.mutex);
DBUG_RETURN(0);
}

View File

@@ -2300,7 +2300,7 @@ static void ibuf_read_merge_pages(const uint32_t* space_ids,
for (ulint i = 0; i < n_stored; i++) {
const ulint space_id = space_ids[i];
fil_space_t* s = fil_space_acquire_for_io(space_id);
fil_space_t* s = fil_space_t::get_for_io(space_id);
if (!s) {
tablespace_deleted:
/* The tablespace was not found: remove all
@@ -4631,26 +4631,14 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
const unsigned zip_size = space->zip_size();
const unsigned physical_size = space->physical_size();
/* fil_space_t::size and fil_space_t::free_limit would still be 0
at this point. So, we will have to read page 0. */
ut_ad(!space->free_limit);
ut_ad(!space->size);
uint32_t size= std::min(space->free_limit, space->size);
if (size == 0) {
return(DB_TABLE_NOT_FOUND);
}
mtr_t mtr;
uint32_t size;
mtr.start();
if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0),
zip_size,
RW_S_LATCH, &mtr)) {
size = std::min(
mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ sp->frame),
mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ sp->frame));
} else {
size = 0;
}
mtr.commit();
mutex_enter(&ibuf_mutex);

View File

@@ -978,6 +978,15 @@ public:
return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
}
/** Compute the byte offset of this page within its data file.
@return page number shifted by the applicable page-size shift
(compressed size for ROW_FORMAT=COMPRESSED, else srv_page_size) */
os_offset_t physical_offset() const
{
  const os_offset_t page_no= id().page_no();
  if (!zip.ssize)
    return page_no << srv_page_size_shift;
  /* ROW_FORMAT=COMPRESSED: derive the shift from zip.ssize */
  return page_no << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1));
}
/** @return whether the block is mapped to a data file */
bool in_file() const
{

View File

@@ -52,10 +52,10 @@ class buf_dblwr_t
struct element
{
/** block descriptor */
buf_page_t *bpage;
/** true=buf_pool.flush_list, false=buf_pool.LRU */
bool lru;
/** tablespace */
fil_space_t *space;
/** asynchronous write request */
IORequest request;
/** payload size in bytes */
size_t size;
};
@@ -103,10 +103,11 @@ public:
/** Schedule a page write. If the doublewrite memory buffer is full,
flush_buffered_writes() will be invoked to make space.
@param bpage buffer pool page to be written
@param lru true=buf_pool.LRU; false=buf_pool.flush_list
@param space tablespace
@param request asynchronous write request
@param size payload size in bytes */
void add_to_batch(buf_page_t *bpage, bool lru, size_t size);
void add_to_batch(fil_space_t *space, const IORequest &request,
size_t size) MY_ATTRIBUTE((nonnull));
/** Determine whether the doublewrite buffer is initialized */
bool is_initialised() const

View File

@@ -46,11 +46,13 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size);
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in,out] space tablespace
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] sync true if synchronous aio is desired */
void
buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync);
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size, bool sync)
MY_ATTRIBUTE((nonnull));
/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
@@ -101,14 +103,11 @@ ulint
buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
/** Issues read requests for pages which recovery wants to read in.
@param[in] sync true if the caller wants this function to wait
for the highest address page to get read in, before this function returns
@param[in] space_id tablespace id
@param[in] page_nos array of page numbers to read, with the
highest page number the last in the array
@param[in] n number of page numbers in the array */
void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
ulint n);
void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n);
/** @name Modes used in read-ahead @{ */
/** read only pages belonging to the insert buffer tree */

View File

@@ -313,6 +313,25 @@ new_range:
/** Tablespace or log data space */
#ifndef UNIV_INNOCHECKSUM
/** Result of a tablespace I/O submitted via fil_space_t::io():
status code and the file that the request was issued against. */
struct fil_io_t
{
/** error code */
dberr_t err;
/** file; node->space->release_for_io() must follow IORequestRead call */
fil_node_t *node;
};
/** Tablespace encryption mode (per-table encryption attribute) */
enum fil_encryption_t
{
/** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
FIL_ENCRYPTION_DEFAULT,
/** Encrypted (explicitly requested) */
FIL_ENCRYPTION_ON,
/** Not encrypted (explicitly requested) */
FIL_ENCRYPTION_OFF
};
struct fil_space_t : ilist_node<unflushed_spaces_tag_t>,
ilist_node<rotation_list_tag_t>
#else
@@ -348,8 +367,6 @@ struct fil_space_t
/*!< recovered tablespace size in pages;
0 if no size change was read from the redo log,
or if the size change was implemented */
/** the committed size of the tablespace in pages */
Atomic_relaxed<uint32_t> committed_size;
ulint n_reserved_extents;
/*!< number of reserved free extents for
ongoing operations like B-tree page split */
@@ -357,28 +374,33 @@ struct fil_space_t
the tablespace to disk; dropping of the
tablespace is forbidden if this is positive */
private:
/** the committed size of the tablespace in pages */
Atomic_relaxed<uint32_t> committed_size;
/** Number of pending buffer pool operations accessing the
tablespace without holding a table lock or dict_operation_lock
S-latch that would prevent the table (and tablespace) from being
dropped. An example is encryption key rotation.
The tablespace cannot be dropped while this is nonzero, or while
fil_node_t::n_pending is nonzero.
The tablespace cannot be dropped while this is nonzero.
The most significant bit contains the STOP_NEW_OPS flag. */
Atomic_relaxed<size_t> n_pending_ops;
Atomic_relaxed<uint32_t> n_pending_ops;
/** Number of pending block read or write operations
The tablespace object cannot be freed while this is nonzero,
but it can be detached from fil_system.
The most significant bit contains the CLOSING flag. */
std::atomic<uint32_t> n_pending_ios;
/** Flag in n_pending_ops that indicates that the tablespace is being
deleted, and no further operations should be performed */
static constexpr uint32_t STOP_NEW_OPS= ~(~uint32_t(0) >> 1);
/** Flag in n_pending_ios that indicates that the tablespace is a candidate
for being closed, and fil_node_t::is_open() can only be trusted after
acquiring fil_system.mutex and resetting the flag */
static constexpr uint32_t CLOSING= STOP_NEW_OPS;
static constexpr uint32_t NOT_CLOSING= ~CLOSING;
public:
/** Number of pending block read or write operations
(when a write is imminent or a read has recently completed).
The tablespace object cannot be freed while this is nonzero,
but it can be detached from fil_system.
Note that fil_node_t::n_pending tracks actual pending I/O requests.
Protected by fil_system.mutex and std::atomic. */
std::atomic<ulint> n_pending_ios;
rw_lock_t latch; /*!< latch protecting the file space storage
allocation */
UT_LIST_NODE_T(fil_space_t) named_spaces;
@@ -484,9 +506,10 @@ public:
/** @return whether the storage device is rotational (HDD, not SSD) */
inline bool is_rotational() const;
/** Open each file. Only invoked on fil_system.temp_space.
/** Open each file. Never invoked on .ibd files.
@param create_new_db whether to skip the call to fil_node_t::read_page0()
@return whether all files were opened */
bool open();
bool open(bool create_new_db);
/** Close each file. Only invoked on fil_system.temp_space. */
void close();
@@ -497,17 +520,13 @@ public:
size_t referenced() const { return n_pending_ops & ~STOP_NEW_OPS; }
/** Note that operations on the tablespace must stop or can resume */
void set_stopping(bool stopping)
{
ut_d(auto n=) n_pending_ops.fetch_xor(STOP_NEW_OPS);
ut_ad(!(n & STOP_NEW_OPS) == stopping);
}
inline void set_stopping(bool stopping);
MY_ATTRIBUTE((warn_unused_result))
/** @return whether a tablespace reference was successfully acquired */
bool acquire()
{
size_t n= 0;
uint32_t n= 0;
while (!n_pending_ops.compare_exchange_strong(n, n + 1,
std::memory_order_acquire,
std::memory_order_relaxed))
@@ -523,30 +542,41 @@ public:
ut_ad(n & ~STOP_NEW_OPS);
return (n & ~STOP_NEW_OPS) == 1;
}
/** Acquire a tablespace reference for I/O. */
void acquire_for_io() { n_pending_ios++; }
/** Release a tablespace reference for I/O. */
void release_for_io() { ut_d(auto n=) n_pending_ios--; ut_ad(n); }
/** @return whether I/O is pending */
bool pending_io() const { return n_pending_ios; }
/** @return whether the tablespace file can be closed and reopened */
bool belongs_in_lru() const
MY_ATTRIBUTE((warn_unused_result))
/** Acquire a tablespace reference for I/O.
@return whether the file is usable */
bool acquire_for_io()
{
switch (purpose) {
case FIL_TYPE_TEMPORARY:
ut_ad(id == SRV_TMP_SPACE_ID);
return false;
case FIL_TYPE_IMPORT:
ut_ad(id != SRV_TMP_SPACE_ID);
return true;
case FIL_TYPE_TABLESPACE:
ut_ad(id != SRV_TMP_SPACE_ID);
return id && !srv_is_undo_tablespace(id);
return UNIV_LIKELY(!(n_pending_ios.fetch_add(1, std::memory_order_acquire)&
CLOSING)) ||
prepare_for_io();
}
ut_ad(0);
return false;
/** Acquire another tablespace reference for I/O. */
inline void reacquire_for_io();
/** Release a tablespace reference for I/O.
Pairs with a successful acquire_for_io() or with reacquire_for_io(). */
void release_for_io()
{
/* ut_d() evaluates only in debug builds; the previous counter value
is used solely by the assertion below. */
ut_d(uint32_t n=) n_pending_ios.fetch_sub(1, std::memory_order_release);
/* The counter (excluding the CLOSING flag bit) must have been nonzero. */
ut_ad(n & NOT_CLOSING);
}
/** @return number of pending reads or writes
(the CLOSING flag bit is masked off the counter) */
uint32_t pending_io() const
{
  const uint32_t n= n_pending_ios.load(std::memory_order_acquire);
  return n & NOT_CLOSING;
}
MY_ATTRIBUTE((warn_unused_result))
/** Prepare to close the file handle.
Atomically sets the CLOSING flag in n_pending_ios, so that subsequent
acquire_for_io() calls will fall through to prepare_for_io().
@return number of pending operations before the flag was set */
uint32_t set_closing()
{
return n_pending_ios.fetch_or(CLOSING, std::memory_order_acquire) &
NOT_CLOSING;
}
/** @return whether close() of the file handle has been requested */
bool is_closing() const
{ return n_pending_ios.load(std::memory_order_acquire) & CLOSING; }
/** @return last_freed_lsn */
// NOTE(review): non-const accessor returning the member by value;
// presumably the LSN of the most recent page-free operation — confirm
// against the declaration of last_freed_lsn.
lsn_t get_last_freed_lsn() { return last_freed_lsn; }
@@ -835,6 +865,25 @@ public:
}
#ifndef UNIV_INNOCHECKSUM
MY_ATTRIBUTE((warn_unused_result))
/** Create a tablespace in fil_system.
@param name tablespace name
@param id tablespace identifier
@param flags tablespace flags
@param purpose tablespace purpose
@param crypt_data encryption information
@param mode encryption mode
@return pointer to created tablespace, to be filled in with add()
@retval nullptr on failure (such as when the same tablespace exists) */
static fil_space_t *create(const char *name, ulint id, ulint flags,
fil_type_t purpose, fil_space_crypt_t *crypt_data,
fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT);
/** Acquire a tablespace for reading or writing a block.
@param id tablespace ID
@return the tablespace, or nullptr if missing or inaccessible */
static fil_space_t *get_for_io(ulint id);
/** Add/remove the free page in the freed ranges list.
@param[in] offset page number to be added
@param[in] free true if page to be freed */
@@ -863,8 +912,47 @@ public:
std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
freed_ranges.add_range(range);
}
#endif /*!UNIV_INNOCHECKSUM */
/** Set the tablespace size in pages.
Assigns both size and committed_size.
@param s size of the tablespace in pages */
void set_sizes(uint32_t s)
{
/* For the system tablespace (id == 0) the size may be reassigned but
must not grow here; for any other tablespace this must be the first
assignment (size still 0). */
ut_ad(id ? !size : (size >= s));
size= s; committed_size= s;
}
/** Update committed_size in mtr_t::commit() */
void set_committed_size()
{
/* The caller must hold the storage-allocation latch exclusively
while publishing the current size as committed. */
ut_ad(rw_lock_own(&latch, RW_LOCK_X));
committed_size= size;
}
/** @return the last persisted page number */
// NOTE(review): wraps to UINT32_MAX when committed_size is 0; callers
// presumably only invoke this on a tablespace whose first page has been
// read — confirm.
uint32_t last_page_number() const { return committed_size - 1; }
/** @return the size in pages (0 if unreadable) */
inline uint32_t get_size();
/** Read or write data.
@param type I/O context
@param offset offset in bytes
@param len number of bytes
@param buf the data to be read or written
@param bpage buffer block (for type.is_async() completion callback)
@return status and file descriptor */
fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
void *buf, buf_page_t *bpage= nullptr);
/** Flush pending writes from the file system cache to the file */
void flush();
/** Read the first page of a data file.
@return whether the page was found valid */
bool read_page0();
private:
/** @return whether the file is usable for io() */
ATTRIBUTE_COLD bool prepare_for_io();
#endif /*!UNIV_INNOCHECKSUM */
};
#ifndef UNIV_INNOCHECKSUM
@@ -892,8 +980,6 @@ struct fil_node_t {
uint32_t init_size;
/** maximum size of the file in database pages (0 if unlimited) */
uint32_t max_size;
/** count of pending i/o's; is_open must be true if nonzero */
ulint n_pending;
/** count of pending flushes; is_open must be true if nonzero */
ulint n_pending_flushes;
/** whether the file is currently being extended */
@@ -902,8 +988,6 @@ struct fil_node_t {
bool needs_flush;
/** link to other files in this tablespace */
UT_LIST_NODE_T(fil_node_t) chain;
/** link to the fil_system.LRU list (keeping track of open files) */
UT_LIST_NODE_T(fil_node_t) LRU;
/** whether this file could use atomic write (data file) */
bool atomic_write;
@@ -921,9 +1005,8 @@ struct fil_node_t {
}
/** Read the first page of a data file.
@param[in] first whether this is the very first read
@return whether the page was found valid */
bool read_page0(bool first);
bool read_page0();
/** Determine some file metadata when creating or reading the file.
@param file the file that is being created, or OS_FILE_CLOSED */
@@ -942,8 +1025,8 @@ struct fil_node_t {
@return detached handle or OS_FILE_CLOSED */
pfs_os_file_t close_to_free(bool detach_handle= false);
/** Update the data structures on I/O completion */
inline void complete_io(bool write= false);
/** Update the data structures on write completion */
inline void complete_write();
private:
/** Does stuff common for close() and detach() */
@@ -953,6 +1036,13 @@ private:
/** Value of fil_node_t::magic_n */
#define FIL_NODE_MAGIC_N 89389
/** Acquire another tablespace reference for I/O.
The caller must already hold a reference, so the counter is known to be
nonzero and the first file of the tablespace open (both asserted). */
inline void fil_space_t::reacquire_for_io()
{
/* Relaxed ordering suffices: an existing reference already keeps the
object alive; this only bumps the count. */
ut_d(uint32_t n=) n_pending_ios.fetch_add(1, std::memory_order_relaxed);
ut_ad(n & NOT_CLOSING);
ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
}
inline void fil_space_t::set_imported()
{
ut_ad(purpose == FIL_TYPE_IMPORT);
@@ -963,11 +1053,9 @@ inline void fil_space_t::set_imported()
inline bool fil_space_t::is_rotational() const
{
for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
node = UT_LIST_GET_NEXT(chain, node)) {
if (!node->on_ssd) {
node= UT_LIST_GET_NEXT(chain, node))
if (!node->on_ssd)
return true;
}
}
return false;
}
@@ -1179,16 +1267,6 @@ index */
#define fil_page_index_page_check(page) \
fil_page_type_is_index(fil_page_get_type(page))
/** Enum values for encryption table option */
enum fil_encryption_t {
/** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
FIL_ENCRYPTION_DEFAULT,
/** Encrypted */
FIL_ENCRYPTION_ON,
/** Not encrypted */
FIL_ENCRYPTION_OFF
};
/** Get the file page type.
@param[in] page file page
@return page type */
@@ -1227,7 +1305,6 @@ struct fil_system_t {
*/
fil_system_t(): m_initialised(false)
{
UT_LIST_INIT(LRU, &fil_node_t::LRU);
UT_LIST_INIT(space_list, &fil_space_t::space_list);
UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces);
}
@@ -1275,30 +1352,23 @@ public:
fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
/** Map of fil_space_t::id to fil_space_t* */
hash_table_t spaces;
UT_LIST_BASE_NODE_T(fil_node_t) LRU;
/*!< base node for the LRU list of the
most recently used open files with no
pending i/o's; if we start an i/o on
the file, we first remove it from this
list, and return it to the start of
the list when the i/o ends;
log files and the system tablespace are
not put to this list: they are opened
after the startup, and kept open until
shutdown */
sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
/*!< list of those
tablespaces whose files contain
unflushed writes; those spaces have
at least one file node where
needs_flush == true */
ulint n_open; /*!< number of files currently open */
/** number of currently open files; protected by mutex */
ulint n_open;
ulint max_assigned_id;/*!< maximum space id in the existing
tables, or assigned during the time
mysqld has been up; at an InnoDB
startup we scan the data dictionary
and set here the maximum of the
space id's of the tables there */
/** nonzero if fil_node_open_file_low() should avoid moving the tablespace
to the end of space_list, for FIFO policy of try_to_close() */
ulint freeze_space_list;
UT_LIST_BASE_NODE_T(fil_space_t) space_list;
/*!< list of all file spaces */
UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
@@ -1312,16 +1382,10 @@ public:
key rotation.*/
bool space_id_reuse_warned;
/*!< whether fil_space_create()
/*!< whether fil_space_t::create()
has issued a warning about
potential space_id reuse */
/** Trigger a call to fil_node_t::read_page0()
@param[in] id tablespace identifier
@return tablespace
@retval NULL if the tablespace does not exist or cannot be read */
fil_space_t* read_page0(ulint id);
/** Return the next tablespace from rotation_list.
@param space previous tablespace (NULL to start from the start)
@param recheck whether the removal condition needs to be rechecked after
@@ -1336,63 +1400,28 @@ public:
/** The tablespace memory cache. */
extern fil_system_t fil_system;
/** Update the data structures on I/O completion */
inline void fil_node_t::complete_io(bool write)
/** Note that operations on the tablespace must stop or can resume */
inline void fil_space_t::set_stopping(bool stopping)
{
ut_ad(mutex_own(&fil_system.mutex));
if (write)
{
if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
{
/* We don't need to keep track of unflushed changes as user has
explicitly disabled buffering. */
ut_ad(!space->is_in_unflushed_spaces);
ut_ad(!needs_flush);
}
else if (!space->is_stopping())
{
needs_flush= true;
if (!space->is_in_unflushed_spaces)
{
space->is_in_unflushed_spaces= true;
fil_system.unflushed_spaces.push_front(*space);
}
}
ut_d(auto n=) n_pending_ops.fetch_xor(STOP_NEW_OPS);
ut_ad(!(n & STOP_NEW_OPS) == stopping);
}
switch (n_pending--) {
case 0:
ut_error;
case 1:
if (space->belongs_in_lru())
/* The node must be put back to the LRU list */
UT_LIST_ADD_FIRST(fil_system.LRU, this);
/** @return the size in pages (0 if unreadable) */
inline uint32_t fil_space_t::get_size()
{
if (!size)
{
mutex_enter(&fil_system.mutex);
read_page0();
mutex_exit(&fil_system.mutex);
}
return size;
}
#include "fil0crypt.h"
/** Create a space memory object and put it to the fil_system hash table.
Error messages are issued to the server log.
@param[in] name tablespace name
@param[in] id tablespace identifier
@param[in] flags tablespace flags
@param[in] purpose tablespace purpose
@param[in,out] crypt_data encryption information
@param[in] mode encryption mode
@return pointer to created tablespace, to be filled in with fil_space_t::add()
@retval NULL on failure (such as when the same tablespace exists) */
fil_space_t*
fil_space_create(
const char* name,
ulint id,
ulint flags,
fil_type_t purpose,
fil_space_crypt_t* crypt_data,
fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT)
MY_ATTRIBUTE((warn_unused_result));
/*******************************************************************//**
Assigns a new space id for a new single-table tablespace. This works simply by
incrementing the global counter. If 4 billion id's is not enough, we may need
@@ -1421,21 +1450,6 @@ fil_space_free(
void fil_space_set_recv_size_and_flags(ulint id, uint32_t size,
uint32_t flags);
/*******************************************************************//**
Returns the size of the space in pages. The tablespace must be cached in the
memory cache.
@return space size, 0 if space not found */
ulint
fil_space_get_size(
/*===============*/
ulint id); /*!< in: space id */
/** Opens all system tablespace data files. They stay open until the
database server shutdown. This should be called at a server startup after the
space objects for the system tablespace have been created. The
purpose of this operation is to make sure we never run out of file descriptors
if we need to read from the insert buffer. */
void fil_open_system_tablespace_files();
/** Close all tablespace files at shutdown */
void fil_close_all_files();
/*******************************************************************//**
@@ -1491,14 +1505,6 @@ fil_space_acquire_silent(ulint id)
return (fil_space_acquire_low(id, true));
}
/** Acquire a tablespace for reading or writing a block,
when it could be dropped concurrently.
@param[in] id tablespace ID
@return the tablespace
@retval NULL if missing */
fil_space_t*
fil_space_acquire_for_io(ulint id);
/** Replay a file rename operation if possible.
@param[in] space_id tablespace identifier
@param[in] name old file name
@@ -1674,7 +1680,7 @@ fil_file_readdir_next_file(
memory cache. Note that if we have not done a crash recovery at the database
startup, there may be many tablespaces which are not yet in the memory cache.
@param[in] id Tablespace ID
@param[in] name Tablespace name used in fil_space_create().
@param[in] name Tablespace name used in fil_space_t::create().
@param[in] table_flags table flags
@return the tablespace
@retval NULL if no matching tablespace exists in the memory cache */
@@ -1690,70 +1696,6 @@ fil_space_for_table_exists_in_mem(
@return whether the tablespace is at least as big as requested */
bool fil_space_extend(fil_space_t *space, uint32_t size);
struct fil_io_t
{
/** error code */
dberr_t err;
/** file; node->space->release_for_io() must follow fil_io(sync=true) call */
fil_node_t *node;
};
/** Reads or writes data. This operation could be asynchronous (aio).
@param[in] type IO context
@param[in] sync true if synchronous aio is desired
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] byte_offset remainder of offset in bytes; in aio this
must be divisible by the OS block size
@param[in] len how many bytes to read or write; this must
not cross a file boundary; in aio this must
be a block size multiple
@param[in,out] buf buffer where to store read data or from where
to write; in aio this must be appropriately
aligned
@param[in] message message for aio handler if non-sync aio
used, else ignored
@param[in] ignore whether to ignore errors
@param[in] punch_hole punch the hole to the file for page_compressed
tablespace
@return status and file descriptor */
fil_io_t
fil_io(
const IORequest& type,
bool sync,
const page_id_t page_id,
ulint zip_size,
ulint byte_offset,
ulint len,
void* buf,
void* message,
bool ignore = false,
bool punch_hole = false);
/**********************************************************************//**
Waits for an aio operation to complete. This function is used to write the
handler for completed requests. The aio array of pending requests is divided
into segments (see os0file.cc for more info). The thread specifies which
segment it wants to wait for. */
void
fil_aio_wait(
/*=========*/
ulint segment); /*!< in: the number of the segment in the aio
array to wait for */
/**********************************************************************//**
Flushes to disk possible writes cached by the OS. If the space does not exist
or is being dropped, does not do anything. */
void
fil_flush(
/*======*/
ulint space_id); /*!< in: file space id (this can be a group of
log files or a tablespace of the database) */
/** Flush a tablespace.
@param[in,out] space tablespace to flush */
void
fil_flush(fil_space_t* space);
/** Flush to disk the writes in file spaces of the given type
possibly cached by the OS. */
void fil_flush_file_spaces();
@@ -1846,23 +1788,6 @@ inline bool fil_names_write_if_was_clean(fil_space_t* space)
return(was_clean);
}
/** During crash recovery, open a tablespace if it had not been opened
yet, to get valid size and flags.
@param[in,out] space tablespace */
inline void fil_space_open_if_needed(fil_space_t* space)
{
ut_ad(recv_recovery_is_on());
if (space->size == 0) {
/* Initially, size and flags will be set to 0,
until the files are opened for the first time.
fil_space_get_size() will open the file
and adjust the size and flags. */
ut_d(ulint size =) fil_space_get_size(space->id);
ut_ad(size == space->size);
}
}
/** On a log checkpoint, reset fil_names_dirty_and_write() flags
and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
@param[in] lsn checkpoint LSN

View File

@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2014, 2019, MariaDB Corporation.
Copyright (c) 2014, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,7 @@ File space management types
Created May 26, 2009 Vasil Dimov
*******************************************************/
#ifndef fsp0types_h
#define fsp0types_h
#pragma once
#include <cstddef>
/** The fil_space_t::id of the redo log. All persistent tablespaces
@@ -402,4 +400,6 @@ in full crc32 format. */
/* @} */
#endif /* fsp0types_h */
struct fil_node_t;
struct fil_space_t;
class buf_page_t;

View File

@@ -1,48 +0,0 @@
/***********************************************************************
Copyright (c) 2017, 2019, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
***********************************************************************/
/**************************************************//**
@file os0api.h
The interface to the helper functions.
These functions are used on os0file.h where
including full full header is not feasible and
implemented on buf0buf.cc and fil0fil.cc.
*******************************************************/
#ifndef OS_API_H
#define OS_API_H 1
/** Page control block */
class buf_page_t;
/** File Node */
struct fil_node_t;
/**
Calculate the length of trim (punch_hole) operation.
@param[in] bpage Page control block
@param[in] write_length Write length
@return length of the trim or zero. */
ulint
buf_page_get_trim_length(
const buf_page_t* bpage,
ulint write_length)
MY_ATTRIBUTE((warn_unused_result));
#endif /* OS_API_H */

View File

@@ -37,7 +37,6 @@ Created 10/21/1995 Heikki Tuuri
#define os0file_h
#include "fsp0types.h"
#include "os0api.h"
#include "tpool.h"
#ifndef _WIN32
@@ -46,10 +45,6 @@ Created 10/21/1995 Heikki Tuuri
#include <time.h>
#endif /* !_WIN32 */
/** File node of a tablespace or the log data space */
struct fil_node_t;
struct fil_space_t;
extern bool os_has_said_disk_full;
/** File offset in bytes */
@@ -188,117 +183,75 @@ The I/O context that is passed down to the low level IO code */
class IORequest
{
public:
constexpr IORequest(ulint type= READ, buf_page_t *bpage= nullptr,
bool lru= false) :
m_bpage(bpage), m_type(static_cast<uint16_t>(type)), m_LRU(lru) {}
/** Flags passed in the request, they can be ORred together. */
enum {
READ = 1,
WRITE = 2,
/** Double write buffer recovery. */
DBLWR_RECOVER = 4,
/** Enumarations below can be ORed to READ/WRITE above*/
/** Data file */
DATA_FILE = 8,
/** Disable partial read warnings */
DISABLE_PARTIAL_IO_WARNINGS = 32,
/** Use punch hole if available*/
PUNCH_HOLE = 64,
enum Type
{
/** Synchronous read */
READ_SYNC= 2,
/** Asynchronous read; some errors will be ignored */
READ_ASYNC= READ_SYNC | 1,
/** Possibly partial read; only used with
os_file_read_no_error_handling() */
READ_MAYBE_PARTIAL= READ_SYNC | 4,
/** Read for doublewrite buffer recovery */
DBLWR_RECOVER= READ_SYNC | 8,
/** Synchronous write */
WRITE_SYNC= 16,
/** Asynchronous write */
WRITE_ASYNC= WRITE_SYNC | 1,
/** Write data; evict the block on write completion */
WRITE_LRU= WRITE_ASYNC | 32,
/** Write data and punch hole for the rest */
PUNCH= WRITE_ASYNC | 64,
/** Write data and punch hole; evict the block on write completion */
PUNCH_LRU= PUNCH | WRITE_LRU,
/** Zero out a range of bytes in fil_space_t::io() */
PUNCH_RANGE= WRITE_SYNC | 128,
};
/** @return true if it is a read request */
bool is_read() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & READ) == READ);
}
constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) :
bpage(bpage), type(type) {}
/** @return true if it is a write request */
bool is_write() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & WRITE) == WRITE);
}
constexpr IORequest(const IORequest &old, fil_node_t *node= nullptr) :
bpage(old.bpage), node(node), type(old.type) {}
/** @return true if partial read warning disabled */
bool is_partial_io_warning_disabled() const
MY_ATTRIBUTE((warn_unused_result))
{
return !!(m_type & DISABLE_PARTIAL_IO_WARNINGS);
}
bool is_read() const { return (type & READ_SYNC) != 0; }
bool is_write() const { return (type & WRITE_SYNC) != 0; }
bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
/** @return true if punch hole should be used */
bool punch_hole() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & PUNCH_HOLE) == PUNCH_HOLE);
}
/** @return true if the read should be validated */
bool validate() const
MY_ATTRIBUTE((warn_unused_result))
{
return(is_read() ^ is_write());
}
/** Set the pointer to file node for IO
@param[in] node File node */
void set_fil_node(fil_node_t *node) { m_fil_node= node; }
bool operator==(const IORequest& rhs) const
{
return(m_type == rhs.m_type);
}
/** @return true if the request is from the dblwr recovery */
bool is_dblwr_recover() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & DBLWR_RECOVER) == DBLWR_RECOVER);
}
ulint get_trim_length(ulint write_length) const
{
return (m_bpage ?
buf_page_get_trim_length(m_bpage, write_length)
: 0);
}
inline bool should_punch_hole() const;
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
@param[in] off Starting offset (SEEK_SET)
@param[in] len Size of the hole
/** If requested, free storage space associated with a section of the file.
@param off byte offset from the start (SEEK_SET)
@param len size of the hole in bytes
@return DB_SUCCESS or error code */
dberr_t punch_hole(os_file_t fh, os_offset_t off, ulint len);
/** @return type of page flush (for writes) */
bool is_LRU() const { return m_LRU; }
dberr_t maybe_punch_hole(os_offset_t off, ulint len)
{
return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
? punch_hole(off, len)
: DB_SUCCESS;
}
private:
/** Page to be written on write operation. */
buf_page_t* const m_bpage= nullptr;
/** Free storage space associated with a section of the file.
@param off byte offset from the start (SEEK_SET)
@param len size of the hole in bytes
@return DB_SUCCESS or error code */
dberr_t punch_hole(os_offset_t off, ulint len) const
MY_ATTRIBUTE((nonnull));
/** File node */
fil_node_t* m_fil_node= nullptr;
public:
/** Page to be written on write operation */
buf_page_t* const bpage= nullptr;
/** File descriptor */
const fil_node_t *const node= nullptr;
/** Request type bit flags */
const uint16_t m_type;
/** for writes, type of page flush */
const bool m_LRU= false;
const Type type;
};
constexpr IORequest IORequestRead(IORequest::READ);
constexpr IORequest IORequestWrite(IORequest::WRITE);
constexpr IORequest IORequestRead(IORequest::READ_SYNC);
constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
/** Sparse file size information. */
struct os_file_size_t {
@@ -313,20 +266,6 @@ struct os_file_size_t {
/** Win NT does not allow more than 64 */
static const ulint OS_AIO_N_PENDING_IOS_PER_THREAD = 256;
/** Modes for aio operations @{ */
/** Normal asynchronous i/o not for ibuf pages or ibuf bitmap pages */
static const ulint OS_AIO_NORMAL = 21;
/** Asynchronous i/o for ibuf pages or ibuf bitmap pages */
static const ulint OS_AIO_IBUF = 22;
/**Calling thread will wait for the i/o to complete,
and perform IO completion routine itself;
can be used for any pages, ibuf or non-ibuf. This is used to save
CPU time, as we can do with fewer thread switches. */
static const ulint OS_AIO_SYNC = 24;
/* @} */
extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
extern ulint os_n_fsyncs;
@@ -669,9 +608,9 @@ The wrapper functions have the prefix of "innodb_". */
# define os_file_close(file) \
pfs_os_file_close_func(file, __FILE__, __LINE__)
# define os_aio(type, mode, name, file, buf, offset, \
# define os_aio(type, name, file, buf, offset, \
n, read_only, message1, message2) \
pfs_os_aio_func(type, mode, name, file, buf, offset, \
pfs_os_aio_func(type, name, file, buf, offset, \
n, read_only, message1, message2, \
__FILE__, __LINE__)
@@ -859,7 +798,6 @@ function!
Performance schema wrapper function of os_aio() which requests
an asynchronous I/O operation.
@param[in,out] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -879,8 +817,7 @@ an asynchronous I/O operation.
UNIV_INLINE
dberr_t
pfs_os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,
@@ -1013,9 +950,9 @@ to original un-instrumented file I/O APIs */
# define os_file_close(file) os_file_close_func(file)
# define os_aio(type, mode, name, file, buf, offset, \
# define os_aio(type, name, file, buf, offset, \
n, read_only, message1, message2) \
os_aio_func(type, mode, name, file, buf, offset, \
os_aio_func(type, name, file, buf, offset, \
n, read_only, message1, message2)
# define os_file_read(type, file, buf, offset, n) \
@@ -1281,7 +1218,6 @@ struct os_aio_userdata_t
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@param[in,out] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -1298,8 +1234,7 @@ Requests an asynchronous i/o operation.
@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,

View File

@@ -206,7 +206,6 @@ function!
Performance schema wrapper function of os_aio() which requests
an asynchronous i/o operation.
@param[in,type] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -226,8 +225,7 @@ an asynchronous i/o operation.
UNIV_INLINE
dberr_t
pfs_os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,
@@ -242,8 +240,6 @@ pfs_os_aio_func(
PSI_file_locker_state state;
struct PSI_file_locker* locker = NULL;
ut_ad(type.validate());
/* Register the read or write I/O depending on "type" */
register_pfs_file_io_begin(
&state, locker, file, n,
@@ -251,7 +247,7 @@ pfs_os_aio_func(
src_file, src_line);
dberr_t result = os_aio_func(
type, mode, name, file, buf, offset, n, read_only, m1, m2);
type, name, file, buf, offset, n, read_only, m1, m2);
register_pfs_file_io_end(locker, n);
@@ -284,8 +280,6 @@ pfs_os_file_read_func(
PSI_file_locker_state state;
struct PSI_file_locker* locker = NULL;
ut_ad(type.validate());
register_pfs_file_io_begin(
&state, locker, file, n, PSI_FILE_READ, src_file, src_line);

View File

@@ -46,10 +46,9 @@ Created 3/26/1996 Heikki Tuuri
/** Checks if a page address is the trx sys header page.
@param[in] page_id page id
@return true if trx sys header page */
inline bool trx_sys_hdr_page(const page_id_t& page_id)
inline bool trx_sys_hdr_page(const page_id_t page_id)
{
return(page_id.space() == TRX_SYS_SPACE
&& page_id.page_no() == TRX_SYS_PAGE_NO);
return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
}
/*****************************************************************//**

View File

@@ -2060,7 +2060,14 @@ same_page:
const bool is_init= (b & 0x70) <= INIT_PAGE;
switch (*store) {
case STORE_IF_EXISTS:
if (!fil_space_get_size(space_id))
if (fil_space_t *space= fil_space_acquire_silent(space_id))
{
const auto size= space->get_size();
space->release();
if (!size)
continue;
}
else
continue;
/* fall through */
case STORE_YES:
@@ -2487,7 +2494,7 @@ static void recv_read_in_area(page_id_t page_id)
if (p != page_nos) {
mutex_exit(&recv_sys.mutex);
buf_read_recv_pages(FALSE, page_id.space(), page_nos,
buf_read_recv_pages(page_id.space(), page_nos,
ulint(p - page_nos));
mutex_enter(&recv_sys.mutex);
}
@@ -2513,7 +2520,7 @@ inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
if (end_lsn < i.lsn)
DBUG_LOG("ib_log", "skip log for page " << page_id
<< " LSN " << end_lsn << " < " << i.lsn);
else if (fil_space_t *space= fil_space_acquire_for_io(page_id.space()))
else if (fil_space_t *space= fil_space_t::get_for_io(page_id.space()))
{
mtr.start();
mtr.set_log_mode(MTR_LOG_NO_REDO);

View File

@@ -214,7 +214,7 @@ static void memo_slot_release(mtr_memo_slot_t *slot)
case MTR_MEMO_SPACE_X_LOCK:
{
fil_space_t *space= static_cast<fil_space_t*>(slot->object);
space->committed_size= space->size;
space->set_committed_size();
rw_lock_x_unlock(&space->latch);
}
break;
@@ -256,7 +256,7 @@ struct ReleaseLatches {
case MTR_MEMO_SPACE_X_LOCK:
{
fil_space_t *space= static_cast<fil_space_t*>(slot->object);
space->committed_size= space->size;
space->set_committed_size();
rw_lock_x_unlock(&space->latch);
}
break;

View File

@@ -135,7 +135,6 @@ public:
static io_slots *read_slots;
static io_slots *write_slots;
static io_slots *ibuf_slots;
/** Number of retries for partial I/O's */
constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
@@ -3143,14 +3142,7 @@ os_file_io(
bytes_returned += n_bytes;
if (offset > 0
&& type.is_write()
&& type.punch_hole()) {
*err = type.punch_hole(file, offset, n);
} else {
*err = DB_SUCCESS;
}
*err = type.maybe_punch_hole(offset, n);
return(original_n);
}
@@ -3161,8 +3153,7 @@ os_file_io(
bytes_returned += n_bytes;
if (!type.is_partial_io_warning_disabled()) {
if (type.type != IORequest::READ_MAYBE_PARTIAL) {
const char* op = type.is_read()
? "read" : "written";
@@ -3180,7 +3171,7 @@ os_file_io(
*err = DB_IO_ERROR;
if (!type.is_partial_io_warning_disabled()) {
if (type.type != IORequest::READ_MAYBE_PARTIAL) {
ib::warn()
<< "Retry attempts for "
<< (type.is_read() ? "reading" : "writing")
@@ -3208,7 +3199,6 @@ os_file_pwrite(
os_offset_t offset,
dberr_t* err)
{
ut_ad(type.validate());
ut_ad(type.is_write());
++os_n_file_writes;
@@ -3242,7 +3232,6 @@ os_file_write_func(
{
dberr_t err;
ut_ad(type.validate());
ut_ad(n > 0);
WAIT_ALLOW_WRITES();
@@ -3332,7 +3321,6 @@ os_file_read_page(
os_bytes_read_since_printout += n;
ut_ad(type.validate());
ut_ad(n > 0);
ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
@@ -3657,13 +3645,9 @@ fallback:
n_bytes = buf_size;
}
dberr_t err;
IORequest request(IORequest::WRITE);
err = os_file_write(
request, name, file, buf, current_size, n_bytes);
if (err != DB_SUCCESS) {
if (os_file_write(IORequestWrite, name,
file, buf, current_size, n_bytes) !=
DB_SUCCESS) {
break;
}
@@ -3786,18 +3770,11 @@ os_file_punch_hole(
#endif /* _WIN32 */
}
inline bool IORequest::should_punch_hole() const
{
return m_fil_node && m_fil_node->space->punch_hole;
}
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
@param[in] off Starting offset (SEEK_SET)
@param[in] len Size of the hole
@param off byte offset from the start (SEEK_SET)
@param len size of the hole in bytes
@return DB_SUCCESS or error code */
dberr_t
IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const
{
/* In this debugging mode, we act as if punch hole is supported,
and then skip any calls to actually punch a hole here.
@@ -3806,7 +3783,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
return(DB_SUCCESS);
);
ulint trim_len = get_trim_length(len);
ulint trim_len = bpage ? bpage->physical_size() - len : 0;
if (trim_len == 0) {
return(DB_SUCCESS);
@@ -3816,11 +3793,11 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
/* Check does file system support punching holes for this
tablespace. */
if (!should_punch_hole()) {
if (!node->space->punch_hole) {
return DB_IO_NO_PUNCH_HOLE;
}
dberr_t err = os_file_punch_hole(fh, off, trim_len);
dberr_t err = os_file_punch_hole(node->handle, off, trim_len);
if (err == DB_SUCCESS) {
srv_stats.page_compressed_trim_op.inc();
@@ -3828,7 +3805,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
/* If punch hole is not supported,
set space so that it is not used. */
if (err == DB_IO_NO_PUNCH_HOLE) {
m_fil_node->space->punch_hole = false;
node->space->punch_hole = false;
err = DB_SUCCESS;
}
}
@@ -3885,12 +3862,8 @@ static void io_callback(tpool::aiocb* cb)
os_aio_userdata_t data(cb->m_userdata);
/* Return cb back to cache*/
if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD) {
if (read_slots->contains(cb)) {
ut_ad(read_slots->contains(cb));
read_slots->release(cb);
} else {
ut_ad(ibuf_slots->contains(cb));
ibuf_slots->release(cb);
}
} else {
ut_ad(write_slots->contains(cb));
write_slots->release(cb);
@@ -4033,8 +4006,7 @@ bool os_aio_init(ulint n_reader_threads, ulint n_writer_threads, ulint)
{
int max_write_events= int(n_writer_threads * OS_AIO_N_PENDING_IOS_PER_THREAD);
int max_read_events= int(n_reader_threads * OS_AIO_N_PENDING_IOS_PER_THREAD);
int max_ibuf_events = 1 * OS_AIO_N_PENDING_IOS_PER_THREAD;
int max_events = max_read_events + max_write_events + max_ibuf_events;
int max_events = max_read_events + max_write_events;
int ret;
#if LINUX_NATIVE_AIO
@@ -4053,7 +4025,6 @@ bool os_aio_init(ulint n_reader_threads, ulint n_writer_threads, ulint)
}
read_slots = new io_slots(max_read_events, (uint)n_reader_threads);
write_slots = new io_slots(max_write_events, (uint)n_writer_threads);
ibuf_slots = new io_slots(max_ibuf_events, 1);
return true;
}
@@ -4062,10 +4033,8 @@ void os_aio_free()
srv_thread_pool->disable_aio();
delete read_slots;
delete write_slots;
delete ibuf_slots;
read_slots= nullptr;
write_slots= nullptr;
ibuf_slots= nullptr;
}
/** Waits until there are no pending writes. There can
@@ -4088,7 +4057,6 @@ void os_aio_wait_until_no_pending_writes()
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@param[in,out] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -4106,8 +4074,7 @@ Requests an asynchronous i/o operation.
@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,
@@ -4126,10 +4093,7 @@ os_aio_func(
ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */
DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
if (mode == OS_AIO_SYNC) {
if (!type.is_async()) {
if (type.is_read()) {
return(os_file_read_func(type, file, buf, offset, n));
}
@@ -4141,20 +4105,14 @@ os_aio_func(
if (type.is_read()) {
++os_n_file_reads;
} else if (type.is_write()) {
++os_n_file_writes;
} else {
ut_error;
ut_ad(type.is_write());
++os_n_file_writes;
}
compile_time_assert(sizeof(os_aio_userdata_t) <= tpool::MAX_AIO_USERDATA_LEN);
os_aio_userdata_t userdata{m1,type,m2};
io_slots* slots;
if (type.is_read()) {
slots = mode == OS_AIO_IBUF?ibuf_slots: read_slots;
} else {
slots = write_slots;
}
io_slots* slots= type.is_read() ? read_slots : write_slots;
tpool::aiocb* cb = slots->acquire();
cb->m_buffer = buf;
@@ -4462,12 +4420,11 @@ void fil_node_t::find_metadata(os_file_t file
}
/** Read the first page of a data file.
@param[in] first whether this is the very first read
@return whether the page was found valid */
bool fil_node_t::read_page0(bool first)
bool fil_node_t::read_page0()
{
ut_ad(mutex_own(&fil_system.mutex));
const ulint psize = space->physical_size();
const unsigned psize = space->physical_size();
#ifndef _WIN32
struct stat statbuf;
if (fstat(handle, &statbuf)) {
@@ -4479,7 +4436,7 @@ bool fil_node_t::read_page0(bool first)
os_offset_t size_bytes = os_file_get_size(handle);
ut_a(size_bytes != (os_offset_t) -1);
#endif
const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
if (size_bytes < min_size) {
ib::error() << "The size of the file " << name
@@ -4546,14 +4503,11 @@ invalid:
return false;
}
if (first) {
ut_ad(space->id != TRX_SYS_SPACE);
#ifdef UNIV_LINUX
find_metadata(handle, &statbuf);
#else
find_metadata();
#endif
/* Truncate the size to a multiple of extent size. */
ulint mask = psize * FSP_EXTENT_SIZE - 1;
@@ -4568,19 +4522,7 @@ invalid:
space->punch_hole = space->is_compressed();
this->size = uint32_t(size_bytes / psize);
space->committed_size = space->size += this->size;
} else if (space->id != TRX_SYS_SPACE || space->size_in_header) {
/* If this is not the first-time open, do nothing.
For the system tablespace, we always get invoked as
first=false, so we detect the true first-time-open based
on size_in_header and proceed to initialize the data. */
return true;
} else {
/* Initialize the size of predefined tablespaces
to FSP_SIZE. */
space->committed_size = size;
}
space->set_sizes(this->size);
ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
ut_ad(space->free_len == 0 || space->free_len == free_len);
space->size_in_header = size;

View File

@@ -3424,8 +3424,7 @@ fil_iterate(
byte* const writeptr = readptr;
err = os_file_read_no_error_handling(
IORequest(IORequest::READ
| IORequest::DISABLE_PARTIAL_IO_WARNINGS),
IORequestReadPartial,
iter.file, readptr, offset, n_bytes, 0);
if (err != DB_SUCCESS) {
ib::error() << iter.filepath
@@ -3664,9 +3663,7 @@ not_encrypted:
/* A page was updated in the set, write back to disk. */
if (updated) {
IORequest write_request(IORequest::WRITE);
err = os_file_write(write_request,
err = os_file_write(IORequestWrite,
iter.filepath, iter.file,
writeptr, offset, n_bytes);
@@ -3759,9 +3756,7 @@ fil_tablespace_iterate(
/* Read the first page and determine the page and zip size. */
err = os_file_read_no_error_handling(
IORequest(IORequest::READ
| IORequest::DISABLE_PARTIAL_IO_WARNINGS),
err = os_file_read_no_error_handling(IORequestReadPartial,
file, page, 0, srv_page_size, 0);
if (err == DB_SUCCESS) {

View File

@@ -545,7 +545,7 @@ row_quiesce_table_start(
if (!trx_is_interrupted(trx)) {
/* Ensure that all asynchronous IO is completed. */
os_aio_wait_until_no_pending_writes();
fil_flush(table->space_id);
table->space->flush();
if (row_quiesce_write_cfg(table, trx->mysql_thd)
!= DB_SUCCESS) {

View File

@@ -229,10 +229,12 @@ srv_file_check_mode(
static const char INIT_LOG_FILE0[]= "101";
/** Creates log file.
@param[in] create_new_db whether the database is being initialized
@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value
@param[out] logfile0 name of the log file
@return DB_SUCCESS or error code */
static dberr_t create_log_file(lsn_t lsn, std::string& logfile0)
static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
std::string& logfile0)
{
if (srv_read_only_mode) {
ib::error() << "Cannot create log file in read-only mode";
@@ -296,7 +298,9 @@ static dberr_t create_log_file(lsn_t lsn, std::string& logfile0)
}
log_sys.log.open_file(logfile0);
fil_open_system_tablespace_files();
if (!fil_system.sys_space->open(create_new_db)) {
return DB_ERROR;
}
/* Create a log checkpoint. */
log_mutex_enter();
@@ -553,7 +557,7 @@ err_exit:
fil_set_max_space_id_if_bigger(space_id);
fil_space_t *space= fil_space_create(undo_name, space_id, fsp_flags,
fil_space_t *space= fil_space_t::create(undo_name, space_id, fsp_flags,
FIL_TYPE_TABLESPACE, NULL);
ut_a(fil_validate());
ut_a(space);
@@ -563,21 +567,16 @@ err_exit:
if (create)
{
space->set_sizes(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
space->size= file->size= uint32_t(size >> srv_page_size_shift);
space->size_in_header= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
space->committed_size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
}
else
{
success= file->read_page0(true);
if (!success)
else if (!file->read_page0())
{
os_file_close(file->handle);
file->handle= OS_FILE_CLOSED;
ut_a(fil_system.n_open > 0);
fil_system.n_open--;
}
}
mutex_exit(&fil_system.mutex);
return space_id;
@@ -803,7 +802,7 @@ srv_open_tmp_tablespace(bool create_new_db)
true, create_new_db, &sum_of_new_sizes, NULL))
!= DB_SUCCESS) {
ib::error() << "Unable to create the shared innodb_temporary";
} else if (fil_system.temp_space->open()) {
} else if (fil_system.temp_space->open(true)) {
/* Initialize the header page */
mtr_t mtr;
mtr.start();
@@ -1304,7 +1303,7 @@ dberr_t srv_start(bool create_new_db)
log_sys.set_flushed_lsn(flushed_lsn);
buf_flush_sync();
err = create_log_file(flushed_lsn, logfile0);
err = create_log_file(true, flushed_lsn, logfile0);
if (err != DB_SUCCESS) {
return(srv_init_abort(err));
@@ -1333,7 +1332,7 @@ dberr_t srv_start(bool create_new_db)
srv_log_file_size = srv_log_file_size_requested;
err = create_log_file(flushed_lsn, logfile0);
err = create_log_file(false, flushed_lsn, logfile0);
if (err == DB_SUCCESS) {
err = create_log_file_rename(flushed_lsn,
@@ -1364,11 +1363,11 @@ dberr_t srv_start(bool create_new_db)
file_checked:
/* Open log file and data files in the systemtablespace: we keep
them open until database shutdown */
fil_open_system_tablespace_files();
ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug);
err = srv_undo_tablespaces_init(create_new_db);
err = fil_system.sys_space->open(create_new_db)
? srv_undo_tablespaces_init(create_new_db)
: DB_ERROR;
/* If the force recovery is set very high then we carry on regardless
of all errors. Basically this is fingers crossed mode. */
@@ -1673,7 +1672,7 @@ file_checked:
srv_log_file_size = srv_log_file_size_requested;
err = create_log_file(flushed_lsn, logfile0);
err = create_log_file(false, flushed_lsn, logfile0);
if (err == DB_SUCCESS) {
err = create_log_file_rename(flushed_lsn,

View File

@@ -584,11 +584,10 @@ static void trx_purge_truncate_history()
: 0, j = i;; ) {
ulint space_id = srv_undo_space_id_start + i;
ut_ad(srv_is_undo_tablespace(space_id));
fil_space_t* space= fil_space_get(space_id);
if (fil_space_get_size(space_id)
> threshold) {
purge_sys.truncate.current
= fil_space_get(space_id);
if (space && space->get_size() > threshold) {
purge_sys.truncate.current = space;
break;
}