1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention

Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored
for system tablespace on SSD

When the maximum configured number of files is exceeded, InnoDB will
close data files. We used to maintain a fil_system.LRU list and
a counter fil_node_t::n_pending to achieve this, at the huge cost
of multiple fil_system.mutex operations per I/O operation.

fil_node_open_file_low(): Implement a FIFO replacement policy:
The last opened file will be moved to the end of fil_system.space_list,
and files will be closed from the start of the list. However, we will
not move tablespaces in fil_system.space_list while
i_s_tablespaces_encryption_fill_table() is executing
(producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION)
because it may cause information of some tablespaces to go missing.
We also avoid this in mariabackup --backup because datafiles_iter_next()
assumes that the ordering is not changed.

IORequest: Fold more parameters to IORequest::type.

fil_space_t::io(): Replaces fil_io().

fil_space_t::flush(): Replaces fil_flush().

OS_AIO_IBUF: Remove. We will always issue synchronous reads of the
change buffer pages in buf_read_page_low().

We will always ignore some errors for background reads.

This should reduce fil_system.mutex contention a little.

fil_node_t::complete_write(): Replaces fil_node_t::complete_io().
On both read and write completion, fil_space_t::release_for_io()
will have to be called.

fil_space_t::io(): Do not acquire fil_system.mutex in the normal
code path.

xb_delta_open_matching_space(): Do not try to open the system tablespace
which was already opened. This fixes a file sharing violation in
mariabackup --prepare --incremental.

Reviewed by: Vladislav Vaintroub
This commit is contained in:
Marko Mäkelä
2020-10-26 15:59:30 +02:00
parent 3a9a3be1c6
commit 45ed9dd957
39 changed files with 1301 additions and 1898 deletions

View File

@@ -93,7 +93,6 @@ xb_fil_node_close_file(
mutex_enter(&fil_system.mutex);
ut_ad(node);
ut_a(node->n_pending == 0);
ut_a(node->n_pending_flushes == 0);
ut_a(!node->being_extended);
@@ -108,20 +107,10 @@ xb_fil_node_close_file(
ut_a(ret);
node->handle = OS_FILE_CLOSED;
mutex_exit(&fil_system.mutex);
ut_a(fil_system.n_open > 0);
fil_system.n_open--;
if (node->space->purpose == FIL_TYPE_TABLESPACE &&
fil_is_user_tablespace_id(node->space->id)) {
ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0);
/* The node is in the LRU list, remove it */
UT_LIST_REMOVE(fil_system.LRU, node);
}
mutex_exit(&fil_system.mutex);
}
/************************************************************************
@@ -180,18 +169,8 @@ xb_fil_cur_open(
return(XB_FIL_CUR_SKIP);
}
mutex_enter(&fil_system.mutex);
fil_system.n_open++;
if (node->space->purpose == FIL_TYPE_TABLESPACE &&
fil_is_user_tablespace_id(node->space->id)) {
/* Put the node to the LRU list */
UT_LIST_ADD_FIRST(fil_system.LRU, node);
}
mutex_exit(&fil_system.mutex);
}
ut_ad(node->is_open());
@@ -427,7 +406,7 @@ xb_fil_cur_read(
retry_count = 10;
ret = XB_FIL_CUR_SUCCESS;
fil_space_t *space = fil_space_acquire_for_io(cursor->space_id);
fil_space_t *space = fil_space_t::get_for_io(cursor->space_id);
if (!space) {
return XB_FIL_CUR_ERROR;

View File

@@ -3011,6 +3011,7 @@ void
xb_fil_io_init()
{
fil_system.create(srv_file_per_table ? 50000 : 5000);
fil_system.freeze_space_list = 1;
fil_system.space_id_reuse_warned = true;
}
@@ -3087,24 +3088,16 @@ xb_load_single_table_tablespace(
bool is_empty_file = file->exists() && file->is_empty_file();
if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) {
os_offset_t node_size = os_file_get_size(file->handle());
os_offset_t n_pages;
ut_a(node_size != (os_offset_t) -1);
n_pages = node_size / fil_space_t::physical_size(file->flags());
space = fil_space_create(
space = fil_space_t::create(
name, file->space_id(), file->flags(),
FIL_TYPE_TABLESPACE, NULL/* TODO: crypt_data */);
ut_a(space != NULL);
space->add(file->filepath(), OS_FILE_CLOSED, uint32_t(n_pages),
false, false);
space->add(file->filepath(), OS_FILE_CLOSED, 0, false, false);
/* by opening the tablespace we forcing node and space objects
in the cache to be populated with fields from space header */
space->open();
space->get_size();
if (srv_operation == SRV_OPERATION_RESTORE_DELTA
|| xb_close_files) {
@@ -3406,19 +3399,6 @@ xb_load_tablespaces()
return(DB_SUCCESS);
}
/************************************************************************
Initialize the tablespace memory cache and populate it by scanning for and
opening data files.
@returns DB_SUCCESS or error code.*/
static
dberr_t
xb_data_files_init()
{
xb_fil_io_init();
return(xb_load_tablespaces());
}
/** Destroy the tablespace memory cache. */
static void xb_data_files_close()
{
@@ -4607,6 +4587,22 @@ xb_delta_open_matching_space(
return file;
}
if (!info.space_id && fil_system.sys_space) {
fil_node_t *node
= UT_LIST_GET_FIRST(fil_system.sys_space->chain);
for (; node; node = UT_LIST_GET_NEXT(chain, node)) {
if (!strcmp(node->name, real_name)) {
break;
}
}
if (node && node->handle != OS_FILE_CLOSED) {
*success = true;
return node->handle;
}
msg("mariabackup: Cannot find file %s\n", real_name);
return OS_FILE_CLOSED;
}
log_mutex_enter();
if (!fil_is_user_tablespace_id(info.space_id)) {
found:
@@ -4704,7 +4700,7 @@ exit:
ut_ad(fil_space_t::zip_size(flags) == info.zip_size);
ut_ad(fil_space_t::physical_size(flags) == info.page_size);
if (fil_space_create(dest_space_name, info.space_id, flags,
if (fil_space_t::create(dest_space_name, info.space_id, flags,
FIL_TYPE_TABLESPACE, 0)) {
*success = xb_space_create_file(real_name, info.space_id,
flags, &file);
@@ -4925,7 +4921,7 @@ xtrabackup_apply_delta(
os_file_close(src_file);
os_file_delete(0,src_path);
}
if (dst_file != OS_FILE_CLOSED)
if (dst_file != OS_FILE_CLOSED && info.space_id)
os_file_close(dst_file);
return TRUE;
@@ -4933,7 +4929,7 @@ error:
aligned_free(incremental_buffer);
if (src_file != OS_FILE_CLOSED)
os_file_close(src_file);
if (dst_file != OS_FILE_CLOSED)
if (dst_file != OS_FILE_CLOSED && info.space_id)
os_file_close(dst_file);
msg("Error: xtrabackup_apply_delta(): "
"failed to apply %s to %s.\n", src_path, dst_path);
@@ -5387,8 +5383,8 @@ static bool xtrabackup_prepare_func(char** argv)
srv_allow_writes_event = os_event_create(0);
os_event_set(srv_allow_writes_event);
#endif
dberr_t err = xb_data_files_init();
if (err != DB_SUCCESS) {
xb_fil_io_init();
if (dberr_t err = xb_load_tablespaces()) {
msg("mariabackup: error: xb_data_files_init() failed "
"with error %s\n", ut_strerr(err));
goto error_cleanup;
@@ -5396,7 +5392,8 @@ static bool xtrabackup_prepare_func(char** argv)
inc_dir_tables_hash.create(1000);
ok = xtrabackup_apply_deltas();
ok = fil_system.sys_space->open(false)
&& xtrabackup_apply_deltas();
xb_data_files_close();
@@ -5426,6 +5423,8 @@ static bool xtrabackup_prepare_func(char** argv)
goto error_cleanup;
}
fil_system.freeze_space_list = 0;
/* increase IO threads */
if (srv_n_file_io_threads < 10) {
srv_n_read_io_threads = 4;
@@ -5447,6 +5446,8 @@ static bool xtrabackup_prepare_func(char** argv)
goto error_cleanup;
}
ut_ad(!fil_system.freeze_space_list);
if (ok) {
msg("Last binlog file %s, position %lld",
trx_sys.recovered_binlog_filename,

View File

@@ -29,6 +29,7 @@ create table t1(a int not null primary key, b char(200)) engine=innodb;
--source include/wait_condition.inc
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--echo # Success!
@@ -41,6 +42,7 @@ SET GLOBAL innodb_encrypt_tables = off;
--let $wait_condition=SELECT COUNT(*) = $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0;
--source include/wait_condition.inc
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
@@ -51,6 +53,7 @@ SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_
--let $restart_parameters=--skip-file-key-management --innodb-encrypt-tables=OFF --innodb-encryption-threads=0 --innodb-tablespaces-encryption
-- source include/restart_mysqld.inc
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

View File

@@ -26,6 +26,7 @@ let $restart_parameters= --innodb_encryption_threads=5 --innodb_encryption_rotat
--source include/wait_condition.inc
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--echo # Restart the server with innodb_encryption_rotate_key_age= 0
@@ -45,6 +46,7 @@ create table t4 (f1 int not null)engine=innodb encrypted=NO;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--echo # Disable encryption when innodb_encryption_rotate_key_age is 0
@@ -57,6 +59,7 @@ set global innodb_encrypt_tables = OFF;
--let $wait_condition=SELECT COUNT(*) >= $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0;
--source include/wait_condition.inc
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--echo # Display only encrypted create tables (t3)
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
@@ -73,11 +76,13 @@ set global innodb_encrypt_tables = ON;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--echo # Display only unencrypted create tables (t4)
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
--let $restart_parameters=
-- source include/restart_mysqld.inc
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
DROP TABLE t4, t3, t2, t1;

View File

@@ -1,4 +1,4 @@
call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded");
call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded");
SET @save_tdc= @@GLOBAL.table_definition_cache;
SET @save_toc= @@GLOBAL.table_open_cache;
SET GLOBAL table_definition_cache= 400;

View File

@@ -32,18 +32,6 @@ commit;
set autocommit=1;
let $success= `SELECT variable_value FROM information_schema.global_status WHERE variable_name = 'innodb_num_page_compressed_trim_op'`;
if (!$success) {
--disable_query_log
--disable_result_log
DROP PROCEDURE innodb_insert_proc;
DROP TABLE innodb_page_compressed;
--enable_query_log
--enable_result_log
--skip "Test requires TRIM";
}
DROP PROCEDURE innodb_insert_proc;
DROP TABLE innodb_page_compressed;

View File

@@ -4,7 +4,7 @@
# This test is slow on buildbot.
--source include/big_test.inc
call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded");
call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded");
SET @save_tdc= @@GLOBAL.table_definition_cache;
SET @save_toc= @@GLOBAL.table_open_cache;

View File

@@ -1,3 +1,4 @@
# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2014, 2020, MariaDB Corporation.
#
@@ -186,7 +187,6 @@ SET(INNOBASE_SOURCES
include/mtr0mtr.h
include/mtr0mtr.ic
include/mtr0types.h
include/os0api.h
include/os0event.h
include/os0file.h
include/os0file.ic

View File

@@ -3304,22 +3304,35 @@ upd_sys:
/**
Prefetch siblings of the leaf for the pessimistic operation.
@param block leaf page */
static void btr_cur_prefetch_siblings(const buf_block_t* block)
@param block leaf page
@param index index of the page */
static void btr_cur_prefetch_siblings(const buf_block_t *block,
const dict_index_t *index)
{
const page_t *page= block->frame;
ut_ad(page_is_leaf(page));
ut_ad(page_is_leaf(block->frame));
if (index->is_ibuf())
return;
const page_t *page= block->frame;
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
if (prev != FIL_NULL)
buf_read_page_background(page_id_t(block->page.id().space(), prev),
{
ut_a(index->table->space->acquire_for_io());
buf_read_page_background(index->table->space,
page_id_t(block->page.id().space(), prev),
block->zip_size(), false);
}
if (next != FIL_NULL)
buf_read_page_background(page_id_t(block->page.id().space(), next),
{
ut_a(index->table->space->acquire_for_io());
buf_read_page_background(index->table->space,
page_id_t(block->page.id().space(), next),
block->zip_size(), false);
}
}
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
@@ -3436,8 +3449,8 @@ fail:
/* prefetch siblings of the leaf for the pessimistic
operation, if the page is leaf. */
if (page_is_leaf(page) && !index->is_ibuf()) {
btr_cur_prefetch_siblings(block);
if (page_is_leaf(page)) {
btr_cur_prefetch_siblings(block, index);
}
fail_err:
@@ -4575,7 +4588,7 @@ any_extern:
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, index);
return(DB_OVERFLOW);
}
@@ -4766,10 +4779,10 @@ func_exit:
}
}
if (err != DB_SUCCESS && !index->is_ibuf()) {
if (err != DB_SUCCESS) {
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, index);
}
return(err);
@@ -5481,7 +5494,7 @@ btr_cur_optimistic_delete_func(
if (!no_compress_needed) {
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, cursor->index);
goto func_exit;
}

View File

@@ -2768,7 +2768,7 @@ buf_zip_decompress(
ulint size = page_zip_get_size(&block->page.zip);
/* The tablespace will not be found if this function is called
during IMPORT. */
fil_space_t* space= fil_space_acquire_for_io(block->page.id().space());
fil_space_t* space= fil_space_t::get_for_io(block->page.id().space());
const unsigned key_version = mach_read_from_4(
frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
@@ -3034,10 +3034,9 @@ buf_page_get_low(
/* fall through */
case BUF_GET:
case BUF_GET_IF_IN_POOL_OR_WATCH:
fil_space_t* s = fil_space_acquire_for_io(page_id.space());
fil_space_t* s = fil_space_get(page_id.space());
ut_ad(s);
ut_ad(s->zip_size() == zip_size);
s->release_for_io();
}
#endif /* UNIV_DEBUG */
@@ -3107,7 +3106,7 @@ lookup:
}
/* The call path is buf_read_page() ->
buf_read_page_low() (fil_io()) ->
buf_read_page_low() (fil_space_t::io()) ->
buf_page_read_complete() ->
buf_decrypt_after_read(). Here fil_space_t* is used
and we decrypt -> buf_page_check_corrupt() where page
@@ -3161,8 +3160,7 @@ lookup:
asserting. */
if (page_id.space() == TRX_SYS_SPACE) {
} else if (page_id.space() == SRV_TMP_SPACE_ID) {
} else if (fil_space_t* space
= fil_space_acquire_for_io(
} else if (fil_space_t* space= fil_space_t::get_for_io(
page_id.space())) {
bool set = dict_set_corrupted_by_space(space);
space->release_for_io();
@@ -3376,8 +3374,8 @@ re_evict:
if (mode != BUF_GET_IF_IN_POOL
&& mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
} else if (!ibuf_debug) {
} else if (fil_space_t* space =
fil_space_acquire_for_io(page_id.space())) {
} else if (fil_space_t* space
= fil_space_t::get_for_io(page_id.space())) {
/* Try to evict the block from the buffer pool, to use the
insert buffer (change buffer) as much as possible. */
@@ -4869,17 +4867,4 @@ std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
<< ", page number=" << page_id.page_no() << "]";
return out;
}
/**
Calculate the length of trim (punch_hole) operation.
@param[in] bpage Page control block
@param[in] write_length Write length
@return length of the trim or zero. */
ulint
buf_page_get_trim_length(
const buf_page_t* bpage,
ulint write_length)
{
return bpage->physical_size() - write_length;
}
#endif /* !UNIV_INNOCHECKSUM */

View File

@@ -125,7 +125,8 @@ too_small:
byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
trx_sys_block->frame;
for (uint32_t prev_page_no= 0, i= 0; i < 2 * size + FSP_EXTENT_SIZE / 2; i++)
for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
i < 2 * size + extent_size / 2; i++)
{
buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
FSP_UP, &mtr);
@@ -362,15 +363,13 @@ void buf_dblwr_t::recover()
continue;
}
fil_space_t* space= fil_space_acquire_for_io(space_id);
fil_space_t *space= fil_space_t::get_for_io(space_id);
if (!space)
/* The tablespace that this page once belonged to does not exist */
continue;
fil_space_open_if_needed(space);
if (UNIV_UNLIKELY(page_no >= space->size))
if (UNIV_UNLIKELY(page_no >= space->get_size()))
{
/* Do not report the warning for undo tablespaces, because they
can be truncated in place. */
@@ -385,7 +384,6 @@ next_page:
}
const ulint physical_size= space->physical_size();
const ulint zip_size= space->zip_size();
ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
/* We want to ensure that for partial reads the unread portion of
@@ -393,18 +391,15 @@ next_page:
memset(read_buf, 0x0, physical_size);
/* Read in the actual page from the file */
fil_io_t fio= fil_io(IORequest(IORequest::READ | IORequest::DBLWR_RECOVER),
true, page_id, zip_size,
0, physical_size, read_buf, nullptr);
fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
os_offset_t{page_no} * physical_size,
physical_size, read_buf);
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
ib::warn() << "Double write buffer recovery: " << page_id
<< " (tablespace '" << space->name
<< "') read failed with error: " << fio.err;
if (fio.node)
fio.node->space->release_for_io();
if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
{
/* We will check if the copy in the doublewrite buffer is
@@ -425,18 +420,16 @@ next_page:
/* Write the good page from the doublewrite buffer to the intended
position. */
fio= fil_io(IORequestWrite, true, page_id, zip_size, 0, physical_size,
page, nullptr);
space->reacquire_for_io();
fio= space->io(IORequestWrite,
os_offset_t{page_id.page_no()} * physical_size,
physical_size, page);
if (fio.node)
{
ut_ad(fio.err == DB_SUCCESS);
if (fio.err == DB_SUCCESS)
ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
<< "' from the doublewrite buffer.";
fio.node->space->release_for_io();
goto next_page;
}
}
recv_sys.dblwr.pages.clear();
fil_flush_file_spaces();
@@ -513,7 +506,7 @@ static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
{
if (fil_space_t *space= fil_space_acquire_for_io(b.id().space()))
if (fil_space_t *space= fil_space_t::get_for_io(b.id().space()))
{
buf_dblwr_check_page_lsn(page, *space);
space->release_for_io();
@@ -577,7 +570,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
#ifdef UNIV_DEBUG
for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
{
buf_page_t *bpage= buf_block_arr[i].bpage;
buf_page_t *bpage= buf_block_arr[i].request.bpage;
if (bpage->zip.data)
/* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
@@ -590,18 +583,22 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
}
#endif /* UNIV_DEBUG */
/* Write out the first block of the doublewrite buffer */
fil_io_t fio= fil_io(IORequestWrite, true, block1, 0, 0,
std::min(size, old_first_free) << srv_page_size_shift,
write_buf, nullptr);
fio.node->space->release_for_io();
ut_a(fil_system.sys_space->acquire_for_io());
fil_system.sys_space->io(IORequestWrite,
os_offset_t{block1.page_no()} <<
srv_page_size_shift,
std::min(size, old_first_free) <<
srv_page_size_shift, write_buf);
if (old_first_free > size)
{
/* Write out the second block of the doublewrite buffer. */
fio= fil_io(IORequestWrite, true, block2, 0, 0,
ut_a(fil_system.sys_space->acquire_for_io());
fil_system.sys_space->io(IORequestWrite,
os_offset_t{block2.page_no()} <<
srv_page_size_shift,
(old_first_free - size) << srv_page_size_shift,
write_buf + (size << srv_page_size_shift), nullptr);
fio.node->space->release_for_io();
write_buf + (size << srv_page_size_shift));
}
/* increment the doublewrite flushed pages counter */
@@ -609,7 +606,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
srv_stats.dblwr_writes.inc();
/* Now flush the doublewrite buffer data to disk */
fil_flush(TRX_SYS_SPACE);
fil_system.sys_space->flush();
/* We know that the writes have been flushed to disk now
and in recovery we will find them in the doublewrite buffer
@@ -629,8 +626,8 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
for (ulint i= 0; i < old_first_free; i++)
{
auto e= buf_block_arr[i];
buf_page_t* bpage= e.bpage;
ut_a(bpage->in_file());
buf_page_t* bpage= e.request.bpage;
ut_ad(bpage->in_file());
/* We request frame here to get correct buffer in case of
encryption and/or page compression */
@@ -650,8 +647,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
}
fil_io(IORequest(IORequest::WRITE, bpage, e.lru), false,
bpage->id(), bpage->zip_size(), 0, e_size, frame, bpage);
e.space->io(e.request, bpage->physical_offset(), e_size, frame, bpage);
}
return true;
@@ -680,12 +676,20 @@ void buf_dblwr_t::flush_buffered_writes()
/** Schedule a page write. If the doublewrite memory buffer is full,
flush_buffered_writes() will be invoked to make space.
@param bpage buffer pool page to be written
@param lru true=buf_pool.LRU; false=buf_pool.flush_list
@param space tablespace
@param request asynchronous write request
@param size payload size in bytes */
void buf_dblwr_t::add_to_batch(buf_page_t *bpage, bool lru, size_t size)
void buf_dblwr_t::add_to_batch(fil_space_t *space, const IORequest &request,
size_t size)
{
ut_ad(bpage->in_file());
ut_ad(request.is_async());
ut_ad(request.is_write());
ut_ad(request.bpage);
ut_ad(request.bpage->in_file());
ut_ad(space->id == request.bpage->id().space());
ut_ad(space->pending_io());
ut_ad(!srv_read_only_mode);
const ulint buf_size= 2 * block_size();
mysql_mutex_lock(&mutex);
@@ -707,13 +711,13 @@ void buf_dblwr_t::add_to_batch(buf_page_t *bpage, bool lru, size_t size)
/* We request frame here to get correct buffer in case of
encryption and/or page compression */
void *frame= buf_page_get_frame(bpage);
void *frame= buf_page_get_frame(request.bpage);
memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(p, frame, size);
ut_ad(!bpage->zip_size() || bpage->zip_size() == size);
ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
ut_ad(reserved == first_free);
ut_ad(reserved < buf_size);
buf_block_arr[first_free++]= { bpage, lru, size };
new (buf_block_arr + first_free++) element{space, request, size};
reserved= first_free;
if (first_free != buf_size || !flush_buffered_writes(buf_size / 2))

View File

@@ -626,6 +626,14 @@ buf_load()
so all pages from a given tablespace are consecutive. */
ulint cur_space_id = dump[0].space();
fil_space_t* space = fil_space_acquire_silent(cur_space_id);
if (space) {
bool ok = space->acquire_for_io();
space->release();
if (!ok) {
space = nullptr;
}
}
ulint zip_size = space ? space->zip_size() : 0;
PSI_stage_progress* pfs_stage_progress __attribute__((unused))
@@ -644,22 +652,32 @@ buf_load()
}
if (this_space_id != cur_space_id) {
if (space != NULL) {
space->release();
if (space) {
space->release_for_io();
}
cur_space_id = this_space_id;
space = fil_space_acquire_silent(cur_space_id);
if (space != NULL) {
zip_size = space->zip_size();
if (!space) {
continue;
}
bool ok = space->acquire_for_io();
space->release();
if (!ok) {
space = nullptr;
continue;
}
zip_size = space->zip_size();
}
/* JAN: TODO: As we use background page read below,
if tablespace is encrypted we cant use it. */
if (space == NULL ||
(space && space->crypt_data &&
if (!space || dump[i].page_no() >= space->get_size() ||
(space->crypt_data &&
space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
continue;
@@ -671,11 +689,12 @@ buf_load()
continue;
}
buf_read_page_background(dump[i], zip_size, true);
space->reacquire_for_io();
buf_read_page_background(space, dump[i], zip_size, true);
if (buf_load_abort_flag) {
if (space != NULL) {
space->release();
if (space) {
space->release_for_io();
}
buf_load_abort_flag = false;
ut_free(dump);
@@ -702,8 +721,8 @@ buf_load()
#endif
}
if (space != NULL) {
space->release();
if (space) {
space->release_for_io();
}
ut_free(dump);

View File

@@ -782,6 +782,11 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
{
ut_ad(bpage->in_file());
ut_ad(bpage->ready_for_flush());
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space));
ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
space->atomic_write_supported);
ut_ad(space->pending_io());
rw_lock_t *rw_lock;
@@ -807,11 +812,6 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
the buffer pool or removed from flush_list or LRU_list. */
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space));
ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
space->atomic_write_supported);
DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
lru ? "LRU" : "flush_list",
bpage->id().space(), bpage->id().page_no()));
@@ -850,19 +850,22 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
}
}
if (status == buf_page_t::FREED)
buf_release_freed_page(&block->page);
else
{
space->reacquire_for_io();
ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
size_t size, orig_size;
ulint type= IORequest::WRITE;
IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
{
ut_ad(!space->full_crc32());
ut_ad(!space->is_compressed()); /* not page_compressed */
orig_size= size= bpage->zip_size();
if (status != buf_page_t::FREED)
{
buf_flush_update_zip_checksum(frame, orig_size);
buf_flush_update_zip_checksum(frame, size);
frame= buf_page_encrypt(space, bpage, frame, &size);
}
ut_ad(size == bpage->zip_size());
}
else
@@ -870,8 +873,7 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
byte *page= block->frame;
orig_size= size= block->physical_size();
if (status == buf_page_t::FREED);
else if (space->full_crc32())
if (space->full_crc32())
{
/* innodb_checksum_algorithm=full_crc32 is not implemented for
ROW_FORMAT=COMPRESSED pages. */
@@ -888,44 +890,26 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
if (size != orig_size && space->punch_hole)
type|= IORequest::PUNCH_HOLE;
type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
#else
DBUG_EXECUTE_IF("ignore_punch_hole",
if (size != orig_size && space->punch_hole)
type|= IORequest::PUNCH_HOLE;);
type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;);
#endif
frame=page;
}
IORequest request(type, bpage, lru);
ut_ad(status == bpage->status);
switch (status) {
default:
ut_ad(status == buf_page_t::FREED);
buf_release_freed_page(bpage);
break;
case buf_page_t::NORMAL:
if (space->use_doublewrite())
{
ut_ad(!srv_read_only_mode);
if (lru)
buf_pool.n_flush_LRU++;
else
buf_pool.n_flush_list++;
buf_dblwr.add_to_batch(bpage, lru, size);
break;
}
/* fall through */
case buf_page_t::INIT_ON_FLUSH:
if (lru)
buf_pool.n_flush_LRU++;
if (status != buf_page_t::NORMAL || !space->use_doublewrite())
space->io(IORequest(type, bpage),
bpage->physical_offset(), size, frame, bpage);
else
buf_pool.n_flush_list++;
/* FIXME: pass space to fil_io() */
fil_io(request, false, bpage->id(), bpage->zip_size(), 0,
bpage->physical_size(), frame, bpage);
buf_dblwr.add_to_batch(space, IORequest(type, bpage), size);
}
/* Increment the I/O operation count used for selecting LRU policy. */
@@ -973,8 +957,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
? static_cast<uint32_t>(s) : read_ahead;
page_id_t low= id - (id.page_no() % buf_flush_area);
page_id_t high= low + buf_flush_area;
high.set_page_no(std::min(high.page_no(),
static_cast<uint32_t>(space.committed_size - 1)));
high.set_page_no(std::min(high.page_no(), space.last_page_number()));
if (!contiguous)
{
@@ -1018,13 +1001,12 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
return i;
}
MY_ATTRIBUTE((nonnull))
/** Write punch-hole or zeroes of the freed ranges when
innodb_immediate_scrub_data_uncompressed from the freed ranges.
@param[in] space tablespace which contains freed ranges
@param[in] freed_ranges freed ranges of the page to be flushed */
@param space tablespace which may contain ranges of freed pages */
static void buf_flush_freed_pages(fil_space_t *space)
{
ut_ad(space != NULL);
const bool punch_hole= space->punch_hole;
if (!srv_immediate_scrub_data_uncompressed && !punch_hole)
return;
@@ -1043,27 +1025,24 @@ static void buf_flush_freed_pages(fil_space_t *space)
for (const auto &range : freed_ranges)
{
ulint page_size= space->zip_size();
if (!page_size)
page_size= srv_page_size;
const ulint physical_size= space->physical_size();
if (punch_hole)
{
const auto len= (range.last - range.first + 1) * page_size;
const page_id_t page_id(space->id, range.first);
fil_io_t fio= fil_io(IORequestWrite, true, page_id, space->zip_size(),
0, len, nullptr, nullptr, false, true);
if (fio.node)
fio.node->space->release_for_io();
space->reacquire_for_io();
space->io(IORequest(IORequest::PUNCH_RANGE),
os_offset_t{range.first} * physical_size,
(range.last - range.first + 1) * physical_size,
nullptr);
}
else if (srv_immediate_scrub_data_uncompressed)
{
for (auto i= range.first; i <= range.last; i++)
for (os_offset_t i= range.first; i <= range.last; i++)
{
const page_id_t page_id(space->id, i);
fil_io(IORequestWrite, false, page_id, space->zip_size(), 0,
space->zip_size() ? space->zip_size() : srv_page_size,
const_cast<byte*>(field_ref_zero), nullptr, false, false);
space->reacquire_for_io();
space->io(IORequest(IORequest::WRITE_ASYNC),
i * physical_size, physical_size,
const_cast<byte*>(field_ref_zero));
}
}
buf_pool.stat.n_pages_written+= (range.last - range.first + 1);
@@ -1093,7 +1072,8 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
ut_ad(page_id >= id);
ut_ad(page_id < high);
for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold)
for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
++id, ++id_fold)
{
if (count + n_flushed >= n_to_flush)
{
@@ -1190,7 +1170,7 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
@retval nullptr if the pages for this tablespace should be discarded */
static fil_space_t *buf_flush_space(const uint32_t id)
{
fil_space_t *space= fil_space_acquire_for_io(id);
fil_space_t *space= fil_space_t::get_for_io(id);
if (space)
buf_flush_freed_pages(space);
return space;
@@ -1204,6 +1184,37 @@ struct flush_counters_t
ulint evicted;
};
/** Try to discard a dirty page.
@param bpage dirty page whose tablespace is not accessible */
static void buf_flush_discard_page(buf_page_t *bpage)
{
/* Caller holds buf_pool.mutex but must not hold flush_list_mutex,
which we will acquire below. */
mysql_mutex_assert_owner(&buf_pool.mutex);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
ut_ad(bpage->in_file());
ut_ad(bpage->oldest_modification());
rw_lock_t *rw_lock;
if (bpage->state() != BUF_BLOCK_FILE_PAGE)
rw_lock= nullptr;
else
{
rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
/* Non-waiting latch acquisition: if another thread holds the block
latch, give up; a later scan may retry the discard. */
if (!rw_lock_sx_lock_nowait(rw_lock, 0))
return;
}
bpage->status= buf_page_t::NORMAL;
/* Detach the page from buf_pool.flush_list under its mutex. */
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_flush_remove(bpage);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (rw_lock)
rw_lock_sx_unlock(rw_lock);
/* Evict the block from the buffer pool without writing it back. */
buf_LRU_free_page(bpage, true);
}
/** Flush dirty blocks from the end of the LRU list.
@param max maximum number of blocks to make available in buf_pool.free
@param n counts of flushed and evicted pages */
@@ -1219,6 +1230,9 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
? 0 : srv_flush_neighbors;
fil_space_t *space= nullptr;
uint32_t last_space_id= FIL_NULL;
static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
bpage && n->flushed + n->evicted < max &&
@@ -1243,14 +1257,26 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
if (!space || space->id != space_id)
{
if (last_space_id != space_id)
{
if (space)
space->release_for_io();
space= buf_flush_space(space_id);
if (!space)
continue;
last_space_id= space_id;
}
if (neighbors && space->is_rotational())
else
ut_ad(!space);
}
else if (space->is_stopping())
{
space->release_for_io();
space= nullptr;
}
if (!space)
buf_flush_discard_page(bpage);
else if (neighbors && space->is_rotational())
{
mysql_mutex_unlock(&buf_pool.mutex);
n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
@@ -1328,6 +1354,9 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
? 0 : srv_flush_neighbors;
fil_space_t *space= nullptr;
uint32_t last_space_id= FIL_NULL;
static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
/* Start from the end of the list looking for a suitable block to be
flushed. */
@@ -1360,14 +1389,26 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
if (!space || space->id != space_id)
{
if (last_space_id != space_id)
{
if (space)
space->release_for_io();
space= buf_flush_space(space_id);
if (!space)
continue;
last_space_id= space_id;
}
if (neighbors && space->is_rotational())
else
ut_ad(!space);
}
else if (space->is_stopping())
{
space->release_for_io();
space= nullptr;
}
if (!space)
buf_flush_discard_page(bpage);
else if (neighbors && space->is_rotational())
{
mysql_mutex_unlock(&buf_pool.mutex);
count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
@@ -1476,10 +1517,9 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn)
while not holding buf_pool.flush_list_mutex */
if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
{
mysql_mutex_unlock(&buf_pool.mutex);
if (running)
return 0;
if (!running)
mysql_cond_broadcast(cond);
mysql_mutex_unlock(&buf_pool.mutex);
return 0;
}
n_flush++;

View File

@@ -261,26 +261,23 @@ flag is cleared and the x-lock released by an i/o-handler thread.
@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED
if we are trying
to read from a non-existent tablespace
@param[in,out] space tablespace
@param[in] sync true if synchronous aio is desired
@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] unzip true=request uncompressed page
@param[in] ignore whether to ignore out-of-bounds page_id
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
@return whether a read request was queued */
static
ulint
bool
buf_read_page_low(
dberr_t* err,
fil_space_t* space,
bool sync,
ulint mode,
const page_id_t page_id,
ulint zip_size,
bool unzip,
bool ignore = false)
bool unzip)
{
buf_page_t* bpage;
@@ -290,17 +287,22 @@ buf_read_page_low(
ib::error() << "Trying to read doublewrite buffer page "
<< page_id;
ut_ad(0);
return(0);
nothing_read:
space->release_for_io();
return false;
}
if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) {
if (sync) {
} else if (trx_sys_hdr_page(page_id)
|| ibuf_bitmap_page(page_id, zip_size)
|| (!recv_no_ibuf_operations
&& ibuf_page(page_id, zip_size, nullptr))) {
/* Trx sys header is so low in the latching order that we play
safe and do not leave the i/o-completion to an asynchronous
i/o-thread. Ibuf bitmap pages must always be read with
i/o-thread. Change buffer pages must always be read with
syncronous i/o, to make sure they do not get involved in
thread deadlocks. */
sync = true;
}
@@ -311,20 +313,19 @@ buf_read_page_low(
bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
if (bpage == NULL) {
goto nothing_read;
}
return(0);
ut_ad(bpage->in_file());
if (sync) {
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
}
DBUG_LOG("ib_buf",
"read page " << page_id << " zip_size=" << zip_size
<< " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
ut_ad(bpage->in_file());
if (sync) {
thd_wait_begin(NULL, THD_WAIT_DISKIO);
}
void* dst;
if (zip_size) {
@@ -335,20 +336,18 @@ buf_read_page_low(
dst = ((buf_block_t*) bpage)->frame;
}
fil_io_t fio = fil_io(
IORequestRead, sync, page_id, zip_size, 0,
zip_size ? zip_size : srv_page_size,
dst, bpage, ignore);
const ulint len = zip_size ? zip_size : srv_page_size;
auto fio = space->io(IORequest(sync
? IORequest::READ_SYNC
: IORequest::READ_ASYNC),
page_id.page_no() * len, len, dst, bpage);
*err= fio.err;
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
if (ignore || fio.err == DB_TABLESPACE_DELETED) {
if (!sync || fio.err == DB_TABLESPACE_DELETED) {
buf_pool.corrupted_evict(bpage);
if (sync && fio.node) {
fio.node->space->release_for_io();
}
return(0);
return false;
}
ut_error;
@@ -357,16 +356,16 @@ buf_read_page_low(
if (sync) {
thd_wait_end(NULL);
/* The i/o was already completed in fil_io() */
/* The i/o was already completed in space->io() */
*err = buf_page_read_complete(bpage, *fio.node);
fio.node->space->release_for_io();
space->release_for_io();
if (*err != DB_SUCCESS) {
return(0);
return false;
}
}
return(1);
return true;
}
/** Applies a random read-ahead in buf_pool if there are at least a threshold
@@ -411,7 +410,7 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
ulint count= 5 + buf_read_ahead_area / 8;
const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
page_id_t high= low + buf_read_ahead_area;
high.set_page_no(std::min(high.page_no(), space->committed_size - 1));
high.set_page_no(std::min(high.page_no(), space->last_page_number()));
/* Count how many blocks in the area have been recently accessed,
that is, reside near the start of the LRU list. */
@@ -427,10 +426,14 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
goto read_ahead;
}
no_read_ahead:
space->release();
return 0;
read_ahead:
if (!space->acquire_for_io())
goto no_read_ahead;
/* Read all the suitable blocks within the area */
const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
@@ -441,13 +444,16 @@ read_ahead:
if (space->is_stopping())
break;
dberr_t err;
count+= buf_read_page_low(&err, false, ibuf_mode, i, zip_size, false);
space->reacquire_for_io();
if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
count++;
}
if (count)
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
low.page_no()));
space->release_for_io();
space->release();
/* Read ahead is considered one I/O operation for the purpose of
@@ -472,41 +478,49 @@ after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
{
dberr_t err = DB_SUCCESS;
ulint count = buf_read_page_low(
&err, true, BUF_READ_ANY_PAGE, page_id, zip_size, false);
srv_stats.buf_pool_reads.add(count);
if (err == DB_TABLESPACE_DELETED) {
fil_space_t *space= fil_space_acquire(page_id.space());
if (!space)
{
ib::info() << "trying to read page " << page_id
<< " in nonexisting or being-dropped tablespace";
return DB_TABLESPACE_DELETED;
}
else if (!space->acquire_for_io())
{
ib::warn() << "unable to read " << page_id << " from tablespace "
<< space->name;
space->release();
return DB_PAGE_CORRUPTED;
}
/* Increment number of I/O operations used for LRU policy. */
buf_LRU_stat_inc_io();
space->release();
return(err);
dberr_t err;
if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE,
page_id, zip_size, false))
srv_stats.buf_pool_reads.add(1);
buf_LRU_stat_inc_io();
return err;
}
/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in,out] space tablespace
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] sync true if synchronous aio is desired */
void
buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size, bool sync)
{
ulint count;
dberr_t err;
count = buf_read_page_low(
&err, sync,
BUF_READ_ANY_PAGE,
page_id, zip_size, false, true);
if (buf_read_page_low(&err, space, sync, BUF_READ_ANY_PAGE,
page_id, zip_size, false)) {
srv_stats.buf_pool_reads.add(1);
}
switch (err) {
case DB_SUCCESS:
@@ -528,8 +542,6 @@ buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync)
<< page_id;
}
srv_stats.buf_pool_reads.add(count);
/* We do not increment number of I/O operations used for LRU policy
here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
about evicting uncompressed version of compressed pages from the
@@ -598,10 +610,19 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
fil_space_t *space= fil_space_acquire(page_id.space());
if (!space)
return 0;
if (high_1.page_no() >= space->committed_size)
else
{
bool ok= space->acquire_for_io();
space->release();
if (!ok)
return 0;
}
if (high_1.page_no() > space->last_page_number())
{
/* The area is not whole. */
space->release();
fail:
space->release_for_io();
return 0;
}
@@ -628,8 +649,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
{
hard_fail:
hash_lock->read_unlock();
space->release();
return 0;
goto fail;
}
const byte *f;
switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
@@ -661,7 +681,7 @@ hard_fail:
if (id != new_low && id != new_high_1)
/* This is not a border page of the area: return */
goto hard_fail;
if (new_high_1.page_no() >= space->committed_size)
if (new_high_1.page_no() > space->last_page_number())
/* The area is not whole */
goto hard_fail;
}
@@ -671,8 +691,7 @@ failed:
hash_lock->read_unlock();
if (--count)
continue;
space->release();
return 0;
goto fail;
}
const unsigned accessed= bpage->is_accessed();
@@ -702,7 +721,8 @@ failed:
if (space->is_stopping())
break;
dberr_t err;
count+= buf_read_page_low(&err, false, ibuf_mode, new_low, zip_size,
space->reacquire_for_io();
count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size,
false);
}
@@ -710,7 +730,7 @@ failed:
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
new_low.page_no()));
space->release();
space->release_for_io();
/* Read ahead is considered one I/O operation for the purpose of
LRU policy decision. */
@@ -721,24 +741,19 @@ failed:
}
/** Issues read requests for pages which recovery wants to read in.
@param[in] sync true if the caller wants this function to wait
for the highest address page to get read in, before this function returns
@param[in] space_id tablespace id
@param[in] page_nos array of page numbers to read, with the
highest page number the last in the array
@param[in] n number of page numbers in the array */
void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
ulint n)
void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
{
fil_space_t* space = fil_space_get(space_id);
fil_space_t* space = fil_space_t::get_for_io(space_id);
if (space == NULL) {
/* The tablespace is missing: do nothing */
if (!space) {
/* The tablespace is missing or unreadable: do nothing */
return;
}
fil_space_open_if_needed(space);
const ulint zip_size = space->zip_size();
for (ulint i = 0; i < n; i++) {
@@ -769,9 +784,10 @@ void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
}
dberr_t err;
buf_read_page_low(
&err, sync && i + 1 == n,
BUF_READ_ANY_PAGE, cur_page_id, zip_size, true);
space->reacquire_for_io();
buf_read_page_low(&err, space, false,
BUF_READ_ANY_PAGE, cur_page_id, zip_size,
true);
if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
ib::error() << "Recovery failed to read or decrypt "
@@ -779,5 +795,8 @@ void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
}
}
DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)", n));
DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n,
space->chain.start->name));
space->release_for_io();
}

View File

@@ -951,7 +951,7 @@ void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
if (fil_space_t* s = fil_space_acquire_silent(space_id)) {
/* Ensure that the tablespace file exists
in order to avoid a crash in buf_page_get_gen(). */
if (s->size || fil_space_get_size(space_id)) {
if (root_page_no < s->get_size()) {
btr_free_if_exists(page_id_t(space_id, root_page_no),
s->zip_size(),
mach_read_from_8(ptr), mtr);

View File

@@ -2975,15 +2975,15 @@ err_exit:
}
if (err == DB_SUCCESS && table->is_readable()) {
if (table->space && !fil_space_get_size(table->space_id)) {
const auto root = dict_table_get_first_index(table)->page;
if (root >= table->space->get_size()) {
corrupted:
table->corrupted = true;
table->file_unreadable = true;
err = DB_CORRUPTION;
} else {
const page_id_t page_id(
table->space->id,
dict_table_get_first_index(table)->page);
const page_id_t page_id(table->space->id, root);
mtr.start();
buf_block_t* block = buf_page_get(
page_id, table->space->zip_size(),

View File

@@ -975,8 +975,7 @@ static inline
void
fil_crypt_read_crypt_data(fil_space_t* space)
{
if (space->crypt_data || space->size
|| !fil_space_get_size(space->id)) {
if (space->crypt_data || space->size || !space->get_size()) {
/* The encryption metadata has already been read, or
the tablespace is not encrypted and the file has been
opened already, or the file cannot be accessed,
@@ -2246,16 +2245,10 @@ static void fil_crypt_rotation_list_fill()
}
/* Ensure that crypt_data has been initialized. */
if (!space->size) {
ut_d(const fil_space_t* s=)
fil_system.read_page0(space->id);
ut_ad(!s || s == space);
if (!space->size) {
/* Page 0 was not loaded.
Skip this tablespace. */
if (!space->get_size()) {
/* Page 0 was not loaded. Skip this tablespace. */
goto next;
}
}
/* Skip ENCRYPTION!=DEFAULT tablespaces. */
if (space->crypt_data

File diff suppressed because it is too large Load Diff

View File

@@ -296,8 +296,6 @@ Datafile::read_first_page(bool read_only_mode)
m_first_page = static_cast<byte*>(
aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size));
constexpr IORequest request(IORequest::READ |
IORequest::DISABLE_PARTIAL_IO_WARNINGS);
dberr_t err = DB_ERROR;
size_t page_size = UNIV_PAGE_SIZE_MAX;
@@ -308,7 +306,8 @@ Datafile::read_first_page(bool read_only_mode)
ulint n_read = 0;
err = os_file_read_no_error_handling(
request, m_handle, m_first_page, 0, page_size, &n_read);
IORequestReadPartial, m_handle, m_first_page, 0,
page_size, &n_read);
if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {

View File

@@ -130,7 +130,7 @@ Tablespace::open_or_create(bool is_temp)
fsp_flags = FSP_FLAGS_PAGE_SSIZE();
}
space = fil_space_create(
space = fil_space_t::create(
m_name, m_space_id, fsp_flags,
is_temp
? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,

View File

@@ -906,13 +906,10 @@ SysTablespace::open_or_create(
if (it != begin) {
} else if (is_temp) {
ut_ad(space_id() == SRV_TMP_SPACE_ID);
space = fil_space_create(
space = fil_space_t::create(
name(), SRV_TMP_SPACE_ID, flags(),
FIL_TYPE_TEMPORARY, NULL);
mutex_enter(&fil_system.mutex);
fil_system.temp_space = space;
mutex_exit(&fil_system.mutex);
ut_ad(space == fil_system.temp_space);
if (!space) {
return DB_ERROR;
}
@@ -920,12 +917,10 @@ SysTablespace::open_or_create(
ut_ad(space->full_crc32());
} else {
ut_ad(space_id() == TRX_SYS_SPACE);
space = fil_space_create(
space = fil_space_t::create(
name(), TRX_SYS_SPACE, it->flags(),
FIL_TYPE_TABLESPACE, NULL);
mutex_enter(&fil_system.mutex);
fil_system.sys_space = space;
mutex_exit(&fil_system.mutex);
ut_ad(space == fil_system.sys_space);
if (!space) {
return DB_ERROR;
}

View File

@@ -7044,6 +7044,7 @@ i_s_tablespaces_encryption_fill_table(
}
mutex_enter(&fil_system.mutex);
fil_system.freeze_space_list++;
for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
space; space = UT_LIST_GET_NEXT(space_list, space)) {
@@ -7060,6 +7061,7 @@ i_s_tablespaces_encryption_fill_table(
}
}
fil_system.freeze_space_list--;
mutex_exit(&fil_system.mutex);
DBUG_RETURN(0);
}

View File

@@ -2300,7 +2300,7 @@ static void ibuf_read_merge_pages(const uint32_t* space_ids,
for (ulint i = 0; i < n_stored; i++) {
const ulint space_id = space_ids[i];
fil_space_t* s = fil_space_acquire_for_io(space_id);
fil_space_t* s = fil_space_t::get_for_io(space_id);
if (!s) {
tablespace_deleted:
/* The tablespace was not found: remove all
@@ -4631,26 +4631,14 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
const unsigned zip_size = space->zip_size();
const unsigned physical_size = space->physical_size();
/* fil_space_t::size and fil_space_t::free_limit would still be 0
at this point. So, we will have to read page 0. */
ut_ad(!space->free_limit);
ut_ad(!space->size);
uint32_t size= std::min(space->free_limit, space->size);
if (size == 0) {
return(DB_TABLE_NOT_FOUND);
}
mtr_t mtr;
uint32_t size;
mtr.start();
if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0),
zip_size,
RW_S_LATCH, &mtr)) {
size = std::min(
mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ sp->frame),
mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ sp->frame));
} else {
size = 0;
}
mtr.commit();
mutex_enter(&ibuf_mutex);

View File

@@ -978,6 +978,15 @@ public:
return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
}
/** Compute the byte offset of this page within its data file.
@return page number shifted by the applicable page-size shift
(compressed size for ROW_FORMAT=COMPRESSED, else srv_page_size) */
os_offset_t physical_offset() const
{
  const os_offset_t page_no= id().page_no();
  if (!zip.ssize)
    return page_no << srv_page_size_shift;
  /* ROW_FORMAT=COMPRESSED: derive the shift from zip.ssize */
  return page_no << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1));
}
/** @return whether the block is mapped to a data file */
bool in_file() const
{

View File

@@ -52,10 +52,10 @@ class buf_dblwr_t
struct element
{
/** block descriptor */
buf_page_t *bpage;
/** true=buf_pool.flush_list, false=buf_pool.LRU */
bool lru;
/** tablespace */
fil_space_t *space;
/** asynchronous write request */
IORequest request;
/** payload size in bytes */
size_t size;
};
@@ -103,10 +103,11 @@ public:
/** Schedule a page write. If the doublewrite memory buffer is full,
flush_buffered_writes() will be invoked to make space.
@param bpage buffer pool page to be written
@param lru true=buf_pool.LRU; false=buf_pool.flush_list
@param space tablespace
@param request asynchronous write request
@param size payload size in bytes */
void add_to_batch(buf_page_t *bpage, bool lru, size_t size);
void add_to_batch(fil_space_t *space, const IORequest &request,
size_t size) MY_ATTRIBUTE((nonnull));
/** Determine whether the doublewrite buffer is initialized */
bool is_initialised() const

View File

@@ -46,11 +46,13 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size);
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in,out] space tablespace
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] sync true if synchronous aio is desired */
void
buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync);
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size, bool sync)
MY_ATTRIBUTE((nonnull));
/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
@@ -101,14 +103,11 @@ ulint
buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
/** Issues read requests for pages which recovery wants to read in.
@param[in] sync true if the caller wants this function to wait
for the highest address page to get read in, before this function returns
@param[in] space_id tablespace id
@param[in] page_nos array of page numbers to read, with the
highest page number the last in the array
@param[in] n number of page numbers in the array */
void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos,
ulint n);
void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n);
/** @name Modes used in read-ahead @{ */
/** read only pages belonging to the insert buffer tree */

View File

@@ -313,6 +313,25 @@ new_range:
/** Tablespace or log data space */
#ifndef UNIV_INNOCHECKSUM
/** Result of a tablespace I/O submitted via fil_space_t::io():
status code and the file that the request was issued against. */
struct fil_io_t
{
/** error code */
dberr_t err;
/** file; node->space->release_for_io() must follow IORequestRead call */
fil_node_t *node;
};
/** Tablespace encryption mode (per-table encryption attribute) */
enum fil_encryption_t
{
/** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
FIL_ENCRYPTION_DEFAULT,
/** Encrypted (explicitly requested) */
FIL_ENCRYPTION_ON,
/** Not encrypted (explicitly requested) */
FIL_ENCRYPTION_OFF
};
struct fil_space_t : ilist_node<unflushed_spaces_tag_t>,
ilist_node<rotation_list_tag_t>
#else
@@ -348,8 +367,6 @@ struct fil_space_t
/*!< recovered tablespace size in pages;
0 if no size change was read from the redo log,
or if the size change was implemented */
/** the committed size of the tablespace in pages */
Atomic_relaxed<uint32_t> committed_size;
ulint n_reserved_extents;
/*!< number of reserved free extents for
ongoing operations like B-tree page split */
@@ -357,28 +374,33 @@ struct fil_space_t
the tablespace to disk; dropping of the
tablespace is forbidden if this is positive */
private:
/** the committed size of the tablespace in pages */
Atomic_relaxed<uint32_t> committed_size;
/** Number of pending buffer pool operations accessing the
tablespace without holding a table lock or dict_operation_lock
S-latch that would prevent the table (and tablespace) from being
dropped. An example is encryption key rotation.
The tablespace cannot be dropped while this is nonzero, or while
fil_node_t::n_pending is nonzero.
The tablespace cannot be dropped while this is nonzero.
The most significant bit contains the STOP_NEW_OPS flag. */
Atomic_relaxed<size_t> n_pending_ops;
Atomic_relaxed<uint32_t> n_pending_ops;
/** Number of pending block read or write operations
The tablespace object cannot be freed while this is nonzero,
but it can be detached from fil_system.
The most significant bit contains the CLOSING flag. */
std::atomic<uint32_t> n_pending_ios;
/** Flag in n_pending_ops that indicates that the tablespace is being
deleted, and no further operations should be performed */
static constexpr uint32_t STOP_NEW_OPS= ~(~uint32_t(0) >> 1);
/** Flag in n_pending_ios that indicates that the tablespace is a candidate
for being closed, and fil_node_t::is_open() can only be trusted after
acquiring fil_system.mutex and resetting the flag */
static constexpr uint32_t CLOSING= STOP_NEW_OPS;
static constexpr uint32_t NOT_CLOSING= ~CLOSING;
public:
/** Number of pending block read or write operations
(when a write is imminent or a read has recently completed).
The tablespace object cannot be freed while this is nonzero,
but it can be detached from fil_system.
Note that fil_node_t::n_pending tracks actual pending I/O requests.
Protected by fil_system.mutex and std::atomic. */
std::atomic<ulint> n_pending_ios;
rw_lock_t latch; /*!< latch protecting the file space storage
allocation */
UT_LIST_NODE_T(fil_space_t) named_spaces;
@@ -484,9 +506,10 @@ public:
/** @return whether the storage device is rotational (HDD, not SSD) */
inline bool is_rotational() const;
/** Open each file. Only invoked on fil_system.temp_space.
/** Open each file. Never invoked on .ibd files.
@param create_new_db whether to skip the call to fil_node_t::read_page0()
@return whether all files were opened */
bool open();
bool open(bool create_new_db);
/** Close each file. Only invoked on fil_system.temp_space. */
void close();
@@ -497,17 +520,13 @@ public:
size_t referenced() const { return n_pending_ops & ~STOP_NEW_OPS; }
/** Note that operations on the tablespace must stop or can resume */
void set_stopping(bool stopping)
{
ut_d(auto n=) n_pending_ops.fetch_xor(STOP_NEW_OPS);
ut_ad(!(n & STOP_NEW_OPS) == stopping);
}
inline void set_stopping(bool stopping);
MY_ATTRIBUTE((warn_unused_result))
/** @return whether a tablespace reference was successfully acquired */
bool acquire()
{
size_t n= 0;
uint32_t n= 0;
while (!n_pending_ops.compare_exchange_strong(n, n + 1,
std::memory_order_acquire,
std::memory_order_relaxed))
@@ -523,30 +542,41 @@ public:
ut_ad(n & ~STOP_NEW_OPS);
return (n & ~STOP_NEW_OPS) == 1;
}
/** Acquire a tablespace reference for I/O. */
void acquire_for_io() { n_pending_ios++; }
/** Release a tablespace reference for I/O. */
void release_for_io() { ut_d(auto n=) n_pending_ios--; ut_ad(n); }
/** @return whether I/O is pending */
bool pending_io() const { return n_pending_ios; }
/** @return whether the tablespace file can be closed and reopened */
bool belongs_in_lru() const
MY_ATTRIBUTE((warn_unused_result))
/** Acquire a tablespace reference for I/O.
@return whether the file is usable */
bool acquire_for_io()
{
switch (purpose) {
case FIL_TYPE_TEMPORARY:
ut_ad(id == SRV_TMP_SPACE_ID);
return false;
case FIL_TYPE_IMPORT:
ut_ad(id != SRV_TMP_SPACE_ID);
return true;
case FIL_TYPE_TABLESPACE:
ut_ad(id != SRV_TMP_SPACE_ID);
return id && !srv_is_undo_tablespace(id);
return UNIV_LIKELY(!(n_pending_ios.fetch_add(1, std::memory_order_acquire)&
CLOSING)) ||
prepare_for_io();
}
ut_ad(0);
return false;
/** Acquire another tablespace reference for I/O. */
inline void reacquire_for_io();
/** Release a tablespace reference for I/O.
Pairs with a successful acquire_for_io() or with reacquire_for_io(). */
void release_for_io()
{
/* ut_d() evaluates only in debug builds; the previous counter value
is used solely by the assertion below. */
ut_d(uint32_t n=) n_pending_ios.fetch_sub(1, std::memory_order_release);
/* The counter (excluding the CLOSING flag bit) must have been nonzero. */
ut_ad(n & NOT_CLOSING);
}
/** @return number of pending reads or writes
(the CLOSING flag bit is masked off the counter) */
uint32_t pending_io() const
{
  const uint32_t n= n_pending_ios.load(std::memory_order_acquire);
  return n & NOT_CLOSING;
}
MY_ATTRIBUTE((warn_unused_result))
/** Prepare to close the file handle.
Atomically sets the CLOSING flag in n_pending_ios, so that subsequent
acquire_for_io() calls will fall through to prepare_for_io().
@return number of pending operations before the flag was set */
uint32_t set_closing()
{
return n_pending_ios.fetch_or(CLOSING, std::memory_order_acquire) &
NOT_CLOSING;
}
/** @return whether close() of the file handle has been requested */
bool is_closing() const
{ return n_pending_ios.load(std::memory_order_acquire) & CLOSING; }
/** @return last_freed_lsn */
// NOTE(review): non-const accessor returning the member by value;
// presumably the LSN of the most recent page-free operation — confirm
// against the declaration of last_freed_lsn.
lsn_t get_last_freed_lsn() { return last_freed_lsn; }
@@ -835,6 +865,25 @@ public:
}
#ifndef UNIV_INNOCHECKSUM
MY_ATTRIBUTE((warn_unused_result))
/** Create a tablespace in fil_system.
@param name tablespace name
@param id tablespace identifier
@param flags tablespace flags
@param purpose tablespace purpose
@param crypt_data encryption information
@param mode encryption mode
@return pointer to created tablespace, to be filled in with add()
@retval nullptr on failure (such as when the same tablespace exists) */
static fil_space_t *create(const char *name, ulint id, ulint flags,
fil_type_t purpose, fil_space_crypt_t *crypt_data,
fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT);
/** Acquire a tablespace for reading or writing a block.
@param id tablespace ID
@return the tablespace, or nullptr if missing or inaccessible */
static fil_space_t *get_for_io(ulint id);
/** Add/remove the free page in the freed ranges list.
@param[in] offset page number to be added
@param[in] free true if page to be freed */
@@ -863,8 +912,47 @@ public:
std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
freed_ranges.add_range(range);
}
#endif /*!UNIV_INNOCHECKSUM */
/** Set the tablespace size in pages.
Assigns both size and committed_size.
@param s size of the tablespace in pages */
void set_sizes(uint32_t s)
{
/* For the system tablespace (id == 0) the size may be reassigned but
must not grow here; for any other tablespace this must be the first
assignment (size still 0). */
ut_ad(id ? !size : (size >= s));
size= s; committed_size= s;
}
/** Update committed_size in mtr_t::commit() */
void set_committed_size()
{
/* The caller must hold the storage-allocation latch exclusively
while publishing the current size as committed. */
ut_ad(rw_lock_own(&latch, RW_LOCK_X));
committed_size= size;
}
/** @return the last persisted page number */
// NOTE(review): wraps to UINT32_MAX when committed_size is 0; callers
// presumably only invoke this on a tablespace whose first page has been
// read — confirm.
uint32_t last_page_number() const { return committed_size - 1; }
/** @return the size in pages (0 if unreadable) */
inline uint32_t get_size();
/** Read or write data.
@param type I/O context
@param offset offset in bytes
@param len number of bytes
@param buf the data to be read or written
@param bpage buffer block (for type.is_async() completion callback)
@return status and file descriptor */
fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
void *buf, buf_page_t *bpage= nullptr);
/** Flush pending writes from the file system cache to the file */
void flush();
/** Read the first page of a data file.
@return whether the page was found valid */
bool read_page0();
private:
/** @return whether the file is usable for io() */
ATTRIBUTE_COLD bool prepare_for_io();
#endif /*!UNIV_INNOCHECKSUM */
};
#ifndef UNIV_INNOCHECKSUM
@@ -892,8 +980,6 @@ struct fil_node_t {
uint32_t init_size;
/** maximum size of the file in database pages (0 if unlimited) */
uint32_t max_size;
/** count of pending i/o's; is_open must be true if nonzero */
ulint n_pending;
/** count of pending flushes; is_open must be true if nonzero */
ulint n_pending_flushes;
/** whether the file is currently being extended */
@@ -902,8 +988,6 @@ struct fil_node_t {
bool needs_flush;
/** link to other files in this tablespace */
UT_LIST_NODE_T(fil_node_t) chain;
/** link to the fil_system.LRU list (keeping track of open files) */
UT_LIST_NODE_T(fil_node_t) LRU;
/** whether this file could use atomic write (data file) */
bool atomic_write;
@@ -921,9 +1005,8 @@ struct fil_node_t {
}
/** Read the first page of a data file.
@param[in] first whether this is the very first read
@return whether the page was found valid */
bool read_page0(bool first);
bool read_page0();
/** Determine some file metadata when creating or reading the file.
@param file the file that is being created, or OS_FILE_CLOSED */
@@ -942,8 +1025,8 @@ struct fil_node_t {
@return detached handle or OS_FILE_CLOSED */
pfs_os_file_t close_to_free(bool detach_handle= false);
/** Update the data structures on I/O completion */
inline void complete_io(bool write= false);
/** Update the data structures on write completion */
inline void complete_write();
private:
/** Does stuff common for close() and detach() */
@@ -953,6 +1036,13 @@ private:
/** Value of fil_node_t::magic_n */
#define FIL_NODE_MAGIC_N 89389
/** Acquire another tablespace reference for I/O.
The caller must already hold a reference, so the counter is known to be
nonzero and the first file of the tablespace open (both asserted). */
inline void fil_space_t::reacquire_for_io()
{
/* Relaxed ordering suffices: an existing reference already keeps the
object alive; this only bumps the count. */
ut_d(uint32_t n=) n_pending_ios.fetch_add(1, std::memory_order_relaxed);
ut_ad(n & NOT_CLOSING);
ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
}
inline void fil_space_t::set_imported()
{
ut_ad(purpose == FIL_TYPE_IMPORT);
@@ -963,11 +1053,9 @@ inline void fil_space_t::set_imported()
inline bool fil_space_t::is_rotational() const
{
for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
node = UT_LIST_GET_NEXT(chain, node)) {
if (!node->on_ssd) {
node= UT_LIST_GET_NEXT(chain, node))
if (!node->on_ssd)
return true;
}
}
return false;
}
@@ -1179,16 +1267,6 @@ index */
#define fil_page_index_page_check(page) \
fil_page_type_is_index(fil_page_get_type(page))
/** Enum values for encryption table option */
enum fil_encryption_t {
/** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
FIL_ENCRYPTION_DEFAULT,
/** Encrypted */
FIL_ENCRYPTION_ON,
/** Not encrypted */
FIL_ENCRYPTION_OFF
};
/** Get the file page type.
@param[in] page file page
@return page type */
@@ -1227,7 +1305,6 @@ struct fil_system_t {
*/
fil_system_t(): m_initialised(false)
{
UT_LIST_INIT(LRU, &fil_node_t::LRU);
UT_LIST_INIT(space_list, &fil_space_t::space_list);
UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces);
}
@@ -1275,30 +1352,23 @@ public:
fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
/** Map of fil_space_t::id to fil_space_t* */
hash_table_t spaces;
UT_LIST_BASE_NODE_T(fil_node_t) LRU;
/*!< base node for the LRU list of the
most recently used open files with no
pending i/o's; if we start an i/o on
the file, we first remove it from this
list, and return it to the start of
the list when the i/o ends;
log files and the system tablespace are
not put to this list: they are opened
after the startup, and kept open until
shutdown */
sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
/*!< list of those
tablespaces whose files contain
unflushed writes; those spaces have
at least one file node where
needs_flush == true */
ulint n_open; /*!< number of files currently open */
/** number of currently open files; protected by mutex */
ulint n_open;
ulint max_assigned_id;/*!< maximum space id in the existing
tables, or assigned during the time
mysqld has been up; at an InnoDB
startup we scan the data dictionary
and set here the maximum of the
space id's of the tables there */
/** nonzero if fil_node_open_file_low() should avoid moving the tablespace
to the end of space_list, for FIFO policy of try_to_close() */
ulint freeze_space_list;
UT_LIST_BASE_NODE_T(fil_space_t) space_list;
/*!< list of all file spaces */
UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
@@ -1312,16 +1382,10 @@ public:
key rotation.*/
bool space_id_reuse_warned;
/*!< whether fil_space_create()
/*!< whether fil_space_t::create()
has issued a warning about
potential space_id reuse */
/** Trigger a call to fil_node_t::read_page0()
@param[in] id tablespace identifier
@return tablespace
@retval NULL if the tablespace does not exist or cannot be read */
fil_space_t* read_page0(ulint id);
/** Return the next tablespace from rotation_list.
@param space previous tablespace (NULL to start from the start)
@param recheck whether the removal condition needs to be rechecked after
@@ -1336,63 +1400,28 @@ public:
/** The tablespace memory cache. */
extern fil_system_t fil_system;
/** Update the data structures on I/O completion */
inline void fil_node_t::complete_io(bool write)
/** Note that operations on the tablespace must stop or can resume */
inline void fil_space_t::set_stopping(bool stopping)
{
ut_ad(mutex_own(&fil_system.mutex));
if (write)
{
if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
{
/* We don't need to keep track of unflushed changes as user has
explicitly disabled buffering. */
ut_ad(!space->is_in_unflushed_spaces);
ut_ad(!needs_flush);
}
else if (!space->is_stopping())
{
needs_flush= true;
if (!space->is_in_unflushed_spaces)
{
space->is_in_unflushed_spaces= true;
fil_system.unflushed_spaces.push_front(*space);
}
}
ut_d(auto n=) n_pending_ops.fetch_xor(STOP_NEW_OPS);
ut_ad(!(n & STOP_NEW_OPS) == stopping);
}
switch (n_pending--) {
case 0:
ut_error;
case 1:
if (space->belongs_in_lru())
/* The node must be put back to the LRU list */
UT_LIST_ADD_FIRST(fil_system.LRU, this);
/** @return the size in pages (0 if unreadable) */
inline uint32_t fil_space_t::get_size()
{
if (!size)
{
mutex_enter(&fil_system.mutex);
read_page0();
mutex_exit(&fil_system.mutex);
}
return size;
}
#include "fil0crypt.h"
/** Create a space memory object and put it to the fil_system hash table.
Error messages are issued to the server log.
@param[in] name tablespace name
@param[in] id tablespace identifier
@param[in] flags tablespace flags
@param[in] purpose tablespace purpose
@param[in,out] crypt_data encryption information
@param[in] mode encryption mode
@return pointer to created tablespace, to be filled in with fil_space_t::add()
@retval NULL on failure (such as when the same tablespace exists) */
fil_space_t*
fil_space_create(
const char* name,
ulint id,
ulint flags,
fil_type_t purpose,
fil_space_crypt_t* crypt_data,
fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT)
MY_ATTRIBUTE((warn_unused_result));
/*******************************************************************//**
Assigns a new space id for a new single-table tablespace. This works simply by
incrementing the global counter. If 4 billion id's is not enough, we may need
@@ -1421,21 +1450,6 @@ fil_space_free(
void fil_space_set_recv_size_and_flags(ulint id, uint32_t size,
uint32_t flags);
/*******************************************************************//**
Returns the size of the space in pages. The tablespace must be cached in the
memory cache.
@return space size, 0 if space not found */
ulint
fil_space_get_size(
/*===============*/
ulint id); /*!< in: space id */
/** Opens all system tablespace data files. They stay open until the
database server shutdown. This should be called at a server startup after the
space objects for the system tablespace have been created. The
purpose of this operation is to make sure we never run out of file descriptors
if we need to read from the insert buffer. */
void fil_open_system_tablespace_files();
/** Close all tablespace files at shutdown */
void fil_close_all_files();
/*******************************************************************//**
@@ -1491,14 +1505,6 @@ fil_space_acquire_silent(ulint id)
return (fil_space_acquire_low(id, true));
}
/** Acquire a tablespace for reading or writing a block,
when it could be dropped concurrently.
@param[in] id tablespace ID
@return the tablespace
@retval NULL if missing */
fil_space_t*
fil_space_acquire_for_io(ulint id);
/** Replay a file rename operation if possible.
@param[in] space_id tablespace identifier
@param[in] name old file name
@@ -1674,7 +1680,7 @@ fil_file_readdir_next_file(
memory cache. Note that if we have not done a crash recovery at the database
startup, there may be many tablespaces which are not yet in the memory cache.
@param[in] id Tablespace ID
@param[in] name Tablespace name used in fil_space_create().
@param[in] name Tablespace name used in fil_space_t::create().
@param[in] table_flags table flags
@return the tablespace
@retval NULL if no matching tablespace exists in the memory cache */
@@ -1690,70 +1696,6 @@ fil_space_for_table_exists_in_mem(
@return whether the tablespace is at least as big as requested */
bool fil_space_extend(fil_space_t *space, uint32_t size);
struct fil_io_t
{
/** error code */
dberr_t err;
/** file; node->space->release_for_io() must follow fil_io(sync=true) call */
fil_node_t *node;
};
/** Reads or writes data. This operation could be asynchronous (aio).
@param[in] type IO context
@param[in] sync true if synchronous aio is desired
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] byte_offset remainder of offset in bytes; in aio this
must be divisible by the OS block size
@param[in] len how many bytes to read or write; this must
not cross a file boundary; in aio this must
be a block size multiple
@param[in,out] buf buffer where to store read data or from where
to write; in aio this must be appropriately
aligned
@param[in] message message for aio handler if non-sync aio
used, else ignored
@param[in] ignore whether to ignore errors
@param[in] punch_hole punch the hole to the file for page_compressed
tablespace
@return status and file descriptor */
fil_io_t
fil_io(
const IORequest& type,
bool sync,
const page_id_t page_id,
ulint zip_size,
ulint byte_offset,
ulint len,
void* buf,
void* message,
bool ignore = false,
bool punch_hole = false);
/**********************************************************************//**
Waits for an aio operation to complete. This function is used to write the
handler for completed requests. The aio array of pending requests is divided
into segments (see os0file.cc for more info). The thread specifies which
segment it wants to wait for. */
void
fil_aio_wait(
/*=========*/
ulint segment); /*!< in: the number of the segment in the aio
array to wait for */
/**********************************************************************//**
Flushes to disk possible writes cached by the OS. If the space does not exist
or is being dropped, does not do anything. */
void
fil_flush(
/*======*/
ulint space_id); /*!< in: file space id (this can be a group of
log files or a tablespace of the database) */
/** Flush a tablespace.
@param[in,out] space tablespace to flush */
void
fil_flush(fil_space_t* space);
/** Flush to disk the writes in file spaces of the given type
possibly cached by the OS. */
void fil_flush_file_spaces();
@@ -1846,23 +1788,6 @@ inline bool fil_names_write_if_was_clean(fil_space_t* space)
return(was_clean);
}
/** During crash recovery, open a tablespace if it had not been opened
yet, to get valid size and flags.
@param[in,out] space tablespace */
inline void fil_space_open_if_needed(fil_space_t* space)
{
ut_ad(recv_recovery_is_on());
if (space->size == 0) {
/* Initially, size and flags will be set to 0,
until the files are opened for the first time.
fil_space_get_size() will open the file
and adjust the size and flags. */
ut_d(ulint size =) fil_space_get_size(space->id);
ut_ad(size == space->size);
}
}
/** On a log checkpoint, reset fil_names_dirty_and_write() flags
and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
@param[in] lsn checkpoint LSN

View File

@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2014, 2019, MariaDB Corporation.
Copyright (c) 2014, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,7 @@ File space management types
Created May 26, 2009 Vasil Dimov
*******************************************************/
#ifndef fsp0types_h
#define fsp0types_h
#pragma once
#include <cstddef>
/** The fil_space_t::id of the redo log. All persistent tablespaces
@@ -402,4 +400,6 @@ in full crc32 format. */
/* @} */
#endif /* fsp0types_h */
struct fil_node_t;
struct fil_space_t;
class buf_page_t;

View File

@@ -1,48 +0,0 @@
/***********************************************************************
Copyright (c) 2017, 2019, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
***********************************************************************/
/**************************************************//**
@file os0api.h
The interface to the helper functions.
These functions are used on os0file.h where
including full full header is not feasible and
implemented on buf0buf.cc and fil0fil.cc.
*******************************************************/
#ifndef OS_API_H
#define OS_API_H 1
/** Page control block */
class buf_page_t;
/** File Node */
struct fil_node_t;
/**
Calculate the length of trim (punch_hole) operation.
@param[in] bpage Page control block
@param[in] write_length Write length
@return length of the trim or zero. */
ulint
buf_page_get_trim_length(
const buf_page_t* bpage,
ulint write_length)
MY_ATTRIBUTE((warn_unused_result));
#endif /* OS_API_H */

View File

@@ -37,7 +37,6 @@ Created 10/21/1995 Heikki Tuuri
#define os0file_h
#include "fsp0types.h"
#include "os0api.h"
#include "tpool.h"
#ifndef _WIN32
@@ -46,10 +45,6 @@ Created 10/21/1995 Heikki Tuuri
#include <time.h>
#endif /* !_WIN32 */
/** File node of a tablespace or the log data space */
struct fil_node_t;
struct fil_space_t;
extern bool os_has_said_disk_full;
/** File offset in bytes */
@@ -188,117 +183,75 @@ The I/O context that is passed down to the low level IO code */
class IORequest
{
public:
constexpr IORequest(ulint type= READ, buf_page_t *bpage= nullptr,
bool lru= false) :
m_bpage(bpage), m_type(static_cast<uint16_t>(type)), m_LRU(lru) {}
/** Flags passed in the request, they can be ORred together. */
enum {
READ = 1,
WRITE = 2,
/** Double write buffer recovery. */
DBLWR_RECOVER = 4,
/** Enumarations below can be ORed to READ/WRITE above*/
/** Data file */
DATA_FILE = 8,
/** Disable partial read warnings */
DISABLE_PARTIAL_IO_WARNINGS = 32,
/** Use punch hole if available*/
PUNCH_HOLE = 64,
enum Type
{
/** Synchronous read */
READ_SYNC= 2,
/** Asynchronous read; some errors will be ignored */
READ_ASYNC= READ_SYNC | 1,
/** Possibly partial read; only used with
os_file_read_no_error_handling() */
READ_MAYBE_PARTIAL= READ_SYNC | 4,
/** Read for doublewrite buffer recovery */
DBLWR_RECOVER= READ_SYNC | 8,
/** Synchronous write */
WRITE_SYNC= 16,
/** Asynchronous write */
WRITE_ASYNC= WRITE_SYNC | 1,
/** Write data; evict the block on write completion */
WRITE_LRU= WRITE_ASYNC | 32,
/** Write data and punch hole for the rest */
PUNCH= WRITE_ASYNC | 64,
/** Write data and punch hole; evict the block on write completion */
PUNCH_LRU= PUNCH | WRITE_LRU,
/** Zero out a range of bytes in fil_space_t::io() */
PUNCH_RANGE= WRITE_SYNC | 128,
};
/** @return true if it is a read request */
bool is_read() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & READ) == READ);
}
constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) :
bpage(bpage), type(type) {}
/** @return true if it is a write request */
bool is_write() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & WRITE) == WRITE);
}
constexpr IORequest(const IORequest &old, fil_node_t *node= nullptr) :
bpage(old.bpage), node(node), type(old.type) {}
/** @return true if partial read warning disabled */
bool is_partial_io_warning_disabled() const
MY_ATTRIBUTE((warn_unused_result))
{
return !!(m_type & DISABLE_PARTIAL_IO_WARNINGS);
}
bool is_read() const { return (type & READ_SYNC) != 0; }
bool is_write() const { return (type & WRITE_SYNC) != 0; }
bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
/** @return true if punch hole should be used */
bool punch_hole() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & PUNCH_HOLE) == PUNCH_HOLE);
}
/** @return true if the read should be validated */
bool validate() const
MY_ATTRIBUTE((warn_unused_result))
{
return(is_read() ^ is_write());
}
/** Set the pointer to file node for IO
@param[in] node File node */
void set_fil_node(fil_node_t *node) { m_fil_node= node; }
bool operator==(const IORequest& rhs) const
{
return(m_type == rhs.m_type);
}
/** @return true if the request is from the dblwr recovery */
bool is_dblwr_recover() const
MY_ATTRIBUTE((warn_unused_result))
{
return((m_type & DBLWR_RECOVER) == DBLWR_RECOVER);
}
ulint get_trim_length(ulint write_length) const
{
return (m_bpage ?
buf_page_get_trim_length(m_bpage, write_length)
: 0);
}
inline bool should_punch_hole() const;
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
@param[in] off Starting offset (SEEK_SET)
@param[in] len Size of the hole
/** If requested, free storage space associated with a section of the file.
@param off byte offset from the start (SEEK_SET)
@param len size of the hole in bytes
@return DB_SUCCESS or error code */
dberr_t punch_hole(os_file_t fh, os_offset_t off, ulint len);
/** @return type of page flush (for writes) */
bool is_LRU() const { return m_LRU; }
dberr_t maybe_punch_hole(os_offset_t off, ulint len)
{
return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
? punch_hole(off, len)
: DB_SUCCESS;
}
private:
/** Page to be written on write operation. */
buf_page_t* const m_bpage= nullptr;
/** Free storage space associated with a section of the file.
@param off byte offset from the start (SEEK_SET)
@param len size of the hole in bytes
@return DB_SUCCESS or error code */
dberr_t punch_hole(os_offset_t off, ulint len) const
MY_ATTRIBUTE((nonnull));
/** File node */
fil_node_t* m_fil_node= nullptr;
public:
/** Page to be written on write operation */
buf_page_t* const bpage= nullptr;
/** File descriptor */
const fil_node_t *const node= nullptr;
/** Request type bit flags */
const uint16_t m_type;
/** for writes, type of page flush */
const bool m_LRU= false;
const Type type;
};
constexpr IORequest IORequestRead(IORequest::READ);
constexpr IORequest IORequestWrite(IORequest::WRITE);
constexpr IORequest IORequestRead(IORequest::READ_SYNC);
constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
/** Sparse file size information. */
struct os_file_size_t {
@@ -313,20 +266,6 @@ struct os_file_size_t {
/** Win NT does not allow more than 64 */
static const ulint OS_AIO_N_PENDING_IOS_PER_THREAD = 256;
/** Modes for aio operations @{ */
/** Normal asynchronous i/o not for ibuf pages or ibuf bitmap pages */
static const ulint OS_AIO_NORMAL = 21;
/** Asynchronous i/o for ibuf pages or ibuf bitmap pages */
static const ulint OS_AIO_IBUF = 22;
/**Calling thread will wait for the i/o to complete,
and perform IO completion routine itself;
can be used for any pages, ibuf or non-ibuf. This is used to save
CPU time, as we can do with fewer thread switches. */
static const ulint OS_AIO_SYNC = 24;
/* @} */
extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
extern ulint os_n_fsyncs;
@@ -669,9 +608,9 @@ The wrapper functions have the prefix of "innodb_". */
# define os_file_close(file) \
pfs_os_file_close_func(file, __FILE__, __LINE__)
# define os_aio(type, mode, name, file, buf, offset, \
# define os_aio(type, name, file, buf, offset, \
n, read_only, message1, message2) \
pfs_os_aio_func(type, mode, name, file, buf, offset, \
pfs_os_aio_func(type, name, file, buf, offset, \
n, read_only, message1, message2, \
__FILE__, __LINE__)
@@ -859,7 +798,6 @@ function!
Performance schema wrapper function of os_aio() which requests
an asynchronous I/O operation.
@param[in,out] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -879,8 +817,7 @@ an asynchronous I/O operation.
UNIV_INLINE
dberr_t
pfs_os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,
@@ -1013,9 +950,9 @@ to original un-instrumented file I/O APIs */
# define os_file_close(file) os_file_close_func(file)
# define os_aio(type, mode, name, file, buf, offset, \
# define os_aio(type, name, file, buf, offset, \
n, read_only, message1, message2) \
os_aio_func(type, mode, name, file, buf, offset, \
os_aio_func(type, name, file, buf, offset, \
n, read_only, message1, message2)
# define os_file_read(type, file, buf, offset, n) \
@@ -1281,7 +1218,6 @@ struct os_aio_userdata_t
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@param[in,out] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -1298,8 +1234,7 @@ Requests an asynchronous i/o operation.
@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,

View File

@@ -206,7 +206,6 @@ function!
Performance schema wrapper function of os_aio() which requests
an asynchronous i/o operation.
@param[in,type] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -226,8 +225,7 @@ an asynchronous i/o operation.
UNIV_INLINE
dberr_t
pfs_os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,
@@ -242,8 +240,6 @@ pfs_os_aio_func(
PSI_file_locker_state state;
struct PSI_file_locker* locker = NULL;
ut_ad(type.validate());
/* Register the read or write I/O depending on "type" */
register_pfs_file_io_begin(
&state, locker, file, n,
@@ -251,7 +247,7 @@ pfs_os_aio_func(
src_file, src_line);
dberr_t result = os_aio_func(
type, mode, name, file, buf, offset, n, read_only, m1, m2);
type, name, file, buf, offset, n, read_only, m1, m2);
register_pfs_file_io_end(locker, n);
@@ -284,8 +280,6 @@ pfs_os_file_read_func(
PSI_file_locker_state state;
struct PSI_file_locker* locker = NULL;
ut_ad(type.validate());
register_pfs_file_io_begin(
&state, locker, file, n, PSI_FILE_READ, src_file, src_line);

View File

@@ -46,10 +46,9 @@ Created 3/26/1996 Heikki Tuuri
/** Checks if a page address is the trx sys header page.
@param[in] page_id page id
@return true if trx sys header page */
inline bool trx_sys_hdr_page(const page_id_t& page_id)
inline bool trx_sys_hdr_page(const page_id_t page_id)
{
return(page_id.space() == TRX_SYS_SPACE
&& page_id.page_no() == TRX_SYS_PAGE_NO);
return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
}
/*****************************************************************//**

View File

@@ -2060,7 +2060,14 @@ same_page:
const bool is_init= (b & 0x70) <= INIT_PAGE;
switch (*store) {
case STORE_IF_EXISTS:
if (!fil_space_get_size(space_id))
if (fil_space_t *space= fil_space_acquire_silent(space_id))
{
const auto size= space->get_size();
space->release();
if (!size)
continue;
}
else
continue;
/* fall through */
case STORE_YES:
@@ -2487,7 +2494,7 @@ static void recv_read_in_area(page_id_t page_id)
if (p != page_nos) {
mutex_exit(&recv_sys.mutex);
buf_read_recv_pages(FALSE, page_id.space(), page_nos,
buf_read_recv_pages(page_id.space(), page_nos,
ulint(p - page_nos));
mutex_enter(&recv_sys.mutex);
}
@@ -2513,7 +2520,7 @@ inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
if (end_lsn < i.lsn)
DBUG_LOG("ib_log", "skip log for page " << page_id
<< " LSN " << end_lsn << " < " << i.lsn);
else if (fil_space_t *space= fil_space_acquire_for_io(page_id.space()))
else if (fil_space_t *space= fil_space_t::get_for_io(page_id.space()))
{
mtr.start();
mtr.set_log_mode(MTR_LOG_NO_REDO);

View File

@@ -214,7 +214,7 @@ static void memo_slot_release(mtr_memo_slot_t *slot)
case MTR_MEMO_SPACE_X_LOCK:
{
fil_space_t *space= static_cast<fil_space_t*>(slot->object);
space->committed_size= space->size;
space->set_committed_size();
rw_lock_x_unlock(&space->latch);
}
break;
@@ -256,7 +256,7 @@ struct ReleaseLatches {
case MTR_MEMO_SPACE_X_LOCK:
{
fil_space_t *space= static_cast<fil_space_t*>(slot->object);
space->committed_size= space->size;
space->set_committed_size();
rw_lock_x_unlock(&space->latch);
}
break;

View File

@@ -135,7 +135,6 @@ public:
static io_slots *read_slots;
static io_slots *write_slots;
static io_slots *ibuf_slots;
/** Number of retries for partial I/O's */
constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
@@ -3143,14 +3142,7 @@ os_file_io(
bytes_returned += n_bytes;
if (offset > 0
&& type.is_write()
&& type.punch_hole()) {
*err = type.punch_hole(file, offset, n);
} else {
*err = DB_SUCCESS;
}
*err = type.maybe_punch_hole(offset, n);
return(original_n);
}
@@ -3161,8 +3153,7 @@ os_file_io(
bytes_returned += n_bytes;
if (!type.is_partial_io_warning_disabled()) {
if (type.type != IORequest::READ_MAYBE_PARTIAL) {
const char* op = type.is_read()
? "read" : "written";
@@ -3180,7 +3171,7 @@ os_file_io(
*err = DB_IO_ERROR;
if (!type.is_partial_io_warning_disabled()) {
if (type.type != IORequest::READ_MAYBE_PARTIAL) {
ib::warn()
<< "Retry attempts for "
<< (type.is_read() ? "reading" : "writing")
@@ -3208,7 +3199,6 @@ os_file_pwrite(
os_offset_t offset,
dberr_t* err)
{
ut_ad(type.validate());
ut_ad(type.is_write());
++os_n_file_writes;
@@ -3242,7 +3232,6 @@ os_file_write_func(
{
dberr_t err;
ut_ad(type.validate());
ut_ad(n > 0);
WAIT_ALLOW_WRITES();
@@ -3332,7 +3321,6 @@ os_file_read_page(
os_bytes_read_since_printout += n;
ut_ad(type.validate());
ut_ad(n > 0);
ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
@@ -3657,13 +3645,9 @@ fallback:
n_bytes = buf_size;
}
dberr_t err;
IORequest request(IORequest::WRITE);
err = os_file_write(
request, name, file, buf, current_size, n_bytes);
if (err != DB_SUCCESS) {
if (os_file_write(IORequestWrite, name,
file, buf, current_size, n_bytes) !=
DB_SUCCESS) {
break;
}
@@ -3786,18 +3770,11 @@ os_file_punch_hole(
#endif /* _WIN32 */
}
inline bool IORequest::should_punch_hole() const
{
return m_fil_node && m_fil_node->space->punch_hole;
}
/** Free storage space associated with a section of the file.
@param[in] fh Open file handle
@param[in] off Starting offset (SEEK_SET)
@param[in] len Size of the hole
@param off byte offset from the start (SEEK_SET)
@param len size of the hole in bytes
@return DB_SUCCESS or error code */
dberr_t
IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const
{
/* In this debugging mode, we act as if punch hole is supported,
and then skip any calls to actually punch a hole here.
@@ -3806,7 +3783,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
return(DB_SUCCESS);
);
ulint trim_len = get_trim_length(len);
ulint trim_len = bpage ? bpage->physical_size() - len : 0;
if (trim_len == 0) {
return(DB_SUCCESS);
@@ -3816,11 +3793,11 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
/* Check does file system support punching holes for this
tablespace. */
if (!should_punch_hole()) {
if (!node->space->punch_hole) {
return DB_IO_NO_PUNCH_HOLE;
}
dberr_t err = os_file_punch_hole(fh, off, trim_len);
dberr_t err = os_file_punch_hole(node->handle, off, trim_len);
if (err == DB_SUCCESS) {
srv_stats.page_compressed_trim_op.inc();
@@ -3828,7 +3805,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
/* If punch hole is not supported,
set space so that it is not used. */
if (err == DB_IO_NO_PUNCH_HOLE) {
m_fil_node->space->punch_hole = false;
node->space->punch_hole = false;
err = DB_SUCCESS;
}
}
@@ -3885,12 +3862,8 @@ static void io_callback(tpool::aiocb* cb)
os_aio_userdata_t data(cb->m_userdata);
/* Return cb back to cache*/
if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD) {
if (read_slots->contains(cb)) {
ut_ad(read_slots->contains(cb));
read_slots->release(cb);
} else {
ut_ad(ibuf_slots->contains(cb));
ibuf_slots->release(cb);
}
} else {
ut_ad(write_slots->contains(cb));
write_slots->release(cb);
@@ -4033,8 +4006,7 @@ bool os_aio_init(ulint n_reader_threads, ulint n_writer_threads, ulint)
{
int max_write_events= int(n_writer_threads * OS_AIO_N_PENDING_IOS_PER_THREAD);
int max_read_events= int(n_reader_threads * OS_AIO_N_PENDING_IOS_PER_THREAD);
int max_ibuf_events = 1 * OS_AIO_N_PENDING_IOS_PER_THREAD;
int max_events = max_read_events + max_write_events + max_ibuf_events;
int max_events = max_read_events + max_write_events;
int ret;
#if LINUX_NATIVE_AIO
@@ -4053,7 +4025,6 @@ bool os_aio_init(ulint n_reader_threads, ulint n_writer_threads, ulint)
}
read_slots = new io_slots(max_read_events, (uint)n_reader_threads);
write_slots = new io_slots(max_write_events, (uint)n_writer_threads);
ibuf_slots = new io_slots(max_ibuf_events, 1);
return true;
}
@@ -4062,10 +4033,8 @@ void os_aio_free()
srv_thread_pool->disable_aio();
delete read_slots;
delete write_slots;
delete ibuf_slots;
read_slots= nullptr;
write_slots= nullptr;
ibuf_slots= nullptr;
}
/** Waits until there are no pending writes. There can
@@ -4088,7 +4057,6 @@ void os_aio_wait_until_no_pending_writes()
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@param[in,out] type IO request context
@param[in] mode IO mode
@param[in] name Name of the file or path as NUL terminated
string
@param[in] file Open file handle
@@ -4106,8 +4074,7 @@ Requests an asynchronous i/o operation.
@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
IORequest& type,
ulint mode,
const IORequest&type,
const char* name,
pfs_os_file_t file,
void* buf,
@@ -4126,10 +4093,7 @@ os_aio_func(
ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */
DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
if (mode == OS_AIO_SYNC) {
if (!type.is_async()) {
if (type.is_read()) {
return(os_file_read_func(type, file, buf, offset, n));
}
@@ -4141,20 +4105,14 @@ os_aio_func(
if (type.is_read()) {
++os_n_file_reads;
} else if (type.is_write()) {
++os_n_file_writes;
} else {
ut_error;
ut_ad(type.is_write());
++os_n_file_writes;
}
compile_time_assert(sizeof(os_aio_userdata_t) <= tpool::MAX_AIO_USERDATA_LEN);
os_aio_userdata_t userdata{m1,type,m2};
io_slots* slots;
if (type.is_read()) {
slots = mode == OS_AIO_IBUF?ibuf_slots: read_slots;
} else {
slots = write_slots;
}
io_slots* slots= type.is_read() ? read_slots : write_slots;
tpool::aiocb* cb = slots->acquire();
cb->m_buffer = buf;
@@ -4462,12 +4420,11 @@ void fil_node_t::find_metadata(os_file_t file
}
/** Read the first page of a data file.
@param[in] first whether this is the very first read
@return whether the page was found valid */
bool fil_node_t::read_page0(bool first)
bool fil_node_t::read_page0()
{
ut_ad(mutex_own(&fil_system.mutex));
const ulint psize = space->physical_size();
const unsigned psize = space->physical_size();
#ifndef _WIN32
struct stat statbuf;
if (fstat(handle, &statbuf)) {
@@ -4479,7 +4436,7 @@ bool fil_node_t::read_page0(bool first)
os_offset_t size_bytes = os_file_get_size(handle);
ut_a(size_bytes != (os_offset_t) -1);
#endif
const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
if (size_bytes < min_size) {
ib::error() << "The size of the file " << name
@@ -4546,14 +4503,11 @@ invalid:
return false;
}
if (first) {
ut_ad(space->id != TRX_SYS_SPACE);
#ifdef UNIV_LINUX
find_metadata(handle, &statbuf);
#else
find_metadata();
#endif
/* Truncate the size to a multiple of extent size. */
ulint mask = psize * FSP_EXTENT_SIZE - 1;
@@ -4568,19 +4522,7 @@ invalid:
space->punch_hole = space->is_compressed();
this->size = uint32_t(size_bytes / psize);
space->committed_size = space->size += this->size;
} else if (space->id != TRX_SYS_SPACE || space->size_in_header) {
/* If this is not the first-time open, do nothing.
For the system tablespace, we always get invoked as
first=false, so we detect the true first-time-open based
on size_in_header and proceed to initialize the data. */
return true;
} else {
/* Initialize the size of predefined tablespaces
to FSP_SIZE. */
space->committed_size = size;
}
space->set_sizes(this->size);
ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
ut_ad(space->free_len == 0 || space->free_len == free_len);
space->size_in_header = size;

View File

@@ -3424,8 +3424,7 @@ fil_iterate(
byte* const writeptr = readptr;
err = os_file_read_no_error_handling(
IORequest(IORequest::READ
| IORequest::DISABLE_PARTIAL_IO_WARNINGS),
IORequestReadPartial,
iter.file, readptr, offset, n_bytes, 0);
if (err != DB_SUCCESS) {
ib::error() << iter.filepath
@@ -3664,9 +3663,7 @@ not_encrypted:
/* A page was updated in the set, write back to disk. */
if (updated) {
IORequest write_request(IORequest::WRITE);
err = os_file_write(write_request,
err = os_file_write(IORequestWrite,
iter.filepath, iter.file,
writeptr, offset, n_bytes);
@@ -3759,9 +3756,7 @@ fil_tablespace_iterate(
/* Read the first page and determine the page and zip size. */
err = os_file_read_no_error_handling(
IORequest(IORequest::READ
| IORequest::DISABLE_PARTIAL_IO_WARNINGS),
err = os_file_read_no_error_handling(IORequestReadPartial,
file, page, 0, srv_page_size, 0);
if (err == DB_SUCCESS) {

View File

@@ -545,7 +545,7 @@ row_quiesce_table_start(
if (!trx_is_interrupted(trx)) {
/* Ensure that all asynchronous IO is completed. */
os_aio_wait_until_no_pending_writes();
fil_flush(table->space_id);
table->space->flush();
if (row_quiesce_write_cfg(table, trx->mysql_thd)
!= DB_SUCCESS) {

View File

@@ -229,10 +229,12 @@ srv_file_check_mode(
static const char INIT_LOG_FILE0[]= "101";
/** Creates log file.
@param[in] create_new_db whether the database is being initialized
@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value
@param[out] logfile0 name of the log file
@return DB_SUCCESS or error code */
static dberr_t create_log_file(lsn_t lsn, std::string& logfile0)
static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
std::string& logfile0)
{
if (srv_read_only_mode) {
ib::error() << "Cannot create log file in read-only mode";
@@ -296,7 +298,9 @@ static dberr_t create_log_file(lsn_t lsn, std::string& logfile0)
}
log_sys.log.open_file(logfile0);
fil_open_system_tablespace_files();
if (!fil_system.sys_space->open(create_new_db)) {
return DB_ERROR;
}
/* Create a log checkpoint. */
log_mutex_enter();
@@ -553,7 +557,7 @@ err_exit:
fil_set_max_space_id_if_bigger(space_id);
fil_space_t *space= fil_space_create(undo_name, space_id, fsp_flags,
fil_space_t *space= fil_space_t::create(undo_name, space_id, fsp_flags,
FIL_TYPE_TABLESPACE, NULL);
ut_a(fil_validate());
ut_a(space);
@@ -563,21 +567,16 @@ err_exit:
if (create)
{
space->set_sizes(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
space->size= file->size= uint32_t(size >> srv_page_size_shift);
space->size_in_header= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
space->committed_size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
}
else
{
success= file->read_page0(true);
if (!success)
else if (!file->read_page0())
{
os_file_close(file->handle);
file->handle= OS_FILE_CLOSED;
ut_a(fil_system.n_open > 0);
fil_system.n_open--;
}
}
mutex_exit(&fil_system.mutex);
return space_id;
@@ -803,7 +802,7 @@ srv_open_tmp_tablespace(bool create_new_db)
true, create_new_db, &sum_of_new_sizes, NULL))
!= DB_SUCCESS) {
ib::error() << "Unable to create the shared innodb_temporary";
} else if (fil_system.temp_space->open()) {
} else if (fil_system.temp_space->open(true)) {
/* Initialize the header page */
mtr_t mtr;
mtr.start();
@@ -1304,7 +1303,7 @@ dberr_t srv_start(bool create_new_db)
log_sys.set_flushed_lsn(flushed_lsn);
buf_flush_sync();
err = create_log_file(flushed_lsn, logfile0);
err = create_log_file(true, flushed_lsn, logfile0);
if (err != DB_SUCCESS) {
return(srv_init_abort(err));
@@ -1333,7 +1332,7 @@ dberr_t srv_start(bool create_new_db)
srv_log_file_size = srv_log_file_size_requested;
err = create_log_file(flushed_lsn, logfile0);
err = create_log_file(false, flushed_lsn, logfile0);
if (err == DB_SUCCESS) {
err = create_log_file_rename(flushed_lsn,
@@ -1364,11 +1363,11 @@ dberr_t srv_start(bool create_new_db)
file_checked:
/* Open log file and data files in the systemtablespace: we keep
them open until database shutdown */
fil_open_system_tablespace_files();
ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug);
err = srv_undo_tablespaces_init(create_new_db);
err = fil_system.sys_space->open(create_new_db)
? srv_undo_tablespaces_init(create_new_db)
: DB_ERROR;
/* If the force recovery is set very high then we carry on regardless
of all errors. Basically this is fingers crossed mode. */
@@ -1673,7 +1672,7 @@ file_checked:
srv_log_file_size = srv_log_file_size_requested;
err = create_log_file(flushed_lsn, logfile0);
err = create_log_file(false, flushed_lsn, logfile0);
if (err == DB_SUCCESS) {
err = create_log_file_rename(flushed_lsn,

View File

@@ -584,11 +584,10 @@ static void trx_purge_truncate_history()
: 0, j = i;; ) {
ulint space_id = srv_undo_space_id_start + i;
ut_ad(srv_is_undo_tablespace(space_id));
fil_space_t* space= fil_space_get(space_id);
if (fil_space_get_size(space_id)
> threshold) {
purge_sys.truncate.current
= fil_space_get(space_id);
if (space && space->get_size() > threshold) {
purge_sys.truncate.current = space;
break;
}