From c6de1267dda77d78f843b26b4dafe0bc0473f7d5 Mon Sep 17 00:00:00 2001 From: Thirunarayanan Balathandayuthapani Date: Thu, 10 Apr 2025 10:04:14 +0530 Subject: [PATCH] MDEV-35689 InnoDB system tables cannot be optimized or defragmented - With the help of MDEV-14795, InnoDB implemented a way to shrink the InnoDB system tablespace after undo tablespaces have been moved to separate files (MDEV-29986). However, there is no way to defragment the pages of InnoDB system tables. Being able to defragment them makes shrinking of the system tablespace more effective. This patch implements defragmentation of the system tables inside ibdata1. The following steps are performed to defragment the system tablespace: 1) Make sure that no user tables exist in ibdata1. 2) Iterate through all extent descriptor pages in the system tablespace and note their states. 3) Find free extents earlier in the tablespace that can replace the last used extents in the system tablespace. 4) Iterate through all indexes of the system tablespace and defragment each tree level by level. 5) Iterate over the level from the left page to the right page and check whether the page belongs to an extent that is to be replaced. If it does, do step (6); otherwise go to step (4). 6) Prepare the allocation from the new extent by latching the necessary pages. If any error happens here, no page has been modified up to and including step (5). 7) Allocate the new page from the new extent. 8) Prepare the associated pages of the block to be modified. 9) Prepare the freeing of the old page. 10) If any error happens while preparing the associated pages or the freeing of the page, restore the pages that were modified during the new page allocation. 11) Copy the old page content to the new page. 12) Update the associated pages, i.e. the left, right and parent pages. 13) Complete the freeing of the old page. Allocating a page from the new extent, updating the associated pages, and freeing a page are each done in 2 steps. One is prepare(), which latches the pages to be modified and validates them. 
Other is complete(), Do the operation fseg_validate(): Validate the list exist in inode segment Defragmentation is enabled only when :autoextend exist in innodb_data_file_path variable. --- .../suite/innodb/r/sys_defragment.result | 25 + .../suite/innodb/r/sys_defragment_fail.result | 52 + mysql-test/suite/innodb/r/sys_truncate.result | 2 +- .../suite/innodb/r/sys_truncate_debug.result | 2 +- mysql-test/suite/innodb/t/sys_defragment.opt | 6 + mysql-test/suite/innodb/t/sys_defragment.test | 40 + .../suite/innodb/t/sys_defragment_fail.opt | 6 + .../suite/innodb/t/sys_defragment_fail.test | 90 + mysql-test/suite/innodb/t/sys_truncate.opt | 1 + mysql-test/suite/innodb/t/sys_truncate.test | 3 +- .../suite/innodb/t/sys_truncate_debug.opt | 1 + .../suite/innodb/t/sys_truncate_debug.test | 2 +- storage/innobase/fsp/fsp0fsp.cc | 1843 ++++++++++++++++- storage/innobase/fut/fut0lst.cc | 39 +- storage/innobase/include/dict0dict.h | 8 + storage/innobase/include/fil0fil.h | 6 + storage/innobase/include/fut0lst.h | 5 +- 17 files changed, 2067 insertions(+), 64 deletions(-) create mode 100644 mysql-test/suite/innodb/r/sys_defragment.result create mode 100644 mysql-test/suite/innodb/r/sys_defragment_fail.result create mode 100644 mysql-test/suite/innodb/t/sys_defragment.opt create mode 100644 mysql-test/suite/innodb/t/sys_defragment.test create mode 100644 mysql-test/suite/innodb/t/sys_defragment_fail.opt create mode 100644 mysql-test/suite/innodb/t/sys_defragment_fail.test diff --git a/mysql-test/suite/innodb/r/sys_defragment.result b/mysql-test/suite/innodb/r/sys_defragment.result new file mode 100644 index 00000000000..f279d694e6e --- /dev/null +++ b/mysql-test/suite/innodb/r/sys_defragment.result @@ -0,0 +1,25 @@ +# restart +SET GLOBAL innodb_file_per_table= 0; +Warnings: +Warning 1287 '@@innodb_file_per_table' is deprecated and will be removed in a future release +SET GLOBAL innodb_limit_optimistic_insert_debug = 2; +CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL, f3 INT NOT 
NULL)ENGINE=InnoDB; +INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; +SET GLOBAL innodb_file_per_table= default; +Warnings: +Warning 1287 '@@innodb_file_per_table' is deprecated and will be removed in a future release +CREATE TABLE t2(f1 INT NOT NULL PRIMARY KEY,f2 VARCHAR(40))ENGINE=InnoDB PARTITION BY KEY() PARTITIONS 256; +INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; +DROP TABLE t2, t1; +InnoDB 0 transactions not purged +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 205520896 +set GLOBAL innodb_fast_shutdown= 0; +# restart +FOUND 1 /InnoDB: Moving the data from extents 4096 through 22016/ in mysqld.1.err +FOUND 1 /InnoDB: Defragmentation of system tablespace is successful/ in mysqld.1.err +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 14680064 +# restart diff --git a/mysql-test/suite/innodb/r/sys_defragment_fail.result b/mysql-test/suite/innodb/r/sys_defragment_fail.result new file mode 100644 index 00000000000..5abf4e65772 --- /dev/null +++ b/mysql-test/suite/innodb/r/sys_defragment_fail.result @@ -0,0 +1,52 @@ +call mtr.add_suppression("InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed: Data structure corruption"); +call mtr.add_suppression("InnoDB: Defragmentation of CLUST_IND in SYS_COLUMNS failed: Data structure corruption"); +call mtr.add_suppression("InnoDB: Cannot free the unused segments in system tablespace"); +# restart +set GLOBAL innodb_file_per_table = 0; +Warnings: +Warning 1287 '@@innodb_file_per_table' is deprecated and will be removed in a future release +set GLOBAL innodb_limit_optimistic_insert_debug = 2; +CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL)ENGINE=InnoDB; +INSERT INTO t1 SELECT seq, seq FROM seq_1_to_4096; +SET GLOBAL innodb_file_per_table= 1; +Warnings: +Warning 1287 '@@innodb_file_per_table' is deprecated and will be removed in a future release +CREATE 
TABLE t2(f1 INT NOT NULL PRIMARY KEY, +f2 VARCHAR(40))ENGINE=InnoDB PARTITION BY KEY() PARTITIONS 256; +INSERT INTO t1 SELECT seq, seq FROM seq_1_to_4096; +DROP TABLE t2; +InnoDB 0 transactions not purged +# restart +FOUND 1 /InnoDB: User table exists in the system tablespace/ in mysqld.1.err +DROP TABLE t1; +InnoDB 0 transactions not purged +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 58720256 +# restart: --debug_dbug=+d,fail_after_level_defragment +FOUND 1 /InnoDB: Defragmentation of CLUST_IND in SYS_COLUMNS failed./ in mysqld.1.err +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 58720256 +# restart: --debug_dbug=d,allocation_prepare_fail +FOUND 1 /InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed./ in mysqld.1.err +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 58720256 +# restart: --debug_dbug=d,relation_page_prepare_fail +FOUND 2 /InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed./ in mysqld.1.err +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 58720256 +# restart: --debug_dbug=d,remover_prepare_fail +FOUND 3 /InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed./ in mysqld.1.err +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 58720256 +# restart +FOUND 5 /InnoDB: Moving the data from extents 4096 through 8960/ in mysqld.1.err +FOUND 1 /InnoDB: Defragmentation of system tablespace is successful/ in mysqld.1.err +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; +name file_size +innodb_system 15728640 +# restart diff --git a/mysql-test/suite/innodb/r/sys_truncate.result b/mysql-test/suite/innodb/r/sys_truncate.result index bb8eafbe584..909f363096e 
100644 --- a/mysql-test/suite/innodb/r/sys_truncate.result +++ b/mysql-test/suite/innodb/r/sys_truncate.result @@ -4,7 +4,7 @@ Warning 1287 '@@innodb_file_per_table' is deprecated and will be removed in a fu SET UNIQUE_CHECKS=0, FOREIGN_KEY_CHECKS=0; CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL, f3 INT NOT NULL, INDEX(f1), -INDEX(f2), INDEX(f3))ENGINE=InnoDB; +INDEX(f2), INDEX(f3))STATS_PERSISTENT=0 ENGINE=InnoDB; BEGIN; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; diff --git a/mysql-test/suite/innodb/r/sys_truncate_debug.result b/mysql-test/suite/innodb/r/sys_truncate_debug.result index b198d449a76..f51693d849c 100644 --- a/mysql-test/suite/innodb/r/sys_truncate_debug.result +++ b/mysql-test/suite/innodb/r/sys_truncate_debug.result @@ -9,7 +9,7 @@ Warning 1287 '@@innodb_file_per_table' is deprecated and will be removed in a fu SET UNIQUE_CHECKS=0, FOREIGN_KEY_CHECKS=0; CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL, f3 INT NOT NULL, INDEX(f1), -INDEX(f2), INDEX(f3))ENGINE=InnoDB; +INDEX(f2), INDEX(f3))STATS_PERSISTENT=0 ENGINE=InnoDB; BEGIN; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; diff --git a/mysql-test/suite/innodb/t/sys_defragment.opt b/mysql-test/suite/innodb/t/sys_defragment.opt new file mode 100644 index 00000000000..beba09e7f48 --- /dev/null +++ b/mysql-test/suite/innodb/t/sys_defragment.opt @@ -0,0 +1,6 @@ +--innodb_page_size=4k +--innodb_data_file_path=ibdata1:1M:autoextend:autoshrink +--innodb_undo_tablespaces=0 +--innodb_stats_persistent=0 +--skip_partition=0 +--innodb_sys_tablespaces diff --git a/mysql-test/suite/innodb/t/sys_defragment.test b/mysql-test/suite/innodb/t/sys_defragment.test new file mode 100644 index 00000000000..93b219458a6 --- /dev/null +++ b/mysql-test/suite/innodb/t/sys_defragment.test @@ -0,0 +1,40 @@ +--source include/have_innodb.inc +--source include/have_sequence.inc +--source 
include/have_debug.inc + +--let MYSQLD_DATADIR= `SELECT @@datadir` +--source include/shutdown_mysqld.inc +--copy_file $MYSQLD_DATADIR/ibdata1 $MYSQLD_DATADIR/ibdata1_copy +--copy_file $MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile0_copy +--source include/start_mysqld.inc + +SET GLOBAL innodb_file_per_table= 0; +SET GLOBAL innodb_limit_optimistic_insert_debug = 2; +CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL, f3 INT NOT NULL)ENGINE=InnoDB; +INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; +SET GLOBAL innodb_file_per_table= default; +CREATE TABLE t2(f1 INT NOT NULL PRIMARY KEY,f2 VARCHAR(40))ENGINE=InnoDB PARTITION BY KEY() PARTITIONS 256; +INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; +DROP TABLE t2, t1; +--source include/wait_all_purged.inc + +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +set GLOBAL innodb_fast_shutdown= 0; +--source include/restart_mysqld.inc + +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let SEARCH_PATTERN=InnoDB: Moving the data from extents 4096 through 22016; +--source include/search_pattern_in_file.inc + +let SEARCH_PATTERN=InnoDB: Defragmentation of system tablespace is successful; +--source include/search_pattern_in_file.inc + +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +--source include/shutdown_mysqld.inc + +--move_file $MYSQLD_DATADIR/ibdata1_copy $MYSQLD_DATADIR/ibdata1 +--move_file $MYSQLD_DATADIR/ib_logfile0_copy $MYSQLD_DATADIR/ib_logfile0 + +--source include/start_mysqld.inc diff --git a/mysql-test/suite/innodb/t/sys_defragment_fail.opt b/mysql-test/suite/innodb/t/sys_defragment_fail.opt new file mode 100644 index 00000000000..6326b7e90a3 --- /dev/null +++ b/mysql-test/suite/innodb/t/sys_defragment_fail.opt @@ -0,0 +1,6 @@ +--innodb_page_size=4k +--innodb_data_file_path=ibdata1:15M:autoextend:autoshrink +--innodb_undo_tablespaces=0 +--innodb_stats_persistent=0 +--skip_partition=0 
+--innodb_sys_tablespaces diff --git a/mysql-test/suite/innodb/t/sys_defragment_fail.test b/mysql-test/suite/innodb/t/sys_defragment_fail.test new file mode 100644 index 00000000000..6136d43d601 --- /dev/null +++ b/mysql-test/suite/innodb/t/sys_defragment_fail.test @@ -0,0 +1,90 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_sequence.inc + +call mtr.add_suppression("InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed: Data structure corruption"); +call mtr.add_suppression("InnoDB: Defragmentation of CLUST_IND in SYS_COLUMNS failed: Data structure corruption"); +call mtr.add_suppression("InnoDB: Cannot free the unused segments in system tablespace"); + +--let MYSQLD_DATADIR= `SELECT @@datadir` +--source include/shutdown_mysqld.inc +--copy_file $MYSQLD_DATADIR/ibdata1 $MYSQLD_DATADIR/ibdata1_copy +--copy_file $MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile0_copy +--source include/start_mysqld.inc + +set GLOBAL innodb_file_per_table = 0; +set GLOBAL innodb_limit_optimistic_insert_debug = 2; +CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL)ENGINE=InnoDB; +INSERT INTO t1 SELECT seq, seq FROM seq_1_to_4096; + +SET GLOBAL innodb_file_per_table= 1; +CREATE TABLE t2(f1 INT NOT NULL PRIMARY KEY, + f2 VARCHAR(40))ENGINE=InnoDB PARTITION BY KEY() PARTITIONS 256; + +INSERT INTO t1 SELECT seq, seq FROM seq_1_to_4096; +DROP TABLE t2; + +--source include/wait_all_purged.inc +let $restart_parameters=; +--source include/restart_mysqld.inc + +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let SEARCH_PATTERN=InnoDB: User table exists in the system tablespace; +--source include/search_pattern_in_file.inc +DROP TABLE t1; + +--source include/wait_all_purged.inc +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +let $restart_parameters=--debug_dbug=+d,fail_after_level_defragment; +--source include/restart_mysqld.inc + +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let 
SEARCH_PATTERN=InnoDB: Defragmentation of CLUST_IND in SYS_COLUMNS failed.; +--source include/search_pattern_in_file.inc + +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +let $restart_parameters=--debug_dbug=d,allocation_prepare_fail; +--source include/restart_mysqld.inc + +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let SEARCH_PATTERN=InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed.; +--source include/search_pattern_in_file.inc + +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +let $restart_parameters=--debug_dbug=d,relation_page_prepare_fail; +--source include/restart_mysqld.inc + +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let SEARCH_PATTERN=InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed.; +--source include/search_pattern_in_file.inc + +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +let $restart_parameters=--debug_dbug=d,remover_prepare_fail; +--source include/restart_mysqld.inc + +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let SEARCH_PATTERN=InnoDB: Defragmentation of CLUST_IND in SYS_INDEXES failed.; +--source include/search_pattern_in_file.inc + +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +let $restart_parameters=; +--source include/restart_mysqld.inc + +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let SEARCH_PATTERN= InnoDB: Moving the data from extents 4096 through 8960; +--source include/search_pattern_in_file.inc + +let SEARCH_PATTERN=InnoDB: Defragmentation of system tablespace is successful; +--source include/search_pattern_in_file.inc + +select name, file_size from information_schema.innodb_sys_tablespaces where space = 0; + +--source include/shutdown_mysqld.inc +--move_file $MYSQLD_DATADIR/ibdata1_copy $MYSQLD_DATADIR/ibdata1 +--move_file $MYSQLD_DATADIR/ib_logfile0_copy $MYSQLD_DATADIR/ib_logfile0 +--source 
include/start_mysqld.inc diff --git a/mysql-test/suite/innodb/t/sys_truncate.opt b/mysql-test/suite/innodb/t/sys_truncate.opt index f940dadffd3..375d9847b65 100644 --- a/mysql-test/suite/innodb/t/sys_truncate.opt +++ b/mysql-test/suite/innodb/t/sys_truncate.opt @@ -1,2 +1,3 @@ --innodb_data_file_path=ibdata1:10M:autoextend:autoshrink --innodb_sys_tablespaces +--innodb_buffer_pool_size=75M diff --git a/mysql-test/suite/innodb/t/sys_truncate.test b/mysql-test/suite/innodb/t/sys_truncate.test index d5e05dea34c..fbba010dc09 100644 --- a/mysql-test/suite/innodb/t/sys_truncate.test +++ b/mysql-test/suite/innodb/t/sys_truncate.test @@ -1,10 +1,11 @@ --source include/have_innodb.inc --source include/have_sequence.inc + SET GLOBAL INNODB_FILE_PER_TABLE= 0; SET UNIQUE_CHECKS=0, FOREIGN_KEY_CHECKS=0; CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL, f3 INT NOT NULL, INDEX(f1), - INDEX(f2), INDEX(f3))ENGINE=InnoDB; + INDEX(f2), INDEX(f3))STATS_PERSISTENT=0 ENGINE=InnoDB; BEGIN; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; diff --git a/mysql-test/suite/innodb/t/sys_truncate_debug.opt b/mysql-test/suite/innodb/t/sys_truncate_debug.opt index b8a0ed244e4..97647d9843b 100644 --- a/mysql-test/suite/innodb/t/sys_truncate_debug.opt +++ b/mysql-test/suite/innodb/t/sys_truncate_debug.opt @@ -1,3 +1,4 @@ --innodb_data_file_path=ibdata1:1M:autoextend:autoshrink --innodb_sys_tablespaces --innodb_page_size=4k +--innodb_buffer_pool_size=100M diff --git a/mysql-test/suite/innodb/t/sys_truncate_debug.test b/mysql-test/suite/innodb/t/sys_truncate_debug.test index be70ea743e9..7dcb5ffde2a 100644 --- a/mysql-test/suite/innodb/t/sys_truncate_debug.test +++ b/mysql-test/suite/innodb/t/sys_truncate_debug.test @@ -16,7 +16,7 @@ SET GLOBAL INNODB_FILE_PER_TABLE= 0; SET UNIQUE_CHECKS=0, FOREIGN_KEY_CHECKS=0; CREATE TABLE t1(f1 INT NOT NULL, f2 INT NOT NULL, f3 INT NOT NULL, INDEX(f1), - INDEX(f2), INDEX(f3))ENGINE=InnoDB; + INDEX(f2), 
INDEX(f3))STATS_PERSISTENT=0 ENGINE=InnoDB; BEGIN; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; INSERT INTO t1 SELECT seq, seq, seq FROM seq_1_to_16384; diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index cc55ddd66cc..63d68930e0e 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -42,6 +42,7 @@ Created 11/29/1995 Heikki Tuuri #ifndef DBUG_OFF # include "trx0purge.h" #endif +#include #include #include "trx0undo.h" @@ -237,15 +238,13 @@ inline void xdes_set_state(const buf_block_t &block, xdes_t *descr, Gets the state of an xdes. @return state */ UNIV_INLINE -ulint +uint32_t xdes_get_state( /*===========*/ const xdes_t* descr) /*!< in: descriptor */ { - ulint state; - ut_ad(descr); - state = mach_read_from_4(descr + XDES_STATE); + uint32_t state = mach_read_from_4(descr + XDES_STATE); ut_ad(state - 1 < XDES_FSEG); return(state); } @@ -1057,7 +1056,7 @@ fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr, @param[in,out] mtr mini-transaction @return block, initialized */ static buf_block_t* fsp_page_create(fil_space_t *space, uint32_t offset, - mtr_t *mtr) + mtr_t *mtr) noexcept { buf_block_t *free_block= buf_LRU_get_free_block(have_no_mutex), *block= buf_page_create(space, offset, space->zip_size(), mtr, free_block); @@ -3089,14 +3088,13 @@ std::ostream &fseg_header::to_stream(std::ostream &out) const } #endif /* UNIV_DEBUG */ -/** Get the latched extent descriptor page or -acquire the extent descriptor page. +/** Get the latched page page or acquire the page. 
@param page_id page identifier to be acquired @param mtr mini-transaction @param err error code @return block descriptor */ static -buf_block_t *fsp_get_latched_xdes_page( +buf_block_t *fsp_get_latched_page( page_id_t page_id, mtr_t *mtr, dberr_t *err) { buf_block_t *block= nullptr; @@ -3118,7 +3116,7 @@ class fsp_xdes_old_page const uint32_t m_space; public: fsp_xdes_old_page(uint32_t space):m_space(space) {} - ulint n_pages() + uint32_t n_pages() noexcept { uint32_t count=0; for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++) @@ -3127,7 +3125,7 @@ public: } __attribute__((warn_unused_result)) - dberr_t insert(uint32_t page_no, mtr_t *mtr) + dberr_t insert(uint32_t page_no, mtr_t *mtr) noexcept { uint32_t m_index= page_no >> srv_page_size_shift; if (m_old_xdes_pages.size() > m_index && @@ -3137,7 +3135,7 @@ public: DBUG_EXECUTE_IF("shrink_buffer_pool_full", return DB_OUT_OF_MEMORY;); dberr_t err= DB_SUCCESS; - buf_block_t *block= fsp_get_latched_xdes_page( + buf_block_t *block= fsp_get_latched_page( page_id_t(m_space, page_no), mtr, &err); if (block) { @@ -3154,7 +3152,7 @@ public: return err; } - buf_block_t *search(uint32_t page_no) + buf_block_t *search(uint32_t page_no) noexcept { uint32_t m_index= page_no >> srv_page_size_shift; if (m_index > m_old_xdes_pages.size()) @@ -3162,7 +3160,7 @@ public: return m_old_xdes_pages[m_index]; } - void restore(mtr_t *mtr) + void restore(mtr_t *mtr) noexcept { for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++) { @@ -3198,11 +3196,11 @@ static dberr_t fsp_lst_update_skip( buf_block_t *header, uint16_t hdr_offset, fil_addr_t cur_addr, fil_addr_t last_valid_addr, - uint32_t skip_len, mtr_t *mtr) + uint32_t skip_len, mtr_t *mtr) noexcept { dberr_t err= DB_SUCCESS; uint32_t space_id= header->page.id().space(); - buf_block_t *cur= fsp_get_latched_xdes_page( + buf_block_t *cur= fsp_get_latched_page( page_id_t(space_id, cur_addr.page), mtr, &err); if (!cur) return err; @@ -3229,7 +3227,7 @@ dberr_t fsp_lst_update_skip( prev= 
cur; else { - prev= fsp_get_latched_xdes_page( + prev= fsp_get_latched_page( page_id_t(space_id, last_valid_addr.page), mtr, &err); if (!prev) return err; @@ -3270,7 +3268,7 @@ dberr_t fsp_lst_write_end( buf_block_t *header, uint16_t hdr_offset, fil_addr_t cur_addr, uint32_t skip_len, uint32_t orig_len, - mtr_t *mtr) + mtr_t *mtr) noexcept { dberr_t err= DB_SUCCESS; byte *len_bytes= &header->page.frame[hdr_offset + FLST_LEN]; @@ -3312,7 +3310,7 @@ func_exit: header->page.frame + hdr_offset + FLST_LAST, cur_addr.page, cur_addr.boffset, mtr); - buf_block_t *cur_block= fsp_get_latched_xdes_page( + buf_block_t *cur_block= fsp_get_latched_page( page_id_t(header->page.id().space(), cur_addr.page), mtr, &err); @@ -3340,7 +3338,7 @@ func_exit: __attribute__((warn_unused_result)) static dberr_t fsp_shrink_list(buf_block_t *header, uint16_t hdr_offset, - uint32_t threshold, mtr_t *mtr) + uint32_t threshold, mtr_t *mtr) noexcept { ut_ad(mach_read_from_4(header->page.frame + FIL_PAGE_OFFSET) == 0); const uint32_t len= flst_get_len(hdr_offset + header->page.frame); @@ -3362,7 +3360,7 @@ dberr_t fsp_shrink_list(buf_block_t *header, uint16_t hdr_offset, ut_ad(!(addr.page & (srv_page_size - 1))); if (!descr_block || descr_block->page.id().page_no() != addr.page) { - descr_block= fsp_get_latched_xdes_page( + descr_block= fsp_get_latched_page( page_id_t(header->page.id().space(), addr.page), mtr, &err); if (!descr_block) return err; @@ -3426,7 +3424,7 @@ dberr_t fsp_xdes_reset(uint32_t space_id, uint32_t threshold, mtr_t *mtr) 0, (cur_descr_page + srv_page_size - 1)); last_descr_offset+= XDES_SIZE; dberr_t err= DB_SUCCESS; - buf_block_t *block= fsp_get_latched_xdes_page( + buf_block_t *block= fsp_get_latched_page( page_id_t(space_id, cur_descr_page), mtr, &err); if (!block) return err; @@ -3478,7 +3476,7 @@ dberr_t fsp_traverse_extents( { if (!block) { - block= fsp_get_latched_xdes_page( + block= fsp_get_latched_page( page_id_t(space->id, last_descr_page_no), mtr, &err); if (!block) 
return err; @@ -3546,32 +3544,31 @@ dberr_t fsp_traverse_extents( return err; } -#ifdef UNIV_DEBUG /** Validate the system tablespace list */ __attribute__((warn_unused_result)) -dberr_t fsp_tablespace_validate(fil_space_t *space) +static dberr_t fsp_tablespace_validate(fil_space_t *space, + mtr_t *mtr) noexcept { /* Validate all FSP list in system tablespace */ - mtr_t local_mtr; dberr_t err= DB_SUCCESS; - local_mtr.start(); - if (buf_block_t *header= fsp_get_header( - space, &local_mtr, &err)) + if (buf_block_t *header= fsp_get_header(space, mtr, &err)) { - flst_validate(header, FSP_FREE + FSP_HEADER_OFFSET, &local_mtr); - flst_validate(header, FSP_FREE_FRAG + FSP_HEADER_OFFSET, - &local_mtr); - flst_validate(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG, - &local_mtr); - flst_validate(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, - &local_mtr); - flst_validate(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, - &local_mtr); + err= flst_validate(header, FSP_FREE + FSP_HEADER_OFFSET, mtr); + if (err == DB_SUCCESS) + err= flst_validate(header, FSP_FREE_FRAG + FSP_HEADER_OFFSET, + mtr); + if (err == DB_SUCCESS) + err= flst_validate(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG, + mtr); + if (err == DB_SUCCESS) + err= flst_validate(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + mtr); + if (err == DB_SUCCESS) + err= flst_validate(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + mtr); } - local_mtr.commit(); return err; } -#endif /* UNIV_DEBUG */ /** Store the inode information which basically stores the page and offset */ @@ -4017,6 +4014,1748 @@ dberr_t fil_space_t::garbage_collect(bool shutdown) return unused_inodes.free_segs(); } +class SpaceDefragmenter; + +namespace flst +{ + /** Validate the file list node for the system tablespace. 
+ @param addr file space address + @return true if validation successful or false */ + static bool node_valid(const fil_addr_t *addr) noexcept + { + return addr->boffset >= FIL_PAGE_DATA && + addr->boffset < (srv_page_size - FIL_PAGE_DATA_END); + } + + /** Prepare the steps for removing the file list node + @param descr_block descriptor block + @param xoffset descriptor offset within the block + @param free_limit maximum free limit in the tablespace + @param mtr mini-transaction + @param prev_block previous block in the list + @param next_block next block in the list + @return error code */ + static dberr_t remove_prepare(const buf_block_t &descr_block, + uint32_t xoffset, uint32_t free_limit, + mtr_t *mtr, buf_block_t **prev_block, + buf_block_t **next_block) noexcept + { + const xdes_t *descr= descr_block.page.frame + xoffset; + fil_addr_t prev_addr= flst_get_prev_addr(descr); + fil_addr_t next_addr= flst_get_next_addr(descr); + dberr_t err= DB_SUCCESS; + + if (prev_addr.page != FIL_NULL) + { + if (!node_valid(&prev_addr)) + return DB_CORRUPTION; + + *prev_block= fsp_get_latched_page(page_id_t{0, prev_addr.page}, + mtr, &err); + ut_ad(!*prev_block == (err != DB_SUCCESS)); + + if (!*prev_block) + return err; + + fil_addr_t cur_addr= + flst_get_next_addr((*prev_block)->page.frame + + prev_addr.boffset); + if (cur_addr.page != descr_block.page.id().page_no() || + cur_addr.boffset != xoffset) + return DB_CORRUPTION; + } + + if (next_addr.page != FIL_NULL) + { + if (!node_valid(&next_addr)) + return DB_CORRUPTION; + + *next_block= fsp_get_latched_page(page_id_t{0, next_addr.page}, + mtr, &err); + ut_ad(!*next_block == (err != DB_SUCCESS)); + if (!*next_block) + return err; + + fil_addr_t cur_addr= + flst_get_prev_addr((*next_block)->page.frame + next_addr.boffset); + if (cur_addr.page != descr_block.page.id().page_no() || + cur_addr.boffset != xoffset) + return DB_CORRUPTION; + } + + return err; + } + + /** Complete the steps for removing the file list node + @param 
base base block where free list starts + @param boffset offset where list starts + @param descr descriptor to be removed + @param mtr mini-transaction */ + static void remove_complete(buf_block_t *base, uint16_t boffset, + xdes_t *descr, mtr_t *mtr) noexcept + { + fil_addr_t prev_addr= flst_get_prev_addr(descr + XDES_FLST_NODE); + fil_addr_t next_addr= flst_get_next_addr(descr + XDES_FLST_NODE); + /* remove_prepare() checked these already */ + ut_ad(next_addr.page == FIL_NULL || node_valid(&next_addr)); + ut_ad(prev_addr.page == FIL_NULL || node_valid(&prev_addr)); + byte *list= base->page.frame + boffset; + + buf_block_t *prev_block= nullptr; + buf_block_t *next_block= nullptr; + + if (prev_addr.page != FIL_NULL) + { + prev_block= + mtr->get_already_latched(page_id_t{0, prev_addr.page}, + MTR_MEMO_PAGE_SX_FIX); + ut_ad(prev_block); + + flst_write_addr(*prev_block, prev_block->page.frame + + prev_addr.boffset + FLST_NEXT, + next_addr.page, next_addr.boffset, mtr); + } + else + flst_write_addr(*base, list + FLST_FIRST, + next_addr.page, next_addr.boffset, mtr); + + if (next_addr.page != FIL_NULL) + { + next_block= + mtr->get_already_latched(page_id_t{0, next_addr.page}, + MTR_MEMO_PAGE_SX_FIX); + ut_ad(next_block); + + flst_write_addr(*next_block, next_block->page.frame + + next_addr.boffset + FLST_PREV, + prev_addr.page, prev_addr.boffset, mtr); + } + else + flst_write_addr(*base, list + FLST_LAST, + prev_addr.page, prev_addr.boffset, mtr); + + /* All callers of remove_prepare() does check the FLST_LEN of + the list */ + byte *len= list + FLST_LEN; + mtr->write<4>(*base, len, mach_read_from_4(len) - 1); + } + + /** Prepare the steps for adding the block into last of the list + @param base block where list starts + @param boffset offset to find the list + @param free_limit maximum free limit in the tablespace + @param mtr mini-transaction + @param last_block_list last block in the list + @return error code */ + static dberr_t append_prepare(const buf_block_t &base, 
uint16_t boffset, + uint32_t free_limit, mtr_t *mtr, + buf_block_t **last_block_list) noexcept + { + ut_ad(!*last_block_list); + if (!flst_get_len(base.page.frame + boffset)) + return DB_SUCCESS; + + fil_addr_t addr= flst_get_last(base.page.frame + boffset); + + if (addr.page >= free_limit) + return DB_CORRUPTION; + + if (!node_valid(&addr)) + return DB_CORRUPTION; + + dberr_t err= DB_SUCCESS; + *last_block_list= fsp_get_latched_page(page_id_t{0, addr.page}, + mtr, &err); + return err; + } + + /** Complete the steps for adding the block into last of the list + @param base base block where free list starts + @param boffset offset where list starts + @param curr extent descriptor block + @param coffset offset to point the descriptor + @param mtr mini-transaction */ + static void append_complete(buf_block_t *base, uint16_t boffset, + buf_block_t *curr, uint16_t coffset, + mtr_t *mtr) noexcept + { + fil_addr_t last_addr= flst_get_last(base->page.frame + boffset); + ut_ad(last_addr.page == FIL_NULL || node_valid(&last_addr)); + buf_block_t *last_block_list= nullptr; + if (last_addr.page != FIL_NULL) + { + last_block_list= + mtr->get_already_latched(page_id_t{0, last_addr.page}, + MTR_MEMO_PAGE_SX_FIX); + ut_ad(last_block_list); + + fil_addr_t addr= flst_get_last(base->page.frame + boffset); + + flst_write_addr(*last_block_list, + last_block_list->page.frame + addr.boffset + + FLST_NEXT, + curr->page.id().page_no(), coffset, mtr); + flst_write_addr(*curr, + curr->page.frame + coffset + FLST_PREV, + addr.page, addr.boffset, mtr); + flst_write_addr(*base, base->page.frame + boffset + FLST_LAST, + curr->page.id().page_no(), coffset, mtr); + } + else + { + /* Encountered empty list. 
So add current block as FIRST + and LAST block in the list */ + flst_write_addr(*curr, + curr->page.frame + coffset + FLST_PREV, + FIL_NULL, 0, mtr); + flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST, + curr->page.id().page_no(), coffset, mtr); + memcpy(base->page.frame + boffset + FLST_LAST, + base->page.frame + boffset + FLST_FIRST, FIL_ADDR_SIZE); + mtr->memmove(*base, boffset + FLST_LAST, + boffset + FLST_FIRST, FIL_ADDR_SIZE); + } + + flst_write_addr(*curr, + curr->page.frame + coffset + FLST_NEXT, + FIL_NULL, 0, mtr); + + byte *len= base->page.frame + boffset + FLST_LEN; + mtr->write<4>(*base, len, mach_read_from_4(len) + 1); + } +} /* namespace flst */ + +static dberr_t fseg_validate_low(fil_space_t *space, dict_index_t *index, + mtr_t *mtr) noexcept +{ + dberr_t err= DB_SUCCESS; + buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err); + if (UNIV_UNLIKELY(!root)) + return err; + + fseg_header_t *seg_header= + root->page.frame + PAGE_HEADER + PAGE_BTR_SEG_TOP; + buf_block_t *iblock; + fseg_inode_t *inode= fseg_inode_try_get(seg_header, 0, 0, mtr, + &iblock, &err); + if (!inode) + return err; + + uint16_t i_offset= uint16_t(inode - iblock->page.frame); + + err= flst_validate(iblock, uint16_t(i_offset + FSEG_FREE), mtr); + if (err == DB_SUCCESS) + err= flst_validate(iblock, uint16_t(i_offset + FSEG_NOT_FULL), mtr); + if (err == DB_SUCCESS) + err= flst_validate(iblock, uint16_t(i_offset + FSEG_FULL), mtr); + + if (err) return err; + + seg_header= root->page.frame + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + inode= fseg_inode_try_get(seg_header, 0, 0, mtr, &iblock, &err); + if (!inode) + return err; + + i_offset= uint16_t(inode - iblock->page.frame); + + err= flst_validate(iblock, uint16_t(i_offset + FSEG_FREE), mtr); + if (err == DB_SUCCESS) + err= flst_validate(iblock, uint16_t(i_offset + FSEG_NOT_FULL), mtr); + if (err == DB_SUCCESS) + err= flst_validate(iblock, uint16_t(i_offset + FSEG_FULL), mtr); + return err; +} + +/** Validate the 
system tablespace list */ +__attribute__((warn_unused_result)) +static dberr_t fseg_validate(fil_space_t *space, + dict_index_t *index) noexcept +{ + /* Validate all FSP list in system tablespace */ + mtr_t mtr; + mtr.start(); + dberr_t err= fseg_validate_low(space, index, &mtr); + mtr.commit(); + return err; +} + +/** Prepare the associate pages of the current block and modify +the associated pages */ +class AssociatedPages final +{ + buf_block_t *m_left_block= nullptr; + buf_block_t *m_right_block= nullptr; + buf_block_t *m_parent_block= nullptr; + buf_block_t *const m_cur_block; + mtr_t *const m_mtr; + +public: + AssociatedPages(buf_block_t *cur_block, mtr_t *mtr) + : m_cur_block(cur_block), m_mtr(mtr) {} + + /** Fetch the left, right and parent page for the respective + current block and make sure that there is no issue exist */ + dberr_t prepare(uint32_t parent_page) noexcept + { + uint32_t left_page_no= btr_page_get_prev(m_cur_block->page.frame); + dberr_t err= DB_SUCCESS; + if (left_page_no != FIL_NULL) + { + m_left_block= fsp_get_latched_page(page_id_t{0, left_page_no}, + m_mtr, &err); + ut_ad(!m_left_block == (err != DB_SUCCESS)); + if (!m_left_block) + return err; + } + + uint32_t right_page_no= btr_page_get_next(m_cur_block->page.frame); + if (right_page_no != FIL_NULL) + { + m_right_block= fsp_get_latched_page(page_id_t{0, right_page_no}, + m_mtr, &err); + ut_ad(!m_right_block == (err != DB_SUCCESS)); + if (!m_right_block) + return err; + } + + m_parent_block= fsp_get_latched_page(page_id_t{0, parent_page}, + m_mtr, &err); + return err; + } + + /** Modify the FIL_PAGE_NEXT, FIL_PAGE_PREV, CHILD_PAGE of + respective left, right and parent block to new page number */ + void complete(uint32_t new_page_no, uint32_t parent_offset) noexcept + { + if (m_left_block) + m_mtr->write<4>(*m_left_block, + m_left_block->page.frame + FIL_PAGE_NEXT, + new_page_no); + + if (m_right_block) + m_mtr->write<4>(*m_right_block, + m_right_block->page.frame + FIL_PAGE_PREV, + 
new_page_no); + + m_mtr->write<4>(*m_parent_block, + m_parent_block->page.frame + parent_offset, + new_page_no); + } +}; + +/** Page operation for the system tablespace does 2 things: +1) Page Allocation +2) Page removal + +Steps for page allocation depend on the new extent state. + +(1) If the xdes_get_state(new_descr) == XDES_FREE then +remove the new extent from FSP_FREE list + + (1.1) If the page has to be allocated for segment then + add the newly allocated extent descriptor to + FSEG_NOT_FULL list and make the xdes_set_state(new_descr) + as XDES_FSEG + + (1.2) If the page has to be non-segment page then add the + newly allocated extent descriptor to FSP_FREE_FRAG list + and make the xdes_set_state(new_descr) as XDES_FREE_FRAG + + (1.3) Allocate a page from the new extent + +(2) If the xdes_get_state(new_descr) == XDES_FREE_FRAG then + + (2.1) Allocate a page from the new extent + + (2.2) If xdes_get_n_used(new_descr) is FSP_EXTENT_SIZE then + - Remove the new extent descriptor from FSP_FREE_FRAG list + - Add the new extent descriptor to FSP_FULL_FRAG list + and make xdes_set_state(new_descr) as XDES_FULL_FRAG + +(3) If the xdes_get_state(new_descr) == XDES_FSEG then + + (3.1) Allocate a page from extent + + (3.2) If xdes_get_n_used(new_descr) is FSP_EXTENT_SIZE then + - Remove the new extent descriptor from FSEG_NOT_FULL list + - Add the new extent descriptor to FSEG_FULL list + + +Steps for removing the page from extent: + + (1) To remove the page from extent when the number of used + pages in extent descriptor is FSP_EXTENT_SIZE + + (1a) If the xdes_get_state(m_old_descr) is XDES_FSEG then + move the extent descriptor from FSEG_FULL to FSEG_NOT_FULL + + (1b) If the xdes_get_state(m_old_descr) is XDES_FREE_FRAG/XDES_FULL_FRAG + then move the extent descriptor from FSP_FULL_FRAG to + FSP_FREE_FRAG list + + (2) If the number of used pages in extent descriptor is 0 then + move the extent descriptor to FSP_FREE + + (3) Free the page and mark the XDES_FREE_BIT of the 
respective + page in current extent descriptor + +Above all scenario done by 2 steps to make sure that there +will be no error scenario once the modification of the pages +has started. +1) prepare - Basically validates the necessary condition +and make sure that pages are being latched +2) Complete - Completes the action by using the latched +pages in prepare step */ +class PageOperator final +{ + /** Header block for the tablespace */ + buf_block_t *const m_header_block= nullptr; + /** Index node block */ + buf_block_t *const m_iblock= nullptr; + /** Index node */ + fseg_inode_t *const m_inode= nullptr; + /** offset of index node within index node page*/ + uint16_t m_ioffset= 0; + /** Maximum free limit of the tablespace */ + uint32_t m_free_limit= 0; + /** Segment id */ + uint64_t m_seg_id= 0; + /** Extent size */ + uint32_t m_extent_size= 0; + + /** New block to be allocated */ + buf_block_t *m_new_block= nullptr; + /** New block extent descriptor */ + buf_block_t *m_new_xdes= nullptr; + /** New block descriptor */ + xdes_t *m_new_descr= nullptr; + /** New block descriptor offset within xdes page */ + uint16_t m_xoffset= 0; + /** New extent descriptor state */ + uint32_t m_new_state= 0; + /** Need segment allocation */ + bool m_need_segment= false; + /** Old pages during allocation to be saved */ + buf_block_t *m_old_pages[8]= {nullptr}; + /** Page to be removed */ + byte m_old_page_no[4]= {0}; + /** Old block extent descriptor page */ + buf_block_t *m_old_xdes= nullptr; + /** Old block descriptor */ + xdes_t *m_old_descr= nullptr; + /** Old block descriptor offset with descriptor page */ + uint16_t m_old_xoffset= 0; + /** Old descriptor state */ + uint32_t m_old_state= 0; + /** Mini-transaction to allocate & free a page */ + mtr_t *const m_mtr; + + /** Save the old page state of the block before + allocating a page + @param block block to be stored + @return error code */ + dberr_t save_old_page(buf_block_t *block) noexcept + { + if (!block) return DB_SUCCESS; 
+ size_t first_free; /* index of the first unused slot in m_old_pages */ + for (first_free= 0; first_free < array_elements(m_old_pages); first_free++) + { + const buf_block_t *b= m_old_pages[first_free]; + if (!b) + goto found; + if (b->page.hash == &block->page) /* a copy of this block was saved earlier */ + return DB_SUCCESS; + } + return DB_CORRUPTION; /* every slot of m_old_pages is occupied */ +found: + buf_block_t *old= buf_LRU_get_free_block(have_no_mutex_soft); + if (!old) return DB_OUT_OF_MEMORY; + memcpy_aligned( + old->page.frame, block->page.frame, srv_page_size); + m_old_pages[first_free]= old; + old->page.hash= &block->page; /* back-pointer from the copy to the block it was taken from; restore_old_pages() and the destructor rely on it */ + return DB_SUCCESS; + } + + /** Prepare the steps for free extent allocation by validating + FLST_PREV, FLST_NEXT of chosen extent descriptor + and the FLST_LEN of FSP_FREE list in FSP_HEADER_PAGE. + The previous and next pages returned by flst::remove_prepare() + are saved with save_old_page(). + @return error code or DB_SUCCESS */ + dberr_t free_extent_prepare() noexcept + { + /* At least there should be 1 element in FSP_FREE list */ + byte *len= + &m_header_block->page.frame[FSP_HEADER_OFFSET + FSP_FREE + + FLST_LEN]; + if (mach_read_from_4(len) == 0) + return DB_CORRUPTION; + + buf_block_t *fsp_free_prev= nullptr; + buf_block_t *fsp_free_next= nullptr; + + dberr_t err= flst::remove_prepare(*m_new_xdes, m_xoffset, + m_free_limit, m_mtr, + &fsp_free_prev, &fsp_free_next); + if (err == DB_SUCCESS) + { + err= save_old_page(fsp_free_prev); + if (err == DB_SUCCESS) + err= save_old_page(fsp_free_next); + } + return err; + } + + /** Complete the free extent allocation: remove the new extent + descriptor from the FSP_FREE list and decrement the free_len + counter of the system tablespace */ + void free_extent_complete() noexcept + { + flst::remove_complete(m_header_block, FSP_HEADER_OFFSET + FSP_FREE, + m_new_descr, m_mtr); + fil_system.sys_space->free_len--; + } + + /** Prepare the steps to do the following + 1) free extent allocation + 2) Add the extent to FSEG_NOT_FULL list by validating the + last extent descriptor in FSEG_NOT_FULL list of segment inode + @return error code */ + dberr_t initialize_segment_prepare() noexcept + { + dberr_t err= free_extent_prepare(); + if (err) return err; + + buf_block_t *fseg_not_full_last= nullptr; + err= flst::append_prepare(*m_iblock, + 
uint16_t(m_ioffset + FSEG_NOT_FULL), + m_free_limit, m_mtr, &fseg_not_full_last); + if (err == DB_SUCCESS) + err= save_old_page(fseg_not_full_last); + return err; + } + + /** This function does the following + 1) Allocating the free extent + 2) Appending the extent to FSEG_NOT_FULL list in segment inode + 3) Mark the extent state as XDES_FSEG */ + void initialize_segment_complete() noexcept + { + free_extent_complete(); + flst::append_complete(m_iblock, + uint16_t(m_ioffset + FSEG_NOT_FULL), + m_new_xdes, m_xoffset, m_mtr); + + /* Update the FSEG_NOT_FULL_N_USED in inode */ + byte *p_not_full= m_inode + FSEG_NOT_FULL_N_USED; + m_mtr->write<4>(*m_iblock, p_not_full, + mach_read_from_4(p_not_full) + 1); + xdes_set_state(*m_new_xdes, m_new_descr, XDES_FSEG, m_mtr); + m_mtr->write<8,mtr_t::MAYBE_NOP>(*m_new_xdes, + m_new_descr + XDES_ID, + m_seg_id); + xdes_set_free(*m_new_xdes, m_new_descr, + m_new_block->page.id().page_no() % m_extent_size, + m_mtr); + } + + /** Prepare the steps for + 1) Allocating the free extent + 2) Adding the extent to FSP_FREE_FRAG list by validating + the last extent descriptor in FSP_FREE_FRAG list of FSP_HEADER page + @return error code */ + dberr_t initialize_free_frag_prepare() noexcept + { + dberr_t err= free_extent_prepare(); + if (err) return err; + + buf_block_t *fsp_free_frag_last= nullptr; + err= flst::append_prepare(*m_header_block, + FSP_HEADER_OFFSET + FSP_FREE_FRAG, + m_free_limit, m_mtr, &fsp_free_frag_last); + + if (err == DB_SUCCESS) + err= save_old_page(fsp_free_frag_last); + return err; + } + + /** This function does the following + 1) Allocating the free extent + 2) Appending the extent to FSP_FREE_FRAG list in FSP_HEADER page + 3) Mark the extent state as XDES_FREE_FRAG */ + void initialize_free_frag_complete() noexcept + { + free_extent_complete(); + flst::append_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FREE_FRAG, + m_new_xdes, m_xoffset, m_mtr); + + byte *n_frag_used= m_header_block->page.frame + + 
FSP_HEADER_OFFSET + FSP_FRAG_N_USED; + m_mtr->write<4>(*m_header_block, n_frag_used, + mach_read_from_4(n_frag_used) + 1); + + /* Allocate the extent state to FREE_FRAG & update FSP_FRAG_N_USED */ + xdes_set_state(*m_new_xdes, m_new_descr, XDES_FREE_FRAG, m_mtr); + xdes_set_free(*m_new_xdes, m_new_descr, + m_new_block->page.id().page_no() % m_extent_size, + m_mtr); + } + + /** Prepare the steps to + 1) Allocate a page from XDES_FSEG extent + 2) If the extent size is FSP_EXTENT_SIZE then + prepare the extent to move from FSEG_NOT_FULL to FSEG_FULL + list in segment inode by validating the last extent descriptor in + FSEG_FULL list and previous and next extent in FSEG_NOT_FULL list. + @return error code */ + dberr_t alloc_from_fseg_prepare() noexcept + { + uint32_t n_used= xdes_get_n_used(m_new_descr); + if (n_used < 1 || n_used >= m_extent_size) + return DB_CORRUPTION; + + if (n_used < m_extent_size) + return DB_SUCCESS; + + byte *lst= m_iblock->page.frame + uint16_t(m_ioffset + FSEG_NOT_FULL); + if (!mach_read_from_4(lst + FLST_LEN)) + return DB_CORRUPTION; + + buf_block_t *fseg_not_full_prev= nullptr; + buf_block_t *fseg_not_full_next= nullptr; + dberr_t err= flst::remove_prepare(*m_new_xdes, m_xoffset, + m_free_limit, m_mtr, + &fseg_not_full_prev, + &fseg_not_full_next); + if (err) return err; + + buf_block_t *fseg_full_last= nullptr; + err= flst::append_prepare(*m_iblock, + uint16_t(m_ioffset + FSEG_FULL), + m_free_limit, m_mtr, &fseg_full_last); + if (err == DB_SUCCESS) + { + err= save_old_page(fseg_not_full_prev); + if (err == DB_SUCCESS) + err= save_old_page(fseg_not_full_next); + if (err == DB_SUCCESS) + err= save_old_page(fseg_full_last); + } + return err; + } + + /** Does the following + 1) Complete the page allocation from file segment. 
+ 2) If the extent size is FSP_EXTENT_SIZE then + i) Remove the extent from FSEG_NOT_FULL list + ii) Add the extent to FSEG_FULL */ + void alloc_from_fseg_complete() noexcept + { + xdes_set_free(*m_new_xdes, m_new_descr, + m_new_block->page.id().page_no() % m_extent_size, + m_mtr); + + byte *p_not_full= m_inode + FSEG_NOT_FULL_N_USED; + uint32_t n_used_val= mach_read_from_4(p_not_full) + 1; + + if (xdes_get_n_used(m_new_descr) == m_extent_size) + { + n_used_val-= FSP_EXTENT_SIZE; + m_mtr->write<4>(*m_iblock, p_not_full, n_used_val); + flst::remove_complete(m_iblock, + uint16_t(m_ioffset + FSEG_NOT_FULL), + m_new_descr, m_mtr); + flst::append_complete(m_iblock, + uint16_t(m_ioffset + FSEG_FULL), + m_new_xdes, m_xoffset, m_mtr); + } + else + m_mtr->write<4>(*m_iblock, p_not_full, n_used_val); + } + + /** Prepare the steps to + 1) Allocate the page from free fragment extent. + 2) If the extent size is FSP_EXTENT_SIZE then prepare the + steps to move the extent from FSP_FREE_FRAG to FSP_FULL_FRAG + list by validating the next, previous extent descriptor of + current extent descriptor in FSP_FREE_FRAG list and + last extent descriptor in FSP_FULL_FRAG list + @return error code */ + dberr_t alloc_from_free_frag_prepare() noexcept + { + uint32_t n_used= xdes_get_n_used(m_new_descr); + if (n_used < 1 || n_used >= m_extent_size) + return DB_CORRUPTION; + + if (n_used < m_extent_size) + return DB_SUCCESS; + + byte *lst= m_header_block->page.frame + FSP_HEADER_OFFSET + FSP_FREE_FRAG; + if (!mach_read_from_4(lst + FLST_LEN)) + return DB_CORRUPTION; + + buf_block_t *fsp_free_frag_prev= nullptr; + buf_block_t *fsp_free_frag_next= nullptr; + dberr_t err= flst::remove_prepare(*m_new_xdes, m_xoffset, + m_free_limit, m_mtr, + &fsp_free_frag_prev, + &fsp_free_frag_next); + if (err) return err; + + buf_block_t *fsp_full_frag_last= nullptr; + err= flst::append_prepare(*m_header_block, + FSP_HEADER_OFFSET + FSP_FULL_FRAG, + m_free_limit, m_mtr, + &fsp_full_frag_last); + + if (err == 
DB_SUCCESS) + { + err= save_old_page(fsp_free_frag_prev); + if (err == DB_SUCCESS) + err= save_old_page(fsp_free_frag_next); + if (err == DB_SUCCESS) + err= save_old_page(fsp_full_frag_last); + } + return err; + } + + /** Does the following + 1) Allocate the page from fragment extent + 2) If the extent became full (the number of used pages is + the extent size) then + i) remove the extent descriptor from FSP_FREE_FRAG list + ii) mark the extent descriptor as XDES_FULL_FRAG + iii) Add the extent descriptor in FSP_FULL_FRAG list + 3) Update FSP_FRAG_N_USED in the tablespace header */ + void alloc_from_free_frag_complete() noexcept + { + xdes_set_free(*m_new_xdes, m_new_descr, + m_new_block->page.id().page_no() % m_extent_size, + m_mtr); + + byte *frag_n_used= m_header_block->page.frame + FSP_HEADER_OFFSET + + FSP_FRAG_N_USED; + uint32_t n_used_frag= mach_read_from_4(frag_n_used) + 1; + + if (xdes_get_n_used(m_new_descr) == m_extent_size) + { + n_used_frag-= FSP_EXTENT_SIZE; + m_mtr->write<4>(*m_header_block, frag_n_used, n_used_frag); + flst::remove_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FREE_FRAG, + m_new_descr, m_mtr); + + /* The descriptor state has to follow the list the extent is on; + free_from_frag_complete() does the reverse transition when + it moves an extent from FSP_FULL_FRAG back to FSP_FREE_FRAG */ + xdes_set_state(*m_new_xdes, m_new_descr, XDES_FULL_FRAG, m_mtr); + + flst::append_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FULL_FRAG, + m_new_xdes, m_xoffset, m_mtr); + } + else + m_mtr->write<4>(*m_header_block, frag_n_used, n_used_frag); + } + + /** Prepare the steps to free the page from fragment pages. 
+ 1) Check the page exist in segment fragment array + 2) If the extent descriptor is in XDES_FULL_FRAG then + prepare the steps to move the extent descriptor + from FSP_FULL_FRAG to FSP_FREE_FRAG list by validating + the FLST_PREV, FLST_NEXT of current extent descriptor + and FLST_LAST in FSP_FREE_FRAG list + 3) If the extent is about to empty then prepare the steps + to move the extent descriptor from FSP_FREE_FRAG to FSP_FREE list + by validating the FLST_PREV, FLST_NEXT of current extent + descriptor and FLST_LAST in FSP_FREE list + @return error code */ + dberr_t free_from_frag_prepare() noexcept + { + uint32_t n_arr_slots= m_extent_size / 2; + bool page_exist= false; + for (ulint i= 0; i < n_arr_slots; i++) + { + if (!memcmp(m_inode + FSEG_FRAG_ARR + i * FSEG_FRAG_SLOT_SIZE, + m_old_page_no, 4)) + { + page_exist= true; + break; + } + } + + if (!page_exist) return DB_CORRUPTION; + + buf_block_t *fsp_full_frag_prev= nullptr; + buf_block_t *fsp_full_frag_next= nullptr; + buf_block_t *fsp_free_frag_last= nullptr; + dberr_t err= DB_SUCCESS; + uint32_t n_used= xdes_get_n_used(m_old_descr); + + if (m_old_state == XDES_FULL_FRAG) + { + if (n_used != m_extent_size) + return DB_CORRUPTION; + + byte *lst= m_header_block->page.frame + FSP_HEADER_OFFSET + FSP_FULL_FRAG; + if (!mach_read_from_4(lst + FLST_LEN)) + return DB_CORRUPTION; + + err= flst::remove_prepare(*m_old_xdes, m_old_xoffset, m_free_limit, + m_mtr, &fsp_full_frag_prev, + &fsp_full_frag_next); + + if (err) return err; + + return flst::append_prepare(*m_header_block, + FSP_HEADER_OFFSET + FSP_FREE_FRAG, + m_free_limit, m_mtr, + &fsp_free_frag_last); + } + + if (n_used >= m_extent_size || n_used == 0) + return DB_CORRUPTION; + + buf_block_t *fsp_free_frag_prev= nullptr; + buf_block_t *fsp_free_frag_next= nullptr; + buf_block_t *fsp_free_last= nullptr; + + if (n_used == 1) + { + byte *lst= m_header_block->page.frame + FSP_HEADER_OFFSET + FSP_FREE_FRAG; + if (!mach_read_from_4(lst + FLST_LEN)) + return 
DB_CORRUPTION; + + err= flst::remove_prepare(*m_old_xdes, m_old_xoffset, m_free_limit, + m_mtr, &fsp_free_frag_prev, + &fsp_free_frag_next); + if (err) return err; + + return flst::append_prepare(*m_header_block, + FSP_HEADER_OFFSET + FSP_FREE, + m_free_limit, m_mtr, + &fsp_free_last); + } + return err; + } + + /** Complete the removal of page from XDES_FREE_FRAG + (or) XDES_FULL_FRAG list. + 1) If the extent is from FSP_FULL_FRAG then move the + extent descriptor from FSP_FULL_FRAG to FSP_FREE_FRAG + 2) If the extent is from FSP_FREE_FRAG and no pages + are used in that descriptor any more then move the extent + from FSP_FREE_FRAG to FSP_FREE and account for it in + the free_len counter of the system tablespace + The fragment array slot of the inode that holds the old page + number is reset and FSP_FRAG_N_USED is updated. */ + void free_from_frag_complete() noexcept + { + uint32_t old_page_no= mach_read_from_4(m_old_page_no); + m_mtr->free(*fil_system.sys_space, old_page_no); + xdes_set_free(*m_old_xdes, m_old_descr, + old_page_no % m_extent_size, m_mtr); + /* Number of used pages after the old page was released */ + uint32_t n_used= xdes_get_n_used(m_old_descr); + byte *frag_n_used= m_header_block->page.frame + FSP_HEADER_OFFSET + + FSP_FRAG_N_USED; + uint32_t n_frag_used= mach_read_from_4(frag_n_used) - 1; + + for (size_t i= 0, frag= m_ioffset + FSEG_FRAG_ARR; + i < m_extent_size / 2; i++, frag += FSEG_FRAG_SLOT_SIZE) + { + if (!memcmp(m_iblock->page.frame + frag, m_old_page_no, 4)) + { + m_mtr->memset(m_iblock, frag, 4, 0xff); + break; + } + } + + if (n_used == m_extent_size - 1) + { + flst::remove_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FULL_FRAG, + m_old_descr, m_mtr); + + xdes_set_state(*m_old_xdes, m_old_descr, XDES_FREE_FRAG, m_mtr); + + flst::append_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FREE_FRAG, + m_old_xdes, m_old_xoffset, m_mtr); + + n_frag_used += m_extent_size; + } + else if (n_used == 0) + { + flst::remove_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FREE_FRAG, + m_old_descr, m_mtr); + + xdes_set_state(*m_old_xdes, m_old_descr, XDES_FREE, m_mtr); + + flst::append_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FREE, + m_old_xdes, m_old_xoffset, m_mtr); + /* Keep the cached FSP_FREE length in sync, the same way + free_from_fseg_complete() and free_extent_complete() do */ + fil_system.sys_space->free_len++; + } + 
m_mtr->write<4>(*m_header_block, frag_n_used, n_frag_used); + } + + /** Prepare the removal of page from file segment + 1) If the number of used pages in extent descriptor is + FSP_EXTENT_SIZE then move the extent descriptor from + FSEG_FULL to FSEG_NOT_FULL list by validating the + FLST_PREV, FLST_NEXT of current extent descriptor + and last extent descriptor in FSEG_NOT_FULL list + 2) If the number of used pages in extent descriptor is 0 + then move the extent descriptor from FSEG_NOT_FULL to + FSP_FREE list by validating the FLST_PREV, FLST_NEXT + of current extent descriptor and last extent descriptor + in FSP_FREE list + @return error code */ + dberr_t free_from_fseg_prepare() noexcept + { + if (memcmp(m_old_descr, m_inode + FSEG_ID, 8)) + return DB_CORRUPTION; + + uint32_t n_used= xdes_get_n_used(m_old_descr); + if (n_used == 0 || n_used > m_extent_size) + return DB_CORRUPTION; + + buf_block_t *fseg_full_prev= nullptr; + buf_block_t *fseg_full_next= nullptr; + buf_block_t *fseg_not_full_last= nullptr; + buf_block_t *fseg_not_full_prev= nullptr; + buf_block_t *fseg_not_full_next= nullptr; + buf_block_t *fsp_free_last= nullptr; + + dberr_t err= DB_SUCCESS; + + if (n_used == m_extent_size) + { + byte *lst= m_iblock->page.frame + uint16_t(m_ioffset + FSEG_FULL); + if (!mach_read_from_4(lst + FLST_LEN)) + return DB_CORRUPTION; + + err= flst::remove_prepare(*m_old_xdes, m_old_xoffset, m_free_limit, + m_mtr, &fseg_full_prev, + &fseg_full_next); + if (err) return err; + + err= flst::append_prepare(*m_iblock, + uint16_t(FSEG_NOT_FULL + m_ioffset), + m_free_limit, m_mtr, + &fseg_not_full_last); + if (err) return err; + } + else + { + uint32_t not_full_n_used= + mach_read_from_4(m_inode + FSEG_NOT_FULL_N_USED); + if (!not_full_n_used) return DB_CORRUPTION; + } + + if (n_used == 1) + { + byte *lst= m_iblock->page.frame + uint16_t(m_ioffset + FSEG_NOT_FULL); + if (!mach_read_from_4(lst + FLST_LEN)) + return DB_CORRUPTION; + + err= flst::remove_prepare(*m_old_xdes, 
m_old_xoffset, m_free_limit, + m_mtr, &fseg_not_full_prev, + &fseg_not_full_next); + if (err) return err; + + err= flst::append_prepare(*m_header_block, + FSP_FREE + FSP_HEADER_OFFSET, + m_free_limit, m_mtr, &fsp_free_last); + } + return err; + } + + /** Complete the removal of page from file segment + 1) If the extent is from FSEG_FULL then move the + extent descriptor from FSEG_FULL to FSEG_NOT_FULL + 2) If the extent is from FSEG_NOT_FULL then move the + extent descriptor to FSP_FREE */ + void free_from_fseg_complete() noexcept + { + uint32_t n_used= xdes_get_n_used(m_old_descr); + uint32_t old_page_no= mach_read_from_4(m_old_page_no); + m_mtr->free(*fil_system.sys_space, old_page_no); + xdes_set_free(*m_old_xdes, m_old_descr, + old_page_no % m_extent_size, m_mtr); + + byte* p_not_full = m_inode + FSEG_NOT_FULL_N_USED; + uint32_t not_full_n_used = mach_read_from_4(p_not_full) - 1; + if (n_used == m_extent_size) + { + flst::remove_complete(m_iblock, uint16_t(m_ioffset + FSEG_FULL), + m_old_descr, m_mtr); + flst::append_complete(m_iblock, + uint16_t(m_ioffset + FSEG_NOT_FULL), + m_old_xdes, m_old_xoffset, m_mtr); + not_full_n_used += m_extent_size; + } + m_mtr->write<4>(*m_iblock, p_not_full, not_full_n_used); + + if (n_used == 1) + { + flst::remove_complete(m_iblock, + uint16_t(m_ioffset + FSEG_NOT_FULL), + m_old_descr, m_mtr); + + xdes_set_state(*m_old_xdes, m_old_descr, XDES_FREE, m_mtr); + flst::append_complete(m_header_block, + FSP_HEADER_OFFSET + FSP_FREE, + m_old_xdes, m_old_xoffset, m_mtr); + fil_system.sys_space->free_len++; + } + } +public: + PageOperator(buf_block_t *header_block, buf_block_t *iblock, + fseg_inode_t *inode, + uint32_t extent_size, byte* old_page_no, + mtr_t *mtr) : + m_header_block(header_block), + m_iblock(iblock), m_inode(inode), m_extent_size(extent_size), + m_mtr(mtr) + { + if (old_page_no) + memcpy(m_old_page_no, old_page_no, 4); + m_free_limit= mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + m_header_block->page.frame); + 
m_seg_id= mach_read_from_8(m_inode + FSEG_ID); + } + + ~PageOperator() + { + for (buf_block_t *old : m_old_pages) + if (old) + { + old->page.hash= nullptr; + buf_block_free(old); + } + } + + + /** Get allocated new block */ + buf_block_t* get_new_block() const noexcept { return m_new_block; } + + /** Prepare the new page allocation from the new given extent + @param new_extent starting page of new extent + @param segment segment allocation + @return error code */ + dberr_t prepare_new_page(uint32_t new_extent, bool segment) noexcept + { + dberr_t err= DB_SUCCESS; + uint32_t size= mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + m_header_block->page.frame); + if (new_extent >= size || new_extent >= m_free_limit) + return DB_CORRUPTION; + + uint32_t new_descr_page_no= xdes_calc_descriptor_page(0, new_extent); + m_new_xdes= fsp_get_latched_page(page_id_t{0, new_descr_page_no}, + m_mtr, &err); + if (!m_new_xdes) + return err; + + ut_ad(!m_new_block); + m_ioffset= uint16_t(m_inode - m_iblock->page.frame); + m_need_segment= segment; + m_xoffset= uint16_t(xdes_calc_descriptor_index(0, new_extent) * XDES_SIZE + + XDES_ARR_OFFSET + XDES_FLST_NODE); + m_new_descr= m_new_xdes->page.frame + m_xoffset - XDES_FLST_NODE; + m_new_state= uint32_t(xdes_get_state(m_new_descr)); + uint32_t new_page= 0; + + /* Allocate the new extent and initialize the extent state + with XDES_FSEG/XDES_FREE_FRAG */ + if (m_new_state == XDES_FREE) + { + if (segment) err= initialize_segment_prepare(); + else err= initialize_free_frag_prepare(); + + if (err) return err; +new_page: + new_page= xdes_find_free(m_new_descr); + if (new_page == FIL_NULL) + return DB_CORRUPTION; + + new_page+= new_extent; + m_new_block= fsp_page_create(fil_system.sys_space, new_page, m_mtr); + err= save_old_page(m_header_block); + if (err == DB_SUCCESS) + err= save_old_page(m_iblock); + if (err == DB_SUCCESS) + err= save_old_page(m_new_xdes); + if (err == DB_SUCCESS) + err= save_old_page(m_new_block); + return err; + } + + 
uint32_t n_used= xdes_get_n_used(m_new_descr); + if (n_used == 0 || n_used >= m_extent_size) + return DB_CORRUPTION; + + /* Allocate the page from file segment */ + if (m_seg_id != FIL_NULL && m_new_state == XDES_FSEG && + mach_read_from_8(m_new_descr + XDES_ID) == m_seg_id) + err= alloc_from_fseg_prepare(); + /* Allocate the page from free frag */ + else if (m_new_state == XDES_FREE_FRAG || m_new_state == XDES_FULL_FRAG) + err= alloc_from_free_frag_prepare(); + else return DB_CORRUPTION; + + if (err) return err; + goto new_page; + } + + /** Complete the page allocation from FREE extent descriptor + or XDES_FSEG/XDES_FREE_FRAG extent list */ + void complete_new_page() noexcept + { + if (m_new_state == XDES_FREE) + { + if (m_need_segment) + return initialize_segment_complete(); + return initialize_free_frag_complete(); + } + if (m_new_state == XDES_FSEG) + return alloc_from_fseg_complete(); + return alloc_from_free_frag_complete(); + } + + /** Assign the fragment slot of the index node. + This step should be done after removing the old page + because there is a possiblity that FRAGMENT ARRAY + could be full. */ + void assign_frag_slot() noexcept + { + if ((!m_need_segment && m_new_state == XDES_FREE) || + m_new_state == XDES_FULL_FRAG || + m_new_state == XDES_FREE_FRAG) + fseg_set_nth_frag_page_no(m_inode, m_iblock, + fseg_find_free_frag_page_slot(m_inode), + m_new_block->page.id().page_no(), m_mtr); + } + + /** Restore the page modified during page allocation */ + void restore_old_pages() noexcept + { + for (buf_block_t *old : m_old_pages) + if (old) + memcpy_aligned( + old->page.hash->frame, old->page.frame, srv_page_size); + } + + /** Prepare the steps to remove the page from file segment + (or) fragment extent. 
+ @return error code */ + dberr_t prepare_old_page() noexcept + { + uint32_t old_page_no= mach_read_from_4(m_old_page_no); + uint32_t old_descr_page_no= + xdes_calc_descriptor_page(0, old_page_no); + dberr_t err= DB_SUCCESS; + m_old_xdes= fsp_get_latched_page(page_id_t{0, old_descr_page_no}, + m_mtr, &err); + if (!m_old_xdes) + return err; + + m_old_xoffset= + uint16_t(xdes_calc_descriptor_index(0, old_page_no) * XDES_SIZE + + XDES_ARR_OFFSET + XDES_FLST_NODE); + + m_old_descr= m_old_xdes->page.frame + m_old_xoffset - XDES_FLST_NODE; + m_old_state= uint32_t(xdes_get_state(m_old_descr)); + if (m_old_state == XDES_FREE) + return DB_CORRUPTION; + + if (xdes_is_free(m_old_descr, old_page_no & (m_extent_size -1))) + return DB_CORRUPTION; + + m_ioffset= uint16_t(m_inode - m_iblock->page.frame); + return m_old_state == XDES_FSEG + ? free_from_fseg_prepare() + : free_from_frag_prepare(); + } + + /** Complete the removal of page operation */ + void complete_free_old_page() noexcept + { + return m_old_state == XDES_FSEG + ? free_from_fseg_complete() + : free_from_frag_complete(); + } +}; + + +class IndexDefragmenter final +{ + /** Parent block and its associate offset where + we store the child page number. 
This is stored + in the form of */ + std::unordered_map m_parent_pages; + + dict_index_t &m_index; + + buf_block_t *m_root; + /** Iterate through the page and map the child_page_no + with the parent page and their associate offset + in m_parent_pages + @param block block to be traversed */ + dberr_t get_child_pages(buf_block_t *block) noexcept; + + /** Get the first block for the given level + @param level level + @param mtr mini-transaction + @param cur_page_no first page number for the given level + @return error code or DB_SUCCESS */ + dberr_t get_level_block(uint16_t level, mtr_t *mtr, + uint32_t *cur_page_no) noexcept; + + /** Defragment the level of the index + @param level level to be defragmented + @param mtr mini-transaction + @param space_defrag space defragmenter information + and also responsible for allocating new + segment or page from tablespace + @return error code or DB_SUCCESS */ + dberr_t defragment_level(uint16_t level, mtr_t *mtr, + SpaceDefragmenter *space_defrag) noexcept; + +public: + IndexDefragmenter(dict_index_t &index): m_index(index) {} + + /** Defragment the index with the help of space defragmenter. + 1) Iterate through each level of the index + 2) Find out what are the pages/segment + to be modified for the index. 
+ 3) Allocate the page from the new segment/extent + 4) Copy the to be changed page content to new page + 5) Change the associative pages in the tree with + new page(left, right, parent block) + 6) Do step (4), (5) within single mini-transaction + and commit the mini-transaction + @return error code or DB_SUCCESS */ + dberr_t defragment(SpaceDefragmenter *space_defrag) noexcept; +}; + +class SpaceDefragmenter final +{ + /** Extent is already allocated for defragmentation */ + static constexpr uint32_t XDES_USED= ~0U; + /** Store the extent information in the tablespace */ + std::map m_extent_info; + /** Map of last used extent with early unused extent within + the tablespace */ + std::map m_extent_map; + + /** Collect the extent information from tablespace */ + dberr_t extract_extent_state() noexcept + { + mtr_t mtr; + dberr_t err= DB_SUCCESS; + uint32_t last_descr_page_no= 0; + fil_space_t *space= fil_system.sys_space; + mtr.start(); + mtr.x_lock_space(space); + buf_block_t *last_descr= buf_page_get_gen(page_id_t{space->id, 0}, 0, + RW_S_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, &mtr, + &err); + if (!last_descr) + { +func_exit: + mtr.commit(); + return err; + } + + for (uint32_t xdes_n= 0; xdes_n < space->free_limit; + xdes_n+= m_extent_size) + { + /* Ignore doublewrite buffer extent */ + if (buf_dblwr.is_inside(xdes_n)) + continue; + uint32_t descr_page_no= + xdes_calc_descriptor_page(space->id, xdes_n); + if (descr_page_no != last_descr_page_no) + { + last_descr= buf_page_get_gen(page_id_t{space->id, xdes_n}, + 0, RW_S_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, &mtr, + &err); + if (!last_descr) + goto func_exit; + } + xdes_t *descr= XDES_ARR_OFFSET + XDES_SIZE * + xdes_calc_descriptor_index(0, xdes_n) + last_descr->page.frame; + last_descr_page_no= descr_page_no; + /* Ignore the extent descriptor extent */ + if (xdes_n % srv_page_size == 0 && xdes_get_n_used(descr) == 2) + continue; + m_extent_info[xdes_n]= xdes_get_state(descr); + } + goto func_exit; + } + + /** 
Find the earlier free extent for the given used extent + @param max_limit Find the extent below max limit extent + @return value + @retval FIL_NULL if there is no extent */ + uint32_t find_free_extent(uint32_t max_limit) noexcept + { + for (auto &extent_info : m_extent_info) + { + /* Give up once an entry at or beyond max_limit is reached */ + if (max_limit <= extent_info.first) + return FIL_NULL; + + if (extent_info.second == XDES_FREE) + { + /* Mark the extent as used, so that it is not handed out again */ + extent_info.second = XDES_USED; + return extent_info.first; + } + } + return FIL_NULL; + } + + /** Defragment one index + @param index index to be defragmented + @return error code from IndexDefragmenter::defragment() */ + dberr_t defragment_index(dict_index_t &index) noexcept + { + IndexDefragmenter index_defrag(index); + return index_defrag.defragment(this); + } + + /** Validate and defragment every index of a table, + stopping and logging at the first failure + @param table system table, or nullptr to do nothing + @return error code + @retval DB_SUCCESS if all indexes were processed */ + dberr_t defragment_table(const dict_table_t *table) noexcept + { + /* NOTE(review): the optional tables (SYS_FOREIGN, SYS_FOREIGN_COLS, + SYS_VIRTUAL) are presumably nullptr when they do not exist; confirm + against dict_sys_t::load_sys_tables() */ + if (!table) + return DB_SUCCESS; + + for (dict_index_t *index= dict_table_get_first_index(table); + index; index= dict_table_get_next_index(index)) + { + dberr_t err= fseg_validate(fil_system.sys_space, index); + if (err == DB_SUCCESS) + err= defragment_index(*index); + + if (err) + { + sql_print_error("InnoDB: Defragmentation of %s in %s failed: %s", + index->name(), table->name.m_name, ut_strerr(err)); + return err; + } + } + return DB_SUCCESS; + } +public: + const uint32_t m_extent_size; + + SpaceDefragmenter() noexcept : m_extent_size(FSP_EXTENT_SIZE) {} + + /** Find the new extent for the existing last used extent + Iterate the tablespace from last and find out the free + extent in the beginning of the tablespace + @return error code from extract_extent_state() + @retval DB_SUCCESS if at least one extent can be relocated + @retval DB_SUCCESS_LOCKED_REC if no extent can be relocated */ + dberr_t find_new_extents() noexcept + { + dberr_t err= extract_extent_state(); + if (err) return err; + + uint32_t free_limit= fil_system.sys_space->free_limit; + uint32_t fixed_size= srv_sys_space.get_min_size(); + while (free_limit > fixed_size) + { + uint32_t state= m_extent_info[free_limit]; + + switch (state) { + case XDES_USED: + goto func_exit; + case XDES_FREE: + goto prev_extent; + case XDES_FSEG: + case XDES_FULL_FRAG: + case XDES_FREE_FRAG: + uint32_t dest= find_free_extent(free_limit); + if (dest
== FIL_NULL) + goto func_exit; + m_extent_map[free_limit]= dest; + break; + } +prev_extent: + free_limit-= FSP_EXTENT_SIZE; + } +func_exit: + if (m_extent_map.empty()) + return DB_SUCCESS_LOCKED_REC; + + sql_print_information("InnoDB: System tablespace defragmentation " + "process starts"); + sql_print_information("InnoDB: Moving the data from extents %" + PRIu32 " through %" PRIu32, + m_extent_map.begin()->first, + m_extent_map.rbegin()->first); + return DB_SUCCESS; + } + + /** Defragment the system tables + @return error code of the first table that failed + @retval DB_SUCCESS if all tables were defragmented */ + dberr_t defragment_system_tables() noexcept + { + dberr_t err= defragment_table(dict_sys.sys_tables); + if (err == DB_SUCCESS) + err= defragment_table(dict_sys.sys_columns); + if (err == DB_SUCCESS) + err= defragment_table(dict_sys.sys_indexes); + if (err == DB_SUCCESS) + err= defragment_table(dict_sys.sys_fields); + if (err == DB_SUCCESS) + err= defragment_table(dict_sys.sys_foreign); + if (err == DB_SUCCESS) + err= defragment_table(dict_sys.sys_foreign_cols); + if (err == DB_SUCCESS) + err= defragment_table(dict_sys.sys_virtual); + + if (err == DB_SUCCESS) + sql_print_information("InnoDB: Defragmentation of system " + "tablespace is successful"); + return err; + } + + /** @return extent which replaces the later extent, + or the same extent if no replacement exists */ + uint32_t get_new_extent(uint32_t old_extent) const noexcept + { + auto it= m_extent_map.find(old_extent); + if (it != m_extent_map.end()) + return it->second; + return old_extent; + } + + /** @return state for the given extent */ + uint32_t get_state(uint32_t extent) noexcept + { + return m_extent_info[extent]; + } +}; + +/** Remember in m_parent_pages the parent of each child page that +is referenced by the records of a non-leaf page. +@param block non-leaf index page +@return error code +@retval DB_CORRUPTION if a record of the page is missing or malformed */ +dberr_t IndexDefragmenter::get_child_pages(buf_block_t *block) noexcept +{ + const byte *page= block->page.frame; + const rec_t *rec= page_rec_get_next_low(page + PAGE_OLD_INFIMUM, false); + while (rec != page + PAGE_OLD_SUPREMUM) + { + /* page_rec_get_next_low() may return nullptr; the same + check is made in get_level_block() */ + if (!rec) + return DB_CORRUPTION; + + ulint len; + ulint offset= rec_get_nth_field_offs_old(rec, + rec_get_n_fields_old(rec) - 1, + &len); + if (len != 4) + return
DB_CORRUPTION; + + if (offset >= srv_page_size) + return DB_CORRUPTION; + + const byte *field= rec + offset; + /* m_parent_pages[child_page_no] = + upper 32 bits: offset of the child page number in the parent page + lower 32 bits: parent page number */ + m_parent_pages[mach_read_from_4(field)]= + uint64_t(page_offset(field)) << 32 | block->page.id().page_no(); + rec= page_rec_get_next_low(rec, false); + } + return DB_SUCCESS; +} + +/** Descend from the index root, following the child page number in +the last field of the first record after the infimum of each page, +until the requested level is reached. +@param level requested B-tree level +@param mtr mini-transaction +@param cur_page_no page number found at the requested level +@return error code +@retval DB_CORRUPTION if the levels do not decrease by one, or +a node pointer is missing or malformed */ +dberr_t IndexDefragmenter::get_level_block(uint16_t level, mtr_t *mtr, + uint32_t *cur_page_no) noexcept +{ + uint32_t child_page_no= m_index.page; + dberr_t err= DB_SUCCESS; + uint16_t prev_level= UINT16_MAX; + while (1) + { + buf_block_t *block= fsp_get_latched_page(page_id_t{0, child_page_no}, + mtr, &err); + if (!block) + return err; + + page_t *page= buf_block_get_frame(block); + uint16_t cur_level= btr_page_get_level(page); + if (cur_level == level) + break; + + /* The level must decrease by exactly one on each step */ + if (prev_level == UINT16_MAX) + prev_level= cur_level; + else if (prev_level != cur_level + 1) + return DB_CORRUPTION; + + const rec_t *rec= page_rec_get_next_low(page + PAGE_OLD_INFIMUM, false); + if (rec && rec != page + PAGE_OLD_SUPREMUM) + { + ulint len; + rec+= rec_get_nth_field_offs_old(rec, rec_get_n_fields_old(rec) - 1, + &len); + if (len != 4 || rec + len - page > page_header_get_field(page, + PAGE_HEAP_TOP)) + return DB_CORRUPTION; + child_page_no= mach_read_from_4(rec); + } + else + return DB_CORRUPTION; + /* The child is at the requested level; the caller will fetch it */ + if (cur_level == level + 1) + break; + prev_level= cur_level; + } + *cur_page_no= child_page_no; + return err; +} + +/** Walk one level of the index, from the page found by +get_level_block() along the FIL_PAGE_NEXT links, and move each page +that resides in an extent to be replaced into the replacement extent. +@param level B-tree level to process +@param mtr mini-transaction; committed and restarted between pages +@param space_defrag extent states and extent replacement map +@return error code */ +dberr_t IndexDefragmenter::defragment_level( + uint16_t level, + mtr_t *mtr, + SpaceDefragmenter *space_defrag) noexcept +{ + uint32_t cur_page_no= FIL_NULL; + dberr_t err= get_level_block(level, mtr, &cur_page_no); + if (err) + return err; + + fil_space_t *const space= fil_system.sys_space; + uint32_t extent_size= space_defrag->m_extent_size; + + buf_block_t *block= fsp_get_latched_page(page_id_t{0, cur_page_no}, + mtr, &err); + if (!block) + return err; + +
for (;;) + { + page_t *page= buf_block_get_frame(block); + uint32_t next_page_no= btr_page_get_next(page); + /* First page number of the extent that contains cur_page_no */ + uint32_t cur_extent= (cur_page_no / extent_size) * extent_size; + uint32_t old_state= space_defrag->get_state(cur_extent); + + if (old_state == XDES_FREE) + { +fetch_next_page: + if (next_page_no == FIL_NULL) + break; + mtr->commit(); + cur_page_no= next_page_no; + + mtr->start(); + mtr->x_lock_space(space); + block= fsp_get_latched_page(page_id_t{0, cur_page_no}, + mtr, &err); + if (!block) + return err; + continue; + } + + uint32_t new_extent= space_defrag->get_new_extent(cur_extent); + /* The extent of this page is not going to be replaced */ + if (new_extent == cur_extent) + { + if (level) + { + /* Remember the child page numbers and their offsets + that are stored in the records of this parent block */ + err= get_child_pages(block); + if (err) return err; + } + goto fetch_next_page; + } + + buf_block_t *header_block= + fsp_get_latched_page(page_id_t{0, 0}, mtr, &err); + if (!header_block) + return err; + + const fseg_header_t *seg_header= m_root->page.frame + + (level ?
PAGE_HEADER + PAGE_BTR_SEG_TOP + : PAGE_HEADER + PAGE_BTR_SEG_LEAF); + + buf_block_t *iblock; + fseg_inode_t *inode= fseg_inode_try_get(seg_header, 0, 0, mtr, + &iblock, &err); + if (!inode) + return err; + + auto parent_it= m_parent_pages.find(cur_page_no); + if (parent_it == m_parent_pages.end()) + { + err= DB_CORRUPTION; + return err; + } + + /* Unpack the value that was stored by get_child_pages() */ + uint32_t parent_page_no= uint32_t(parent_it->second); + + uint32_t parent_offset= uint32_t(parent_it->second >> 32); + + if (parent_offset >= srv_page_size - FIL_PAGE_DATA_END) + { + err= DB_CORRUPTION; + return err; + } + + PageOperator operation(header_block, iblock, inode, extent_size, + page + FIL_PAGE_OFFSET, mtr); + + AssociatedPages related_pages(block, mtr); + + err= operation.prepare_new_page(new_extent, old_state == XDES_FSEG); + + DBUG_EXECUTE_IF("allocation_prepare_fail", err= DB_CORRUPTION;); + if (err) + { +err_exit: + operation.restore_old_pages(); + mtr->discard_modifications(); + return err; + } + + err= related_pages.prepare(parent_page_no); + DBUG_EXECUTE_IF("relation_page_prepare_fail", err= DB_CORRUPTION;); + + if (err) goto err_exit; + + operation.complete_new_page(); + + /* After allocating the new page, try to prepare the steps + of the page removal. There is a possibility that the last + block of the FSEG_NOT_FULL, FSP_FREE_FRAG or FSP_FREE list + could have changed while allocating the new block. */ + err= operation.prepare_old_page(); + + DBUG_EXECUTE_IF("remover_prepare_fail", err= DB_CORRUPTION;); + if (err) goto err_exit; + + /* Copy the data from old block to new block */ + buf_block_t *new_block= operation.get_new_block(); + uint32_t new_page_no= new_block->page.id().page_no(); + /* Copy FIL_PAGE_PREV, FIL_PAGE_NEXT */ + mtr->memcpy(*new_block, + new_block->page.frame + FIL_PAGE_PREV, + block->page.frame + FIL_PAGE_PREV, + page_has_next(block->page.frame) ?
8 : 4); + /* Copy the rest of the page, from FIL_PAGE_TYPE onwards, + except for its last 8 bytes */ + mtr->memcpy(*new_block, new_block->page.frame + FIL_PAGE_TYPE, + block->page.frame + FIL_PAGE_TYPE, + srv_page_size - FIL_PAGE_TYPE - 8); + + /* Assign the new block page number in left, right + and parent block */ + related_pages.complete(new_page_no, parent_offset); + + /* Complete the page free operation */ + operation.complete_free_old_page(); + /* Add the new page in inode fragment array */ + operation.assign_frag_slot(); + + if (level) + { + err= get_child_pages(new_block); + if (err) return err; + } + goto fetch_next_page; + } + + /* Validate the tablespace and the index segments after the level */ + ut_a(!fsp_tablespace_validate(space, mtr)); + ut_a(!fseg_validate_low(space, &m_index, mtr)); + if (level > 1) + { + mtr->commit(); + mtr->start(); + mtr->x_lock_space(space); + } + return DB_SUCCESS; +} + +/** Defragment the index level by level, from the level of the root +page down to the leaf level 0, while holding the index lock and the +system tablespace latch in exclusive mode. +@param space_defrag extent states and extent replacement map +@return error code */ +dberr_t IndexDefragmenter::defragment(SpaceDefragmenter *space_defrag) noexcept +{ + mtr_t mtr; + mtr.start(); + dberr_t err= DB_SUCCESS; + m_index.lock.x_lock(SRW_LOCK_CALL); + fil_space_t *const space= fil_system.sys_space; + mtr.x_lock_space(space); + m_root= btr_root_block_get(&m_index, RW_S_LATCH, &mtr, &err); + if (!m_root) + { + mtr.commit(); + m_index.lock.x_unlock(); + return err; + } + + /* Presumably this keeps m_root usable after the mini-transaction + lets go of the root page; it is paired with unfix() below */ + m_root->page.fix(); + mtr.release_last_page(); + uint16_t level= btr_page_get_level(m_root->page.frame); + while (1) + { + err= defragment_level(level, &mtr, space_defrag); + DBUG_EXECUTE_IF("fail_after_level_defragment", + if (m_index.table->id == 2 && level == 1) + err= DB_CORRUPTION;); + if (err || !level) + break; + level--; + } + ut_ad(err == DB_SUCCESS || !mtr.has_modifications()); + mtr.commit(); + m_index.lock.x_unlock(); + m_root->page.unfix(); + return err; +} + +/** Check whether any user table exists in the system tablespace +@retval DB_SUCCESS_LOCKED_REC if a user table exists +@retval DB_SUCCESS if no user table exists +@retval DB_CORRUPTION if a SYS_TABLES record is delete-marked or malformed */ +static dberr_t user_tables_exists() noexcept +{ + mtr_t mtr; + btr_pcur_t pcur; + dberr_t err= DB_SUCCESS; + mtr.start(); + for (const rec_t *rec=
dict_startscan_system(&pcur, &mtr, + dict_sys.sys_tables); + rec; rec= dict_getnext_system(&pcur, &mtr)) + { + const byte *field= nullptr; + ulint len= 0; + if (rec_get_deleted_flag(rec, 0)) + { +corrupt: + sql_print_error("InnoDB: Encountered corrupted record in SYS_TABLES"); + err= DB_CORRUPTION; + /* Close the cursor, like on the other early exit from the scan */ + btr_pcur_close(&pcur); + goto func_exit; + } + field= rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__SPACE, &len); + if (len != 4) + goto corrupt; + /* Only the tables whose SPACE is 0 are of interest */ + if (mach_read_from_4(field) != 0) + continue; + field= rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__ID, &len); + if (len != 8) + goto corrupt; + if (!dict_sys.is_sys_table(mach_read_from_8(field))) + { + err= DB_SUCCESS_LOCKED_REC; + btr_pcur_close(&pcur); + goto func_exit; + } + } +func_exit: + mtr.commit(); + return err; +} + +/** Move the pages of InnoDB system tables to earlier extents of the +system tablespace. Nothing is moved if a user table exists in the +system tablespace, or if no extent can be relocated. +@return error code +@retval DB_SUCCESS on success, or if there was nothing to do */ +dberr_t fil_space_t::defragment() noexcept +{ + ut_ad(this == fil_system.sys_space); + dberr_t err= user_tables_exists(); + if (err == DB_SUCCESS_LOCKED_REC) + { + sql_print_information( + "InnoDB: User table exists in the system tablespace."
+ " Please try to move the data from system tablespace " + "to separate tablespace before defragment the " + "system tablespace."); + return DB_SUCCESS; + } else if (err) { return err; } + + SpaceDefragmenter defragmenter; + err= defragmenter.find_new_extents(); + /* No extent can be relocated */ + if (err == DB_SUCCESS_LOCKED_REC) + return DB_SUCCESS; + + if (err == DB_SUCCESS) + err= defragmenter.defragment_system_tables(); + return err; +} + void fsp_system_tablespace_truncate(bool shutdown) { ut_ad(!purge_sys.enabled()); @@ -4030,6 +5769,16 @@ void fsp_system_tablespace_truncate(bool shutdown) return; } + if (!shutdown) + { + err= space->defragment(); + if (err) + { + srv_sys_space.set_shrink_fail(); + return; + } + } + mtr_t mtr; mtr.start(); mtr.x_lock_space(space); @@ -4064,7 +5813,11 @@ err_exit: fil_system.set_use_doublewrite(false); buf_block_t *header= nullptr; - ut_ad(!fsp_tablespace_validate(space)); +#ifdef UNIV_DEBUG + mtr.start(); + ut_ad(!fsp_tablespace_validate(space, &mtr)); + mtr.commit(); +#endif /* UNIV_DEBUG */ mtr.start(); mtr.x_lock_space(space); @@ -4090,7 +5843,7 @@ err_exit: UINT32PF " to " UINT32PF " pages", space->size, last_used_extent); - header= fsp_get_latched_xdes_page( + header= fsp_get_latched_page( page_id_t(space->id, 0), &mtr, &err); if (!header) goto err_exit; @@ -4132,7 +5885,11 @@ mtr_max: old_xdes_list.restore(&mtr); mtr.discard_modifications(); mtr.commit(); - ut_ad(!fsp_tablespace_validate(space)); +#ifdef UNIV_DEBUG + mtr.start(); + ut_ad(!fsp_tablespace_validate(space, &mtr)); + mtr.commit(); +#endif /* UNIV_DEBUG */ sql_print_error( "InnoDB: Cannot shrink the system tablespace " "because the mini-transaction log size (%zu bytes) " @@ -4206,7 +5963,7 @@ func_exit: UINT32PF " to " UINT32PF " pages", space->size, last_used_extent); - buf_block_t *header= fsp_get_latched_xdes_page( + buf_block_t *header= fsp_get_latched_page( page_id_t(space->id, 0), &mtr, &err); if (!header) goto func_exit; diff --git
a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc index ff876801242..11d424ae303 100644 --- a/storage/innobase/fut/fut0lst.cc +++ b/storage/innobase/fut/fut0lst.cc @@ -409,45 +409,56 @@ dberr_t flst_remove(buf_block_t *base, uint16_t boffset, return err; } -#ifdef UNIV_DEBUG /** Validate a file-based list. */ -void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr) +dberr_t flst_validate(const buf_block_t *base, uint16_t boffset, + mtr_t *mtr) noexcept { - ut_ad(boffset < base->physical_size()); + if (boffset >= base->physical_size()) + return DB_CORRUPTION; + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); const uint32_t len= flst_get_len(base->page.frame + boffset); fil_addr_t addr= flst_get_first(base->page.frame + boffset); + dberr_t err= DB_SUCCESS; for (uint32_t i= len; i--; ) { - ut_ad(addr.boffset >= FIL_PAGE_DATA); - ut_ad(addr.boffset < base->physical_size() - FIL_PAGE_DATA_END); + if (addr.boffset < FIL_PAGE_DATA || + addr.boffset >= base->physical_size() - FIL_PAGE_DATA_END) + return DB_CORRUPTION; const buf_block_t *b= buf_page_get_gen(page_id_t(base->page.id().space(), addr.page), - base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, mtr); - ut_ad(b); + base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, mtr, + &err); + if (!b) + return err; addr= flst_get_next_addr(b->page.frame + addr.boffset); mtr->release_last_page(); } - ut_ad(addr.page == FIL_NULL); + if (addr.page != FIL_NULL) + return DB_CORRUPTION; addr= flst_get_last(base->page.frame + boffset); for (uint32_t i= len; i--; ) { - ut_ad(addr.boffset >= FIL_PAGE_DATA); - ut_ad(addr.boffset < base->physical_size() - FIL_PAGE_DATA_END); + if (addr.boffset < FIL_PAGE_DATA || + addr.boffset >= base->physical_size() - FIL_PAGE_DATA_END) + return DB_CORRUPTION; const buf_block_t *b= buf_page_get_gen(page_id_t(base->page.id().space(), addr.page), - base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, mtr); - ut_ad(b); + base->zip_size(),
RW_SX_LATCH, nullptr, BUF_GET, mtr, + &err); + if (!b) + return err; addr= flst_get_prev_addr(b->page.frame + addr.boffset); mtr->release_last_page(); } - ut_ad(addr.page == FIL_NULL); + if (addr.page != FIL_NULL) + return DB_CORRUPTION; + return err; } -#endif diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 751f8744052..afe3f3eedcc 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -1504,6 +1504,17 @@ public: bool load_sys_tables() noexcept; /** Create or check system tables on startup */ dberr_t create_or_check_sys_tables() noexcept; + + /** @param table_id table identifier + @return whether it is the ID of an InnoDB system table + (an optional system table that is not loaded never matches) */ + bool is_sys_table(table_id_t table_id) const noexcept + { + return (table_id > 0 && table_id <= 4) || + (sys_foreign && table_id == sys_foreign->id) || + (sys_foreign_cols && table_id == sys_foreign_cols->id) || + (sys_virtual && table_id == sys_virtual->id); + } }; /** the data dictionary cache */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index cd1aa8ca336..4e0744b2207 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1021,6 +1021,12 @@ public: @param shutdown called during slow shutdown @return error code */ dberr_t garbage_collect(bool shutdown); + + /** Move InnoDB system tables closer to the start of + the tablespace. + @return error code + @retval DB_SUCCESS on successful operation */ + dberr_t defragment() noexcept; private: /** @return whether the file is usable for io() */ ATTRIBUTE_COLD bool prepare_acquired() noexcept; diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h index dc8806a5c74..4a701fa0d8b 100644 --- a/storage/innobase/include/fut0lst.h +++ b/storage/innobase/include/fut0lst.h @@ -161,9 +161,8 @@ inline fil_addr_t flst_get_prev_addr(const flst_node_t *node) void flst_write_addr(const buf_block_t &block, byte *faddr, uint32_t page, uint16_t boffset, mtr_t *mtr); -# ifdef UNIV_DEBUG /** Validate a file-based list.
*/ -void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr); -# endif +dberr_t flst_validate(const buf_block_t *base, uint16_t boffset, + mtr_t *mtr) noexcept; #endif /* !UNIV_INNOCHECKSUM */