diff --git a/mysql-test/suite/perfschema/r/sxlock_func.result b/mysql-test/suite/perfschema/r/sxlock_func.result
index 21ecd5eee0b..04db3a42dd0 100644
--- a/mysql-test/suite/perfschema/r/sxlock_func.result
+++ b/mysql-test/suite/perfschema/r/sxlock_func.result
@@ -17,7 +17,6 @@ wait/synch/sxlock/innodb/dict_table_stats
 wait/synch/sxlock/innodb/fil_space_latch
 wait/synch/sxlock/innodb/fts_cache_init_rw_lock
 wait/synch/sxlock/innodb/fts_cache_rw_lock
-wait/synch/sxlock/innodb/hash_table_locks
 wait/synch/sxlock/innodb/index_online_log
 wait/synch/sxlock/innodb/index_tree_rw_lock
 wait/synch/sxlock/innodb/trx_i_s_cache_lock
diff --git a/mysql-test/suite/perfschema/t/show_sanity.test b/mysql-test/suite/perfschema/t/show_sanity.test
index 029e1d9033e..f6c43c88111 100644
--- a/mysql-test/suite/perfschema/t/show_sanity.test
+++ b/mysql-test/suite/perfschema/t/show_sanity.test
@@ -486,7 +486,6 @@ insert into test.sanity values
   ("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_OPTIMIZE_FULLTEXT_ONLY"),
   ("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_CLEANERS"),
   ("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_CLEANER_DISABLED_DEBUG"),
-  ("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_HASH_LOCKS"),
   ("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_SIZE"),
   ("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PRINT_ALL_DEADLOCKS"),
   ("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PURGE_BATCH_SIZE"),
diff --git a/mysql-test/suite/sys_vars/r/innodb_page_hash_locks_basic.result b/mysql-test/suite/sys_vars/r/innodb_page_hash_locks_basic.result
deleted file mode 100644
index 8368c863212..00000000000
--- a/mysql-test/suite/sys_vars/r/innodb_page_hash_locks_basic.result
+++ /dev/null
@@ -1,24 +0,0 @@
-select @@global.innodb_page_hash_locks between 1 and 1024;
-@@global.innodb_page_hash_locks between 1 and 1024
-1
-select @@global.innodb_page_hash_locks;
-@@global.innodb_page_hash_locks
-64
-select @@session.innodb_page_hash_locks;
-ERROR HY000: Variable 'innodb_page_hash_locks' is a GLOBAL variable
-show global variables like 'innodb_page_hash_locks';
-Variable_name	Value
-innodb_page_hash_locks	64
-show session variables like 'innodb_page_hash_locks';
-Variable_name	Value
-innodb_page_hash_locks	64
-select * from information_schema.global_variables where variable_name='innodb_page_hash_locks';
-VARIABLE_NAME	VARIABLE_VALUE
-INNODB_PAGE_HASH_LOCKS	64
-select * from information_schema.session_variables where variable_name='innodb_page_hash_locks';
-VARIABLE_NAME	VARIABLE_VALUE
-INNODB_PAGE_HASH_LOCKS	64
-set global innodb_page_hash_locks=1;
-ERROR HY000: Variable 'innodb_page_hash_locks' is a read only variable
-set @@session.innodb_page_hash_locks='some';
-ERROR HY000: Variable 'innodb_page_hash_locks' is a read only variable
diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff
index 1771ee10832..84aecf2ce36 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff
+++ b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff
@@ -380,15 +380,6 @@
 VARIABLE_COMMENT Deprecated parameter with no effect.
 NUMERIC_MIN_VALUE 0
 NUMERIC_MAX_VALUE 64
-@@ -1513,7 +1513,7 @@
- SESSION_VALUE NULL
- DEFAULT_VALUE 16
- VARIABLE_SCOPE GLOBAL
--VARIABLE_TYPE BIGINT UNSIGNED
-+VARIABLE_TYPE INT UNSIGNED
- VARIABLE_COMMENT Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2
- NUMERIC_MIN_VALUE 1
- NUMERIC_MAX_VALUE 1024
 @@ -1525,7 +1525,7 @@
  SESSION_VALUE NULL
  DEFAULT_VALUE 16384
diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
index 501bf629038..4d062995b1e 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result
@@ -1509,18 +1509,6 @@ NUMERIC_BLOCK_SIZE NULL
 ENUM_VALUE_LIST OFF,ON
 READ_ONLY NO
 COMMAND_LINE_ARGUMENT OPTIONAL
-VARIABLE_NAME INNODB_PAGE_HASH_LOCKS
-SESSION_VALUE NULL
-DEFAULT_VALUE 64
-VARIABLE_SCOPE GLOBAL
-VARIABLE_TYPE BIGINT UNSIGNED
-VARIABLE_COMMENT Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2
-NUMERIC_MIN_VALUE 1
-NUMERIC_MAX_VALUE 1024
-NUMERIC_BLOCK_SIZE 0
-ENUM_VALUE_LIST NULL
-READ_ONLY YES
-COMMAND_LINE_ARGUMENT OPTIONAL
 VARIABLE_NAME INNODB_PAGE_SIZE
 SESSION_VALUE NULL
 DEFAULT_VALUE 16384
diff --git a/mysql-test/suite/sys_vars/t/innodb_page_hash_locks_basic.test b/mysql-test/suite/sys_vars/t/innodb_page_hash_locks_basic.test
deleted file mode 100644
index ee4798c1f90..00000000000
--- a/mysql-test/suite/sys_vars/t/innodb_page_hash_locks_basic.test
+++ /dev/null
@@ -1,24 +0,0 @@
---source include/have_innodb.inc
---source include/have_debug.inc
-
-#
-# exists as global only
-#
-select @@global.innodb_page_hash_locks between 1 and 1024;
-select @@global.innodb_page_hash_locks;
---error ER_INCORRECT_GLOBAL_LOCAL_VAR
-select @@session.innodb_page_hash_locks;
-show global variables like 'innodb_page_hash_locks';
-show session variables like 'innodb_page_hash_locks';
---disable_warnings
-select * from information_schema.global_variables where variable_name='innodb_page_hash_locks';
-select * from information_schema.session_variables where variable_name='innodb_page_hash_locks';
---enable_warnings
-
-#
-# show that it's read-only
-#
---error ER_INCORRECT_GLOBAL_LOCAL_VAR
-set global innodb_page_hash_locks=1;
---error ER_INCORRECT_GLOBAL_LOCAL_VAR
-set @@session.innodb_page_hash_locks='some';
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index fbb4b36133b..e65bef02f47 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -1085,15 +1085,15 @@ fail:
 	buf_block_t* block = buf_pool.block_from_ahi(rec);
 
 	if (!ahi_latch) {
-		rw_lock_t* hash_lock = buf_pool.hash_lock_get(
+		page_hash_latch* hash_lock = buf_pool.hash_lock_get(
 			block->page.id());
-		rw_lock_s_lock(hash_lock);
+		hash_lock->read_lock();
 
 		if (block->page.state() == BUF_BLOCK_REMOVE_HASH) {
 			/* Another thread is just freeing the block
 			from the LRU list of the buffer pool: do not
 			try to access this page. */
-			rw_lock_s_unlock(hash_lock);
+			hash_lock->read_unlock();
 			goto fail;
 		}
 
@@ -1104,7 +1104,7 @@ fail:
 		DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED);
 
 		buf_block_buf_fix_inc(block, __FILE__, __LINE__);
-		rw_lock_s_unlock(hash_lock);
+		hash_lock->read_unlock();
 
 		block->page.set_accessed();
 		buf_page_make_young_if_needed(&block->page);
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
index 74d8476b8c9..db156ba036b 100644
--- a/storage/innobase/buf/buf0buddy.cc
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -555,8 +555,8 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
 		return false;
 	}
 
-	rw_lock_t * hash_lock = buf_pool.hash_lock_get_low(fold);
-	rw_lock_x_lock(hash_lock);
+	page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold);
+	hash_lock->write_lock();
 
 	if (bpage->can_relocate()) {
 		/* Relocate the compressed page. */
@@ -567,7 +567,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
 		memcpy(dst, src, size);
 		bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
 
-		rw_lock_x_unlock(hash_lock);
+		hash_lock->write_unlock();
 
 		buf_buddy_mem_invalid(
 			reinterpret_cast<buf_buddy_free_t*>(src), i);
@@ -578,7 +578,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
 		return(true);
 	}
 
-	rw_lock_x_unlock(hash_lock);
+	hash_lock->write_unlock();
 	return(false);
 }
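[Reviewer note: illustrative summary, not part of the patch]
The conversion in btr0sea.cc and buf0buddy.cc above is mechanical, and the same
translation repeats throughout the rest of the patch. A cheat sheet, using only
names that appear in this diff:

    // old (sync0rw rw_lock_t)              new (page_hash_latch)
    rw_lock_s_lock(hash_lock);           // hash_lock->read_lock();
    rw_lock_s_unlock(hash_lock);         // hash_lock->read_unlock();
    rw_lock_x_lock(hash_lock);           // hash_lock->write_lock();
    rw_lock_x_unlock(hash_lock);         // hash_lock->write_unlock();
    rw_lock_own(hash_lock, RW_LOCK_X)    // hash_lock->is_write_locked()
    buf_pool.hash_lock_get_low(fold)     // buf_pool.page_hash.lock_get(fold)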
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 44b8d912d24..c4453efc74f 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -40,6 +40,7 @@ Created 11/5/1995 Heikki Tuuri
 #include <string.h>
 
 #ifndef UNIV_INNOCHECKSUM
+#include "my_cpu.h"
 #include "mem0mem.h"
 #include "btr0btr.h"
 #include "fil0fil.h"
@@ -278,6 +279,47 @@ the read requests for the whole area. */
 
 #ifndef UNIV_INNOCHECKSUM
 
+void page_hash_latch::read_lock_wait()
+{
+  auto l= read_lock_yield();
+  /* First, try busy spinning for a while. */
+  for (auto spin= srv_n_spin_wait_rounds; spin--; )
+  {
+    if (l & WRITER_PENDING)
+      ut_delay(srv_spin_wait_delay);
+    if (read_trylock())
+      return;
+    l= read_lock_yield();
+  }
+  /* Fall back to yielding to other threads. */
+  for (;;)
+  {
+    if (l & WRITER_PENDING)
+      os_thread_yield();
+    if (read_trylock())
+      return;
+    l= read_lock_yield();
+  }
+}
+
+void page_hash_latch::write_lock_wait()
+{
+  write_lock_wait_start();
+
+  /* First, try busy spinning for a while. */
+  for (auto spin= srv_n_spin_wait_rounds; spin--; )
+  {
+    if (write_lock_poll())
+      return;
+    ut_delay(srv_spin_wait_delay);
+  }
+
+  /* Fall back to yielding to other threads. */
+  do
+    os_thread_yield();
+  while (!write_lock_poll());
+}
+
 /** Value in microseconds */
 constexpr int WAIT_FOR_READ= 100;
 constexpr int WAIT_FOR_WRITE= 100;
@@ -1441,6 +1483,15 @@ static void buf_block_free_mutexes(buf_block_t* block)
 	ut_d(ut_free(block->debug_latch));
 }
 
+/** Create the hash table.
+@param n  the lower bound of n_cells */
+void buf_pool_t::page_hash_table::create(ulint n)
+{
+  n_cells= ut_find_prime(n);
+  array= static_cast<hash_cell_t*>
+    (ut_zalloc_nokey(pad(n_cells) * sizeof *array));
+}
+
 /** Create the buffer pool.
 @return whether the creation failed */
 bool buf_pool_t::create()
@@ -1517,16 +1568,7 @@ bool buf_pool_t::create()
 
   n_chunks_new= n_chunks;
 
-  /* Number of locks protecting page_hash must be a power of two */
-  srv_n_page_hash_locks= my_round_up_to_next_power
-    (static_cast<ulint>(srv_n_page_hash_locks));
-  ut_a(srv_n_page_hash_locks != 0);
-  ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
-
   page_hash.create(2 * curr_size);
-  for (auto i= srv_n_page_hash_locks; i--; )
-    rw_lock_create(hash_table_locks_key, &page_hash_latches[i],
-                   SYNC_BUF_PAGE_HASH);
   zip_hash.create(2 * curr_size);
 
   last_printout_time= time(NULL);
@@ -1604,9 +1646,14 @@ void buf_pool_t::close()
   ut_free(chunks);
   chunks= nullptr;
-  for (auto i= srv_n_page_hash_locks; i--; )
-    rw_lock_free(&page_hash_latches[i]);
   page_hash.free();
+  while (page_hash_table *old_page_hash= freed_page_hash)
+  {
+    freed_page_hash= static_cast<page_hash_table*>
+      (old_page_hash->array[1].node);
+    old_page_hash->free();
+    UT_DELETE(old_page_hash);
+  }
   zip_hash.free();
 
   io_buf.close();
@@ -1632,8 +1679,8 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
 	}
 
 	const page_id_t id(block->page.id());
-	rw_lock_t* hash_lock = hash_lock_get(id);
-	rw_lock_x_lock(hash_lock);
+	page_hash_latch* hash_lock = hash_lock_get(id);
+	hash_lock->write_lock();
 
 	if (block->page.can_relocate()) {
 		memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
@@ -1722,13 +1769,13 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
 		ut_ad(new_block->lock_hash_val == lock_rec_hash(
 			id.space(), id.page_no()));
 
-		rw_lock_x_unlock(hash_lock);
+		hash_lock->write_unlock();
 
 		/* free block */
 		ut_d(block->page.set_state(BUF_BLOCK_MEMORY));
 		buf_LRU_block_free_non_file_page(block);
 	} else {
-		rw_lock_x_unlock(hash_lock);
+		hash_lock->write_unlock();
 
 		buf_LRU_block_free_non_file_page(new_block);
 	}
@@ -1922,30 +1969,39 @@ inline bool buf_pool_t::withdraw_blocks()
 }
 
 /** resize page_hash and zip_hash */
-static void buf_pool_resize_hash()
+inline void buf_pool_t::resize_hash()
 {
-  hash_table_t new_hash;
-  new_hash.create(2 * buf_pool.curr_size);
+  page_hash_table *new_page_hash= UT_NEW_NOKEY(page_hash_table());
+  new_page_hash->create(2 * buf_pool.curr_size);
+  new_page_hash->write_lock_all();
 
-  for (ulint i= 0; i < buf_pool.page_hash.n_cells; i++)
+  for (auto i= page_hash.pad(page_hash.n_cells); i--; )
   {
-    while (buf_page_t *bpage= static_cast<buf_page_t*>
-           (HASH_GET_FIRST(&buf_pool.page_hash, i)))
+    static_assert(!((page_hash_table::ELEMENTS_PER_LATCH + 1) &
+                    page_hash_table::ELEMENTS_PER_LATCH),
+                  "must be one less than a power of 2");
+    if (!(i & page_hash_table::ELEMENTS_PER_LATCH))
+    {
+      ut_ad(reinterpret_cast<page_hash_latch*>
+            (&page_hash.array[i])->is_write_locked());
+      continue;
+    }
+    while (buf_page_t *bpage= static_cast<buf_page_t*>
+           (page_hash.array[i].node))
     {
-      buf_page_t *prev_bpage= bpage;
       ut_ad(bpage->in_page_hash);
-      bpage= static_cast<buf_page_t*>(HASH_GET_NEXT(hash, prev_bpage));
-      const ulint fold= prev_bpage->id().fold();
-      HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, fold, prev_bpage);
-      HASH_INSERT(buf_page_t, hash, &new_hash, fold, prev_bpage);
+      const ulint fold= bpage->id().fold();
+      HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
+      HASH_INSERT(buf_page_t, hash, new_page_hash, fold, bpage);
     }
   }
 
-  std::swap(buf_pool.page_hash.array, new_hash.array);
-  buf_pool.page_hash.n_cells= new_hash.n_cells;
-  new_hash.free();
+  buf_pool.page_hash.array[1].node= freed_page_hash;
+  std::swap(buf_pool.page_hash, *new_page_hash);
+  freed_page_hash= new_page_hash;
 
   /* recreate zip_hash */
+  hash_table_t new_hash;
   new_hash.create(2 * buf_pool.curr_size);
 
   for (ulint i= 0; i < buf_pool.zip_hash.n_cells; i++)
@@ -1953,11 +2009,9 @@ inline void buf_pool_t::resize_hash()
     while (buf_page_t *bpage= static_cast<buf_page_t*>
            (HASH_GET_FIRST(&buf_pool.zip_hash, i)))
     {
-      buf_page_t *prev_bpage= bpage;
-      bpage= static_cast<buf_page_t*>(HASH_GET_NEXT(hash, prev_bpage));
-      const ulint fold= BUF_POOL_ZIP_FOLD_BPAGE(prev_bpage);
-      HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, prev_bpage);
-      HASH_INSERT(buf_page_t, hash, &new_hash, fold, prev_bpage);
+      const ulint fold= BUF_POOL_ZIP_FOLD_BPAGE(bpage);
+      HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
+      HASH_INSERT(buf_page_t, hash, &new_hash, fold, bpage);
     }
   }
 
@@ -1967,6 +2021,49 @@ inline void buf_pool_t::resize_hash()
 }
 
+inline void buf_pool_t::page_hash_table::write_lock_all()
+{
+  for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+  {
+    reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
+    if (!n)
+      break;
+  }
+}
+
+
+inline void buf_pool_t::page_hash_table::write_unlock_all()
+{
+  for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+  {
+    reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
+    if (!n)
+      break;
+  }
+}
+
+
+inline void buf_pool_t::write_lock_all_page_hash()
+{
+  ut_ad(mutex_own(&mutex));
+  page_hash.write_lock_all();
+  for (page_hash_table *old_page_hash= freed_page_hash; old_page_hash;
+       old_page_hash= static_cast<page_hash_table*>
+       (old_page_hash->array[1].node))
+    old_page_hash->write_lock_all();
+}
+
+
+inline void buf_pool_t::write_unlock_all_page_hash()
+{
+  page_hash.write_unlock_all();
+  for (page_hash_table *old_page_hash= freed_page_hash; old_page_hash;
+       old_page_hash= static_cast<page_hash_table*>
+       (old_page_hash->array[1].node))
+    old_page_hash->write_unlock_all();
+}
+
+
 /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
 inline void buf_pool_t::resize()
 {
@@ -2131,8 +2228,7 @@ withdraw_retry:
   resizing.store(true, std::memory_order_relaxed);
 
   mutex_enter(&mutex);
-  for (auto i= srv_n_page_hash_locks; i--; )
-    rw_lock_x_lock(&page_hash_latches[i]);
+  write_lock_all_page_hash();
 
   chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
 
@@ -2278,13 +2374,12 @@ calc_buf_pool_size:
   if the new size is too different */
   if (!warning && new_size_too_diff) {
     buf_resize_status("Resizing hash table");
-    buf_pool_resize_hash();
+    resize_hash();
     ib::info() << "hash tables were resized";
   }
 
   mutex_exit(&mutex);
-  for (auto i= srv_n_page_hash_locks; i--; )
-    rw_lock_x_unlock(&page_hash_latches[i]);
+  write_unlock_all_page_hash();
 
   UT_DELETE(chunk_map_old);
 
@@ -2390,7 +2485,7 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
   const ulint fold= bpage->id().fold();
   ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
   ut_ad(mutex_own(&buf_pool.mutex));
-  ut_ad(rw_lock_own(buf_pool.hash_lock_get(bpage->id()), RW_LOCK_X));
+  ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
   ut_a(bpage->io_fix() == BUF_IO_NONE);
   ut_a(!bpage->buf_fix_count());
   ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
@@ -2443,11 +2538,11 @@ relocated, and reacquired.
 @return a buffer pool block corresponding to id
 @retval nullptr if the block was not present, and a watch was installed */
 inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
-                                         rw_lock_t **hash_lock)
+                                         page_hash_latch **hash_lock)
 {
   const ulint fold= id.fold();
-  ut_ad(*hash_lock == hash_lock_get_low(fold));
-  ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X));
+  ut_ad(*hash_lock == page_hash.lock_get(fold));
+  ut_ad((*hash_lock)->is_write_locked());
 
 retry:
   if (buf_page_t *bpage= page_hash_get_low(id, fold))
@@ -2460,7 +2555,7 @@ retry:
     return nullptr;
   }
 
-  rw_lock_x_unlock(*hash_lock);
+  (*hash_lock)->write_unlock();
   /* Allocate a watch[] and then try to insert it into the page_hash. */
   mutex_enter(&mutex);
 
@@ -2484,18 +2579,18 @@ retry:
     w->set_state(BUF_BLOCK_ZIP_PAGE);
     w->id_= id;
 
-    *hash_lock= hash_lock_get_low(fold);
-    rw_lock_x_lock(*hash_lock);
+    *hash_lock= page_hash.lock_get(fold);
+    (*hash_lock)->write_lock();
     mutex_exit(&mutex);
 
     buf_page_t *bpage= page_hash_get_low(id, fold);
     if (UNIV_LIKELY_NULL(bpage))
    {
-      rw_lock_x_unlock(*hash_lock);
+      (*hash_lock)->write_unlock();
       mutex_enter(&mutex);
       w->set_state(BUF_BLOCK_NOT_USED);
-      *hash_lock= hash_lock_get_low(fold);
-      rw_lock_x_lock(*hash_lock);
+      *hash_lock= page_hash.lock_get(fold);
+      (*hash_lock)->write_lock();
       mutex_exit(&mutex);
       goto retry;
     }
@@ -2533,7 +2628,7 @@ void buf_page_free(const page_id_t page_id,
   buf_pool.stat.n_page_gets++;
 
   const ulint fold= page_id.fold();
-  rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold);
+  page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
   buf_block_t *block= reinterpret_cast<buf_block_t*>
     (buf_pool.page_hash_get_low(page_id, fold));
 
@@ -2544,7 +2639,7 @@ void buf_page_free(const page_id_t page_id,
   {
     /* FIXME: if block!=NULL, convert to BUF_BLOCK_FILE_PAGE,
     but avoid buf_zip_decompress() */
-    rw_lock_s_unlock(hash_lock);
+    hash_lock->read_unlock();
     return;
   }
 
@@ -2559,7 +2654,7 @@ void buf_page_free(const page_id_t page_id,
   block->page.status= buf_page_t::FREED;
 
   buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-  rw_lock_s_unlock(hash_lock);
+  hash_lock->read_unlock();
 }
 
 /** Get read access to a compressed page (usually of type
@@ -2581,7 +2676,7 @@ buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
   bool discard_attempted= false;
   const ulint fold= page_id.fold();
   buf_page_t *bpage;
-  rw_lock_t *hash_lock;
+  page_hash_latch *hash_lock;
 
   for (;;)
   {
@@ -2604,13 +2699,13 @@ lookup:
 #endif /* UNIV_DEBUG */
   }
 
-  ut_ad(rw_lock_own(hash_lock, RW_LOCK_S));
+  ut_ad(hash_lock->is_read_locked());
 
  if (!bpage->zip.data)
  {
    /* There is no compressed page. */
err_exit:
-    rw_lock_s_unlock(hash_lock);
+    hash_lock->read_unlock();
    return nullptr;
  }
 
@@ -2625,7 +2720,7 @@ err_exit:
       if (!discard_attempted)
       {
         discard_attempted= true;
-        rw_lock_s_unlock(hash_lock);
+        hash_lock->read_unlock();
        mutex_enter(&buf_pool.mutex);
        if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
          buf_LRU_free_page(bpage, false);
@@ -2645,7 +2740,7 @@ err_exit:
 
 got_block:
   bool must_read= bpage->io_fix() == BUF_IO_READ;
-  rw_lock_s_unlock(hash_lock);
+  hash_lock->read_unlock();
 
   DBUG_ASSERT(bpage->status != buf_page_t::FREED);
 
@@ -2981,7 +3076,7 @@ loop:
 	buf_block_t*	fix_block;
 	block = guess;
 
-	rw_lock_t*	hash_lock = buf_pool.page_hash_lock<false>(fold);
+	page_hash_latch*	hash_lock = buf_pool.page_hash.lock<false>(fold);
 
 	if (block) {
 
@@ -3006,14 +3101,14 @@ lookup:
 	}
 
 	if (!block || buf_pool.watch_is_sentinel(block->page)) {
-		rw_lock_s_unlock(hash_lock);
+		hash_lock->read_unlock();
 		block = nullptr;
 	}
 
 	if (UNIV_UNLIKELY(!block)) {
 		/* Page not in buf_pool: needs to be read from file */
 		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
-			hash_lock = buf_pool.page_hash_lock<true>(fold);
+			hash_lock = buf_pool.page_hash.lock<true>(fold);
 
 			if (buf_page_t *bpage= buf_pool.watch_set(
 				    page_id, &hash_lock)) {
@@ -3021,13 +3116,13 @@ lookup:
 				increment the fix count to make
 				sure that no state change takes place. */
 				bpage->fix();
-				rw_lock_x_unlock(hash_lock);
+				hash_lock->write_unlock();
 				block = reinterpret_cast<buf_block_t*>(bpage);
 				fix_block = block;
 				goto got_block;
 			}
 
-			rw_lock_x_unlock(hash_lock);
+			hash_lock->write_unlock();
 		}
 
 		switch (mode) {
@@ -3121,7 +3216,7 @@ lookup:
 	}
 
 	fix_block->fix();
-	rw_lock_s_unlock(hash_lock);
+	hash_lock->read_unlock();
 
 got_block:
 	switch (mode) {
@@ -3212,9 +3307,9 @@ evict_from_pool:
 		buf_block_init_low(block);
 
 		mutex_enter(&buf_pool.mutex);
-		hash_lock = buf_pool.hash_lock_get_low(fold);
+		hash_lock = buf_pool.page_hash.lock_get(fold);
 
-		rw_lock_x_lock(hash_lock);
+		hash_lock->write_lock();
 
 		/* Buffer-fixing prevents the page_hash from changing. */
 		ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
@@ -3228,7 +3323,7 @@ evict_from_pool:
 			This should be extremely unlikely, for example,
 			if buf_page_get_zip() was invoked. */
 
-			rw_lock_x_unlock(hash_lock);
+			hash_lock->write_unlock();
 			buf_LRU_block_free_non_file_page(block);
 			mutex_exit(&buf_pool.mutex);
 
@@ -3276,7 +3371,7 @@ evict_from_pool:
 		UNIV_MEM_INVALID(bpage, sizeof *bpage);
 
 		mutex_exit(&buf_pool.mutex);
-		rw_lock_x_unlock(hash_lock);
+		hash_lock->write_unlock();
 		buf_pool.n_pend_unzip++;
 
 		access_time = block->page.is_accessed();
@@ -3312,9 +3407,6 @@ evict_from_pool:
 	ut_ad(block == fix_block);
 	ut_ad(fix_block->page.buf_fix_count());
 
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
 	ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
 
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
@@ -3336,8 +3428,8 @@ evict_from_pool:
 		if (buf_LRU_free_page(&fix_block->page, true)) {
 			space->release_for_io();
 
-			hash_lock = buf_pool.hash_lock_get_low(fold);
-			rw_lock_x_lock(hash_lock);
+			hash_lock = buf_pool.page_hash.lock_get(fold);
+			hash_lock->write_lock();
 			mutex_exit(&buf_pool.mutex);
 
 			/* We may set the watch, as it would have
 			been set if the page were not in the
 			buffer pool in the first place. */
 			block = reinterpret_cast<buf_block_t*>(
 				mode == BUF_GET_IF_IN_POOL_OR_WATCH
 				? buf_pool.watch_set(page_id, &hash_lock)
 				: buf_pool.page_hash_get_low(page_id, fold));
-			rw_lock_x_unlock(hash_lock);
+			hash_lock->write_unlock();
 
 			if (block != NULL) {
 				/* Either the page has been read in or
@@ -3467,9 +3559,6 @@ get_latch:
 		buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
 	}
 
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
 	return(fix_block);
 }
 
@@ -3558,17 +3647,17 @@ buf_page_optimistic_get(
 		return FALSE;
 	}
 
-	rw_lock_t *hash_lock = buf_pool.hash_lock_get(block->page.id());
-	rw_lock_s_lock(hash_lock);
+	page_hash_latch *hash_lock = buf_pool.hash_lock_get(block->page.id());
+	hash_lock->read_lock();
 
 	if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
 			  || block->page.io_fix() != BUF_IO_NONE)) {
-		rw_lock_s_unlock(hash_lock);
+		hash_lock->read_unlock();
 		return(FALSE);
 	}
 
 	buf_block_buf_fix_inc(block, file, line);
-	rw_lock_s_unlock(hash_lock);
+	hash_lock->read_unlock();
 
 	const bool first_access = block->page.set_accessed();
 
@@ -3645,7 +3734,7 @@ buf_page_try_get_func(
   ut_ad(mtr);
   ut_ad(mtr->is_active());
 
-  rw_lock_t *hash_lock;
+  page_hash_latch *hash_lock;
   buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
                                                           page_id.fold(),
                                                           &hash_lock);
@@ -3653,13 +3742,13 @@ buf_page_try_get_func(
     return nullptr;
 
   if (bpage->state() != BUF_BLOCK_FILE_PAGE)
  {
-    rw_lock_s_unlock(hash_lock);
+    hash_lock->read_unlock();
    return nullptr;
  }
 
   buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
   buf_block_buf_fix_inc(block, file, line);
-  rw_lock_s_unlock(hash_lock);
+  hash_lock->read_unlock();
 
   mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX;
   if (!rw_lock_s_lock_nowait(&block->lock, file, line))
@@ -3770,8 +3859,8 @@ buf_page_create(fil_space_t *space, uint32_t offset,
     /* The block must be put to the LRU list */
     buf_LRU_add_block(&block->page, false);
 
-    rw_lock_t *hash_lock= buf_pool.hash_lock_get(page_id);
-    rw_lock_x_lock(hash_lock);
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+    hash_lock->write_lock();
     block->page.set_state(BUF_BLOCK_FILE_PAGE);
     ut_d(block->page.in_page_hash= true);
     HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
@@ -3783,7 +3872,7 @@ buf_page_create(fil_space_t *space, uint32_t offset,
       the block. */
       block->page.set_io_fix(BUF_IO_READ);
       rw_lock_x_lock(&block->lock);
-      rw_lock_x_unlock(hash_lock);
+      hash_lock->write_unlock();
 
       /* buf_pool.mutex may be released and reacquired by
       buf_buddy_alloc(). We must defer this operation until
@@ -3801,7 +3890,7 @@ buf_page_create(fil_space_t *space, uint32_t offset,
       rw_lock_x_unlock(&block->lock);
     }
     else
-      rw_lock_x_unlock(hash_lock);
+      hash_lock->write_unlock();
 
     mutex_exit(&buf_pool.mutex);
 
@@ -3954,10 +4043,10 @@ static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
 void buf_pool_t::corrupted_evict(buf_page_t *bpage)
 {
   const page_id_t id(bpage->id());
-  rw_lock_t *hash_lock= hash_lock_get(id);
+  page_hash_latch *hash_lock= hash_lock_get(id);
 
   mutex_enter(&mutex);
-  rw_lock_x_lock(hash_lock);
+  hash_lock->write_lock();
 
   ut_ad(bpage->io_fix() == BUF_IO_READ);
   ut_ad(!bpage->oldest_modification());
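[Reviewer note: illustrative, not part of the patch]
The freed_page_hash list built in resize_hash() and the retry loop in
page_hash_table::lock() (declared in buf0buf.h below) cooperate. A sketch of
the race they handle, with R a concurrent lookup and W running
buf_pool_t::resize_hash():

    R: n = page_hash.n_cells;                 // reads the old cell count
    W: page_hash.array[1].node = freed_page_hash;
    W: std::swap(page_hash, *new_page_hash);  // publishes the new table
    R: latch = lock_get(fold, n);             // points into the *old* array
    R: latch->acquire<exclusive>();           // still safe: memory is alive
    R: n != page_hash.n_cells                 // mismatch: release and retry

Because retired tables stay reachable through freed_page_hash (chained via
array[1].node) until buf_pool_t::close(), the reader's latch pointer never
dangles, and write_lock_all_page_hash() also covers the retired tables while
a resize is in progress.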
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index bc6e24a55de..f9189216ac0 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -141,7 +141,7 @@ caller needs to free the page to the free list
 @retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
 this case the block is already returned to the buddy allocator. */
 static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
-                                        rw_lock_t *hash_lock, bool zip);
+                                        page_hash_latch *hash_lock, bool zip);
 
 /** Free a block to buf_pool */
 static void buf_LRU_block_free_hashed_page(buf_block_t *block)
@@ -1160,8 +1160,8 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
 	bpage->can_relocate() from changing due to a concurrent
 	execution of buf_page_get_low(). */
 	const ulint fold = id.fold();
-	rw_lock_t* hash_lock = buf_pool.hash_lock_get_low(fold);
-	rw_lock_x_lock(hash_lock);
+	page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
+	hash_lock->write_lock();
 
 	if (UNIV_UNLIKELY(!bpage->can_relocate())) {
 		/* Do not free buffer fixed and I/O-fixed blocks. */
@@ -1178,7 +1178,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
 	} else if (bpage->oldest_modification()
 		   && bpage->state() != BUF_BLOCK_FILE_PAGE) {
 func_exit:
-		rw_lock_x_unlock(hash_lock);
+		hash_lock->write_unlock();
 		return(false);
 
 	} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
@@ -1201,10 +1201,6 @@ func_exit:
 		return(true);
 	}
 
-	/* buf_LRU_block_remove_hashed() releases the hash_lock */
-	ut_ad(!rw_lock_own_flagged(hash_lock,
-				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
 	/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
 	then it was a compressed page with an uncompressed frame and
 	we are interested in freeing only the uncompressed frame.
@@ -1215,7 +1211,7 @@ func_exit:
 	if (UNIV_LIKELY_NULL(b)) {
 		buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
 
-		rw_lock_x_lock(hash_lock);
+		hash_lock->write_lock();
 
 		ut_ad(!buf_pool.page_hash_get_low(id, fold));
 		ut_ad(b->zip_size());
@@ -1301,7 +1297,7 @@ func_exit:
 			decompressing the block while we release
 			hash_lock. */
 			b->set_io_fix(BUF_IO_PIN);
-			rw_lock_x_unlock(hash_lock);
+			hash_lock->write_unlock();
 		}
 
 		mutex_exit(&buf_pool.mutex);
@@ -1405,10 +1401,10 @@ caller needs to free the page to the free list
 @retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
 this case the block is already returned to the buddy allocator. */
 static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
-                                        rw_lock_t *hash_lock, bool zip)
+                                        page_hash_latch *hash_lock, bool zip)
 {
 	ut_ad(mutex_own(&buf_pool.mutex));
-	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
+	ut_ad(hash_lock->is_write_locked());
 
 	ut_a(bpage->io_fix() == BUF_IO_NONE);
 	ut_a(!bpage->buf_fix_count());
@@ -1501,7 +1497,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
 #ifdef UNIV_DEBUG
 		UT_LIST_REMOVE(buf_pool.zip_clean, bpage);
 #endif /* UNIV_DEBUG */
-		rw_lock_x_unlock(hash_lock);
+		hash_lock->write_unlock();
 		buf_pool_mutex_exit_forbid();
 
 		buf_buddy_free(bpage->zip.data, bpage->zip_size());
@@ -1542,7 +1538,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
 		and by the time we'll release it in the caller we'd
 		have inserted the compressed only descriptor in the
 		page_hash. */
-		rw_lock_x_unlock(hash_lock);
+		hash_lock->write_unlock();
 
 		if (zip && bpage->zip.data) {
 			/* Free the compressed page. */
@@ -1578,20 +1574,15 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
 @param id    page identifier
 @param hash_lock  buf_pool.page_hash latch (will be released here) */
 void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
-                           rw_lock_t *hash_lock)
+                           page_hash_latch *hash_lock)
 {
   while (bpage->buf_fix_count())
-  {
     /* Wait for other threads to release the fix count
     before releasing the bpage from LRU list. */
-    ut_delay(1);
-  }
+    (void) LF_BACKOFF();
 
   if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true))
     buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
-
-  /* buf_LRU_block_remove_hashed() releases hash_lock */
-  ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
 }
 
 /** Update buf_pool.LRU_old_ratio.
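[Reviewer note: illustrative, not part of the patch]
ut_delay(1) spins a calibrated loop before re-reading the counter, while
LF_BACKOFF() (from my_cpu.h, included into buf0buf.cc above) issues a single
CPU pause/relax hint per iteration, so the fix count is re-checked much more
frequently. A stand-alone model of the same wait, assuming x86 and a
hypothetical fix_count counter (neither name is from the tree):

    #include <atomic>
    #include <immintrin.h>          // _mm_pause(): x86-specific assumption

    std::atomic<unsigned> fix_count{0};

    void wait_until_unfixed()
    {
      // Re-check after every pause hint instead of after a longer spin.
      while (fix_count.load(std::memory_order_relaxed))
        _mm_pause();
    }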
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 0007e1d0831..8eaf0089495 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -53,7 +53,7 @@ that the block has been replaced with the real block.
 @param watch  sentinel */
 inline void buf_pool_t::watch_remove(buf_page_t *watch)
 {
-  ut_ad(rw_lock_own(hash_lock_get(watch->id()), RW_LOCK_X));
+  ut_ad(hash_lock_get(watch->id())->is_write_locked());
   ut_a(watch_is_sentinel(*watch));
   if (watch->buf_fix_count())
   {
@@ -125,14 +125,14 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
 
   /* We must acquire hash_lock this early to prevent
   a race condition with buf_pool_t::watch_remove() */
-  rw_lock_t *hash_lock= buf_pool.hash_lock_get_low(fold);
-  rw_lock_x_lock(hash_lock);
+  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+  hash_lock->write_lock();
 
   buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
   if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
   {
     /* The page is already in the buffer pool. */
-    rw_lock_x_unlock(hash_lock);
+    hash_lock->write_unlock();
     if (block)
     {
       rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
@@ -160,7 +160,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     ut_ad(!block->page.in_page_hash);
     ut_d(block->page.in_page_hash= true);
     HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
-    rw_lock_x_unlock(hash_lock);
+    hash_lock->write_unlock();
 
     /* The block must be put to the LRU list, to the old blocks */
     buf_LRU_add_block(bpage, true/* to old blocks */);
@@ -184,7 +184,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
   }
   else
   {
-    rw_lock_x_unlock(hash_lock);
+    hash_lock->write_unlock();
 
     /* The compressed page must be allocated before the
     control block (bpage), in order to avoid the
@@ -193,7 +193,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     bool lru= false;
     void *data= buf_buddy_alloc(zip_size, &lru);
 
-    rw_lock_x_lock(hash_lock);
+    hash_lock->write_lock();
 
     /* If buf_buddy_alloc() allocated storage from the LRU list,
     it released and reacquired buf_pool.mutex. Thus, we must
@@ -205,7 +205,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
       if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
       {
         /* The block was added by some other thread. */
-        rw_lock_x_unlock(hash_lock);
+        hash_lock->write_unlock();
         buf_buddy_free(data, zip_size);
         goto func_exit;
       }
@@ -234,7 +234,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
     ut_d(bpage->in_page_hash= true);
     HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
     bpage->set_io_fix(BUF_IO_READ);
-    rw_lock_x_unlock(hash_lock);
+    hash_lock->write_unlock();
 
     /* The block must be put to the LRU list, to the old blocks.
     The zip size is already set into the page zip */
@@ -253,7 +253,6 @@ func_exit_no_mutex:
   if (mode == BUF_READ_IBUF_PAGES_ONLY)
     ibuf_mtr_commit(&mtr);
 
-  ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
   ut_ad(!bpage || bpage->in_file());
 
   return bpage;
@@ -426,10 +425,10 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
   for (page_id_t i= low; i < high; ++i)
   {
     const ulint fold= i.fold();
-    rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold);
-    const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+    const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
     bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
-    rw_lock_s_unlock(hash_lock);
+    hash_lock->read_unlock();
     if (found && !--count)
       goto read_ahead;
   }
@@ -620,7 +619,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
   for (page_id_t i= low; i != high_1; ++i)
   {
     const ulint fold= i.fold();
-    rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold);
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
     const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
     if (i == page_id)
     {
@@ -632,7 +631,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
       if (!bpage)
       {
 hard_fail:
-        rw_lock_s_unlock(hash_lock);
+        hash_lock->read_unlock();
         space->release();
         return 0;
       }
@@ -673,7 +672,7 @@ hard_fail:
     else if (!bpage)
     {
 failed:
-      rw_lock_s_unlock(hash_lock);
+      hash_lock->read_unlock();
       if (--count)
         continue;
       space->release();
@@ -694,7 +693,7 @@ failed:
       prev_accessed= accessed;
       if (fail)
         goto failed;
-      rw_lock_s_unlock(hash_lock);
+      hash_lock->read_unlock();
     }
 
   /* If we got this far, read-ahead can be sensible: do it */
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 75ee30476f7..4e1155836a4 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -590,8 +590,7 @@ static PSI_rwlock_info all_innodb_rwlocks[] = {
 	PSI_RWLOCK_KEY(trx_purge_latch),
 	PSI_RWLOCK_KEY(index_tree_rw_lock),
 	PSI_RWLOCK_KEY(index_online_log),
-	PSI_RWLOCK_KEY(dict_table_stats),
-	PSI_RWLOCK_KEY(hash_table_locks)
+	PSI_RWLOCK_KEY(dict_table_stats)
 };
 # endif /* UNIV_PFS_RWLOCK */
 
@@ -19500,11 +19499,6 @@ static MYSQL_SYSVAR_ULONG(buffer_pool_chunk_size, srv_buf_pool_chunk_unit,
   128 * 1024 * 1024, 1024 * 1024, LONG_MAX, 1024 * 1024);
 
 #if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
-static MYSQL_SYSVAR_ULONG(page_hash_locks, srv_n_page_hash_locks,
-  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2",
-  NULL, NULL, 64, 1, MAX_PAGE_HASH_LOCKS, 0);
-
 static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
   "Number of pages reserved in doublewrite buffer for batch flushing",
@@ -20393,7 +20387,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(merge_threshold_set_all_debug),
 #endif /* UNIV_DEBUG */
 #if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
-  MYSQL_SYSVAR(page_hash_locks),
   MYSQL_SYSVAR(doublewrite_batch_size),
 #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
   MYSQL_SYSVAR(status_output),
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 96c96113e85..4e0b25c52dd 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -70,9 +70,6 @@ struct fil_addr_t;
 #define BUF_EVICT_IF_IN_POOL	20	/*!< evict a clean block if found */
 /* @} */
 
-#define MAX_PAGE_HASH_LOCKS	1024	/*!< The maximum number of
-					page_hash locks */
-
 # ifdef UNIV_DEBUG
 extern my_bool	buf_disable_resize_buffer_pool_debug; /*!< if TRUE, resizing
 buffer pool is not allowed. */
@@ -1605,47 +1602,9 @@ public:
   }
 
   /** Get the page_hash latch for a page */
-  rw_lock_t *hash_lock_get(const page_id_t id) const
+  page_hash_latch *hash_lock_get(const page_id_t id) const
   {
-    return hash_lock_get_low(id.fold());
-  }
-  /** Get a page_hash latch. */
-  rw_lock_t *hash_lock_get_low(ulint fold) const
-  {
-    return page_hash_latches +
-      ut_2pow_remainder(page_hash.calc_hash(fold),
-                        ulint{srv_n_page_hash_locks});
-  }
-private:
-  /** Get a page_hash latch. */
-  rw_lock_t *hash_lock_get_low(ulint fold, ulint n_cells) const
-  {
-    return page_hash_latches +
-      ut_2pow_remainder(ut_hash_ulint(fold, n_cells),
-                        ulint{srv_n_page_hash_locks});
-  }
-public:
-
-  /** Acquire a page_hash bucket latch, tolerating concurrent resize()
-  @tparam exclusive whether the latch is to be acquired exclusively
-  @param fold hash bucket key */
-  template<bool exclusive> rw_lock_t *page_hash_lock(ulint fold)
-  {
-    for (;;)
-    {
-      auto n_cells= page_hash.n_cells;
-      rw_lock_t *latch= hash_lock_get_low(fold, n_cells);
-      if (exclusive)
-        rw_lock_x_lock(latch);
-      else
-        rw_lock_s_lock(latch);
-      if (UNIV_LIKELY(n_cells == page_hash.n_cells))
-        return latch;
-      if (exclusive)
-        rw_lock_x_unlock(latch);
-      else
-        rw_lock_s_unlock(latch);
-    }
+    return page_hash.lock_get(id.fold());
   }
 
   /** Look up a block descriptor.
@@ -1656,9 +1615,7 @@ public:
   buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
   {
     ut_ad(id.fold() == fold);
-    ut_ad(mutex_own(&mutex) ||
-          rw_lock_own_flagged(hash_lock_get_low(fold),
-                              RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+    ut_ad(mutex_own(&mutex) || page_hash.lock_get(fold)->is_locked());
     buf_page_t *bpage;
     /* Look for the page in the hash table */
     HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
@@ -1676,17 +1633,14 @@ private:
   @retval nullptr if no block was found; !lock || !*lock will also hold */
   template<bool exclusive,bool watch>
   buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
-                                   rw_lock_t **hash_lock)
+                                   page_hash_latch **hash_lock)
   {
     ut_ad(hash_lock || !exclusive);
-    rw_lock_t *latch= page_hash_lock<exclusive>(fold);
+    page_hash_latch *latch= page_hash.lock<exclusive>(fold);
     buf_page_t *bpage= page_hash_get_low(page_id, fold);
     if (!bpage || watch_is_sentinel(*bpage))
     {
-      if (exclusive)
-        rw_lock_x_unlock(latch);
-      else
-        rw_lock_s_unlock(latch);
+      latch->release<exclusive>();
       if (hash_lock)
         *hash_lock= nullptr;
       return watch ? bpage : nullptr;
@@ -1697,10 +1651,8 @@ private:
 
     if (hash_lock)
       *hash_lock= latch; /* to be released by the caller */
-    else if (exclusive)
-      rw_lock_x_unlock(latch);
     else
-      rw_lock_s_unlock(latch);
+      latch->release<exclusive>();
     return bpage;
   }
 public:
@@ -1713,7 +1665,7 @@ public:
   @retval nullptr if no block was found; !lock || !*lock will also hold */
   template<bool exclusive>
   buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
-                                   rw_lock_t **hash_lock)
+                                   page_hash_latch **hash_lock)
   { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }
 
   /** @return whether the buffer pool contains a page
@@ -1730,9 +1682,7 @@ public:
   @return whether bpage a sentinel for a buffer pool watch */
   bool watch_is_sentinel(const buf_page_t &bpage)
   {
-    ut_ad(mutex_own(&mutex) ||
-          rw_lock_own_flagged(hash_lock_get(bpage.id()),
-                              RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+    ut_ad(mutex_own(&mutex) || hash_lock_get(bpage.id())->is_locked());
     ut_ad(bpage.in_file());
 
     if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
@@ -1754,11 +1704,11 @@ public:
   bool watch_occurred(const page_id_t id)
   {
     const ulint fold= id.fold();
-    rw_lock_t *hash_lock= page_hash_lock<false>(fold);
+    page_hash_latch *hash_lock= page_hash.lock<false>(fold);
     /* The page must exist because watch_set() increments buf_fix_count. */
     buf_page_t *bpage= page_hash_get_low(id, fold);
     const bool is_sentinel= watch_is_sentinel(*bpage);
-    rw_lock_s_unlock(hash_lock);
+    hash_lock->read_unlock();
     return !is_sentinel;
   }
 
@@ -1769,7 +1719,8 @@ public:
   @param hash_lock  exclusively held page_hash latch
   @return a buffer pool block corresponding to id
   @retval nullptr if the block was not present, and a watch was installed */
-  inline buf_page_t *watch_set(const page_id_t id, rw_lock_t **hash_lock);
+  inline buf_page_t *watch_set(const page_id_t id,
+                               page_hash_latch **hash_lock);
 
   /** Stop watching whether a page has been read in.
   watch_set(id) must have returned nullptr before.
@@ -1777,7 +1728,7 @@ public:
   void watch_unset(const page_id_t id)
   {
     const ulint fold= id.fold();
-    rw_lock_t *hash_lock= page_hash_lock<true>(fold);
+    page_hash_latch *hash_lock= page_hash.lock<true>(fold);
     /* The page must exist because watch_set() increments buf_fix_count. */
     buf_page_t *watch= page_hash_get_low(id, fold);
     if (watch->unfix() == 0 && watch_is_sentinel(*watch))
@@ -1786,7 +1737,7 @@ public:
       ut_ad(watch->in_page_hash);
       ut_d(watch->in_page_hash= false);
       HASH_DELETE(buf_page_t, hash, &page_hash, fold, watch);
-      rw_lock_x_unlock(hash_lock);
+      hash_lock->write_unlock();
       // Now that the watch is detached from page_hash, release it to watch[].
       mutex_enter(&mutex);
       /* It is possible that watch_remove() already removed the watch. */
@@ -1799,7 +1750,7 @@ public:
       mutex_exit(&mutex);
     }
     else
-      rw_lock_x_unlock(hash_lock);
+      hash_lock->write_unlock();
   }
 
   /** Remove the sentinel block for the watch before replacing it with a
@@ -1872,11 +1823,92 @@ public:
   /** read-ahead request size in pages */
   Atomic_counter<uint32_t> read_ahead_area;
 
+  /** Hash table with singly-linked overflow lists. @see hash_table_t */
+  struct page_hash_table
+  {
+    /** Number of array[] elements per page_hash_latch.
+    Must be one less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= 1023;
+
+    /** number of payload elements in array[] */
+    Atomic_relaxed<ulint> n_cells;
+    /** the hash array, with pad(n_cells) elements */
+    hash_cell_t *array;
+
+    /** Create the hash table.
+    @param n  the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Free the hash table. */
+    void free() { ut_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; }
+  private:
+    /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+    static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+    /** @return the index of an array element */
+    static ulint calc_hash(ulint fold, ulint n_cells)
+    {
+      return pad(hash(fold, n_cells));
+    }
+    /** Get a page_hash latch. */
+    page_hash_latch *lock_get(ulint fold, ulint n) const
+    {
+      static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+                    "must be one less than a power of 2");
+      return reinterpret_cast<page_hash_latch*>
+        (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
+    }
+  public:
+    /** Get a page_hash latch. */
+    page_hash_latch *lock_get(ulint fold) const
+    { return lock_get(fold, n_cells); }
+
+    /** Acquire an array latch, tolerating concurrent buf_pool_t::resize()
+    @tparam exclusive  whether the latch is to be acquired exclusively
+    @param fold  hash bucket key */
+    template<bool exclusive> page_hash_latch *lock(ulint fold)
+    {
+      for (;;)
+      {
+        auto n= n_cells;
+        page_hash_latch *latch= lock_get(fold, n);
+        latch->acquire<exclusive>();
+        /* Our latch prevents n_cells from changing. */
+        if (UNIV_LIKELY(n == n_cells))
+          return latch;
+        /* Retry, because buf_pool_t::resize_hash() affected us. */
+        latch->release<exclusive>();
+      }
+    }
+
+    /** Exclusively acquire all latches */
+    inline void write_lock_all();
+
+    /** Release all latches */
+    inline void write_unlock_all();
+  };
+
+private:
+  /** Former page_hash that has been deleted during resize();
+  singly-linked list via freed_page_hash->array[1] */
+  page_hash_table *freed_page_hash;
+
+  /** Lock all page_hash, also freed_page_hash. */
+  inline void write_lock_all_page_hash();
+  /** Release all page_hash, also freed_page_hash. */
+  inline void write_unlock_all_page_hash();
+  /** Resize page_hash and zip_hash. */
+  inline void resize_hash();
+
+public:
   /** Hash table of file pages (buf_page_t::in_file() holds),
-  indexed by page_id_t. Protected by both mutex and page_hash_latches[]. */
-  hash_table_t page_hash;
-  /** Latches protecting page_hash */
-  mutable rw_lock_t page_hash_latches[MAX_PAGE_HASH_LOCKS];
+  indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
+  page_hash_table page_hash;
 
   /** map of block->frame to buf_block_t blocks that belong
   to buf_buddy_alloc(); protected by buf_pool.mutex */
@@ -2103,6 +2135,19 @@ private:
 /** The InnoDB buffer pool */
 extern buf_pool_t buf_pool;
 
+inline void page_hash_latch::read_lock()
+{
+  ut_ad(!mutex_own(&buf_pool.mutex));
+  if (!read_trylock())
+    read_lock_wait();
+}
+
+inline void page_hash_latch::write_lock()
+{
+  if (!write_trylock())
+    write_lock_wait();
+}
+
 inline void buf_page_t::add_buf_fix_count(uint32_t count)
 {
   ut_ad(mutex_own(&buf_pool.mutex));
@@ -2129,15 +2174,15 @@ inline void buf_page_t::set_state(buf_page_state state)
     if (!in_file()) break;
     /* fall through */
   case BUF_BLOCK_FILE_PAGE:
-    ut_ad(rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X));
+    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
     break;
   case BUF_BLOCK_NOT_USED:
     if (!in_file()) break;
     /* fall through */
   case BUF_BLOCK_ZIP_PAGE:
-    ut_ad((this >= &buf_pool.watch[0] &&
-           this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]) ||
-          rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X));
+    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
+          (this >= &buf_pool.watch[0] &&
+           this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
     break;
   }
 #endif
@@ -2159,7 +2204,7 @@ inline void buf_page_t::set_corrupt_id()
     break;
   case BUF_BLOCK_ZIP_PAGE:
   case BUF_BLOCK_FILE_PAGE:
-    ut_ad(rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X));
+    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
    break;
   case BUF_BLOCK_NOT_USED:
   case BUF_BLOCK_MEMORY:
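[Reviewer note: illustrative, not part of the patch]
The key layout decision in page_hash_table: latches live inside the hash array
itself, one hash_cell_t-sized slot in front of every ELEMENTS_PER_LATCH
payload cells, so a lookup touches the latch and the bucket on the same cache
lines. A worked example of the arithmetic; it compiles stand-alone, and only
ELEMENTS_PER_LATCH and pad() are taken from the patch:

    #include <cassert>
    #include <cstddef>

    static constexpr std::size_t ELEMENTS_PER_LATCH = 1023; // 2^10 - 1

    // pad(): payload index -> array index, leaving room for one latch
    // cell in front of every 1023 payload cells.
    constexpr std::size_t pad(std::size_t h)
    { return 1 + (h / ELEMENTS_PER_LATCH) + h; }

    int main()
    {
      assert(pad(0) == 1);        // first payload cell, after latch cell 0
      assert(pad(1022) == 1023);  // last payload cell guarded by latch 0
      assert(pad(1023) == 1025);  // next run starts after latch cell 1024
      // lock_get() finds the guarding latch by clearing the low bits:
      assert((pad(500) & ~ELEMENTS_PER_LATCH) == 0);
      assert((pad(1500) & ~ELEMENTS_PER_LATCH) == 1024);
    }

This also explains the skip in resize_hash() above: array indices where
(i & ELEMENTS_PER_LATCH) == 0 hold latches, not page lists.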
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index ed3a6cabdb3..937cb427a47 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -153,7 +153,7 @@ buf_LRU_stat_update();
 @param id    page identifier
 @param hash_lock  buf_pool.page_hash latch (will be released here) */
 void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
-                           rw_lock_t *hash_lock)
+                           page_hash_latch *hash_lock)
   MY_ATTRIBUTE((nonnull));
 
 #ifdef UNIV_DEBUG
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
index 1802bd57ddd..55bd2ac3a5a 100644
--- a/storage/innobase/include/buf0types.h
+++ b/storage/innobase/include/buf0types.h
@@ -192,10 +192,43 @@ extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX];
 
 #include "ut0mutex.h"
 #include "sync0rw.h"
+#include "rw_lock.h"
 
 typedef ib_mutex_t BufPoolMutex;
 typedef ib_mutex_t FlushListMutex;
 typedef rw_lock_t BPageLock;
+
+class page_hash_latch : public rw_lock
+{
+public:
+  /** Wait for a shared lock */
+  void read_lock_wait();
+  /** Wait for an exclusive lock */
+  void write_lock_wait();
+
+  /** Acquire a shared lock */
+  inline void read_lock();
+  /** Acquire an exclusive lock */
+  inline void write_lock();
+
+  /** Acquire a lock */
+  template<bool exclusive> void acquire()
+  {
+    if (exclusive)
+      write_lock();
+    else
+      read_lock();
+  }
+  /** Release a lock */
+  template<bool exclusive> void release()
+  {
+    if (exclusive)
+      write_unlock();
+    else
+      read_unlock();
+  }
+};
+
 #endif /* !UNIV_INNOCHECKSUM */
 
 #endif /* buf0types.h */
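[Reviewer note: illustrative, not part of the patch]
The bool template parameter is resolved at compile time, so acquire<false>()
and release<false>() inline to the shared-lock pair while acquire<true>() and
release<true>() inline to the exclusive pair; page_hash_get_locked<exclusive>()
above picks a flavour without any runtime branch. A minimal usage sketch (the
helper name is hypothetical, not from the patch):

    template<bool exclusive>
    void with_bucket_latch(page_hash_latch *latch)
    {
      latch->acquire<exclusive>();  // write_lock() if exclusive, else read_lock()
      /* ... operate on one buf_pool.page_hash bucket ... */
      latch->release<exclusive>();  // write_unlock() or read_unlock()
    }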
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index 58da36fee5e..981ff5a0814 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -33,8 +33,6 @@ struct hash_cell_t{
 };
 typedef void* hash_node_t;
 
-#define hash_calc_hash(FOLD, TABLE) (TABLE)->calc_hash(FOLD)
-
 /*******************************************************************//**
 Inserts a struct to a hash table. */
@@ -145,7 +143,7 @@ Gets the next struct in a hash chain, NULL if none. */
 Looks for a struct in a hash table. */
 #define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
 {\
-	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
+	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD));\
 	HASH_ASSERT_VALID(DATA);\
 \
 	while ((DATA) != NULL) {\
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
new file mode 100644
index 00000000000..613adfef3f5
--- /dev/null
+++ b/storage/innobase/include/rw_lock.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <atomic>
+#include "my_dbug.h"
+
+/** Simple read-write lock based on std::atomic */
+class rw_lock
+{
+  /** The lock word */
+  std::atomic<uint32_t> lock;
+
+protected:
+  /** Available lock */
+  static constexpr uint32_t UNLOCKED= 0;
+  /** Flag to indicate that write_lock() is being held */
+  static constexpr uint32_t WRITER= 1U << 31;
+  /** Flag to indicate that write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_WAITING= 1U << 30;
+  /** Flag to indicate that write_lock() or write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
+
+  /** Yield a read lock request due to a conflict with a write lock.
+  @return the lock value */
+  uint32_t read_lock_yield()
+  {
+    uint32_t l= lock.fetch_sub(1, std::memory_order_relaxed);
+    DBUG_ASSERT(l & ~WRITER_PENDING);
+    return l;
+  }
+  /** Start waiting for an exclusive lock. */
+  void write_lock_wait_start()
+  { lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+  /** Wait for an exclusive lock.
+  @return whether the exclusive lock was acquired */
+  bool write_lock_poll()
+  {
+    auto l= WRITER_WAITING;
+    if (lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                     std::memory_order_relaxed))
+      return true;
+    if (!(l & WRITER_WAITING))
+      /* write_lock() must have succeeded for another thread */
+      write_lock_wait_start();
+    return false;
+  }
+
+public:
+  /** Default constructor */
+  rw_lock() : lock(UNLOCKED) {}
+
+  /** Release a shared lock */
+  void read_unlock()
+  {
+    IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(1, std::memory_order_release);
+    DBUG_ASSERT(l & ~WRITER_PENDING); /* at least one read lock */
+    DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
+  }
+  /** Release an exclusive lock */
+  void write_unlock()
+  {
+    IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
+    DBUG_ASSERT(l & WRITER); /* the write lock must have existed */
+  }
+  /** Try to acquire a shared lock.
+  @return whether the lock was acquired */
+  bool read_trylock()
+  { return !(lock.fetch_add(1, std::memory_order_acquire) & WRITER_PENDING); }
+  /** Try to acquire an exclusive lock.
+  @return whether the lock was acquired */
+  bool write_trylock()
+  {
+    auto l= UNLOCKED;
+    return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
+
+  /** @return whether an exclusive lock is being held by any thread */
+  bool is_write_locked() const
+  { return !!(lock.load(std::memory_order_relaxed) & WRITER); }
+  /** @return whether a shared lock is being held by any thread */
+  bool is_read_locked() const
+  {
+    auto l= lock.load(std::memory_order_relaxed);
+    return (l & ~WRITER_PENDING) && !(l & WRITER);
+  }
+  /** @return whether any lock is being held by any thread */
+  bool is_locked() const
+  { return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; }
+};
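[Reviewer note: illustrative, not part of the patch]
One subtlety worth flagging: read_trylock() leaves its fetch_add(1) increment
in the lock word even when it fails, which is why read_lock_wait() in
buf0buf.cc begins with read_lock_yield() to take the pending increment back.
The success paths can be exercised stand-alone:

    #include "rw_lock.h"
    #include <cassert>

    int main()
    {
      rw_lock l;
      assert(!l.is_locked());
      assert(l.read_trylock());    // reader count 0 -> 1
      assert(l.is_read_locked());
      assert(!l.write_trylock());  // CAS from UNLOCKED fails: a reader is in
      l.read_unlock();             // reader count back to 0
      assert(l.write_trylock());   // lock word becomes WRITER
      assert(l.is_write_locked());
      l.write_unlock();
      assert(!l.is_locked());
    }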
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 2aa874edfad..d4b6425c44b 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -333,8 +333,6 @@ extern const ulint srv_buf_pool_min_size;
 extern const ulint srv_buf_pool_def_size;
 /** Requested buffer pool chunk size */
 extern ulong srv_buf_pool_chunk_unit;
-/** Number of locks to protect buf_pool.page_hash */
-extern ulong srv_n_page_hash_locks;
 /** Scan depth for LRU flush batch i.e.: number of blocks scanned*/
 extern ulong srv_LRU_scan_depth;
 /** Whether or not to flush neighbors of a block */
diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic
index 7fcac01e5ba..169cbdd9aa5 100644
--- a/storage/innobase/include/sync0rw.ic
+++ b/storage/innobase/include/sync0rw.ic
@@ -226,22 +226,8 @@ rw_lock_lock_word_decr(
 		caused by concurrent executions of
 		rw_lock_s_lock(). */
 
-#if 1 /* FIXME: MDEV-22871 Spurious contention between rw_lock_s_lock() */
-
-		/* When the number of concurrently executing threads
-		exceeds the number of available processor cores,
-		multiple buf_pool.page_hash S-latch requests would
-		conflict here, mostly in buf_page_get_low(). We should
-		implement a simpler rw-lock where the S-latch
-		acquisition would be a simple fetch_add(1) followed by
-		either an optional load() loop to wait for the X-latch
-		to be released, or a fetch_sub(1) and a retry.
-
-		For now, we work around the problem with a delay in
-		this loop. It helped a little on some systems and was
-		reducing performance on others. */
-		(void) LF_BACKOFF();
-#endif
+		/* Note: unlike this implementation, rw_lock::read_lock()
+		allows concurrent calls without a spin loop */
 	}
 
 	/* A real conflict was detected. */
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index 7eb8250b63d..c63fedb43ee 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -126,7 +126,6 @@ extern mysql_pfs_key_t index_tree_rw_lock_key;
 extern mysql_pfs_key_t index_online_log_key;
 extern mysql_pfs_key_t dict_table_stats_key;
 extern mysql_pfs_key_t trx_sys_rw_lock_key;
-extern mysql_pfs_key_t hash_table_locks_key;
 #endif /* UNIV_PFS_RWLOCK */
 
 /** Prints info of the sync system.
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 84f91048068..8b8765b2748 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -207,9 +207,6 @@ const ulint srv_buf_pool_min_size = 5 * 1024 * 1024;
 const ulint srv_buf_pool_def_size = 128 * 1024 * 1024;
 /** Requested buffer pool chunk size */
 ulong srv_buf_pool_chunk_unit;
-/** innodb_page_hash_locks (a debug-only parameter);
-number of locks to protect buf_pool.page_hash */
-ulong srv_n_page_hash_locks = 64;
 /** innodb_lru_scan_depth; number of blocks scanned in LRU flush batch */
 ulong srv_LRU_scan_depth;
 /** innodb_flush_neighbors; whether or not to flush neighbors of a block */
diff --git a/storage/innobase/sync/sync0debug.cc b/storage/innobase/sync/sync0debug.cc
index e370dc86f34..08f8baab101 100644
--- a/storage/innobase/sync/sync0debug.cc
+++ b/storage/innobase/sync/sync0debug.cc
@@ -777,7 +777,7 @@ LatchDebug::check_order(
 	case SYNC_POOL:
 	case SYNC_POOL_MANAGER:
 	case SYNC_RECV_WRITER:
-
+	case SYNC_BUF_PAGE_HASH:
 		basic_check(latches, level, level);
 		break;
 
@@ -825,14 +825,6 @@ LatchDebug::check_order(
 		basic_check(latches, level, level - 1);
 		break;
 
-	case SYNC_BUF_PAGE_HASH:
-
-		/* Multiple page_hash locks are only allowed during
-		buf_pool.validate() and that is where buf_pool mutex is already
-		held. */
-
-		/* Fall through */
-
	case SYNC_REC_LOCK:
 
 		if (find(latches, SYNC_LOCK_SYS) != 0) {
@@ -1453,9 +1445,6 @@ sync_latch_meta_init()
 	LATCH_ADD_RWLOCK(DICT_TABLE_STATS, SYNC_INDEX_TREE,
 			 dict_table_stats_key);
 
-	LATCH_ADD_RWLOCK(HASH_TABLE_RW_LOCK, SYNC_BUF_PAGE_HASH,
-			 hash_table_locks_key);
-
 	LATCH_ADD_MUTEX(SYNC_DEBUG_MUTEX, SYNC_NO_ORDER_CHECK,
 			PFS_NOT_INSTRUMENTED);
 
diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc
index 85db39047ec..8b81d8a5ff8 100644
--- a/storage/innobase/sync/sync0sync.cc
+++ b/storage/innobase/sync/sync0sync.cc
@@ -102,7 +102,6 @@ mysql_pfs_key_t buf_block_debug_latch_key;
 # endif /* UNIV_DEBUG */
 mysql_pfs_key_t dict_operation_lock_key;
 mysql_pfs_key_t dict_table_stats_key;
-mysql_pfs_key_t hash_table_locks_key;
 mysql_pfs_key_t index_tree_rw_lock_key;
 mysql_pfs_key_t index_online_log_key;
 mysql_pfs_key_t fil_space_latch_key;
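[Reviewer note: illustrative, not part of the patch]
For reference while reading the PSI and sysvar cleanup above: the whole
latching protocol now lives in the 32-bit lock word of rw_lock. Some example
states, using the constants from rw_lock.h:

    0x00000000  unlocked (UNLOCKED)
    0x00000003  three concurrent readers
    0x40000002  two readers draining while write_lock_wait() is pending
    0x80000000  write-locked (WRITER)

write_lock_poll() acquires only via CAS(WRITER_WAITING -> WRITER), i.e. once
the reader count has reached zero while the waiting flag is still set, and
read_trylock() refuses to enter whenever either of the two top bits
(WRITER_PENDING) is set; together these keep waiting writers from being
starved by a steady stream of readers.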