From faa4d8f8c6c12c09541c4733773ac700c5e643cf Mon Sep 17 00:00:00 2001
From: Sergei Petrunia
Date: Tue, 28 Aug 2018 08:23:44 +0000
Subject: [PATCH] Copy of commit de1e8c7bfe7c875ea284b55040e8f3cd3a56fcc2

Author: Abhinav Sharma
Date: Thu Aug 23 14:34:39 2018 -0700

    Log updates to semi-sync whitelist in the error log

    Summary:
    Plugin variable changes are not logged in the error log even when
    log_global_var_changes is enabled. Logging updates to the whitelist will
    help in debugging.

    Reviewed By: guokeno0

    Differential Revision: D9483807

    fbshipit-source-id: e111cda773d
---
 storage/rocksdb/CMakeLists.txt | 23 +-
 storage/rocksdb/ha_rocksdb.cc | 1329 +++++++++++++----
 storage/rocksdb/ha_rocksdb.h | 47 +-
 storage/rocksdb/ha_rocksdb_proto.h | 7 +-
 .../rocksdb/mysql-test/rocksdb/combinations | 6 +
 .../rocksdb/include/have_write_committed.inc | 3 +
 .../rocksdb/r/add_index_inplace.result | 24 +
 .../r/add_index_inplace_sstfilewriter.result | 2 +-
 .../rocksdb/r/allow_no_primary_key.result | 25 +
 .../mysql-test/rocksdb/r/autoinc_debug.result | 40 +-
 .../mysql-test/rocksdb/r/autoinc_vars.result | 18 +
 .../mysql-test/rocksdb/r/bloomfilter5.result | 62 +
 .../rocksdb/r/bloomfilter_bulk_load.result | 15 +
 .../mysql-test/rocksdb/r/bulk_load_sk.result | 229 +++
 .../mysql-test/rocksdb/r/cardinality.result | 17 +-
 .../mysql-test/rocksdb/r/collation.result | 20 +-
 .../mysql-test/rocksdb/r/com_rpc_tx.result | 21 +
 .../r/create_no_primary_key_table.result | 38 +
 .../rocksdb/r/ddl_high_priority.result | 71 +-
 .../rocksdb/r/deadlock_tracking.result | 57 +
 .../mysql-test/rocksdb/r/drop_table.result | 11 +
 .../rocksdb/r/explicit_snapshot.result | 265 ++++
 .../mysql-test/rocksdb/r/i_s_deadlock.result | 55 +-
 .../rocksdb/r/index_file_map.result | 3 +
 .../rocksdb/r/index_merge_rocksdb2.result | 78 +-
 .../rocksdb/r/information_schema.result | 1 +
 .../mysql-test/rocksdb/r/issue255.result | 2 +-
 .../rocksdb/mysql-test/rocksdb/r/lock.result | 15 +
 .../mysql-test/rocksdb/r/mysqldump.result | 5 +-
 .../percona_nonflushing_analyze_debug.result | 19 +
 .../r/prefix_extractor_override.result | 7 +-
 .../mysql-test/rocksdb/r/read_only_tx.result | 2 +-
 .../mysql-test/rocksdb/r/rocksdb.result | 24 +-
 .../rocksdb/r/rocksdb_checksums.result | 6 +-
 .../r/rocksdb_deadlock_detect_rc.result | 16 +-
 .../r/rocksdb_deadlock_detect_rr.result | 16 +-
 .../r/secondary_key_update_lock.result | 11 +
 .../mysql-test/rocksdb/r/show_engine.result | 53 +
 .../mysql-test/rocksdb/r/transaction.result | 41 +
 .../mysql-test/rocksdb/r/trx_info_rpl.result | 2 +-
 .../mysql-test/rocksdb/r/update.result | 8 +
 .../rocksdb/r/use_direct_reads.result | 18 +
 .../rocksdb/r/validate_datadic.result | 3 -
 .../rocksdb/t/add_index_inplace.test | 18 +
 .../rocksdb/t/add_index_inplace_crash.test | 1 +
 .../t/add_index_inplace_sstfilewriter.test | 2 +-
 .../rocksdb/t/allow_no_primary_key.test | 28 +
 .../mysql-test/rocksdb/t/autoinc_debug.test | 17 +-
 .../mysql-test/rocksdb/t/autoinc_vars.test | 23 +
 .../rocksdb/t/bloomfilter5-master.opt | 3 +
 .../mysql-test/rocksdb/t/bloomfilter5.test | 61 +
 .../t/bloomfilter_bulk_load-master.opt | 2 +
 .../rocksdb/t/bloomfilter_bulk_load.test | 35 +
 .../mysql-test/rocksdb/t/bulk_load_sk.test | 119 ++
 .../mysql-test/rocksdb/t/cardinality.test | 21 +-
 .../t/check_ignore_unknown_options.test | 4 +-
 .../mysql-test/rocksdb/t/collation.test | 42 +-
 .../mysql-test/rocksdb/t/com_rpc_tx.cnf | 4 +
 .../mysql-test/rocksdb/t/com_rpc_tx.test | 87 ++
 .../t/create_no_primary_key_table-master.opt | 1 +
 .../t/create_no_primary_key_table.test | 44 +
 .../rocksdb/t/deadlock_tracking.test | 76 +-
 .../mysql-test/rocksdb/t/drop_table.test | 27 +
 .../rocksdb/t/explicit_snapshot-master.opt | 1 +
 .../rocksdb/t/explicit_snapshot.test | 263 ++++
 .../mysql-test/rocksdb/t/i_s_deadlock.test | 14 +-
 .../mysql-test/rocksdb/t/index_file_map.test | 3 +
 .../rocksdb/t/index_merge_rocksdb2-master.opt | 2 +-
 .../rocksdb/t/information_schema.test | 1 +
 .../t/insert_optimized_config-master.opt | 8 -
 .../rocksdb/t/insert_optimized_config.test | 10 +
 .../mysql-test/rocksdb/t/issue255.test | 2 +-
 .../rocksdb/mysql-test/rocksdb/t/lock.test | 22 +
 .../t/percona_nonflushing_analyze_debug.test | 11 +
 .../rocksdb/t/prefix_extractor_override.test | 12 +-
 .../rocksdb/mysql-test/rocksdb/t/rocksdb.test | 3 +-
 .../rocksdb/t/rocksdb_cf_options.test | 1 -
 .../rocksdb/t/rocksdb_checksums.test | 24 +-
 .../rocksdb/t/rocksdb_deadlock_detect.inc | 35 +-
 .../rocksdb/t/secondary_key_update_lock.test | 26 +
 .../mysql-test/rocksdb/t/set_checkpoint.inc | 2 +-
 .../mysql-test/rocksdb/t/show_engine.test | 13 +
 .../mysql-test/rocksdb/t/transaction.test | 53 +
 .../mysql-test/rocksdb/t/trx_info_rpl.test | 4 +-
 .../rocksdb/mysql-test/rocksdb/t/update.test | 10 +
 .../rocksdb/t/use_direct_reads.test | 37 +
 .../rocksdb/t/validate_datadic.test | 27 +-
 .../mysql-test/rocksdb_rpl/combinations | 8 +-
 .../r/rpl_missing_columns_sk_update.result | 62 +
 ...mts_dependency_unique_key_conflicts.result | 44 +
 .../rocksdb_rpl/r/rpl_rocksdb_snapshot.result | 4 +-
 .../t/rpl_gtid_crash_safe_wal_corrupt.inc | 2 +-
 .../t/rpl_missing_columns_sk_update.cnf | 13 +
 .../t/rpl_missing_columns_sk_update.test | 69 +
 ...l_mts_dependency_unique_key_conflicts.test | 64 +
 .../t/rpl_rocksdb_2pc_crash_recover.test | 1 +
 .../mysql-test/rocksdb_stress/combinations | 5 +
 .../r/rocksdb_block_cache_size_basic.result | 84 +-
 .../r/rocksdb_bulk_load_allow_sk_basic.result | 100 ++
 ...ommit_time_batch_for_recovery_basic.result | 121 ++
 .../r/rocksdb_create_checkpoint_basic.result | 2 +-
 ...debug_manual_compaction_delay_basic.result | 46 +
 ...error_on_suboptimal_collation_basic.result | 7 +
 ...sdb_manual_compaction_threads_basic.result | 93 ++
 ...ocksdb_max_manifest_file_size_basic.result | 2 +-
 ...ocksdb_max_manual_compactions_basic.result | 57 +
 .../r/rocksdb_max_row_locks_basic.result | 21 +
 .../r/rocksdb_stats_recalc_rate_basic.result | 53 +
 ...rocksdb_write_batch_max_bytes_basic.result | 4 +-
 .../r/rocksdb_write_policy_basic.result | 15 +
 .../t/rocksdb_block_cache_size_basic.test | 16 +-
 .../t/rocksdb_bulk_load_allow_sk_basic.test | 18 +
 ..._commit_time_batch_for_recovery_basic.test | 20 +
 .../t/rocksdb_create_checkpoint_basic.test | 2 +-
 ...b_debug_manual_compaction_delay_basic.test | 16 +
 ...b_error_on_suboptimal_collation_basic.test | 6 +
 ...cksdb_manual_compaction_threads_basic.test | 17 +
 .../rocksdb_max_manual_compactions_basic.test | 17 +
 .../t/rocksdb_max_row_locks_basic.test | 1 +
 .../t/rocksdb_stats_recalc_rate_basic.test | 17 +
 .../rocksdb_write_batch_max_bytes_basic.test | 4 +-
 .../t/rocksdb_write_policy_basic.test | 17 +
 storage/rocksdb/properties_collector.cc | 61 +-
 storage/rocksdb/properties_collector.h | 3 +-
 storage/rocksdb/rdb_cf_options.cc | 2 -
 storage/rocksdb/rdb_comparator.h | 46 +-
 storage/rocksdb/rdb_datadic.cc | 9 +-
 storage/rocksdb/rdb_datadic.h | 31 +-
 storage/rocksdb/rdb_i_s.cc | 167 ++-
 storage/rocksdb/rdb_i_s.h | 1 +
 storage/rocksdb/rdb_index_merge.cc | 5 +
 storage/rocksdb/rdb_perf_context.cc | 2 -
 storage/rocksdb/rdb_psi.cc | 17 +-
 storage/rocksdb/rdb_psi.h | 10 +-
 storage/rocksdb/rdb_sst_info.cc | 4 +-
 storage/rocksdb/rdb_threads.h | 31 +-
 storage/rocksdb/rdb_utils.cc | 23 -
 137 files changed, 4671 insertions(+), 683 deletions(-)
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/combinations
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/explicit_snapshot.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/percona_nonflushing_analyze_debug.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/secondary_key_update_lock.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/use_direct_reads.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5-master.opt
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bloomfilter_bulk_load-master.opt
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bloomfilter_bulk_load.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bulk_load_sk.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/com_rpc_tx.cnf
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/com_rpc_tx.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/create_no_primary_key_table-master.opt
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/create_no_primary_key_table.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/explicit_snapshot-master.opt
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/explicit_snapshot.test
 delete mode 100644 storage/rocksdb/mysql-test/rocksdb/t/insert_optimized_config-master.opt
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/percona_nonflushing_analyze_debug.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/secondary_key_update_lock.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/use_direct_reads.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_missing_columns_sk_update.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_mts_dependency_unique_key_conflicts.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_missing_columns_sk_update.cnf
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_missing_columns_sk_update.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_mts_dependency_unique_key_conflicts.test
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_stress/combinations
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_bulk_load_allow_sk_basic.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_commit_time_batch_for_recovery_basic.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_debug_manual_compaction_delay_basic.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_error_on_suboptimal_collation_basic.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_manual_compaction_threads_basic.result
 create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_max_manual_compactions_basic.result
 create mode 100644
storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_recalc_rate_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_write_policy_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_bulk_load_allow_sk_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_commit_time_batch_for_recovery_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_debug_manual_compaction_delay_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_error_on_suboptimal_collation_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_manual_compaction_threads_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_max_manual_compactions_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_stats_recalc_rate_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_write_policy_basic.test diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt index a5ef5651e1f..54b0b1fab6a 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -82,32 +82,43 @@ ENDIF() SET(rocksdb_static_libs ) IF (WITH_SNAPPY) + FIND_LIBRARY(SNAPPY_LIBRARY + NAMES libsnappy${PIC_EXT}.a snappy + HINTS ${WITH_SNAPPY}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_SNAPPY}/lib/libsnappy${PIC_EXT}.a) + ${SNAPPY_LIBRARY}) ADD_DEFINITIONS(-DSNAPPY) ENDIF() IF (WITH_LZ4) + FIND_LIBRARY(LZ4_LIBRARY + NAMES liblz4${PIC_EXT}.a lz4 + HINTS ${WITH_LZ4}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_LZ4}/lib/liblz4${PIC_EXT}.a) + ${LZ4_LIBRARY}) ADD_DEFINITIONS(-DLZ4) ENDIF() IF (WITH_BZ2) + FIND_LIBRARY(BZ2_LIBRARY + NAMES libbz2${PIC_EXT}.a bz2 + HINTS ${WITH_BZ2}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_BZ2}/lib/libbz2${PIC_EXT}.a) + ${BZ2_LIBRARY}) ADD_DEFINITIONS(-DBZIP2) ENDIF() IF (WITH_ZSTD) - SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_ZSTD}/lib/libzstd${PIC_EXT}.a) + SET(rocksdb_static_libs ${rocksdb_static_libs} ${ZSTD_LIBRARY}) ADD_DEFINITIONS(-DZSTD) ENDIF() IF (WITH_TBB) + FIND_LIBRARY(TBB_LIBRARY + NAMES libtbb${PIC_EXT}.a tbb + HINTS ${WITH_TBB}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_TBB}/lib/libtbb${PIC_EXT}.a) + ${TBB_LIBRARY}) ADD_DEFINITIONS(-DTBB) ENDIF() diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 961ca5d1d17..c7d7cc0bdc4 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -25,6 +25,7 @@ /* C++ standard header files */ #include +#include #include #include #include @@ -109,6 +110,58 @@ const std::string DEFAULT_CF_NAME("default"); const std::string DEFAULT_SYSTEM_CF_NAME("__system__"); const std::string PER_INDEX_CF_NAME("$per_index_cf"); +class Rdb_explicit_snapshot; + +std::mutex explicit_snapshot_mutex; +ulonglong explicit_snapshot_counter = 0; +std::unordered_map> + explicit_snapshots; +static std::vector rdb_indexes_to_recalc; + +class Rdb_explicit_snapshot : public explicit_snapshot { + std::unique_ptr snapshot; + + public: + static std::shared_ptr + create(snapshot_info_st *ss_info, rocksdb::DB *db, + const rocksdb::Snapshot *snapshot) { + std::lock_guard lock(explicit_snapshot_mutex); + auto s = std::unique_ptr( + new rocksdb::ManagedSnapshot(db, snapshot)); + if (!s) { + return nullptr; + } + ss_info->snapshot_id = ++explicit_snapshot_counter; + auto ret = 
std::make_shared(*ss_info, std::move(s)); + if (!ret) { + return nullptr; + } + explicit_snapshots[ss_info->snapshot_id] = ret; + return ret; + } + + static std::shared_ptr + get(const ulonglong snapshot_id) { + std::lock_guard lock(explicit_snapshot_mutex); + auto elem = explicit_snapshots.find(snapshot_id); + if (elem == explicit_snapshots.end()) { + return nullptr; + } + return elem->second.lock(); + } + + rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); } + + Rdb_explicit_snapshot(snapshot_info_st ss_info, + std::unique_ptr snapshot) + : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {} + + virtual ~Rdb_explicit_snapshot() { + std::lock_guard lock(explicit_snapshot_mutex); + explicit_snapshots.erase(ss_info.snapshot_id); + } +}; + /** Updates row counters based on the table type and operation type. */ @@ -126,11 +179,15 @@ static handler *rocksdb_create_handler(my_core::handlerton *hton, my_core::TABLE_SHARE *table_arg, my_core::MEM_ROOT *mem_root); -static rocksdb::CompactRangeOptions getCompactRangeOptions() { +static rocksdb::CompactRangeOptions +getCompactRangeOptions(int concurrency = 0) { rocksdb::CompactRangeOptions compact_range_options; compact_range_options.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce; compact_range_options.exclusive_manual_compaction = false; + if (concurrency > 0) { + compact_range_options.max_subcompactions = concurrency; + } return compact_range_options; } @@ -167,6 +224,8 @@ Rdb_io_watchdog *io_watchdog = nullptr; static Rdb_background_thread rdb_bg_thread; +static Rdb_manual_compaction_thread rdb_mc_thread; + // List of table names (using regex) that are exceptions to the strict // collation check requirement. Regex_list_handler *rdb_collation_exceptions; @@ -180,30 +239,6 @@ static void rocksdb_flush_all_memtables() { } } -static void rocksdb_compact_column_family_stub( - THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, - const void *const save) {} - -static int rocksdb_compact_column_family(THD *const thd, - struct st_mysql_sys_var *const var, - void *const var_ptr, - struct st_mysql_value *const value) { - char buff[STRING_BUFFER_USUAL_SIZE]; - int len = sizeof(buff); - - DBUG_ASSERT(value != nullptr); - - if (const char *const cf = value->val_str(value, buff, &len)) { - auto cfh = cf_manager.get_cf(cf); - if (cfh != nullptr && rdb != nullptr) { - sql_print_information("RocksDB: Manual compaction of column family: %s\n", - cf); - rdb->CompactRange(getCompactRangeOptions(), cfh, nullptr, nullptr); - } - } - return HA_EXIT_SUCCESS; -} - /////////////////////////////////////////////////////////// // Hash map: table name => open table handler /////////////////////////////////////////////////////////// @@ -353,6 +388,7 @@ static void rocksdb_drop_index_wakeup_thread( static my_bool rocksdb_pause_background_work = 0; static mysql_mutex_t rdb_sysvars_mutex; +static mysql_mutex_t rdb_block_cache_resize_mutex; static void rocksdb_set_pause_background_work( my_core::THD *const thd MY_ATTRIBUTE((__unused__)), @@ -435,6 +471,9 @@ static void rocksdb_set_wal_bytes_per_sync(THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr, const void *const save); +static int rocksdb_validate_set_block_cache_size( + THD *thd, struct st_mysql_sys_var *const var, void *var_ptr, + struct st_mysql_value *value); ////////////////////////////////////////////////////////////////////////////// // Options definitions ////////////////////////////////////////////////////////////////////////////// 
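A note on the Rdb_explicit_snapshot class defined above: every explicit snapshot registers itself in a process-wide map of weak_ptrs guarded by a mutex, so that SNAPSHOT_ATTACH can look a live snapshot up by id while the registry itself never keeps a snapshot alive. A minimal, self-contained sketch of that pattern using only standard types (Registry and Snap are invented names for illustration; the real class also erases its map entry in the destructor):

#include <memory>
#include <mutex>
#include <unordered_map>

struct Snap {  // stand-in for Rdb_explicit_snapshot
  unsigned long long id;
  explicit Snap(unsigned long long i) : id(i) {}
};

class Registry {
  std::mutex mu_;
  unsigned long long next_id_ = 0;
  std::unordered_map<unsigned long long, std::weak_ptr<Snap>> map_;

 public:
  // create() hands ownership to the caller; the registry only keeps a
  // weak_ptr, so the entry dies when the last user releases the snapshot.
  std::shared_ptr<Snap> create() {
    std::lock_guard<std::mutex> lock(mu_);
    auto s = std::make_shared<Snap>(++next_id_);
    map_[s->id] = s;
    return s;
  }

  // get() is what SNAPSHOT_ATTACH does: lock() yields nullptr if the
  // snapshot has already been released by everyone.
  std::shared_ptr<Snap> get(unsigned long long id) {
    std::lock_guard<std::mutex> lock(mu_);
    auto it = map_.find(id);
    return it == map_.end() ? nullptr : it->second.lock();
  }
};

int main() {
  Registry reg;
  auto s = reg.create();
  auto attached = reg.get(s->id);  // attach while alive: non-null
  s.reset();
  attached.reset();
  return reg.get(1) == nullptr ? 0 : 1;  // fully released: lookup fails
}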
@@ -490,11 +529,19 @@ static my_bool rocksdb_enable_bulk_load_api = 1; static my_bool rocksdb_print_snapshot_conflict_queries = 0; static my_bool rocksdb_large_prefix = 0; static my_bool rocksdb_allow_to_start_after_corruption = 0; +static uint64_t rocksdb_write_policy = + rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; +static my_bool rocksdb_error_on_suboptimal_collation = 1; +static uint32_t rocksdb_stats_recalc_rate = 0; +static uint32_t rocksdb_debug_manual_compaction_delay = 0; +static uint32_t rocksdb_max_manual_compactions = 0; std::atomic rocksdb_row_lock_deadlocks(0); std::atomic rocksdb_row_lock_wait_timeouts(0); std::atomic rocksdb_snapshot_conflict_errors(0); std::atomic rocksdb_wal_group_syncs(0); +std::atomic rocksdb_manual_compactions_processed(0); +std::atomic rocksdb_manual_compactions_running(0); static std::unique_ptr rdb_init_rocksdb_db_options(void) { auto o = std::unique_ptr(new rocksdb::DBOptions()); @@ -519,6 +566,14 @@ static std::unique_ptr rocksdb_db_options = static std::shared_ptr rocksdb_rate_limiter; +/* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */ +static const char *write_policy_names[] = {"write_committed", "write_prepared", + "write_unprepared", NullS}; + +static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1, + "write_policy_typelib", + write_policy_names, nullptr}; + /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */ static const char *info_log_level_names[] = {"debug_level", "info_level", "warn_level", "error_level", @@ -610,6 +665,14 @@ static int rocksdb_validate_flush_log_at_trx_commit( *static_cast(var_ptr) = static_cast(new_value); return HA_EXIT_SUCCESS; } +static void rocksdb_compact_column_family_stub( + THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, + const void *const save) {} + +static int rocksdb_compact_column_family(THD *const thd, + struct st_mysql_sys_var *const var, + void *const var_ptr, + struct st_mysql_value *const value); static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS}; @@ -618,7 +681,8 @@ static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1, nullptr}; const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024; -const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024; +const ulong RDB_DEFAULT_MAX_ROW_LOCKS = 1024 * 1024; +const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024; const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000; const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024; const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024; @@ -649,6 +713,11 @@ static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG, /*min*/ 2, /*max*/ ULONG_MAX, 0); +static MYSQL_THDVAR_BOOL( + commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG, + "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr, + nullptr, FALSE); + static MYSQL_THDVAR_BOOL( trace_sst_api, PLUGIN_VAR_RQCMDARG, "Generate trace output in the log for each call to the SstFileWriter", @@ -660,6 +729,13 @@ static MYSQL_THDVAR_BOOL( "unique_checks and enables rocksdb_commit_in_the_middle.", rocksdb_check_bulk_load, nullptr, FALSE); +static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG, + "Allow bulk loading of sk keys during bulk-load. 
" + "Can be changed only when bulk load is disabled.", + /* Intentionally reuse unsorted's check function */ + rocksdb_check_bulk_load_allow_unsorted, nullptr, + FALSE); + static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG, "Allow unsorted input during bulk-load. " "Can be changed only when bulk load is disabled.", @@ -705,7 +781,7 @@ static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG, static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG, "Maximum number of locks a transaction can have", nullptr, nullptr, - /*default*/ RDB_MAX_ROW_LOCKS, + /*default*/ RDB_DEFAULT_MAX_ROW_LOCKS, /*min*/ 1, /*max*/ RDB_MAX_ROW_LOCKS, 0); @@ -757,6 +833,12 @@ static MYSQL_THDVAR_ULONGLONG( /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY, /* max */ SIZE_T_MAX, 1); +static MYSQL_THDVAR_INT( + manual_compaction_threads, PLUGIN_VAR_RQCMDARG, + "How many rocksdb threads to run for manual compactions", nullptr, nullptr, + /* default rocksdb.dboption max_subcompactions */ 0, + /* min */ 0, /* max */ 128, 0); + static MYSQL_SYSVAR_BOOL( create_if_missing, *reinterpret_cast(&rocksdb_db_options->create_if_missing), @@ -778,6 +860,12 @@ static MYSQL_SYSVAR_BOOL( "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr, rocksdb_db_options->manual_wal_flush); +static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "DBOptions::write_policy for RocksDB", nullptr, + nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED, + &write_policy_typelib); + static MYSQL_SYSVAR_BOOL( create_missing_column_families, *reinterpret_cast( @@ -988,7 +1076,9 @@ static MYSQL_SYSVAR_INT(table_cache_numshardbits, "DBOptions::table_cache_numshardbits for RocksDB", nullptr, nullptr, rocksdb_db_options->table_cache_numshardbits, - /* min */ 0, /* max */ INT_MAX, 0); + // LRUCache limits this to 19 bits, anything greater + // fails to create a cache and returns a nullptr + /* min */ 0, /* max */ 19, 0); static MYSQL_SYSVAR_ULONG(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -1099,8 +1189,9 @@ static MYSQL_SYSVAR_BOOL( "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true); static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "block_cache size for RocksDB", nullptr, nullptr, + PLUGIN_VAR_RQCMDARG, + "block_cache size for RocksDB", + rocksdb_validate_set_block_cache_size, nullptr, /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE, /* min */ RDB_MIN_BLOCK_CACHE_SIZE, /* max */ LONGLONG_MAX, @@ -1342,6 +1433,18 @@ static MYSQL_SYSVAR_BOOL( "on PK TTL data. This variable is a no-op in non-debug builds.", nullptr, nullptr, FALSE); +static MYSQL_SYSVAR_UINT( + max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG, + "Maximum number of pending + ongoing number of manual compactions.", + nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0); + +static MYSQL_SYSVAR_UINT( + debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay, + PLUGIN_VAR_RQCMDARG, + "For debugging purposes only. 
Sleeping specified seconds " + "for simulating long running compactions.", + nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0); + static MYSQL_SYSVAR_BOOL( reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG, "Reset the RocksDB internal statistics without restarting the DB.", nullptr, @@ -1497,6 +1600,13 @@ static MYSQL_SYSVAR_UINT( RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0, /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0); +static MYSQL_SYSVAR_UINT( + stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG, + "The number of indexes per second to recalculate statistics for. 0 to " + "disable background recalculation.", + nullptr, nullptr, 0 /* default value */, 0 /* min value */, + UINT_MAX /* max value */, 0); + static MYSQL_SYSVAR_BOOL( large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, "Support large index prefix length of 3072 bytes. If off, the maximum " @@ -1510,16 +1620,25 @@ static MYSQL_SYSVAR_BOOL( "detected.", nullptr, nullptr, FALSE); +static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation, + rocksdb_error_on_suboptimal_collation, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Raise an error instead of warning if a sub-optimal " + "collation is used", + nullptr, nullptr, TRUE); + static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100; static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(deadlock_detect), MYSQL_SYSVAR(deadlock_detect_depth), + MYSQL_SYSVAR(commit_time_batch_for_recovery), MYSQL_SYSVAR(max_row_locks), MYSQL_SYSVAR(write_batch_max_bytes), MYSQL_SYSVAR(lock_scanned_rows), MYSQL_SYSVAR(bulk_load), + MYSQL_SYSVAR(bulk_load_allow_sk), MYSQL_SYSVAR(bulk_load_allow_unsorted), MYSQL_SYSVAR(skip_unique_check_tables), MYSQL_SYSVAR(trace_sst_api), @@ -1537,6 +1656,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(create_if_missing), MYSQL_SYSVAR(two_write_queues), MYSQL_SYSVAR(manual_wal_flush), + MYSQL_SYSVAR(write_policy), MYSQL_SYSVAR(create_missing_column_families), MYSQL_SYSVAR(error_if_exists), MYSQL_SYSVAR(paranoid_checks), @@ -1652,6 +1772,11 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(allow_to_start_after_corruption), + MYSQL_SYSVAR(error_on_suboptimal_collation), + MYSQL_SYSVAR(stats_recalc_rate), + MYSQL_SYSVAR(debug_manual_compaction_delay), + MYSQL_SYSVAR(max_manual_compactions), + MYSQL_SYSVAR(manual_compaction_threads), nullptr}; static rocksdb::WriteOptions @@ -1666,6 +1791,50 @@ rdb_get_rocksdb_write_options(my_core::THD *const thd) { return opt; } +static int rocksdb_compact_column_family(THD *const thd, + struct st_mysql_sys_var *const var, + void *const var_ptr, + struct st_mysql_value *const value) { + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + DBUG_ASSERT(value != nullptr); + + if (const char *const cf = value->val_str(value, buff, &len)) { + auto cfh = cf_manager.get_cf(cf); + if (cfh != nullptr && rdb != nullptr) { + int mc_id = rdb_mc_thread.request_manual_compaction( + cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads)); + if (mc_id == -1) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "Can't schedule more manual compactions. 
" + "Increase rocksdb_max_manual_compactions or stop issuing " + "more manual compactions."); + return HA_EXIT_FAILURE; + } else if (mc_id < 0) { + return HA_EXIT_FAILURE; + } + // NO_LINT_DEBUG + sql_print_information("RocksDB: Manual compaction of column family: %s\n", + cf); + // Checking thd state every short cycle (100ms). This is for allowing to + // exiting this function without waiting for CompactRange to finish. + do { + my_sleep(100000); + } while (!thd->killed && + !rdb_mc_thread.is_manual_compaction_finished(mc_id)); + + if (thd->killed) { + // This cancels if requested compaction state is INITED. + // TODO(yoshinorim): Cancel running compaction as well once + // it is supported in RocksDB. + rdb_mc_thread.clear_manual_compaction_request(mc_id, true); + } + } + } + return HA_EXIT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////////////////// /** @@ -1763,9 +1932,16 @@ protected: bool m_is_delayed_snapshot = false; bool m_is_two_phase = false; - THD *m_thd = nullptr; +private: + /* + Number of write operations this transaction had when we took the last + savepoint (the idea is not to take another savepoint if we haven't made + any changes) + */ + ulonglong m_writes_at_last_savepoint; - rocksdb::ReadOptions m_read_opts; +protected: + THD *m_thd = nullptr; static std::multiset s_tx_list; static mysql_mutex_t s_tx_list_mutex; @@ -1814,7 +1990,16 @@ protected: return s; } +protected: + /* + The following two are helper functions to be overloaded by child classes. + They should provide RocksDB's savepoint semantics. + */ + virtual void do_set_savepoint() = 0; + virtual void do_rollback_to_savepoint() = 0; + public: + rocksdb::ReadOptions m_read_opts; const char *m_mysql_log_file_name; my_off_t m_mysql_log_offset; const char *m_mysql_gtid; @@ -1822,6 +2007,7 @@ protected: String m_detailed_error; int64_t m_snapshot_timestamp = 0; bool m_ddl_transaction; + std::shared_ptr m_explicit_snapshot; /* Tracks the number of tables in use through external_lock. @@ -1881,7 +2067,7 @@ protected: if (s.IsDeadlock()) { my_core::thd_mark_transaction_to_rollback(thd, - false /* just statement */); + true /* whole transaction */); m_detailed_error = String(); table_handler->m_deadlock_counter.inc(); rocksdb_row_lock_deadlocks++; @@ -1898,7 +2084,7 @@ protected: } m_detailed_error = String(" (snapshot conflict)", system_charset_info); table_handler->m_deadlock_counter.inc(); - return HA_ERR_LOCK_DEADLOCK; + return HA_ERR_ROCKSDB_STATUS_BUSY; } if (s.IsIOError() || s.IsCorruption()) { @@ -2298,6 +2484,52 @@ public: virtual bool is_tx_started() const = 0; virtual void start_tx() = 0; virtual void start_stmt() = 0; + + void set_initial_savepoint() { + /* + Set the initial savepoint. If the first statement in the transaction + fails, we need something to roll back to, without rolling back the + entire transaction. + */ + do_set_savepoint(); + m_writes_at_last_savepoint= m_write_count; + } + + /* + Called when a "top-level" statement inside a transaction completes + successfully and its changes become part of the transaction's changes. + */ + void make_stmt_savepoint_permanent() { + + // Take another RocksDB savepoint only if we had changes since the last + // one. This is very important for long transactions doing lots of + // SELECTs. 
+ if (m_writes_at_last_savepoint != m_write_count) + { + do_set_savepoint(); + m_writes_at_last_savepoint= m_write_count; + } + } + + + /* + Rollback to the savepoint we've set before the last statement + */ + void rollback_to_stmt_savepoint() { + if (m_writes_at_last_savepoint != m_write_count) { + do_rollback_to_savepoint(); + /* + RollbackToSavePoint "removes the most recent SetSavePoint()", so + we need to set it again so that next statement can roll back to this + stage. + It's ok to do it here at statement end (instead of doing it at next + statement start) because setting a savepoint is cheap. + */ + do_set_savepoint(); + m_writes_at_last_savepoint= m_write_count; + } + } + virtual void rollback_stmt() = 0; void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; } @@ -2465,7 +2697,15 @@ public: void acquire_snapshot(bool acquire_now) override { if (m_read_opts.snapshot == nullptr) { - if (is_tx_read_only()) { + const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>( + m_thd->get_explicit_snapshot()); + if (thd_ss) { + m_explicit_snapshot = thd_ss; + } + if (m_explicit_snapshot) { + auto snapshot = m_explicit_snapshot->get_snapshot()->snapshot(); + snapshot_created(snapshot); + } else if (is_tx_read_only()) { snapshot_created(rdb->GetSnapshot()); } else if (acquire_now) { m_rocksdb_tx->SetSnapshot(); @@ -2482,7 +2722,10 @@ public: if (m_read_opts.snapshot != nullptr) { m_snapshot_timestamp = 0; - if (is_tx_read_only()) { + if (m_explicit_snapshot) { + m_explicit_snapshot.reset(); + need_clear = false; + } else if (is_tx_read_only()) { rdb->ReleaseSnapshot(m_read_opts.snapshot); need_clear = false; } else { @@ -2591,6 +2834,10 @@ public: tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec); tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect); tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth); + // If this variable is set, this will write commit time write batch + // information on recovery or memtable flush. + tx_opts.use_only_the_last_commit_time_batch_for_recovery = + THDVAR(m_thd, commit_time_batch_for_recovery); tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes); write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); @@ -2609,9 +2856,20 @@ public: m_read_opts = rocksdb::ReadOptions(); + set_initial_savepoint(); + m_ddl_transaction = false; } + /* Implementations of do_*savepoint based on rocksdb::Transaction savepoints */ + void do_set_savepoint() override { + m_rocksdb_tx->SetSavePoint(); + } + + void do_rollback_to_savepoint() override { + m_rocksdb_tx->RollbackToSavePoint(); + } + /* Start a statement inside a multi-statement transaction. @@ -2624,7 +2882,6 @@ public: void start_stmt() override { // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation) acquire_snapshot(false); - m_rocksdb_tx->SetSavePoint(); } /* @@ -2635,7 +2892,7 @@ public: /* TODO: here we must release the locks taken since the start_stmt() call */ if (m_rocksdb_tx) { const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot(); - m_rocksdb_tx->RollbackToSavePoint(); + rollback_to_stmt_savepoint(); const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot(); if (org_snapshot != cur_snapshot) { @@ -2673,7 +2930,7 @@ public: /* This is a rocksdb write batch. This class doesn't hold or wait on any transaction locks (skips rocksdb transaction API) thus giving better - performance. The commit is done through rdb->GetBaseDB()->Commit(). + performance.
Currently this is only used for replication threads which are guaranteed to be non-conflicting. Any further usage of this class should completely @@ -2695,6 +2952,8 @@ private: bool commit_no_binlog() override { bool res = false; rocksdb::Status s; + rocksdb::TransactionDBWriteOptimizations optimize; + optimize.skip_concurrency_control = true; s = merge_auto_incr_map(m_batch->GetWriteBatch()); if (!s.ok()) { @@ -2705,7 +2964,7 @@ private: release_snapshot(); - s = rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch()); + s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch()); if (!s.ok()) { rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT); res = true; @@ -2723,6 +2982,15 @@ error: return res; } + /* Implementations of do_*savepoint based on rocksdb::WriteBatch savepoints */ + void do_set_savepoint() override { + m_batch->SetSavePoint(); + } + + void do_rollback_to_savepoint() override { + m_batch->RollbackToSavePoint(); + } + public: bool is_writebatch_trx() const override { return true; } @@ -2810,6 +3078,13 @@ public: get_for_update(rocksdb::ColumnFamilyHandle *const column_family, const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, bool exclusive) override { + if (value == nullptr) { + rocksdb::PinnableSlice pin_val; + rocksdb::Status s = get(column_family, key, &pin_val); + pin_val.Reset(); + return s; + } + return get(column_family, key, value); } @@ -2828,13 +3103,15 @@ public: write_opts.disableWAL = THDVAR(m_thd, write_disable_wal); write_opts.ignore_missing_column_families = THDVAR(m_thd, write_ignore_missing_column_families); + + set_initial_savepoint(); } - void start_stmt() override { m_batch->SetSavePoint(); } + void start_stmt() override {} void rollback_stmt() override { if (m_batch) - m_batch->RollbackToSavePoint(); + rollback_to_stmt_savepoint(); } explicit Rdb_writebatch_impl(THD *const thd) @@ -3043,6 +3320,8 @@ static int rocksdb_prepare(handlerton *const hton, THD *const thd, DEBUG_SYNC(thd, "rocksdb.prepared"); } + else + tx->make_stmt_savepoint_permanent(); return HA_EXIT_SUCCESS; } @@ -3214,11 +3493,9 @@ static int rocksdb_commit(handlerton *const hton, THD *const thd, } else { /* We get here when committing a statement within a transaction. - - We don't need to do anything here. tx->start_stmt() will notify - Rdb_transaction_impl that another statement has started. */ tx->set_tx_failed(false); + tx->make_stmt_savepoint_permanent(); } if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { @@ -3406,6 +3683,7 @@ private: if (!path_entry.path.empty() && !path_entry.limit_exceeded) { auto deadlocking_txn = *(path_entry.path.end() - 1); deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id; + deadlock_info.deadlock_time = path_entry.deadlock_time; } return deadlock_info; } @@ -3452,16 +3730,18 @@ private: path_data += "\n*** DEADLOCK PATH\n" "=========================================\n"; const auto dl_info = get_dl_path_trx_info(path_entry); + const auto deadlock_time = dl_info.deadlock_time; for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) { const auto trx_info = *it; path_data += format_string( + "TIMESTAMP: %" PRId64 "\n" "TRANSACTION ID: %u\n" "COLUMN FAMILY NAME: %s\n" "WAITING KEY: %s\n" "LOCK TYPE: %s\n" "INDEX NAME: %s\n" "TABLE NAME: %s\n", - trx_info.trx_id, trx_info.cf_name.c_str(), + deadlock_time, trx_info.trx_id, trx_info.cf_name.c_str(), trx_info.waiting_key.c_str(), trx_info.exclusive_lock ?
"EXCLUSIVE" : "SHARED", trx_info.index_name.c_str(), trx_info.table_name.c_str()); @@ -3772,6 +4052,24 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, str, stat_print); } } + + /* Explicit snapshot information */ + str.clear(); + { + std::lock_guard lock(explicit_snapshot_mutex); + for (const auto &elem : explicit_snapshots) { + const auto &ss = elem.second.lock(); + DBUG_ASSERT(ss != nullptr); + const auto &info = ss->ss_info; + str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) + + "\nBinlog File: " + info.binlog_file + + "\nBinlog Pos: " + std::to_string(info.binlog_pos) + + "\nGtid Executed: " + info.gtid_executed + "\n"; + } + } + if (!str.empty()) { + res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print); + } } else if (stat_type == HA_ENGINE_TRX) { /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */ res |= rocksdb_show_snapshot_status(hton, thd, stat_print); @@ -3791,6 +4089,48 @@ static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd, } } +static bool rocksdb_explicit_snapshot( + handlerton *const /* hton */, /*!< in: RocksDB handlerton */ + THD *const thd, /*!< in: MySQL thread handle */ + snapshot_info_st *ss_info) /*!< out: Snapshot information */ +{ + switch (ss_info->op) { + case snapshot_operation::SNAPSHOT_CREATE: { + if (mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } + auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot()); + if (mysql_bin_log_is_open()) { + mysql_bin_log_unlock_commits(ss_info); + } + + thd->set_explicit_snapshot(s); + return s == nullptr; + } + case snapshot_operation::SNAPSHOT_ATTACH: { + auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id); + if (!s) { + return true; + } + *ss_info = s->ss_info; + thd->set_explicit_snapshot(s); + return false; + } + case snapshot_operation::SNAPSHOT_RELEASE: { + if (!thd->get_explicit_snapshot()) { + return true; + } + *ss_info = thd->get_explicit_snapshot()->ss_info; + thd->set_explicit_snapshot(nullptr); + return false; + } + default: + DBUG_ASSERT(false); + return true; + } + return true; +} + /* Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT @@ -3813,14 +4153,12 @@ static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd, InnoDB and RocksDB transactions. 
*/ static int rocksdb_start_tx_and_assign_read_view( - handlerton *const hton, /*!< in: RocksDB handlerton */ - THD *const thd, /*!< in: MySQL thread handle of the - user for whom the transaction should - be committed */ - char *const binlog_file, /* out: binlog file for last commit */ - ulonglong *const binlog_pos, /* out: binlog pos for last commit */ - char **gtid_executed, /* out: Gtids logged until last commit */ - int *const gtid_executed_length) /*out: Length of gtid_executed string */ + handlerton *const hton, /*!< in: RocksDB handlerton */ + THD *const thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + snapshot_info_st *ss_info) /*!< in/out: Snapshot info like binlog file, pos, + gtid executed and snapshot ID */ { ulong const tx_isolation = my_core::thd_tx_isolation(thd); @@ -3829,11 +4167,12 @@ static int rocksdb_start_tx_and_assign_read_view( return HA_EXIT_FAILURE; } - if (binlog_file) { - if (binlog_pos && mysql_bin_log_is_open()) - mysql_bin_log_lock_commits(); - else + if (ss_info) { + if (mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } else { return HA_EXIT_FAILURE; + } } Rdb_transaction *const tx = get_or_create_tx(thd); @@ -3844,13 +4183,94 @@ static int rocksdb_start_tx_and_assign_read_view( rocksdb_register_tx(hton, thd, tx); tx->acquire_snapshot(true); - if (binlog_file) - mysql_bin_log_unlock_commits(binlog_file, binlog_pos, gtid_executed, - gtid_executed_length); + if (ss_info) { + mysql_bin_log_unlock_commits(ss_info); + } return HA_EXIT_SUCCESS; } +static int rocksdb_start_tx_with_shared_read_view( + handlerton *const hton, /*!< in: RocksDB handlerton */ + THD *const thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + snapshot_info_st *ss_info) /*!< out: Snapshot info like binlog file, pos, + gtid executed and snapshot ID */ +{ + DBUG_ASSERT(thd != nullptr); + DBUG_ASSERT(ss_info != nullptr); + + int error = HA_EXIT_SUCCESS; + + ulong const tx_isolation = my_core::thd_tx_isolation(thd); + if (tx_isolation != ISO_REPEATABLE_READ) { + my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0)); + return HA_EXIT_FAILURE; + } + + std::shared_ptr explicit_snapshot; + const auto op = ss_info->op; + Rdb_transaction *tx = nullptr; + + DBUG_ASSERT(op == snapshot_operation::SNAPSHOT_CREATE || + op == snapshot_operation::SNAPSHOT_ATTACH); + + // case: if binlogs are available get binlog file/pos and gtid info + if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } + + if (op == snapshot_operation::SNAPSHOT_ATTACH) { + explicit_snapshot = Rdb_explicit_snapshot::get(ss_info->snapshot_id); + if (!explicit_snapshot) { + my_printf_error(ER_UNKNOWN_ERROR, "Snapshot %llu does not exist", MYF(0), + ss_info->snapshot_id); + error = HA_EXIT_FAILURE; + } + } + + // case: all good till now + if (error == HA_EXIT_SUCCESS) { + tx = get_or_create_tx(thd); + Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); + + if (explicit_snapshot) { + tx->m_explicit_snapshot = explicit_snapshot; + } + + DBUG_ASSERT(!tx->has_snapshot()); + tx->set_tx_read_only(true); + rocksdb_register_tx(hton, thd, tx); + tx->acquire_snapshot(true); + + // case: an explicit snapshot was not assigned to this transaction + if (!tx->m_explicit_snapshot) { + tx->m_explicit_snapshot = + Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot); + if (!tx->m_explicit_snapshot) { + my_printf_error(ER_UNKNOWN_ERROR, 
"Could not create snapshot", MYF(0)); + error = HA_EXIT_FAILURE; + } + } + } + + // case: unlock the binlog + if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) { + mysql_bin_log_unlock_commits(ss_info); + } + + DBUG_ASSERT(error == HA_EXIT_FAILURE || tx->m_explicit_snapshot); + + // copy over the snapshot details to pass to the upper layers + if (tx->m_explicit_snapshot) { + *ss_info = tx->m_explicit_snapshot->ss_info; + ss_info->op = op; + } + + return error; +} + /* Dummy SAVEPOINT support. This is needed for long running transactions * like mysqldump (https://bugs.mysql.com/bug.php?id=71017). * Current SAVEPOINT does not correctly handle ROLLBACK and does not return @@ -4062,9 +4482,11 @@ static int rocksdb_init_func(void *const p) { rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key); rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key, rdb_signal_drop_idx_psi_cond_key); + rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key); #else rdb_bg_thread.init(); rdb_drop_idx_thread.init(); + rdb_mc_thread.init(); #endif mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex, MY_MUTEX_INIT_FAST); @@ -4080,6 +4502,8 @@ static int rocksdb_init_func(void *const p) { mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex, MY_MUTEX_INIT_FAST); + mysql_mutex_init(rdb_block_cache_resize_mutex_key, + &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST); rdb_open_tables.init_hash(); Rdb_transaction::init_mutex(); @@ -4094,8 +4518,10 @@ static int rocksdb_init_func(void *const p) { rocksdb_hton->rollback = rocksdb_rollback; rocksdb_hton->db_type = DB_TYPE_ROCKSDB; rocksdb_hton->show_status = rocksdb_show_status; + rocksdb_hton->explicit_snapshot = rocksdb_explicit_snapshot; rocksdb_hton->start_consistent_snapshot = rocksdb_start_tx_and_assign_read_view; + rocksdb_hton->start_shared_snapshot = rocksdb_start_tx_with_shared_read_view; rocksdb_hton->savepoint_set = rocksdb_savepoint; rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint; rocksdb_hton->savepoint_rollback_can_release_mdl = @@ -4158,6 +4584,35 @@ static int rocksdb_init_func(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT + if (rocksdb_db_options->use_direct_reads) { + rocksdb::EnvOptions soptions; + rocksdb::Status check_status; + rocksdb::Env *const env = rocksdb_db_options->env; + + std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir); + if (env->FileExists(fname).ok()) { + std::unique_ptr file; + soptions.use_direct_reads = true; + check_status = env->NewSequentialFile(fname, &file, soptions); + } else { + std::unique_ptr file; + soptions.use_direct_writes = true; + check_status = env->ReopenWritableFile(fname, &file, soptions); + if (file != nullptr) { + file->Close(); + } + env->DeleteFile(fname); + } + + if (!check_status.ok()) { + sql_print_error("RocksDB: Unable to use direct io in rocksdb-datadir:" + "(%s)", check_status.getState()); + rdb_open_tables.free_hash(); + DBUG_RETURN(HA_EXIT_FAILURE); + } + } + if (rocksdb_db_options->allow_mmap_writes && rocksdb_db_options->use_direct_io_for_flush_and_compaction) { // See above comment for allow_mmap_reads. 
(NO_LINT_DEBUG) @@ -4309,8 +4764,10 @@ static int rocksdb_init_func(void *const p) { cf_options_map->get_defaults()); rocksdb::TransactionDBOptions tx_db_options; - tx_db_options.transaction_lock_timeout = 2; // 2 seconds + tx_db_options.transaction_lock_timeout = 2000; // 2 seconds tx_db_options.custom_mutex_factory = std::make_shared(); + tx_db_options.write_policy = + static_cast(rocksdb_write_policy); status = check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); @@ -4334,7 +4791,7 @@ static int rocksdb_init_func(void *const p) { } cf_manager.init(std::move(cf_options_map), &cf_handles); - if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) { + if (dict_manager.init(rdb, &cf_manager)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize data dictionary."); rdb_open_tables.free_hash(); @@ -4401,6 +4858,21 @@ static int rocksdb_init_func(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME +#ifdef HAVE_PSI_INTERFACE + , + rdb_mc_psi_thread_key +#endif + ); + if (err != 0) { + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Couldn't start the manual compaction thread: (errno=%d)", + err); + rdb_open_tables.free_hash(); + DBUG_RETURN(HA_EXIT_FAILURE); + } + rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions); if (rocksdb_pause_background_work) { @@ -4496,6 +4968,16 @@ static int rocksdb_done_func(void *const p) { sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err); } + // signal the manual compaction thread to stop + rdb_mc_thread.signal(true); + // Wait for the manual compaction thread to finish. + err = rdb_mc_thread.join(); + if (err != 0) { + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err); + } + if (rdb_open_tables.m_hash.records) { // Looks like we are getting unloaded and yet we have some open tables // left behind. 
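The O_DIRECT capability probe added to rocksdb_init_func() above writes a DIRECT_CHECK file through the rocksdb::Env with use_direct_reads/use_direct_writes set, and aborts startup if that fails. The same check can be approximated outside the server roughly like this (a Linux-only sketch using raw POSIX calls instead of rocksdb::Env; probe_direct_io is an invented helper):

#define _GNU_SOURCE  /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

// Returns true if 'dir' supports O_DIRECT opens, mirroring the idea of the
// DIRECT_CHECK probe: open a scratch file with O_DIRECT and see whether the
// open itself fails.
static bool probe_direct_io(const char *dir) {
  char path[4096];
  snprintf(path, sizeof(path), "%s/DIRECT_CHECK", dir);
  int fd = open(path, O_CREAT | O_WRONLY | O_DIRECT, 0600);
  if (fd < 0) {
    unlink(path);
    return false;  // e.g. on tmpfs, open(O_DIRECT) fails with EINVAL
  }
  close(fd);
  unlink(path);
  return true;
}

int main(int argc, char **argv) {
  const char *dir = argc > 1 ? argv[1] : ".";
  printf("%s: O_DIRECT %s\n", dir,
         probe_direct_io(dir) ? "ok" : "unsupported");
  return 0;
}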
@@ -4505,6 +4987,7 @@ static int rocksdb_done_func(void *const p) { rdb_open_tables.free_hash(); mysql_mutex_destroy(&rdb_open_tables.m_mutex); mysql_mutex_destroy(&rdb_sysvars_mutex); + mysql_mutex_destroy(&rdb_block_cache_resize_mutex); delete rdb_collation_exceptions; mysql_mutex_destroy(&rdb_collation_data_mutex); @@ -4709,6 +5192,12 @@ static ulonglong rdb_get_int_col_max_value(const Field *field) { case HA_KEYTYPE_LONGLONG: max_value = 0x7FFFFFFFFFFFFFFFULL; break; + case HA_KEYTYPE_FLOAT: + max_value = 0x1000000ULL; + break; + case HA_KEYTYPE_DOUBLE: + max_value = 0x20000000000000ULL; + break; default: abort(); } @@ -4896,7 +5385,6 @@ longlong ha_rocksdb::update_hidden_pk_val() { /* Get the id of the hidden pk id from m_last_rowkey */ int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) { - DBUG_ASSERT(hidden_pk_id != nullptr); DBUG_ASSERT(table != nullptr); DBUG_ASSERT(has_hidden_pk(table)); @@ -4953,14 +5441,14 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton, my_core::TABLE_SHARE *const table_arg) : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr), m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr), + m_scan_it_lower_bound(nullptr), m_scan_it_upper_bound(nullptr), m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr), m_pk_can_be_decoded(false), m_maybe_unpack_info(false), m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr), m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr), m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr), m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr), - m_dup_sk_packed_tuple_old(nullptr), m_eq_cond_lower_bound(nullptr), - m_eq_cond_upper_bound(nullptr), m_pack_buffer(nullptr), + m_dup_sk_packed_tuple_old(nullptr), m_pack_buffer(nullptr), m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE), m_encoder_arr(nullptr), m_row_checksums_checked(0), m_in_rpl_delete_rows(false), m_in_rpl_update_rows(false), m_force_skip_unique_check(false) {} @@ -5507,9 +5995,6 @@ int ha_rocksdb::convert_field_from_storage_format( int ha_rocksdb::convert_record_from_storage_format( const rocksdb::Slice *const key, const rocksdb::Slice *const value, uchar *const buf) { - DBUG_ASSERT(key != nullptr); - DBUG_ASSERT(buf != nullptr); - Rdb_string_reader reader(value); /* @@ -5753,7 +6238,6 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, DBUG_ENTER_FUNC(); DBUG_ASSERT(m_pk_tuple == nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); std::shared_ptr *const kd_arr = tbl_def_arg->m_key_descr_arr; @@ -5805,9 +6289,9 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, m_pack_buffer = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); - m_eq_cond_upper_bound = + m_scan_it_lower_bound = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); - m_eq_cond_lower_bound = + m_scan_it_upper_bound = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); /* @@ -5824,7 +6308,7 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr || m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr || m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr || - m_eq_cond_upper_bound == nullptr || m_eq_cond_lower_bound == nullptr || + m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr || (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr || m_dup_sk_packed_tuple_old == nullptr))) { // One or more of the above allocations failed. 
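Several read paths in the hunks below switch from treating rocksdb::Status::Busy as a deadlock to a dedicated HA_ERR_ROCKSDB_STATUS_BUSY code and retry while the snapshot is fresh (the repeated pattern "if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break;"). The shared control flow is essentially this (schematic only; read_once and release_snapshot are placeholders for the iterator logic):

#include <functional>

enum { OK = 0, STATUS_BUSY = 1, OTHER_ERR = 2 };

// Retry a snapshot-conflicting read, but only when the snapshot was created
// inside this call: a conflict against a pre-existing snapshot must be
// reported to the caller instead of silently retried.
int read_with_retry(const std::function<int()> &read_once,
                    const std::function<void()> &release_snapshot,
                    bool is_new_snapshot) {
  for (;;) {
    int rc = read_once();
    if (rc != STATUS_BUSY || !is_new_snapshot) return rc;
    release_snapshot();  // drop snapshot + iterator; regenerated on retry
  }
}

int main() {
  int attempts = 0;
  return read_with_retry(
      [&] { return ++attempts < 3 ? STATUS_BUSY : OK; },  // busy twice
      [] { /* release snapshot and iterator here */ },
      /* is_new_snapshot = */ true);  // returns OK after two retries
}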
Clean up and exit @@ -5864,11 +6348,11 @@ void ha_rocksdb::free_key_buffers() { my_free(m_dup_sk_packed_tuple_old); m_dup_sk_packed_tuple_old = nullptr; - my_free(m_eq_cond_upper_bound); - m_eq_cond_upper_bound = nullptr; + my_free(m_scan_it_lower_bound); + m_scan_it_lower_bound = nullptr; - my_free(m_eq_cond_lower_bound); - m_eq_cond_lower_bound = nullptr; + my_free(m_scan_it_upper_bound); + m_scan_it_upper_bound = nullptr; } void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) { @@ -6081,9 +6565,8 @@ bool ha_rocksdb::get_error_message(const int error, String *const buf) { static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST, "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST"); - DBUG_ASSERT(buf != nullptr); - - if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK) { + if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK || + error == HA_ERR_ROCKSDB_STATUS_BUSY) { Rdb_transaction *const tx = get_tx_from_thd(ha_thd()); DBUG_ASSERT(tx != nullptr); buf->append(tx->m_detailed_error); @@ -6165,10 +6648,10 @@ int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s, } if (opt_msg) { - my_error(ER_RDB_STATUS_MSG, MYF(0), opt_msg, s.code(), - s.ToString().c_str()); + std::string concatenated_error = s.ToString() + " (" + std::string(opt_msg) + ")"; + my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(), rocksdb_hton_name); } else { - my_error(ER_RDB_STATUS_GENERAL, MYF(0), s.code(), s.ToString().c_str()); + my_error(ER_GET_ERRMSG, MYF(0), s.code(), s.ToString().c_str(), rocksdb_hton_name); } return err; @@ -6213,7 +6696,6 @@ int ha_rocksdb::create_key_defs( /* = nullptr */) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); uint i; @@ -6284,9 +6766,7 @@ int ha_rocksdb::create_cfs( std::array *const cfs) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); char tablename_sys[NAME_LEN + 1]; @@ -6315,11 +6795,22 @@ int ha_rocksdb::create_cfs( } collation_err += coll->name; } - my_error(ER_UNSUPPORTED_COLLATION, MYF(0), - tbl_def_arg->full_tablename().c_str(), - table_arg->key_info[i].key_part[part].field->field_name, - collation_err.c_str()); - DBUG_RETURN(HA_EXIT_FAILURE); + + if (rocksdb_error_on_suboptimal_collation) { + my_error(ER_UNSUPPORTED_COLLATION, MYF(0), + tbl_def_arg->full_tablename().c_str(), + table_arg->key_info[i].key_part[part].field->field_name, + collation_err.c_str()); + DBUG_RETURN(HA_EXIT_FAILURE); + } else { + push_warning_printf( + ha_thd(), Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS, + "Unsupported collation on string indexed column %s.%s Use " + "binary collation (%s).", + tbl_def_arg->full_tablename().c_str(), + table_arg->key_info[i].key_part[part].field->field_name, + collation_err.c_str()); + } } } } @@ -6380,10 +6871,6 @@ int ha_rocksdb::create_inplace_key_defs( const std::array &cfs) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - DBUG_ASSERT(old_tbl_def_arg != nullptr); - std::shared_ptr *const old_key_descr = old_tbl_def_arg->m_key_descr_arr; std::shared_ptr *const new_key_descr = @@ -6451,11 +6938,6 @@ std::unordered_map ha_rocksdb::get_old_key_positions( const Rdb_tbl_def *const old_tbl_def_arg) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(old_table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - DBUG_ASSERT(old_tbl_def_arg != nullptr); - std::shared_ptr *const 
old_key_descr = old_tbl_def_arg->m_key_descr_arr; std::unordered_map old_key_pos; @@ -6521,9 +7003,6 @@ int ha_rocksdb::compare_keys(const KEY *const old_key, const KEY *const new_key) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(old_key != nullptr); - DBUG_ASSERT(new_key != nullptr); - /* Check index name. */ if (strcmp(old_key->name, new_key->name) != 0) { DBUG_RETURN(HA_EXIT_FAILURE); @@ -6554,9 +7033,6 @@ int ha_rocksdb::compare_key_parts(const KEY *const old_key, const KEY *const new_key) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(old_key != nullptr); - DBUG_ASSERT(new_key != nullptr); - /* Skip if key parts do not match, as it is a different key */ if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) { DBUG_RETURN(HA_EXIT_FAILURE); @@ -6601,7 +7077,6 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, const struct key_def_cf_info &cf_info) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(new_key_def != nullptr); DBUG_ASSERT(*new_key_def == nullptr); uint64 ttl_duration = 0; @@ -6688,8 +7163,6 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, int rdb_normalize_tablename(const std::string &tablename, std::string *const strbuf) { - DBUG_ASSERT(strbuf != nullptr); - if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/') { DBUG_ASSERT(0); // We were not passed table name? return HA_ERR_ROCKSDB_INVALID_TABLE; @@ -7004,8 +7477,6 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, const bool &full_key_match, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) { - DBUG_ASSERT(iter != nullptr); - /* We are looking for the first record such that index_tuple= lookup_tuple. @@ -7213,8 +7684,6 @@ int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd, } int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { - DBUG_ASSERT(buf != nullptr); - int rc; const rocksdb::Slice &rkey = m_scan_it->key(); const uint pk_size = rkey.size(); @@ -7238,8 +7707,6 @@ int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { int ha_rocksdb::read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd, bool move_forward) { - DBUG_ASSERT(buf != nullptr); - int rc = 0; uint pk_size; @@ -7342,7 +7809,6 @@ ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const { pair for. 
*/ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) { - DBUG_ASSERT(buf != nullptr); DBUG_ASSERT(table != nullptr); stats.rows_requested++; @@ -7612,7 +8078,7 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, else rc = read_row_from_secondary_key(buf, kd, move_forward); - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; /* Exit the loop */ // release the snapshot and iterator so they will be regenerated @@ -7655,8 +8121,6 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, uchar *const buf) { - DBUG_ASSERT(buf != nullptr); - if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) { const Rdb_key_def &kd = *m_key_descr_arr[active_index]; @@ -7974,8 +8438,6 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid, const bool skip_ttl_check) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - DBUG_ASSERT(rowid != nullptr); DBUG_ASSERT(table != nullptr); int rc; @@ -8190,8 +8652,6 @@ int ha_rocksdb::index_last(uchar *const buf) { int ha_rocksdb::index_first_intern(uchar *const buf) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - uchar *key; uint key_size; int rc; @@ -8205,7 +8665,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { DBUG_ASSERT(key != nullptr); const Rdb_key_def &kd = *m_key_descr_arr[active_index]; - kd.get_first_key(key, &key_size); + int key_start_matching_bytes = kd.get_first_key(key, &key_size); rocksdb::Slice index_key((const char *)key, key_size); @@ -8216,12 +8676,12 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { // Loop as long as we get a deadlock error AND we end up creating the // snapshot here (i.e. it did not exist prior to this) for (;;) { - setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE); + setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes); m_scan_it->Seek(index_key); m_skip_scan_it_next_call = true; rc = index_next_with_direction(buf, true); - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; // exit the loop // release the snapshot and iterator so they will be regenerated @@ -8281,8 +8741,6 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { int ha_rocksdb::index_last_intern(uchar *const buf) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - uchar *key; uint key_size; int rc; @@ -8324,7 +8782,7 @@ int ha_rocksdb::index_last_intern(uchar *const buf) { rc = secondary_index_read(active_index, buf); } - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; /* exit the loop */ // release the snapshot and iterator so they will be regenerated @@ -8405,7 +8863,6 @@ bool ha_rocksdb::commit_in_the_middle() { @retval false if bulk commit was skipped or succeeded */ bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { - DBUG_ASSERT(tx != nullptr); return commit_in_the_middle() && tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) && tx->flush_batch(); @@ -8418,7 +8875,6 @@ bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { 'auto-incremented' pk.) 
*/ bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { - DBUG_ASSERT(table != nullptr); return Rdb_key_def::table_has_hidden_pk(table); } @@ -8428,9 +8884,7 @@ bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { */ bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return (table_arg->s->primary_key == MAX_INDEXES && index == tbl_def_arg->m_key_count - 1); @@ -8439,9 +8893,7 @@ bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, /* Returns index of primary key */ uint ha_rocksdb::pk_index(const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1 : table_arg->s->primary_key; @@ -8450,9 +8902,7 @@ uint ha_rocksdb::pk_index(const TABLE *const table_arg, /* Returns true if given index number is a primary key */ bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return index == table_arg->s->primary_key || is_hidden_pk(index, table_arg, tbl_def_arg); @@ -8467,9 +8917,6 @@ uint ha_rocksdb::max_supported_key_part_length() const { const char *ha_rocksdb::get_key_name(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - if (is_hidden_pk(index, table_arg, tbl_def_arg)) { return HIDDEN_PK_NAME; } @@ -8483,9 +8930,6 @@ const char *ha_rocksdb::get_key_name(const uint index, const char *ha_rocksdb::get_key_comment(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - if (is_hidden_pk(index, table_arg, tbl_def_arg)) { return nullptr; } @@ -8537,7 +8981,6 @@ const std::string ha_rocksdb::generate_cf_name(const uint index, } const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); return table_arg->s->comment.str; @@ -8651,8 +9094,7 @@ int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id, /* If the keys are the same, then no lock is needed */ - if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice, - row_info.old_pk_slice)) { + if (!row_info.new_pk_slice.compare(row_info.old_pk_slice)) { *found = false; return HA_EXIT_SUCCESS; } @@ -8741,8 +9183,7 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, rocksdb::Slice((const char *)m_sk_packed_tuple, size); /* - For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs - always require locking. 
+ Acquire lock on the old key in case of UPDATE */ if (row_info.old_data != nullptr) { size = kd.pack_record(table, m_pack_buffer, row_info.old_data, @@ -8751,14 +9192,18 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, const rocksdb::Slice old_slice = rocksdb::Slice((const char *)m_sk_packed_tuple_old, size); - /* - For updates, if the keys are the same, then no lock is needed + const rocksdb::Status s = + get_for_update(row_info.tx, kd.get_cf(), old_slice, nullptr); + if (!s.ok()) { + return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler); + } - Also check to see if the key has any fields set to NULL. If it does, then - this key is unique since NULL is not equal to each other, so no lock is - needed. + /* + If the old and new keys are the same we're done since we've already taken + the lock on the old key */ - if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) { + if (!new_slice.compare(old_slice)) { return HA_EXIT_SUCCESS; } } @@ -8784,8 +9229,14 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, The bloom filter may need to be disabled for this lookup. */ + uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + rocksdb::Slice lower_bound_slice; + rocksdb::Slice upper_bound_slice; + const bool total_order_seek = !check_bloom_and_set_bounds( - ha_thd(), kd, new_slice, all_parts_used); + ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE, + lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice); const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache); const rocksdb::Status s = @@ -8796,9 +9247,8 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, } rocksdb::Iterator *const iter = row_info.tx->get_iterator( - kd.get_cf(), total_order_seek, fill_cache, - m_eq_cond_lower_bound_slice, m_eq_cond_upper_bound_slice, - true /* read current data */, + kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice, + upper_bound_slice, true /* read current data */, false /* acquire snapshot */); /* Need to scan the transaction to see if there is a duplicate key. 
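The uniqueness check above now builds its iterator bounds in caller-owned stack buffers instead of the shared m_eq_cond_* members, so this lookup can no longer clobber the bounds of an open table scan. The bound computation itself is plain byte arithmetic on the big-endian key prefix: the upper bound is the successor of the equality prefix, the lower bound its predecessor. A minimal standalone sketch of that arithmetic (the helper names mirror Rdb_key_def::successor()/predecessor(), but this is an illustration, not the MyRocks implementation):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Increment a big-endian byte string in place, yielding a key that sorts
// after every key carrying this prefix. Returns false on overflow.
static bool successor(uint8_t *buf, size_t len) {
  for (size_t i = len; i-- > 0;) {
    if (++buf[i] != 0x00) return true;  // no carry left, done
  }
  return false;  // all bytes were 0xFF
}

// Decrement a big-endian byte string in place, yielding a key that sorts
// before every key carrying this prefix. Returns false on underflow.
static bool predecessor(uint8_t *buf, size_t len) {
  for (size_t i = len; i-- > 0;) {
    if (buf[i]-- != 0x00) return true;  // no borrow needed, done
  }
  return false;  // all bytes were 0x00
}

int main() {
  // 4-byte index number, as in Rdb_key_def::INDEX_NUMBER_SIZE.
  uint8_t upper[4] = {0x00, 0x00, 0x01, 0x05};
  uint8_t lower[4] = {0x00, 0x00, 0x01, 0x05};
  successor(upper, sizeof(upper));    // -> 00 00 01 06
  predecessor(lower, sizeof(lower));  // -> 00 00 01 04
  printf("lower=%02x%02x%02x%02x upper=%02x%02x%02x%02x\n", lower[0], lower[1],
         lower[2], lower[3], upper[0], upper[1], upper[2], upper[3]);
  return 0;
}

For a reverse column family the two results simply swap roles, as setup_iterator_bounds() does later in this patch.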
@@ -8960,7 +9410,7 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, } } - if (table->next_number_field) { + if (table->found_next_number_field) { update_auto_incr_val_from_field(); } @@ -9008,9 +9458,11 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, } int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info) { + const struct update_row_info &row_info, + const bool bulk_load_sk) { int new_packed_size; int old_packed_size; + int rc = HA_EXIT_SUCCESS; rocksdb::Slice new_key_slice; rocksdb::Slice new_value_slice; @@ -9088,18 +9540,23 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, rocksdb::Slice(reinterpret_cast(m_sk_tails.ptr()), m_sk_tails.get_current_pos()); - row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, - new_value_slice); + if (bulk_load_sk && row_info.old_data == nullptr) { + rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true); + } else { + row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, + new_value_slice); + } row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() + new_value_slice.size()); - return HA_EXIT_SUCCESS; + return rc; } int ha_rocksdb::update_indexes(const struct update_row_info &row_info, const bool &pk_changed) { int rc; + bool bulk_load_sk; // The PK must be updated first to pull out the TTL value. rc = update_pk(*m_pk_descr, row_info, pk_changed); @@ -9107,13 +9564,17 @@ int ha_rocksdb::update_indexes(const struct update_row_info &row_info, return rc; } - // Update the remaining indexes. + // Update the remaining indexes. Allow bulk loading only if + // allow_sk is enabled + bulk_load_sk = rocksdb_enable_bulk_load_api && + THDVAR(table->in_use, bulk_load) && + THDVAR(table->in_use, bulk_load_allow_sk); for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { if (is_pk(key_id, table, m_tbl_def)) { continue; } - rc = update_sk(table, *m_key_descr_arr[key_id], row_info); + rc = update_sk(table, *m_key_descr_arr[key_id], row_info, bulk_load_sk); if (rc != HA_EXIT_SUCCESS) { return rc; } @@ -9206,25 +9667,22 @@ int ha_rocksdb::update_write_row(const uchar *const old_data, 0x0000b3eb003f65c5e78857, and lower bound would be 0x0000b3eb003f65c5e78859. These cover given eq condition range. 
*/ -void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond) { - uint eq_cond_len = eq_cond.size(); - DBUG_ASSERT(eq_cond_len >= Rdb_key_def::INDEX_NUMBER_SIZE); - memcpy(m_eq_cond_upper_bound, eq_cond.data(), eq_cond_len); - kd.successor(m_eq_cond_upper_bound, eq_cond_len); - memcpy(m_eq_cond_lower_bound, eq_cond.data(), eq_cond_len); - kd.predecessor(m_eq_cond_lower_bound, eq_cond_len); +void ha_rocksdb::setup_iterator_bounds( + const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len, + uchar *const lower_bound, uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) { + uint min_len = std::min(eq_cond.size(), bound_len); + memcpy(upper_bound, eq_cond.data(), min_len); + kd.successor(upper_bound, min_len); + memcpy(lower_bound, eq_cond.data(), min_len); + kd.predecessor(lower_bound, min_len); if (kd.m_is_reverse_cf) { - m_eq_cond_upper_bound_slice = - rocksdb::Slice((const char *)m_eq_cond_lower_bound, eq_cond_len); - m_eq_cond_lower_bound_slice = - rocksdb::Slice((const char *)m_eq_cond_upper_bound, eq_cond_len); + *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); + *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, min_len); } else { - m_eq_cond_upper_bound_slice = - rocksdb::Slice((const char *)m_eq_cond_upper_bound, eq_cond_len); - m_eq_cond_lower_bound_slice = - rocksdb::Slice((const char *)m_eq_cond_lower_bound, eq_cond_len); + *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, min_len); + *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); } } @@ -9236,7 +9694,6 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *const slice, const bool use_all_keys, const uint eq_cond_len) { - DBUG_ASSERT(slice != nullptr); DBUG_ASSERT(slice->size() >= eq_cond_len); Rdb_transaction *const tx = get_or_create_tx(table->in_use); @@ -9244,7 +9701,10 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, bool skip_bloom = true; const rocksdb::Slice eq_cond(slice->data(), eq_cond_len); - if (check_bloom_and_set_bounds(ha_thd(), kd, eq_cond, use_all_keys)) { + if (check_bloom_and_set_bounds( + ha_thd(), kd, eq_cond, use_all_keys, eq_cond_len, + m_scan_it_lower_bound, m_scan_it_upper_bound, + &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) { skip_bloom = false; } @@ -9285,8 +9745,8 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, m_scan_it = rdb->NewIterator(read_opts, kd.get_cf()); } else { m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache, - m_eq_cond_lower_bound_slice, - m_eq_cond_upper_bound_slice); + m_scan_it_lower_bound_slice, + m_scan_it_upper_bound_slice); } m_scan_it_skips_bloom = skip_bloom; } @@ -9305,12 +9765,12 @@ void ha_rocksdb::release_scan_iterator() { void ha_rocksdb::setup_iterator_for_rnd_scan() { uint key_size; - m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); + int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size); setup_scan_iterator(*m_pk_descr, &table_key, false, - Rdb_key_def::INDEX_NUMBER_SIZE); + key_start_matching_bytes); m_scan_it->Seek(table_key); m_skip_scan_it_next_call = true; } @@ -9353,7 +9813,7 @@ int ha_rocksdb::rnd_next(uchar *const buf) { ha_statistic_increment(&SSV::ha_read_rnd_next_count); for (;;) { rc = rnd_next_with_direction(buf, true); - if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot) + if (rc != 
HA_ERR_ROCKSDB_STATUS_BUSY || !m_rnd_scan_is_new_snapshot) break; /* exit the loop */ // release the snapshot and iterator and then regenerate them @@ -9676,7 +10136,7 @@ int ha_rocksdb::info(uint flag) { if (static_cast(stats.data_file_length) < 0 || static_cast(stats.index_file_length) < 0 || static_cast(stats.records) < 0) { - if (analyze(nullptr, nullptr)) { + if (calculate_stats_for_table()) { DBUG_RETURN(HA_EXIT_FAILURE); } @@ -9756,10 +10216,21 @@ int ha_rocksdb::info(uint flag) { KEY *const k = &table->key_info[i]; for (uint j = 0; j < k->actual_key_parts; j++) { const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats; - uint x = k_stats.m_distinct_keys_per_prefix.size() > j && - k_stats.m_distinct_keys_per_prefix[j] > 0 - ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j] - : 0; + uint x; + + if (k_stats.m_distinct_keys_per_prefix.size() > j && + k_stats.m_distinct_keys_per_prefix[j] > 0) { + x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j]; + /* + If the number of rows is less than the number of prefixes (due to + sampling), the average number of rows with the same prefix is 1. + */ + if (x == 0) { + x = 1; + } + } else { + x = 0; + } if (x > stats.records) x = stats.records; if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) || @@ -9932,7 +10403,7 @@ THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to, m_lock_rows = RDB_LOCK_WRITE; } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) { m_lock_rows = RDB_LOCK_READ; - } else { + } else if (lock_type != TL_IGNORE) { m_lock_rows = RDB_LOCK_NONE; if (THDVAR(thd, lock_scanned_rows)) { /* @@ -10023,22 +10494,24 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { } if (lock_type == F_UNLCK) { - Rdb_transaction *const tx = get_or_create_tx(thd); + Rdb_transaction *const tx = get_tx_from_thd(thd); - tx->io_perf_end_and_record(&m_io_perf); - tx->m_n_mysql_tables_in_use--; - if (tx->m_n_mysql_tables_in_use == 0 && - !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { - /* - Do like InnoDB: when we get here, it's time to commit a - single-statement transaction. + if (tx) { + tx->io_perf_end_and_record(&m_io_perf); + tx->m_n_mysql_tables_in_use--; + if (tx->m_n_mysql_tables_in_use == 0 && + !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + /* + Do like InnoDB: when we get here, it's time to commit a + single-statement transaction. - If the statement involved multiple tables, this code will be executed - for each of them, but that's ok because non-first tx->commit() calls - will be no-ops. - */ - if (tx->commit_or_rollback()) { - res = HA_ERR_INTERNAL_ERROR; + If the statement involved multiple tables, this code will be executed + for each of them, but that's ok because non-first tx->commit() calls + will be no-ops. + */ + if (tx->commit_or_rollback()) { + res = HA_ERR_INTERNAL_ERROR; + } } } } else { @@ -10075,6 +10548,11 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { DBUG_RETURN(HA_ERR_UNSUPPORTED); } + if (thd->get_explicit_snapshot()) { + my_error(ER_UPDATES_WITH_EXPLICIT_SNAPSHOT, MYF(0)); + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } + /* SQL layer signals us to take a write lock. It does so when starting DML statement. We should put locks on the rows we're reading. 
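The retry loops above now key off HA_ERR_ROCKSDB_STATUS_BUSY, the error raised for snapshot conflicts, which print_error() (added later in this patch) maps back to HA_ERR_LOCK_DEADLOCK for the client. Every caller (index_read_map_impl, index_first_intern, index_last_intern, rnd_next) shares one control-flow shape: retry only while the snapshot was created inside the current call. A condensed sketch of the pattern (the helper and the numeric error value are illustrative, not real MyRocks symbols):

#include <cstdio>
#include <functional>

static const int kSuccess = 0;
static const int kStatusBusy = 199;  // stand-in for HA_ERR_ROCKSDB_STATUS_BUSY

// Retry a read while it reports a snapshot conflict, but only when the
// snapshot was created inside this very call; a pre-existing snapshot
// belongs to the caller, which must see the error instead.
static int read_with_busy_retry(bool is_new_snapshot,
                                const std::function<int()> &do_read,
                                const std::function<void()> &refresh_snapshot) {
  int rc;
  for (;;) {
    rc = do_read();
    if (rc != kStatusBusy || !is_new_snapshot)
      break;  // success, a different error, or a caller-owned snapshot
    refresh_snapshot();  // release snapshot + iterator; regenerate next pass
  }
  return rc;
}

int main() {
  int conflicts = 2;  // simulate two transient snapshot conflicts
  int rc = read_with_busy_retry(
      true, [&] { return conflicts-- > 0 ? kStatusBusy : kSuccess; },
      [] { /* release and re-acquire the snapshot here */ });
  printf("rc=%d\n", rc);  // prints rc=0 after two retries
  return 0;
}

If the snapshot pre-existed (for example, an explicit or multi-statement snapshot), the conflict is surfaced to the caller rather than retried, which is why is_new_snapshot gates every one of these loops.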
@@ -10282,8 +10760,6 @@ void Rdb_drop_index_thread::run() { } Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) { - DBUG_ASSERT(tablename != nullptr); - std::string str; if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) { // We were not passed table name? @@ -10346,6 +10822,12 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { char key_buf[MAX_KEY_LENGTH]; uint key_len; ulonglong bytes_written = 0; + + uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + rocksdb::Slice lower_bound_slice; + rocksdb::Slice upper_bound_slice; + /* Remove all records in each index. (This is is not crash-safe, but it doesn't matter, because bulk row @@ -10356,9 +10838,12 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { kd.get_infimum_key(reinterpret_cast(key_buf), &key_len); rocksdb::ColumnFamilyHandle *cf = kd.get_cf(); const rocksdb::Slice table_key(key_buf, key_len); - setup_iterator_bounds(kd, table_key); - opts.iterate_lower_bound = &m_eq_cond_lower_bound_slice; - opts.iterate_upper_bound = &m_eq_cond_upper_bound_slice; + setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE, + lower_bound_buf, upper_bound_buf, &lower_bound_slice, + &upper_bound_slice); + DBUG_ASSERT(key_len == Rdb_key_def::INDEX_NUMBER_SIZE); + opts.iterate_lower_bound = &lower_bound_slice; + opts.iterate_upper_bound = &upper_bound_slice; std::unique_ptr it(rdb->NewIterator(opts, cf)); it->Seek(table_key); @@ -10397,9 +10882,6 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { int ha_rocksdb::rename_table(const char *const from, const char *const to) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(from != nullptr); - DBUG_ASSERT(to != nullptr); - std::string from_str; std::string to_str; std::string from_db; @@ -10652,29 +11134,28 @@ int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) { DBUG_RETURN(HA_EXIT_SUCCESS); } -int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, - HA_CHECK_OPT *const check_opt) { +static int calculate_stats( + const std::unordered_map> + &to_recalc, + bool include_memtables) { DBUG_ENTER_FUNC(); // find per column family key ranges which need to be queried std::unordered_map> ranges; - std::unordered_set ids_to_check; - std::vector buf(table_arg->s->keys * 2 * - Rdb_key_def::INDEX_NUMBER_SIZE); std::unordered_map stats; - for (uint i = 0; i < table_arg->s->keys; i++) { - const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE]; - const Rdb_key_def &kd = *m_key_descr_arr[i]; - const GL_INDEX_ID index_id = kd.get_gl_index_id(); - ranges[kd.get_cf()].push_back(get_range(i, bufp)); + std::vector buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE); + + uchar *bufp = buf.data(); + for (const auto &it : to_recalc) { + const GL_INDEX_ID index_id = it.first; + auto &kd = it.second; + ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp)); + bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE; - ids_to_check.insert(index_id); - // Initialize the stats to 0. If there are no files that contain - // this gl_index_id, then 0 should be stored for the cached stats. 
stats[index_id] = Rdb_index_stats(index_id); - DBUG_ASSERT(kd.get_key_parts() > 0); - stats[index_id].m_distinct_keys_per_prefix.resize(kd.get_key_parts()); + DBUG_ASSERT(kd->get_key_parts() > 0); + stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts()); } // get RocksDB table properties for these ranges @@ -10685,8 +11166,8 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, it.first, &it.second[0], it.second.size(), &props); DBUG_ASSERT(props.size() >= old_size); if (!status.ok()) { - DBUG_RETURN( - rdb_error_to_mysql(status, "Could not access RocksDB properties")); + DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql( + status, "Could not access RocksDB properties")); } } @@ -10707,61 +11188,62 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, other SQL tables, it can be that we're only seeing a small fraction of table's entries (and so we can't update statistics based on that). */ - if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end()) + if (stats.find(it1.m_gl_index_id) == stats.end()) { continue; + } - auto kd = ddl_manager.safe_find(it1.m_gl_index_id); - DBUG_ASSERT(kd != nullptr); - stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length()); + auto it_index = to_recalc.find(it1.m_gl_index_id); + DBUG_ASSERT(it_index != to_recalc.end()); + if (it_index == to_recalc.end()) { + continue; + } + stats[it1.m_gl_index_id].merge( + it1, true, it_index->second->max_storage_fmt_length()); } num_sst++; } - // calculate memtable cardinality - Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct); - auto read_opts = rocksdb::ReadOptions(); - read_opts.read_tier = rocksdb::ReadTier::kMemtableTier; - for (uint i = 0; i < table_arg->s->keys; i++) { - const Rdb_key_def &kd = *m_key_descr_arr[i]; - Rdb_index_stats &stat = stats[kd.get_gl_index_id()]; + if (include_memtables) { + // calculate memtable cardinality + Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct); + auto read_opts = rocksdb::ReadOptions(); + read_opts.read_tier = rocksdb::ReadTier::kMemtableTier; + for (const auto &it_kd : to_recalc) { + const std::shared_ptr &kd = it_kd.second; + Rdb_index_stats &stat = stats[kd->get_gl_index_id()]; - uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; - auto r = get_range(i, r_buf); - uint64_t memtableCount; - uint64_t memtableSize; - rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memtableCount, - &memtableSize); - if (memtableCount < (uint64_t)stat.m_rows / 10) { - // skip tables that already have enough stats from SST files to reduce - // overhead and avoid degradation of big tables stats by sampling from - // relatively tiny (less than 10% of full data set) memtable dataset - continue; - } - - std::unique_ptr it = std::unique_ptr( - rdb->NewIterator(read_opts, kd.get_cf())); - - uchar *first_key; - uint key_size; - if (is_pk(i, table, m_tbl_def)) { - first_key = m_pk_packed_tuple; - } else { - first_key = m_sk_packed_tuple; - } - kd.get_first_key(first_key, &key_size); - rocksdb::Slice first_index_key((const char *)first_key, key_size); - - cardinality_collector.Reset(); - for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) { - const rocksdb::Slice key = it->key(); - if (!kd.covers_key(key)) { - break; // end of this index + uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; + auto r = myrocks::get_range(*kd, r_buf); + uint64_t memtableCount; + uint64_t memtableSize; + rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount, + &memtableSize); 
+ if (memtableCount < (uint64_t)stat.m_rows / 10) { + // skip tables that already have enough stats from SST files to reduce + // overhead and avoid degradation of big tables stats by sampling from + // relatively tiny (less than 10% of full data set) memtable dataset + continue; } - stat.m_rows++; - cardinality_collector.ProcessKey(key, &kd, &stat); + std::unique_ptr it = + std::unique_ptr( + rdb->NewIterator(read_opts, kd->get_cf())); + + rocksdb::Slice first_index_key((const char *)r_buf, + Rdb_key_def::INDEX_NUMBER_SIZE); + + cardinality_collector.Reset(); + for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) { + const rocksdb::Slice key = it->key(); + if (!kd->covers_key(key)) { + break; // end of this index + } + stat.m_rows++; + + cardinality_collector.ProcessKey(key, kd.get(), &stat); + } + cardinality_collector.AdjustStats(&stat); } - cardinality_collector.AdjustStats(&stat); } // set and persist new stats @@ -10771,6 +11253,19 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, DBUG_RETURN(HA_EXIT_SUCCESS); } +int ha_rocksdb::calculate_stats_for_table() { + DBUG_ENTER_FUNC(); + + std::unordered_map> + ids_to_check; + for (uint i = 0; i < table->s->keys; i++) { + ids_to_check.insert(std::make_pair(m_key_descr_arr[i]->get_gl_index_id(), + m_key_descr_arr[i])); + } + + DBUG_RETURN(calculate_stats(ids_to_check, true)); +} + /* @return HA_ADMIN_OK OK @@ -10779,7 +11274,16 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) { DBUG_ENTER_FUNC(); - if (table && calculate_stats(table, thd, check_opt) != HA_EXIT_SUCCESS) { + if (table) { + if (calculate_stats_for_table() != HA_EXIT_SUCCESS) { + DBUG_RETURN(HA_ADMIN_FAILED); + } + } + + // A call to ::info is needed to repopulate some SQL level structs. This is + // necessary for online analyze because we cannot rely on another ::open + // call to call info for us. + if (info(HA_STATUS_CONST | HA_STATUS_VARIABLE) != HA_EXIT_SUCCESS) { DBUG_RETURN(HA_ADMIN_FAILED); } @@ -11184,7 +11688,9 @@ bool ha_rocksdb::prepare_inplace_alter_table( if (!new_tdef) { new_tdef = m_tbl_def; } - max_auto_incr = load_auto_incr_value_from_index(); + if (table->found_next_number_field) { + max_auto_incr = load_auto_incr_value_from_index(); + } } ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx( @@ -11636,18 +12142,6 @@ bool ha_rocksdb::commit_inplace_alter_table( dict_manager.finish_indexes_operation( create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING); - /* - We need to recalculate the index stats here manually. The reason is that - the secondary index does not exist inside - m_index_num_to_keydef until it is committed to the data dictionary, which - prevents us from updating the stats normally as the ddl_manager cannot - find the proper gl_index_ids yet during adjust_stats calls. 
- */ - if (calculate_stats(altered_table, nullptr, nullptr)) { - /* Failed to update index statistics, should never happen */ - DBUG_ASSERT(0); - } - rdb_drop_idx_thread.signal(); } @@ -11719,6 +12213,8 @@ struct rocksdb_status_counters_t { uint64_t block_cache_data_hit; uint64_t block_cache_data_add; uint64_t bloom_filter_useful; + uint64_t bloom_filter_full_positive; + uint64_t bloom_filter_full_true_positive; uint64_t memtable_hit; uint64_t memtable_miss; uint64_t get_hit_l0; @@ -11793,6 +12289,8 @@ DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS) DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT) DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD) DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL) +DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE) +DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE) DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT) DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS) DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0) @@ -12027,6 +12525,8 @@ static SHOW_VAR rocksdb_status_vars[] = { DEF_STATUS_VAR(block_cache_data_hit), DEF_STATUS_VAR(block_cache_data_add), DEF_STATUS_VAR(bloom_filter_useful), + DEF_STATUS_VAR(bloom_filter_full_positive), + DEF_STATUS_VAR(bloom_filter_full_true_positive), DEF_STATUS_VAR(memtable_hit), DEF_STATUS_VAR(memtable_miss), DEF_STATUS_VAR(get_hit_l0), @@ -12084,6 +12584,10 @@ static SHOW_VAR rocksdb_status_vars[] = { &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("manual_compactions_processed", + &rocksdb_manual_compactions_processed, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("manual_compactions_running", + &rocksdb_manual_compactions_running, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete, @@ -12158,6 +12662,42 @@ void Rdb_background_thread::run() { } } + // Recalculate statistics for indexes. + if (rocksdb_stats_recalc_rate) { + std::unordered_map> + to_recalc; + + if (rdb_indexes_to_recalc.empty()) { + struct Rdb_index_collector : public Rdb_tables_scanner { + int add_table(Rdb_tbl_def *tdef) override { + for (uint i = 0; i < tdef->m_key_count; i++) { + rdb_indexes_to_recalc.push_back( + tdef->m_key_descr_arr[i]->get_gl_index_id()); + } + return HA_EXIT_SUCCESS; + } + } collector; + ddl_manager.scan_for_tables(&collector); + } + + while (to_recalc.size() < rocksdb_stats_recalc_rate && + !rdb_indexes_to_recalc.empty()) { + const auto index_id = rdb_indexes_to_recalc.back(); + rdb_indexes_to_recalc.pop_back(); + + std::shared_ptr keydef = + ddl_manager.safe_find(index_id); + + if (keydef) { + to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef)); + } + } + + if (!to_recalc.empty()) { + calculate_stats(to_recalc, false); + } + } + // Set the next timestamp for mysql_cond_timedwait() (which ends up calling // pthread_cond_timedwait()) to wait on. ts_next_sync.tv_sec = ts.tv_sec + WAKE_UP_INTERVAL; @@ -12167,12 +12707,152 @@ void Rdb_background_thread::run() { ddl_manager.persist_stats(); } -bool ha_rocksdb::check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond, - const bool use_all_keys) { +/* + A background thread to handle manual compactions, + except for dropping indexes/tables. Every second, it checks + pending manual compactions, and it calls CompactRange if there is. 
+*/
+void Rdb_manual_compaction_thread::run() {
+  mysql_mutex_init(0, &m_mc_mutex, MY_MUTEX_INIT_FAST);
+  RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
+  for (;;) {
+    if (m_stop) {
+      break;
+    }
+    timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+    ts.tv_sec += 1;
+
+    const auto ret MY_ATTRIBUTE((__unused__)) =
+        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
+    if (m_stop) {
+      break;
+    }
+    // Make sure no program error is returned.
+    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
+    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
+
+    RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
+    // Grab the first item and proceed, if not empty.
+    if (m_requests.empty()) {
+      RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
+      RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
+      continue;
+    }
+    Manual_compaction_request &mcr = m_requests.begin()->second;
+    DBUG_ASSERT(mcr.cf != nullptr);
+    DBUG_ASSERT(mcr.state == Manual_compaction_request::INITED);
+    mcr.state = Manual_compaction_request::RUNNING;
+    RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
+
+    DBUG_ASSERT(mcr.state == Manual_compaction_request::RUNNING);
+    // NO_LINT_DEBUG
+    sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id,
+                          mcr.cf->GetName().c_str());
+    rocksdb_manual_compactions_running++;
+    if (rocksdb_debug_manual_compaction_delay > 0) {
+      my_sleep(rocksdb_debug_manual_compaction_delay * 1000000);
+    }
+    // CompactRange may take a very long time. On clean shutdown,
+    // it is cancelled by CancelAllBackgroundWork, then status is
+    // set to shutdownInProgress.
+    const rocksdb::Status s = rdb->CompactRange(
+        getCompactRangeOptions(mcr.concurrency), mcr.cf, mcr.start, mcr.limit);
+    rocksdb_manual_compactions_running--;
+    if (s.ok()) {
+      // NO_LINT_DEBUG
+      sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id,
+                            mcr.cf->GetName().c_str());
+    } else {
+      // NO_LINT_DEBUG
+      sql_print_information("Manual Compaction id %d cf %s aborted. %s",
+                            mcr.mc_id, mcr.cf->GetName().c_str(), s.getState());
+      if (!s.IsShutdownInProgress()) {
+        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
+      } else {
+        DBUG_ASSERT(m_requests.size() == 1);
+      }
+    }
+    rocksdb_manual_compactions_processed++;
+    clear_manual_compaction_request(mcr.mc_id, false);
+    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
+  }
+  clear_all_manual_compaction_requests();
+  DBUG_ASSERT(m_requests.empty());
+  RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
+  mysql_mutex_destroy(&m_mc_mutex);
+}
+
+void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() {
+  RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
+  m_requests.clear();
+  RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex);
+}
+
+void Rdb_manual_compaction_thread::clear_manual_compaction_request(
+    int mc_id, bool init_only) {
+  bool erase = true;
+  RDB_MUTEX_LOCK_CHECK(m_mc_mutex);
+  auto it = m_requests.find(mc_id);
+  if (it != m_requests.end()) {
+    if (init_only) {
+      Manual_compaction_request mcr = it->second;
+      if (mcr.state != Manual_compaction_request::INITED) {
+        erase = false;
+      }
+    }
+    if (erase) {
+      m_requests.erase(it);
+    }
+  } else {
+    // Current code path guarantees that erasing by the same mc_id happens
+    // at most once. INITED state may be erased by a thread that requested
+    // the compaction. RUNNING state is erased by mc thread only.
+ DBUG_ASSERT(0); + } + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); +} + +int Rdb_manual_compaction_thread::request_manual_compaction( + rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice *start, + rocksdb::Slice *limit, int concurrency) { + int mc_id = -1; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + if (m_requests.size() >= rocksdb_max_manual_compactions) { + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return mc_id; + } + Manual_compaction_request mcr; + mc_id = mcr.mc_id = ++m_latest_mc_id; + mcr.state = Manual_compaction_request::INITED; + mcr.cf = cf; + mcr.start = start; + mcr.limit = limit; + mcr.concurrency = concurrency; + m_requests.insert(std::make_pair(mcr.mc_id, mcr)); + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return mc_id; +} + +bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) { + bool finished = false; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + if (m_requests.count(mc_id) == 0) { + finished = true; + } + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return finished; +} + +bool ha_rocksdb::check_bloom_and_set_bounds( + THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, + const bool use_all_keys, size_t bound_len, uchar *const lower_bound, + uchar *const upper_bound, rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice) { bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys); if (!can_use_bloom) { - setup_iterator_bounds(kd, eq_cond); + setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound, + lower_bound_slice, upper_bound_slice); } return can_use_bloom; } @@ -12282,7 +12962,6 @@ void rdb_update_global_stats(const operation_type &type, uint count, int rdb_get_table_perf_counters(const char *const tablename, Rdb_perf_counters *const counters) { - DBUG_ASSERT(counters != nullptr); DBUG_ASSERT(tablename != nullptr); Rdb_table_handler *table_handler; @@ -12321,9 +13000,6 @@ const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) { // In case of core dump generation we want this function NOT to be optimized // so that we can capture as much data as possible to debug the root cause // more efficiently. -#pragma GCC push_options -#pragma GCC optimize("O0") - void rdb_handle_io_error(const rocksdb::Status status, const RDB_IO_ERROR_TYPE err_type) { if (status.IsIOError()) { @@ -12338,6 +13014,9 @@ void rdb_handle_io_error(const rocksdb::Status status, } case RDB_IO_ERROR_BG_THREAD: { rdb_log_status_error(status, "BG thread failed to write to RocksDB"); + /* NO_LINT_DEBUG */ + sql_print_error("MyRocks: aborting on BG write error."); + abort(); break; } case RDB_IO_ERROR_GENERAL: { @@ -12373,8 +13052,6 @@ void rdb_handle_io_error(const rocksdb::Status status, } } -#pragma GCC pop_options - Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; } Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; } @@ -12672,6 +13349,42 @@ static void rocksdb_set_wal_bytes_per_sync( RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +/* + Validating and updating block cache size via sys_var::check path. + SetCapacity may take seconds when reducing block cache, and + sys_var::update holds LOCK_global_system_variables mutex, so + updating block cache size is done at check path instead. 
+*/
+static int rocksdb_validate_set_block_cache_size(
+    THD *thd MY_ATTRIBUTE((__unused__)),
+    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
+    void *var_ptr, struct st_mysql_value *value) {
+  DBUG_ASSERT(value != nullptr);
+
+  long long new_value;
+
+  /* value is NULL */
+  if (value->val_int(value, &new_value)) {
+    return HA_EXIT_FAILURE;
+  }
+
+  if (new_value < RDB_MIN_BLOCK_CACHE_SIZE ||
+      (uint64_t)new_value > (uint64_t)LONGLONG_MAX) {
+    return HA_EXIT_FAILURE;
+  }
+
+  RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex);
+  const rocksdb::BlockBasedTableOptions &table_options =
+      rdb_get_table_options();
+
+  if (rocksdb_block_cache_size != new_value && table_options.block_cache) {
+    table_options.block_cache->SetCapacity(new_value);
+  }
+  *static_cast(var_ptr) = static_cast(new_value);
+  RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex);
+  return HA_EXIT_SUCCESS;
+}
+
 static int
 rocksdb_validate_update_cf_options(THD * /* unused */,
                                    struct st_mysql_sys_var * /*unused*/,
@@ -12849,6 +13562,13 @@ double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
   DBUG_RETURN((rows / 20.0) + 1);
 }
 
+void ha_rocksdb::print_error(int error, myf errflag) {
+  if (error == HA_ERR_ROCKSDB_STATUS_BUSY) {
+    error = HA_ERR_LOCK_DEADLOCK;
+  }
+  handler::print_error(error, errflag);
+}
+
 std::string rdb_corruption_marker_file_name() {
   std::string ret(rocksdb_datadir);
   ret.append("/ROCKSDB_CORRUPTED");
@@ -12885,5 +13605,6 @@ mysql_declare_plugin(rocksdb_se){
     myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
     myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
     myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
-    myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info,
-    myrocks::rdb_i_s_trx_info, myrocks::rdb_i_s_deadlock_info mysql_declare_plugin_end;
+    myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map,
+    myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info,
+    myrocks::rdb_i_s_deadlock_info mysql_declare_plugin_end;
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 0b3b473d841..ca141657413 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -104,6 +104,7 @@ struct Rdb_deadlock_info {
     std::string table_name;
   };
   std::vector path;
+  int64_t deadlock_time;
   ulonglong victim_trx_id;
 };
 
@@ -144,6 +145,11 @@ const char *const BG_THREAD_NAME = "myrocks-bg";
 */
 const char *const INDEX_THREAD_NAME = "myrocks-index";
 
+/*
+  Name for the manual compaction thread.
+*/
+const char *const MANUAL_COMPACTION_THREAD_NAME = "myrocks-mc";
+
 /*
   Separator between partition name and the qualifier.
   Sample usage:
@@ -485,6 +491,12 @@ class ha_rocksdb : public my_core::handler {
 
   const rocksdb::Snapshot *m_scan_it_snapshot;
 
+  /* Buffers used for upper/lower bounds for m_scan_it. */
+  uchar *m_scan_it_lower_bound;
+  uchar *m_scan_it_upper_bound;
+  rocksdb::Slice m_scan_it_lower_bound_slice;
+  rocksdb::Slice m_scan_it_upper_bound_slice;
+
   Rdb_tbl_def *m_tbl_def;
 
   /* Primary Key encoder from KeyTupleFormat to StorageFormat */
@@ -548,12 +560,6 @@ class ha_rocksdb : public my_core::handler {
   uchar *m_dup_sk_packed_tuple;
   uchar *m_dup_sk_packed_tuple_old;
 
-  /* Buffers used for passing upper/bound eq conditions. */
-  uchar *m_eq_cond_lower_bound;
-  uchar *m_eq_cond_upper_bound;
-  rocksdb::Slice m_eq_cond_lower_bound_slice;
-  rocksdb::Slice m_eq_cond_upper_bound_slice;
-
   /*
     Temporary space for packing VARCHARs (we provide it to
    pack_record()/pack_index_tuple() calls).
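As the comment on rocksdb_validate_set_block_cache_size() above explains, the resize runs in the sys_var check hook because Cache::SetCapacity() can stall for seconds when shrinking, and the update hook would hold LOCK_global_system_variables for that whole time. Stripped of the plugin plumbing, the logic is a range check plus a mutex-guarded conditional resize; a sketch under those assumptions (Cache here is a stand-in type, not the RocksDB class):

#include <cstdint>
#include <mutex>

static const int64_t kMinBlockCacheSize = 1024;  // stand-in for RDB_MIN_BLOCK_CACHE_SIZE

struct Cache {
  int64_t capacity = 0;
  void SetCapacity(int64_t bytes) { capacity = bytes; }  // slow when shrinking
};

static std::mutex resize_mutex;  // plays the role of rdb_block_cache_resize_mutex
static int64_t block_cache_size = int64_t{512} << 20;

// Returns true on success: reject out-of-range values, then apply the
// (potentially slow) resize without holding any global sys-var lock.
static bool validate_and_resize(Cache *cache, int64_t new_size) {
  if (new_size < kMinBlockCacheSize) return false;
  std::lock_guard<std::mutex> guard(resize_mutex);
  if (cache != nullptr && new_size != block_cache_size)
    cache->SetCapacity(new_size);
  block_cache_size = new_size;
  return true;
}

int main() {
  Cache cache;
  return validate_and_resize(&cache, int64_t{1} << 30) ? 0 : 1;
}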
@@ -635,13 +641,20 @@ class ha_rocksdb : public my_core::handler { enum ha_rkey_function find_flag) const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); void setup_iterator_bounds(const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond); + const rocksdb::Slice &eq_cond, size_t bound_len, + uchar *const lower_bound, uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, const bool use_all_keys); bool check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, - const bool use_all_keys); + const bool use_all_keys, size_t bound_len, + uchar *const lower_bound, + uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice, const bool use_all_keys, const uint eq_cond_len) MY_ATTRIBUTE((__nonnull__)); @@ -834,7 +847,7 @@ public: HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS | (m_pk_can_be_decoded ? HA_PRIMARY_KEY_IN_READ_INDEX : 0) | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY | - HA_PARTIAL_COLUMN_READ); + HA_PARTIAL_COLUMN_READ | HA_ONLINE_ANALYZE); } bool init_with_fields() override; @@ -1009,6 +1022,7 @@ public: } virtual double read_time(uint, uint, ha_rows rows) override; + virtual void print_error(int error, myf errflag) override; int open(const char *const name, int mode, uint test_if_locked) override MY_ATTRIBUTE((__warn_unused_result__)); @@ -1123,8 +1137,8 @@ private: MY_ATTRIBUTE((__nonnull__)); int compare_key_parts(const KEY *const old_key, - const KEY *const new_key) const; - MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + const KEY *const new_key) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); int compare_keys(const KEY *const old_key, const KEY *const new_key) const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); @@ -1180,7 +1194,7 @@ private: int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info, const bool &pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info) + const struct update_row_info &row_info, const bool bulk_load_sk) MY_ATTRIBUTE((__warn_unused_result__)); int update_indexes(const struct update_row_info &row_info, const bool &pk_changed) @@ -1234,7 +1248,9 @@ private: int finalize_bulk_load(bool print_client_error = true) MY_ATTRIBUTE((__warn_unused_result__)); -public: + int calculate_stats_for_table() MY_ATTRIBUTE((__warn_unused_result__)); + + public: int index_init(uint idx, bool sorted) override MY_ATTRIBUTE((__warn_unused_result__)); int index_end() override MY_ATTRIBUTE((__warn_unused_result__)); @@ -1327,9 +1343,6 @@ public: MY_ATTRIBUTE((__warn_unused_result__)); int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override MY_ATTRIBUTE((__warn_unused_result__)); - int calculate_stats(const TABLE *const table_arg, THD *const thd, - HA_CHECK_OPT *const check_opt) - MY_ATTRIBUTE((__warn_unused_result__)); enum_alter_inplace_result check_if_supported_inplace_alter( TABLE *altered_table, @@ -1356,7 +1369,7 @@ public: virtual void rpl_after_delete_rows() override; virtual void rpl_before_update_rows() override; virtual void rpl_after_update_rows() override; - virtual bool use_read_free_rpl(); + virtual bool use_read_free_rpl() override; private: /* Flags tracking if we are inside different replication operation */ diff --git 
a/storage/rocksdb/ha_rocksdb_proto.h b/storage/rocksdb/ha_rocksdb_proto.h index 71ec6957456..a2d014f2f01 100644 --- a/storage/rocksdb/ha_rocksdb_proto.h +++ b/storage/rocksdb/ha_rocksdb_proto.h @@ -39,7 +39,12 @@ enum RDB_IO_ERROR_TYPE { const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type); void rdb_handle_io_error(const rocksdb::Status status, - const RDB_IO_ERROR_TYPE err_type); + const RDB_IO_ERROR_TYPE err_type) +#if defined(__clang__) + MY_ATTRIBUTE((optnone)); +#else + MY_ATTRIBUTE((optimize("O0"))); +#endif int rdb_normalize_tablename(const std::string &tablename, std::string *str) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); diff --git a/storage/rocksdb/mysql-test/rocksdb/combinations b/storage/rocksdb/mysql-test/rocksdb/combinations new file mode 100644 index 00000000000..c3f6b9d0396 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/combinations @@ -0,0 +1,6 @@ +[write_committed] +rocksdb_write_policy=write_committed + +[write_prepared] +rocksdb_write_policy=write_prepared +rocksdb_commit_time_batch_for_recovery=on diff --git a/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc b/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc new file mode 100644 index 00000000000..681b966f680 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc @@ -0,0 +1,3 @@ +if (`select count(*) = 0 from information_schema.session_variables where variable_name = 'rocksdb_write_policy' and variable_value = 'write_committed';`) { + --skip Test requires write_committed policy +} diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result index b7b60f26816..b6b942c3d70 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result @@ -287,11 +287,13 @@ set global rocksdb_bulk_load=1; show global variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 show session variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 CREATE TABLE t1 (i INT, j INT, PRIMARY KEY (i)) ENGINE = ROCKSDB; @@ -335,6 +337,7 @@ SET session rocksdb_merge_buf_size = 340; show variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load OFF +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 CREATE TABLE t1 (a VARCHAR(80)) ENGINE=RocksDB; @@ -442,3 +445,24 @@ t1 CREATE TABLE `t1` ( KEY `kb` (`b`(8)) ) ENGINE=ROCKSDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin DROP TABLE t1; +SET @prior_rocksdb_table_stats_sampling_pct = @@rocksdb_table_stats_sampling_pct; +set global rocksdb_table_stats_sampling_pct = 100; +CREATE TABLE t1 (a INT, b INT, PRIMARY KEY ka(a)) ENGINE=RocksDB; +INSERT INTO t1 (a, b) VALUES (1, 10); +INSERT INTO t1 (a, b) VALUES (2, 10); +INSERT INTO t1 (a, b) VALUES (3, 20); +INSERT INTO t1 (a, b) VALUES (4, 20); +set global rocksdb_force_flush_memtable_now=1; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +SHOW INDEX in t1; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment +t1 0 PRIMARY 1 a A 4 NULL NULL LSMTREE +ALTER TABLE t1 ADD INDEX kb(b), ALGORITHM=INPLACE; +SHOW INDEX in t1; 
+Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment +t1 0 PRIMARY 1 a A 4 NULL NULL LSMTREE +t1 1 kb 1 b A 2 NULL NULL YES LSMTREE +DROP TABLE t1; +SET global rocksdb_table_stats_sampling_pct = @prior_rocksdb_table_stats_sampling_pct; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result index ec9c940f250..97310071ef2 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result @@ -15,7 +15,7 @@ count(b) 3000000 ALTER TABLE t1 ADD INDEX kb(b), ALGORITHM=INPLACE; ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY; -ERROR HY000: Status error 10 received from RocksDB: Operation aborted: Failed to acquire lock due to max_num_locks limit +ERROR HY000: Got error 10 'Operation aborted: Failed to acquire lock due to max_num_locks limit' from ROCKSDB set session rocksdb_bulk_load=1; ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY; set session rocksdb_bulk_load=0; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result b/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result index d86792a6469..6f811e177ba 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result @@ -262,3 +262,28 @@ SELECT * FROM t1; a b 36 foo DROP TABLE t1; +# +# Issue #834/MDEV-15304 ALTER TABLE table_with_hidden_pk causes Can't +# write; duplicate key in table error and/or crash +# +CREATE TABLE t1 (a INT, KEY(a)) ENGINE=RocksDB; +INSERT INTO t1 VALUES (1),(1+1); +create table t2 (a int); +insert into t2 values (10),(20),(30); +BEGIN; +select * from t2; +a +10 +20 +30 +alter table t1 force; +select * from t1; +a +insert into t1 values (100); +select * from t1; +a +1 +2 +100 +rollback; +drop table t1,t2; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result index 79030e35225..8b51c126e2c 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result @@ -59,12 +59,10 @@ insert into t values (); set debug="+d,crash_commit_before"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 4 -select max(i) from t; -max(i) -3 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After engine prepare begin; insert into t values (); @@ -72,12 +70,10 @@ insert into t values (); set debug="+d,crash_commit_after_prepare"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 4 -select max(i) from t; -max(i) -3 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After binlog begin; insert into t values (); @@ -85,12 +81,10 @@ insert into t 
values (); set debug="+d,crash_commit_after_log"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 6 -select max(i) from t; -max(i) -5 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After everything begin; insert into t values (); @@ -98,10 +92,8 @@ insert into t values (); set debug="+d,crash_commit_after"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 8 -select max(i) from t; -max(i) -7 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 drop table t; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result index 848d362f9e1..17e6bedb882 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result @@ -141,3 +141,21 @@ SELECT * FROM t1; a b 18446744073709551613 a DROP TABLE t1; +#---------------------------------- +# Issue #792 Crash in autoincrement +#---------------------------------- +CREATE TABLE t1(C1 DOUBLE AUTO_INCREMENT KEY,C2 CHAR) ENGINE=ROCKSDB; +INSERT INTO t1 VALUES(2177,0); +DROP TABLE t1; +CREATE TABLE t0(c0 BLOB) ENGINE=ROCKSDB; +INSERT INTO t0 VALUES(0); +ALTER TABLE t0 AUTO_INCREMENT=0; +DROP TABLE t0; +#---------------------------------- +# Issue #869 Crash in autoincrement +#---------------------------------- +CREATE TABLE t1 (pk INT AUTO_INCREMENT, a INT, PRIMARY KEY(pk)) ENGINE=RocksDB; +INSERT INTO t1 (a) VALUES (1); +UPDATE t1 SET pk = 3; +ALTER TABLE t1 AUTO_INCREMENT 2; +DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result new file mode 100644 index 00000000000..058d3608c75 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result @@ -0,0 +1,62 @@ +# +# Issue #809: Wrong query result with bloom filters +# +create table t1 ( +id1 bigint not null, +id2 bigint not null, +id3 varchar(100) not null, +id4 int not null, +id5 int not null, +value bigint, +value2 varchar(100), +primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1' +) engine=ROCKSDB; +create table t2(a int); +insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t3(seq int); +insert into t3 +select +1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000 +from t2 A, t2 B, t2 C, t2 D; +insert t1 +select +(seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc" +from t3; +set global rocksdb_force_flush_memtable_now=1; +# Full table scan +explain +select * from t1 limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 NULL +select * from t1 limit 10; +id1 id2 id3 id4 id5 value value2 +1000 2000 2000 10000 10000 1000 aaabbbccc +1000 2000 2000 9999 9999 1000 aaabbbccc +1000 2000 2000 9998 9998 1000 aaabbbccc +1000 2000 2000 9997 9997 1000 aaabbbccc +1000 2000 2000 9996 9996 1000 aaabbbccc +1000 1999 1999 9995 9995 
1000 aaabbbccc
+1000 1999 1999 9994 9994 1000 aaabbbccc
+1000 1999 1999 9993 9993 1000 aaabbbccc
+1000 1999 1999 9992 9992 1000 aaabbbccc
+1000 1999 1999 9991 9991 1000 aaabbbccc
+# An index scan starting from the end of the table:
+explain
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 122 NULL 1 NULL
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id1 id2 id3 id4 id5 value value2
+1000 2000 2000 10000 10000 1000 aaabbbccc
+create table t4 (
+pk int unsigned not null primary key,
+kp1 int unsigned not null,
+kp2 int unsigned not null,
+col1 int unsigned,
+key(kp1, kp2) comment 'rev:bf5_2'
+) engine=rocksdb;
+insert into t4 values (1, 0xFFFF, 0xFFF, 12345);
+# This must not fail an assert:
+select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc;
+pk kp1 kp2 col1
+drop table t1,t2,t3,t4;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result
new file mode 100644
index 00000000000..4b02d1103cf
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result
@@ -0,0 +1,15 @@
+create table r1 (id bigint primary key, value bigint) engine=rocksdb;
+create table r2 (id bigint, value bigint, primary key (id) comment 'cf2') engine=rocksdb;
+set session rocksdb_bulk_load=1;
+set session rocksdb_bulk_load=0;
+select variable_value into @h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit';
+insert into r1 values (100, 100);
+select variable_value-@h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit';
+variable_value-@h
+1
+select variable_value into @h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit';
+insert into r2 values (100, 100);
+select variable_value-@h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit';
+variable_value-@h
+0
+DROP TABLE r1, r2;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result
new file mode 100644
index 00000000000..42f820a2a42
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result
@@ -0,0 +1,229 @@
+SET rocksdb_bulk_load_size=15;
+CREATE TABLE t4 (a INT, b INT, c INT,
+PRIMARY KEY (a),
+KEY (b),
+KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB;
+CREATE TABLE t3 (a INT, b INT, c INT,
+PRIMARY KEY (a),
+KEY (b),
+KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB;
+CREATE TABLE t2 (a INT, b INT, c INT,
+PRIMARY KEY (a),
+KEY (b),
+KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB;
+CREATE TABLE t1 (a INT, b INT, c INT,
+PRIMARY KEY (a),
+KEY (b),
+KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB;
+SET rocksdb_bulk_load=1;
+INSERT INTO t1 SELECT * FROM t3 FORCE INDEX (PRIMARY) ORDER BY a;
+SELECT count(*) FROM t1 FORCE INDEX (PRIMARY);
+count(*)
+0
+SELECT count(*) FROM t1 FORCE INDEX (b);
+count(*)
+10
+SELECT count(*) FROM t1 FORCE INDEX (c);
+count(*)
+10
+SET rocksdb_bulk_load=0;
+SELECT * FROM t1 FORCE INDEX (PRIMARY);
+a b c
+-9 11 11
+-7 9 9
+-5 7 7
+-3 5 5
+-1 3 3
+2 0 0
+4 -2 -2
+6 -4 -4
+8 -6 -6
+10 -8 -8
+SELECT b FROM t1 FORCE INDEX (b);
+b
+-8
+-6
+-4
+-2
+0
+3
+5
+7
+9
+11
+SELECT c FROM t1 FORCE INDEX (c);
+c
+-8
+-6
+-4
+-2
+0
+3
+5
+7
+9
+11
+Checksums should match
+CHECKSUM TABLE t3;
+Table Checksum
+test.t3 3862424802
+CHECKSUM TABLE t1;
+Table Checksum
+test.t1 3862424802
+SET rocksdb_bulk_load_allow_sk=1;
+SET rocksdb_bulk_load=1;
+INSERT INTO t4 SELECT * FROM t3 FORCE INDEX (PRIMARY) ORDER BY a;
+SELECT count(*) FROM t4 FORCE INDEX (PRIMARY);
+count(*)
+0
+SELECT count(*) FROM t4 FORCE INDEX (b);
+count(*)
+0
+SELECT count(*) FROM t4 FORCE INDEX (c);
+count(*)
+0
+SET rocksdb_bulk_load=0;
+SELECT * FROM t4 FORCE INDEX (PRIMARY);
+a b c
+-9 11 11
+-7 9 9
+-5 7 7
+-3 5 5
+-1 3 3
+2 0 0
+4 -2 -2
+6 -4 -4
+8 -6 -6
+10 -8 -8
+SELECT b FROM t4 FORCE INDEX (b);
+b
+-8
+-6
+-4
+-2
+0
+3
+5
+7
+9
+11
+SELECT c FROM t4 FORCE INDEX (c);
+c
+-8
+-6
+-4
+-2
+0
+3
+5
+7
+9
+11
+Checksums should match
+CHECKSUM TABLE t3;
+Table Checksum
+test.t3 3862424802
+CHECKSUM TABLE t4;
+Table Checksum
+test.t4 3862424802
+SET rocksdb_bulk_load_allow_unsorted=1;
+SET rocksdb_bulk_load_allow_sk=1;
+SET rocksdb_bulk_load=1;
+INSERT INTO t2 SELECT * FROM t3 WHERE b >= 0 ORDER BY b;
+INSERT INTO t2 SELECT * FROM t3 WHERE b < 0 ORDER BY b;
+SELECT count(*) FROM t2 FORCE INDEX (PRIMARY);
+count(*)
+0
+SELECT count(*) FROM t2 FORCE INDEX (b);
+count(*)
+0
+SELECT count(*) FROM t2 FORCE INDEX (c);
+count(*)
+0
+SELECT count(*) FROM t2 FORCE INDEX (PRIMARY);
+count(*)
+0
+SELECT count(*) FROM t2 FORCE INDEX (b);
+count(*)
+0
+SELECT count(*) FROM t2 FORCE INDEX (c);
+count(*)
+0
+SET rocksdb_bulk_load=0;
+SELECT * FROM t2 FORCE INDEX (PRIMARY);
+a b c
+-19 21 21
+-17 19 19
+-15 17 17
+-13 15 15
+-11 13 13
+-9 11 11
+-7 9 9
+-5 7 7
+-3 5 5
+-1 3 3
+2 0 0
+4 -2 -2
+6 -4 -4
+8 -6 -6
+10 -8 -8
+12 -10 -10
+14 -12 -12
+16 -14 -14
+18 -16 -16
+20 -18 -18
+SELECT b FROM t2 FORCE INDEX (b);
+b
+-18
+-16
+-14
+-12
+-10
+-8
+-6
+-4
+-2
+0
+3
+5
+7
+9
+11
+13
+15
+17
+19
+21
+SELECT c FROM t2 FORCE INDEX (c);
+c
+-18
+-16
+-14
+-12
+-10
+-8
+-6
+-4
+-2
+0
+3
+5
+7
+9
+11
+13
+15
+17
+19
+21
+Checksums should match
+CHECKSUM TABLE t3;
+Table Checksum
+test.t3 1495594118
+CHECKSUM TABLE t2;
+Table Checksum
+test.t2 1495594118
+DROP TABLE t1;
+DROP TABLE t2;
+DROP TABLE t3;
+DROP TABLE t4;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result b/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result
index 94585af6cfc..029c68ae6b1 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result
@@ -82,4 +82,19 @@ t1 1 t1_5 2 c1 A 100000 NULL NULL YES LSMTREE
 SELECT table_name, table_rows FROM information_schema.tables WHERE table_schema = DATABASE();
 table_name table_rows
 t1 100000
-drop table t1;
+CREATE TABLE t2 (a INT, b INT, c INT, d INT, e INT, f INT, g INT,
+PRIMARY KEY (a), KEY (c, b, a, d, e, f, g))
+ENGINE=ROCKSDB;
+SET GLOBAL rocksdb_force_flush_memtable_now = 1;
+ANALYZE TABLE t2;
+Table Op Msg_type Msg_text
+test.t2 analyze status OK
+cardinality of the columns after 'a' must be equal to the cardinality of column 'a'
+SELECT CARDINALITY INTO @c FROM information_schema.statistics WHERE TABLE_NAME='t2' AND INDEX_NAME='c' AND COLUMN_NAME='a';
+SELECT COLUMN_NAME, CARDINALITY = @c FROM information_schema.statistics WHERE TABLE_NAME='t2' AND INDEX_NAME='c' AND SEQ_IN_INDEX > 3;
+COLUMN_NAME CARDINALITY = @c
+d 1
+e 1
+f 1
+g 1
+drop table t1, t2;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/collation.result b/storage/rocksdb/mysql-test/rocksdb/r/collation.result
index edd3fd1f187..b4354ad353f 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/collation.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/collation.result
@@ -1,6 +1,7 @@
-SET @start_global_value = @@global.ROCKSDB_STRICT_COLLATION_EXCEPTIONS;
-DROP TABLE IF EXISTS t1;
+call mtr.add_suppression("Invalid pattern");
 CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text) engine=rocksdb charset utf8;
+ALTER TABLE t1 ADD INDEX (value);
+ERROR HY000: Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin).
 DROP TABLE t1;
 CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value)) engine=rocksdb charset utf8;
 ERROR HY000: Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin).
@@ -13,6 +14,7 @@ SET GLOBAL rocksdb_strict_collation_check=1;
 CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value2)) engine=rocksdb charset utf8;
 DROP TABLE t1;
 CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset latin1 collate latin1_bin;
+ALTER TABLE t1 collate=latin1_general_ci;
 DROP TABLE t1;
 CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset utf8 collate utf8_bin;
 DROP TABLE t1;
@@ -126,4 +128,16 @@ CREATE TABLE abcd (id INT PRIMARY KEY, value varchar(50), index(value)) engine=r
 ERROR HY000: Unsupported collation on string indexed column test.abcd.value Use binary collation (binary, latin1_bin, utf8_bin).
 DROP TABLE abc;
 SET GLOBAL rocksdb_strict_collation_exceptions=null;
-SET GLOBAL rocksdb_strict_collation_exceptions=@start_global_value;
+SET GLOBAL rocksdb_strict_collation_check=1;
+CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value)) engine=rocksdb charset utf8;
+Warnings:
+Warning 1210 Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin).
+DROP TABLE t1;
+CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text) engine=rocksdb charset utf8;
+ALTER TABLE t1 ADD INDEX (value);
+Warnings:
+Warning 1210 Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin).
+DROP TABLE t1;
+CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset latin1 collate latin1_bin;
+ALTER TABLE t1 collate=latin1_general_ci;
+DROP TABLE t1;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result b/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result
new file mode 100644
index 00000000000..789ce12e900
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result
@@ -0,0 +1,21 @@
+CREATE DATABASE db_rpc;
+USE db_rpc;
+CREATE TABLE t1(pk INT PRIMARY KEY) ENGINE=rocksdb;
+SET GLOBAL rocksdb_enable_2pc=1;
+SET autocommit = 0;
+SET autocommit = 0;
+BEGIN;
+BEGIN;
+SELECT * from t1;
+pk
+SELECT * from t1;
+pk
+INSERT INTO t1 VALUES(1);
+INSERT INTO t1 VALUES(2);
+COMMIT;
+COMMIT;
+SELECT * from db_rpc.t1;
+pk
+1
+2
+DROP DATABASE db_rpc;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result
new file mode 100644
index 00000000000..e5aeb57ebdf
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result
@@ -0,0 +1,38 @@
+USE mysql;
+CREATE TABLE mysql_table (a INT) ENGINE=ROCKSDB;
+CREATE TABLE test.mysql_table (a INT) ENGINE=ROCKSDB;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+USE test;
+CREATE TABLE mysql_table (a INT) ENGINE=ROCKSDB;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+CREATE TABLE IF NOT EXISTS mysql_table_2 (a INT) ENGINE=ROCKSDB;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+CREATE TABLE mysql_table_no_cols ENGINE=ROCKSDB;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+CREATE TABLE mysql.mysql_table_2 (a INT) ENGINE=ROCKSDB;
+CREATE TABLE mysql_primkey (a INT PRIMARY KEY, b INT, c INT, d INT, INDEX (c)) ENGINE=ROCKSDB;
+ALTER TABLE mysql_primkey DROP b, DROP a, ADD (f INT PRIMARY KEY);
+ALTER TABLE mysql_primkey DROP PRIMARY KEY;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+CREATE TABLE mysql_primkey2 (a INT PRIMARY KEY, b INT, c INT) ENGINE=ROCKSDB;
+ALTER TABLE mysql_primkey2 DROP b;
+ALTER TABLE mysql_primkey2 ADD (b INT);
+ALTER TABLE mysql_primkey2 DROP c, DROP A;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+CREATE TABLE mysql_primkey3 (a INT PRIMARY KEY, b INT, c INT, INDEX indexonb (b), INDEX indexonc (c)) ENGINE=ROCKSDB;
+ALTER TABLE mysql_primkey3 DROP INDEX indexonb;
+ALTER TABLE mysql_primkey3 DROP c;
+ALTER TABLE mysql_primkey3 DROP PRIMARY KEY, ADD PRIMARY KEY(b);
+CREATE TABLE mysql_primkey4(a INT, b INT, PRIMARY KEY(a), INDEX si (a, b)) ENGINE=ROCKSDB;
+DROP INDEX si ON mysql_primkey4;
+DROP INDEX `PRIMARY` ON mysql_primkey4;
+ERROR HY000: Table without primary key cannot be created outside mysql schema.
+ALTER TABLE mysql.mysql_table ADD PRIMARY KEY (a);
+ALTER TABLE mysql.mysql_table DROP PRIMARY KEY;
+DROP TABLE mysql_primkey;
+DROP TABLE mysql_primkey2;
+DROP TABLE mysql_primkey3;
+DROP TABLE mysql_primkey4;
+USE mysql;
+DROP TABLE mysql_table;
+DROP TABLE mysql_table_2;
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result
index 39130475349..50733f81598 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result
@@ -45,7 +45,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 alter table t1 modify i bigint;;
 set high_priority_ddl = 0;
@@ -98,7 +98,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 alter table t1 rename t1_new;;
 set high_priority_ddl = 0;
@@ -152,7 +152,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 drop table t1;;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
@@ -202,7 +202,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 drop table t1;;
 set high_priority_ddl = 0;
@@ -251,7 +251,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 alter table t1 modify i bigint;;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
@@ -302,7 +302,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 create index idx1 on t1 (i);;
 set high_priority_ddl = 0;
@@ -342,7 +342,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 drop index idx1 on t1;;
 set high_priority_ddl = 0;
@@ -390,7 +390,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 truncate t1;;
 set high_priority_ddl = 0;
@@ -438,7 +438,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 create trigger ins_sum before insert on t1 for each row set @sum = @sum + new.i;;
 set high_priority_ddl = 0;
@@ -478,7 +478,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 drop trigger ins_sum;;
 set high_priority_ddl = 0;
@@ -528,7 +528,7 @@ set high_priority_ddl = 1;
 select @@high_priority_ddl;
 @@high_priority_ddl
 1
-lock tables t1 write;
+rename table t1 to t2;
 ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1
 optimize table t1;;
 Table Op Msg_type Msg_text
@@ -538,6 +538,55 @@ connection: default (for show processlist)
 show processlist;
 Id User Host db Command Time State Info Rows examined Rows sent Tid Srv_Id
 root test