diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc
index 8f46bcf1454..7dee801c921 100644
--- a/extra/mariabackup/xtrabackup.cc
+++ b/extra/mariabackup/xtrabackup.cc
@@ -2711,7 +2711,7 @@ static os_thread_ret_t DECLARE_THREAD(log_copying_thread)(void*)
 		log_mutex_enter();
 		bool completed = metadata_to_lsn
-			&& metadata_to_lsn < log_copy_scanned_lsn;
+			&& metadata_to_lsn <= log_copy_scanned_lsn;
 		log_mutex_exit();
 		if (completed) {
 			break;
diff --git a/man/CMakeLists.txt b/man/CMakeLists.txt
index f5899577a47..9c34ede59c7 100644
--- a/man/CMakeLists.txt
+++ b/man/CMakeLists.txt
@@ -26,12 +26,15 @@ SET(MAN1_SERVER innochecksum.1 my_print_defaults.1 myisam_ftdump.1 myisamchk.1
   mysqld_safe_helper.1 tokuftdump.1 wsrep_sst_common.1 wsrep_sst_mysqldump.1
   wsrep_sst_rsync.1 wsrep_sst_xtrabackup-v2.1 wsrep_sst_xtrabackup.1
-  galera_recovery.1 galera_new_cluster.1 tokuft_logdump.1)
+  galera_recovery.1 galera_new_cluster.1 tokuft_logprint.1
+  mysql_ldb.1
+  wsrep_sst_mariabackup.1 mbstream.1 mariabackup.1
+  wsrep_sst_rsync_wan.1)
 SET(MAN8_SERVER mysqld.8)
 SET(MAN1_CLIENT msql2mysql.1 mysql.1 mysql_find_rows.1 mysql_waitpid.1
   mysqlaccess.1 mysqladmin.1 mysqlbinlog.1 mysqlcheck.1
   mysqldump.1 mysqlimport.1 mysqlshow.1 mysqlslap.1
-  mysql_plugin.1)
+  mysql_plugin.1 mysql_embedded.1)
 SET(MAN1_DEVEL mysql_config.1)
 SET(MAN1_TEST mysql-stress-test.pl.1 mysql-test-run.pl.1 mysql_client_test.1
   mysqltest_embedded.1 mysql_client_test_embedded.1 my_safe_process.1)
diff --git a/man/mariabackup.1 b/man/mariabackup.1
new file mode 100644
index 00000000000..628c9cf74fa
--- /dev/null
+++ b/man/mariabackup.1
@@ -0,0 +1,16 @@
+'\" t
+.\"
+.TH "\FBMARIABACKUP\FR" "1" "9 August 2018" "MariaDB 10\&.1" "MariaDB Database System"
+.\" -----------------------------------------------------------------
+.\" * set default formatting
+.\" -----------------------------------------------------------------
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+.SH NAME
+mariabackup \- Backup tool
+.SH DESCRIPTION
+Use \fBmariabackup \-\-help\fR for details on usage\.
+.PP
+For more information, please refer to the MariaDB Knowledge Base, available online at https://mariadb.com/kb/
diff --git a/man/mbstream.1 b/man/mbstream.1
new file mode 100644
index 00000000000..2aba73fe400
--- /dev/null
+++ b/man/mbstream.1
@@ -0,0 +1,16 @@
+'\" t
+.\"
+.TH "\FBMBSTREAM\FR" "1" "9 August 2018" "MariaDB 10\&.1" "MariaDB Database System"
+.\" -----------------------------------------------------------------
+.\" * set default formatting
+.\" -----------------------------------------------------------------
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+.SH NAME
+mbstream \- Serialize/deserialize files in the XBSTREAM format
+.SH DESCRIPTION
+Use \fBmbstream \-\-help\fR for details on usage\.
+.PP
+For more information, please refer to the MariaDB Knowledge Base, available online at https://mariadb.com/kb/
diff --git a/man/mysql_embedded.1 b/man/mysql_embedded.1
new file mode 100644
index 00000000000..735c4e05ae0
--- /dev/null
+++ b/man/mysql_embedded.1
@@ -0,0 +1 @@
+.so man1/mysql.1
diff --git a/man/mysql_ldb.1 b/man/mysql_ldb.1
new file mode 100644
index 00000000000..5c08a1a14df
--- /dev/null
+++ b/man/mysql_ldb.1
@@ -0,0 +1,16 @@
+'\" t
+.\"
+.TH "\FBMYSQL_LDB\FR" "1" "9 August 2018" "MariaDB 10\&.2" "MariaDB Database System"
+.\" -----------------------------------------------------------------
+.\" * set default formatting
+.\" -----------------------------------------------------------------
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+.SH NAME
+mysql_ldb \- RocksDB tool
+.SH DESCRIPTION
+Use \fBmysql_ldb \-\-help\fR for details on usage\.
+.PP
+For more information, please refer to the MariaDB Knowledge Base, available online at https://mariadb.com/kb/
diff --git a/man/tokuft_logdump.1 b/man/tokuft_logprint.1
similarity index 84%
rename from man/tokuft_logdump.1
rename to man/tokuft_logprint.1
index f6cf08080f7..bc3b85a55d6 100644
--- a/man/tokuft_logdump.1
+++ b/man/tokuft_logprint.1
@@ -11,6 +11,6 @@
 .SH NAME
 tokuft_logprint \- Dump the log from stdin to stdout
 .SH DESCRIPTION
-Use: Dump the log from stdin to stdout\.
+Use: Dump the log from stdin to stdout\. Use \fBtokuft_logprint \-\-help\fR for details on usage\.
 .PP
 For more information, please refer to the MariaDB Knowledge Base, available online at https://mariadb.com/kb/
diff --git a/man/wsrep_sst_mariabackup.1 b/man/wsrep_sst_mariabackup.1
new file mode 100644
index 00000000000..34ae4b6f82e
--- /dev/null
+++ b/man/wsrep_sst_mariabackup.1
@@ -0,0 +1,16 @@
+'\" t
+.\"
+.TH "\FBWSREP_SST_MARIABACKUP\FR" "1" "8 August 2018" "MariaDB 10\&.1" "MariaDB Database System"
+.\" -----------------------------------------------------------------
+.\" * set default formatting
+.\" -----------------------------------------------------------------
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+.SH NAME
+wsrep_sst_mariabackup \- mariabackup\-based state snapshot transfer
+.SH DESCRIPTION
+Use: mariabackup-based state snapshot transfer\.
+.PP
+For more information, please refer to the MariaDB Knowledge Base, available online at https://mariadb.com/kb/
diff --git a/man/wsrep_sst_rsync.1 b/man/wsrep_sst_rsync.1
index 30fcfbfba19..bbaeb64016e 100644
--- a/man/wsrep_sst_rsync.1
+++ b/man/wsrep_sst_rsync.1
@@ -1,6 +1,6 @@
 '\" t
 .\"
-.TH "\FBWSREP_SST_RSYNC\FR" "1" "9 May 2017" "MariaDB 10\&.3" "MariaDB Database System"
+.TH "\FBWSREP_SST_RSYNC\FR" "1" "9 August 2018" "MariaDB 10\&.3" "MariaDB Database System"
 .\" -----------------------------------------------------------------
 .\" * set default formatting
 .\" -----------------------------------------------------------------
@@ -9,7 +9,7 @@
 .\" disable justification (adjust text to left margin only)
 .ad l
 .SH NAME
-wsrep_sst_mysqldump \- rsync-based state snapshot transfer
+wsrep_sst_rsync \- rsync-based state snapshot transfer
 .SH DESCRIPTION
 Use: rsync-based state snapshot transfer\.
 .PP
diff --git a/man/wsrep_sst_rsync_wan.1 b/man/wsrep_sst_rsync_wan.1
new file mode 100644
index 00000000000..6d465d554a8
--- /dev/null
+++ b/man/wsrep_sst_rsync_wan.1
@@ -0,0 +1,16 @@
+'\" t
+.\"
+.TH "\FBWSREP_SST_RSYNC_WAN\FR" "1" "9 August 2018" "MariaDB 10\&.1" "MariaDB Database System"
+.\" -----------------------------------------------------------------
+.\" * set default formatting
+.\" -----------------------------------------------------------------
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+.SH NAME
+wsrep_sst_rsync_wan \- rsync_wan (rsync with delta transfers)\-based state snapshot transfer
+.SH DESCRIPTION
+Use: rsync_wan\-based state snapshot transfer\.
+.PP
+For more information, please refer to the MariaDB Knowledge Base, available online at https://mariadb.com/kb/
diff --git a/mysql-test/include/search_pattern_in_file.inc b/mysql-test/include/search_pattern_in_file.inc
index 21192b55efb..6bead628fb0 100644
--- a/mysql-test/include/search_pattern_in_file.inc
+++ b/mysql-test/include/search_pattern_in_file.inc
@@ -18,6 +18,11 @@
 # Optionally, SEARCH_ABORT can be set to "FOUND" or "NOT FOUND" and this
 # will abort if the search result doesn't match the requested one.
 #
+# Optionally, SEARCH_OUTPUT can be set to control the format of output.
+# Supported formats:
+# - (default) : "FOUND n /pattern/ in FILE " or "NOT FOUND ..."
+# - "matches" : Each match is printed on a separate line
+#
 # In case of
 #   - SEARCH_FILE and/or SEARCH_PATTERN is not set
 #   - SEARCH_FILE cannot be opened
@@ -75,7 +80,14 @@ perl;
   my @matches=($content =~ m/$search_pattern/gs);
   my $res=@matches ? "FOUND " . scalar(@matches) : "NOT FOUND";
   $ENV{SEARCH_FILE} =~ s{^.*?([^/\\]+)$}{$1};
-  print "$res /$search_pattern/ in $ENV{SEARCH_FILE}\n";
+
+  if ($ENV{SEARCH_OUTPUT} eq "matches") {
+    foreach (@matches) {
+      print $_ . "\n";
+    }
+  } else {
+    print "$res /$search_pattern/ in $ENV{SEARCH_FILE}\n";
+  }
   die "$ENV{SEARCH_ABORT}\n"
-    if $ENV{SEARCH_ABORT} && $res =~ /^$ENV{SEARCH_ABORT}/;
+      if $ENV{SEARCH_ABORT} && $res =~ /^$ENV{SEARCH_ABORT}/;
 EOF
diff --git a/mysql-test/suite/galera/disabled.def b/mysql-test/suite/galera/disabled.def
index 697fba5f9b3..75654866bfc 100644
--- a/mysql-test/suite/galera/disabled.def
+++ b/mysql-test/suite/galera/disabled.def
@@ -40,7 +40,6 @@ galera_sst_mysqldump_with_key : MDEV-16890 Galera test failure
 galera_sst_xtrabackup-v2-options : Failed to read uuid:seqno and wsrep_gtid_domain_id from joiner script
 MW-328C : Timeouts
 galera_gcs_fc_limit : Timeouts
-galera_binlog_stmt_autoinc : Unstable results
 pool_of_threads: WSREP has not yet prepared node for application use
-partition: Not stable test: Different auto_increment numbers
 galera_var_innodb_disallow_writes : Timeout
+galera.galera_kill_ddl : MDEV-17108 Test failure on galera.galera_kill_ddl
diff --git a/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result b/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result
deleted file mode 100644
index 78b40228eb0..00000000000
--- a/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result
+++ /dev/null
@@ -1,161 +0,0 @@
-connection node_1;
-connection node_2;
-connection node_2;
-SET GLOBAL wsrep_forced_binlog_format='STATEMENT';
-connection node_1;
-SET GLOBAL wsrep_forced_binlog_format='STATEMENT';
-CREATE TABLE t1 (
-i int(11) NOT NULL AUTO_INCREMENT,
-c char(32) DEFAULT 'dummy_text',
-PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-insert into t1(i) values(null);
-select * from t1;
-i c
-1 dummy_text
-insert into t1(i) values(null), (null), (null);
-select * from t1;
-i c
-1 dummy_text
-3 dummy_text
-5 dummy_text
-7 dummy_text
-connection node_2;
-select * from t1;
-i c
-1 dummy_text
-3 dummy_text
-5 dummy_text
-7 dummy_text
-SET GLOBAL wsrep_forced_binlog_format='none';
-connection node_1;
-SET GLOBAL wsrep_forced_binlog_format='none';
-drop table t1;
-SET SESSION binlog_format='STATEMENT';
-show variables like 'binlog_format';
-Variable_name Value
-binlog_format STATEMENT
-SET GLOBAL wsrep_auto_increment_control='OFF';
-SET SESSION auto_increment_increment = 3;
-SET SESSION auto_increment_offset = 1;
-CREATE TABLE t1 (
-i int(11) NOT NULL AUTO_INCREMENT,
-c char(32) DEFAULT 'dummy_text',
-PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-insert into t1(i) values(null);
-select * from t1;
-i c
-1 dummy_text
-insert into t1(i) values(null), (null), (null);
-select * from t1;
-i c
-1 dummy_text
-4 dummy_text
-7 dummy_text
-10 dummy_text
-connection node_2;
-select * from t1;
-i c
-1 dummy_text
-4 dummy_text
-7 dummy_text
-10 dummy_text
-connection node_1;
-SET GLOBAL wsrep_auto_increment_control='ON';
-SET SESSION binlog_format='ROW';
-show variables like 'binlog_format';
-Variable_name Value
-binlog_format ROW
-show variables like '%auto_increment%';
-Variable_name Value
-auto_increment_increment 2
-auto_increment_offset 1
-wsrep_auto_increment_control ON
-SET GLOBAL wsrep_auto_increment_control='OFF';
-show variables like '%auto_increment%';
-Variable_name Value
-auto_increment_increment 3
-auto_increment_offset 1
-wsrep_auto_increment_control OFF
-SET GLOBAL wsrep_auto_increment_control='ON';
-drop table t1;
-connection node_2;
-SET GLOBAL wsrep_forced_binlog_format='ROW';
-connection node_1;
-SET GLOBAL wsrep_forced_binlog_format='ROW';
-CREATE TABLE t1 (
-i int(11) NOT NULL AUTO_INCREMENT,
-c char(32) DEFAULT 'dummy_text',
-PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-insert into t1(i) values(null);
-select * from t1;
-i c
-1 dummy_text
-insert into t1(i) values(null), (null), (null);
-select * from t1;
-i c
-1 dummy_text
-3 dummy_text
-5 dummy_text
-7 dummy_text
-connection node_2;
-select * from t1;
-i c
-1 dummy_text
-3 dummy_text
-5 dummy_text
-7 dummy_text
-SET GLOBAL wsrep_forced_binlog_format='none';
-connection node_1;
-SET GLOBAL wsrep_forced_binlog_format='none';
-drop table t1;
-SET SESSION binlog_format='ROW';
-show variables like 'binlog_format';
-Variable_name Value
-binlog_format ROW
-SET GLOBAL wsrep_auto_increment_control='OFF';
-SET SESSION auto_increment_increment = 3;
-SET SESSION auto_increment_offset = 1;
-CREATE TABLE t1 (
-i int(11) NOT NULL AUTO_INCREMENT,
-c char(32) DEFAULT 'dummy_text',
-PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-insert into t1(i) values(null);
-select * from t1;
-i c
-1 dummy_text
-insert into t1(i) values(null), (null), (null);
-select * from t1;
-i c
-1 dummy_text
-4 dummy_text
-7 dummy_text
-10 dummy_text
-connection node_2;
-select * from t1;
-i c
-1 dummy_text
-4 dummy_text
-7 dummy_text
-10 dummy_text
-connection node_1;
-SET GLOBAL wsrep_auto_increment_control='ON';
-show variables like 'binlog_format';
-Variable_name Value
-binlog_format ROW
-show variables like '%auto_increment%';
-Variable_name Value
-auto_increment_increment 2
-auto_increment_offset 1
-wsrep_auto_increment_control ON
-SET GLOBAL wsrep_auto_increment_control='OFF';
-show variables like '%auto_increment%';
-Variable_name Value
-auto_increment_increment 3
-auto_increment_offset 1
-wsrep_auto_increment_control OFF
-SET GLOBAL wsrep_auto_increment_control='ON';
-drop table t1;
diff --git a/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result b/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result
new file mode 100644
index 00000000000..d5c6a11f61f
--- /dev/null
+++ b/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result
@@ -0,0 +1,396 @@
+connection node_1;
+connection node_2;
+Performing State Transfer on a server that has been shut down cleanly and restarted
+connection node_1;
+CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+COMMIT;
+connection node_2;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+COMMIT;
+Shutting down server ...
+connection node_1;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+COMMIT;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+connect node_1a_galera_st_shutdown_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+connection node_2;
+Starting server ...
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+COMMIT;
+connection node_1;
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+COMMIT;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+COMMIT;
+connection node_1a_galera_st_shutdown_slave;
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+ROLLBACK;
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+COMMIT;
+SET AUTOCOMMIT=ON;
+connection node_1;
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+DROP TABLE t1;
+COMMIT;
+SET AUTOCOMMIT=ON;
+Performing State Transfer on a server that starts from a clean var directory
+This is accomplished by shutting down node #2 and removing its var directory before restarting it
+connection node_1;
+CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+COMMIT;
+connection node_2;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+COMMIT;
+Shutting down server ...
+connection node_1;
+Cleaning var directory ...
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+COMMIT;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+connect node_1a_galera_st_clean_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+connection node_2;
+Starting server ...
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+COMMIT;
+connection node_1;
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+COMMIT;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+COMMIT;
+connection node_1a_galera_st_clean_slave;
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+ROLLBACK;
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+COMMIT;
+SET AUTOCOMMIT=ON;
+connection node_1;
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+DROP TABLE t1;
+COMMIT;
+SET AUTOCOMMIT=ON;
+Performing State Transfer on a server that has been killed and restarted
+connection node_1;
+CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+COMMIT;
+connection node_2;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+COMMIT;
+Killing server ...
+connection node_1;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+INSERT INTO t1 VALUES ('node1_committed_during');
+COMMIT;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+connect node_1a_galera_st_kill_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+connection node_2;
+Performing --wsrep-recover ...
+Starting server ...
+Using --wsrep-start-position when starting mysqld ...
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+INSERT INTO t1 VALUES ('node2_committed_after');
+COMMIT;
+connection node_1;
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 VALUES ('node1_to_be_committed_after');
+COMMIT;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+INSERT INTO t1 VALUES ('node1_committed_after');
+COMMIT;
+connection node_1a_galera_st_kill_slave;
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after');
+ROLLBACK;
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+COMMIT;
+SET AUTOCOMMIT=ON;
+connection node_1;
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+DROP TABLE t1;
+COMMIT;
+SET AUTOCOMMIT=ON;
+Performing State Transfer on a server that has been killed and restarted
+while a DDL was in progress on it
+connection node_1;
+CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+INSERT INTO t1 VALUES ('node1_committed_before');
+connection node_2;
+START TRANSACTION;
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+INSERT INTO t1 VALUES ('node2_committed_before');
+COMMIT;
+SET GLOBAL debug_dbug = 'd,sync.alter_opened_table';
+connection node_1;
+ALTER TABLE t1 ADD COLUMN f2 INTEGER;
+connection node_2;
+SET wsrep_sync_wait = 0;
+Killing server ...
+connection node_1;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 (f1) VALUES ('node1_committed_during');
+INSERT INTO t1 (f1) VALUES ('node1_committed_during');
+INSERT INTO t1 (f1) VALUES ('node1_committed_during');
+INSERT INTO t1 (f1) VALUES ('node1_committed_during');
+INSERT INTO t1 (f1) VALUES ('node1_committed_during');
+COMMIT;
+START TRANSACTION;
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+connect node_1a_galera_st_kill_slave_ddl, 127.0.0.1, root, , test, $NODE_MYPORT_1;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+connection node_2;
+Performing --wsrep-recover ...
+connection node_2;
+Starting server ...
+Using --wsrep-start-position when starting mysqld ...
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 (f1) VALUES ('node2_committed_after');
+INSERT INTO t1 (f1) VALUES ('node2_committed_after');
+INSERT INTO t1 (f1) VALUES ('node2_committed_after');
+INSERT INTO t1 (f1) VALUES ('node2_committed_after');
+INSERT INTO t1 (f1) VALUES ('node2_committed_after');
+COMMIT;
+connection node_1;
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_committed_after');
+COMMIT;
+SET AUTOCOMMIT=OFF;
+START TRANSACTION;
+INSERT INTO t1 (f1) VALUES ('node1_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_committed_after');
+INSERT INTO t1 (f1) VALUES ('node1_committed_after');
+COMMIT;
+connection node_1a_galera_st_kill_slave_ddl;
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+INSERT INTO t1 (f1) VALUES ('node1_to_be_rollbacked_after');
+ROLLBACK;
+SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1';
+COUNT(*) = 2
+1
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+COMMIT;
+SET AUTOCOMMIT=ON;
+connection node_1;
+SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1';
+COUNT(*) = 2
+1
+SELECT COUNT(*) = 35 FROM t1;
+COUNT(*) = 35
+1
+SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1;
+COUNT(*) = 0
+1
+DROP TABLE t1;
+COMMIT;
+SET AUTOCOMMIT=ON;
+SET GLOBAL debug_dbug = $debug_orig;
diff --git a/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test b/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test
deleted file mode 100644
index aab4fea9f2e..00000000000
--- a/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test
+++ /dev/null
@@ -1,223 +0,0 @@
-##
-## Tests the auto-increment with binlog in STATEMENT mode.
-##
-
---source include/galera_cluster.inc
---source include/have_innodb.inc
-
---let $node_1=node_1
---let $node_2=node_2
---source include/auto_increment_offset_save.inc
-
-##
-## Verify the correct operation of the auto-increment when the binlog
-## format artificially set to the 'STATEMENT' (although this mode is
-## not recommended in the current version):
-##
-
---connection node_2
-SET GLOBAL wsrep_forced_binlog_format='STATEMENT';
-
---connection node_1
-SET GLOBAL wsrep_forced_binlog_format='STATEMENT';
-
-CREATE TABLE t1 (
-  i int(11) NOT NULL AUTO_INCREMENT,
-  c char(32) DEFAULT 'dummy_text',
-  PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-
-insert into t1(i) values(null);
-
-select * from t1;
-
-insert into t1(i) values(null), (null), (null);
-
-select * from t1;
-
---connection node_2
-
-select * from t1;
-
-SET GLOBAL wsrep_forced_binlog_format='none';
-
---connection node_1
-
-SET GLOBAL wsrep_forced_binlog_format='none';
-
-drop table t1;
-
-##
-## Check the operation when the automatic control over the auto-increment
-## settings is switched off, that is, when we use the increment step and
-## the offset specified by the user. In the current session, the binlog
-## format is set to 'STATEMENT'. It is important that the values of the
-## auto-increment options does not changed on other node - it allows us
-## to check the correct transmission of the auto-increment options to
-## other nodes:
-##
-
---disable_warnings
-SET SESSION binlog_format='STATEMENT';
---enable_warnings
-
-show variables like 'binlog_format';
-
-SET GLOBAL wsrep_auto_increment_control='OFF';
-
-SET SESSION auto_increment_increment = 3;
-SET SESSION auto_increment_offset = 1;
-
-CREATE TABLE t1 (
-  i int(11) NOT NULL AUTO_INCREMENT,
-  c char(32) DEFAULT 'dummy_text',
-  PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-
-insert into t1(i) values(null);
-
-select * from t1;
-
-insert into t1(i) values(null), (null), (null);
-
-select * from t1;
-
---connection node_2
-
-select * from t1;
-
---connection node_1
-
-##
-## Verify the return to automatic calculation of the step
-## and offset of the auto-increment:
-##
-
-SET GLOBAL wsrep_auto_increment_control='ON';
-
-SET SESSION binlog_format='ROW';
-
-show variables like 'binlog_format';
-show variables like '%auto_increment%';
-
-##
-## Verify the recovery of original user-defined values after
-## stopping the automatic control over auto-increment:
-##
-
-SET GLOBAL wsrep_auto_increment_control='OFF';
-
-show variables like '%auto_increment%';
-
-##
-## Restore original options and drop test table:
-##
-
-SET GLOBAL wsrep_auto_increment_control='ON';
-
-drop table t1;
-
-##
-## Verify the correct operation of the auto-increment when the binlog
-## format set to the 'ROW':
-##
-
---connection node_2
-SET GLOBAL wsrep_forced_binlog_format='ROW';
-
---connection node_1
-SET GLOBAL wsrep_forced_binlog_format='ROW';
-
-CREATE TABLE t1 (
-  i int(11) NOT NULL AUTO_INCREMENT,
-  c char(32) DEFAULT 'dummy_text',
-  PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-
-insert into t1(i) values(null);
-
-select * from t1;
-
-insert into t1(i) values(null), (null), (null);
-
-select * from t1;
-
---connection node_2
-
-select * from t1;
-
-SET GLOBAL wsrep_forced_binlog_format='none';
-
---connection node_1
-
-SET GLOBAL wsrep_forced_binlog_format='none';
-
-drop table t1;
-
-##
-## Check the operation when the automatic control over the auto-increment
-## settings is switched off, that is, when we use the increment step and
-## the offset specified by the user. In the current session, the binlog
-## format is set to 'ROW'. It is important that the values of the
-## auto-increment options does not changed on other node - it allows us
-## to check the correct transmission of the auto-increment options to
-## other nodes:
-##
-
-SET SESSION binlog_format='ROW';
-
-show variables like 'binlog_format';
-
-SET GLOBAL wsrep_auto_increment_control='OFF';
-
-SET SESSION auto_increment_increment = 3;
-SET SESSION auto_increment_offset = 1;
-
-CREATE TABLE t1 (
-  i int(11) NOT NULL AUTO_INCREMENT,
-  c char(32) DEFAULT 'dummy_text',
-  PRIMARY KEY (i)
-) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-
-insert into t1(i) values(null);
-
-select * from t1;
-
-insert into t1(i) values(null), (null), (null);
-
-select * from t1;
-
---connection node_2
-
-select * from t1;
-
---connection node_1
-
-##
-## Verify the return to automatic calculation of the step
-## and offset of the auto-increment:
-##
-
-SET GLOBAL wsrep_auto_increment_control='ON';
-
-show variables like 'binlog_format';
-show variables like '%auto_increment%';
-
-##
-## Verify the recovery of original user-defined values after
-## stopping the automatic control over auto-increment:
-##
-
-SET GLOBAL wsrep_auto_increment_control='OFF';
-
-show variables like '%auto_increment%';
-
-##
-## Restore original options and drop test table:
-##
-
-SET GLOBAL wsrep_auto_increment_control='ON';
-
-drop table t1;
-
---source include/auto_increment_offset_restore.inc
diff --git a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf
new file mode 100644
index 00000000000..afe9796a11a
--- /dev/null
+++ b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf
@@ -0,0 +1,11 @@
+!include ../galera_2nodes.cnf
+
+[mysqld]
+wsrep_sst_method=rsync
+
+[mysqld.1]
+wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true'
+
+[mysqld.2]
+innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/rsync_test_2
+wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true'
diff --git a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test
new file mode 100644
index 00000000000..68aa1068f75
--- /dev/null
+++ b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test
@@ -0,0 +1,16 @@
+--source include/big_test.inc
+--source include/galera_cluster.inc
+
+--let $node_1=node_1
+--let $node_2=node_2
+--source include/auto_increment_offset_save.inc
+
+--source suite/galera/include/galera_st_shutdown_slave.inc
+--source suite/galera/include/galera_st_clean_slave.inc
+
+--source suite/galera/include/galera_st_kill_slave.inc
+--source suite/galera/include/galera_st_kill_slave_ddl.inc
+--source include/auto_increment_offset_restore.inc
+
+# cleanup temporary database files:
+--remove_files_wildcard $MYSQL_TMP_DIR/rsync_test_2 *
diff --git a/mysql-test/suite/maria/concurrent.result b/mysql-test/suite/maria/concurrent.result
new file mode 100644
index 00000000000..caea6fd1fb9
--- /dev/null
+++ b/mysql-test/suite/maria/concurrent.result
@@ -0,0 +1,33 @@
+CREATE TABLE t1 (a INT, b CHAR(12), c INT, FULLTEXT KEY(b), KEY (c)) ENGINE=Aria;
+CREATE TABLE t2 (a INT, b CHAR(12), c INT) ENGINE=Aria;
+INSERT INTO t2 VALUES (1,'foo',8), (2,'bar',9);
+connect con1,localhost,root,,test;
+INSERT INTO t1 SELECT * FROM t2;
+connection default;
+select 1;
+1
+1
+select 1;
+1
+1
+select 1;
+1
+1
+select 1;
+1
+1
+select 1;
+1
+1
+select 1;
+1
+1
+select 1;
+1
+1
+SELECT * FROM t1 WHERE a = ( SELECT 1 FROM non_existing_table2 );
+ERROR 42S02: Table 'test.non_existing_table2' doesn't exist
+connection con1;
+disconnect con1;
+connection default;
+DROP TABLE t1, t2;
diff --git a/mysql-test/suite/maria/concurrent.test b/mysql-test/suite/maria/concurrent.test
new file mode 100644
index 00000000000..42adb082d40
--- /dev/null
+++ b/mysql-test/suite/maria/concurrent.test
@@ -0,0 +1,28 @@
+#
+# MDEV-15797 Assertion `thd->killed != 0' failed in ha_maria::enable_indexes
+#
+
+CREATE TABLE t1 (a INT, b CHAR(12), c INT, FULLTEXT KEY(b), KEY (c)) ENGINE=Aria;
+CREATE TABLE t2 (a INT, b CHAR(12), c INT) ENGINE=Aria;
+INSERT INTO t2 VALUES (1,'foo',8), (2,'bar',9);
+
+--connect (con1,localhost,root,,test)
+--send
+    INSERT INTO t1 SELECT * FROM t2;
+--connection default
+select 1;
+select 1;
+select 1;
+select 1;
+select 1;
+select 1;
+select 1;
+--error ER_NO_SUCH_TABLE
+SELECT * FROM t1 WHERE a = ( SELECT 1 FROM non_existing_table2 );
+--connection con1
+--reap
+
+# Cleanup
+--disconnect con1
+--connection default
+DROP TABLE t1, t2;
diff --git a/mysql-test/suite/mariabackup/skip_innodb.opt b/mysql-test/suite/mariabackup/skip_innodb.opt
new file mode 100644
index 00000000000..213331f5575
--- /dev/null
+++ b/mysql-test/suite/mariabackup/skip_innodb.opt
@@ -0,0 +1 @@
+--loose-skip-innodb
\ No newline at end of file
diff --git a/mysql-test/suite/mariabackup/skip_innodb.result b/mysql-test/suite/mariabackup/skip_innodb.result
new file mode 100644
index 00000000000..0d56b55bf4b
--- /dev/null
+++ b/mysql-test/suite/mariabackup/skip_innodb.result
@@ -0,0 +1,10 @@
+CREATE TABLE t(i int);
+INSERT INTO t VALUES(1);
+# shutdown server
+# remove datadir
+# xtrabackup move back
+# restart server
+SELECT * from t;
+i
+1
+DROP TABLE t;
diff --git a/mysql-test/suite/mariabackup/skip_innodb.test b/mysql-test/suite/mariabackup/skip_innodb.test
new file mode 100644
index 00000000000..14e4bc007f6
--- /dev/null
+++ b/mysql-test/suite/mariabackup/skip_innodb.test
@@ -0,0 +1,12 @@
+let $targetdir=$MYSQLTEST_VARDIR/tmp/backup;
+CREATE TABLE t(i int);
+INSERT INTO t VALUES(1);
+--disable_result_log
+exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir;
+exec $XTRABACKUP --prepare --target-dir=$targetdir;
+-- source include/restart_and_restore.inc
+--enable_result_log
+SELECT * from t;
+DROP TABLE t;
+
+rmdir $targetdir;
\ No newline at end of file
diff --git a/mysql-test/suite/plugins/r/auth_ed25519.result b/mysql-test/suite/plugins/r/auth_ed25519.result
index ee9320bbc6c..4785bef3ef7 100644
--- a/mysql-test/suite/plugins/r/auth_ed25519.result
+++ b/mysql-test/suite/plugins/r/auth_ed25519.result
@@ -33,7 +33,7 @@ PLUGIN_DESCRIPTION Elliptic curve ED25519 based authentication
 PLUGIN_LICENSE GPL
 LOAD_OPTION ON
 PLUGIN_MATURITY Stable
-PLUGIN_AUTH_VERSION 1.0-alpha
+PLUGIN_AUTH_VERSION 1.0
 create user test1@localhost identified via ed25519 using 'ZIgUREUg5PVgQ6LskhXmO+eZLS0nC8be6HPjYWR4YJY';
 show grants for test1@localhost;
 Grants for test1@localhost
diff --git a/plugin/auth_ed25519/server_ed25519.c b/plugin/auth_ed25519/server_ed25519.c
index 8870c271b18..23b4e7389c7 100644
--- a/plugin/auth_ed25519/server_ed25519.c
+++ b/plugin/auth_ed25519/server_ed25519.c
@@ -100,7 +100,7 @@ maria_declare_plugin(ed25519)
   0x0100,
   NULL,
   NULL,
-  "1.0-alpha",
+  "1.0",
   MariaDB_PLUGIN_MATURITY_STABLE
 }
 maria_declare_plugin_end;
diff --git a/plugin/auth_pam/mapper/pam_user_map.c b/plugin/auth_pam/mapper/pam_user_map.c
index c03ea12be74..e1d11acabb9 100644
--- a/plugin/auth_pam/mapper/pam_user_map.c
+++ b/plugin/auth_pam/mapper/pam_user_map.c
@@ -189,13 +189,15 @@ int pam_sm_authenticate(pam_handle_t *pamh, int flags,
       s++;
     }
     from= s;
-    skip(isalnum(*s) || (*s == '_') || (*s == '.') || (*s == '-') || (*s == '$'));
+    skip(isalnum(*s) || (*s == '_') || (*s == '.') || (*s == '-') ||
+         (*s == '$') || (*s == '\\') || (*s == '/'));
     end_from= s;
     skip(isspace(*s));
    if (end_from == from || *s++ != ':') goto syntax_error;
     skip(isspace(*s));
     to= s;
-    skip(isalnum(*s) || (*s == '_') || (*s == '.') || (*s == '-') || (*s == '$'));
+    skip(isalnum(*s) || (*s == '_') || (*s == '.') || (*s == '-') ||
+         (*s == '$'));
     end_to= s;
     if (end_to == to) goto syntax_error;
diff --git a/scripts/galera_recovery.sh b/scripts/galera_recovery.sh
index 09de6721762..c58f3d8f6b9 100644
--- a/scripts/galera_recovery.sh
+++ b/scripts/galera_recovery.sh
@@ -107,8 +107,7 @@ else
     log "WSREP: mktemp failed"
 fi
 
-parse_arguments `$print_defaults $cmdline_args --loose-verbose \
-    mariadb mariadb_safe mysqld mysqld_safe safe_mysqld galera`
+parse_arguments `$print_defaults $cmdline_args --loose-verbose --mysqld`
 
 # Perform wsrep position recovery if wsrep_on=1, skip otherwise.
 if [ "$wsrep_on" -eq 1 ]; then
diff --git a/scripts/wsrep_sst_common.sh b/scripts/wsrep_sst_common.sh
index 21da52b015e..313821f522d 100755
--- a/scripts/wsrep_sst_common.sh
+++ b/scripts/wsrep_sst_common.sh
@@ -27,6 +27,7 @@ WSREP_SST_OPT_PSWD=${WSREP_SST_OPT_PSWD:-}
 WSREP_SST_OPT_DEFAULT=""
 WSREP_SST_OPT_EXTRA_DEFAULT=""
 WSREP_SST_OPT_SUFFIX_DEFAULT=""
+WSREP_SST_OPT_SUFFIX_VALUE=""
 
 while [ $# -gt 0 ]; do
 case "$1" in
@@ -76,6 +77,7 @@ case "$1" in
         ;;
     '--defaults-group-suffix')
         readonly WSREP_SST_OPT_SUFFIX_DEFAULT="$1=$2"
+        readonly WSREP_SST_OPT_SUFFIX_VALUE="$2"
         shift
         ;;
     '--host')
@@ -272,8 +274,8 @@ parse_cnf()
     reval=$($MY_PRINT_DEFAULTS "${group}" | awk -v var="${var}" 'BEGIN { OFS=FS="=" } { gsub(/_/,"-",$1); if ( $1=="--"var) lastval=substr($0,length($1)+2) } END { print lastval}')
 
     # use default if we haven't found a value
-    if [ -z $reval ]; then
-        [ -n $3 ] && reval=$3
+    if [ -z "$reval" ]; then
+        [ -n "$3" ] && reval=$3
     fi
     echo $reval
}
diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh
index b5cb3b073b2..45b9a753938 100644
--- a/scripts/wsrep_sst_rsync.sh
+++ b/scripts/wsrep_sst_rsync.sh
@@ -155,6 +155,20 @@ else
     WSREP_LOG_DIR=$(cd $WSREP_SST_OPT_DATA; pwd -P)
 fi
 
+INNODB_DATA_HOME_DIR=${INNODB_DATA_HOME_DIR:-""}
+# if INNODB_DATA_HOME_DIR env. variable is not set, try to get it from my.cnf
+if [ -z "$INNODB_DATA_HOME_DIR" ]; then
+    INNODB_DATA_HOME_DIR=$(parse_cnf mysqld$WSREP_SST_OPT_SUFFIX_VALUE innodb-data-home-dir '')
+fi
+
+if [ -n "$INNODB_DATA_HOME_DIR" ]; then
+    # handle both relative and absolute paths
+    INNODB_DATA_HOME_DIR=$(cd $WSREP_SST_OPT_DATA; mkdir -p "$INNODB_DATA_HOME_DIR"; cd $INNODB_DATA_HOME_DIR; pwd -P)
+else
+    # default to datadir
+    INNODB_DATA_HOME_DIR=$(cd $WSREP_SST_OPT_DATA; pwd -P)
+fi
+
 # Old filter - include everything except selected
 # FILTER=(--exclude '*.err' --exclude '*.pid' --exclude '*.sock' \
 #         --exclude '*.conf' --exclude core --exclude 'galera.*' \
@@ -163,7 +177,7 @@ fi
 # New filter - exclude everything except dirs (schemas) and innodb files
 FILTER="-f '- /lost+found' -f '- /.fseventsd' -f '- /.Trashes'
-        -f '+ /wsrep_sst_binlog.tar' -f '+ /ib_lru_dump' -f '+ /ibdata*' -f '+ /*/' -f '- /*'"
+        -f '+ /wsrep_sst_binlog.tar' -f '- $INNODB_DATA_HOME_DIR/ib_lru_dump' -f '- $INNODB_DATA_HOME_DIR/ibdata*' -f '+ /*/' -f '- /*'"
 
 SSTKEY=$(parse_cnf sst tkey "")
 SSTCERT=$(parse_cnf sst tcert "")
@@ -271,6 +285,19 @@
 EOF
         exit $RC
     fi
 
+    # Transfer InnoDB data files
+    rsync ${STUNNEL:+--rsh="$STUNNEL"} \
+          --owner --group --perms --links --specials \
+          --ignore-times --inplace --dirs --delete --quiet \
+          $WHOLE_FILE_OPT -f '+ /ibdata*' -f '+ /ib_lru_dump' \
+          -f '- **' "$INNODB_DATA_HOME_DIR/" \
+          rsync://$WSREP_SST_OPT_ADDR-data_dir >&2 || RC=$?
+
+    if [ $RC -ne 0 ]; then
+        wsrep_log_error "rsync innodb_data_home_dir returned code $RC:"
+        exit 255 # unknown error
+    fi
+
     # second, we transfer InnoDB log files
     rsync ${STUNNEL:+--rsh="$STUNNEL"} \
           --owner --group --perms --links --specials \
@@ -371,6 +398,8 @@ $SILENT
     path = $WSREP_SST_OPT_DATA
 [$MODULE-log_dir]
     path = $WSREP_LOG_DIR
+[$MODULE-data_dir]
+    path = $INNODB_DATA_HOME_DIR
 EOF
 
 # rm -rf "$DATA"/ib_logfile* # we don't want old logs around
diff --git a/sql/field.h b/sql/field.h
index 0046bc11be2..2037802df9a 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -1494,17 +1494,6 @@ public:
   /* Hash value */
   virtual void hash(ulong *nr, ulong *nr2);
 
-  /**
-    Get the upper limit of the MySQL integral and floating-point type.
-
-    @return maximum allowed value for the field
-  */
-  virtual ulonglong get_max_int_value() const
-  {
-    DBUG_ASSERT(false);
-    return 0ULL;
-  }
-
   /**
     Checks whether a string field is part of write_set.
@@ -2126,11 +2115,6 @@ public:
     *to= *from;
     return from + 1;
   }
-
-  virtual ulonglong get_max_int_value() const
-  {
-    return unsigned_flag ? 0xFFULL : 0x7FULL;
-  }
 };
 
 
@@ -2175,10 +2159,6 @@ public:
   virtual const uchar *unpack(uchar* to, const uchar *from,
                               const uchar *from_end, uint param_data)
   { return unpack_int16(to, from, from_end); }
-  virtual ulonglong get_max_int_value() const
-  {
-    return unsigned_flag ? 0xFFFFULL : 0x7FFFULL;
-  }
 };
 
 class Field_medium :public Field_int
@@ -2214,10 +2194,6 @@ public:
   {
     return Field::pack(to, from, max_length);
   }
-  virtual ulonglong get_max_int_value() const
-  {
-    return unsigned_flag ? 0xFFFFFFULL : 0x7FFFFFULL;
-  }
 };
 
 
@@ -2267,10 +2243,6 @@ public:
   {
     return unpack_int32(to, from, from_end);
   }
-  virtual ulonglong get_max_int_value() const
-  {
-    return unsigned_flag ? 0xFFFFFFFFULL : 0x7FFFFFFFULL;
-  }
 };
 
 
@@ -2323,11 +2295,6 @@ public:
   {
     return unpack_int64(to, from, from_end);
  }
-  virtual ulonglong get_max_int_value() const
-  {
-    return unsigned_flag ? 0xFFFFFFFFFFFFFFFFULL : 0x7FFFFFFFFFFFFFFFULL;
-  }
-
   void set_max();
   bool is_max();
 };
@@ -2412,13 +2379,6 @@ public:
   uint32 pack_length() const { return sizeof(float); }
   uint row_pack_length() const { return pack_length(); }
   void sql_type(String &str) const;
-  virtual ulonglong get_max_int_value() const
-  {
-    /*
-      We use the maximum as per IEEE754-2008 standard, 2^24
-    */
-    return 0x1000000ULL;
-  }
 private:
   int save_field_metadata(uchar *first_byte);
 };
@@ -2472,13 +2432,6 @@ public:
   uint32 pack_length() const { return sizeof(double); }
   uint row_pack_length() const { return pack_length(); }
   void sql_type(String &str) const;
-  virtual ulonglong get_max_int_value() const
-  {
-    /*
-      We use the maximum as per IEEE754-2008 standard, 2^53
-    */
-    return 0x20000000000000ULL;
-  }
 private:
   int save_field_metadata(uchar *first_byte);
 };
diff --git a/sql/handler.cc b/sql/handler.cc
index ad9adc34a7e..306b0868d15 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -3047,15 +3047,9 @@ compute_next_insert_id(ulonglong nr,struct system_variables *variables)
     nr= nr + 1; // optimization of the formula below
   else
   {
-    /*
-      Calculating the number of complete auto_increment_increment extents:
-    */
     nr= (((nr+ variables->auto_increment_increment -
           variables->auto_increment_offset)) /
         (ulonglong) variables->auto_increment_increment);
-    /*
-      Adding an offset to the auto_increment_increment extent boundary:
-    */
     nr= (nr* (ulonglong) variables->auto_increment_increment +
         variables->auto_increment_offset);
  }
@@ -3111,14 +3105,8 @@ prev_insert_id(ulonglong nr, struct system_variables *variables)
  }
  if (variables->auto_increment_increment == 1)
    return nr; // optimization of the formula below
-  /*
-    Calculating the number of complete auto_increment_increment extents:
-  */
  nr= (((nr - variables->auto_increment_offset)) /
       (ulonglong) variables->auto_increment_increment);
-  /*
-    Adding an offset to the auto_increment_increment extent boundary:
-  */
  return (nr * (ulonglong) variables->auto_increment_increment +
          variables->auto_increment_offset);
}
@@ -3360,23 +3348,10 @@ int handler::update_auto_increment()
  if (unlikely(tmp)) // Out of range value in store
  {
    /*
-      first test if the query was aborted due to strict mode constraints
+      It's better to return an error here than to get a confusing
+      'duplicate key error' later.
    */
-    if (thd->killed == KILL_BAD_DATA ||
-        nr > table->next_number_field->get_max_int_value())
-      DBUG_RETURN(HA_ERR_AUTOINC_ERANGE);
-
-    /*
-      field refused this value (overflow) and truncated it, use the result of
-      the truncation (which is going to be inserted); however we try to
-      decrease it to honour auto_increment_* variables.
-      That will shift the left bound of the reserved interval, we don't
-      bother shifting the right bound (anyway any other value from this
-      interval will cause a duplicate key).
-    */
-    nr= prev_insert_id(table->next_number_field->val_int(), variables);
-    if (unlikely(table->next_number_field->store((longlong) nr, TRUE)))
-      nr= table->next_number_field->val_int();
+    result= HA_ERR_AUTOINC_ERANGE;
  }
  if (append)
  {
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index d2084a8d073..f2b00831d6f 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -4389,20 +4389,6 @@ static int init_common_variables()
  DBUG_PRINT("info",("%s  Ver %s for %s on %s\n",my_progname,
                     server_version, SYSTEM_TYPE,MACHINE_TYPE));
 
-#ifdef WITH_WSREP
-  /*
-    We need to initialize auxiliary variables, that will be
-    further keep the original values of auto-increment options
-    as they set by the user. These variables used to restore
-    user-defined values of the auto-increment options after
-    setting of the wsrep_auto_increment_control to 'OFF'.
-  */
-  global_system_variables.saved_auto_increment_increment=
-    global_system_variables.auto_increment_increment;
-  global_system_variables.saved_auto_increment_offset=
-    global_system_variables.auto_increment_offset;
-#endif /* WITH_WSREP */
-
 #ifdef HAVE_LINUX_LARGE_PAGES
  /* Initialize large page size */
  if (opt_large_pages)
diff --git a/sql/sql_class.h b/sql/sql_class.h
index e2aed01ef3b..00a5c41f708 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -581,17 +581,6 @@ typedef struct system_variables
  ha_rows max_join_size;
  ha_rows expensive_subquery_limit;
  ulong auto_increment_increment, auto_increment_offset;
-#ifdef WITH_WSREP
-  /*
-    Variables with stored values of the auto_increment_increment
-    and auto_increment_offset options that are will be needed when
-    wsrep_auto_increment_control will be set to 'OFF', because the
-    setting it to 'ON' leads to overwriting of the original values
-    (which are set by the user) by calculated values (which are
-    based on the cluster's size):
-  */
-  ulong saved_auto_increment_increment, saved_auto_increment_offset;
-#endif /* WITH_WSREP */
  uint eq_range_index_dive_limit;
  ulong column_compression_zlib_strategy;
  ulong lock_wait_timeout;
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index 4cf62d457df..6d4c135683a 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -349,56 +349,13 @@ static Sys_var_long Sys_pfs_connect_attrs_size(
 
 #endif /* WITH_PERFSCHEMA_STORAGE_ENGINE */
 
-#ifdef WITH_WSREP
-
-/*
-  We need to keep the original values set by the user, as they will
-  be lost if wsrep_auto_increment_control set to 'ON':
-*/
-static bool update_auto_increment_increment (sys_var *self, THD *thd, enum_var_type type)
-{
-  if (type == OPT_GLOBAL)
-    global_system_variables.saved_auto_increment_increment=
-      global_system_variables.auto_increment_increment;
-  else
-    thd->variables.saved_auto_increment_increment=
-      thd->variables.auto_increment_increment;
-  return false;
-}
-
-#endif /* WITH_WSREP */
-
 static Sys_var_ulong Sys_auto_increment_increment(
       "auto_increment_increment",
       "Auto-increment columns are incremented by this",
       SESSION_VAR(auto_increment_increment),
       CMD_LINE(OPT_ARG),
       VALID_RANGE(1, 65535), DEFAULT(1), BLOCK_SIZE(1),
-#ifdef WITH_WSREP
-       NO_MUTEX_GUARD, IN_BINLOG, ON_CHECK(0),
-       ON_UPDATE(update_auto_increment_increment));
-#else
       NO_MUTEX_GUARD, IN_BINLOG);
-#endif /* WITH_WSREP */
-
-#ifdef WITH_WSREP
-
-/*
-  We need to keep the original values set by the user, as they will
-  be lost if wsrep_auto_increment_control set to 'ON':
-*/
-static bool update_auto_increment_offset (sys_var *self, THD *thd, enum_var_type type)
-{
-  if (type == OPT_GLOBAL)
-    global_system_variables.saved_auto_increment_offset=
-      global_system_variables.auto_increment_offset;
-  else
-    thd->variables.saved_auto_increment_offset=
-      thd->variables.auto_increment_offset;
-  return false;
-}
-
-#endif /* WITH_WSREP */
 
 static Sys_var_ulong Sys_auto_increment_offset(
       "auto_increment_offset",
@@ -407,12 +364,7 @@ static Sys_var_ulong Sys_auto_increment_offset(
       SESSION_VAR(auto_increment_offset),
       CMD_LINE(OPT_ARG),
       VALID_RANGE(1, 65535), DEFAULT(1), BLOCK_SIZE(1),
-#ifdef WITH_WSREP
-       NO_MUTEX_GUARD, IN_BINLOG, ON_CHECK(0),
-       ON_UPDATE(update_auto_increment_offset));
-#else
       NO_MUTEX_GUARD, IN_BINLOG);
-#endif /* WITH_WSREP */
 
 static Sys_var_mybool Sys_automatic_sp_privileges(
       "automatic_sp_privileges",
@@ -5383,54 +5335,11 @@ static Sys_var_ulong Sys_wsrep_retry_autocommit(
       SESSION_VAR(wsrep_retry_autocommit), CMD_LINE(REQUIRED_ARG),
       VALID_RANGE(0, 10000), DEFAULT(1), BLOCK_SIZE(1));
 
-static bool update_wsrep_auto_increment_control (sys_var *self, THD *thd, enum_var_type type)
-{
-  if (wsrep_auto_increment_control)
-  {
-    /*
-      The variables that control auto increment shall be calculated
-      automaticaly based on the size of the cluster. This usually done
-      within the wsrep_view_handler_cb callback. However, if the user
-      manually sets the value of wsrep_auto_increment_control to 'ON',
-      then we should to re-calculate these variables again (because
-      these values may be required before wsrep_view_handler_cb will
-      be re-invoked, which is rarely invoked if the cluster stays in
-      the stable state):
-    */
-    global_system_variables.auto_increment_increment=
-      wsrep_cluster_size ? wsrep_cluster_size : 1;
-    global_system_variables.auto_increment_offset=
-      wsrep_local_index >= 0 ? wsrep_local_index + 1 : 1;
-    thd->variables.auto_increment_increment=
-      global_system_variables.auto_increment_increment;
-    thd->variables.auto_increment_offset=
-      global_system_variables.auto_increment_offset;
-  }
-  else
-  {
-    /*
-      We must restore the last values of the variables that
-      are explicitly specified by the user:
-    */
-    global_system_variables.auto_increment_increment=
-      global_system_variables.saved_auto_increment_increment;
-    global_system_variables.auto_increment_offset=
-      global_system_variables.saved_auto_increment_offset;
-    thd->variables.auto_increment_increment=
-      thd->variables.saved_auto_increment_increment;
-    thd->variables.auto_increment_offset=
-      thd->variables.saved_auto_increment_offset;
-  }
-  return false;
-}
-
 static Sys_var_mybool Sys_wsrep_auto_increment_control(
       "wsrep_auto_increment_control", "To automatically control the "
       "assignment of autoincrement variables",
       GLOBAL_VAR(wsrep_auto_increment_control),
-       CMD_LINE(OPT_ARG), DEFAULT(TRUE),
-       NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(0),
-       ON_UPDATE(update_wsrep_auto_increment_control));
+       CMD_LINE(OPT_ARG), DEFAULT(TRUE));
 
 static Sys_var_mybool Sys_wsrep_drupal_282555_workaround(
       "wsrep_drupal_282555_workaround", "Enable a workaround to handle the "
diff --git a/sql/wsrep_mysqld.h b/sql/wsrep_mysqld.h
index d7b490d879e..6aa8a68f222 100644
--- a/sql/wsrep_mysqld.h
+++ b/sql/wsrep_mysqld.h
@@ -159,10 +159,7 @@ extern "C" time_t wsrep_thd_query_start(THD *thd);
 extern "C" query_id_t wsrep_thd_query_id(THD *thd);
 extern "C" query_id_t wsrep_thd_wsrep_last_query_id(THD *thd);
 extern "C" void wsrep_thd_set_wsrep_last_query_id(THD *thd, query_id_t id);
-
-extern "C" void wsrep_thd_auto_increment_variables(THD*,
-                                                   unsigned long long *offset,
-                                                   unsigned long long *increment);
+extern "C" void wsrep_set_data_home_dir(const char *data_dir);
 
 extern void wsrep_close_client_connections(my_bool wait_to_end);
 extern int wsrep_wait_committing_connections_close(int wait_time);
diff --git a/sql/wsrep_sst.cc b/sql/wsrep_sst.cc
index 8816a31330e..0a1d95f30b8 100644
--- a/sql/wsrep_sst.cc
+++ b/sql/wsrep_sst.cc
@@ -58,6 +58,13 @@ bool wsrep_sst_method_update (sys_var *self, THD* thd, enum_var_type type)
   return 0;
 }
 
+static const char* data_home_dir = NULL;
+
+extern "C"
+void wsrep_set_data_home_dir(const char *data_dir)
+{
+  data_home_dir= (data_dir && *data_dir) ? data_dir : NULL;
+}
 
 static void make_wsrep_defaults_file()
 {
@@ -595,6 +602,29 @@ static int sst_append_auth_env(wsp::env& env, const char* sst_auth)
   return -env.error();
 }
 
+#define DATA_HOME_DIR_ENV "INNODB_DATA_HOME_DIR"
+
+static int sst_append_data_dir(wsp::env& env, const char* data_dir)
+{
+  int const data_dir_size= strlen(DATA_HOME_DIR_ENV) + 1 /* = */
+    + (data_dir ? strlen(data_dir) : 0) + 1 /* \0 */;
+
+  wsp::string data_dir_str(data_dir_size); // for automatic cleanup on return
+  if (!data_dir_str()) return -ENOMEM;
+
+  int ret= snprintf(data_dir_str(), data_dir_size, "%s=%s",
+                    DATA_HOME_DIR_ENV, data_dir ? data_dir : "");
+
+  if (ret < 0 || ret >= data_dir_size)
+  {
+    WSREP_ERROR("sst_append_data_dir(): snprintf() failed: %d", ret);
+    return (ret < 0 ? ret : -EMSGSIZE);
+  }
+
+  env.append(data_dir_str());
+  return -env.error();
+}
+
 static ssize_t sst_prepare_other (const char*  method,
                                   const char*  sst_auth,
                                   const char*  addr_in,
@@ -656,6 +686,16 @@ static ssize_t sst_prepare_other (const char*  method,
     return ret;
   }
 
+  if (data_home_dir)
+  {
+    if ((ret= sst_append_data_dir(env, data_home_dir)))
+    {
+      WSREP_ERROR("sst_prepare_other(): appending data "
+                  "directory failed: %d", ret);
+      return ret;
+    }
+  }
+
   pthread_t tmp;
   sst_thread_arg arg(cmd_str(), env());
   mysql_mutex_lock (&arg.lock);
@@ -1347,6 +1387,16 @@ wsrep_cb_status_t wsrep_sst_donate_cb (void* app_ctx, void* recv_ctx,
     return WSREP_CB_FAILURE;
   }
 
+  if (data_home_dir)
+  {
+    if ((ret= sst_append_data_dir(env, data_home_dir)))
+    {
+      WSREP_ERROR("wsrep_sst_donate_cb(): appending data "
+                  "directory failed: %d", ret);
+      return WSREP_CB_FAILURE;
+    }
+  }
+
   if (!strcmp (WSREP_SST_MYSQLDUMP, method))
   {
     ret = sst_donate_mysqldump(data, &current_gtid->uuid, uuid_str,
diff --git a/sql/wsrep_thd.cc b/sql/wsrep_thd.cc
index a83ea4ce1c6..ce6d9688cb3 100644
--- a/sql/wsrep_thd.cc
+++ b/sql/wsrep_thd.cc
@@ -676,25 +676,3 @@ bool wsrep_thd_has_explicit_locks(THD *thd)
   assert(thd);
   return thd->mdl_context.has_explicit_locks();
 }
-
-/*
-  Get auto increment variables for THD. Use global settings for
-  applier threads.
- */
-extern "C"
-void wsrep_thd_auto_increment_variables(THD* thd,
-                                        unsigned long long* offset,
-                                        unsigned long long* increment)
-{
-  if (thd->wsrep_exec_mode == REPL_RECV &&
-      thd->wsrep_conflict_state != REPLAYING)
-  {
-    *offset= global_system_variables.auto_increment_offset;
-    *increment= global_system_variables.auto_increment_increment;
-  }
-  else
-  {
-    *offset= thd->variables.auto_increment_offset;
-    *increment= thd->variables.auto_increment_increment;
-  }
-}
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index bfb7abd7ab3..e03fa1b58d6 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -3827,6 +3827,12 @@ static int innodb_init_params()
 	srv_data_home = innobase_data_home_dir
 		? innobase_data_home_dir : default_path;
 
+#ifdef WITH_WSREP
+	/* If we use the wsrep API, then we need to tell the server
+	the path to the data files (for passing it to the SST scripts): */
+	wsrep_set_data_home_dir(srv_data_home);
+#endif /* WITH_WSREP */
+
 	/*--------------- Shared tablespaces -------------------------*/
 
@@ -8047,7 +8053,8 @@ ha_innobase::write_row(
 		/* We need the upper limit of the col type to check for
 		whether we update the table autoinc counter or not. */
*/ - col_max_value = table->next_number_field->get_max_int_value(); + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); /* Get the value that MySQL attempted to store in the table.*/ auto_inc = table->next_number_field->val_uint(); @@ -8122,30 +8129,14 @@ set_max_autoinc: /* This should filter out the negative values set explicitly by the user. */ if (auto_inc <= col_max_value) { + ut_a(m_prebuilt->autoinc_increment > 0); + ulonglong offset; ulonglong increment; dberr_t err; -#ifdef WITH_WSREP - /* Applier threads which are - processing ROW events and don't go - through server level autoinc - processing, therefore m_prebuilt - autoinc values don't get - properly assigned. Fetch values from - server side. */ - if (wsrep_on(m_user_thd) && - wsrep_thd_exec_mode(m_user_thd) == REPL_RECV) { - wsrep_thd_auto_increment_variables(m_user_thd, &offset, &increment); - } else { -#endif /* WITH_WSREP */ - ut_a(m_prebuilt->autoinc_increment > 0); - - offset = m_prebuilt->autoinc_offset; - increment = m_prebuilt->autoinc_increment; -#ifdef WITH_WSREP - } -#endif /* WITH_WSREP */ + offset = m_prebuilt->autoinc_offset; + increment = m_prebuilt->autoinc_increment; auto_inc = innobase_next_autoinc( auto_inc, @@ -8843,27 +8834,12 @@ ha_innobase::update_row( /* A value for an AUTO_INCREMENT column was specified in the UPDATE statement. */ - ulonglong offset, increment; -#ifdef WITH_WSREP - /* Applier threads which are processing ROW events and - don't go through server level autoinc processing, - therefore m_prebuilt autoinc values don't get properly - assigned. Fetch values from server side. */ - if (wsrep_on(m_user_thd) - && wsrep_thd_exec_mode(m_user_thd) == REPL_RECV) { - wsrep_thd_auto_increment_variables( - m_user_thd, &offset, &increment); - } else { -#endif /* WITH_WSREP */ - offset = m_prebuilt->autoinc_offset; - increment = m_prebuilt->autoinc_increment; -#ifdef WITH_WSREP - } -#endif /* WITH_WSREP */ - autoinc = innobase_next_autoinc( - autoinc, 1, increment, offset, - table->found_next_number_field->get_max_int_value()); + autoinc, 1, + m_prebuilt->autoinc_increment, + m_prebuilt->autoinc_offset, + innobase_get_int_col_max_value( + table->found_next_number_field)); error = innobase_set_max_autoinc(autoinc); @@ -16511,14 +16487,14 @@ ha_innobase::get_auto_increment( increment, thd_get_thread_id(m_user_thd), current, autoinc); - if (!wsrep_on(m_user_thd)) { - current = innobase_next_autoinc( - autoinc - - m_prebuilt->autoinc_increment, - 1, increment, offset, col_max_value); + current = autoinc + - m_prebuilt->autoinc_increment; } + current = innobase_next_autoinc( + current, 1, increment, offset, col_max_value); + dict_table_autoinc_initialize( m_prebuilt->table, current); diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc index 94f9ade9d65..cd13384f144 100644 --- a/storage/maria/ha_maria.cc +++ b/storage/maria/ha_maria.cc @@ -1472,6 +1472,7 @@ int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt) while ((error= repair(thd, param, 0)) && param->retry_repair) { param->retry_repair= 0; + file->state->records= start_records; if (test_all_bits(param->testflag, (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK))) { @@ -1976,6 +1977,7 @@ int ha_maria::disable_indexes(uint mode) int ha_maria::enable_indexes(uint mode) { int error; + ha_rows start_rows= file->state->records; DBUG_PRINT("info", ("ha_maria::enable_indexes mode: %d", mode)); if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys)) { @@ -2038,6 +2040,7 @@ int ha_maria::enable_indexes(uint 
mode) DBUG_ASSERT(thd->killed != 0); /* Repairing by sort failed. Now try standard repair method. */ param->testflag &= ~T_REP_BY_SORT; + file->state->records= start_rows; error= (repair(thd, param, 0) != HA_ADMIN_OK); /* If the standard repair succeeded, clear all error messages which diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c index 0ae3868dbf6..3850af8246c 100644 --- a/storage/maria/ma_commit.c +++ b/storage/maria/ma_commit.c @@ -98,7 +98,7 @@ int maria_commit(MARIA_HA *info) if (!info->s->now_transactional) return 0; trn= info->trn; - info->trn= 0; /* checked in maria_close() */ + _ma_reset_trn_for_table(info); return ma_commit(trn); } diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c index 1189594fd2b..da44da123d2 100644 --- a/storage/maria/ma_info.c +++ b/storage/maria/ma_info.c @@ -56,7 +56,11 @@ int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag) } if (flag & HA_STATUS_VARIABLE) { + /* If the table is locked, return the versioned row count; otherwise the last committed one */ - x->records = info->state->records; + if (info->lock_type == F_UNLCK) + x->records = share->state.state.records; + else + x->records = info->state->records; x->deleted = share->state.state.del; x->delete_length = share->state.state.empty; x->data_file_length = share->state.state.data_file_length; diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c index c197aac0da7..e015007c9fd 100644 --- a/storage/maria/ma_recovery.c +++ b/storage/maria/ma_recovery.c @@ -3552,8 +3552,8 @@ void _ma_tmp_disable_logging_for_table(MARIA_HA *info, info->state may point to a state that was deleted by _ma_trnman_end_trans_hook() */ - share->state.common= *info->state; - info->state= &share->state.common; + share->state.no_logging= *info->state; + info->state= &share->state.no_logging; info->switched_transactional= TRUE; /* @@ -3609,6 +3609,10 @@ my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages) _ma_copy_nontrans_state_information(info); _ma_reset_history(info->s); + /* Reset state to point to state.common, as on open() */ + info->state= &share->state.common; + *info->state= share->state.state; + if (flush_pages) { /* Ensure that recover is not executing any redo before this */ diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c index 50930dae2ea..ae3b95db6ec 100644 --- a/storage/maria/maria_chk.c +++ b/storage/maria/maria_chk.c @@ -1160,7 +1160,7 @@ static int maria_chk(HA_CHECK *param, char *filename) { fprintf(stderr, "Aria table '%s' is not fixed because of errors\n", filename); - return(-1); + DBUG_RETURN(-1); } recreate=1; if (!(param->testflag & T_REP_ANY)) @@ -1182,7 +1182,7 @@ static int maria_chk(HA_CHECK *param, char *filename) param->total_deleted+=info->state->del; descript(param, info, filename); maria_close(info); /* Should always succeed */ - return(0); + DBUG_RETURN(0); } if (!stopwords_inited++) diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h index 76233ef4a94..dd2e2949856 100644 --- a/storage/maria/maria_def.h +++ b/storage/maria/maria_def.h @@ -150,6 +150,8 @@ typedef struct st_maria_state_info MARIA_STATUS_INFO state; /* maria_ha->state points here for crash-safe but not versioned tables */ MARIA_STATUS_INFO common; + /* State for a versioned table that is temporarily non-versioned */ + MARIA_STATUS_INFO no_logging; ha_rows split; /* number of split blocks */ my_off_t dellink; /* Link to next removed block */ pgcache_page_no_t first_bitmap_with_space; diff --git a/storage/rocksdb/CMakeLists.txt
b/storage/rocksdb/CMakeLists.txt index e56c5b1033e..090763a2ece 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -159,6 +159,9 @@ if (UNIX AND NOT APPLE) endif() TARGET_LINK_LIBRARIES(rocksdb rocksdb_aux_lib) + FIND_LIBRARY(LZ4_LIBRARY + NAMES liblz4${PIC_EXT}.a lz4 + HINTS ${WITH_LZ4}/lib) IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") @@ -186,8 +189,11 @@ IF(HAVE_SCHED_GETCPU) ENDIF() IF (WITH_TBB) + FIND_LIBRARY(TBB_LIBRARY + NAMES libtbb${PIC_EXT}.a tbb + HINTS ${WITH_TBB}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_TBB}/lib/libtbb${PIC_EXT}.a) + ${TBB_LIBRARY}) ADD_DEFINITIONS(-DTBB) ENDIF() diff --git a/storage/rocksdb/build_rocksdb.cmake b/storage/rocksdb/build_rocksdb.cmake index d854bdaa03a..c36c761f92f 100644 --- a/storage/rocksdb/build_rocksdb.cmake +++ b/storage/rocksdb/build_rocksdb.cmake @@ -178,6 +178,7 @@ set(ROCKSDB_SOURCES db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc + db/error_handler.cc db/event_helpers.cc db/experimental.cc db/external_sst_file_ingestion_job.cc @@ -188,6 +189,7 @@ set(ROCKSDB_SOURCES db/internal_stats.cc db/log_reader.cc db/log_writer.cc + db/logs_with_prep_tracker.cc db/malloc_stats.cc db/managed_iterator.cc db/memtable.cc @@ -284,6 +286,7 @@ set(ROCKSDB_SOURCES util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc + util/compression_context_cache.cc util/concurrent_arena.cc util/crc32c.cc util/delete_scheduler.cc @@ -304,6 +307,7 @@ set(ROCKSDB_SOURCES util/status_message.cc util/string_util.cc util/sync_point.cc + util/sync_point_impl.cc util/testutil.cc util/thread_local.cc util/threadpool_imp.cc @@ -352,6 +356,8 @@ set(ROCKSDB_SOURCES utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc utilities/transactions/write_prepared_txn_db.cc + utilities/transactions/write_unprepared_txn.cc + utilities/transactions/write_unprepared_txn_db.cc utilities/ttl/db_ttl_impl.cc utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -379,6 +385,13 @@ ENDFOREACH() if(MSVC) add_definitions(-DHAVE_SSE42 -DHAVE_PCLMUL) + # Workaround broken compilation with -DWIN32_LEAN_AND_MEAN + # (https://github.com/facebook/rocksdb/issues/4344) + set_source_files_properties(${ROCKSDB_SOURCE_DIR}/port/win/env_win.cc + PROPERTIES COMPILE_FLAGS "/FI\"windows.h\" /FI\"winioctl.h\"") + + # Workaround Win8.1 SDK bug that breaks /permissive- + string(REPLACE "/permissive-" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") else() set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul ${CXX11_FLAGS}") diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 2b6ba2616d8..26c0aefd56b 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -32,6 +32,7 @@ /* C++ standard header files */ #include +#include #include #include #include @@ -127,6 +128,60 @@ const std::string DEFAULT_CF_NAME("default"); const std::string DEFAULT_SYSTEM_CF_NAME("__system__"); const std::string PER_INDEX_CF_NAME("$per_index_cf"); +class Rdb_explicit_snapshot; + +std::mutex explicit_snapshot_mutex; +ulonglong explicit_snapshot_counter = 0; +std::unordered_map<ulonglong, std::weak_ptr<Rdb_explicit_snapshot>> + explicit_snapshots; +static std::vector<GL_INDEX_ID> rdb_indexes_to_recalc; + +#ifdef MARIADB_NOT_YET +class Rdb_explicit_snapshot : public explicit_snapshot { + std::unique_ptr<rocksdb::ManagedSnapshot> snapshot; + + public: + static std::shared_ptr<Rdb_explicit_snapshot> + create(snapshot_info_st *ss_info, rocksdb::DB *db, + const rocksdb::Snapshot *snapshot) { + std::lock_guard<std::mutex> lock(explicit_snapshot_mutex); + auto s = std::unique_ptr<rocksdb::ManagedSnapshot>( + new rocksdb::ManagedSnapshot(db, snapshot)); + if (!s) { + return nullptr; + } + ss_info->snapshot_id = ++explicit_snapshot_counter; + auto ret = std::make_shared<Rdb_explicit_snapshot>(*ss_info, std::move(s)); + if (!ret) { + return nullptr; + } + explicit_snapshots[ss_info->snapshot_id] = ret; + return ret; + } + + static std::shared_ptr<Rdb_explicit_snapshot> + get(const ulonglong snapshot_id) { + std::lock_guard<std::mutex> lock(explicit_snapshot_mutex); + auto elem = explicit_snapshots.find(snapshot_id); + if (elem == explicit_snapshots.end()) { + return nullptr; + } + return elem->second.lock(); + } + + rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); } + + Rdb_explicit_snapshot(snapshot_info_st ss_info, + std::unique_ptr<rocksdb::ManagedSnapshot> snapshot) + : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {} + + virtual ~Rdb_explicit_snapshot() { + std::lock_guard<std::mutex> lock(explicit_snapshot_mutex); + explicit_snapshots.erase(ss_info.snapshot_id); + } +}; +#endif + /** Updates row counters based on the table type and operation type. */ @@ -144,11 +199,15 @@ static handler *rocksdb_create_handler(my_core::handlerton *hton, my_core::TABLE_SHARE *table_arg, my_core::MEM_ROOT *mem_root); -static rocksdb::CompactRangeOptions getCompactRangeOptions() { +static rocksdb::CompactRangeOptions +getCompactRangeOptions(int concurrency = 0) { rocksdb::CompactRangeOptions compact_range_options; compact_range_options.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce; compact_range_options.exclusive_manual_compaction = false; + if (concurrency > 0) { + compact_range_options.max_subcompactions = concurrency; + } return compact_range_options; } @@ -187,6 +246,8 @@ Rdb_io_watchdog *io_watchdog = nullptr; static Rdb_background_thread rdb_bg_thread; +static Rdb_manual_compaction_thread rdb_mc_thread; + // List of table names (using regex) that are exceptions to the strict // collation check requirement.
Regex_list_handler *rdb_collation_exceptions; @@ -200,30 +261,6 @@ static void rocksdb_flush_all_memtables() { } } -static void rocksdb_compact_column_family_stub( - THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, - const void *const save) {} - -static int rocksdb_compact_column_family(THD *const thd, - struct st_mysql_sys_var *const var, - void *const var_ptr, - struct st_mysql_value *const value) { - char buff[STRING_BUFFER_USUAL_SIZE]; - int len = sizeof(buff); - - DBUG_ASSERT(value != nullptr); - - if (const char *const cf = value->val_str(value, buff, &len)) { - auto cfh = cf_manager.get_cf(cf); - if (cfh != nullptr && rdb != nullptr) { - sql_print_verbose_info("RocksDB: Manual compaction of column family: %s\n", - cf); - rdb->CompactRange(getCompactRangeOptions(), cfh, nullptr, nullptr); - } - } - return HA_EXIT_SUCCESS; -} - /////////////////////////////////////////////////////////// // Hash map: table name => open table handler /////////////////////////////////////////////////////////// @@ -249,6 +286,8 @@ struct Rdb_open_tables_map { Rdb_open_tables_map() : m_hash(get_hash_key, system_charset_info) { } + void free_hash(void) { m_hash.~Rdb_table_set(); } + std::vector<std::string> get_table_names(void) const; }; @@ -368,6 +407,7 @@ static void rocksdb_drop_index_wakeup_thread( static my_bool rocksdb_pause_background_work = 0; static mysql_mutex_t rdb_sysvars_mutex; +static mysql_mutex_t rdb_block_cache_resize_mutex; static void rocksdb_set_pause_background_work( my_core::THD *const, @@ -450,6 +490,9 @@ static void rocksdb_set_wal_bytes_per_sync(THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr, const void *const save); +static int rocksdb_validate_set_block_cache_size( + THD *thd, struct st_mysql_sys_var *const var, void *var_ptr, + struct st_mysql_value *value); ////////////////////////////////////////////////////////////////////////////// // Options definitions ////////////////////////////////////////////////////////////////////////////// @@ -510,11 +553,19 @@ static char* rocksdb_git_hash; char *compression_types_val= const_cast<char *>(get_rocksdb_supported_compression_types()); +static unsigned long rocksdb_write_policy = + rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; +static my_bool rocksdb_error_on_suboptimal_collation = 1; +static uint32_t rocksdb_stats_recalc_rate = 0; +static uint32_t rocksdb_debug_manual_compaction_delay = 0; +static uint32_t rocksdb_max_manual_compactions = 0; std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0); std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0); std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0); std::atomic<uint64_t> rocksdb_wal_group_syncs(0); +std::atomic<uint64_t> rocksdb_manual_compactions_processed(0); +std::atomic<uint64_t> rocksdb_manual_compactions_running(0); @@ -600,6 +651,14 @@ static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options = static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter; +/* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */ +static const char *write_policy_names[] = {"write_committed", "write_prepared", + "write_unprepared", NullS}; + +static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1, + "write_policy_typelib", + write_policy_names, nullptr}; + /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */ static const char *info_log_level_names[] = {"debug_level", "info_level", "warn_level", "error_level", @@ -694,6 +753,14 @@ static int rocksdb_validate_flush_log_at_trx_commit( *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value); return HA_EXIT_SUCCESS; } +static void rocksdb_compact_column_family_stub(
+ THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, + const void *const save) {} + +static int rocksdb_compact_column_family(THD *const thd, + struct st_mysql_sys_var *const var, + void *const var_ptr, + struct st_mysql_value *const value); static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS}; @@ -702,7 +769,8 @@ static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1, nullptr}; const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024; -const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024; +const ulong RDB_DEFAULT_MAX_ROW_LOCKS = 1024 * 1024; +const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024; const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000; const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024; const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024; @@ -733,6 +801,11 @@ static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG, /*min*/ 2, /*max*/ ULONG_MAX, 0); +static MYSQL_THDVAR_BOOL( + commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG, + "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr, + nullptr, FALSE); + static MYSQL_THDVAR_BOOL( trace_sst_api, PLUGIN_VAR_RQCMDARG, "Generate trace output in the log for each call to the SstFileWriter", @@ -744,6 +817,13 @@ static MYSQL_THDVAR_BOOL( "unique_checks and enables rocksdb_commit_in_the_middle.", rocksdb_check_bulk_load, nullptr, FALSE); +static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG, + "Allow bulk loading of sk keys during bulk-load. " + "Can be changed only when bulk load is disabled.", + /* Intentionally reuse unsorted's check function */ + rocksdb_check_bulk_load_allow_unsorted, nullptr, + FALSE); + static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG, "Allow unsorted input during bulk-load. 
" "Can be changed only when bulk load is disabled.", @@ -794,7 +874,7 @@ static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG, static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG, "Maximum number of locks a transaction can have", nullptr, nullptr, - /*default*/ RDB_MAX_ROW_LOCKS, + /*default*/ RDB_DEFAULT_MAX_ROW_LOCKS, /*min*/ 1, /*max*/ RDB_MAX_ROW_LOCKS, 0); @@ -846,6 +926,12 @@ static MYSQL_THDVAR_ULONGLONG( /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY, /* max */ SIZE_T_MAX, 1); +static MYSQL_THDVAR_INT( + manual_compaction_threads, PLUGIN_VAR_RQCMDARG, + "How many rocksdb threads to run for manual compactions", nullptr, nullptr, + /* default rocksdb.dboption max_subcompactions */ 0, + /* min */ 0, /* max */ 128, 0); + static MYSQL_SYSVAR_BOOL( create_if_missing, *reinterpret_cast(&rocksdb_db_options->create_if_missing), @@ -867,6 +953,12 @@ static MYSQL_SYSVAR_BOOL( "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr, rocksdb_db_options->manual_wal_flush); +static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "DBOptions::write_policy for RocksDB", nullptr, + nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED, + &write_policy_typelib); + static MYSQL_SYSVAR_BOOL( create_missing_column_families, *reinterpret_cast( @@ -1077,7 +1169,9 @@ static MYSQL_SYSVAR_INT(table_cache_numshardbits, "DBOptions::table_cache_numshardbits for RocksDB", nullptr, nullptr, rocksdb_db_options->table_cache_numshardbits, - /* min */ 0, /* max */ INT_MAX, 0); + // LRUCache limits this to 19 bits, anything greater + // fails to create a cache and returns a nullptr + /* min */ 0, /* max */ 19, 0); static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -1187,8 +1281,9 @@ static MYSQL_SYSVAR_BOOL( "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true); static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "block_cache size for RocksDB", nullptr, nullptr, + PLUGIN_VAR_RQCMDARG, + "block_cache size for RocksDB", + rocksdb_validate_set_block_cache_size, nullptr, /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE, /* min */ RDB_MIN_BLOCK_CACHE_SIZE, /* max */ LONGLONG_MAX, @@ -1435,6 +1530,18 @@ static MYSQL_SYSVAR_BOOL( "on PK TTL data. This variable is a no-op in non-debug builds.", nullptr, nullptr, FALSE); +static MYSQL_SYSVAR_UINT( + max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG, + "Maximum number of pending + ongoing number of manual compactions.", + nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0); + +static MYSQL_SYSVAR_UINT( + debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay, + PLUGIN_VAR_RQCMDARG, + "For debugging purposes only. Sleeping specified seconds " + "for simulating long running compactions.", + nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0); + static MYSQL_SYSVAR_BOOL( reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG, "Reset the RocksDB internal statistics without restarting the DB.", nullptr, @@ -1597,6 +1704,13 @@ static MYSQL_SYSVAR_UINT( RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0, /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0); +static MYSQL_SYSVAR_UINT( + stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG, + "The number of indexes per second to recalculate statistics for. 
0 to " + "disable background recalculation.", + nullptr, nullptr, 0 /* default value */, 0 /* min value */, + UINT_MAX /* max value */, 0); + static MYSQL_SYSVAR_BOOL( large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, "Support large index prefix length of 3072 bytes. If off, the maximum " @@ -1610,16 +1724,25 @@ static MYSQL_SYSVAR_BOOL( "detected.", nullptr, nullptr, FALSE); +static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation, + rocksdb_error_on_suboptimal_collation, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Raise an error instead of warning if a sub-optimal " + "collation is used", + nullptr, nullptr, TRUE); + static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100; static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(deadlock_detect), MYSQL_SYSVAR(deadlock_detect_depth), + MYSQL_SYSVAR(commit_time_batch_for_recovery), MYSQL_SYSVAR(max_row_locks), MYSQL_SYSVAR(write_batch_max_bytes), MYSQL_SYSVAR(lock_scanned_rows), MYSQL_SYSVAR(bulk_load), + MYSQL_SYSVAR(bulk_load_allow_sk), MYSQL_SYSVAR(bulk_load_allow_unsorted), MYSQL_SYSVAR(skip_unique_check_tables), MYSQL_SYSVAR(trace_sst_api), @@ -1637,6 +1760,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(create_if_missing), MYSQL_SYSVAR(two_write_queues), MYSQL_SYSVAR(manual_wal_flush), + MYSQL_SYSVAR(write_policy), MYSQL_SYSVAR(create_missing_column_families), MYSQL_SYSVAR(error_if_exists), MYSQL_SYSVAR(paranoid_checks), @@ -1754,6 +1878,11 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(allow_to_start_after_corruption), MYSQL_SYSVAR(git_hash), + MYSQL_SYSVAR(error_on_suboptimal_collation), + MYSQL_SYSVAR(stats_recalc_rate), + MYSQL_SYSVAR(debug_manual_compaction_delay), + MYSQL_SYSVAR(max_manual_compactions), + MYSQL_SYSVAR(manual_compaction_threads), nullptr}; static rocksdb::WriteOptions @@ -1768,6 +1897,50 @@ rdb_get_rocksdb_write_options(my_core::THD *const thd) { return opt; } +static int rocksdb_compact_column_family(THD *const thd, + struct st_mysql_sys_var *const var, + void *const var_ptr, + struct st_mysql_value *const value) { + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + DBUG_ASSERT(value != nullptr); + + if (const char *const cf = value->val_str(value, buff, &len)) { + auto cfh = cf_manager.get_cf(cf); + if (cfh != nullptr && rdb != nullptr) { + int mc_id = rdb_mc_thread.request_manual_compaction( + cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads)); + if (mc_id == -1) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "Can't schedule more manual compactions. " + "Increase rocksdb_max_manual_compactions or stop issuing " + "more manual compactions."); + return HA_EXIT_FAILURE; + } else if (mc_id < 0) { + return HA_EXIT_FAILURE; + } + // NO_LINT_DEBUG + sql_print_information("RocksDB: Manual compaction of column family: %s\n", + cf); + // Checking thd state every short cycle (100ms). This is for allowing to + // exiting this function without waiting for CompactRange to finish. + do { + my_sleep(100000); + } while (!thd->killed && + !rdb_mc_thread.is_manual_compaction_finished(mc_id)); + + if (thd->killed) { + // This cancels if requested compaction state is INITED. + // TODO(yoshinorim): Cancel running compaction as well once + // it is supported in RocksDB. 
+ rdb_mc_thread.clear_manual_compaction_request(mc_id, true); + } + } + } + return HA_EXIT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////////////////// /** @@ -1887,8 +2060,6 @@ protected: bool m_is_two_phase = false; private: - /* Number of RockDB savepoints taken */ - int m_n_savepoints; /* Number of write operations this transaction had when we took the last savepoint (the idea is not to take another savepoint if we haven't made @@ -1897,9 +2068,9 @@ private: ulonglong m_writes_at_last_savepoint; protected: - THD *m_thd = nullptr; - rocksdb::ReadOptions m_read_opts; +protected: + THD *m_thd = nullptr; static std::multiset s_tx_list; static mysql_mutex_t s_tx_list_mutex; @@ -1957,6 +2128,7 @@ protected: } public: + rocksdb::ReadOptions m_read_opts; const char *m_mysql_log_file_name; my_off_t m_mysql_log_offset; #ifdef MARIAROCKS_NOT_YET @@ -1967,6 +2139,7 @@ protected: String m_detailed_error; int64_t m_snapshot_timestamp = 0; bool m_ddl_transaction; + std::shared_ptr m_explicit_snapshot; /* Tracks the number of tables in use through external_lock. @@ -2032,7 +2205,7 @@ protected: if (s.IsDeadlock()) { my_core::thd_mark_transaction_to_rollback(thd, - false /* just statement */); + true /* whole transaction */); m_detailed_error = String(); table_handler->m_deadlock_counter.inc(); rocksdb_row_lock_deadlocks++; @@ -2049,7 +2222,7 @@ protected: } m_detailed_error = String(" (snapshot conflict)", system_charset_info); table_handler->m_deadlock_counter.inc(); - return HA_ERR_LOCK_DEADLOCK; + return HA_ERR_ROCKSDB_STATUS_BUSY; } if (s.IsIOError() || s.IsCorruption()) { @@ -2462,7 +2635,6 @@ public: entire transaction. */ do_set_savepoint(); - m_n_savepoints= 1; m_writes_at_last_savepoint= m_write_count; } @@ -2479,7 +2651,6 @@ public: { do_set_savepoint(); m_writes_at_last_savepoint= m_write_count; - m_n_savepoints++; } } @@ -2490,10 +2661,14 @@ public: void rollback_to_stmt_savepoint() { if (m_writes_at_last_savepoint != m_write_count) { do_rollback_to_savepoint(); - if (!--m_n_savepoints) { - do_set_savepoint(); - m_n_savepoints= 1; - } + /* + RollbackToSavePoint "removes the most recent SetSavePoint()", so + we need to set it again so that next statement can roll back to this + stage. + It's ok to do it here at statement end (instead of doing it at next + statement start) because setting a savepoint is cheap. 
+ */ + do_set_savepoint(); m_writes_at_last_savepoint= m_write_count; } } @@ -2666,6 +2841,17 @@ public: void acquire_snapshot(bool acquire_now) override { if (m_read_opts.snapshot == nullptr) { +#ifdef MARIAROCKS_NOT_YET + const auto thd_ss = std::static_pointer_cast( + m_thd->get_explicit_snapshot()); + if (thd_ss) { + m_explicit_snapshot = thd_ss; + } + if (m_explicit_snapshot) { + auto snapshot = m_explicit_snapshot->get_snapshot()->snapshot(); + snapshot_created(snapshot); + } else +#endif if (is_tx_read_only()) { snapshot_created(rdb->GetSnapshot()); } else if (acquire_now) { @@ -2683,6 +2869,12 @@ public: if (m_read_opts.snapshot != nullptr) { m_snapshot_timestamp = 0; +#ifdef MARIAROCKS_NOT_YET + if (m_explicit_snapshot) { + m_explicit_snapshot.reset(); + need_clear = false; + } else +#endif if (is_tx_read_only()) { rdb->ReleaseSnapshot(m_read_opts.snapshot); need_clear = false; @@ -2792,6 +2984,10 @@ public: tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec); tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect); tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth); + // If this variable is set, this will write commit time write batch + // information on recovery or memtable flush. + tx_opts.use_only_the_last_commit_time_batch_for_recovery = + THDVAR(m_thd, commit_time_batch_for_recovery); tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes); write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); @@ -2886,7 +3082,7 @@ public: /* This is a rocksdb write batch. This class doesn't hold or wait on any transaction locks (skips rocksdb transaction API) thus giving better - performance. The commit is done through rdb->GetBaseDB()->Commit(). + performance. Currently this is only used for replication threads which are guaranteed to be non-conflicting. Any further usage of this class should completely @@ -2908,6 +3104,8 @@ private: bool commit_no_binlog() override { bool res = false; rocksdb::Status s; + rocksdb::TransactionDBWriteOptimizations optimize; + optimize.skip_concurrency_control = true; s = merge_auto_incr_map(m_batch->GetWriteBatch()); if (!s.ok()) { @@ -2918,7 +3116,7 @@ private: release_snapshot(); - s = rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch()); + s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch()); if (!s.ok()) { rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT); res = true; @@ -2936,7 +3134,6 @@ error: return res; } -protected: /* Implementations of do_*savepoint based on rocksdB::WriteBatch savepoints */ void do_set_savepoint() override { m_batch->SetSavePoint(); @@ -2946,6 +3143,7 @@ protected: m_batch->RollbackToSavePoint(); } + public: bool is_writebatch_trx() const override { return true; } @@ -3033,6 +3231,13 @@ public: get_for_update(rocksdb::ColumnFamilyHandle *const column_family, const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, bool exclusive) override { + if (value == nullptr) { + rocksdb::PinnableSlice pin_val; + rocksdb::Status s = get(column_family, key, &pin_val); + pin_val.Reset(); + return s; + } + return get(column_family, key, value); } @@ -3564,6 +3769,7 @@ static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx) We get here when committing a statement within a transaction. 
*/ tx->make_stmt_savepoint_permanent(); + tx->make_stmt_savepoint_permanent(); } if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { @@ -3752,6 +3958,7 @@ private: if (!path_entry.path.empty() && !path_entry.limit_exceeded) { auto deadlocking_txn = *(path_entry.path.end() - 1); deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id; + deadlock_info.deadlock_time = path_entry.deadlock_time; } return deadlock_info; } @@ -3799,16 +4006,18 @@ private: path_data += "\n*** DEADLOCK PATH\n" "=========================================\n"; const auto dl_info = get_dl_path_trx_info(path_entry); + const auto deadlock_time = dl_info.deadlock_time; for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) { const auto trx_info = *it; path_data += format_string( + "TIMESTAMP: %" PRId64 "\n" "TRANSACTION ID: %u\n" "COLUMN FAMILY NAME: %s\n" "WAITING KEY: %s\n" "LOCK TYPE: %s\n" "INDEX NAME: %s\n" "TABLE NAME: %s\n", - trx_info.trx_id, trx_info.cf_name.c_str(), + deadlock_time, trx_info.trx_id, trx_info.cf_name.c_str(), trx_info.waiting_key.c_str(), trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED", trx_info.index_name.c_str(), trx_info.table_name.c_str()); @@ -4082,7 +4291,7 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, (ulonglong)internal_cache_count * kDefaultInternalCacheSize); str.append(buf); res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print); -#ifdef MARIAROCKS_NOT_YET + /* Show the background thread status */ std::vector thread_list; rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list); @@ -4119,8 +4328,27 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, str, stat_print); } } + +#ifdef MARIAROCKS_NOT_YET + /* Explicit snapshot information */ + str.clear(); + { + std::lock_guard lock(explicit_snapshot_mutex); + for (const auto &elem : explicit_snapshots) { + const auto &ss = elem.second.lock(); + DBUG_ASSERT(ss != nullptr); + const auto &info = ss->ss_info; + str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) + + "\nBinlog File: " + info.binlog_file + + "\nBinlog Pos: " + std::to_string(info.binlog_pos) + + "\nGtid Executed: " + info.gtid_executed + "\n"; + } + } #endif + if (!str.empty()) { + res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print); + } #ifdef MARIAROCKS_NOT_YET } else if (stat_type == HA_ENGINE_TRX) { /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */ @@ -4143,6 +4371,50 @@ static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd, static const char *ha_rocksdb_exts[] = {NullS}; +#ifdef MARIAROCKS_NOT_YET +static bool rocksdb_explicit_snapshot( + handlerton *const /* hton */, /*!< in: RocksDB handlerton */ + THD *const thd, /*!< in: MySQL thread handle */ + snapshot_info_st *ss_info) /*!< out: Snapshot information */ +{ + switch (ss_info->op) { + case snapshot_operation::SNAPSHOT_CREATE: { + if (mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } + auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot()); + if (mysql_bin_log_is_open()) { + mysql_bin_log_unlock_commits(ss_info); + } + + thd->set_explicit_snapshot(s); + return s == nullptr; + } + case snapshot_operation::SNAPSHOT_ATTACH: { + auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id); + if (!s) { + return true; + } + *ss_info = s->ss_info; + thd->set_explicit_snapshot(s); + return false; + } + case snapshot_operation::SNAPSHOT_RELEASE: { + if (!thd->get_explicit_snapshot()) { + return true; + } + *ss_info = 
thd->get_explicit_snapshot()->ss_info; + thd->set_explicit_snapshot(nullptr); + return false; + } + default: + DBUG_ASSERT(false); + return true; + } + return true; +} +#endif + /* Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT @@ -4165,10 +4437,15 @@ static const char *ha_rocksdb_exts[] = {NullS}; InnoDB and RocksDB transactions. */ static int rocksdb_start_tx_and_assign_read_view( - handlerton *const hton, /*!< in: RocksDB handlerton */ - THD* thd) /*!< in: MySQL thread handle of the - user for whom the transaction should - be committed */ + handlerton *const hton, /*!< in: RocksDB handlerton */ + THD *const thd /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ +) +#ifdef MARIAROCKS_NOT_YET + snapshot_info_st *ss_info) /*!< in/out: Snapshot info like binlog file, pos, + gtid executed and snapshot ID */ +#endif { ulong const tx_isolation = my_core::thd_tx_isolation(thd); @@ -4176,14 +4453,25 @@ static int rocksdb_start_tx_and_assign_read_view( my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0)); return HA_EXIT_FAILURE; } + +#ifdef MARIADB_NOT_YET + if (ss_info) { + if (mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } else { + return HA_EXIT_FAILURE; + } +#endif + /* MariaDB: there is no need to call mysql_bin_log_lock_commits and then unlock back. SQL layer calls start_consistent_snapshot() for all engines, including the binlog under LOCK_commit_ordered mutex. + The mutex prevents binlog commits from happening (right?) while the storage engine(s) allocate read snapshots. That way, each storage engine is - synchronized with current binlog position. + synchronized with current binlog position. */ mysql_mutex_assert_owner(&LOCK_commit_ordered); @@ -4195,9 +4483,106 @@ static int rocksdb_start_tx_and_assign_read_view( rocksdb_register_tx(hton, thd, tx); tx->acquire_snapshot(true); +#ifdef MARIADB_NOT_YET + if (ss_info) { + mysql_bin_log_unlock_commits(ss_info); + } +#endif return HA_EXIT_SUCCESS; } +#ifdef MARIADB_NOT_YET +static int rocksdb_start_tx_with_shared_read_view( + handlerton *const hton, /*!< in: RocksDB handlerton */ + THD *const thd) /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ +#ifdef MARIADB_NOT_YET + snapshot_info_st *ss_info) /*!< out: Snapshot info like binlog file, pos, + gtid executed and snapshot ID */ +#endif +{ + DBUG_ASSERT(thd != nullptr); + + int error = HA_EXIT_SUCCESS; + + ulong const tx_isolation = my_core::thd_tx_isolation(thd); + if (tx_isolation != ISO_REPEATABLE_READ) { + my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0)); + return HA_EXIT_FAILURE; + } + + Rdb_transaction *tx = nullptr; +#ifdef MARIADB_NOT_YET + std::shared_ptr explicit_snapshot; + const auto op = ss_info->op; + + DBUG_ASSERT(op == snapshot_operation::SNAPSHOT_CREATE || + op == snapshot_operation::SNAPSHOT_ATTACH); + + // case: if binlogs are available get binlog file/pos and gtid info + if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } + + if (op == snapshot_operation::SNAPSHOT_ATTACH) { + explicit_snapshot = Rdb_explicit_snapshot::get(ss_info->snapshot_id); + if (!explicit_snapshot) { + my_printf_error(ER_UNKNOWN_ERROR, "Snapshot %llu does not exist", MYF(0), + ss_info->snapshot_id); + error = HA_EXIT_FAILURE; + } + } +#endif + + // case: all good till now + if (error == HA_EXIT_SUCCESS) { + tx = get_or_create_tx(thd); + Rdb_perf_context_guard guard(tx, 
rocksdb_perf_context_level(thd)); + +#ifdef MARIADB_NOT_YET + if (explicit_snapshot) { + tx->m_explicit_snapshot = explicit_snapshot; + } +#endif + + DBUG_ASSERT(!tx->has_snapshot()); + tx->set_tx_read_only(true); + rocksdb_register_tx(hton, thd, tx); + tx->acquire_snapshot(true); + +#ifdef MARIADB_NOT_YET + // case: an explicit snapshot was not assigned to this transaction + if (!tx->m_explicit_snapshot) { + tx->m_explicit_snapshot = + Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot); + if (!tx->m_explicit_snapshot) { + my_printf_error(ER_UNKNOWN_ERROR, "Could not create snapshot", MYF(0)); + error = HA_EXIT_FAILURE; + } + } +#endif + } + +#ifdef MARIADB_NOT_YET + // case: unlock the binlog + if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) { + mysql_bin_log_unlock_commits(ss_info); + } + + DBUG_ASSERT(error == HA_EXIT_FAILURE || tx->m_explicit_snapshot); + + // copy over the snapshot details to pass to the upper layers + if (tx->m_explicit_snapshot) { + *ss_info = tx->m_explicit_snapshot->ss_info; + ss_info->op = op; + } +#endif + + return error; +} +#endif + /* Dummy SAVEPOINT support. This is needed for long running transactions * like mysqldump (https://bugs.mysql.com/bug.php?id=71017). * Current SAVEPOINT does not correctly handle ROLLBACK and does not return @@ -4422,9 +4807,11 @@ static int rocksdb_init_func(void *const p) { rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key); rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key, rdb_signal_drop_idx_psi_cond_key); + rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key); #else rdb_bg_thread.init(); rdb_drop_idx_thread.init(); + rdb_mc_thread.init(); #endif mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex, MY_MUTEX_INIT_FAST); @@ -4445,6 +4832,8 @@ static int rocksdb_init_func(void *const p) { mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex, MY_MUTEX_INIT_FAST); + mysql_mutex_init(rdb_block_cache_resize_mutex_key, + &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST); Rdb_transaction::init_mutex(); rocksdb_hton->state = SHOW_OPTION_YES; @@ -4465,8 +4854,14 @@ static int rocksdb_init_func(void *const p) { rocksdb_hton->rollback = rocksdb_rollback; rocksdb_hton->show_status = rocksdb_show_status; +#ifdef MARIADB_NOT_YET + rocksdb_hton->explicit_snapshot = rocksdb_explicit_snapshot; +#endif rocksdb_hton->start_consistent_snapshot = rocksdb_start_tx_and_assign_read_view; +#ifdef MARIADB_NOT_YET + rocksdb_hton->start_shared_snapshot = rocksdb_start_tx_with_shared_read_view; +#endif rocksdb_hton->savepoint_set = rocksdb_savepoint; rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint; rocksdb_hton->savepoint_rollback_can_release_mdl = @@ -4535,6 +4930,35 @@ static int rocksdb_init_func(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT + if (rocksdb_db_options->use_direct_reads) { + rocksdb::EnvOptions soptions; + rocksdb::Status check_status; + rocksdb::Env *const env = rocksdb_db_options->env; + + std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir); + if (env->FileExists(fname).ok()) { + std::unique_ptr file; + soptions.use_direct_reads = true; + check_status = env->NewSequentialFile(fname, &file, soptions); + } else { + std::unique_ptr file; + soptions.use_direct_writes = true; + check_status = env->ReopenWritableFile(fname, &file, soptions); + if (file != nullptr) { + file->Close(); + } + 
env->DeleteFile(fname); + } + + if (!check_status.ok()) { + sql_print_error("RocksDB: Unable to use direct io in rocksdb-datadir:" + "(%s)", check_status.getState()); + rdb_open_tables.free_hash(); + DBUG_RETURN(HA_EXIT_FAILURE); + } + } + if (rocksdb_db_options->allow_mmap_writes && rocksdb_db_options->use_direct_io_for_flush_and_compaction) { // See above comment for allow_mmap_reads. (NO_LINT_DEBUG) @@ -4687,8 +5111,10 @@ static int rocksdb_init_func(void *const p) { cf_options_map->get_defaults()); rocksdb::TransactionDBOptions tx_db_options; - tx_db_options.transaction_lock_timeout = 2; // 2 seconds + tx_db_options.transaction_lock_timeout = 2000; // 2 seconds tx_db_options.custom_mutex_factory = std::make_shared(); + tx_db_options.write_policy = + static_cast(rocksdb_write_policy); status = check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); @@ -4710,7 +5136,7 @@ static int rocksdb_init_func(void *const p) { } cf_manager.init(std::move(cf_options_map), &cf_handles); - if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) { + if (dict_manager.init(rdb, &cf_manager)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize data dictionary."); DBUG_RETURN(HA_EXIT_FAILURE); @@ -4771,6 +5197,21 @@ static int rocksdb_init_func(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME +#ifdef HAVE_PSI_INTERFACE + , + rdb_mc_psi_thread_key +#endif + ); + if (err != 0) { + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Couldn't start the manual compaction thread: (errno=%d)", + err); + rdb_open_tables.free_hash(); + DBUG_RETURN(HA_EXIT_FAILURE); + } + rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions); if (rocksdb_pause_background_work) { @@ -4877,6 +5318,16 @@ static int rocksdb_done_func(void *const p) { sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err); } + // signal the manual compaction thread to stop + rdb_mc_thread.signal(true); + // Wait for the manual compaction thread to finish. + err = rdb_mc_thread.join(); + if (err != 0) { + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err); + } + if (rdb_open_tables.m_hash.size()) { // Looks like we are getting unloaded and yet we have some open tables // left behind. 
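
The direct-I/O check added to rocksdb_init_func above is a filesystem probe: open (or create) a scratch file named DIRECT_CHECK inside rocksdb-datadir with direct reads/writes enabled, and abort plugin initialization if that fails. A minimal standalone sketch of the same idea, written against plain POSIX open(2) rather than the RocksDB Env API; the helper name is hypothetical, and it assumes Linux semantics, where an O_DIRECT open fails (typically with EINVAL) on filesystems that cannot support it:

#define _GNU_SOURCE /* O_DIRECT is a GNU extension on glibc */
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

// Probe whether 'dir' sits on a filesystem that accepts O_DIRECT opens.
static bool dir_supports_o_direct(const char *dir) {
  char fname[512];
  snprintf(fname, sizeof(fname), "%s/DIRECT_CHECK", dir);
  int fd = open(fname, O_CREAT | O_WRONLY | O_DIRECT, 0600);
  if (fd < 0)
    return false;  // open() refused direct I/O (or failed outright)
  close(fd);
  unlink(fname);   // remove the scratch file, mirroring env->DeleteFile() above
  return true;
}

The patch routes the probe through rocksdb::Env with use_direct_reads/use_direct_writes set, so startup exercises exactly the open path the database will later use.
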
@@ -4933,6 +5384,7 @@ static int rocksdb_done_func(void *const p) { rdb_open_tables.m_hash.~Rdb_table_set(); mysql_mutex_destroy(&rdb_open_tables.m_mutex); mysql_mutex_destroy(&rdb_sysvars_mutex); + mysql_mutex_destroy(&rdb_block_cache_resize_mutex); delete rdb_collation_exceptions; @@ -5348,7 +5800,6 @@ longlong ha_rocksdb::update_hidden_pk_val() { /* Get the id of the hidden pk id from m_last_rowkey */ int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) { - DBUG_ASSERT(hidden_pk_id != nullptr); DBUG_ASSERT(table != nullptr); DBUG_ASSERT(has_hidden_pk(table)); @@ -5404,14 +5855,14 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton, my_core::TABLE_SHARE *const table_arg) : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr), m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr), + m_scan_it_lower_bound(nullptr), m_scan_it_upper_bound(nullptr), m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr), m_pk_can_be_decoded(false), m_maybe_unpack_info(false), m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr), m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr), m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr), m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr), - m_dup_sk_packed_tuple_old(nullptr), m_eq_cond_lower_bound(nullptr), - m_eq_cond_upper_bound(nullptr), m_pack_buffer(nullptr), + m_dup_sk_packed_tuple_old(nullptr), m_pack_buffer(nullptr), m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE), m_encoder_arr(nullptr), m_row_checksums_checked(0), m_in_rpl_delete_rows(false), m_in_rpl_update_rows(false), m_force_skip_unique_check(false) {} @@ -5950,9 +6401,6 @@ int ha_rocksdb::convert_field_from_storage_format( int ha_rocksdb::convert_record_from_storage_format( const rocksdb::Slice *const key, const rocksdb::Slice *const value, uchar *const buf) { - DBUG_ASSERT(key != nullptr); - DBUG_ASSERT(buf != nullptr); - Rdb_string_reader reader(value); /* @@ -6196,7 +6644,6 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, DBUG_ENTER_FUNC(); DBUG_ASSERT(m_pk_tuple == nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); std::shared_ptr *const kd_arr = tbl_def_arg->m_key_descr_arr; @@ -6248,9 +6695,9 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, m_pack_buffer = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); - m_eq_cond_upper_bound = + m_scan_it_lower_bound = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); - m_eq_cond_lower_bound = + m_scan_it_upper_bound = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); /* @@ -6267,7 +6714,7 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr || m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr || m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr || - m_eq_cond_upper_bound == nullptr || m_eq_cond_lower_bound == nullptr || + m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr || (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr || m_dup_sk_packed_tuple_old == nullptr))) { // One or more of the above allocations failed. 
Clean up and exit @@ -6307,11 +6754,11 @@ void ha_rocksdb::free_key_buffers() { my_free(m_dup_sk_packed_tuple_old); m_dup_sk_packed_tuple_old = nullptr; - my_free(m_eq_cond_upper_bound); - m_eq_cond_upper_bound = nullptr; + my_free(m_scan_it_lower_bound); + m_scan_it_lower_bound = nullptr; - my_free(m_eq_cond_lower_bound); - m_eq_cond_lower_bound = nullptr; + my_free(m_scan_it_upper_bound); + m_scan_it_upper_bound = nullptr; } #ifdef MARIAROCKS_NOT_YET @@ -6569,9 +7016,8 @@ bool ha_rocksdb::get_error_message(const int error, String *const buf) { static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST, "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST"); - DBUG_ASSERT(buf != nullptr); - - if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK) { + if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK || + error == HA_ERR_ROCKSDB_STATUS_BUSY) { Rdb_transaction *const tx = get_tx_from_thd(ha_thd()); DBUG_ASSERT(tx != nullptr); buf->append(tx->m_detailed_error); @@ -6653,10 +7099,10 @@ int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s, } if (opt_msg) { - my_error(ER_RDB_STATUS_MSG, MYF(0), opt_msg, s.code(), - s.ToString().c_str()); + std::string concatenated_error = s.ToString() + " (" + std::string(opt_msg) + ")"; + my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(), rocksdb_hton_name); } else { - my_error(ER_RDB_STATUS_GENERAL, MYF(0), s.code(), s.ToString().c_str()); + my_error(ER_GET_ERRMSG, MYF(0), s.code(), s.ToString().c_str(), rocksdb_hton_name); } return err; @@ -6725,7 +7171,6 @@ int ha_rocksdb::create_key_defs( /* = nullptr */) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); uint i; @@ -6796,9 +7241,7 @@ int ha_rocksdb::create_cfs( std::array *const cfs) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); char tablename_sys[NAME_LEN + 1]; bool tsys_set= false; @@ -6904,10 +7347,6 @@ int ha_rocksdb::create_inplace_key_defs( const std::array &cfs) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - DBUG_ASSERT(old_tbl_def_arg != nullptr); - std::shared_ptr *const old_key_descr = old_tbl_def_arg->m_key_descr_arr; std::shared_ptr *const new_key_descr = @@ -6975,11 +7414,6 @@ std::unordered_map ha_rocksdb::get_old_key_positions( const Rdb_tbl_def *const old_tbl_def_arg) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(old_table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - DBUG_ASSERT(old_tbl_def_arg != nullptr); - std::shared_ptr *const old_key_descr = old_tbl_def_arg->m_key_descr_arr; std::unordered_map old_key_pos; @@ -7045,9 +7479,6 @@ int ha_rocksdb::compare_keys(const KEY *const old_key, const KEY *const new_key) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(old_key != nullptr); - DBUG_ASSERT(new_key != nullptr); - /* Check index name. 
*/ if (strcmp(old_key->name.str, new_key->name.str) != 0) { DBUG_RETURN(HA_EXIT_FAILURE); @@ -7078,9 +7509,6 @@ int ha_rocksdb::compare_key_parts(const KEY *const old_key, const KEY *const new_key) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(old_key != nullptr); - DBUG_ASSERT(new_key != nullptr); - /* Skip if key parts do not match, as it is a different key */ if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) { DBUG_RETURN(HA_EXIT_FAILURE); @@ -7125,7 +7553,6 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, const struct key_def_cf_info &cf_info) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(new_key_def != nullptr); DBUG_ASSERT(*new_key_def == nullptr); uint64 ttl_duration = 0; @@ -7212,8 +7639,6 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, int rdb_normalize_tablename(const std::string &tablename, std::string *const strbuf) { - DBUG_ASSERT(strbuf != nullptr); - if (tablename.size() < 2 || tablename[0] != '.' || (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) { DBUG_ASSERT(0); // We were not passed table name? @@ -7537,8 +7962,6 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, const bool &full_key_match, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) { - DBUG_ASSERT(iter != nullptr); - /* We are looking for the first record such that index_tuple= lookup_tuple. @@ -7746,8 +8169,6 @@ int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd, } int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { - DBUG_ASSERT(buf != nullptr); - int rc; const rocksdb::Slice &rkey = m_scan_it->key(); const uint pk_size = rkey.size(); @@ -7771,8 +8192,6 @@ int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { int ha_rocksdb::read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd, bool move_forward) { - DBUG_ASSERT(buf != nullptr); - int rc = 0; uint pk_size; @@ -7875,7 +8294,6 @@ ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const { pair for. 
*/ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) { - DBUG_ASSERT(buf != nullptr); DBUG_ASSERT(table != nullptr); #ifdef MARIAROCKS_NOT_YET stats.rows_requested++; @@ -8192,7 +8610,7 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, else rc = read_row_from_secondary_key(buf, kd, move_forward); - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; /* Exit the loop */ // release the snapshot and iterator so they will be regenerated @@ -8237,8 +8655,6 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, uchar *const buf) { - DBUG_ASSERT(buf != nullptr); - if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) { const Rdb_key_def &kd = *m_key_descr_arr[active_index]; @@ -8552,8 +8968,6 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid, const bool skip_ttl_check) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - DBUG_ASSERT(rowid != nullptr); DBUG_ASSERT(table != nullptr); int rc; @@ -8766,8 +9180,6 @@ int ha_rocksdb::index_last(uchar *const buf) { int ha_rocksdb::index_first_intern(uchar *const buf) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - uchar *key; uint key_size; int rc; @@ -8797,7 +9209,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { m_skip_scan_it_next_call = true; rc = index_next_with_direction(buf, true); - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; // exit the loop // release the snapshot and iterator so they will be regenerated @@ -8859,8 +9271,6 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { int ha_rocksdb::index_last_intern(uchar *const buf) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - uchar *key; uint key_size; int rc; @@ -8902,7 +9312,7 @@ int ha_rocksdb::index_last_intern(uchar *const buf) { rc = secondary_index_read(active_index, buf); } - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; /* exit the loop */ // release the snapshot and iterator so they will be regenerated @@ -8987,7 +9397,6 @@ bool ha_rocksdb::commit_in_the_middle() { @retval false if bulk commit was skipped or succeeded */ bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { - DBUG_ASSERT(tx != nullptr); return commit_in_the_middle() && tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) && tx->flush_batch(); @@ -9000,7 +9409,6 @@ bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { 'auto-incremented' pk.) 
*/ bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { - DBUG_ASSERT(table != nullptr); return Rdb_key_def::table_has_hidden_pk(table); } @@ -9010,9 +9418,7 @@ bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { */ bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return (table_arg->s->primary_key == MAX_INDEXES && index == tbl_def_arg->m_key_count - 1); @@ -9021,9 +9427,7 @@ bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, /* Returns index of primary key */ uint ha_rocksdb::pk_index(const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1 : table_arg->s->primary_key; @@ -9032,9 +9436,7 @@ uint ha_rocksdb::pk_index(const TABLE *const table_arg, /* Returns true if given index number is a primary key */ bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return index == table_arg->s->primary_key || is_hidden_pk(index, table_arg, tbl_def_arg); @@ -9049,9 +9451,6 @@ uint ha_rocksdb::max_supported_key_part_length() const { const char *ha_rocksdb::get_key_name(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - if (is_hidden_pk(index, table_arg, tbl_def_arg)) { return HIDDEN_PK_NAME; } @@ -9065,9 +9464,6 @@ const char *ha_rocksdb::get_key_name(const uint index, const char *ha_rocksdb::get_key_comment(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - if (is_hidden_pk(index, table_arg, tbl_def_arg)) { return nullptr; } @@ -9119,7 +9515,6 @@ const std::string ha_rocksdb::generate_cf_name(const uint index, } const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); return table_arg->s->comment.str; @@ -9236,8 +9631,7 @@ int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id, /* If the keys are the same, then no lock is needed */ - if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice, - row_info.old_pk_slice)) { + if (!row_info.new_pk_slice.compare(row_info.old_pk_slice)) { *found = false; return HA_EXIT_SUCCESS; } @@ -9326,8 +9720,7 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, rocksdb::Slice((const char *)m_sk_packed_tuple, size); /* - For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs - always require locking. 
+ Acquire lock on the old key in case of UPDATE */ if (row_info.old_data != nullptr) { size = kd.pack_record(table, m_pack_buffer, row_info.old_data, @@ -9336,14 +9729,18 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, const rocksdb::Slice old_slice = rocksdb::Slice((const char *)m_sk_packed_tuple_old, size); - /* - For updates, if the keys are the same, then no lock is needed + const rocksdb::Status s = + get_for_update(row_info.tx, kd.get_cf(), old_slice, nullptr); + if (!s.ok()) { + return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler); + } - Also check to see if the key has any fields set to NULL. If it does, then - this key is unique since NULL is not equal to each other, so no lock is - needed. + /* + If the old and new keys are the same we're done since we've already taken + the lock on the old key */ - if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) { + if (!new_slice.compare(old_slice)) { return HA_EXIT_SUCCESS; } } @@ -9369,16 +9766,14 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, The bloom filter may need to be disabled for this lookup. */ - uchar min_bound_buf[MAX_KEY_LENGTH]; - uchar max_bound_buf[MAX_KEY_LENGTH]; - rocksdb::Slice min_bound_slice; - rocksdb::Slice max_bound_slice; + uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + rocksdb::Slice lower_bound_slice; + rocksdb::Slice upper_bound_slice; + const bool total_order_seek = !check_bloom_and_set_bounds( - ha_thd(), kd, new_slice, all_parts_used, - min_bound_buf, - max_bound_buf, - &min_bound_slice, - &max_bound_slice); + ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE, + lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice); const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache); const rocksdb::Status s = @@ -9389,9 +9784,8 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, } rocksdb::Iterator *const iter = row_info.tx->get_iterator( - kd.get_cf(), total_order_seek, fill_cache, - min_bound_slice, max_bound_slice, - true /* read current data */, + kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice, + upper_bound_slice, true /* read current data */, false /* acquire snapshot */); /* Need to scan the transaction to see if there is a duplicate key. 
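Both locking hunks above drop Rdb_pk_comparator::bytewise_compare() in favour of rocksdb::Slice::compare(), which has memcmp semantics, so a zero result means the packed old and new keys are byte-identical and the remaining lock/duplicate checks can be skipped. A minimal standalone sketch of that idiom, using only the public rocksdb::Slice API (illustration only, not MyRocks code; the key contents are made up):

    #include <rocksdb/slice.h>
    #include <cassert>

    int main() {
      // Stand-ins for the patch's old_slice/new_slice packed index keys.
      const rocksdb::Slice old_key("index-key-1");
      const rocksdb::Slice new_key("index-key-1");

      // Slice::compare() follows memcmp conventions (<0, 0, >0), so
      // "!new_key.compare(old_key)" reads as "the keys are equal" --
      // the same test the patch uses to return early without locking.
      assert(new_key.compare(old_key) == 0);
      if (!new_key.compare(old_key)) {
        return 0;  // nothing further to lock or check
      }
      return 1;
    }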
@@ -9601,9 +9995,11 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, } int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info) { + const struct update_row_info &row_info, + const bool bulk_load_sk) { int new_packed_size; int old_packed_size; + int rc = HA_EXIT_SUCCESS; rocksdb::Slice new_key_slice; rocksdb::Slice new_value_slice; @@ -9681,18 +10077,23 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()), m_sk_tails.get_current_pos()); - row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, - new_value_slice); + if (bulk_load_sk && row_info.old_data == nullptr) { + rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true); + } else { + row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, + new_value_slice); + } row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() + new_value_slice.size()); - return HA_EXIT_SUCCESS; + return rc; } int ha_rocksdb::update_indexes(const struct update_row_info &row_info, const bool &pk_changed) { int rc; + bool bulk_load_sk; // The PK must be updated first to pull out the TTL value. rc = update_pk(*m_pk_descr, row_info, pk_changed); @@ -9700,13 +10101,17 @@ int ha_rocksdb::update_indexes(const struct update_row_info &row_info, return rc; } - // Update the remaining indexes. + // Update the remaining indexes. Allow bulk loading only if + // allow_sk is enabled + bulk_load_sk = rocksdb_enable_bulk_load_api && + THDVAR(table->in_use, bulk_load) && + THDVAR(table->in_use, bulk_load_allow_sk); for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { if (is_pk(key_id, table, m_tbl_def)) { continue; } - rc = update_sk(table, *m_key_descr_arr[key_id], row_info); + rc = update_sk(table, *m_key_descr_arr[key_id], row_info, bulk_load_sk); if (rc != HA_EXIT_SUCCESS) { return rc; } @@ -9804,28 +10209,22 @@ int ha_rocksdb::update_write_row(const uchar *const old_data, @param outer_u */ -void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond, - uchar *lower_bound_buf, - uchar *upper_bound_buf, - rocksdb::Slice *out_lower_bound, - rocksdb::Slice *out_upper_bound) { - uint eq_cond_len = eq_cond.size(); - memcpy(upper_bound_buf, eq_cond.data(), eq_cond_len); - kd.successor(upper_bound_buf, eq_cond_len); - memcpy(lower_bound_buf, eq_cond.data(), eq_cond_len); - kd.predecessor(lower_bound_buf, eq_cond_len); +void ha_rocksdb::setup_iterator_bounds( + const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len, + uchar *const lower_bound, uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) { + uint min_len = std::min(eq_cond.size(), bound_len); + memcpy(upper_bound, eq_cond.data(), min_len); + kd.successor(upper_bound, min_len); + memcpy(lower_bound, eq_cond.data(), min_len); + kd.predecessor(lower_bound, min_len); if (kd.m_is_reverse_cf) { - *out_upper_bound = - rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len); - *out_lower_bound = - rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len); + *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); + *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, min_len); } else { - *out_upper_bound = - rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len); - *out_lower_bound = - rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len); + *upper_bound_slice = rocksdb::Slice((const
char *)upper_bound, min_len); + *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); } } @@ -9837,7 +10236,6 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *const slice, const bool use_all_keys, const uint eq_cond_len) { - DBUG_ASSERT(slice != nullptr); DBUG_ASSERT(slice->size() >= eq_cond_len); Rdb_transaction *const tx = get_or_create_tx(table->in_use); @@ -9845,11 +10243,10 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, bool skip_bloom = true; const rocksdb::Slice eq_cond(slice->data(), eq_cond_len); - if (check_bloom_and_set_bounds(ha_thd(), kd, eq_cond, use_all_keys, - m_eq_cond_lower_bound, - m_eq_cond_upper_bound, - &m_eq_cond_lower_bound_slice, - &m_eq_cond_upper_bound_slice)) { + if (check_bloom_and_set_bounds( + ha_thd(), kd, eq_cond, use_all_keys, eq_cond_len, + m_scan_it_lower_bound, m_scan_it_upper_bound, + &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) { skip_bloom = false; } @@ -9890,8 +10287,8 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, m_scan_it = rdb->NewIterator(read_opts, kd.get_cf()); } else { m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache, - m_eq_cond_lower_bound_slice, - m_eq_cond_upper_bound_slice); + m_scan_it_lower_bound_slice, + m_scan_it_upper_bound_slice); } m_scan_it_skips_bloom = skip_bloom; } @@ -9957,7 +10354,7 @@ int ha_rocksdb::rnd_next(uchar *const buf) { int rc; for (;;) { rc = rnd_next_with_direction(buf, true); - if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !m_rnd_scan_is_new_snapshot) break; /* exit the loop */ // release the snapshot and iterator and then regenerate them @@ -10286,7 +10683,7 @@ int ha_rocksdb::info(uint flag) { if (static_cast<longlong>(stats.data_file_length) < 0 || static_cast<longlong>(stats.index_file_length) < 0 || static_cast<longlong>(stats.records) < 0) { - if (analyze(nullptr, nullptr)) { + if (calculate_stats_for_table()) { DBUG_RETURN(HA_EXIT_FAILURE); } @@ -10364,10 +10761,21 @@ int ha_rocksdb::info(uint flag) { KEY *const k = &table->key_info[i]; for (uint j = 0; j < k->ext_key_parts; j++) { const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats; - uint x = k_stats.m_distinct_keys_per_prefix.size() > j && - k_stats.m_distinct_keys_per_prefix[j] > 0 - ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j] - : 0; + uint x; + + if (k_stats.m_distinct_keys_per_prefix.size() > j && + k_stats.m_distinct_keys_per_prefix[j] > 0) { + x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j]; + /* + If the number of rows is less than the number of prefixes (due to + sampling), the average number of rows with the same prefix is 1. + */ + if (x == 0) { + x = 1; + } + } else { + x = 0; + } if (x > stats.records) x = stats.records; if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) || @@ -10572,7 +10980,7 @@ THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to, m_lock_rows = RDB_LOCK_WRITE; } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) { m_lock_rows = RDB_LOCK_READ; - } else { + } else if (lock_type != TL_IGNORE) { m_lock_rows = RDB_LOCK_NONE; if (THDVAR(thd, lock_scanned_rows)) { /* @@ -10737,6 +11145,13 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { DBUG_RETURN(HA_ERR_UNSUPPORTED); } +#ifdef MARIADB_NOT_YET + if (thd->get_explicit_snapshot()) { + my_error(ER_UPDATES_WITH_EXPLICIT_SNAPSHOT, MYF(0)); + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } +#endif + /* SQL layer signals us to take a write lock.
It does so when starting DML statement. We should put locks on the rows we're reading. @@ -10958,8 +11373,6 @@ void Rdb_drop_index_thread::run() { } Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) { - DBUG_ASSERT(tablename != nullptr); - std::string str; if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) { // We were not passed table name? @@ -11022,6 +11435,12 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { char key_buf[MAX_KEY_LENGTH]; uint key_len; ulonglong bytes_written = 0; + + uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + rocksdb::Slice lower_bound_slice; + rocksdb::Slice upper_bound_slice; + /* Remove all records in each index. (This is is not crash-safe, but it doesn't matter, because bulk row @@ -11032,13 +11451,12 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len); rocksdb::ColumnFamilyHandle *cf = kd.get_cf(); const rocksdb::Slice table_key(key_buf, key_len); - setup_iterator_bounds(kd, table_key, - m_eq_cond_lower_bound, - m_eq_cond_upper_bound, - &m_eq_cond_lower_bound_slice, - &m_eq_cond_upper_bound_slice); - opts.iterate_lower_bound = &m_eq_cond_lower_bound_slice; - opts.iterate_upper_bound = &m_eq_cond_upper_bound_slice; + setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE, + lower_bound_buf, upper_bound_buf, &lower_bound_slice, + &upper_bound_slice); + DBUG_ASSERT(key_len == Rdb_key_def::INDEX_NUMBER_SIZE); + opts.iterate_lower_bound = &lower_bound_slice; + opts.iterate_upper_bound = &upper_bound_slice; std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf)); it->Seek(table_key); @@ -11077,9 +11495,6 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { int ha_rocksdb::rename_table(const char *const from, const char *const to) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(from != nullptr); - DBUG_ASSERT(to != nullptr); - std::string from_str; std::string to_str; std::string from_db; @@ -11330,29 +11745,28 @@ int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) { DBUG_RETURN(HA_EXIT_SUCCESS); } -int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, - HA_CHECK_OPT *const check_opt) { +static int calculate_stats( + const std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>> + &to_recalc, + bool include_memtables) { DBUG_ENTER_FUNC(); // find per column family key ranges which need to be queried std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>> ranges; - std::unordered_set<GL_INDEX_ID> ids_to_check; - std::vector<uchar> buf(table_arg->s->keys * 2 * - Rdb_key_def::INDEX_NUMBER_SIZE); std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats; - for (uint i = 0; i < table_arg->s->keys; i++) { - const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE]; - const Rdb_key_def &kd = *m_key_descr_arr[i]; - const GL_INDEX_ID index_id = kd.get_gl_index_id(); - ranges[kd.get_cf()].push_back(get_range(i, bufp)); + std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE); + + uchar *bufp = buf.data(); + for (const auto &it : to_recalc) { + const GL_INDEX_ID index_id = it.first; + auto &kd = it.second; + ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp)); + bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE; - ids_to_check.insert(index_id); - // Initialize the stats to 0. If there are no files that contain - // this gl_index_id, then 0 should be stored for the cached stats.
stats[index_id] = Rdb_index_stats(index_id); - DBUG_ASSERT(kd.get_key_parts() > 0); - stats[index_id].m_distinct_keys_per_prefix.resize(kd.get_key_parts()); + DBUG_ASSERT(kd->get_key_parts() > 0); + stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts()); } // get RocksDB table properties for these ranges @@ -11363,8 +11777,8 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, it.first, &it.second[0], it.second.size(), &props); DBUG_ASSERT(props.size() >= old_size); if (!status.ok()) { - DBUG_RETURN( - rdb_error_to_mysql(status, "Could not access RocksDB properties")); + DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql( + status, "Could not access RocksDB properties")); } } @@ -11385,61 +11799,62 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, other SQL tables, it can be that we're only seeing a small fraction of table's entries (and so we can't update statistics based on that). */ - if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end()) + if (stats.find(it1.m_gl_index_id) == stats.end()) { continue; + } - auto kd = ddl_manager.safe_find(it1.m_gl_index_id); - DBUG_ASSERT(kd != nullptr); - stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length()); + auto it_index = to_recalc.find(it1.m_gl_index_id); + DBUG_ASSERT(it_index != to_recalc.end()); + if (it_index == to_recalc.end()) { + continue; + } + stats[it1.m_gl_index_id].merge( + it1, true, it_index->second->max_storage_fmt_length()); } num_sst++; } - // calculate memtable cardinality - Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct); - auto read_opts = rocksdb::ReadOptions(); - read_opts.read_tier = rocksdb::ReadTier::kMemtableTier; - for (uint i = 0; i < table_arg->s->keys; i++) { - const Rdb_key_def &kd = *m_key_descr_arr[i]; - Rdb_index_stats &stat = stats[kd.get_gl_index_id()]; + if (include_memtables) { + // calculate memtable cardinality + Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct); + auto read_opts = rocksdb::ReadOptions(); + read_opts.read_tier = rocksdb::ReadTier::kMemtableTier; + for (const auto &it_kd : to_recalc) { + const std::shared_ptr<const Rdb_key_def> &kd = it_kd.second; + Rdb_index_stats &stat = stats[kd->get_gl_index_id()]; - uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; - auto r = get_range(i, r_buf); - uint64_t memtableCount; - uint64_t memtableSize; - rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memtableCount, - &memtableSize); - if (memtableCount < (uint64_t)stat.m_rows / 10) { - // skip tables that already have enough stats from SST files to reduce - // overhead and avoid degradation of big tables stats by sampling from - // relatively tiny (less than 10% of full data set) memtable dataset - continue; - } - - std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>( - rdb->NewIterator(read_opts, kd.get_cf())); - - uchar *first_key; - uint key_size; - if (is_pk(i, table, m_tbl_def)) { - first_key = m_pk_packed_tuple; - } else { - first_key = m_sk_packed_tuple; - } - kd.get_first_key(first_key, &key_size); - rocksdb::Slice first_index_key((const char *)first_key, key_size); - - cardinality_collector.Reset(); - for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) { - const rocksdb::Slice key = it->key(); - if (!kd.covers_key(key)) { - break; // end of this index + uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; + auto r = myrocks::get_range(*kd, r_buf); + uint64_t memtableCount; + uint64_t memtableSize; + rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount, + &memtableSize);
+ if (memtableCount < (uint64_t)stat.m_rows / 10) { + // skip tables that already have enough stats from SST files to reduce + // overhead and avoid degradation of big tables stats by sampling from + // relatively tiny (less than 10% of full data set) memtable dataset + continue; } - stat.m_rows++; - cardinality_collector.ProcessKey(key, &kd, &stat); + std::unique_ptr<rocksdb::Iterator> it = + std::unique_ptr<rocksdb::Iterator>( + rdb->NewIterator(read_opts, kd->get_cf())); + + rocksdb::Slice first_index_key((const char *)r_buf, + Rdb_key_def::INDEX_NUMBER_SIZE); + + cardinality_collector.Reset(); + for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) { + const rocksdb::Slice key = it->key(); + if (!kd->covers_key(key)) { + break; // end of this index + } + stat.m_rows++; + + cardinality_collector.ProcessKey(key, kd.get(), &stat); + } + cardinality_collector.AdjustStats(&stat); } - cardinality_collector.AdjustStats(&stat); } // set and persist new stats @@ -11449,6 +11864,19 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, DBUG_RETURN(HA_EXIT_SUCCESS); } +int ha_rocksdb::calculate_stats_for_table() { + DBUG_ENTER_FUNC(); + + std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>> + ids_to_check; + for (uint i = 0; i < table->s->keys; i++) { + ids_to_check.insert(std::make_pair(m_key_descr_arr[i]->get_gl_index_id(), + m_key_descr_arr[i])); + } + + DBUG_RETURN(calculate_stats(ids_to_check, true)); +} + /* @return HA_ADMIN_OK OK @@ -11457,7 +11885,16 @@ int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) { DBUG_ENTER_FUNC(); - if (table && calculate_stats(table, thd, check_opt) != HA_EXIT_SUCCESS) { + if (table) { + if (calculate_stats_for_table() != HA_EXIT_SUCCESS) { + DBUG_RETURN(HA_ADMIN_FAILED); + } + } + + // A call to ::info is needed to repopulate some SQL level structs. This is + // necessary for online analyze because we cannot rely on another ::open + // call to call info for us. + if (info(HA_STATUS_CONST | HA_STATUS_VARIABLE) != HA_EXIT_SUCCESS) { DBUG_RETURN(HA_ADMIN_FAILED); } @@ -12286,18 +12723,6 @@ bool ha_rocksdb::commit_inplace_alter_table( dict_manager.finish_indexes_operation( create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING); - /* - We need to recalculate the index stats here manually. The reason is that - the secondary index does not exist inside - m_index_num_to_keydef until it is committed to the data dictionary, which - prevents us from updating the stats normally as the ddl_manager cannot - find the proper gl_index_ids yet during adjust_stats calls.
- */ - if (calculate_stats(altered_table, nullptr, nullptr)) { - /* Failed to update index statistics, should never happen */ - DBUG_ASSERT(0); - } - rdb_drop_idx_thread.signal(); } @@ -12368,6 +12793,8 @@ struct rocksdb_status_counters_t { uint64_t block_cache_data_hit; uint64_t block_cache_data_add; uint64_t bloom_filter_useful; + uint64_t bloom_filter_full_positive; + uint64_t bloom_filter_full_true_positive; uint64_t memtable_hit; uint64_t memtable_miss; uint64_t get_hit_l0; @@ -12442,6 +12869,8 @@ DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS) DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT) DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD) DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL) +DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE) +DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE) DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT) DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS) DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0) @@ -12676,6 +13105,8 @@ static SHOW_VAR rocksdb_status_vars[] = { DEF_STATUS_VAR(block_cache_data_hit), DEF_STATUS_VAR(block_cache_data_add), DEF_STATUS_VAR(bloom_filter_useful), + DEF_STATUS_VAR(bloom_filter_full_positive), + DEF_STATUS_VAR(bloom_filter_full_true_positive), DEF_STATUS_VAR(memtable_hit), DEF_STATUS_VAR(memtable_miss), DEF_STATUS_VAR(get_hit_l0), @@ -12733,6 +13164,10 @@ static SHOW_VAR rocksdb_status_vars[] = { &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("manual_compactions_processed", + &rocksdb_manual_compactions_processed, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("manual_compactions_running", + &rocksdb_manual_compactions_running, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete, @@ -12806,24 +13241,193 @@ void Rdb_background_thread::run() { rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD); } } + // Recalculate statistics for indexes. + if (rocksdb_stats_recalc_rate) { + std::unordered_map<GL_INDEX_ID, std::shared_ptr<const Rdb_key_def>> + to_recalc; + + if (rdb_indexes_to_recalc.empty()) { + struct Rdb_index_collector : public Rdb_tables_scanner { + int add_table(Rdb_tbl_def *tdef) override { + for (uint i = 0; i < tdef->m_key_count; i++) { + rdb_indexes_to_recalc.push_back( + tdef->m_key_descr_arr[i]->get_gl_index_id()); + } + return HA_EXIT_SUCCESS; + } + } collector; + ddl_manager.scan_for_tables(&collector); + } + + while (to_recalc.size() < rocksdb_stats_recalc_rate && + !rdb_indexes_to_recalc.empty()) { + const auto index_id = rdb_indexes_to_recalc.back(); + rdb_indexes_to_recalc.pop_back(); + + std::shared_ptr<const Rdb_key_def> keydef = + ddl_manager.safe_find(index_id); + + if (keydef) { + to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef)); + } + } + + if (!to_recalc.empty()) { + calculate_stats(to_recalc, false); + } + } + } // save remaining stats which might've left unsaved ddl_manager.persist_stats(); } -bool ha_rocksdb::check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond, - const bool use_all_keys, - uchar *lower_bound_buf, - uchar *upper_bound_buf, - rocksdb::Slice *out_lower_bound, - rocksdb::Slice *out_upper_bound) { +/* + A background thread to handle manual compactions, + except for dropping indexes/tables. Every second, it checks + pending manual compactions, and it calls CompactRange if there is.
+*/ +void Rdb_manual_compaction_thread::run() { + mysql_mutex_init(0, &m_mc_mutex, MY_MUTEX_INIT_FAST); + RDB_MUTEX_LOCK_CHECK(m_signal_mutex); + for (;;) { + if (m_stop) { + break; + } + timespec ts; + set_timespec(ts, 1); + + const auto ret MY_ATTRIBUTE((__unused__)) = + mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts); + if (m_stop) { + break; + } + // make sure, no program error is returned + DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT); + RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); + + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + // Grab the first item and proceed, if not empty. + if (m_requests.empty()) { + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + RDB_MUTEX_LOCK_CHECK(m_signal_mutex); + continue; + } + Manual_compaction_request &mcr = m_requests.begin()->second; + DBUG_ASSERT(mcr.cf != nullptr); + DBUG_ASSERT(mcr.state == Manual_compaction_request::INITED); + mcr.state = Manual_compaction_request::RUNNING; + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + + DBUG_ASSERT(mcr.state == Manual_compaction_request::RUNNING); + // NO_LINT_DEBUG + sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id, + mcr.cf->GetName().c_str()); + rocksdb_manual_compactions_running++; + if (rocksdb_debug_manual_compaction_delay > 0) { + my_sleep(rocksdb_debug_manual_compaction_delay * 1000000); + } + // CompactRange may take a very long time. On clean shutdown, + // it is cancelled by CancelAllBackgroundWork, then status is + // set to shutdownInProgress. + const rocksdb::Status s = rdb->CompactRange( + getCompactRangeOptions(mcr.concurrency), mcr.cf, mcr.start, mcr.limit); + rocksdb_manual_compactions_running--; + if (s.ok()) { + // NO_LINT_DEBUG + sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id, + mcr.cf->GetName().c_str()); + } else { + // NO_LINT_DEBUG + sql_print_information("Manual Compaction id %d cf %s aborted. %s", + mcr.mc_id, mcr.cf->GetName().c_str(), s.getState()); + if (!s.IsShutdownInProgress()) { + rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD); + } else { + DBUG_ASSERT(m_requests.size() == 1); + } + } + rocksdb_manual_compactions_processed++; + clear_manual_compaction_request(mcr.mc_id, false); + RDB_MUTEX_LOCK_CHECK(m_signal_mutex); + } + clear_all_manual_compaction_requests(); + DBUG_ASSERT(m_requests.empty()); + RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); + mysql_mutex_destroy(&m_mc_mutex); +} + +void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() { + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + m_requests.clear(); + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); +} + +void Rdb_manual_compaction_thread::clear_manual_compaction_request( + int mc_id, bool init_only) { + bool erase = true; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + auto it = m_requests.find(mc_id); + if (it != m_requests.end()) { + if (init_only) { + Manual_compaction_request mcr = it->second; + if (mcr.state != Manual_compaction_request::INITED) { + erase = false; + } + } + if (erase) { + m_requests.erase(it); + } + } else { + // Current code path guarantees that erasing by the same mc_id happens + // at most once. INITED state may be erased by a thread that requested + // the compaction. RUNNING state is erased by mc thread only. 
+ DBUG_ASSERT(0); + } + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); +} + +int Rdb_manual_compaction_thread::request_manual_compaction( + rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice *start, + rocksdb::Slice *limit, int concurrency) { + int mc_id = -1; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + if (m_requests.size() >= rocksdb_max_manual_compactions) { + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return mc_id; + } + Manual_compaction_request mcr; + mc_id = mcr.mc_id = ++m_latest_mc_id; + mcr.state = Manual_compaction_request::INITED; + mcr.cf = cf; + mcr.start = start; + mcr.limit = limit; + mcr.concurrency = concurrency; + m_requests.insert(std::make_pair(mcr.mc_id, mcr)); + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return mc_id; +} + +bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) { + bool finished = false; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + if (m_requests.count(mc_id) == 0) { + finished = true; + } + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return finished; +} + +bool ha_rocksdb::check_bloom_and_set_bounds( + THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, + const bool use_all_keys, size_t bound_len, uchar *const lower_bound, + uchar *const upper_bound, rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice) { bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys); if (!can_use_bloom) { - setup_iterator_bounds(kd, eq_cond, - lower_bound_buf, upper_bound_buf, - out_lower_bound, out_upper_bound); + setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound, + lower_bound_slice, upper_bound_slice); } return can_use_bloom; } @@ -12933,7 +13537,6 @@ void rdb_update_global_stats(const operation_type &type, uint count, int rdb_get_table_perf_counters(const char *const tablename, Rdb_perf_counters *const counters) { - DBUG_ASSERT(counters != nullptr); DBUG_ASSERT(tablename != nullptr); Rdb_table_handler *table_handler; @@ -12973,10 +13576,7 @@ const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) { // so that we can capture as much data as possible to debug the root cause // more efficiently. #ifdef __GNUC__ -#pragma GCC push_options -#pragma GCC optimize("O0") #endif - void rdb_handle_io_error(const rocksdb::Status status, const RDB_IO_ERROR_TYPE err_type) { if (status.IsIOError()) { @@ -12991,6 +13591,9 @@ void rdb_handle_io_error(const rocksdb::Status status, } case RDB_IO_ERROR_BG_THREAD: { rdb_log_status_error(status, "BG thread failed to write to RocksDB"); + /* NO_LINT_DEBUG */ + sql_print_error("MyRocks: aborting on BG write error."); + abort(); break; } case RDB_IO_ERROR_GENERAL: { @@ -13026,9 +13629,7 @@ void rdb_handle_io_error(const rocksdb::Status status, } } #ifdef __GNUC__ -#pragma GCC pop_options #endif - Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; } Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; } @@ -13329,6 +13930,42 @@ static void rocksdb_set_wal_bytes_per_sync( RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +/* + Validating and updating block cache size via sys_var::check path. + SetCapacity may take seconds when reducing block cache, and + sys_var::update holds LOCK_global_system_variables mutex, so + updating block cache size is done at check path instead. 
+*/ +static int rocksdb_validate_set_block_cache_size( + THD *thd MY_ATTRIBUTE((__unused__)), + struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), + void *var_ptr, struct st_mysql_value *value) { + DBUG_ASSERT(value != nullptr); + + long long new_value; + + /* value is NULL */ + if (value->val_int(value, &new_value)) { + return HA_EXIT_FAILURE; + } + + if (new_value < RDB_MIN_BLOCK_CACHE_SIZE || + (uint64_t)new_value > (uint64_t)LONGLONG_MAX) { + return HA_EXIT_FAILURE; + } + + RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex); + const rocksdb::BlockBasedTableOptions &table_options = + rdb_get_table_options(); + + if (rocksdb_block_cache_size != new_value && table_options.block_cache) { + table_options.block_cache->SetCapacity(new_value); + } + *static_cast<int64_t *>(var_ptr) = static_cast<int64_t>(new_value); + RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex); + return HA_EXIT_SUCCESS; +} + static int rocksdb_validate_update_cf_options(THD * /* unused */, struct st_mysql_sys_var * /*unused*/, @@ -13510,6 +14147,13 @@ double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) { DBUG_RETURN((rows / 20.0) + 1); } +void ha_rocksdb::print_error(int error, myf errflag) { + if (error == HA_ERR_ROCKSDB_STATUS_BUSY) { + error = HA_ERR_LOCK_DEADLOCK; + } + handler::print_error(error, errflag); +} + std::string rdb_corruption_marker_file_name() { std::string ret(rocksdb_datadir); ret.append("/ROCKSDB_CORRUPTED"); @@ -13572,7 +14216,7 @@ maria_declare_plugin(rocksdb_se){ myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global, myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats, myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl, - myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info, - myrocks::rdb_i_s_trx_info, + myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map, + myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info, myrocks::rdb_i_s_deadlock_info maria_declare_plugin_end; diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index 5bf473a2fb0..9b3352594d9 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -107,6 +107,7 @@ struct Rdb_deadlock_info { std::string table_name; }; std::vector<Rdb_dl_trx_info> path; + int64_t deadlock_time; ulonglong victim_trx_id; }; @@ -147,6 +148,11 @@ const char *const BG_THREAD_NAME = "myrocks-bg"; */ const char *const INDEX_THREAD_NAME = "myrocks-index"; +/* + Name for the manual compaction thread. +*/ +const char *const MANUAL_COMPACTION_THREAD_NAME = "myrocks-mc"; + /* Separator between partition name and the qualifier. Sample usage: @@ -503,6 +509,12 @@ class ha_rocksdb : public my_core::handler { const rocksdb::Snapshot *m_scan_it_snapshot; + /* Buffers used for upper/lower bounds for m_scan_it. */ + uchar *m_scan_it_lower_bound; + uchar *m_scan_it_upper_bound; + rocksdb::Slice m_scan_it_lower_bound_slice; + rocksdb::Slice m_scan_it_upper_bound_slice; + Rdb_tbl_def *m_tbl_def; /* Primary Key encoder from KeyTupleFormat to StorageFormat */ @@ -566,12 +578,6 @@ class ha_rocksdb : public my_core::handler { uchar *m_dup_sk_packed_tuple; uchar *m_dup_sk_packed_tuple_old; - /* Buffers used for passing upper/bound eq conditions. */ - uchar *m_eq_cond_lower_bound; - uchar *m_eq_cond_upper_bound; - rocksdb::Slice m_eq_cond_lower_bound_slice; - rocksdb::Slice m_eq_cond_upper_bound_slice; - /* Temporary space for packing VARCHARs (we provide it to pack_record()/pack_index_tuple() calls).
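The new rocksdb_validate_set_block_cache_size() above applies the resize from the sys_var check hook because rocksdb::Cache::SetCapacity() may take seconds when shrinking, while the update hook would run under LOCK_global_system_variables. A minimal standalone sketch of the underlying RocksDB call (public RocksDB API only; the 512 MiB and 128 MiB sizes are arbitrary, and this is not the plugin code itself):

    #include <rocksdb/cache.h>
    #include <memory>

    int main() {
      // Create an LRU block cache, roughly what rocksdb_block_cache_size
      // configures at server startup.
      std::shared_ptr<rocksdb::Cache> cache =
          rocksdb::NewLRUCache(512 * 1024 * 1024);  // 512 MiB

      // SetCapacity() resizes the cache in place; shrinking evicts entries
      // and can block for a while, which is why the patch performs it in
      // the check hook instead of under the global system-variable mutex.
      cache->SetCapacity(128 * 1024 * 1024);  // shrink to 128 MiB
      return 0;
    }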
@@ -653,21 +659,20 @@ class ha_rocksdb : public my_core::handler { enum ha_rkey_function find_flag) const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); void setup_iterator_bounds(const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond, - uchar *lower_bound_buf, - uchar *upper_bound_buf, - rocksdb::Slice *out_lower_bound, - rocksdb::Slice *out_upper_bound); + const rocksdb::Slice &eq_cond, size_t bound_len, + uchar *const lower_bound, uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, const bool use_all_keys); bool check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, - const bool use_all_keys, - uchar *lower_bound_buf, - uchar *upper_bound_buf, - rocksdb::Slice *out_lower_bound, - rocksdb::Slice *out_upper_bound); + const bool use_all_keys, size_t bound_len, + uchar *const lower_bound, + uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice, const bool use_all_keys, const uint eq_cond_len) MY_ATTRIBUTE((__nonnull__)); @@ -1053,6 +1058,7 @@ public: } virtual double read_time(uint, uint, ha_rows rows) override; + virtual void print_error(int error, myf errflag) override; int open(const char *const name, int mode, uint test_if_locked) override MY_ATTRIBUTE((__warn_unused_result__)); @@ -1167,8 +1173,8 @@ private: MY_ATTRIBUTE((__nonnull__)); int compare_key_parts(const KEY *const old_key, - const KEY *const new_key) const; - MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + const KEY *const new_key) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); int compare_keys(const KEY *const old_key, const KEY *const new_key) const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); @@ -1223,7 +1229,7 @@ private: int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info, const bool &pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info) + const struct update_row_info &row_info, const bool bulk_load_sk) MY_ATTRIBUTE((__warn_unused_result__)); int update_indexes(const struct update_row_info &row_info, const bool &pk_changed) @@ -1277,7 +1283,9 @@ private: int finalize_bulk_load(bool print_client_error = true) MY_ATTRIBUTE((__warn_unused_result__)); -public: + int calculate_stats_for_table() MY_ATTRIBUTE((__warn_unused_result__)); + + public: int index_init(uint idx, bool sorted) override MY_ATTRIBUTE((__warn_unused_result__)); int index_end() override MY_ATTRIBUTE((__warn_unused_result__)); @@ -1370,9 +1378,6 @@ public: MY_ATTRIBUTE((__warn_unused_result__)); int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override MY_ATTRIBUTE((__warn_unused_result__)); - int calculate_stats(const TABLE *const table_arg, THD *const thd, - HA_CHECK_OPT *const check_opt) - MY_ATTRIBUTE((__warn_unused_result__)); enum_alter_inplace_result check_if_supported_inplace_alter( TABLE *altered_table, @@ -1402,7 +1407,7 @@ public: virtual void rpl_after_delete_rows() override; virtual void rpl_before_update_rows() override; virtual void rpl_after_update_rows() override; - virtual bool use_read_free_rpl(); + virtual bool use_read_free_rpl() override; #endif // MARIAROCKS_NOT_YET private: diff --git a/storage/rocksdb/ha_rocksdb_proto.h b/storage/rocksdb/ha_rocksdb_proto.h index 
85c3968cc99..deb65edddd3 100644 --- a/storage/rocksdb/ha_rocksdb_proto.h +++ b/storage/rocksdb/ha_rocksdb_proto.h @@ -39,7 +39,12 @@ enum RDB_IO_ERROR_TYPE { const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type); void rdb_handle_io_error(const rocksdb::Status status, - const RDB_IO_ERROR_TYPE err_type); + const RDB_IO_ERROR_TYPE err_type) +#if defined(__clang__) + MY_ATTRIBUTE((optnone)); +#else + MY_ATTRIBUTE((optimize("O0"))); +#endif int rdb_normalize_tablename(const std::string &tablename, std::string *str) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); diff --git a/storage/rocksdb/mysql-test/rocksdb/combinations b/storage/rocksdb/mysql-test/rocksdb/combinations new file mode 100644 index 00000000000..d49de3d38b3 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/combinations @@ -0,0 +1,6 @@ +[write_committed] +loose-rocksdb_write_policy=write_committed + +[write_prepared] +loose-rocksdb_write_policy=write_prepared +loose-rocksdb_commit_time_batch_for_recovery=on diff --git a/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc b/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc new file mode 100644 index 00000000000..681b966f680 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc @@ -0,0 +1,3 @@ +if (`select count(*) = 0 from information_schema.session_variables where variable_name = 'rocksdb_write_policy' and variable_value = 'write_committed';`) { + --skip Test requires write_committed policy +} diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result index 6325dc97cf5..32c0537c780 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result @@ -299,11 +299,13 @@ connection con1; show global variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 show session variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 CREATE TABLE t1 (i INT, j INT, PRIMARY KEY (i)) ENGINE = ROCKSDB; @@ -356,6 +358,7 @@ SET session rocksdb_merge_buf_size = 340; show variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load OFF +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 CREATE TABLE t1 (a VARCHAR(80)) ENGINE=RocksDB; @@ -463,3 +466,24 @@ t1 CREATE TABLE `t1` ( KEY `kb` (`b`(8)) ) ENGINE=ROCKSDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin DROP TABLE t1; +SET @prior_rocksdb_table_stats_sampling_pct = @@rocksdb_table_stats_sampling_pct; +set global rocksdb_table_stats_sampling_pct = 100; +CREATE TABLE t1 (a INT, b INT, PRIMARY KEY ka(a)) ENGINE=RocksDB; +INSERT INTO t1 (a, b) VALUES (1, 10); +INSERT INTO t1 (a, b) VALUES (2, 10); +INSERT INTO t1 (a, b) VALUES (3, 20); +INSERT INTO t1 (a, b) VALUES (4, 20); +set global rocksdb_force_flush_memtable_now=1; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +SHOW INDEX in t1; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment +t1 0 PRIMARY 1 a A 4 NULL NULL LSMTREE +ALTER TABLE t1 ADD INDEX kb(b), ALGORITHM=INPLACE; +SHOW INDEX in t1; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality 
Sub_part Packed Null Index_type Comment Index_comment +t1 0 PRIMARY 1 a A 4 NULL NULL LSMTREE +t1 1 kb 1 b A 2 NULL NULL YES LSMTREE +DROP TABLE t1; +SET global rocksdb_table_stats_sampling_pct = @prior_rocksdb_table_stats_sampling_pct; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result index 08f2329f688..0617232f1e3 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result @@ -17,7 +17,7 @@ ALTER TABLE t1 ADD INDEX kb(b), ALGORITHM=INPLACE; set @tmp= @@rocksdb_max_row_locks; set session rocksdb_max_row_locks=1000; ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY; -ERROR HY000: Status error 10 received from RocksDB: Operation aborted: Failed to acquire lock due to max_num_locks limit +ERROR HY000: Got error 10 'Operation aborted: Failed to acquire lock due to max_num_locks limit' from ROCKSDB set session rocksdb_bulk_load=1; ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY; set session rocksdb_bulk_load=0; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result index fe08cd7c361..604e5572eab 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result @@ -59,12 +59,10 @@ insert into t values (); set debug_dbug="+d,crash_commit_before"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 4 -select max(i) from t; -max(i) -3 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After engine prepare begin; insert into t values (); @@ -72,12 +70,10 @@ insert into t values (); set debug_dbug="+d,crash_commit_after_prepare"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 4 -select max(i) from t; -max(i) -3 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After binlog begin; insert into t values (); @@ -85,12 +81,10 @@ insert into t values (); set debug_dbug="+d,crash_commit_after_log"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 6 -select max(i) from t; -max(i) -5 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After everything begin; insert into t values (); @@ -98,10 +92,8 @@ insert into t values (); set debug_dbug="+d,crash_commit_after"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name 
auto_increment -test t 8 -select max(i) from t; -max(i) -7 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 drop table t; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result index 0c496227006..f59b841a595 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result @@ -158,3 +158,21 @@ INSERT INTO t1 (a) VALUES (1); UPDATE t1 SET pk = 3; ALTER TABLE t1 AUTO_INCREMENT 2; DROP TABLE t1; +#---------------------------------- +# Issue #792 Crash in autoincrement +#---------------------------------- +CREATE TABLE t1(C1 DOUBLE AUTO_INCREMENT KEY,C2 CHAR) ENGINE=ROCKSDB; +INSERT INTO t1 VALUES(2177,0); +DROP TABLE t1; +CREATE TABLE t0(c0 BLOB) ENGINE=ROCKSDB; +INSERT INTO t0 VALUES(0); +ALTER TABLE t0 AUTO_INCREMENT=0; +DROP TABLE t0; +#---------------------------------- +# Issue #869 Crash in autoincrement +#---------------------------------- +CREATE TABLE t1 (pk INT AUTO_INCREMENT, a INT, PRIMARY KEY(pk)) ENGINE=RocksDB; +INSERT INTO t1 (a) VALUES (1); +UPDATE t1 SET pk = 3; +ALTER TABLE t1 AUTO_INCREMENT 2; +DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result new file mode 100644 index 00000000000..4b02d1103cf --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result @@ -0,0 +1,15 @@ +create table r1 (id bigint primary key, value bigint) engine=rocksdb; +create table r2 (id bigint, value bigint, primary key (id) comment 'cf2') engine=rocksdb; +set session rocksdb_bulk_load=1; +set session rocksdb_bulk_load=0; +select variable_value into @h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +insert into r1 values (100, 100); +select variable_value-@h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +variable_value-@h +1 +select variable_value into @h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +insert into r2 values (100, 100); +select variable_value-@h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +variable_value-@h +0 +DROP TABLE r1, r2; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result new file mode 100644 index 00000000000..42f820a2a42 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result @@ -0,0 +1,229 @@ +SET rocksdb_bulk_load_size=15; +CREATE TABLE t4 (a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +CREATE TABLE t3 (a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +CREATE TABLE t2 (a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +CREATE TABLE t1 (a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +SET rocksdb_bulk_load=1; +INSERT INTO t1 SELECT * FROM t3 FORCE INDEX (PRIMARY) ORDER BY a; +SELECT count(*) FROM t1 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT count(*) FROM t1 FORCE INDEX (b); +count(*) +10 +SELECT count(*) FROM t1 FORCE INDEX (c); +count(*) +10 +SET rocksdb_bulk_load=0; +SELECT * FROM t1 
FORCE INDEX (PRIMARY); +a b c +-9 11 11 +-7 9 9 +-5 7 7 +-3 5 5 +-1 3 3 +2 0 0 +4 -2 -2 +6 -4 -4 +8 -6 -6 +10 -8 -8 +SELECT b FROM t1 FORCE INDEX (b); +b +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +SELECT c FROM t1 FORCE INDEX (c); +c +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +Checksums should match +CHECKSUM TABLE t3; +Table Checksum +test.t3 3862424802 +CHECKSUM TABLE t1; +Table Checksum +test.t1 3862424802 +SET rocksdb_bulk_load_allow_sk=1; +SET rocksdb_bulk_load=1; +INSERT INTO t4 SELECT * FROM t3 FORCE INDEX (PRIMARY) ORDER BY a; +SELECT count(*) FROM t4 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT count(*) FROM t4 FORCE INDEX (b); +count(*) +0 +SELECT count(*) FROM t4 FORCE INDEX (c); +count(*) +0 +SET rocksdb_bulk_load=0; +SELECT * FROM t4 FORCE INDEX (PRIMARY); +a b c +-9 11 11 +-7 9 9 +-5 7 7 +-3 5 5 +-1 3 3 +2 0 0 +4 -2 -2 +6 -4 -4 +8 -6 -6 +10 -8 -8 +SELECT b FROM t4 FORCE INDEX (b); +b +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +SELECT c FROM t4 FORCE INDEX (c); +c +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +Checksums should match +CHECKSUM TABLE t3; +Table Checksum +test.t3 3862424802 +CHECKSUM TABLE t4; +Table Checksum +test.t4 3862424802 +SET rocksdb_bulk_load_allow_unsorted=1; +SET rocksdb_bulk_load_allow_sk=1; +SET rocksdb_bulk_load=1; +INSERT INTO t2 SELECT * FROM t3 WHERE b >= 0 ORDER BY b; +INSERT INTO t2 SELECT * FROM t3 WHERE b < 0 ORDER BY b; +SELECT count(*) FROM t2 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (b); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (c); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (b); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (c); +count(*) +0 +SET rocksdb_bulk_load=0; +SELECT * FROM t2 FORCE INDEX (PRIMARY); +a b c +-19 21 21 +-17 19 19 +-15 17 17 +-13 15 15 +-11 13 13 +-9 11 11 +-7 9 9 +-5 7 7 +-3 5 5 +-1 3 3 +2 0 0 +4 -2 -2 +6 -4 -4 +8 -6 -6 +10 -8 -8 +12 -10 -10 +14 -12 -12 +16 -14 -14 +18 -16 -16 +20 -18 -18 +SELECT b FROM t2 FORCE INDEX (b); +b +-18 +-16 +-14 +-12 +-10 +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +13 +15 +17 +19 +21 +SELECT c FROM t2 FORCE INDEX (c); +c +-18 +-16 +-14 +-12 +-10 +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +13 +15 +17 +19 +21 +Checksums should match +CHECKSUM TABLE t3; +Table Checksum +test.t3 1495594118 +CHECKSUM TABLE t2; +Table Checksum +test.t2 1495594118 +DROP TABLE t1; +DROP TABLE t2; +DROP TABLE t3; +DROP TABLE t4; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result b/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result index 4b201d523d9..d037c636a16 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result @@ -82,4 +82,19 @@ t1 1 t1_5 2 c1 A 100000 NULL NULL YES LSMTREE SELECT table_name, table_rows FROM information_schema.tables WHERE table_schema = DATABASE(); table_name table_rows t1 100000 -drop table t1; +CREATE TABLE t2 (a INT, b INT, c INT, d INT, e INT, f INT, g INT, +PRIMARY KEY (a), KEY (c, b, a, d, e, f, g)) +ENGINE=ROCKSDB; +SET GLOBAL rocksdb_force_flush_memtable_now = 1; +ANALYZE TABLE t2; +Table Op Msg_type Msg_text +test.t2 analyze status OK +cardinality of the columns after 'a' must be equal to the cardinality of column 'a' +SELECT CARDINALITY INTO @c FROM information_schema.statistics WHERE TABLE_NAME='t2' AND INDEX_NAME='c' AND COLUMN_NAME='a'; +SELECT COLUMN_NAME, CARDINALITY = @c FROM information_schema.statistics WHERE TABLE_NAME='t2' AND INDEX_NAME='c' AND SEQ_IN_INDEX > 3; +COLUMN_NAME CARDINALITY = @c +d 1 +e 1 +f 1 +g 1 
+drop table t1, t2; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/collation.result b/storage/rocksdb/mysql-test/rocksdb/r/collation.result index e372cbe2109..10e0d9b0002 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/collation.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/collation.result @@ -1,6 +1,7 @@ -SET @start_global_value = @@global.ROCKSDB_STRICT_COLLATION_EXCEPTIONS; -DROP TABLE IF EXISTS t1; +call mtr.add_suppression("Invalid pattern"); CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text) engine=rocksdb charset utf8; +ALTER TABLE t1 ADD INDEX (value); +ERROR HY000: Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin). DROP TABLE t1; CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value)) engine=rocksdb charset utf8; ERROR HY000: Unsupported collation on string indexed column test.t1.value Use binary collation (latin1_bin, binary, utf8_bin). @@ -13,6 +14,7 @@ SET GLOBAL rocksdb_strict_collation_check=1; CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value2)) engine=rocksdb charset utf8; DROP TABLE t1; CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset latin1 collate latin1_bin; +ALTER TABLE t1 collate=latin1_general_ci; DROP TABLE t1; CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset utf8 collate utf8_bin; DROP TABLE t1; @@ -127,4 +129,16 @@ CREATE TABLE abcd (id INT PRIMARY KEY, value varchar(50), index(value)) engine=r ERROR HY000: Unsupported collation on string indexed column test.abcd.value Use binary collation (latin1_bin, binary, utf8_bin). DROP TABLE abc; SET GLOBAL rocksdb_strict_collation_exceptions=null; -SET GLOBAL rocksdb_strict_collation_exceptions=@start_global_value; +SET GLOBAL rocksdb_strict_collation_check=1; +CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value)) engine=rocksdb charset utf8; +Warnings: +Warning 1210 Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin). +DROP TABLE t1; +CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text) engine=rocksdb charset utf8; +ALTER TABLE t1 ADD INDEX (value); +Warnings: +Warning 1210 Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin). 
+DROP TABLE t1; +CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset latin1 collate latin1_bin; +ALTER TABLE t1 collate=latin1_general_ci; +DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result b/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result new file mode 100644 index 00000000000..789ce12e900 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result @@ -0,0 +1,21 @@ +CREATE DATABASE db_rpc; +USE db_rpc; +CREATE TABLE t1(pk INT PRIMARY KEY) ENGINE=rocksdb; +SET GLOBAL rocksdb_enable_2pc=1; +SET autocommit = 0; +SET autocommit = 0; +BEGIN; +BEGIN; +SELECT * from t1; +pk +SELECT * from t1; +pk +INSERT INTO t1 VALUES(1); +INSERT INTO t1 VALUES(2); +COMMIT; +COMMIT; +SELECT * from db_rpc.t1; +pk +1 +2 +DROP DATABASE db_rpc; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result new file mode 100644 index 00000000000..e5aeb57ebdf --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result @@ -0,0 +1,38 @@ +USE mysql; +CREATE TABLE mysql_table (a INT) ENGINE=ROCKSDB; +CREATE TABLE test.mysql_table (a INT) ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +USE test; +CREATE TABLE mysql_table (a INT) ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE IF NOT EXISTS mysql_table_2 (a INT) ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql_table_no_cols ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql.mysql_table_2 (a INT) ENGINE=ROCKSDB; +CREATE TABLE mysql_primkey (a INT PRIMARY KEY, b INT, c INT, d INT, INDEX (c)) ENGINE=ROCKSDB; +ALTER TABLE mysql_primkey DROP b, DROP a, ADD (f INT PRIMARY KEY); +ALTER TABLE mysql_primkey DROP PRIMARY KEY; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql_primkey2 (a INT PRIMARY KEY, b INT, c INT) ENGINE=ROCKSDB; +ALTER TABLE mysql_primkey2 DROP b; +ALTER TABLE mysql_primkey2 ADD (b INT); +ALTER TABLE mysql_primkey2 DROP c, DROP A; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql_primkey3 (a INT PRIMARY KEY, b INT, c INT, INDEX indexonb (b), INDEX indexonc (c)) ENGINE=ROCKSDB; +ALTER TABLE mysql_primkey3 DROP INDEX indexonb; +ALTER TABLE mysql_primkey3 DROP c; +ALTER TABLE mysql_primkey3 DROP PRIMARY KEY, ADD PRIMARY KEY(b); +CREATE TABLE mysql_primkey4(a INT, b INT, PRIMARY KEY(a), INDEX si (a, b)) ENGINE=ROCKSDB; +DROP INDEX si ON mysql_primkey4; +DROP INDEX `PRIMARY` ON mysql_primkey4; +ERROR HY000: Table without primary key cannot be created outside mysql schema. 
+ALTER TABLE mysql.mysql_table ADD PRIMARY KEY (a); +ALTER TABLE mysql.mysql_table DROP PRIMARY KEY; +DROP TABLE mysql_primkey; +DROP TABLE mysql_primkey2; +DROP TABLE mysql_primkey3; +DROP TABLE mysql_primkey4; +USE mysql; +DROP TABLE mysql_table; +DROP TABLE mysql_table_2; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result index 39130475349..50733f81598 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result @@ -45,7 +45,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 alter table t1 modify i bigint;; set high_priority_ddl = 0; @@ -98,7 +98,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 alter table t1 rename t1_new;; set high_priority_ddl = 0; @@ -152,7 +152,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop table t1;; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 @@ -202,7 +202,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop table t1;; set high_priority_ddl = 0; @@ -251,7 +251,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 alter table t1 modify i bigint;; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 @@ -302,7 +302,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 create index idx1 on t1 (i);; set high_priority_ddl = 0; @@ -342,7 +342,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop index idx1 on t1;; set high_priority_ddl = 0; @@ -390,7 +390,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 truncate t1;; set high_priority_ddl = 0; @@ -438,7 +438,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 create trigger ins_sum before insert on t1 for each row set @sum = @sum + new.i;; set high_priority_ddl = 0; @@ -478,7 +478,7 @@ set high_priority_ddl = 1; select 
@@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop trigger ins_sum;; set high_priority_ddl = 0; @@ -528,7 +528,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 optimize table t1;; Table Op Msg_type Msg_text @@ -538,6 +538,55 @@ connection: default (for show processlist) show processlist; Id User Host db Command Time State Info Rows examined Rows sent Tid Srv_Id root test