diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS deleted file mode 100644 index 538007ed62d..00000000000 --- a/.github/CODEOWNERS +++ /dev/null @@ -1,2 +0,0 @@ -/debian @ottok - diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8c2b4ae363d..f5d0ef87e07 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -42,7 +42,7 @@ variables: CMAKE_FLAGS: "-DWITH_SSL=system -DPLUGIN_COLUMNSTORE=NO -DPLUGIN_ROCKSDB=NO -DPLUGIN_S3=NO -DPLUGIN_MROONGA=NO -DPLUGIN_CONNECT=NO -DPLUGIN_MROONGA=NO -DPLUGIN_TOKUDB=NO -DPLUGIN_PERFSCHEMA=NO -DWITH_WSREP=OFF" # Major version dictates which branches share the same ccache. E.g. 10.6-abc # and 10.6-xyz will have the same cache. - MARIADB_MAJOR_VERSION: "10.8" + MARIADB_MAJOR_VERSION: "10.9" # NOTE! Currently ccache is only used on the Centos8 build. As each job has # sufficiently different environments they are unable to benefit from each # other's ccaches. As each build generates about 1 GB of ccache, having diff --git a/cmake/cpack_rpm.cmake b/cmake/cpack_rpm.cmake index d02f2952cfe..71704d95c38 100644 --- a/cmake/cpack_rpm.cmake +++ b/cmake/cpack_rpm.cmake @@ -297,7 +297,7 @@ ELSEIF(RPM MATCHES "sles") ENDIF() # MDEV-24629, we need it outside of ELSIFs -IF(RPM MATCHES "fedora3[234]") +IF(RPM MATCHES "fedora") ALTERNATIVE_NAME("common" "mariadb-connector-c-config" ${MARIADB_CONNECTOR_C_VERSION}-1) ENDIF() diff --git a/debian/additions/innotop/innotop b/debian/additions/innotop/innotop old mode 100644 new mode 100755 index 2bc090917fe..8e5a41768eb --- a/debian/additions/innotop/innotop +++ b/debian/additions/innotop/innotop @@ -20,6 +20,9 @@ # Street, Fifth Floor, Boston, MA 02110-1335 USA use strict; +use warnings; +use utf8; +use feature ':5.16'; use warnings FATAL => 'all'; our $VERSION = '1.11.4'; @@ -265,7 +268,7 @@ sub get_dbh { $dbh->do($sql); MKDEBUG && _d('Enabling charset for STDOUT'); if ( $charset eq 'utf8' ) { - binmode(STDOUT, ':utf8') + binmode(STDOUT, ':encoding(UTF-8)') or die "Can't binmode(STDOUT, ':utf8'): $OS_ERROR"; } else { @@ -612,6 +615,9 @@ sub ts_to_string { sub parse_innodb_timestamp { my $text = shift; + if ( ! defined $text ) { + return (0, 0, 0, 0, 0, 0); + } my ( $y, $m, $d, $h, $i, $s ) = $text =~ m/^(\d\d)(\d\d)(\d\d) +(\d+):(\d+):(\d+)$/; die("Can't get timestamp from $text\n") unless $y; @@ -803,7 +809,8 @@ sub parse_fk_transaction_error { # TODO: write some tests for this sub parse_innodb_record_dump { my ( $dump, $complete, $debug ) = @_; - return undef unless $dump; + # Use bare return as recommend in page 199 of PBP + return unless $dump; my $result = {}; @@ -6769,6 +6776,9 @@ sub set_precision { my ( $num, $precision ) = @_; $num = 0 unless defined $num; $precision = $config{num_digits}->{val} if !defined $precision; + if ( $num eq "" ) { + $num = int(0); + } sprintf("%.${precision}f", $num); } @@ -6777,6 +6787,9 @@ sub set_precision { sub percent { my ( $num ) = @_; $num = 0 unless defined $num; + if ( $num eq "" ) { + $num = int(0); + } my $digits = $config{num_digits}->{val}; return sprintf("%.${digits}f", $num * 100) . ($config{show_percent}->{val} ? '%' : ''); @@ -6841,7 +6854,7 @@ sub make_color_func { push @criteria, "( defined \$set->{$spec->{col}} && \$set->{$spec->{col}} $spec->{op} $val ) { return '$spec->{color}'; }"; } - return undef unless @criteria; + return unless @criteria; my $sub = eval 'sub { my ( $set ) = @_; if ' . join(" elsif ", @criteria) . 
'}'; die if $EVAL_ERROR; return $sub; @@ -7521,10 +7534,10 @@ sub choose_connections { sub do_stmt { my ( $cxn, $stmt_name, @args ) = @_; - return undef if $file; + return if $file; # Test if the cxn should not even be tried - return undef if $dbhs{$cxn} + return if $dbhs{$cxn} && $dbhs{$cxn}->{failed} && ( !$dbhs{$cxn}->{dbh} || !$dbhs{$cxn}->{dbh}->{Active} || $dbhs{$cxn}->{mode} eq $config{mode}->{val} ); @@ -7596,10 +7609,10 @@ sub handle_cxn_error { sub do_query { my ( $cxn, $query ) = @_; - return undef if $file; + return if $file; # Test if the cxn should not even be tried - return undef if $dbhs{$cxn} + return if $dbhs{$cxn} && $dbhs{$cxn}->{failed} && ( !$dbhs{$cxn}->{dbh} || !$dbhs{$cxn}->{dbh}->{Active} || $dbhs{$cxn}->{mode} eq $config{mode}->{val} ); @@ -7781,7 +7794,7 @@ sub compile_select_stmt { sub compile_filter { my ( $text ) = @_; my ( $sub, $err ); - eval "\$sub = sub { my \$set = shift; $text }"; + eval { $sub = sub { my $set = shift; $text } }; if ( $EVAL_ERROR ) { $EVAL_ERROR =~ s/at \(eval.*$//; $sub = sub { return $EVAL_ERROR }; @@ -8013,7 +8026,7 @@ sub load_config_plugins { # First, find a list of all plugins that exist on disk, and get information about them. my $dir = $config{plugin_dir}->{val}; - foreach my $p_file ( <$dir/*.pm> ) { + foreach my $p_file (glob($dir."/*.pm")) { my ($package, $desc); eval { open my $p_in, "<", $p_file or die $OS_ERROR; @@ -9192,7 +9205,7 @@ sub switch_var_set { # edit_stmt_sleep_times {{{3 sub edit_stmt_sleep_times { $clear_screen_sub->(); - my $stmt = prompt_list('Specify a statement', '', sub { return sort keys %stmt_maker_for }); + my $stmt = prompt_list('Specify a statement', '', sub { my @tmparray = sort keys %stmt_maker_for; return @tmparray }); return unless $stmt && exists $stmt_maker_for{$stmt}; $clear_screen_sub->(); my $curr_val = $stmt_sleep_time_for{$stmt} || 0; @@ -9843,7 +9856,7 @@ sub get_slave_status { sub is_func { my ( $word ) = @_; return defined(&$word) - || eval "my \$x= sub { $word }; 1" + || eval { my $x = sub { $word }; 1 } || $EVAL_ERROR !~ m/^Bareword/; } diff --git a/debian/mariadb-server.mariadb.init b/debian/mariadb-server.mariadb.init index f4051d4b007..3178a5d2244 100644 --- a/debian/mariadb-server.mariadb.init +++ b/debian/mariadb-server.mariadb.init @@ -86,7 +86,7 @@ sanity_checks() { datadir=`mariadbd_get_param datadir` # As preset blocksize of GNU df is 1024 then available bytes is $df_available_blocks * 1024 # 4096 blocks is then lower than 4 MB - df_available_blocks=`LC_ALL=C BLOCKSIZE= df --output=avail "$datadir" | tail -n 1` + df_available_blocks="$(LC_ALL=C BLOCKSIZE='' df --output=avail "$datadir" | tail -n 1)" if [ "$df_available_blocks" -lt "4096" ]; then log_failure_msg "$0: ERROR: The partition with $datadir is too full!" echo "ERROR: The partition with $datadir is too full!" | $ERR_LOGGER diff --git a/debian/mariadb-server.preinst b/debian/mariadb-server.preinst index e92f97a618c..eb0b825ca28 100644 --- a/debian/mariadb-server.preinst +++ b/debian/mariadb-server.preinst @@ -223,14 +223,23 @@ then mkdir -Z $mysql_datadir fi -# As preset blocksize of GNU df is 1024 then available bytes is $df_available_blocks * 1024 -# 4096 blocks is then lower than 4 MB -df_available_blocks=`LC_ALL=C BLOCKSIZE= df --output=avail "$datadir" | tail -n 1` -if [ "$df_available_blocks" -lt "4096" ] +# Check if MariaDB datadir is available if not fails. +# There should be symlink or directory available or something will fail. 
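# For reference: a minimal standalone sketch of the free-space check used by the
# init script above and repeated in the preinst hunk below. It is not part of the
# patch; it assumes GNU df (for --output=avail) and a POSIX shell, and the datadir
# path is illustrative only.
datadir=/var/lib/mysql
# With BLOCKSIZE unset, GNU df reports 1024-byte blocks, so
# 4096 blocks = 4096 * 1024 bytes = 4 MiB of free space.
df_available_blocks="$(LC_ALL=C BLOCKSIZE='' df --output=avail "$datadir" | tail -n 1)"
if [ "$df_available_blocks" -lt 4096 ]; then
    echo "ERROR: The partition with $datadir is too full!" 1>&2
    exit 1
fi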
+if [ -d "$mysql_datadir" ] || [ -L "$mysql_datadir" ] then - echo "ERROR: There's not enough space in $mysql_datadir/" 1>&2 - db_stop - exit 1 + # As preset blocksize of GNU df is 1024 then available bytes is $df_available_blocks * 1024 + # 4096 blocks is then lower than 4 MB + df_available_blocks="$(LC_ALL=C BLOCKSIZE='' df --output=avail "$mysql_datadir" | tail -n 1)" + if [ "$df_available_blocks" -lt "4096" ] + then + echo "ERROR: There's not enough space in $mysql_datadir/" 1>&2 + db_stop + exit 1 + fi +else + echo "ERROR: There's no directory or symlink available: $mysql_datadir/" 1>&2 + db_stop + exit 1 fi # Since the home directory was created before putting the user into diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 587d572fe91..40d7a6f39a2 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -847,27 +847,49 @@ void mdl_lock_all() // Convert non-null terminated filename to space name +// Note that in 10.6 the filename may be an undo file name static std::string filename_to_spacename(const void *filename, size_t len) { - // null- terminate filename - char *f = (char *)malloc(len + 1); - ut_a(f); - memcpy(f, filename, len); - f[len] = 0; - for (size_t i = 0; i < len; i++) - if (f[i] == '\\') - f[i] = '/'; - char *p = strrchr(f, '.'); - ut_a(p); - *p = 0; - char *table = strrchr(f, '/'); - ut_a(table); - *table = 0; - char *db = strrchr(f, '/'); - *table = '/'; - std::string s(db ? db+1 : f); - free(f); - return s; + char f[FN_REFLEN]; + char *p= 0, *table, *db; + DBUG_ASSERT(len < FN_REFLEN); + + strmake(f, (const char*) filename, len); + +#ifdef _WIN32 + for (size_t i = 0; i < len; i++) + { + if (f[i] == '\\') + f[i] = '/'; + } +#endif + + /* Remove extension, if exists */ + if (!(p= strrchr(f, '.'))) + goto err; + *p= 0; + + /* Find table name */ + if (!(table= strrchr(f, '/'))) + goto err; + *table = 0; + + /* Find database name */ + db= strrchr(f, '/'); + *table = '/'; + if (!db) + goto err; + { + std::string s(db+1); + return s; + } + +err: + /* Not a database/table. Return original (converted) name */ + if (p) + *p= '.'; // Restore removed extension + std::string s(f); + return s; } /** Report an operation to create, delete, or rename a file during backup. @@ -3146,7 +3168,7 @@ static bool xtrabackup_copy_logfile() if (log_sys.buf[recv_sys.offset] <= 1) break; - if (recv_sys.parse_mtr(STORE_NO) == recv_sys_t::OK) + if (recv_sys.parse_mtr(false) == recv_sys_t::OK) { do { @@ -3156,7 +3178,7 @@ static bool xtrabackup_copy_logfile() sequence_offset)); *seq= 1; } - while ((r= recv_sys.parse_mtr(STORE_NO)) == recv_sys_t::OK); + while ((r= recv_sys.parse_mtr(false)) == recv_sys_t::OK); if (ds_write(dst_log_file, log_sys.buf + start_offset, recv_sys.offset - start_offset)) diff --git a/include/m_string.h b/include/m_string.h index 046dc39d13a..6a645b20a7f 100644 --- a/include/m_string.h +++ b/include/m_string.h @@ -249,14 +249,15 @@ static inline void lex_string_set3(LEX_CSTRING *lex_str, const char *c_str, */ static inline int safe_strcpy(char *dst, size_t dst_size, const char *src) { - memset(dst, '\0', dst_size); - strncpy(dst, src, dst_size - 1); - /* - If the first condition is true, we are guaranteed to have src length - >= (dst_size - 1), hence safe to access src[dst_size - 1]. - */ - if (dst[dst_size - 2] != '\0' && src[dst_size - 1] != '\0') - return 1; /* Truncation of src. 
*/ + DBUG_ASSERT(dst_size > 0); + /* Note, strncpy will zerofill end of dst if src shorter than dst_size */ + strncpy(dst, src, dst_size); + if (dst[dst_size-1]) + { + /* Ensure string is zero terminated */ + dst[dst_size-1]= 0; + return 1; + } return 0; } diff --git a/include/myisammrg.h b/include/myisammrg.h index 1d7efbe74d6..b3bca218a44 100644 --- a/include/myisammrg.h +++ b/include/myisammrg.h @@ -71,6 +71,7 @@ typedef struct st_myrg_info ulong cache_size; uint merge_insert_method; uint tables,options,reclength,keys; + uint key_parts; my_bool cache_in_use; /* If MERGE children attached to parent. See top comment in ha_myisammrg.cc */ my_bool children_attached; diff --git a/include/mysql/service_wsrep.h b/include/mysql/service_wsrep.h index 8541b348ae4..8add709362e 100644 --- a/include/mysql/service_wsrep.h +++ b/include/mysql/service_wsrep.h @@ -57,6 +57,7 @@ extern struct wsrep_service_st { my_bool (*wsrep_on_func)(const MYSQL_THD thd); bool (*wsrep_prepare_key_for_innodb_func)(MYSQL_THD thd, const unsigned char*, size_t, const unsigned char*, size_t, struct wsrep_buf*, size_t*); void (*wsrep_thd_LOCK_func)(const MYSQL_THD thd); + int (*wsrep_thd_TRYLOCK_func)(const MYSQL_THD thd); void (*wsrep_thd_UNLOCK_func)(const MYSQL_THD thd); const char * (*wsrep_thd_query_func)(const MYSQL_THD thd); int (*wsrep_thd_retry_counter_func)(const MYSQL_THD thd); @@ -89,7 +90,6 @@ extern struct wsrep_service_st { ulong (*wsrep_OSU_method_get_func)(const MYSQL_THD thd); my_bool (*wsrep_thd_has_ignored_error_func)(const MYSQL_THD thd); void (*wsrep_thd_set_ignored_error_func)(MYSQL_THD thd, my_bool val); - bool (*wsrep_thd_set_wsrep_aborter_func)(MYSQL_THD bf_thd, MYSQL_THD thd); void (*wsrep_report_bf_lock_wait_func)(const MYSQL_THD thd, unsigned long long trx_id); void (*wsrep_thd_kill_LOCK_func)(const MYSQL_THD thd); @@ -111,6 +111,7 @@ extern struct wsrep_service_st { #define wsrep_on(thd) (thd) && WSREP_ON && wsrep_service->wsrep_on_func(thd) #define wsrep_prepare_key_for_innodb(A,B,C,D,E,F,G) wsrep_service->wsrep_prepare_key_for_innodb_func(A,B,C,D,E,F,G) #define wsrep_thd_LOCK(T) wsrep_service->wsrep_thd_LOCK_func(T) +#define wsrep_thd_TRYLOCK(T) wsrep_service->wsrep_thd_TRYLOCK_func(T) #define wsrep_thd_UNLOCK(T) wsrep_service->wsrep_thd_UNLOCK_func(T) #define wsrep_thd_kill_LOCK(T) wsrep_service->wsrep_thd_kill_LOCK_func(T) #define wsrep_thd_kill_UNLOCK(T) wsrep_service->wsrep_thd_kill_UNLOCK_func(T) @@ -141,7 +142,6 @@ extern struct wsrep_service_st { #define wsrep_OSU_method_get(T) wsrep_service->wsrep_OSU_method_get_func(T) #define wsrep_thd_has_ignored_error(T) wsrep_service->wsrep_thd_has_ignored_error_func(T) #define wsrep_thd_set_ignored_error(T,V) wsrep_service->wsrep_thd_set_ignored_error_func(T,V) -#define wsrep_thd_set_wsrep_aborter(T) wsrep_service->wsrep_thd_set_wsrep_aborter_func(T1, T2) #define wsrep_report_bf_lock_wait(T,I) wsrep_service->wsrep_report_bf_lock_wait(T,I) #define wsrep_thd_set_PA_unsafe(T) wsrep_service->wsrep_thd_set_PA_unsafe_func(T) #else @@ -175,6 +175,8 @@ void wsrep_set_data_home_dir(const char *data_dir); extern "C" my_bool wsrep_on(const MYSQL_THD thd); /* Lock thd wsrep lock */ extern "C" void wsrep_thd_LOCK(const MYSQL_THD thd); +/* Try thd wsrep lock. Return non-zero if lock could not be taken. 
*/ +extern "C" int wsrep_thd_TRYLOCK(const MYSQL_THD thd); /* Unlock thd wsrep lock */ extern "C" void wsrep_thd_UNLOCK(const MYSQL_THD thd); @@ -197,8 +199,6 @@ extern "C" my_bool wsrep_thd_is_local(const MYSQL_THD thd); /* Return true if thd is in high priority mode */ /* todo: rename to is_high_priority() */ extern "C" my_bool wsrep_thd_is_applying(const MYSQL_THD thd); -/* set wsrep_aborter for the target THD */ -extern "C" bool wsrep_thd_set_wsrep_aborter(MYSQL_THD bf_thd, MYSQL_THD victim_thd); /* Return true if thd is in TOI mode */ extern "C" my_bool wsrep_thd_is_toi(const MYSQL_THD thd); /* Return true if thd is in replicating TOI mode */ @@ -249,7 +249,6 @@ extern "C" my_bool wsrep_thd_is_applying(const MYSQL_THD thd); extern "C" ulong wsrep_OSU_method_get(const MYSQL_THD thd); extern "C" my_bool wsrep_thd_has_ignored_error(const MYSQL_THD thd); extern "C" void wsrep_thd_set_ignored_error(MYSQL_THD thd, my_bool val); -extern "C" bool wsrep_thd_set_wsrep_aborter(MYSQL_THD bf_thd, MYSQL_THD victim_thd); extern "C" void wsrep_report_bf_lock_wait(const THD *thd, unsigned long long trx_id); /* declare parallel applying unsafety for the THD */ diff --git a/mysql-test/lib/My/File/Path.pm b/mysql-test/lib/My/File/Path.pm index d60027c909e..fd3cf6dd61c 100644 --- a/mysql-test/lib/My/File/Path.pm +++ b/mysql-test/lib/My/File/Path.pm @@ -34,7 +34,7 @@ use strict; use Exporter; use base "Exporter"; -our @EXPORT= qw /rmtree mkpath copytree/; +our @EXPORT= qw /rmtree mkpath copytree make_readonly/; use File::Find; use File::Copy; @@ -184,6 +184,10 @@ sub copytree { # Only copy plain files next unless -f "$from_dir/$_"; copy("$from_dir/$_", "$to_dir/$_"); + if (!$use_umask) + { + chmod(0666, "$to_dir/$_"); + } } closedir(DIR); @@ -193,4 +197,29 @@ sub copytree { } } + +sub make_readonly { + my ($dir) = @_; + + die "Usage: make_readonly(])" + unless @_ == 1; + + opendir(DIR, "$dir") + or croak("Can't find $dir$!"); + for(readdir(DIR)) { + + next if "$_" eq "." 
or "$_" eq ".."; + + if ( -d "$dir/$_" ) + { + make_readonly("$dir/$_"); + next; + } + + # Only copy plain files + next unless -f "$dir/$_"; + chmod 0444, "$dir/$_"; + } + closedir(DIR); +} 1; diff --git a/mysql-test/lib/My/SafeProcess/Base.pm b/mysql-test/lib/My/SafeProcess/Base.pm index 818e6e34e11..1cd01cb0ca9 100644 --- a/mysql-test/lib/My/SafeProcess/Base.pm +++ b/mysql-test/lib/My/SafeProcess/Base.pm @@ -40,7 +40,7 @@ our @EXPORT= qw(create_process); # Retry a couple of times if fork returns EAGAIN # sub _safe_fork { - my $retries= 5; + my $retries= 100; my $pid; FORK: diff --git a/mysql-test/main/distinct.result b/mysql-test/main/distinct.result index fa9f0259a0f..ac693421ba2 100644 --- a/mysql-test/main/distinct.result +++ b/mysql-test/main/distinct.result @@ -1157,3 +1157,28 @@ explain select * from t1 limit 0 offset 10; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Zero limit drop table t1, t2; +# +# MDEV-28285 Unexpected result when combining DISTINCT, subselect +# and LIMIT +# +create table t1 (a int primary key); +create table t2 (a int primary key, b int not null); +insert into t1 select seq from seq_1_to_10; +insert into t2 select seq,seq from seq_1_to_10; +select distinct a from t1 where t1.a=1 and t1.a in (select a from t2 where t2.b in (1,2)); +a +1 +explain select distinct a from t1 where t1.a=1 and t1.a in (select a+0 from t2 where t2.b in (1,2)) limit 10,10; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 const PRIMARY PRIMARY 4 const 1 Using index; Using temporary +1 PRIMARY eq_ref distinct_key distinct_key 8 func 1 +2 MATERIALIZED t2 ALL NULL NULL NULL NULL 10 Using where +select distinct a from t1 where t1.a=1 and t1.a in (select a+0 from t2 where t2.b in (1,2)) limit 10,10; +a +select distinct a from t1 where t1.a=1 and t1.a in (select a+0 from t2 where t2.b in (1,2)) limit 0,1; +a +1 +drop table t1,t2; +# +# end of 10.5 tests +# diff --git a/mysql-test/main/distinct.test b/mysql-test/main/distinct.test index 893e2dcc9a7..9aa3b2921aa 100644 --- a/mysql-test/main/distinct.test +++ b/mysql-test/main/distinct.test @@ -892,3 +892,24 @@ explain select * from t1 limit 0; explain select * from t1 limit 0 offset 10; drop table t1, t2; + +--echo # +--echo # MDEV-28285 Unexpected result when combining DISTINCT, subselect +--echo # and LIMIT +--echo # + +create table t1 (a int primary key); +create table t2 (a int primary key, b int not null); + +insert into t1 select seq from seq_1_to_10; +insert into t2 select seq,seq from seq_1_to_10; + +select distinct a from t1 where t1.a=1 and t1.a in (select a from t2 where t2.b in (1,2)); +explain select distinct a from t1 where t1.a=1 and t1.a in (select a+0 from t2 where t2.b in (1,2)) limit 10,10; +select distinct a from t1 where t1.a=1 and t1.a in (select a+0 from t2 where t2.b in (1,2)) limit 10,10; +select distinct a from t1 where t1.a=1 and t1.a in (select a+0 from t2 where t2.b in (1,2)) limit 0,1; +drop table t1,t2; + +--echo # +--echo # end of 10.5 tests +--echo # diff --git a/mysql-test/main/func_json.result b/mysql-test/main/func_json.result index 8cbc5305405..5af8e658d34 100644 --- a/mysql-test/main/func_json.result +++ b/mysql-test/main/func_json.result @@ -2578,5 +2578,29 @@ SELECT JSON_EXTRACT('{ "my-key": 1 }', '$.my-key'); JSON_EXTRACT('{ "my-key": 1 }', '$.my-key') 1 # +# MDEV-23187: Assorted assertion failures in json_find_path with certain collations +# +SET @save_collation_connection= @@collation_connection; +SET 
@json='{ "A": [ [{"k":"v"},[1]],true],"B": {"C": 1} }'; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); +JSON_VALUE(@json,'$.A[last-1][last-1].key1') +NULL +SET @json='{ "A": [ [{"k":"v"},[1]],true],"B": {"C": 1} }'; +SET collation_connection='ucs2_bin'; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); +JSON_VALUE(@json,'$.A[last-1][last-1].key1') +NULL +SET @json='{ "A": [ [{"k":"v"},[15]],true],"B": {"C": 1} }'; +SET sql_mode=0,character_set_connection=utf32; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); +JSON_VALUE(@json,'$.A[last-1][last-1].key1') +NULL +SET @json='{ "A": [ [{"k":"v"},[15]],true],"B": {"C": 1} }'; +SET sql_mode=0,character_set_connection=utf32; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); +JSON_VALUE(@json,'$.A[last-1][last-1].key1') +NULL +SET @@collation_connection= @save_collation_connection; +# # End of 10.9 Test # diff --git a/mysql-test/main/func_json.test b/mysql-test/main/func_json.test index 9bf0c9bae05..23a703ca716 100644 --- a/mysql-test/main/func_json.test +++ b/mysql-test/main/func_json.test @@ -1772,6 +1772,32 @@ DROP TABLE t1; SELECT JSON_EXTRACT('{ "my-key": 1 }', '$."my-key"'); SELECT JSON_EXTRACT('{ "my-key": 1 }', '$.my-key'); +--echo # +--echo # MDEV-23187: Assorted assertion failures in json_find_path with certain collations +--echo # + + +SET @save_collation_connection= @@collation_connection; + +SET @json='{ "A": [ [{"k":"v"},[1]],true],"B": {"C": 1} }'; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); + +SET @json='{ "A": [ [{"k":"v"},[1]],true],"B": {"C": 1} }'; +SET collation_connection='ucs2_bin'; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); + +SET @json='{ "A": [ [{"k":"v"},[15]],true],"B": {"C": 1} }'; +SET sql_mode=0,character_set_connection=utf32; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); + + +SET @json='{ "A": [ [{"k":"v"},[15]],true],"B": {"C": 1} }'; +SET sql_mode=0,character_set_connection=utf32; +SELECT JSON_VALUE(@json,'$.A[last-1][last-1].key1'); + +SET @@collation_connection= @save_collation_connection; + + --echo # --echo # End of 10.9 Test --echo # diff --git a/mysql-test/main/group_min_max.result b/mysql-test/main/group_min_max.result index 706a4132614..055bd266e15 100644 --- a/mysql-test/main/group_min_max.result +++ b/mysql-test/main/group_min_max.result @@ -4095,6 +4095,116 @@ MIN(pk) a 5 10 DROP TABLE t1; # +# MDEV-6768 Wrong result with agregate with join with no resultset +# +create table t1 +( +PARENT_ID int(10) unsigned NOT NULL AUTO_INCREMENT, +PARENT_FIELD VARCHAR(10), +PRIMARY KEY (PARENT_ID) +) engine=innodb; +create table t2 +( +CHILD_ID INT NOT NULL AUTO_INCREMENT, +PARENT_ID INT NOT NULL, +CHILD_FIELD varchar(10), +PRIMARY KEY (CHILD_ID) +)engine=innodb; +INSERT INTO t1 (PARENT_FIELD) +SELECT 'AAAA'; +INSERT INTO t2 (PARENT_ID, CHILD_FIELD) +SELECT 1, 'BBBB'; +explain select +t1.PARENT_ID, +min(CHILD_FIELD) +from t1 straight_join t2 +where t1.PARENT_ID = 1 +and t1.PARENT_ID = t2.PARENT_ID +and t2.CHILD_FIELD = "ZZZZ"; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1 Using index +1 SIMPLE t2 ALL NULL NULL NULL NULL 1 Using where +select +t1.PARENT_ID, +min(CHILD_FIELD) +from t1 straight_join t2 +where t1.PARENT_ID = 1 +and t1.PARENT_ID = t2.PARENT_ID +and t2.CHILD_FIELD = "ZZZZ"; +PARENT_ID min(CHILD_FIELD) +NULL NULL +select +1, +min(CHILD_FIELD) +from t1 straight_join t2 +where t1.PARENT_ID = 1 +and t1.PARENT_ID = t2.PARENT_ID +and t2.CHILD_FIELD = "ZZZZ"; +1 min(CHILD_FIELD) +1 NULL +select 
+IFNULL(t1.PARENT_ID,1), +min(CHILD_FIELD) +from t1 straight_join t2 +where t1.PARENT_ID = 1 +and t1.PARENT_ID = t2.PARENT_ID +and t2.CHILD_FIELD = "ZZZZ"; +IFNULL(t1.PARENT_ID,1) min(CHILD_FIELD) +1 NULL +# Check that things works with MyISAM (which has different explain) +alter table t1 engine=myisam; +alter table t2 engine=myisam; +explain select +t1.PARENT_ID, +min(CHILD_FIELD) +from t1 straight_join t2 +where t1.PARENT_ID = 1 +and t1.PARENT_ID = t2.PARENT_ID +and t2.CHILD_FIELD = "ZZZZ"; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables +select +t1.PARENT_ID, +min(CHILD_FIELD) +from t1 straight_join t2 +where t1.PARENT_ID = 1 +and t1.PARENT_ID = t2.PARENT_ID +and t2.CHILD_FIELD = "ZZZZ"; +PARENT_ID min(CHILD_FIELD) +NULL NULL +drop table t1,t2; +# Check that things works if sub queries are re-executed +create table t1 (a int primary key, b int); +create table t2 (a int primary key, b int); +create table t3 (a int primary key, b int); +insert into t1 values (1,1),(2,2),(3,3); +insert into t2 values (1,1),(2,2),(3,3); +insert into t3 values (1,1),(3,3); +explain +select *, +(select +CONCAT('t2:', IFNULL(t2.a, 't2a-null'), ';', +'min_t3_b:', IFNULL(min(t3.b), 't3b-null')) +from t2,t3 +where t2.a=1 and t1.b = t3.a) as s1 +from t1; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 3 +2 DEPENDENT SUBQUERY t2 const PRIMARY PRIMARY 4 const 1 Using index +2 DEPENDENT SUBQUERY t3 eq_ref PRIMARY PRIMARY 4 test.t1.b 1 +select *, +(select +CONCAT('t2:', IFNULL(t2.a, 't2a-null'), ';', +'min_t3_b:', IFNULL(min(t3.b), 't3b-null')) +from t2,t3 +where t2.a=1 and t1.b = t3.a) as s1 +from t1; +a b s1 +1 1 t2:1;min_t3_b:1 +2 2 t2:t2a-null;min_t3_b:t3b-null +3 3 t2:1;min_t3_b:3 +drop table t1,t2,t3; +# # End of 10.5 tests # # diff --git a/mysql-test/main/group_min_max.test b/mysql-test/main/group_min_max.test index 1bc334dd3da..9d056473537 100644 --- a/mysql-test/main/group_min_max.test +++ b/mysql-test/main/group_min_max.test @@ -1749,6 +1749,116 @@ SELECT MIN(pk), a FROM t1 WHERE pk <> 1 GROUP BY a; DROP TABLE t1; +--echo # +--echo # MDEV-6768 Wrong result with agregate with join with no resultset +--echo # + +create table t1 +( + PARENT_ID int(10) unsigned NOT NULL AUTO_INCREMENT, + PARENT_FIELD VARCHAR(10), + PRIMARY KEY (PARENT_ID) +) engine=innodb; + +create table t2 +( + CHILD_ID INT NOT NULL AUTO_INCREMENT, + PARENT_ID INT NOT NULL, + CHILD_FIELD varchar(10), + PRIMARY KEY (CHILD_ID) +)engine=innodb; + +INSERT INTO t1 (PARENT_FIELD) +SELECT 'AAAA'; + +INSERT INTO t2 (PARENT_ID, CHILD_FIELD) +SELECT 1, 'BBBB'; + +explain select + t1.PARENT_ID, + min(CHILD_FIELD) + from t1 straight_join t2 + where t1.PARENT_ID = 1 + and t1.PARENT_ID = t2.PARENT_ID + and t2.CHILD_FIELD = "ZZZZ"; + +select + t1.PARENT_ID, + min(CHILD_FIELD) + from t1 straight_join t2 + where t1.PARENT_ID = 1 + and t1.PARENT_ID = t2.PARENT_ID + and t2.CHILD_FIELD = "ZZZZ"; + +select + 1, + min(CHILD_FIELD) + from t1 straight_join t2 + where t1.PARENT_ID = 1 + and t1.PARENT_ID = t2.PARENT_ID + and t2.CHILD_FIELD = "ZZZZ"; + +select + IFNULL(t1.PARENT_ID,1), + min(CHILD_FIELD) + from t1 straight_join t2 + where t1.PARENT_ID = 1 + and t1.PARENT_ID = t2.PARENT_ID + and t2.CHILD_FIELD = "ZZZZ"; + + +--echo # Check that things works with MyISAM (which has different explain) + +alter table t1 engine=myisam; +alter table t2 engine=myisam; + +explain select + t1.PARENT_ID, + 
min(CHILD_FIELD) + from t1 straight_join t2 + where t1.PARENT_ID = 1 + and t1.PARENT_ID = t2.PARENT_ID + and t2.CHILD_FIELD = "ZZZZ"; + +select + t1.PARENT_ID, + min(CHILD_FIELD) + from t1 straight_join t2 + where t1.PARENT_ID = 1 + and t1.PARENT_ID = t2.PARENT_ID + and t2.CHILD_FIELD = "ZZZZ"; + +drop table t1,t2; + +--echo # Check that things works if sub queries are re-executed + +create table t1 (a int primary key, b int); +create table t2 (a int primary key, b int); +create table t3 (a int primary key, b int); + +insert into t1 values (1,1),(2,2),(3,3); +insert into t2 values (1,1),(2,2),(3,3); +insert into t3 values (1,1),(3,3); + +explain +select *, + (select + CONCAT('t2:', IFNULL(t2.a, 't2a-null'), ';', + 'min_t3_b:', IFNULL(min(t3.b), 't3b-null')) + from t2,t3 + where t2.a=1 and t1.b = t3.a) as s1 +from t1; + +select *, + (select + CONCAT('t2:', IFNULL(t2.a, 't2a-null'), ';', + 'min_t3_b:', IFNULL(min(t3.b), 't3b-null')) + from t2,t3 + where t2.a=1 and t1.b = t3.a) as s1 +from t1; + +drop table t1,t2,t3; + --echo # --echo # End of 10.5 tests --echo # diff --git a/mysql-test/main/merge.result b/mysql-test/main/merge.result index 230fcf48e9d..e7330b7e54b 100644 --- a/mysql-test/main/merge.result +++ b/mysql-test/main/merge.result @@ -3902,6 +3902,18 @@ DROP TABLE t1; DROP TABLE m1; set global default_storage_engine=@save_default_storage_engine; # +# MDEV-31083 ASAN use-after-poison in myrg_attach_children +# +CREATE TABLE t1 (f TEXT, FULLTEXT (f)) ENGINE=MyISAM; +INSERT INTO t1 VALUES ('foo'),('bar'); +CREATE TABLE mrg (f TEXT) ENGINE=MERGE, UNION(t1); +SELECT * FROM mrg; +f +foo +bar +DROP TABLE mrg, t1; +End of 10.5 tests +# # End of 10.0 tests # # diff --git a/mysql-test/main/merge.test b/mysql-test/main/merge.test index 0485f3ed1c3..76903e27ae9 100644 --- a/mysql-test/main/merge.test +++ b/mysql-test/main/merge.test @@ -2859,6 +2859,18 @@ set global default_storage_engine=@save_default_storage_engine; # gone so execution of other tests won't be affected by their presence. 
--source include/wait_until_count_sessions.inc +--echo # +--echo # MDEV-31083 ASAN use-after-poison in myrg_attach_children +--echo # + +CREATE TABLE t1 (f TEXT, FULLTEXT (f)) ENGINE=MyISAM; +INSERT INTO t1 VALUES ('foo'),('bar'); +CREATE TABLE mrg (f TEXT) ENGINE=MERGE, UNION(t1); +SELECT * FROM mrg; +DROP TABLE mrg, t1; + +--echo End of 10.5 tests + --echo # --echo # End of 10.0 tests --echo # diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result index 62466de1113..c086a9673a5 100644 --- a/mysql-test/main/selectivity.result +++ b/mysql-test/main/selectivity.result @@ -1824,7 +1824,6 @@ test.t1 analyze status Table is already up to date test.t2 analyze status Engine-independent statistics collected test.t2 analyze status Table is already up to date set optimizer_switch='exists_to_in=off'; -set optimizer_use_condition_selectivity=2; SELECT * FROM t1 WHERE EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id @@ -1849,18 +1848,39 @@ id a 17 17 18 18 19 19 -explain SELECT * FROM t1 +set statement optimizer_use_condition_selectivity=2 for explain SELECT * FROM t1 WHERE EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id WHERE A.a=t1.a AND t2.b < 20); id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 ALL NULL NULL NULL NULL 100 Using where -2 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 -2 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (10%) Using where; Using rowid filter -EXPLAIN SELECT * FROM t1 A, t1 B WHERE A.a = B.a and A.id = 65; +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 +3 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (10%) Using where; Using rowid filter +set statement optimizer_use_condition_selectivity=4 for explain SELECT * FROM t1 +WHERE +EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id +WHERE A.a=t1.a AND t2.b < 20); id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE A const PRIMARY,a PRIMARY 4 const 1 -1 SIMPLE B ref a a 5 const 1 +1 PRIMARY t1 ALL NULL NULL NULL NULL 100 Using where +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 +3 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (10%) Using where; Using rowid filter +set @query="EXPLAIN SELECT * FROM t1 A, t1 B WHERE A.a = B.a and A.id = 65"; +set statement optimizer_use_condition_selectivity=2 for explain SELECT * FROM t1 +WHERE +EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id +WHERE A.a=t1.a AND t2.b < 20); +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 100 Using where +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 +3 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (10%) Using where; Using rowid filter +set statement optimizer_use_condition_selectivity=4 for explain SELECT * FROM t1 +WHERE +EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id +WHERE A.a=t1.a AND t2.b < 20); +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 100 Using where +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 +3 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (10%) Using where; Using rowid filter explain SELECT * FROM t1 WHERE EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id @@ -1870,7 +1890,6 @@ id select_type table type possible_keys key key_len ref rows Extra 2 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 2 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (10%) Using where; Using rowid filter set optimizer_switch= 
@save_optimizer_switch; -set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity; drop table t1,t2; # # MDEV-21495: Conditional jump or move depends on uninitialised value in sel_arg_range_seq_next diff --git a/mysql-test/main/selectivity.test b/mysql-test/main/selectivity.test index df3850d74b7..9f21bea442a 100644 --- a/mysql-test/main/selectivity.test +++ b/mysql-test/main/selectivity.test @@ -1236,13 +1236,10 @@ set optimizer_use_condition_selectivity= @@optimizer_use_condition_selectivity; drop table t1,t2,t3; - --echo # --echo # MDEV-20519: Query plan regression with optimizer_use_condition_selectivity=4 --echo # - - create table t1 (id int, a int, PRIMARY KEY(id), key(a)); insert into t1 select seq,seq from seq_1_to_100; @@ -1252,7 +1249,6 @@ insert into t2 select seq,seq,seq from seq_1_to_100; analyze table t1,t2 persistent for all; set optimizer_switch='exists_to_in=off'; -set optimizer_use_condition_selectivity=2; let $query= SELECT * FROM t1 WHERE @@ -1260,14 +1256,16 @@ let $query= SELECT * FROM t1 WHERE A.a=t1.a AND t2.b < 20); eval $query; -eval explain $query; +eval set statement optimizer_use_condition_selectivity=2 for explain $query; +eval set statement optimizer_use_condition_selectivity=4 for explain $query; -EXPLAIN SELECT * FROM t1 A, t1 B WHERE A.a = B.a and A.id = 65; +set @query="EXPLAIN SELECT * FROM t1 A, t1 B WHERE A.a = B.a and A.id = 65"; +eval set statement optimizer_use_condition_selectivity=2 for explain $query; +eval set statement optimizer_use_condition_selectivity=4 for explain $query; eval explain $query; set optimizer_switch= @save_optimizer_switch; -set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity; drop table t1,t2; --echo # diff --git a/mysql-test/main/selectivity_innodb.result b/mysql-test/main/selectivity_innodb.result index a4366214643..c7eb4ac2608 100644 --- a/mysql-test/main/selectivity_innodb.result +++ b/mysql-test/main/selectivity_innodb.result @@ -1836,7 +1836,6 @@ test.t1 analyze status OK test.t2 analyze status Engine-independent statistics collected test.t2 analyze status OK set optimizer_switch='exists_to_in=off'; -set optimizer_use_condition_selectivity=2; SELECT * FROM t1 WHERE EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id @@ -1861,18 +1860,39 @@ id a 17 17 18 18 19 19 -explain SELECT * FROM t1 +set statement optimizer_use_condition_selectivity=2 for explain SELECT * FROM t1 WHERE EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id WHERE A.a=t1.a AND t2.b < 20); id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY t1 index NULL a 5 NULL 100 Using where; Using index -2 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 Using index -2 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (19%) Using where; Using rowid filter -EXPLAIN SELECT * FROM t1 A, t1 B WHERE A.a = B.a and A.id = 65; +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 Using index +3 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (19%) Using where; Using rowid filter +set statement optimizer_use_condition_selectivity=4 for explain SELECT * FROM t1 +WHERE +EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id +WHERE A.a=t1.a AND t2.b < 20); id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE A const PRIMARY,a PRIMARY 4 const 1 -1 SIMPLE B ref a a 5 const 1 Using index +1 PRIMARY t1 index NULL a 5 NULL 100 Using where; Using index +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 Using index +3 DEPENDENT SUBQUERY t2 ref|filter 
a,b a|b 5|5 test.A.id 1 (19%) Using where; Using rowid filter +set @query="EXPLAIN SELECT * FROM t1 A, t1 B WHERE A.a = B.a and A.id = 65"; +set statement optimizer_use_condition_selectivity=2 for explain SELECT * FROM t1 +WHERE +EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id +WHERE A.a=t1.a AND t2.b < 20); +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 index NULL a 5 NULL 100 Using where; Using index +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 Using index +3 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (19%) Using where; Using rowid filter +set statement optimizer_use_condition_selectivity=4 for explain SELECT * FROM t1 +WHERE +EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id +WHERE A.a=t1.a AND t2.b < 20); +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 index NULL a 5 NULL 100 Using where; Using index +3 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 Using index +3 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (19%) Using where; Using rowid filter explain SELECT * FROM t1 WHERE EXISTS (SELECT * FROM t1 A INNER JOIN t2 ON t2.a = A.id @@ -1882,7 +1902,6 @@ id select_type table type possible_keys key key_len ref rows Extra 2 DEPENDENT SUBQUERY A ref PRIMARY,a a 5 test.t1.a 1 Using index 2 DEPENDENT SUBQUERY t2 ref|filter a,b a|b 5|5 test.A.id 1 (19%) Using where; Using rowid filter set optimizer_switch= @save_optimizer_switch; -set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity; drop table t1,t2; # # MDEV-21495: Conditional jump or move depends on uninitialised value in sel_arg_range_seq_next diff --git a/mysql-test/main/type_timestamp.result b/mysql-test/main/type_timestamp.result index a4516fc91a2..30d4d819e14 100644 --- a/mysql-test/main/type_timestamp.result +++ b/mysql-test/main/type_timestamp.result @@ -1230,6 +1230,8 @@ SELECT * FROM t1 HAVING MIN(t1.c1) >= ALL(SELECT 'a' UNION SELECT 'r'); c1 Warnings: Warning 1292 Truncated incorrect datetime value: 'r' +SELECT * FROM t1 HAVING MIN(t1.c1) > 0; +c1 DROP TABLE t1; CREATE TABLE t1 (c1 timestamp); INSERT INTO t1 VALUES ('2010-01-01 00:00:00'); diff --git a/mysql-test/main/type_timestamp.test b/mysql-test/main/type_timestamp.test index 8edd52fec2a..45ec6eae8bb 100644 --- a/mysql-test/main/type_timestamp.test +++ b/mysql-test/main/type_timestamp.test @@ -810,6 +810,7 @@ DROP TABLE t1; CREATE TABLE t1 (c1 timestamp); SELECT MIN(t1.c1) AS k1 FROM t1 HAVING (k1 >= ALL(SELECT 'a' UNION SELECT 'r')); SELECT * FROM t1 HAVING MIN(t1.c1) >= ALL(SELECT 'a' UNION SELECT 'r'); +SELECT * FROM t1 HAVING MIN(t1.c1) > 0; DROP TABLE t1; CREATE TABLE t1 (c1 timestamp); diff --git a/mysql-test/mariadb-test-run.pl b/mysql-test/mariadb-test-run.pl index b4801b3835a..d2ecb6164e7 100755 --- a/mysql-test/mariadb-test-run.pl +++ b/mysql-test/mariadb-test-run.pl @@ -408,8 +408,11 @@ sub main { mark_time_used('collect'); - mysql_install_db(default_mysqld(), "$opt_vardir/install.db") unless using_extern(); - + if (!using_extern()) + { + mysql_install_db(default_mysqld(), "$opt_vardir/install.db"); + make_readonly("$opt_vardir/install.db"); + } if ($opt_dry_run) { for (@$tests) { diff --git a/mysql-test/suite/galera/disabled.def b/mysql-test/suite/galera/disabled.def index bcf6078f624..84ecd3eb7fb 100644 --- a/mysql-test/suite/galera/disabled.def +++ b/mysql-test/suite/galera/disabled.def @@ -27,3 +27,5 @@ galera_bf_kill_debug : timeout after 900 seconds galera_ssl_upgrade : [Warning] Failed to load slave 
replication state from table mysql.gtid_slave_pos: 130: Incorrect file format 'gtid_slave_pos' galera_parallel_simple : timeout related to wsrep_sync_wait galera_insert_bulk : MDEV-30536 no expected deadlock in galera_insert_bulk test +MDEV-27713 : test is using get_lock(), which is now rejected in cluster +galera_bf_abort_group_commit : MDEV-30855 PR to remove the test exists diff --git a/mysql-test/suite/galera/r/MDEV-29293.result b/mysql-test/suite/galera/r/MDEV-29293.result new file mode 100644 index 00000000000..70c0cc84a31 --- /dev/null +++ b/mysql-test/suite/galera/r/MDEV-29293.result @@ -0,0 +1,21 @@ +connection node_2; +connection node_1; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1; +set wsrep_sync_wait = 0; +CREATE TABLE t1(a int not null primary key auto_increment, b int) engine=InnoDB; +INSERT INTO t1 VALUES (1,2); +connection node_1a; +BEGIN; +UPDATE t1 SET b=3 WHERE a=1; +connection node_1; +set debug_sync='wsrep_kill_before_awake_no_mutex SIGNAL before_kill WAIT_FOR continue'; +connection node_1b; +set debug_sync= 'now WAIT_FOR before_kill'; +connection node_2; +UPDATE t1 SET b=7 WHERE a=1; +connection node_1b; +set debug_sync= 'now SIGNAL continue'; +connection node_1; +DROP TABLE t1; +SET DEBUG_SYNC= 'RESET'; diff --git a/mysql-test/suite/galera/r/galera_create_table_as_select.result b/mysql-test/suite/galera/r/galera_create_table_as_select.result index 6f65ee99f0a..beda5f30fe2 100644 --- a/mysql-test/suite/galera/r/galera_create_table_as_select.result +++ b/mysql-test/suite/galera/r/galera_create_table_as_select.result @@ -82,6 +82,7 @@ connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; LOCK TABLE t2 WRITE; connection node_1; CREATE TABLE t1 AS SELECT * FROM t2;; +connection node_1a; connection node_2; SELECT COUNT(*) = 5 FROM t2; COUNT(*) = 5 diff --git a/mysql-test/suite/galera/r/galera_gcache_recover_manytrx.result b/mysql-test/suite/galera/r/galera_gcache_recover_manytrx.result index 5caf22b39ca..5718807b5c4 100644 --- a/mysql-test/suite/galera/r/galera_gcache_recover_manytrx.result +++ b/mysql-test/suite/galera/r/galera_gcache_recover_manytrx.result @@ -134,6 +134,3 @@ connection node_1; call mtr.add_suppression("Error in Log_event::read_log_event():.*"); CALL mtr.add_suppression("conflict state 7 after post commit"); CALL mtr.add_suppression("Skipped GCache ring buffer recovery"); -connection node_2; -call mtr.add_suppression("Error in Log_event::read_log_event():.*"); -CALL mtr.add_suppression("Skipped GCache ring buffer recovery"); diff --git a/mysql-test/suite/galera/r/galera_kill_group_commit.result b/mysql-test/suite/galera/r/galera_kill_group_commit.result new file mode 100644 index 00000000000..bb59ce1486f --- /dev/null +++ b/mysql-test/suite/galera/r/galera_kill_group_commit.result @@ -0,0 +1,27 @@ +connection node_2; +connection node_1; +connect node_1_kill, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_1_ctrl, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET SESSION wsrep_sync_wait = 0; +connect node_1_follower, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET SESSION wsrep_sync_wait = 0; +connection node_1; +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; +SET SESSION DEBUG_SYNC = "commit_before_enqueue SIGNAL leader_before_enqueue_reached WAIT_FOR leader_before_enqueue_continue"; +INSERT INTO t1 VALUES (1); +connection node_1_ctrl; +SET DEBUG_SYNC = "now WAIT_FOR leader_before_enqueue_reached"; +connection node_1_follower; +INSERT INTO t1 VALUES (2);; +connection 
node_1_ctrl; +connection node_1_kill; +# Execute KILL QUERY for group commit follower +SET DEBUG_SYNC = "now SIGNAL leader_before_enqueue_continue"; +connection node_1_follower; +connection node_1; +SELECT * FROM t1; +f1 +1 +2 +SET DEBUG_SYNC = "RESET"; +DROP TABLE t1; diff --git a/mysql-test/suite/galera/r/galera_var_retry_autocommit.result b/mysql-test/suite/galera/r/galera_var_retry_autocommit.result index 50667b0a4fa..eee740b6036 100644 --- a/mysql-test/suite/galera/r/galera_var_retry_autocommit.result +++ b/mysql-test/suite/galera/r/galera_var_retry_autocommit.result @@ -36,7 +36,10 @@ SET DEBUG_SYNC = 'now SIGNAL wsrep_retry_autocommit_continue'; connection node_1; SELECT COUNT(*) FROM t1; COUNT(*) -1 +connection node_1; +SELECT COUNT(*) FROM t1; +COUNT(*) +0 SET DEBUG_SYNC = 'RESET'; SET GLOBAL debug_dbug = NULL; DROP TABLE t1; diff --git a/mysql-test/suite/galera/t/MDEV-29293.test b/mysql-test/suite/galera/t/MDEV-29293.test new file mode 100644 index 00000000000..dacbf714c06 --- /dev/null +++ b/mysql-test/suite/galera/t/MDEV-29293.test @@ -0,0 +1,41 @@ +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/galera_have_debug_sync.inc + +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 +set wsrep_sync_wait = 0; + +CREATE TABLE t1(a int not null primary key auto_increment, b int) engine=InnoDB; +INSERT INTO t1 VALUES (1,2); + +--connection node_1a +--let $victim_id = `SELECT CONNECTION_ID()` +BEGIN; +UPDATE t1 SET b=3 WHERE a=1; + +--connection node_1 +set debug_sync='wsrep_kill_before_awake_no_mutex SIGNAL before_kill WAIT_FOR continue'; +--disable_query_log +--disable_result_log +--send_eval KILL CONNECTION $victim_id +--enable_result_log +--enable_query_log + +--connection node_1b +set debug_sync= 'now WAIT_FOR before_kill'; + +--connection node_2 +UPDATE t1 SET b=7 WHERE a=1; + +--connection node_1b +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE User = 'system user' AND State LIKE 'Update_rows_log_event%'; +--source include/wait_condition.inc +set debug_sync= 'now SIGNAL continue'; + +--connection node_1 +--reap +DROP TABLE t1; +SET DEBUG_SYNC= 'RESET'; + diff --git a/mysql-test/suite/galera/t/galera_create_table_as_select.test b/mysql-test/suite/galera/t/galera_create_table_as_select.test index a6c1f657280..cfee63e5e27 100644 --- a/mysql-test/suite/galera/t/galera_create_table_as_select.test +++ b/mysql-test/suite/galera/t/galera_create_table_as_select.test @@ -113,6 +113,10 @@ LOCK TABLE t2 WRITE; --connection node_1 --send CREATE TABLE t1 AS SELECT * FROM t2; +--connection node_1a +--let $wait_condition = SELECT COUNT(*) = 1 FROM information_schema.processlist WHERE STATE LIKE 'Waiting for table metadata lock%' +--source include/wait_condition.inc + --connection node_2 SELECT COUNT(*) = 5 FROM t2; CREATE TABLE t1 AS SELECT * FROM t2; @@ -121,7 +125,7 @@ CREATE TABLE t1 AS SELECT * FROM t2; UNLOCK TABLES; --connection node_1 ---error ER_TABLE_EXISTS_ERROR,ER_LOCK_DEADLOCK +--error ER_TABLE_EXISTS_ERROR,ER_QUERY_INTERRUPTED --reap DROP TABLE t1, t2; diff --git a/mysql-test/suite/galera/t/galera_kill_group_commit.cnf b/mysql-test/suite/galera/t/galera_kill_group_commit.cnf new file mode 100644 index 00000000000..60f4f776409 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_kill_group_commit.cnf @@ -0,0 +1,5 @@ +!include ../galera_2nodes.cnf + +[mysqld] +log-bin +log-slave-updates diff --git 
a/mysql-test/suite/galera/t/galera_kill_group_commit.test b/mysql-test/suite/galera/t/galera_kill_group_commit.test new file mode 100644 index 00000000000..4b84f2d90ef --- /dev/null +++ b/mysql-test/suite/galera/t/galera_kill_group_commit.test @@ -0,0 +1,69 @@ +# +# Verify that transaction which has reached group commit queue +# cannot be killed. If the kill succeeds, assertion for +# wsrep transaction state will fail. +# +# If the bug is present, i.e. wsrep transaction gets killed during +# group commit wait, this test is enough to reproduce the crash +# most of the time. +# + +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/galera_cluster.inc + +# Connection for KILL commands +--connect node_1_kill, 127.0.0.1, root, , test, $NODE_MYPORT_1 +# Connection for sync point control +--connect node_1_ctrl, 127.0.0.1, root, , test, $NODE_MYPORT_1 +SET SESSION wsrep_sync_wait = 0; +# Connection for group commit follower +--connect node_1_follower, 127.0.0.1, root, , test, $NODE_MYPORT_1 +# Need to disable sync wait to reach commit queue when leader +# is blocked. +SET SESSION wsrep_sync_wait = 0; +--let $follower_id = `SELECT CONNECTION_ID()` + +--connection node_1 +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; + +SET SESSION DEBUG_SYNC = "commit_before_enqueue SIGNAL leader_before_enqueue_reached WAIT_FOR leader_before_enqueue_continue"; +--send INSERT INTO t1 VALUES (1) + +--connection node_1_ctrl +SET DEBUG_SYNC = "now WAIT_FOR leader_before_enqueue_reached"; + +--connection node_1_follower +# SET SESSION DEBUG_SYNC = "group_commit_waiting_for_prior SIGNAL follower_waiting_for_prior_reached WAIT_FOR follower_waiting_for_prior_continue"; +--send INSERT INTO t1 VALUES (2); + +--connection node_1_ctrl +# TODO: Is it possible to use sync points to enforce group commit to happen? +# The leader will hold commit monitor in commit_before_enqueue sync point, +# which prevents the follower to reach the group commit wait state. +# We now sleep and expect the follower to reach group commit, but this +# may cause false negatives. +--sleep 1 + +--connection node_1_kill +--echo # Execute KILL QUERY for group commit follower +--disable_query_log +--disable_result_log +# Because it is currently impossible to verify that the +# follower has reached group commit queue, the KILL may +# sometimes return success. 
+--error 0,ER_KILL_DENIED_ERROR +--eval KILL QUERY $follower_id +--enable_result_log +--enable_query_log + +SET DEBUG_SYNC = "now SIGNAL leader_before_enqueue_continue"; +--connection node_1_follower +--reap + +--connection node_1 +--reap +SELECT * FROM t1; + +SET DEBUG_SYNC = "RESET"; +DROP TABLE t1; diff --git a/mysql-test/suite/galera/t/galera_var_retry_autocommit.test b/mysql-test/suite/galera/t/galera_var_retry_autocommit.test index c58eba1410e..8009fe88c65 100644 --- a/mysql-test/suite/galera/t/galera_var_retry_autocommit.test +++ b/mysql-test/suite/galera/t/galera_var_retry_autocommit.test @@ -64,6 +64,7 @@ SELECT COUNT(*) FROM t1; SET DEBUG_SYNC = 'now SIGNAL wsrep_retry_autocommit_continue'; --connection node_1 +--error 0,ER_LOCK_DEADLOCK --reap SELECT COUNT(*) FROM t1; diff --git a/mysql-test/suite/innodb/r/monitor.result b/mysql-test/suite/innodb/r/monitor.result index 63f78752276..f3d7ca8cd02 100644 --- a/mysql-test/suite/innodb/r/monitor.result +++ b/mysql-test/suite/innodb/r/monitor.result @@ -1,10 +1,9 @@ -set global innodb_monitor_disable = All; select name, if(enabled,'enabled','disabled') status from information_schema.innodb_metrics; name status metadata_table_handles_opened disabled -lock_deadlocks disabled -lock_timeouts disabled +lock_deadlocks enabled +lock_timeouts enabled lock_rec_lock_waits disabled lock_table_lock_waits disabled lock_rec_lock_requests disabled @@ -14,30 +13,30 @@ lock_rec_locks disabled lock_table_lock_created disabled lock_table_lock_removed disabled lock_table_locks disabled -lock_row_lock_current_waits disabled -lock_row_lock_time disabled -lock_row_lock_time_max disabled -lock_row_lock_waits disabled -lock_row_lock_time_avg disabled -buffer_pool_size disabled -buffer_pool_reads disabled -buffer_pool_read_requests disabled -buffer_pool_write_requests disabled -buffer_pool_wait_free disabled -buffer_pool_read_ahead disabled -buffer_pool_read_ahead_evicted disabled -buffer_pool_pages_total disabled -buffer_pool_pages_misc disabled -buffer_pool_pages_data disabled -buffer_pool_bytes_data disabled -buffer_pool_pages_dirty disabled -buffer_pool_bytes_dirty disabled -buffer_pool_pages_free disabled -buffer_pages_created disabled -buffer_pages_written disabled -buffer_pages_read disabled -buffer_data_reads disabled -buffer_data_written disabled +lock_row_lock_current_waits enabled +lock_row_lock_time enabled +lock_row_lock_time_max enabled +lock_row_lock_waits enabled +lock_row_lock_time_avg enabled +buffer_pool_size enabled +buffer_pool_reads enabled +buffer_pool_read_requests enabled +buffer_pool_write_requests enabled +buffer_pool_wait_free enabled +buffer_pool_read_ahead enabled +buffer_pool_read_ahead_evicted enabled +buffer_pool_pages_total enabled +buffer_pool_pages_misc enabled +buffer_pool_pages_data enabled +buffer_pool_bytes_data enabled +buffer_pool_pages_dirty enabled +buffer_pool_bytes_dirty enabled +buffer_pool_pages_free enabled +buffer_pages_created enabled +buffer_pages_written enabled +buffer_pages_read enabled +buffer_data_reads enabled +buffer_data_written enabled buffer_flush_batch_scanned disabled buffer_flush_batch_num_scan disabled buffer_flush_batch_scanned_per_call disabled @@ -70,8 +69,8 @@ buffer_flush_background_pages disabled buffer_LRU_batch_scanned disabled buffer_LRU_batch_num_scan disabled buffer_LRU_batch_scanned_per_call disabled -buffer_LRU_batch_flush_total_pages disabled -buffer_LRU_batch_evict_total_pages disabled +buffer_LRU_batch_flush_total_pages enabled +buffer_LRU_batch_evict_total_pages enabled 
buffer_LRU_single_flush_failure_count disabled buffer_LRU_get_free_search disabled buffer_LRU_search_scanned disabled @@ -112,21 +111,21 @@ buffer_page_written_blob disabled buffer_page_written_zblob disabled buffer_page_written_zblob2 disabled buffer_page_written_other disabled -os_data_reads disabled -os_data_writes disabled -os_data_fsyncs disabled -os_pending_reads disabled -os_pending_writes disabled -os_log_bytes_written disabled +os_data_reads enabled +os_data_writes enabled +os_data_fsyncs enabled +os_pending_reads enabled +os_pending_writes enabled +os_log_bytes_written enabled trx_rw_commits disabled trx_ro_commits disabled trx_nl_ro_commits disabled trx_commits_insert_update disabled trx_rollbacks disabled trx_rollbacks_savepoint disabled -trx_rseg_history_len disabled +trx_rseg_history_len enabled trx_undo_slots_used disabled -trx_undo_slots_cached disabled +trx_undo_slots_cached enabled trx_rseg_current_size disabled purge_del_mark_records disabled purge_upd_exist_or_extern_records disabled @@ -142,9 +141,9 @@ log_lsn_current disabled log_lsn_checkpoint_age disabled log_lsn_buf_pool_oldest disabled log_max_modified_age_async disabled -log_waits disabled -log_write_requests disabled -log_writes disabled +log_waits enabled +log_write_requests enabled +log_writes enabled compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled @@ -162,34 +161,34 @@ index_page_merge_successful disabled index_page_reorg_attempts disabled index_page_reorg_successful disabled index_page_discards disabled -adaptive_hash_searches disabled -adaptive_hash_searches_btree disabled +adaptive_hash_searches enabled +adaptive_hash_searches_btree enabled adaptive_hash_pages_added disabled adaptive_hash_pages_removed disabled adaptive_hash_rows_added disabled adaptive_hash_rows_removed disabled adaptive_hash_rows_deleted_no_hash_entry disabled adaptive_hash_rows_updated disabled -file_num_open_files disabled -ibuf_merges_insert disabled -ibuf_merges_delete_mark disabled -ibuf_merges_delete disabled -ibuf_merges_discard_insert disabled -ibuf_merges_discard_delete_mark disabled -ibuf_merges_discard_delete disabled -ibuf_merges disabled -ibuf_size disabled +file_num_open_files enabled +ibuf_merges_insert enabled +ibuf_merges_delete_mark enabled +ibuf_merges_delete enabled +ibuf_merges_discard_insert enabled +ibuf_merges_discard_delete_mark enabled +ibuf_merges_discard_delete enabled +ibuf_merges enabled +ibuf_size enabled innodb_master_thread_sleeps disabled -innodb_activity_count disabled +innodb_activity_count enabled innodb_master_active_loops disabled innodb_master_idle_loops disabled innodb_log_flush_usec disabled innodb_dict_lru_usec disabled innodb_dict_lru_count_active disabled innodb_dict_lru_count_idle disabled -innodb_dblwr_writes disabled -innodb_dblwr_pages_written disabled -innodb_page_size disabled +innodb_dblwr_writes enabled +innodb_dblwr_pages_written enabled +innodb_page_size enabled ddl_background_drop_indexes disabled ddl_online_create_index disabled ddl_pending_alter_table disabled @@ -199,6 +198,9 @@ icp_attempts disabled icp_no_match disabled icp_out_of_range disabled icp_match disabled +set global innodb_monitor_disable = All; +select name from information_schema.innodb_metrics where enabled; +name set global innodb_monitor_enable = all; select name from information_schema.innodb_metrics where not enabled; name diff --git a/mysql-test/suite/innodb/r/stat_tables.result b/mysql-test/suite/innodb/r/stat_tables.result index 
c1ce6fc8fce..99c862fea77 100644 --- a/mysql-test/suite/innodb/r/stat_tables.result +++ b/mysql-test/suite/innodb/r/stat_tables.result @@ -26,4 +26,60 @@ UPDATE mysql.innodb_table_stats SET last_update=NULL WHERE table_name='t1'; XA END 'test'; XA ROLLBACK 'test'; DROP TABLE t1; +# +# MDEV-30483 After upgrade to 10.6 from Mysql 5.7 seeing "InnoDB: Column last_update in table mysql.innodb_table_stats is BINARY(4) NOT NULL but should be INT UNSIGNED NOT NULL" +# +# +# Testing a non-default format: Field_timestamp0 - UINT4 based +# +SET @@global.mysql56_temporal_format=0; +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW COLUMNS FROM mysql.innodb_table_stats LIKE 'last_update'; +Field Type Null Key Default Extra +last_update timestamp /* mariadb-5.3 */ NO current_timestamp() on update current_timestamp() +SHOW COLUMNS FROM mysql.innodb_index_stats LIKE 'last_update'; +Field Type Null Key Default Extra +last_update timestamp /* mariadb-5.3 */ NO current_timestamp() on update current_timestamp() +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=1; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +TIMESTAMPDIFF(DAY,last_update,now())<=1 +1 +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +TIMESTAMPDIFF(DAY,last_update,now())<=1 +1 +DROP TABLE t1; +# +# Now as the table t1 is dropped, expect no statistics +# +SELECT * FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +database_name table_name last_update n_rows clustered_index_size sum_of_other_index_sizes +SELECT * FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +database_name table_name index_name last_update stat_name stat_value sample_size stat_description +# +# Testing with the default format: Field_timestampf - BINARY(4) based with the UNSIGNED_FLAG +# +SET @@global.mysql56_temporal_format=1; +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW COLUMNS FROM mysql.innodb_table_stats LIKE 'last_update'; +Field Type Null Key Default Extra +last_update timestamp NO current_timestamp() on update current_timestamp() +SHOW COLUMNS FROM mysql.innodb_index_stats LIKE 'last_update'; +Field Type Null Key Default Extra +last_update timestamp NO current_timestamp() on update current_timestamp() +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=1; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +TIMESTAMPDIFF(DAY,last_update,now())<=1 +1 +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +TIMESTAMPDIFF(DAY,last_update,now())<=1 +1 +DROP TABLE t1; # End of 10.6 tests diff --git a/mysql-test/suite/innodb/t/monitor.test b/mysql-test/suite/innodb/t/monitor.test index 9b093d41892..2c9152621eb 100644 --- a/mysql-test/suite/innodb/t/monitor.test +++ 
b/mysql-test/suite/innodb/t/monitor.test @@ -5,12 +5,14 @@ # sys_vars.innodb_monitor_enable_basic --source include/have_innodb.inc -set global innodb_monitor_disable = All; # Test turn on/off the monitor counter with "all" option # By default, they will be off. select name, if(enabled,'enabled','disabled') status from information_schema.innodb_metrics; +set global innodb_monitor_disable = All; +select name from information_schema.innodb_metrics where enabled; + # Turn on all monitor counters set global innodb_monitor_enable = all; diff --git a/mysql-test/suite/innodb/t/stat_tables.test b/mysql-test/suite/innodb/t/stat_tables.test index dd18c265e99..ad421587283 100644 --- a/mysql-test/suite/innodb/t/stat_tables.test +++ b/mysql-test/suite/innodb/t/stat_tables.test @@ -28,4 +28,57 @@ XA END 'test'; XA ROLLBACK 'test'; DROP TABLE t1; +--echo # +--echo # MDEV-30483 After upgrade to 10.6 from Mysql 5.7 seeing "InnoDB: Column last_update in table mysql.innodb_table_stats is BINARY(4) NOT NULL but should be INT UNSIGNED NOT NULL" +--echo # + +# The following tests demonstrate that these columns: +# - innodb_table_stats.last_update +# - innodb_index_stats.last_update +# have sane values close to NOW(), rather than any garbage, +# with all TIMESTAMP formats. + +--echo # +--echo # Testing a non-default format: Field_timestamp0 - UINT4 based +--echo # + +SET @@global.mysql56_temporal_format=0; +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW COLUMNS FROM mysql.innodb_table_stats LIKE 'last_update'; +SHOW COLUMNS FROM mysql.innodb_index_stats LIKE 'last_update'; +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=1; + +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +DROP TABLE t1; + +--echo # +--echo # Now as the table t1 is dropped, expect no statistics +--echo # + +SELECT * FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +SELECT * FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; + +--echo # +--echo # Testing with the default format: Field_timestampf - BINARY(4) based with the UNSIGNED_FLAG +--echo # + +SET @@global.mysql56_temporal_format=1; +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW COLUMNS FROM mysql.innodb_table_stats LIKE 'last_update'; +SHOW COLUMNS FROM mysql.innodb_index_stats LIKE 'last_update'; +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=1; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +DROP TABLE t1; + + --echo # End of 10.6 tests diff --git a/mysql-test/suite/parts/r/partition_purge.result b/mysql-test/suite/parts/r/partition_purge.result new file mode 100644 index 
00000000000..072b141cd8d --- /dev/null +++ b/mysql-test/suite/parts/r/partition_purge.result @@ -0,0 +1,26 @@ +CREATE TABLE t1(f1 INT, f2 INT, INDEX(f1))ENGINE=InnoDB +PARTITION BY LIST(f1) ( +PARTITION p1 VALUES in (1, 2, 3), +PARTITION p2 VALUES in (4, 5, 6)); +INSERT INTO t1 VALUES(1, 1), (1, 1), (6, 1); +connect con1,localhost,root,,,; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connect con2,localhost,root,,,; +SET DEBUG_SYNC="innodb_rollback_inplace_alter_table SIGNAL default_resume WAIT_FOR alter_resume"; +ALTER TABLE t1 ADD UNIQUE INDEX(f1); +connection default; +set DEBUG_SYNC="now WAIT_FOR default_resume"; +SET DEBUG_SYNC="innodb_row_update_for_mysql_begin SIGNAL alter_resume WAIT_FOR alter_finish"; +DELETE FROM t1; +connection con2; +ERROR 23000: Duplicate entry '1' for key 'f1_2' +SET DEBUG_SYNC="now SIGNAL alter_finish"; +connection default; +connection con1; +commit; +connection default; +disconnect con1; +disconnect con2; +InnoDB 0 transactions not purged +drop table t1; +SET DEBUG_SYNC=reset; diff --git a/mysql-test/suite/parts/t/partition_purge.opt b/mysql-test/suite/parts/t/partition_purge.opt new file mode 100644 index 00000000000..a39e5228c9d --- /dev/null +++ b/mysql-test/suite/parts/t/partition_purge.opt @@ -0,0 +1 @@ +--innodb_purge_threads=1 diff --git a/mysql-test/suite/parts/t/partition_purge.test b/mysql-test/suite/parts/t/partition_purge.test new file mode 100644 index 00000000000..2df81b0eb77 --- /dev/null +++ b/mysql-test/suite/parts/t/partition_purge.test @@ -0,0 +1,37 @@ +--source include/have_innodb.inc +--source include/have_partition.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +CREATE TABLE t1(f1 INT, f2 INT, INDEX(f1))ENGINE=InnoDB + PARTITION BY LIST(f1) ( + PARTITION p1 VALUES in (1, 2, 3), + PARTITION p2 VALUES in (4, 5, 6)); +INSERT INTO t1 VALUES(1, 1), (1, 1), (6, 1); +connect(con1,localhost,root,,,); +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +connect(con2,localhost,root,,,); +SET DEBUG_SYNC="innodb_rollback_inplace_alter_table SIGNAL default_resume WAIT_FOR alter_resume"; +send ALTER TABLE t1 ADD UNIQUE INDEX(f1); + +connection default; +set DEBUG_SYNC="now WAIT_FOR default_resume"; +SET DEBUG_SYNC="innodb_row_update_for_mysql_begin SIGNAL alter_resume WAIT_FOR alter_finish"; +send DELETE FROM t1; + +connection con2; +--error ER_DUP_ENTRY +reap; +SET DEBUG_SYNC="now SIGNAL alter_finish"; + +connection default; +reap; +connection con1; +commit; +connection default; +disconnect con1; +disconnect con2; +--source ../../innodb/include/wait_all_purged.inc +drop table t1; +SET DEBUG_SYNC=reset; diff --git a/plugin/type_mysql_timestamp/CMakeLists.txt b/plugin/type_mysql_timestamp/CMakeLists.txt new file mode 100644 index 00000000000..ca7bf1e7704 --- /dev/null +++ b/plugin/type_mysql_timestamp/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019, MariaDB corporation +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
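The new partition_purge test depends on a precise interleaving: the inplace ALTER must reach its rollback path first, the DELETE must start while that rollback is pending, and only then is purge expected to clean everything up (wait_all_purged.inc). A rough standalone analogue of the DEBUG_SYNC SIGNAL/WAIT_FOR handshakes follows; it uses plain C++ threads, is not server code, and the names simply mirror the sync points above.

/* Standalone analogue (not server code) of the DEBUG_SYNC handshakes in
   partition_purge.test: a named signal is a flag under a mutex, and
   WAIT_FOR blocks until another thread raises it. */
#include <condition_variable>
#include <mutex>
#include <set>
#include <string>
#include <thread>
#include <cstdio>

static std::mutex m;
static std::condition_variable cv;
static std::set<std::string> raised;

static void sync_signal(const std::string &name)
{
  std::lock_guard<std::mutex> lk(m);
  raised.insert(name);
  cv.notify_all();
}

static void sync_wait_for(const std::string &name)
{
  std::unique_lock<std::mutex> lk(m);
  cv.wait(lk, [&] { return raised.count(name) != 0; });
}

int main()
{
  std::thread alter([] {                 /* plays con2: ALTER ... ADD UNIQUE */
    std::puts("ALTER: reached innodb_rollback_inplace_alter_table");
    sync_signal("default_resume");
    sync_wait_for("alter_resume");
    std::puts("ALTER: rollback finishes, duplicate key reported");
    sync_signal("alter_finish");         /* raised after the error is reaped */
  });
  std::thread del([] {                   /* plays default: DELETE FROM t1 */
    sync_wait_for("default_resume");
    std::puts("DELETE: begins while the ALTER rollback is pending");
    sync_signal("alter_resume");
    sync_wait_for("alter_finish");
    std::puts("DELETE: proceeds; purge can now clean up");
  });
  alter.join();
  del.join();
  return 0;
}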
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +MYSQL_ADD_PLUGIN(type_mysql_timestamp plugin.cc RECOMPILE_FOR_EMBEDDED + MODULE_ONLY COMPONENT Test) diff --git a/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/suite.opt b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/suite.opt new file mode 100644 index 00000000000..e9e2a99b589 --- /dev/null +++ b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/suite.opt @@ -0,0 +1 @@ +--plugin-load-add=$TYPE_MYSQL_TIMESTAMP_SO diff --git a/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/suite.pm b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/suite.pm new file mode 100644 index 00000000000..cbb8f1b097f --- /dev/null +++ b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/suite.pm @@ -0,0 +1,10 @@ +package My::Suite::Type_test; + +@ISA = qw(My::Suite); + +return "No TYPE_TEST plugin" unless $ENV{TYPE_MYSQL_TIMESTAMP_SO}; + +sub is_default { 1 } + +bless { }; + diff --git a/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp.result b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp.result new file mode 100644 index 00000000000..4a622ffa8bf --- /dev/null +++ b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp.result @@ -0,0 +1,45 @@ +# +# MDEV-30483 After upgrade to 10.6 from Mysql 5.7 seeing "InnoDB: Column last_update in table mysql.innodb_table_stats is BINARY(4) NOT NULL but should be INT UNSIGNED NOT NULL" +# +SELECT +PLUGIN_NAME, +PLUGIN_VERSION, +PLUGIN_STATUS, +PLUGIN_TYPE, +PLUGIN_AUTHOR, +PLUGIN_DESCRIPTION, +PLUGIN_LICENSE, +PLUGIN_MATURITY, +PLUGIN_AUTH_VERSION +FROM INFORMATION_SCHEMA.PLUGINS +WHERE PLUGIN_TYPE='DATA TYPE' + AND PLUGIN_NAME LIKE 'type_mysql_timestamp'; +PLUGIN_NAME type_mysql_timestamp +PLUGIN_VERSION 1.0 +PLUGIN_STATUS ACTIVE +PLUGIN_TYPE DATA TYPE +PLUGIN_AUTHOR MariaDB Corporation +PLUGIN_DESCRIPTION Data type TYPE_MYSQL_TIMESTAMP +PLUGIN_LICENSE GPL +PLUGIN_MATURITY Experimental +PLUGIN_AUTH_VERSION 1.0 +CREATE TABLE t1 (a TYPE_MYSQL_TIMESTAMP); +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` type_mysql_timestamp NULL DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +DROP TABLE t1; +CREATE TABLE t1 (a TIMESTAMP); +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` timestamp NULL DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +ALTER TABLE t1 MODIFY a TYPE_MYSQL_TIMESTAMP; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` type_mysql_timestamp NULL DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +DROP TABLE t1; diff --git a/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp.test b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp.test new file mode 100644 index 00000000000..a7aaa5a3e4c --- /dev/null +++ b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp.test @@ -0,0 +1,31 @@ +--source include/have_innodb.inc + +--echo # +--echo # MDEV-30483 After upgrade to 10.6 from Mysql 5.7 seeing "InnoDB: Column last_update in table mysql.innodb_table_stats is BINARY(4) NOT NULL but should be INT UNSIGNED NOT NULL" +--echo # + +--vertical_results +SELECT + 
PLUGIN_NAME, + PLUGIN_VERSION, + PLUGIN_STATUS, + PLUGIN_TYPE, + PLUGIN_AUTHOR, + PLUGIN_DESCRIPTION, + PLUGIN_LICENSE, + PLUGIN_MATURITY, + PLUGIN_AUTH_VERSION +FROM INFORMATION_SCHEMA.PLUGINS + WHERE PLUGIN_TYPE='DATA TYPE' + AND PLUGIN_NAME LIKE 'type_mysql_timestamp'; +--horizontal_results + +CREATE TABLE t1 (a TYPE_MYSQL_TIMESTAMP); +SHOW CREATE TABLE t1; +DROP TABLE t1; + +CREATE TABLE t1 (a TIMESTAMP); +SHOW CREATE TABLE t1; +ALTER TABLE t1 MODIFY a TYPE_MYSQL_TIMESTAMP; +SHOW CREATE TABLE t1; +DROP TABLE t1; diff --git a/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp_stat_tables.result b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp_stat_tables.result new file mode 100644 index 00000000000..97be602f673 --- /dev/null +++ b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp_stat_tables.result @@ -0,0 +1,108 @@ +# +# MDEV-30483 After upgrade to 10.6 from Mysql 5.7 seeing "InnoDB: Column last_update in table mysql.innodb_table_stats is BINARY(4) NOT NULL but should be INT UNSIGNED NOT NULL" +# +SET @@global.innodb_stats_persistent=0; +SHOW CREATE TABLE mysql.innodb_table_stats; +Table Create Table +innodb_table_stats CREATE TABLE `innodb_table_stats` ( + `database_name` varchar(64) NOT NULL, + `table_name` varchar(199) NOT NULL, + `last_update` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), + `n_rows` bigint(20) unsigned NOT NULL, + `clustered_index_size` bigint(20) unsigned NOT NULL, + `sum_of_other_index_sizes` bigint(20) unsigned NOT NULL, + PRIMARY KEY (`database_name`,`table_name`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_bin STATS_PERSISTENT=0 +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_table_stats; +Table Create Table +innodb_table_stats CREATE TABLE `innodb_table_stats` ( + `database_name` varchar(64) NOT NULL, + `table_name` varchar(199) NOT NULL, + `last_update` type_mysql_timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), + `n_rows` bigint(20) unsigned NOT NULL, + `clustered_index_size` bigint(20) unsigned NOT NULL, + `sum_of_other_index_sizes` bigint(20) unsigned NOT NULL, + PRIMARY KEY (`database_name`,`table_name`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_bin STATS_PERSISTENT=0 +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_index_stats; +Table Create Table +innodb_index_stats CREATE TABLE `innodb_index_stats` ( + `database_name` varchar(64) NOT NULL, + `table_name` varchar(199) NOT NULL, + `index_name` varchar(64) NOT NULL, + `last_update` type_mysql_timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), + `stat_name` varchar(64) NOT NULL, + `stat_value` bigint(20) unsigned NOT NULL, + `sample_size` bigint(20) unsigned DEFAULT NULL, + `stat_description` varchar(1024) NOT NULL, + PRIMARY KEY (`database_name`,`table_name`,`index_name`,`stat_name`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_bin STATS_PERSISTENT=0 +SET @@global.innodb_stats_persistent=1; +CREATE TABLE t1 (a INT, KEY(a)) ENGINE=InnoDB; +INSERT INTO t1 VALUES (10); +DROP TABLE t1; +SET @@global.innodb_stats_persistent=0; +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON 
UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_table_stats; +Table Create Table +innodb_table_stats CREATE TABLE `innodb_table_stats` ( + `database_name` varchar(64) NOT NULL, + `table_name` varchar(199) NOT NULL, + `last_update` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), + `n_rows` bigint(20) unsigned NOT NULL, + `clustered_index_size` bigint(20) unsigned NOT NULL, + `sum_of_other_index_sizes` bigint(20) unsigned NOT NULL, + PRIMARY KEY (`database_name`,`table_name`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_bin STATS_PERSISTENT=0 +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_index_stats; +Table Create Table +innodb_index_stats CREATE TABLE `innodb_index_stats` ( + `database_name` varchar(64) NOT NULL, + `table_name` varchar(199) NOT NULL, + `index_name` varchar(64) NOT NULL, + `last_update` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), + `stat_name` varchar(64) NOT NULL, + `stat_value` bigint(20) unsigned NOT NULL, + `sample_size` bigint(20) unsigned DEFAULT NULL, + `stat_description` varchar(1024) NOT NULL, + PRIMARY KEY (`database_name`,`table_name`,`index_name`,`stat_name`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_bin STATS_PERSISTENT=0 +SET @@global.innodb_stats_persistent=1; +# +# Testing MySQL-5.6-alike Field_timestampf: BINARY(4) based, without UNSIGNED_FLAG +# +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW COLUMNS FROM mysql.innodb_table_stats LIKE 'last_update'; +Field Type Null Key Default Extra +last_update type_mysql_timestamp NO current_timestamp() on update current_timestamp() +SHOW COLUMNS FROM mysql.innodb_index_stats LIKE 'last_update'; +Field Type Null Key Default Extra +last_update type_mysql_timestamp NO current_timestamp() on update current_timestamp() +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=1; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +TIMESTAMPDIFF(DAY,last_update,now())<=1 +1 +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +TIMESTAMPDIFF(DAY,last_update,now())<=1 +1 +DROP TABLE t1; +# +# Now as the table t1 is dropped, expect no statistics +# +SELECT * FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +database_name table_name last_update n_rows clustered_index_size sum_of_other_index_sizes +SELECT * FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +database_name table_name index_name last_update stat_name stat_value sample_size stat_description +# +# Restore the structure of the tables +# +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); diff --git a/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp_stat_tables.test 
b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp_stat_tables.test new file mode 100644 index 00000000000..ff596c36fb3 --- /dev/null +++ b/plugin/type_mysql_timestamp/mysql-test/type_mysql_timestamp/type_mysql_timestamp_stat_tables.test @@ -0,0 +1,62 @@ +--source include/have_innodb.inc + +--echo # +--echo # MDEV-30483 After upgrade to 10.6 from Mysql 5.7 seeing "InnoDB: Column last_update in table mysql.innodb_table_stats is BINARY(4) NOT NULL but should be INT UNSIGNED NOT NULL" +--echo # + +SET @@global.innodb_stats_persistent=0; +SHOW CREATE TABLE mysql.innodb_table_stats; +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_table_stats; +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_index_stats; +SET @@global.innodb_stats_persistent=1; + +CREATE TABLE t1 (a INT, KEY(a)) ENGINE=InnoDB; +INSERT INTO t1 VALUES (10); +DROP TABLE t1; + +SET @@global.innodb_stats_persistent=0; +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_table_stats; +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW CREATE TABLE mysql.innodb_index_stats; +SET @@global.innodb_stats_persistent=1; + + +# The following test demonstrate that these columns: +# - innodb_table_stats.last_update +# - innodb_index_stats.last_update +# have sane values close to NOW(), rather than any garbage, +# with MySQL-alike Field_timestampf + +--echo # +--echo # Testing MySQL-5.6-alike Field_timestampf: BINARY(4) based, without UNSIGNED_FLAG +--echo # + +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TYPE_MYSQL_TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +SHOW COLUMNS FROM mysql.innodb_table_stats LIKE 'last_update'; +SHOW COLUMNS FROM mysql.innodb_index_stats LIKE 'last_update'; +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=1; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +SELECT TIMESTAMPDIFF(DAY,last_update,now())<=1 FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; +DROP TABLE t1; + +--echo # +--echo # Now as the table t1 is dropped, expect no statistics +--echo # + +SELECT * FROM mysql.innodb_table_stats +WHERE database_name='test' AND table_name='t1'; +SELECT * FROM mysql.innodb_index_stats +WHERE database_name='test' AND table_name='t1' AND stat_name='size'; + +--echo # +--echo # Restore the structure of the tables +--echo # + +ALTER TABLE mysql.innodb_table_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); +ALTER TABLE mysql.innodb_index_stats MODIFY last_update TIMESTAMP NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(); diff --git a/plugin/type_mysql_timestamp/plugin.cc b/plugin/type_mysql_timestamp/plugin.cc new file mode 100644 index 00000000000..fd6ad896aa7 --- /dev/null +++ b/plugin/type_mysql_timestamp/plugin.cc @@ -0,0 +1,177 @@ +/* + 
Copyright (c) 2023, MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include +#include +#include +#include "sql_type.h" + + +class Type_collection_local: public Type_collection +{ +protected: + const Type_handler *aggregate_common(const Type_handler *h1, + const Type_handler *h2) const; +public: + const Type_handler *handler_by_name(const LEX_CSTRING &name) const override + { + return NULL; + } + + const Type_handler *aggregate_for_result(const Type_handler *h1, + const Type_handler *h2) + const override + { + return aggregate_common(h1, h2); + } + + const Type_handler *aggregate_for_comparison(const Type_handler *h1, + const Type_handler *h2) + const override + { + return aggregate_common(h1, h2); + } + + const Type_handler *aggregate_for_min_max(const Type_handler *h1, + const Type_handler *h2) + const override + { + return aggregate_common(h1, h2); + } + + const Type_handler *aggregate_for_num_op(const Type_handler *h1, + const Type_handler *h2) + const override + { + return aggregate_common(h1, h2); + } +}; + + +static Type_collection_local type_collection_local; + + +/* + A more MySQL compatible Field: + it does not set the UNSIGNED_FLAG. + This is how MySQL's Field_timestampf works. +*/ +class Field_mysql_timestampf :public Field_timestampf +{ +public: + Field_mysql_timestampf(const LEX_CSTRING &name, + const Record_addr &addr, + enum utype unireg_check_arg, + TABLE_SHARE *share, decimal_digits_t dec_arg) + :Field_timestampf(addr.ptr(), addr.null_ptr(), addr.null_bit(), + unireg_check_arg, &name, share, dec_arg) + { + flags&= ~UNSIGNED_FLAG; // MySQL compatibility + } + void sql_type(String &str) const override + { + sql_type_opt_dec_comment(str, + Field_mysql_timestampf::type_handler()->name(), + dec, type_version_mysql56()); + } + const Type_handler *type_handler() const override; +}; + + +class Type_handler_mysql_timestamp2: public Type_handler_timestamp2 +{ +public: + const Type_collection *type_collection() const override + { + return &type_collection_local; + } + Field *make_table_field_from_def(TABLE_SHARE *share, MEM_ROOT *root, + const LEX_CSTRING *name, + const Record_addr &rec, const Bit_addr &bit, + const Column_definition_attributes *attr, + uint32 flags) const override + { + return new (root) + Field_mysql_timestampf(*name, rec, attr->unireg_check, share, + attr->temporal_dec(MAX_DATETIME_WIDTH)); + } + void Column_definition_implicit_upgrade(Column_definition *c) const override + { + /* + Suppress the automatic upgrade depending on opt_mysql56_temporal_format, + derived from Type_handler_timestamp_common. 
+ */ + } +}; + + +static Type_handler_mysql_timestamp2 type_handler_mysql_timestamp2; + + +const Type_handler *Field_mysql_timestampf::type_handler() const +{ + return &type_handler_mysql_timestamp2; +} + + +const Type_handler * +Type_collection_local::aggregate_common(const Type_handler *h1, + const Type_handler *h2) const +{ + if (h1 == h2) + return h1; + + static const Type_aggregator::Pair agg[]= + { + { + &type_handler_timestamp2, + &type_handler_mysql_timestamp2, + &type_handler_mysql_timestamp2 + }, + {NULL,NULL,NULL} + }; + + return Type_aggregator::find_handler_in_array(agg, h1, h2, true); +} + + +static struct st_mariadb_data_type plugin_descriptor_type_mysql_timestamp= +{ + MariaDB_DATA_TYPE_INTERFACE_VERSION, + &type_handler_mysql_timestamp2 +}; + + + +/*************************************************************************/ + +maria_declare_plugin(type_mysql_timestamp) +{ + MariaDB_DATA_TYPE_PLUGIN, // the plugin type (see include/mysql/plugin.h) + &plugin_descriptor_type_mysql_timestamp, // pointer to type-specific plugin descriptor + "type_mysql_timestamp", // plugin name + "MariaDB Corporation", // plugin author + "Data type TYPE_MYSQL_TIMESTAMP", // the plugin description + PLUGIN_LICENSE_GPL, // the plugin license (see include/mysql/plugin.h) + 0, // Pointer to plugin initialization function + 0, // Pointer to plugin deinitialization function + 0x0100, // Numeric version 0xAABB means AA.BB version + NULL, // Status variables + NULL, // System variables + "1.0", // String version representation + MariaDB_PLUGIN_MATURITY_EXPERIMENTAL // Maturity(see include/mysql/plugin.h)*/ +} +maria_declare_plugin_end; diff --git a/sql/field.h b/sql/field.h index 642456b9774..e57a93b6562 100644 --- a/sql/field.h +++ b/sql/field.h @@ -3354,7 +3354,7 @@ public: /** TIMESTAMP(0..6) - MySQL56 version */ -class Field_timestampf final :public Field_timestamp_with_dec { +class Field_timestampf :public Field_timestamp_with_dec { void store_TIMEVAL(const timeval &tv) override; public: Field_timestampf(uchar *ptr_arg, diff --git a/sql/handler.cc b/sql/handler.cc index 1b5212d22de..75d9dd21610 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -7972,6 +7972,9 @@ Compare_keys handler::compare_key_parts(const Field &old_field, concurrent accesses. And it's an overkill to take LOCK_plugin and iterate the whole installed_htons[] array every time. + @note Object victim_thd is not guaranteed to exist after this + function returns. 
+ @param bf_thd brute force THD asking for the abort @param victim_thd victim THD to be aborted @@ -7985,6 +7988,8 @@ int ha_abort_transaction(THD *bf_thd, THD *victim_thd, my_bool signal) if (!WSREP(bf_thd) && !(bf_thd->variables.wsrep_OSU_method == WSREP_OSU_RSU && wsrep_thd_is_toi(bf_thd))) { + mysql_mutex_unlock(&victim_thd->LOCK_thd_data); + mysql_mutex_unlock(&victim_thd->LOCK_thd_kill); DBUG_RETURN(0); } @@ -7996,6 +8001,8 @@ int ha_abort_transaction(THD *bf_thd, THD *victim_thd, my_bool signal) else { WSREP_WARN("Cannot abort InnoDB transaction"); + mysql_mutex_unlock(&victim_thd->LOCK_thd_data); + mysql_mutex_unlock(&victim_thd->LOCK_thd_kill); } DBUG_RETURN(0); diff --git a/sql/handler.h b/sql/handler.h index 77c77c83c0f..f4eaa5d612f 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -45,6 +45,7 @@ #include "sql_sequence.h" #include "mem_root_array.h" #include // pair +#include /* __attribute__ */ class Alter_info; class Virtual_column_info; @@ -1460,9 +1461,9 @@ struct handlerton const char *query, uint query_length, const char *db, const char *table_name); - void (*abort_transaction)(handlerton *hton, THD *bf_thd, - THD *victim_thd, my_bool signal); - int (*set_checkpoint)(handlerton *hton, const XID* xid); + void (*abort_transaction)(handlerton *hton, THD *bf_thd, THD *victim_thd, + my_bool signal) __attribute__((nonnull)); + int (*set_checkpoint)(handlerton *hton, const XID *xid); int (*get_checkpoint)(handlerton *hton, XID* xid); /** Check if the version of the table matches the version in the .frm diff --git a/sql/item_func.h b/sql/item_func.h index 6e714814526..6df3b98276b 100644 --- a/sql/item_func.h +++ b/sql/item_func.h @@ -373,7 +373,7 @@ public: { for (uint i= 0; i < arg_count; i++) { - args[i]->no_rows_in_result(); + args[i]->restore_to_before_no_rows_in_result(); } } void convert_const_compared_to_int_field(THD *thd); diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 41a9f46a05c..4efb1fe8da2 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -1747,6 +1747,11 @@ static void close_connections(void) (void) unlink(mysqld_unix_port); } } + /* + The following is needed to the threads stuck in + setup_connection_thread_globals() + to continue. + */ listen_sockets.free_memory(); mysql_mutex_unlock(&LOCK_start_thread); @@ -2032,6 +2037,7 @@ static void clean_up(bool print_message) end_ssl(); #ifndef EMBEDDED_LIBRARY vio_end(); + listen_sockets.free_memory(); #endif /*!EMBEDDED_LIBRARY*/ #if defined(ENABLED_DEBUG_SYNC) /* End the debug sync facility. See debug_sync.cc. */ diff --git a/sql/service_wsrep.cc b/sql/service_wsrep.cc index dd12149ff48..e1a4a25b27a 100644 --- a/sql/service_wsrep.cc +++ b/sql/service_wsrep.cc @@ -32,6 +32,11 @@ extern "C" void wsrep_thd_LOCK(const THD *thd) mysql_mutex_lock(&thd->LOCK_thd_data); } +extern "C" int wsrep_thd_TRYLOCK(const THD *thd) +{ + return mysql_mutex_trylock(&thd->LOCK_thd_data); +} + extern "C" void wsrep_thd_UNLOCK(const THD *thd) { mysql_mutex_unlock(&thd->LOCK_thd_data); @@ -196,6 +201,7 @@ extern "C" void wsrep_handle_SR_rollback(THD *bf_thd, /* Note: do not store/reset globals before wsrep_bf_abort() call to avoid losing BF thd context. 
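A recurring point in the wsrep changes in this patch: the caller of the abort path now holds the victim's LOCK_thd_kill and LOCK_thd_data, and ha_abort_transaction() must release both on every return, which is why the early bail-out branches gain explicit unlock calls. The sketch below shows that contract with plain mutexes; it is illustrative only and is not the server's THD locking code.

/* Stripped-down sketch of the locking contract: the caller takes the
   victim's "kill" mutex and then its "data" mutex; the abort routine must
   release both on every path, including early returns. Illustrative only. */
#include <mutex>
#include <cstdio>

struct Victim
{
  std::mutex lock_kill;   /* stands in for THD::LOCK_thd_kill */
  std::mutex lock_data;   /* stands in for THD::LOCK_thd_data */
  bool active= true;
};

/* Called with both mutexes held; returns with both released. */
static int abort_transaction(Victim *victim, bool wsrep_on)
{
  if (!wsrep_on)
  {
    /* the early return still has to drop the caller's locks */
    victim->lock_data.unlock();
    victim->lock_kill.unlock();
    return 0;
  }
  std::puts("aborting victim transaction");
  victim->active= false;
  victim->lock_data.unlock();
  victim->lock_kill.unlock();
  return 0;
}

int main()
{
  Victim v;
  v.lock_kill.lock();       /* lock order: kill first, then data */
  v.lock_data.lock();
  abort_transaction(&v, true);
  printf("victim active: %d\n", v.active);
  return 0;
}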
*/ + mysql_mutex_lock(&victim_thd->LOCK_thd_data); if (!(bf_thd && bf_thd != victim_thd)) { DEBUG_SYNC(victim_thd, "wsrep_before_SR_rollback"); @@ -208,6 +214,7 @@ extern "C" void wsrep_handle_SR_rollback(THD *bf_thd, { wsrep_thd_self_abort(victim_thd); } + mysql_mutex_unlock(&victim_thd->LOCK_thd_data); if (bf_thd) { wsrep_store_threadvars(bf_thd); @@ -218,7 +225,7 @@ extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd, my_bool signal) { mysql_mutex_assert_owner(&victim_thd->LOCK_thd_kill); - mysql_mutex_assert_not_owner(&victim_thd->LOCK_thd_data); + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_data); my_bool ret= wsrep_bf_abort(bf_thd, victim_thd); /* Send awake signal if victim was BF aborted or does not @@ -227,19 +234,8 @@ extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd, */ if ((ret || !wsrep_on(victim_thd)) && signal) { - mysql_mutex_lock(&victim_thd->LOCK_thd_data); - - if (victim_thd->wsrep_aborter && victim_thd->wsrep_aborter != bf_thd->thread_id) - { - WSREP_DEBUG("victim is killed already by %llu, skipping awake", - victim_thd->wsrep_aborter); - mysql_mutex_unlock(&victim_thd->LOCK_thd_data); - return false; - } - victim_thd->wsrep_aborter= bf_thd->thread_id; victim_thd->awake_no_mutex(KILL_QUERY_HARD); - mysql_mutex_unlock(&victim_thd->LOCK_thd_data); } else { WSREP_DEBUG("wsrep_thd_bf_abort skipped awake, signal %d", signal); } @@ -368,25 +364,6 @@ extern "C" ulong wsrep_OSU_method_get(const MYSQL_THD thd) return(global_system_variables.wsrep_OSU_method); } -extern "C" bool wsrep_thd_set_wsrep_aborter(THD *bf_thd, THD *victim_thd) -{ - mysql_mutex_assert_owner(&victim_thd->LOCK_thd_data); - if (!bf_thd) - { - victim_thd->wsrep_aborter= 0; - WSREP_DEBUG("wsrep_thd_set_wsrep_aborter resetting wsrep_aborter"); - return false; - } - if (victim_thd->wsrep_aborter && victim_thd->wsrep_aborter != bf_thd->thread_id) - { - return true; - } - victim_thd->wsrep_aborter= bf_thd->thread_id; - WSREP_DEBUG("wsrep_thd_set_wsrep_aborter setting wsrep_aborter %u", - victim_thd->wsrep_aborter); - return false; -} - extern "C" void wsrep_report_bf_lock_wait(const THD *thd, unsigned long long trx_id) { diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 74f9e80df7e..bc1e2362cc7 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -1293,6 +1293,11 @@ void THD::init() wsrep_affected_rows = 0; m_wsrep_next_trx_id = WSREP_UNDEFINED_TRX_ID; wsrep_aborter = 0; + wsrep_abort_by_kill = NOT_KILLED; + wsrep_abort_by_kill_err = 0; +#ifndef DBUG_OFF + wsrep_killed_state = 0; +#endif /* DBUG_OFF */ wsrep_desynced_backup_stage= false; #endif /* WITH_WSREP */ @@ -1641,6 +1646,13 @@ void THD::reset_for_reuse() #endif #ifdef WITH_WSREP wsrep_free_status(this); + wsrep_cs().reset_error(); + wsrep_aborter= 0; + wsrep_abort_by_kill= NOT_KILLED; + wsrep_abort_by_kill_err= 0; +#ifndef DBUG_OFF + wsrep_killed_state= 0; +#endif /* DBUG_OFF */ #endif /* WITH_WSREP */ } @@ -1897,7 +1909,9 @@ void THD::awake_no_mutex(killed_state state_to_set) } /* Interrupt target waiting inside a storage engine. 
*/ - if (state_to_set != NOT_KILLED && !wsrep_is_bf_aborted(this)) + if (state_to_set != NOT_KILLED && + IF_WSREP(!wsrep_is_bf_aborted(this) && wsrep_abort_by_kill == NOT_KILLED, + true)) ha_kill_query(this, thd_kill_level(this)); abort_current_cond_wait(false); @@ -2125,6 +2139,17 @@ void THD::reset_killed() mysql_mutex_unlock(&LOCK_thd_kill); } #ifdef WITH_WSREP + if (WSREP_NNULL(this)) + { + if (wsrep_abort_by_kill != NOT_KILLED) + { + mysql_mutex_assert_not_owner(&LOCK_thd_kill); + mysql_mutex_lock(&LOCK_thd_kill); + wsrep_abort_by_kill= NOT_KILLED; + wsrep_abort_by_kill_err= 0; + mysql_mutex_unlock(&LOCK_thd_kill); + } + } mysql_mutex_assert_not_owner(&LOCK_thd_data); mysql_mutex_lock(&LOCK_thd_data); wsrep_aborter= 0; diff --git a/sql/sql_class.h b/sql/sql_class.h index e538c7c5fd3..cd365a89706 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -5382,7 +5382,14 @@ public: bool wsrep_ignore_table; /* thread who has started kill for this THD protected by LOCK_thd_data*/ my_thread_id wsrep_aborter; - + /* Kill signal used, if thread was killed by manual KILL. Protected by + LOCK_thd_kill. */ + std::atomic wsrep_abort_by_kill; + /* */ + struct err_info* wsrep_abort_by_kill_err; +#ifndef DBUG_OFF + int wsrep_killed_state; +#endif /* DBUG_OFF */ /* true if BF abort is observed in do_command() right after reading client's packet, and if the client has sent PS execute command. */ bool wsrep_delayed_BF_abort; diff --git a/sql/sql_limit.h b/sql/sql_limit.h index 41308bc12db..335aff9d215 100644 --- a/sql/sql_limit.h +++ b/sql/sql_limit.h @@ -61,6 +61,15 @@ class Select_limit_counters with_ties= false; } + /* Send the first row, still honoring offset_limit_cnt */ + void send_first_row() + { + /* Guard against overflow */ + if ((select_limit_cnt= offset_limit_cnt +1 ) == 0) + select_limit_cnt= offset_limit_cnt; + // with_ties= false; Remove // on merge to 10.6 + } + bool is_unlimited() const { return select_limit_cnt == HA_POS_ERROR; } /* diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 3b6f66b5ae7..ef4c7945233 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -7866,7 +7866,7 @@ static bool wsrep_mysql_parse(THD *thd, char *rawbuf, uint length, thd->wsrep_retry_counter < thd->variables.wsrep_retry_autocommit) { #ifdef ENABLED_DEBUG_SYNC - DBUG_EXECUTE_IF("sync.wsrep_retry_autocommit", + DBUG_EXECUTE_IF("sync.wsrep_retry_autocommit", { const char act[]= "now " @@ -9229,23 +9229,20 @@ kill_one_thread(THD *thd, my_thread_id id, killed_state kill_signal, killed_type thd->security_ctx->user_matches(tmp->security_ctx)) #endif /* WITH_WSREP */ { + { #ifdef WITH_WSREP - DEBUG_SYNC(thd, "before_awake_no_mutex"); - if (tmp->wsrep_aborter && tmp->wsrep_aborter != thd->thread_id) - { - /* victim is in hit list already, bail out */ - WSREP_DEBUG("victim %lld has wsrep aborter: %lu, skipping awake()", - id, tmp->wsrep_aborter); - error= 0; - } - else + if (WSREP(tmp)) + { + error = wsrep_kill_thd(thd, tmp, kill_signal); + } + else + { #endif /* WITH_WSREP */ - { - WSREP_DEBUG("kill_one_thread victim: %lld wsrep_aborter %lu" - " by signal %d", - id, tmp->wsrep_aborter, kill_signal); tmp->awake_no_mutex(kill_signal); error= 0; +#ifdef WITH_WSREP + } +#endif /* WITH_WSREP */ } } else @@ -9368,18 +9365,6 @@ static void sql_kill(THD *thd, my_thread_id id, killed_state state, killed_type type) { uint error; -#ifdef WITH_WSREP - if (WSREP(thd)) - { - WSREP_DEBUG("sql_kill called"); - if (thd->wsrep_applier) - { - WSREP_DEBUG("KILL in applying, bailing out here"); - return; - } - 
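Select_limit_counters::send_first_row() above keeps the configured OFFSET but caps the limit at one row past it, and guards against the case where offset_limit_cnt is already the maximum ha_rows value, in which case offset + 1 would wrap to 0 and mean "unlimited". A tiny standalone illustration of that guard (types simplified to uint64_t; not server code):

/* Tiny illustration of the overflow guard in send_first_row(): if the
   offset is already the maximum representable value, offset + 1 wraps to 0,
   which would mean "no limit", so the wrapped value is rejected. Types are
   simplified; in the server these are ha_rows (unsigned 64-bit). */
#include <cstdint>
#include <cstdio>

static uint64_t first_row_limit(uint64_t offset)
{
  uint64_t limit= offset + 1;      /* send one row after the offset */
  if (limit == 0)                  /* wrapped: keep the old (huge) value */
    limit= offset;
  return limit;
}

int main()
{
  printf("offset 10         -> limit %llu\n",
         (unsigned long long) first_row_limit(10));
  printf("offset UINT64_MAX -> limit %llu\n",
         (unsigned long long) first_row_limit(UINT64_MAX));
  return 0;
}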
WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL) - } -#endif /* WITH_WSREP */ if (likely(!(error= kill_one_thread(thd, id, state, type)))) { if (!thd->killed) @@ -9389,11 +9374,6 @@ void sql_kill(THD *thd, my_thread_id id, killed_state state, killed_type type) } else my_error(error, MYF(0), id); -#ifdef WITH_WSREP - return; - wsrep_error_label: - my_error(ER_KILL_DENIED_ERROR, MYF(0), (long long) thd->thread_id); -#endif /* WITH_WSREP */ } @@ -9402,18 +9382,6 @@ sql_kill_user(THD *thd, LEX_USER *user, killed_state state) { uint error; ha_rows rows; -#ifdef WITH_WSREP - if (WSREP(thd)) - { - WSREP_DEBUG("sql_kill_user called"); - if (thd->wsrep_applier) - { - WSREP_DEBUG("KILL in applying, bailing out here"); - return; - } - WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL) - } -#endif /* WITH_WSREP */ switch (error= kill_threads_for_user(thd, user, state, &rows)) { case 0: @@ -9429,11 +9397,6 @@ sql_kill_user(THD *thd, LEX_USER *user, killed_state state) default: my_error(error, MYF(0)); } -#ifdef WITH_WSREP - return; - wsrep_error_label: - my_error(ER_KILL_DENIED_ERROR, MYF(0), (long long) thd->thread_id); -#endif /* WITH_WSREP */ } diff --git a/sql/sql_plugin_services.inl b/sql/sql_plugin_services.inl index 3a66e982e7b..f2b2d08de1d 100644 --- a/sql/sql_plugin_services.inl +++ b/sql/sql_plugin_services.inl @@ -151,6 +151,7 @@ static struct wsrep_service_st wsrep_handler = { wsrep_on, wsrep_prepare_key_for_innodb, wsrep_thd_LOCK, + wsrep_thd_TRYLOCK, wsrep_thd_UNLOCK, wsrep_thd_query, wsrep_thd_retry_counter, @@ -179,7 +180,6 @@ static struct wsrep_service_st wsrep_handler = { wsrep_OSU_method_get, wsrep_thd_has_ignored_error, wsrep_thd_set_ignored_error, - wsrep_thd_set_wsrep_aborter, wsrep_report_bf_lock_wait, wsrep_thd_kill_LOCK, wsrep_thd_kill_UNLOCK, diff --git a/sql/sql_select.cc b/sql/sql_select.cc index d96df60aba3..0998a2deafa 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -150,10 +150,10 @@ static void update_depend_map_for_order(JOIN *join, ORDER *order); static ORDER *remove_const(JOIN *join,ORDER *first_order,COND *cond, bool change_list, bool *simple_order); static int return_zero_rows(JOIN *join, select_result *res, - List &tables, - List &fields, bool send_row, + List *tables, + List *fields, bool send_row, ulonglong select_options, const char *info, - Item *having, List &all_fields); + Item *having, List *all_fields); static COND *build_equal_items(JOIN *join, COND *cond, COND_EQUAL *inherited, List *join_list, @@ -1301,11 +1301,40 @@ int SELECT_LEX::vers_setup_conds(THD *thd, TABLE_LIST *tables) DBUG_RETURN(0); } + /***************************************************************************** Check fields, find best join, do the select and output fields. mysql_select assumes that all tables are already opened *****************************************************************************/ +/* + Check if we have a field reference. If yes, we have to use + mixed_implicit_grouping. +*/ + +static bool check_list_for_field(List *items) +{ + List_iterator_fast select_it(*items); + Item *select_el; + + while ((select_el= select_it++)) + { + if (select_el->with_field()) + return true; + } + return false; +} + +static bool check_list_for_field(ORDER *order) +{ + for (; order; order= order->next) + { + if (order->item[0]->with_field()) + return true; + } + return false; +} + /** Prepare of whole select (including sub queries in future). 
@@ -1387,53 +1416,44 @@ JOIN::prepare(TABLE_LIST *tables_init, COND *conds_init, uint og_num, DBUG_RETURN(-1); /* - TRUE if the SELECT list mixes elements with and without grouping, - and there is no GROUP BY clause. Mixing non-aggregated fields with - aggregate functions in the SELECT list is a MySQL extenstion that - is allowed only if the ONLY_FULL_GROUP_BY sql mode is not set. + mixed_implicit_grouping will be set to TRUE if the SELECT list + mixes elements with and without grouping, and there is no GROUP BY + clause. + Mixing non-aggregated fields with aggregate functions in the + SELECT list or HAVING is a MySQL extension that is allowed only if + the ONLY_FULL_GROUP_BY sql mode is not set. */ mixed_implicit_grouping= false; if ((~thd->variables.sql_mode & MODE_ONLY_FULL_GROUP_BY) && select_lex->with_sum_func && !group_list) { - List_iterator_fast select_it(fields_list); - Item *select_el; /* Element of the SELECT clause, can be an expression. */ - bool found_field_elem= false; - bool found_sum_func_elem= false; - - while ((select_el= select_it++)) + if (check_list_for_field(&fields_list) || + check_list_for_field(order)) { - if (select_el->with_sum_func()) - found_sum_func_elem= true; - if (select_el->with_field()) - found_field_elem= true; - if (found_sum_func_elem && found_field_elem) + List_iterator_fast li(select_lex->leaf_tables); + + mixed_implicit_grouping= true; // mark for future + + while (TABLE_LIST *tbl= li++) { - mixed_implicit_grouping= true; - break; + /* + If the query uses implicit grouping where the select list + contains both aggregate functions and non-aggregate fields, + any non-aggregated field may produce a NULL value. Set all + fields of each table as nullable before semantic analysis to + take into account this change of nullability. + + Note: this loop doesn't touch tables inside merged + semi-joins, because subquery-to-semijoin conversion has not + been done yet. This is intended. + */ + if (tbl->table) + tbl->table->maybe_null= 1; } } } - table_count= select_lex->leaf_tables.elements; - TABLE_LIST *tbl; - List_iterator_fast li(select_lex->leaf_tables); - while ((tbl= li++)) - { - /* - If the query uses implicit grouping where the select list contains both - aggregate functions and non-aggregate fields, any non-aggregated field - may produce a NULL value. Set all fields of each table as nullable before - semantic analysis to take into account this change of nullability. - - Note: this loop doesn't touch tables inside merged semi-joins, because - subquery-to-semijoin conversion has not been done yet. This is intended. 
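The rewritten block above only walks the leaf tables when the SELECT list or ORDER BY really mixes aggregates with non-aggregated fields under implicit grouping; in that case a query with no matching rows still returns one row, and the bare fields come back NULL, which is why every table is marked nullable before semantic analysis. A rough standalone illustration of that behaviour follows (std::optional stands in for a nullable field; this is not server code):

/* Rough illustration of implicit grouping with mixed items, e.g.
   SELECT MAX(a), b FROM t1  with no GROUP BY and ONLY_FULL_GROUP_BY off.
   With zero matching rows the query still returns one row, and the bare
   field b (like the aggregate) comes back NULL, which is why every field
   of the joined tables must be treated as nullable. Not server code. */
#include <optional>
#include <vector>
#include <cstdio>

struct Row { int a, b; };

int main()
{
  std::vector<Row> t1;                       /* empty table */

  std::optional<int> max_a;                  /* MAX(a) */
  std::optional<int> bare_b;                 /* non-aggregated b */
  for (const Row &r : t1)
  {
    if (!max_a || r.a > *max_a)
      max_a= r.a;
    bare_b= r.b;                             /* value from some row */
  }

  /* Implicit grouping: exactly one result row, NULLs when nothing matched */
  printf("MAX(a)=%s  b=%s\n",
         max_a  ? "value" : "NULL",
         bare_b ? "value" : "NULL");
  return 0;
}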
- */ - if (mixed_implicit_grouping && tbl->table) - tbl->table->maybe_null= 1; - } - uint real_og_num= og_num; if (skip_order_by && select_lex != select_lex->master_unit()->global_parameters()) @@ -1446,14 +1466,14 @@ JOIN::prepare(TABLE_LIST *tables_init, COND *conds_init, uint og_num, DBUG_RETURN(-1); ref_ptrs= ref_ptr_array_slice(0); - + enum_parsing_place save_place= thd->lex->current_select->context_analysis_place; thd->lex->current_select->context_analysis_place= SELECT_LIST; { List_iterator_fast it(select_lex->leaf_tables); - while ((tbl= it++)) + while (TABLE_LIST *tbl= it++) { if (tbl->table_function && tbl->table_function->setup(thd, tbl, select_lex_arg)) @@ -4074,7 +4094,7 @@ bool JOIN::make_aggr_tables_info() set_items_ref_array(items0); if (join_tab) join_tab[exec_join_tab_cnt() + aggr_tables - 1].next_select= - setup_end_select_func(this, NULL); + setup_end_select_func(this); group= has_group_by; DBUG_RETURN(false); @@ -4469,13 +4489,7 @@ JOIN::reinit() } } - /* Reset of sum functions */ - if (sum_funcs) - { - Item_sum *func, **func_ptr= sum_funcs; - while ((func= *(func_ptr++))) - func->clear(); - } + clear_sum_funcs(); if (no_rows_in_result_called) { @@ -4758,12 +4772,12 @@ void JOIN::exec_inner() } else { - (void) return_zero_rows(this, result, select_lex->leaf_tables, - *columns_list, + (void) return_zero_rows(this, result, &select_lex->leaf_tables, + columns_list, send_row_on_empty_set(), select_options, zero_result_cause, - having ? having : tmp_having, all_fields); + having ? having : tmp_having, &all_fields); DBUG_VOID_RETURN; } } @@ -15641,10 +15655,36 @@ ORDER *simple_remove_const(ORDER *order, COND *where) } +/* + Set all fields in the table to have a null value + + @param tables Table list +*/ + +static void make_tables_null_complemented(List *tables) +{ + List_iterator ti(*tables); + TABLE_LIST *table; + while ((table= ti++)) + { + /* + Don't touch semi-join materialization tables, as the a join_free() + call may have freed them (and HAVING clause can't have references to + them anyway). + */ + if (!table->is_jtbm()) + { + TABLE *tbl= table->table; + mark_as_null_row(tbl); // Set fields to NULL + } + } +} + + static int -return_zero_rows(JOIN *join, select_result *result, List &tables, - List &fields, bool send_row, ulonglong select_options, - const char *info, Item *having, List &all_fields) +return_zero_rows(JOIN *join, select_result *result, List *tables, + List *fields, bool send_row, ulonglong select_options, + const char *info, Item *having, List *all_fields) { DBUG_ENTER("return_zero_rows"); @@ -15660,24 +15700,15 @@ return_zero_rows(JOIN *join, select_result *result, List &tables, Set all tables to have NULL row. This is needed as we will be evaluating HAVING condition. */ - List_iterator ti(tables); - TABLE_LIST *table; - while ((table= ti++)) - { - /* - Don't touch semi-join materialization tables, as the above join_free() - call has freed them (and HAVING clause can't have references to them - anyway). - */ - if (!table->is_jtbm()) - mark_as_null_row(table->table); // All fields are NULL - } - List_iterator_fast it(all_fields); + make_tables_null_complemented(tables); + + List_iterator_fast it(*all_fields); Item *item; /* Inform all items (especially aggregating) to calculate HAVING correctly, also we will need it for sending results. 
*/ + join->no_rows_in_result_called= 1; while ((item= it++)) item->no_rows_in_result(); if (having && having->val_int() == 0) @@ -15691,12 +15722,12 @@ return_zero_rows(JOIN *join, select_result *result, List &tables, join->thd->limit_found_rows= 0; } - if (!(result->send_result_set_metadata(fields, + if (!(result->send_result_set_metadata(*fields, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))) { bool send_error= FALSE; if (send_row) - send_error= result->send_data_with_check(fields, join->unit, 0) > 0; + send_error= result->send_data_with_check(*fields, join->unit, 0) > 0; if (likely(!send_error)) result->send_eof(); // Should be safe } @@ -15712,49 +15743,42 @@ return_zero_rows(JOIN *join, select_result *result, List &tables, } /** - used only in JOIN::clear (always) and in do_select() - (if there where no matching rows) + Reset table rows to contain a null-complement row (all fields are null) + + Used only in JOIN::clear() and in do_select() if there where no matching rows. @param join JOIN - @param cleared_tables If not null, clear also const tables and mark all - cleared tables in the map. cleared_tables is only - set when called from do_select() when there is a - group function and there where no matching rows. + @param cleared_tables Used to mark all cleared tables in the map. Needed for + unclear_tables() to know which tables to restore to + their original state. */ static void clear_tables(JOIN *join, table_map *cleared_tables) { - /* - must clear only the non-const tables as const tables are not re-calculated. - */ + DBUG_ASSERT(cleared_tables); for (uint i= 0 ; i < join->table_count ; i++) { TABLE *table= join->table[i]; if (table->null_row) continue; // Nothing more to do - if (!(table->map & join->const_table_map) || cleared_tables) + (*cleared_tables)|= (((table_map) 1) << i); + if (table->s->null_bytes) { - if (cleared_tables) - { - (*cleared_tables)|= (((table_map) 1) << i); - if (table->s->null_bytes) - { - /* - Remember null bits for the record so that we can restore the - original const record in unclear_tables() - */ - memcpy(table->record[1], table->null_flags, table->s->null_bytes); - } - } - mark_as_null_row(table); // All fields are NULL + /* + Remember null bits for the record so that we can restore the + original const record in unclear_tables() + */ + memcpy(table->record[1], table->null_flags, table->s->null_bytes); } + mark_as_null_row(table); // All fields are NULL } } /** Reverse null marking for tables and restore null bits. + This return the tables to the state of before clear_tables(). We have to do this because the tables may be re-used in a sub query and the subquery will assume that the const tables contains the original @@ -21493,9 +21517,9 @@ void set_postjoin_aggr_write_func(JOIN_TAB *tab) end_select function to use. This function can't fail. */ -Next_select_func setup_end_select_func(JOIN *join, JOIN_TAB *tab) +Next_select_func setup_end_select_func(JOIN *join) { - TMP_TABLE_PARAM *tmp_tbl= tab ? tab->tmp_table_param : &join->tmp_table_param; + TMP_TABLE_PARAM *tmp_tbl= &join->tmp_table_param; /* Choose method for presenting result to user. 
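clear_tables() above now always records which tables it null-complements and saves each table's null bits so unclear_tables() can put the original const rows back. The sketch below is a self-contained illustration of that save / mark-all-NULL / restore cycle; it folds the save into the mark step for brevity, uses plain arrays instead of TABLE, and also shows the narrower status handling applied by the table.h change later in this patch (only the null-row bit is cleared on restore).

/* Self-contained sketch of the null-row bookkeeping refactored above:
   the current null_flags are copied aside (record[1] in the real code),
   every column is marked NULL for the "no matching rows" pass, and the
   saved bytes are copied back afterwards. Plain arrays stand in for TABLE;
   this is not server code. */
#include <cstdint>
#include <cstring>
#include <cstdio>

static const unsigned STATUS_NULL_ROW= 1u << 0;
static const unsigned STATUS_NO_RECORD= 1u << 1;

struct FakeTable
{
  uint8_t  null_flags[2];  /* one bit per nullable column */
  uint8_t  saved[2];       /* stands in for table->record[1] */
  unsigned status;
};

static void mark_as_null_row(FakeTable *t)
{
  memcpy(t->saved, t->null_flags, sizeof t->null_flags); /* remember bits */
  memset(t->null_flags, 0xFF, sizeof t->null_flags);     /* all fields NULL */
  t->status|= STATUS_NULL_ROW;
}

static void unmark_as_null_row(FakeTable *t)
{
  memcpy(t->null_flags, t->saved, sizeof t->null_flags); /* restore bits */
  t->status&= ~STATUS_NULL_ROW;   /* keep any other status bits intact */
}

int main()
{
  FakeTable t= {{0x02, 0x00}, {0, 0}, STATUS_NO_RECORD};
  mark_as_null_row(&t);
  printf("cleared:  flags %02x %02x status %u\n",
         t.null_flags[0], t.null_flags[1], t.status);
  unmark_as_null_row(&t);
  printf("restored: flags %02x %02x status %u\n",
         t.null_flags[0], t.null_flags[1], t.status);
  return 0;
}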
Use end_send_group @@ -21565,7 +21589,7 @@ do_select(JOIN *join, Procedure *procedure) join->duplicate_rows= join->send_records=0; if (join->only_const_tables() && !join->need_tmp) { - Next_select_func end_select= setup_end_select_func(join, NULL); + Next_select_func end_select= setup_end_select_func(join); /* HAVING will be checked after processing aggregate functions, @@ -22040,6 +22064,7 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records) } } + /* Restore state if mark_as_null_row() have been called */ if (join_tab->last_inner) { JOIN_TAB *last_inner_tab= join_tab->last_inner; @@ -23452,11 +23477,18 @@ end_send_group(JOIN *join, JOIN_TAB *join_tab, bool end_of_records) { int idx= -1; enum_nested_loop_state ok_code= NESTED_LOOP_OK; + /* + join_tab can be 0 in the case all tables are const tables and we did not + need a temporary table to store the result. + In this case we use the original given fields, which is stored in + join->fields. + */ List *fields= join_tab ? (join_tab-1)->fields : join->fields; DBUG_ENTER("end_send_group"); if (!join->items3.is_null() && !join->set_group_rpa) { + /* Move ref_pointer_array to points to items3 */ join->set_group_rpa= true; join->set_items_ref_array(join->items3); } @@ -23464,10 +23496,12 @@ end_send_group(JOIN *join, JOIN_TAB *join_tab, bool end_of_records) if (!join->first_record || end_of_records || (idx=test_if_group_changed(join->group_fields)) >= 0) { + if (!join->group_sent && (join->first_record || (end_of_records && !join->group && !join->group_optimized_away))) { + table_map cleared_tables= (table_map) 0; if (join->procedure) join->procedure->end_group(); /* Test if there was a group change. */ @@ -23492,11 +23526,13 @@ end_send_group(JOIN *join, JOIN_TAB *join_tab, bool end_of_records) /* Reset all sum functions on group change. 
*/ if (!join->first_record) { - List_iterator_fast it(*join->fields); - Item *item; /* No matching rows for group function */ - join->clear(); + List_iterator_fast it(*fields); + Item *item; + join->no_rows_in_result_called= 1; + + join->clear(&cleared_tables); while ((item= it++)) item->no_rows_in_result(); } @@ -23524,7 +23560,14 @@ end_send_group(JOIN *join, JOIN_TAB *join_tab, bool end_of_records) if (join->rollup_send_data((uint) (idx+1))) error= 1; } - } + if (join->no_rows_in_result_called) + { + /* Restore null tables to original state */ + join->no_rows_in_result_called= 0; + if (cleared_tables) + unclear_tables(join, &cleared_tables); + } + } if (unlikely(error > 0)) DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */ if (end_of_records) @@ -23841,6 +23884,7 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), { if (join->first_record || (end_of_records && !join->group)) { + table_map cleared_tables= (table_map) 0; if (join->procedure) join->procedure->end_group(); int send_group_parts= join->send_group_parts; @@ -23849,7 +23893,7 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), if (!join->first_record) { /* No matching rows for group function */ - join->clear(); + join->clear(&cleared_tables); } copy_sum_funcs(join->sum_funcs, join->sum_funcs_end[send_group_parts]); @@ -23872,6 +23916,8 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), DBUG_RETURN(NESTED_LOOP_ERROR); } } + if (cleared_tables) + unclear_tables(join, &cleared_tables); if (end_of_records) goto end; } @@ -25603,7 +25649,7 @@ JOIN_TAB::remove_duplicates() !(join->select_options & OPTION_FOUND_ROWS)) { // only const items with no OPTION_FOUND_ROWS - join->unit->lim.set_single_row(); // Only send first row + join->unit->lim.send_first_row(); // Only send first row my_free(sortorder); DBUG_RETURN(false); } @@ -28007,11 +28053,8 @@ int JOIN::rollup_write_data(uint idx, TMP_TABLE_PARAM *tmp_table_param_arg, (end_send_group/end_write_group) */ -void JOIN::clear() +void inline JOIN::clear_sum_funcs() { - clear_tables(this, 0); - copy_fields(&tmp_table_param); - if (sum_funcs) { Item_sum *func, **func_ptr= sum_funcs; @@ -28021,6 +28064,22 @@ void JOIN::clear() } +/* + Prepare for returning 'empty row' when there is no matching row. + + - Mark all tables with mark_as_null_row() + - Make a copy of of all simple SELECT items + - Reset all sum functions to NULL or 0. 
+*/ + +void JOIN::clear(table_map *cleared_tables) +{ + clear_tables(this, cleared_tables); + copy_fields(&tmp_table_param); + clear_sum_funcs(); +} + + /** Print an EXPLAIN line with all NULLs and given message in the 'Extra' column diff --git a/sql/sql_select.h b/sql/sql_select.h index 2d3cfa5c7fa..c40c04193e3 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -227,7 +227,7 @@ enum sj_strategy_enum typedef enum_nested_loop_state (*Next_select_func)(JOIN *, struct st_join_table *, bool); -Next_select_func setup_end_select_func(JOIN *join, JOIN_TAB *tab); +Next_select_func setup_end_select_func(JOIN *join); int rr_sequential(READ_RECORD *info); int read_record_func_for_rr_and_unpack(READ_RECORD *info); Item *remove_pushed_top_conjuncts(THD *thd, Item *cond); @@ -1722,7 +1722,8 @@ public: void join_free(); /** Cleanup this JOIN, possibly for reuse */ void cleanup(bool full); - void clear(); + void clear(table_map *cleared_tables); + void inline clear_sum_funcs(); bool send_row_on_empty_set() { return (do_send_rows && implicit_grouping && !group_optimized_away && diff --git a/sql/sql_type.h b/sql/sql_type.h index 52c17d61d2e..8ebdb38db49 100644 --- a/sql/sql_type.h +++ b/sql/sql_type.h @@ -7574,8 +7574,9 @@ extern Named_type_handler type_handler_time; extern Named_type_handler type_handler_time2; extern Named_type_handler type_handler_datetime; extern Named_type_handler type_handler_datetime2; -extern Named_type_handler type_handler_timestamp; -extern Named_type_handler type_handler_timestamp2; + +extern MYSQL_PLUGIN_IMPORT Named_type_handler type_handler_timestamp; +extern MYSQL_PLUGIN_IMPORT Named_type_handler type_handler_timestamp2; extern Type_handler_interval_DDhhmmssff type_handler_interval_DDhhmmssff; diff --git a/sql/table.h b/sql/table.h index d2ea6cfd9e8..c7401f78f65 100644 --- a/sql/table.h +++ b/sql/table.h @@ -3376,10 +3376,16 @@ inline void mark_as_null_row(TABLE *table) bfill(table->null_flags,table->s->null_bytes,255); } +/* + Restore table to state before mark_as_null_row() call. + This assumes that the caller has restored table->null_flags, + as is done in unclear_tables(). 
+*/ + inline void unmark_as_null_row(TABLE *table) { - table->null_row=0; - table->status= STATUS_NO_RECORD; + table->null_row= 0; + table->status&= ~STATUS_NULL_ROW; } bool is_simple_order(ORDER *order); diff --git a/sql/wsrep_dummy.cc b/sql/wsrep_dummy.cc index 9bfaf9285f3..e1508884075 100644 --- a/sql/wsrep_dummy.cc +++ b/sql/wsrep_dummy.cc @@ -56,6 +56,11 @@ my_bool wsrep_on(const THD *) void wsrep_thd_LOCK(const THD *) { } +int wsrep_thd_TRYLOCK(const THD *) +{ + return 0; +} + void wsrep_thd_UNLOCK(const THD *) { } @@ -154,8 +159,6 @@ void wsrep_thd_set_ignored_error(THD*, my_bool) { } ulong wsrep_OSU_method_get(const THD*) { return 0;} -bool wsrep_thd_set_wsrep_aborter(THD*, THD*) -{ return 0;} void wsrep_report_bf_lock_wait(const THD*, unsigned long long) diff --git a/sql/wsrep_high_priority_service.cc b/sql/wsrep_high_priority_service.cc index 53ef20f3e78..0a2fa273723 100644 --- a/sql/wsrep_high_priority_service.cc +++ b/sql/wsrep_high_priority_service.cc @@ -510,6 +510,7 @@ int Wsrep_high_priority_service::log_dummy_write_set(const wsrep::ws_handle& ws_ m_thd->wait_for_prior_commit(); } + WSREP_DEBUG("checkpointing dummy write set %lld", ws_meta.seqno().get()); wsrep_set_SE_checkpoint(ws_meta.gtid(), wsrep_gtid_server.gtid()); if (!WSREP_EMULATE_BINLOG(m_thd)) diff --git a/sql/wsrep_mysqld.cc b/sql/wsrep_mysqld.cc index 9c11c802cc8..b1aea36881c 100644 --- a/sql/wsrep_mysqld.cc +++ b/sql/wsrep_mysqld.cc @@ -52,6 +52,7 @@ #include "log_event.h" #include "sql_connect.h" #include "thread_cache.h" +#include "debug_sync.h" #include @@ -3107,6 +3108,20 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, request_thd, granted_thd); ticket->wsrep_report(wsrep_debug); + DEBUG_SYNC(request_thd, "before_wsrep_thd_abort"); + DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", { + const char act[]= "now " + "SIGNAL sync.before_wsrep_thd_abort_reached " + "WAIT_FOR signal.before_wsrep_thd_abort"; + DBUG_ASSERT(!debug_sync_set_action(request_thd, STRING_WITH_LEN(act))); + };); + + /* Here we will call wsrep_abort_transaction so we should hold + THD::LOCK_thd_data to protect victim from concurrent usage + and THD::LOCK_thd_kill to protect from disconnect or delete. 
+ + */ + mysql_mutex_lock(&granted_thd->LOCK_thd_kill); mysql_mutex_lock(&granted_thd->LOCK_thd_data); if (wsrep_thd_is_toi(granted_thd) || @@ -3118,13 +3133,11 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, wsrep_thd_query(request_thd)); THD_STAGE_INFO(request_thd, stage_waiting_isolation); ticket->wsrep_report(wsrep_debug); - mysql_mutex_unlock(&granted_thd->LOCK_thd_data); } else if (wsrep_thd_is_SR(granted_thd) && !wsrep_thd_is_SR(request_thd)) { WSREP_MDL_LOG(INFO, "MDL conflict, DDL vs SR", schema, schema_len, request_thd, granted_thd); - mysql_mutex_unlock(&granted_thd->LOCK_thd_data); WSREP_DEBUG("wsrep_handle_mdl_conflict DDL vs SR for %s", wsrep_thd_query(request_thd)); THD_STAGE_INFO(request_thd, stage_waiting_isolation); @@ -3136,6 +3149,7 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, request_thd, granted_thd); ticket->wsrep_report(true); mysql_mutex_unlock(&granted_thd->LOCK_thd_data); + mysql_mutex_unlock(&granted_thd->LOCK_thd_kill); unireg_abort(1); } } @@ -3146,7 +3160,6 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, wsrep_thd_query(request_thd)); THD_STAGE_INFO(request_thd, stage_waiting_ddl); ticket->wsrep_report(wsrep_debug); - mysql_mutex_unlock(&granted_thd->LOCK_thd_data); if (granted_thd->current_backup_stage != BACKUP_FINISHED && wsrep_check_mode(WSREP_MODE_BF_MARIABACKUP)) { @@ -3160,7 +3173,6 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, wsrep_thd_query(request_thd)); THD_STAGE_INFO(request_thd, stage_waiting_isolation); ticket->wsrep_report(wsrep_debug); - mysql_mutex_unlock(&granted_thd->LOCK_thd_data); wsrep_abort_thd(request_thd, granted_thd, 1); } else @@ -3174,7 +3186,6 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, if (granted_thd->wsrep_trx().active()) { - mysql_mutex_unlock(&granted_thd->LOCK_thd_data); wsrep_abort_thd(request_thd, granted_thd, 1); } else @@ -3183,10 +3194,9 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, Granted_thd is likely executing with wsrep_on=0. If the requesting thd is BF, BF abort and wait. 
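The hunk above now takes THD::LOCK_thd_kill before THD::LOCK_thd_data around the whole MDL-conflict handling and releases them in reverse order on every exit path, instead of unlocking LOCK_thd_data early in each branch. A minimal standalone sketch of that acquire/release discipline, using plain std::mutex in place of mysql_mutex_t (the ThdLocks type and with_victim_locked() are illustrative names, not MariaDB APIs):

#include <mutex>

struct ThdLocks {
  std::mutex lock_kill;   // stands in for THD::LOCK_thd_kill
  std::mutex lock_data;   // stands in for THD::LOCK_thd_data
};

// Every aborter takes the kill lock first, then the data lock, and
// releases them in the opposite order, so two aborters can never
// deadlock against each other on the same victim.
template <typename F>
void with_victim_locked(ThdLocks &victim, F &&abort_action) {
  victim.lock_kill.lock();
  victim.lock_data.lock();
  abort_action();               // victim state is stable while both are held
  victim.lock_data.unlock();
  victim.lock_kill.unlock();
}

int main() {
  ThdLocks victim;
  with_victim_locked(victim, [] { /* e.g. mark the victim as killed */ });
}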
*/ - mysql_mutex_unlock(&granted_thd->LOCK_thd_data); - if (wsrep_thd_is_BF(request_thd, FALSE)) { + granted_thd->awake_no_mutex(KILL_QUERY_HARD); ha_abort_transaction(request_thd, granted_thd, TRUE); } else @@ -3195,10 +3205,14 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, schema, schema_len, request_thd, granted_thd); ticket->wsrep_report(true); + mysql_mutex_unlock(&granted_thd->LOCK_thd_data); + mysql_mutex_unlock(&granted_thd->LOCK_thd_kill); unireg_abort(1); } } } + mysql_mutex_unlock(&granted_thd->LOCK_thd_data); + mysql_mutex_unlock(&granted_thd->LOCK_thd_kill); } else { @@ -3210,13 +3224,17 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, static bool abort_replicated(THD *thd) { bool ret_code= false; + mysql_mutex_lock(&thd->LOCK_thd_kill); + mysql_mutex_lock(&thd->LOCK_thd_data); if (thd->wsrep_trx().state() == wsrep::transaction::s_committing) { WSREP_DEBUG("aborting replicated trx: %llu", (ulonglong)(thd->real_id)); - (void)wsrep_abort_thd(thd, thd, TRUE); + wsrep_abort_thd(thd, thd, TRUE); ret_code= true; } + mysql_mutex_unlock(&thd->LOCK_thd_data); + mysql_mutex_unlock(&thd->LOCK_thd_kill); return ret_code; } diff --git a/sql/wsrep_server_service.cc b/sql/wsrep_server_service.cc index 9be6af71c56..52a0a9753c1 100644 --- a/sql/wsrep_server_service.cc +++ b/sql/wsrep_server_service.cc @@ -148,9 +148,13 @@ void Wsrep_server_service::release_high_priority_service(wsrep::high_priority_se wsrep_delete_threadvars(); } -void Wsrep_server_service::background_rollback(wsrep::client_state& client_state) +void Wsrep_server_service::background_rollback( + wsrep::unique_lock &lock WSREP_UNUSED, + wsrep::client_state &client_state) { - Wsrep_client_state& cs= static_cast(client_state); + DBUG_ASSERT(lock.owns_lock()); + Wsrep_client_state &cs= static_cast(client_state); + mysql_mutex_assert_owner(&cs.thd()->LOCK_thd_data); wsrep_fire_rollbacker(cs.thd()); } diff --git a/sql/wsrep_server_service.h b/sql/wsrep_server_service.h index 168e98206e3..0fc48402024 100644 --- a/sql/wsrep_server_service.h +++ b/sql/wsrep_server_service.h @@ -46,7 +46,8 @@ public: void release_high_priority_service(wsrep::high_priority_service*); - void background_rollback(wsrep::client_state&); + void background_rollback(wsrep::unique_lock &, + wsrep::client_state &); void bootstrap(); void log_message(enum wsrep::log::level, const char*); diff --git a/sql/wsrep_thd.cc b/sql/wsrep_thd.cc index 420a25dd2ae..682e64859b4 100644 --- a/sql/wsrep_thd.cc +++ b/sql/wsrep_thd.cc @@ -307,48 +307,9 @@ void wsrep_fire_rollbacker(THD *thd) } } - -int wsrep_abort_thd(THD *bf_thd, - THD *victim_thd, - my_bool signal) +static bool wsrep_bf_abort_low(THD *bf_thd, THD *victim_thd) { - DBUG_ENTER("wsrep_abort_thd"); - - mysql_mutex_lock(&victim_thd->LOCK_thd_data); - - /* Note that when you use RSU node is desynced from cluster, thus WSREP(thd) - might not be true. 
- */ - if ((WSREP_NNULL(bf_thd) || - ((WSREP_ON || bf_thd->variables.wsrep_OSU_method == WSREP_OSU_RSU) && - wsrep_thd_is_toi(bf_thd))) && - !wsrep_thd_is_aborting(victim_thd)) - { - WSREP_DEBUG("wsrep_abort_thd, by: %llu, victim: %llu", - (long long)bf_thd->real_id, (long long)victim_thd->real_id); - mysql_mutex_unlock(&victim_thd->LOCK_thd_data); - ha_abort_transaction(bf_thd, victim_thd, signal); - DBUG_RETURN(1); - } - else - { - WSREP_DEBUG("wsrep_abort_thd not effective: bf %llu victim %llu " - "wsrep %d wsrep_on %d RSU %d TOI %d aborting %d", - (long long)bf_thd->real_id, (long long)victim_thd->real_id, - WSREP_NNULL(bf_thd), WSREP_ON, - bf_thd->variables.wsrep_OSU_method == WSREP_OSU_RSU, - wsrep_thd_is_toi(bf_thd), - wsrep_thd_is_aborting(victim_thd)); - } - - mysql_mutex_unlock(&victim_thd->LOCK_thd_data); - DBUG_RETURN(1); -} - -bool wsrep_bf_abort(THD* bf_thd, THD* victim_thd) -{ - WSREP_LOG_THD(bf_thd, "BF aborter before"); - WSREP_LOG_THD(victim_thd, "victim before"); + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_data); #ifdef ENABLED_DEBUG_SYNC DBUG_EXECUTE_IF("sync.wsrep_bf_abort", @@ -362,6 +323,85 @@ bool wsrep_bf_abort(THD* bf_thd, THD* victim_thd) };); #endif + wsrep::seqno bf_seqno(bf_thd->wsrep_trx().ws_meta().seqno()); + bool ret; + + { + /* Adopt the lock, it is being held by the caller. */ + Wsrep_mutex wsm{&victim_thd->LOCK_thd_data}; + wsrep::unique_lock lock{wsm, std::adopt_lock}; + + if (wsrep_thd_is_toi(bf_thd)) + { + ret= victim_thd->wsrep_cs().total_order_bf_abort(lock, bf_seqno); + } + else + { + DBUG_ASSERT(WSREP(victim_thd) ? victim_thd->wsrep_trx().active() : 1); + ret= victim_thd->wsrep_cs().bf_abort(lock, bf_seqno); + } + if (ret) + { + /* BF abort should be allowed only once by wsrep-lib.*/ + DBUG_ASSERT(victim_thd->wsrep_aborter == 0); + victim_thd->wsrep_aborter= bf_thd->thread_id; + wsrep_bf_aborts_counter++; + } + lock.release(); /* No unlock at the end of the scope. */ + } + + /* Sanity check for wsrep-lib calls to return with LOCK_thd_data held. */ + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_data); + + return ret; +} + +void wsrep_abort_thd(THD *bf_thd, + THD *victim_thd, + my_bool signal) +{ + DBUG_ENTER("wsrep_abort_thd"); + + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_kill); + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_data); + + /* Note that when you use RSU node is desynced from cluster, thus WSREP(thd) + might not be true. 
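wsrep_bf_abort_low() above wraps a mutex that the caller already holds in a wsrep::unique_lock with std::adopt_lock, then calls release() so the end of the scope does not unlock it. A hedged, standalone illustration of the same adopt/release idiom using only std::unique_lock (nothing here is the wsrep-lib API; bf_abort_low() is an invented stand-in):

#include <mutex>
#include <cassert>

std::mutex thd_data_mutex;   // assume the caller has already locked this

// Called with thd_data_mutex held; must return with it still held.
bool bf_abort_low() {
  // Adopt ownership of the already-locked mutex without locking it again.
  std::unique_lock<std::mutex> lock(thd_data_mutex, std::adopt_lock);
  assert(lock.owns_lock());

  bool ret = true;             // pretend the BF abort succeeded

  // Give up ownership so the destructor does NOT unlock the mutex;
  // the caller keeps holding it after we return.
  lock.release();
  return ret;
}

int main() {
  thd_data_mutex.lock();
  bf_abort_low();
  thd_data_mutex.unlock();     // still locked after the call, as required
}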
+ */ + if ((WSREP(bf_thd) + || ((WSREP_ON || bf_thd->variables.wsrep_OSU_method == WSREP_OSU_RSU) + && wsrep_thd_is_toi(bf_thd)) + || bf_thd->lex->sql_command == SQLCOM_KILL) + && !wsrep_thd_is_aborting(victim_thd) && + wsrep_bf_abort_low(bf_thd, victim_thd) && + !victim_thd->wsrep_cs().is_rollbacker_active()) + { + WSREP_DEBUG("wsrep_abort_thd, by: %llu, victim: %llu", + (long long)bf_thd->real_id, (long long)victim_thd->real_id); + victim_thd->awake_no_mutex(KILL_QUERY_HARD); + ha_abort_transaction(bf_thd, victim_thd, signal); + } + else + { + WSREP_DEBUG("wsrep_abort_thd not effective: bf %llu victim %llu " + "wsrep %d wsrep_on %d RSU %d TOI %d aborting %d", + (long long)bf_thd->real_id, (long long)victim_thd->real_id, + WSREP_NNULL(bf_thd), WSREP_ON, + bf_thd->variables.wsrep_OSU_method == WSREP_OSU_RSU, + wsrep_thd_is_toi(bf_thd), + wsrep_thd_is_aborting(victim_thd)); + } + + DBUG_VOID_RETURN; +} + +bool wsrep_bf_abort(THD* bf_thd, THD* victim_thd) +{ + WSREP_LOG_THD(bf_thd, "BF aborter before"); + WSREP_LOG_THD(victim_thd, "victim before"); + + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_data); + if (WSREP(victim_thd) && !victim_thd->wsrep_trx().active()) { WSREP_DEBUG("wsrep_bf_abort, BF abort for non active transaction." @@ -384,30 +424,84 @@ bool wsrep_bf_abort(THD* bf_thd, THD* victim_thd) wsrep_check_mode(WSREP_MODE_BF_MARIABACKUP)) { WSREP_DEBUG("killing connection for non wsrep session"); - mysql_mutex_lock(&victim_thd->LOCK_thd_data); victim_thd->awake_no_mutex(KILL_CONNECTION); - mysql_mutex_unlock(&victim_thd->LOCK_thd_data); } return false; } - bool ret; - wsrep::seqno bf_seqno(bf_thd->wsrep_trx().ws_meta().seqno()); + return wsrep_bf_abort_low(bf_thd, victim_thd); +} - if (wsrep_thd_is_toi(bf_thd)) +uint wsrep_kill_thd(THD *thd, THD *victim_thd, killed_state kill_signal) +{ + DBUG_ENTER("wsrep_kill_thd"); + DBUG_ASSERT(WSREP(victim_thd)); + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_kill); + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_data); + using trans= wsrep::transaction; + auto trx_state= victim_thd->wsrep_trx().state(); +#ifndef DBUG_OFF + victim_thd->wsrep_killed_state= trx_state; +#endif /* DBUG_OFF */ + /* + Already killed or in commit codepath. Mark the victim as killed, + the killed status will be restored in wsrep_after_commit() and + will be processed after the commit is over. In case of multiple + KILLs happened on commit codepath, the last one will be effective. + */ + if (victim_thd->wsrep_abort_by_kill || + trx_state == trans::s_preparing || + trx_state == trans::s_committing || + trx_state == trans::s_ordered_commit) { - ret= victim_thd->wsrep_cs().total_order_bf_abort(bf_seqno); + victim_thd->wsrep_abort_by_kill= kill_signal; + DBUG_RETURN(0); } - else + /* + Mark killed victim_thd with kill_signal so that awake_no_mutex does + not dive into storage engine. We use ha_abort_transaction() + to do the storage engine part for wsrep THDs. + */ + DEBUG_SYNC(thd, "wsrep_kill_before_awake_no_mutex"); + victim_thd->wsrep_abort_by_kill= kill_signal; + victim_thd->awake_no_mutex(kill_signal); + /* ha_abort_transaction() releases tmp->LOCK_thd_kill, so tmp + is not safe to access anymore. */ + ha_abort_transaction(thd, victim_thd, 1); + DBUG_RETURN(0); +} + +void wsrep_backup_kill_for_commit(THD *thd) +{ + DBUG_ASSERT(WSREP(thd)); + mysql_mutex_assert_owner(&thd->LOCK_thd_kill); + DBUG_ASSERT(thd->killed != NOT_KILLED); + mysql_mutex_lock(&thd->LOCK_thd_data); + /* If the transaction will roll back, keep the killed state. 
+ For must replay, the replay will happen in different THD context + which is high priority and cannot be killed. The owning thread will + pick the killed state in after statement processing. */ + if (thd->wsrep_trx().state() != wsrep::transaction::s_cert_failed && + thd->wsrep_trx().state() != wsrep::transaction::s_must_abort && + thd->wsrep_trx().state() != wsrep::transaction::s_aborting && + thd->wsrep_trx().state() != wsrep::transaction::s_must_replay) { - DBUG_ASSERT(WSREP(victim_thd) ? victim_thd->wsrep_trx().active() : 1); - ret= victim_thd->wsrep_cs().bf_abort(bf_seqno); + thd->wsrep_abort_by_kill= thd->killed; + thd->wsrep_abort_by_kill_err= thd->killed_err; + thd->killed= NOT_KILLED; + thd->killed_err= 0; } - if (ret) - { - wsrep_bf_aborts_counter++; - } - return ret; + mysql_mutex_unlock(&thd->LOCK_thd_data); +} + +void wsrep_restore_kill_after_commit(THD *thd) +{ + DBUG_ASSERT(WSREP(thd)); + mysql_mutex_assert_owner(&thd->LOCK_thd_kill); + thd->killed= thd->wsrep_abort_by_kill; + thd->killed_err= thd->wsrep_abort_by_kill_err; + thd->wsrep_abort_by_kill= NOT_KILLED; + thd->wsrep_abort_by_kill_err= 0; } int wsrep_create_threadvars() diff --git a/sql/wsrep_thd.h b/sql/wsrep_thd.h index 0ce612d6097..f3790887bf5 100644 --- a/sql/wsrep_thd.h +++ b/sql/wsrep_thd.h @@ -88,10 +88,39 @@ bool wsrep_create_appliers(long threads, bool mutex_protected=false); void wsrep_create_rollbacker(); bool wsrep_bf_abort(THD* bf_thd, THD* victim_thd); -int wsrep_abort_thd(THD *bf_thd, +/* + Abort transaction for victim_thd. This function is called from + MDL BF abort codepath. +*/ +void wsrep_abort_thd(THD *bf_thd, THD *victim_thd, my_bool signal) __attribute__((nonnull(1,2))); +/** + Kill wsrep connection with kill_signal. Object thd is not + guaranteed to exist anymore when this function returns. + + Asserts that the caller holds victim_thd->LOCK_thd_kill, + victim_thd->LOCK_thd_data. + + @param thd THD object for connection that executes the KILL. + @param victim_thd THD object for connection to be killed. + @param kill_signal Kill signal. + + @return Zero if the kill was successful, otherwise non-zero error code. + */ +uint wsrep_kill_thd(THD *thd, THD *victim_thd, killed_state kill_signal); + +/* + Backup kill status for commit. + */ +void wsrep_backup_kill_for_commit(THD *); + +/* + Restore KILL status after commit. + */ +void wsrep_restore_kill_after_commit(THD *); + /* Helper methods to deal with thread local storage. 
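wsrep_backup_kill_for_commit() and wsrep_restore_kill_after_commit() above stash the killed state away while the transaction is being ordered and committed, and put it back afterwards so the KILL is processed after the statement. A simplified sketch of that save/clear/restore pattern (the Session type and its field names are invented for illustration; this is not the THD layout):

enum killed_state { NOT_KILLED = 0, KILL_QUERY = 1, KILL_CONNECTION = 2 };

struct Session {
  killed_state killed = NOT_KILLED;
  int killed_err = 0;
  // parking area for the kill while the commit is ordered
  killed_state abort_by_kill = NOT_KILLED;
  int abort_by_kill_err = 0;
};

// Before commit: hide the kill so the commit path is not interrupted.
void backup_kill_for_commit(Session &s) {
  s.abort_by_kill = s.killed;
  s.abort_by_kill_err = s.killed_err;
  s.killed = NOT_KILLED;
  s.killed_err = 0;
}

// After commit: put the kill back so it is handled after the statement.
void restore_kill_after_commit(Session &s) {
  s.killed = s.abort_by_kill;
  s.killed_err = s.abort_by_kill_err;
  s.abort_by_kill = NOT_KILLED;
  s.abort_by_kill_err = 0;
}

int main() {
  Session s;
  s.killed = KILL_QUERY;
  backup_kill_for_commit(s);      // commit path sees NOT_KILLED
  restore_kill_after_commit(s);   // kill is processed afterwards
  return s.killed == KILL_QUERY ? 0 : 1;
}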
The purpose of these methods is to hide the details of thread diff --git a/sql/wsrep_trans_observer.h b/sql/wsrep_trans_observer.h index 812452f451a..6b656f84c78 100644 --- a/sql/wsrep_trans_observer.h +++ b/sql/wsrep_trans_observer.h @@ -256,6 +256,11 @@ static inline int wsrep_before_prepare(THD* thd, bool all) thd->wsrep_trx().ws_meta().gtid(), wsrep_gtid_server.gtid()); } + + mysql_mutex_lock(&thd->LOCK_thd_kill); + if (thd->killed) wsrep_backup_kill_for_commit(thd); + mysql_mutex_unlock(&thd->LOCK_thd_kill); + DBUG_RETURN(ret); } @@ -325,6 +330,11 @@ static inline int wsrep_before_commit(THD* thd, bool all) wsrep_gtid_server.gtid()); wsrep_register_for_group_commit(thd); } + + mysql_mutex_lock(&thd->LOCK_thd_kill); + if (thd->killed) wsrep_backup_kill_for_commit(thd); + mysql_mutex_unlock(&thd->LOCK_thd_kill); + DBUG_RETURN(ret); } @@ -343,7 +353,8 @@ static inline int wsrep_before_commit(THD* thd, bool all) static inline int wsrep_ordered_commit(THD* thd, bool all) { DBUG_ENTER("wsrep_ordered_commit"); - WSREP_DEBUG("wsrep_ordered_commit: %d", wsrep_is_real(thd, all)); + WSREP_DEBUG("wsrep_ordered_commit: %d %lld", wsrep_is_real(thd, all), + (long long) wsrep_thd_trx_seqno(thd)); DBUG_ASSERT(wsrep_run_commit_hook(thd, all)); DBUG_RETURN(thd->wsrep_cs().ordered_commit()); } @@ -451,10 +462,18 @@ int wsrep_after_statement(THD* thd) wsrep::to_c_string(thd->wsrep_cs().state()), wsrep::to_c_string(thd->wsrep_cs().mode()), wsrep::to_c_string(thd->wsrep_cs().transaction().state())); - DBUG_RETURN((thd->wsrep_cs().state() != wsrep::client_state::s_none && + int ret= ((thd->wsrep_cs().state() != wsrep::client_state::s_none && thd->wsrep_cs().mode() == Wsrep_client_state::m_local) && !thd->internal_transaction() ? thd->wsrep_cs().after_statement() : 0); + + if (wsrep_is_active(thd)) + { + mysql_mutex_lock(&thd->LOCK_thd_kill); + wsrep_restore_kill_after_commit(thd); + mysql_mutex_unlock(&thd->LOCK_thd_kill); + } + DBUG_RETURN(ret); } static inline void wsrep_after_apply(THD* thd) diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 922c64c1d58..64637b71632 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2140,6 +2140,8 @@ void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr) } block->page.lock.x_lock(); + if (block->page.is_ibuf_exist()) + ibuf_merge_or_delete_for_page(nullptr, page_id, block->page.zip_size()); #ifdef BTR_CUR_HASH_ADAPT if (block->index) btr_search_drop_page_hash_index(block, false); diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 1260145ed1c..510ad02256d 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -372,7 +372,7 @@ void buf_dblwr_t::recover() const uint32_t space_id= page_get_space_id(page); const page_id_t page_id(space_id, page_no); - if (recv_sys.lsn < lsn) + if (recv_sys.scanned_lsn < lsn) { ib::info() << "Ignoring a doublewrite copy of page " << page_id << " with future log sequence number " << lsn; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index acfef5daad2..90263757c19 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -966,11 +966,19 @@ uint32_t fil_space_t::flush_freed(bool writable) mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); mysql_mutex_assert_not_owner(&buf_pool.mutex); - freed_range_mutex.lock(); - if (freed_ranges.empty() || log_sys.get_flushed_lsn() < get_last_freed_lsn()) + for (;;) { + 
freed_range_mutex.lock(); + if (freed_ranges.empty()) + { + freed_range_mutex.unlock(); + return 0; + } + const lsn_t flush_lsn= last_freed_lsn; + if (log_sys.get_flushed_lsn() >= flush_lsn) + break; freed_range_mutex.unlock(); - return 0; + log_write_up_to(flush_lsn, true); } const unsigned physical{physical_size()}; @@ -2430,6 +2438,7 @@ static void buf_flush_page_cleaner() else if (buf_pool.ran_out()) { buf_pool.page_cleaner_set_idle(false); + buf_pool.get_oldest_modification(0); mysql_mutex_unlock(&buf_pool.flush_list_mutex); n= srv_max_io_capacity; mysql_mutex_lock(&buf_pool.mutex); @@ -2583,6 +2592,7 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init() /** Flush the buffer pool on shutdown. */ ATTRIBUTE_COLD void buf_flush_buffer_pool() { + ut_ad(!os_aio_pending_reads()); ut_ad(!buf_page_cleaner_is_active); ut_ad(!buf_flush_sync_lsn); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 19219ea026b..6a3cc46262d 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -1086,7 +1086,11 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, ut_a(!zip || !bpage->oldest_modification()); ut_ad(bpage->zip_size()); - + /* Skip consistency checks if the page was freed. + In recovery, we could get a sole FREE_PAGE record + and nothing else, for a ROW_FORMAT=COMPRESSED page. + Its contents would be garbage. */ + if (!bpage->is_freed()) switch (fil_page_get_type(page)) { case FIL_PAGE_TYPE_ALLOCATED: case FIL_PAGE_INODE: @@ -1217,6 +1221,7 @@ void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state) buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + recv_sys.free_corrupted_page(id); mysql_mutex_lock(&mutex); hash_lock.lock(); @@ -1241,8 +1246,6 @@ void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state) buf_LRU_block_free_hashed_page(reinterpret_cast(bpage)); mysql_mutex_unlock(&mutex); - - recv_sys.free_corrupted_page(id); } /** Update buf_pool.LRU_old_ratio. diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 39ecd5de27f..cf76a9bd93a 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -265,9 +265,6 @@ buf_read_page_low( buf_page_t* bpage; if (buf_dblwr.is_inside(page_id)) { - ib::error() << "Trying to read doublewrite buffer page " - << page_id; - ut_ad(0); space->release(); return DB_PAGE_CORRUPTED; } @@ -525,7 +522,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) /* We will check that almost all pages in the area have been accessed in the desired order. 
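The rewritten fil_space_t::flush_freed() loop above no longer gives up when the log is not yet durable: it releases freed_range_mutex, forces the log up to the needed LSN, and retries the check under the mutex. A generic sketch of that "check under lock, drop the lock for the blocking work, retry" shape; std::mutex and the globals stand in for the InnoDB objects, and write_log_up_to() is a trivial placeholder:

#include <mutex>
#include <cstdint>

std::mutex range_mutex;          // stands in for freed_range_mutex
bool ranges_empty = false;       // whether any freed ranges are recorded
uint64_t last_freed_lsn = 500;   // LSN of the latest free operation
uint64_t flushed_lsn = 100;      // LSN already durable in the log

void write_log_up_to(uint64_t lsn) { flushed_lsn = lsn; }  // pretend flush

// Returns true with range_mutex held when the freed ranges may be acted
// on; returns false (mutex released) when there is nothing to do.
bool wait_until_freed_ranges_durable() {
  for (;;) {
    range_mutex.lock();
    if (ranges_empty) {
      range_mutex.unlock();
      return false;
    }
    const uint64_t need = last_freed_lsn;
    if (flushed_lsn >= need)
      return true;               // caller unlocks range_mutex when done
    range_mutex.unlock();        // never do blocking log I/O under the mutex
    write_log_up_to(need);
  }
}

int main() {
  if (wait_until_freed_ranges_durable())
    range_mutex.unlock();
}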
*/ - const bool descending= page_id == low; + const bool descending= page_id != low; if (!descending && page_id != high_1) /* This is not a border page of the area */ @@ -555,7 +552,7 @@ fail: uint32_t{buf_pool.read_ahead_area}); page_id_t new_low= low, new_high_1= high_1; unsigned prev_accessed= 0; - for (page_id_t i= low; i != high_1; ++i) + for (page_id_t i= low; i <= high_1; ++i) { buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold()); transactional_shared_lock_guard g @@ -583,12 +580,21 @@ failed: if (prev == FIL_NULL || next == FIL_NULL) goto fail; page_id_t id= page_id; - if (descending && next - 1 == page_id.page_no()) - id.set_page_no(prev); - else if (!descending && prev + 1 == page_id.page_no()) - id.set_page_no(next); + if (descending) + { + if (id == high_1) + ++id; + else if (next - 1 != page_id.page_no()) + goto fail; + else + id.set_page_no(prev); + } else - goto fail; /* Successor or predecessor not in the right order */ + { + if (prev + 1 != page_id.page_no()) + goto fail; + id.set_page_no(next); + } new_low= id - (id.page_no() % buf_read_ahead_area); new_high_1= new_low + (buf_read_ahead_area - 1); @@ -620,7 +626,7 @@ failed: /* If we got this far, read-ahead can be sensible: do it */ count= 0; for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; - new_low != new_high_1; ++new_low) + new_low <= new_high_1; ++new_low) { if (ibuf_bitmap_page(new_low, zip_size)) continue; @@ -649,60 +655,35 @@ failed: return count; } -/** @return whether a page has been freed */ -inline bool fil_space_t::is_freed(uint32_t page) +/** Schedule a page for recovery. +@param space tablespace +@param page_id page identifier +@param recs log records +@param init page initialization, or nullptr if the page needs to be read */ +void buf_read_recover(fil_space_t *space, const page_id_t page_id, + page_recv_t &recs, recv_init *init) { - std::lock_guard freed_lock(freed_range_mutex); - return freed_ranges.contains(page); -} - -/** Issues read requests for pages which recovery wants to read in. -@param space_id tablespace identifier -@param page_nos page numbers to read, in ascending order */ -void buf_read_recv_pages(uint32_t space_id, st_::span page_nos) -{ - fil_space_t* space = fil_space_t::get(space_id); - - if (!space) { - /* The tablespace is missing or unreadable: do nothing */ - return; - } - - const ulint zip_size = space->zip_size(); - - for (ulint i = 0; i < page_nos.size(); i++) { - - /* Ignore if the page already present in freed ranges. 
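buf_read_ahead_linear() above works on an aligned block of buf_read_ahead_area pages and, after following a page's neighbour pointer, re-derives the boundaries of the next area from the new page number (new_low, new_high_1). A small arithmetic sketch of how those inclusive area bounds fall out of a page number; the area size 64 is just an example value:

#include <cstdint>
#include <cstdio>

// Align a page number down to the start of its read-ahead area and
// compute the last page of that area (both bounds inclusive).
void read_ahead_area(uint32_t page_no, uint32_t area,
                     uint32_t *low, uint32_t *high_1) {
  *low = page_no - (page_no % area);
  *high_1 = *low + (area - 1);
}

int main() {
  uint32_t low, high_1;
  read_ahead_area(200, 64, &low, &high_1);
  // page 200 with a 64-page area -> pages 192..255
  std::printf("area: %u..%u\n", low, high_1);
  // only a border page (low or high_1) triggers the next read-ahead
}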
*/ - if (space->is_freed(page_nos[i])) { - continue; - } - - const page_id_t cur_page_id(space_id, page_nos[i]); - - ulint limit = 0; - for (ulint j = 0; j < buf_pool.n_chunks; j++) { - limit += buf_pool.chunks[j].size / 2; - } - - if (os_aio_pending_reads() >= limit) { - os_aio_wait_until_no_pending_reads(false); - } - - space->reacquire(); - switch (buf_read_page_low(space, false, BUF_READ_ANY_PAGE, - cur_page_id, zip_size, true)) { - case DB_SUCCESS: case DB_SUCCESS_LOCKED_REC: - break; - default: - sql_print_error("InnoDB: Recovery failed to read page " - UINT32PF " from %s", - cur_page_id.page_no(), - space->chain.start->name); - } - } - - - DBUG_PRINT("ib_buf", ("recovery read (%zu pages) for %s", - page_nos.size(), space->chain.start->name)); - space->release(); + ut_ad(space->id == page_id.space()); + space->reacquire(); + const ulint zip_size= space->zip_size(); + + if (init) + { + if (buf_page_t *bpage= buf_page_init_for_read(BUF_READ_ANY_PAGE, page_id, + zip_size, true)) + { + ut_ad(bpage->in_file()); + os_fake_read(IORequest{bpage, (buf_tmp_buffer_t*) &recs, + UT_LIST_GET_FIRST(space->chain), + IORequest::READ_ASYNC}, ptrdiff_t(init)); + } + } + else if (dberr_t err= buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + page_id, zip_size, true)) + { + if (err != DB_SUCCESS_LOCKED_REC) + sql_print_error("InnoDB: Recovery failed to read page " + UINT32PF " from %s", + page_id.page_no(), space->chain.start->name); + } } diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 39f5943d5a4..2e8d87dad9e 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -204,7 +204,17 @@ static const dict_table_schema_t table_stats_schema = { {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597}, - {"last_update", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 4}, + /* + Don't check the DATA_UNSIGNED flag in last_update. + It presents if the server is running in a pure MariaDB installation, + because MariaDB's Field_timestampf::flags has UNSIGNED_FLAG. + But DATA_UNSIGNED misses when the server starts on a MySQL-5.7 directory + (during a migration), because MySQL's Field_timestampf::flags does not + have UNSIGNED_FLAG. + This is fine not to check DATA_UNSIGNED, because Field_timestampf + in both MariaDB and MySQL support only non-negative time_t values. + */ + {"last_update", DATA_INT, DATA_NOT_NULL, 4}, {"n_rows", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, {"clustered_index_size", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, {"sum_of_other_index_sizes", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, @@ -218,7 +228,11 @@ static const dict_table_schema_t index_stats_schema = {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597}, {"index_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, - {"last_update", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 4}, + /* + Don't check the DATA_UNSIGNED flag in last_update. + See comments about last_update in table_stats_schema above. 
+ */ + {"last_update", DATA_INT, DATA_NOT_NULL, 4}, {"stat_name", DATA_VARMYSQL, DATA_NOT_NULL, 64*3}, {"stat_value", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, {"sample_size", DATA_INT, DATA_UNSIGNED, 8}, diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index be313140225..eb0fb3601a1 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -447,7 +447,9 @@ static bool fil_node_open_file(fil_node_t *node) } } - return fil_node_open_file_low(node); + /* The node can be opened beween releasing and acquiring fil_system.mutex + in the above code */ + return node->is_open() || fil_node_open_file_low(node); } /** Close the file handle. */ @@ -1953,8 +1955,8 @@ err_exit: FIL_TYPE_TABLESPACE, crypt_data, mode, true)) { fil_node_t* node = space->add(path, file, size, false, true); - mysql_mutex_unlock(&fil_system.mutex); IF_WIN(node->find_metadata(), node->find_metadata(file, true)); + mysql_mutex_unlock(&fil_system.mutex); mtr.start(); mtr.set_named_space(space); ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS); @@ -2775,53 +2777,55 @@ func_exit: #include -/** Callback for AIO completion */ -void fil_aio_callback(const IORequest &request) +void IORequest::write_complete() const { ut_ad(fil_validate_skip()); - ut_ad(request.node); + ut_ad(node); + ut_ad(is_write()); - if (!request.bpage) + if (!bpage) { ut_ad(!srv_read_only_mode); - if (request.type == IORequest::DBLWR_BATCH) - buf_dblwr.flush_buffered_writes_completed(request); + if (type == IORequest::DBLWR_BATCH) + buf_dblwr.flush_buffered_writes_completed(*this); else - ut_ad(request.type == IORequest::WRITE_ASYNC); -write_completed: - request.node->complete_write(); - } - else if (request.is_write()) - { - buf_page_write_complete(request); - goto write_completed; + ut_ad(type == IORequest::WRITE_ASYNC); } else + buf_page_write_complete(*this); + + node->complete_write(); + node->space->release(); +} + +void IORequest::read_complete() const +{ + ut_ad(fil_validate_skip()); + ut_ad(node); + ut_ad(is_read()); + ut_ad(bpage); + + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in fil_system.sys_space, we have to be very careful not to + introduce deadlocks. We never close fil_system.sys_space data files + and never issue asynchronous reads of change buffer pages. */ + const page_id_t id(bpage->id()); + + if (dberr_t err= bpage->read_complete(*node)) { - ut_ad(request.is_read()); - - /* IMPORTANT: since i/o handling for reads will read also the insert - buffer in fil_system.sys_space, we have to be very careful not to - introduce deadlocks. We never close fil_system.sys_space data - files and never issue asynchronous reads of change buffer pages. 
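The fil_node_open_file() change above adds a re-check of node->is_open(), because another thread may have opened the node while fil_system.mutex was released and re-acquired. A minimal sketch of that re-validation pattern after re-taking a mutex (std::mutex and the Node type are stand-ins, not the fil_system types):

#include <mutex>

struct Node { bool open = false; };

std::mutex sys_mutex;            // stands in for fil_system.mutex

bool open_file_low(Node &n) {    // performs the actual open
  n.open = true;
  return true;
}

// Called with sys_mutex held; drops it while waiting, then re-takes it.
bool open_file(Node &n) {
  sys_mutex.unlock();
  // ... another thread may open the node while the mutex is released ...
  sys_mutex.lock();
  // Re-check: the node can have been opened between releasing and
  // re-acquiring the mutex, so do not open it a second time.
  return n.open || open_file_low(n);
}

int main() {
  Node n;
  sys_mutex.lock();
  bool ok = open_file(n);
  sys_mutex.unlock();
  return ok ? 0 : 1;
}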
*/ - const page_id_t id(request.bpage->id()); - - if (dberr_t err= request.bpage->read_complete(*request.node)) + if (recv_recovery_is_on() && !srv_force_recovery) { - if (recv_recovery_is_on() && !srv_force_recovery) - { - mysql_mutex_lock(&recv_sys.mutex); - recv_sys.set_corrupt_fs(); - mysql_mutex_unlock(&recv_sys.mutex); - } - - if (err != DB_FAIL) - ib::error() << "Failed to read page " << id.page_no() - << " from file '" << request.node->name << "': " << err; + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.set_corrupt_fs(); + mysql_mutex_unlock(&recv_sys.mutex); } + + if (err != DB_FAIL) + ib::error() << "Failed to read page " << id.page_no() + << " from file '" << node->name << "': " << err; } - request.node->space->release(); + node->space->release(); } /** Flush to disk the writes in file spaces of the given type diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index c6044b201fe..48415f2a2dc 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -1284,23 +1284,20 @@ static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr) + header->page.frame, frag_n_used - 1); } + mtr->free(*space, static_cast(offset)); + xdes_set_free(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr); + ut_ad(err == DB_SUCCESS); + if (!xdes_get_n_used(descr)) { /* The extent has become free: move it to another list */ err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes, xoffset, mtr); - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - return err; - } - err = fsp_free_extent(space, offset, mtr); - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - return err; + if (err == DB_SUCCESS) { + err = fsp_free_extent(space, offset, mtr); } } - mtr->free(*space, static_cast(offset)); - xdes_set_free(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr); - - return DB_SUCCESS; + return err; } /** @return Number of segment inodes which fit on a single page */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 342c51d2fe5..374f536e91a 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -945,7 +945,8 @@ static SHOW_VAR innodb_status_variables[]= { {"buffer_pool_read_ahead", &buf_pool.stat.n_ra_pages_read, SHOW_SIZE_T}, {"buffer_pool_read_ahead_evicted", &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T}, - {"buffer_pool_read_requests", &buf_pool.stat.n_page_gets, SHOW_SIZE_T}, + {"buffer_pool_read_requests", + &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T}, {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T}, {"buffer_pool_write_requests", &buf_pool.flush_list_requests, SHOW_SIZE_T}, @@ -1917,8 +1918,9 @@ static void innodb_disable_internal_writes(bool disable) sst_enable_innodb_writes(); } -static void wsrep_abort_transaction(handlerton*, THD *, THD *, my_bool); -static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid); +static void wsrep_abort_transaction(handlerton *, THD *, THD *, my_bool) + __attribute__((nonnull)); +static int innobase_wsrep_set_checkpoint(handlerton *hton, const XID *xid); static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); #endif /* WITH_WSREP */ @@ -18622,36 +18624,45 @@ void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id) wsrep_thd_client_mode_str(vthd), wsrep_thd_transaction_state_str(vthd), wsrep_thd_query(vthd)); - /* Mark transaction as a victim for Galera abort */ - vtrx->lock.set_wsrep_victim(); - 
if (!wsrep_thd_set_wsrep_aborter(bf_thd, vthd)) - aborting= true; - else - WSREP_DEBUG("kill transaction skipped due to wsrep_aborter set"); + aborting= true; } } mysql_mutex_unlock(&lock_sys.wait_mutex); vtrx->mutex_unlock(); } - wsrep_thd_UNLOCK(vthd); - if (aborting) + + DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); + if (aborting && wsrep_thd_bf_abort(bf_thd, vthd, true)) { + /* Need to grab mutexes again to ensure that the trx is still in + right state. */ + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + vtrx->mutex_lock(); + /* if victim is waiting for some other lock, we have to cancel that waiting */ - lock_sys.cancel_lock_wait_for_trx(vtrx); - - DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); - if (!wsrep_thd_bf_abort(bf_thd, vthd, true)) + if (vtrx->id == trx_id) { - wsrep_thd_LOCK(vthd); - wsrep_thd_set_wsrep_aborter(NULL, vthd); - wsrep_thd_UNLOCK(vthd); - - WSREP_DEBUG("wsrep_thd_bf_abort has failed, victim %lu will survive", - thd_get_thread_id(vthd)); + switch (vtrx->state) { + default: + break; + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + lock_sys.cancel_lock_wait_for_wsrep_bf_abort(vtrx); + } } + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + vtrx->mutex_unlock(); } + else + { + WSREP_DEBUG("wsrep_thd_bf_abort has failed, victim %lu will survive", + thd_get_thread_id(vthd)); + } + wsrep_thd_UNLOCK(vthd); wsrep_thd_kill_UNLOCK(vthd); } } @@ -18659,68 +18670,50 @@ void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id) /** This function forces the victim transaction to abort. Aborting the transaction does NOT end it, it still has to be rolled back. + The caller must lock LOCK_thd_kill and LOCK_thd_data. + @param bf_thd brute force THD asking for the abort @param victim_thd victim THD to be aborted - - @return 0 victim was aborted - @return -1 victim thread was aborted (no transaction) */ -static -void -wsrep_abort_transaction( - handlerton*, - THD *bf_thd, - THD *victim_thd, - my_bool signal) +static void wsrep_abort_transaction(handlerton *, THD *bf_thd, THD *victim_thd, + my_bool signal) { - DBUG_ENTER("wsrep_abort_transaction"); - ut_ad(bf_thd); - ut_ad(victim_thd); + DBUG_ENTER("wsrep_abort_transaction"); + ut_ad(bf_thd); + ut_ad(victim_thd); - wsrep_thd_kill_LOCK(victim_thd); - wsrep_thd_LOCK(victim_thd); - trx_t* victim_trx= thd_to_trx(victim_thd); - wsrep_thd_UNLOCK(victim_thd); + trx_t *victim_trx= thd_to_trx(victim_thd); - WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %s", - wsrep_thd_query(bf_thd), - wsrep_thd_query(victim_thd), - wsrep_thd_transaction_state_str(victim_thd)); + WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %s", + wsrep_thd_query(bf_thd), wsrep_thd_query(victim_thd), + wsrep_thd_transaction_state_str(victim_thd)); - if (victim_trx) { - victim_trx->lock.set_wsrep_victim(); + if (!victim_trx) + { + WSREP_DEBUG("abort transaction: victim did not exist"); + DBUG_VOID_RETURN; + } - wsrep_thd_LOCK(victim_thd); - bool aborting= !wsrep_thd_set_wsrep_aborter(bf_thd, victim_thd); - wsrep_thd_UNLOCK(victim_thd); - if (aborting) { - DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); - DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", - { - const char act[]= - "now " - "SIGNAL sync.before_wsrep_thd_abort_reached " - "WAIT_FOR signal.before_wsrep_thd_abort"; - DBUG_ASSERT(!debug_sync_set_action(bf_thd, - STRING_WITH_LEN(act))); - };); - wsrep_thd_bf_abort(bf_thd, victim_thd, signal); - } - } else { - DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", - { - const char 
act[]= - "now " - "SIGNAL sync.before_wsrep_thd_abort_reached " - "WAIT_FOR signal.before_wsrep_thd_abort"; - DBUG_ASSERT(!debug_sync_set_action(bf_thd, - STRING_WITH_LEN(act))); - };); - wsrep_thd_bf_abort(bf_thd, victim_thd, signal); - } + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + victim_trx->mutex_lock(); - wsrep_thd_kill_UNLOCK(victim_thd); - DBUG_VOID_RETURN; + switch (victim_trx->state) { + default: + break; + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + /* Cancel lock wait if the victim is waiting for a lock in InnoDB. + The transaction which is blocked somewhere else (e.g. waiting + for next command or MDL) has been interrupted by THD::awake_no_mutex() + on server level before calling this function. */ + lock_sys.cancel_lock_wait_for_wsrep_bf_abort(victim_trx); + } + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + victim_trx->mutex_unlock(); + + DBUG_VOID_RETURN; } static diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index 13039e4d07f..12557b08ebe 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -2365,6 +2365,7 @@ tablespace_deleted: } const ulint zip_size = s->zip_size(), size = s->size; + s->x_lock(); s->release(); mtr_t mtr; @@ -2382,13 +2383,17 @@ tablespace_deleted: || !page_is_leaf(block->page.frame); mtr.commit(); if (err == DB_TABLESPACE_DELETED) { + s->x_unlock(); goto tablespace_deleted; } if (!remove) { + s->x_unlock(); continue; } } + s->x_unlock(); + if (srv_shutdown_state == SRV_SHUTDOWN_NONE || srv_fast_shutdown) { continue; @@ -2417,7 +2422,7 @@ tablespace_deleted: /* Prevent an infinite loop, by removing entries from the change buffer in the case the bitmap bits were wrongly clear even though buffered changes exist. */ - ibuf_delete_recs(page_id_t(space_ids[i], page_nos[i])); + ibuf_delete_recs(page_id_t(space_id, page_nos[i])); } } @@ -4195,25 +4200,26 @@ dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block, ibuf_mtr_commit(&mtr); - if (bitmap_bits - && DB_SUCCESS + if (!bitmap_bits) { + done: + /* No changes are buffered for this page. */ + space->release(); + return DB_SUCCESS; + } + + if (!block + || DB_SUCCESS == fseg_page_is_allocated(space, page_id.page_no())) { ibuf_mtr_start(&mtr); mtr.set_named_space(space); ibuf_reset_bitmap(block, page_id, zip_size, &mtr); ibuf_mtr_commit(&mtr); - bitmap_bits = 0; if (!block || btr_page_get_index_id(block->page.frame) != DICT_IBUF_ID_MIN + IBUF_SPACE_ID) { ibuf_delete_recs(page_id); } - } - - if (!bitmap_bits) { - /* No changes are buffered for this page. 
*/ - space->release(); - return DB_SUCCESS; + goto done; } } diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index af676da8214..957810a021e 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -75,8 +75,7 @@ struct buf_pool_info_t ulint flush_list_len; /*!< Length of buf_pool.flush_list */ ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages pending decompress */ - ulint n_pend_reads; /*!< buf_pool.n_pend_reads, pages - pending read */ + ulint n_pend_reads; /*!< os_aio_pending_reads() */ ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */ ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH LIST */ diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index 4ec8938c689..3dd085dda5c 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -102,10 +102,13 @@ which could result in a deadlock if the OS does not support asynchronous io. ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf); -/** Issue read requests for pages that need to be recovered. -@param space_id tablespace identifier -@param page_nos page numbers to read, in ascending order */ -void buf_read_recv_pages(uint32_t space_id, st_::span page_nos); +/** Schedule a page for recovery. +@param space tablespace +@param page_id page identifier +@param recs log records +@param init page initialization, or nullptr if the page needs to be read */ +void buf_read_recover(fil_space_t *space, const page_id_t page_id, + page_recv_t &recs, recv_init *init); /** @name Modes used in read-ahead @{ */ /** read only pages belonging to the insert buffer tree */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 73e80d77b56..35ac728b5ea 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -638,8 +638,6 @@ public: /** Close all tablespace files at shutdown */ static void close_all(); - /** @return last_freed_lsn */ - lsn_t get_last_freed_lsn() { return last_freed_lsn; } /** Update last_freed_lsn */ void update_last_freed_lsn(lsn_t lsn) { last_freed_lsn= lsn; } diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 16acd031177..e8299bb1189 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -955,6 +955,10 @@ public: /** Cancel possible lock waiting for a transaction */ static void cancel_lock_wait_for_trx(trx_t *trx); +#ifdef WITH_WSREP + /** Cancel lock waiting for a wsrep BF abort. */ + static void cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx); +#endif /* WITH_WSREP */ }; /** The lock system */ diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index e787d81e8c2..e642b501409 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -38,9 +38,9 @@ Created 9/20/1997 Heikki Tuuri #define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on) ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) -/** Apply any buffered redo log to a page that was just read from a data file. -@param[in,out] space tablespace -@param[in,out] bpage buffer pool page +/** Apply any buffered redo log to a page. 
+@param space tablespace +@param bpage buffer pool page @return whether the page was recovered correctly */ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage); @@ -49,17 +49,6 @@ of first system tablespace page @return error code or DB_SUCCESS */ dberr_t recv_recovery_from_checkpoint_start(); -/** Whether to store redo log records in recv_sys.pages */ -enum store_t { - /** Do not store redo log records. */ - STORE_NO, - /** Store redo log records. */ - STORE_YES, - /** Store redo log records if the tablespace exists. */ - STORE_IF_EXISTS -}; - - /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier @param[in] type file operation redo log type @@ -125,21 +114,15 @@ struct recv_dblwr_t list pages; }; -/** the recovery state and buffered records for a page */ +/** recv_sys.pages entry; protected by recv_sys.mutex */ struct page_recv_t { - /** Recovery state; protected by recv_sys.mutex */ - enum - { - /** not yet processed */ - RECV_NOT_PROCESSED, - /** not processed; the page will be reinitialized */ - RECV_WILL_NOT_READ, - /** page is being read */ - RECV_BEING_READ, - /** log records are being applied on the page */ - RECV_BEING_PROCESSED - } state= RECV_NOT_PROCESSED; + /** Recovery status: 0=not in progress, 1=log is being applied, + -1=log has been applied and the entry may be erased. + Transitions from 1 to -1 are NOT protected by recv_sys.mutex. */ + Atomic_relaxed being_processed{0}; + /** Whether reading the page will be skipped */ + bool skip_read= false; /** Latest written byte offset when applying the log records. @see mtr_t::m_last_offset */ uint16_t last_offset= 1; @@ -162,6 +145,9 @@ struct page_recv_t head= recs; tail= recs; } + /** Remove the last records for the page + @param start_lsn start of the removed log */ + ATTRIBUTE_COLD void rewind(lsn_t start_lsn); /** @return the last log snippet */ const log_rec_t* last() const { return tail; } @@ -180,8 +166,8 @@ struct page_recv_t iterator begin() { return head; } iterator end() { return NULL; } bool empty() const { ut_ad(!head == !tail); return !head; } - /** Clear and free the records; @see recv_sys_t::alloc() */ - inline void clear(); + /** Clear and free the records; @see recv_sys_t::add() */ + void clear(); } log; /** Trim old log records for a page. @@ -190,21 +176,27 @@ struct page_recv_t inline bool trim(lsn_t start_lsn); /** Ignore any earlier redo log records for this page. */ inline void will_not_read(); - /** @return whether the log records for the page are being processed */ - bool is_being_processed() const { return state == RECV_BEING_PROCESSED; } +}; + +/** A page initialization operation that was parsed from the redo log */ +struct recv_init +{ + /** log sequence number of the page initialization */ + lsn_t lsn; + /** Whether btr_page_create() avoided a read of the page. + At the end of the last recovery batch, mark_ibuf_exist() + will mark pages for which this flag is set. 
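page_recv_t above replaces the four-value recovery state enum with a small signed counter (0 = not in progress, 1 = log being applied, -1 = applied and erasable) plus a separate skip_read flag. A hedged sketch of such a tri-state flag using std::atomic with relaxed ordering; PageRecv and the helper names mirror the comment above, not the exact MariaDB type:

#include <atomic>

struct PageRecv {
  // 0 = not being processed, 1 = log is being applied,
  // -1 = log applied, entry may be erased by garbage collection.
  std::atomic<int> being_processed{0};
  bool skip_read = false;   // page will be initialized, no read needed
};

void start_processing(PageRecv &p) {
  // set under the recovery mutex in the real code
  p.being_processed.store(1, std::memory_order_relaxed);
}

void finish_processing(PageRecv &p) {
  // the 1 -> -1 transition is not protected by the mutex
  p.being_processed.store(-1, std::memory_order_relaxed);
}

bool can_erase(const PageRecv &p) {
  return p.being_processed.load(std::memory_order_relaxed) < 0;
}

int main() {
  PageRecv p;
  start_processing(p);
  finish_processing(p);
  return can_erase(p) ? 0 : 1;
}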
*/ + bool created; }; /** Recovery system data structure */ struct recv_sys_t { - /** mutex protecting apply_log_recs and page_recv_t::state */ - mysql_mutex_t mutex; + using init= recv_init; + + /** mutex protecting this as well as some of page_recv_t */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; private: - /** condition variable for - !apply_batch_on || pages.empty() || found_corrupt_log || found_corrupt_fs */ - pthread_cond_t cond; - /** whether recv_apply_hashed_log_recs() is running */ - bool apply_batch_on; /** set when finding a corrupt log block or record, or there is a log parsing buffer overflow */ bool found_corrupt_log; @@ -226,6 +218,8 @@ public: size_t offset; /** log sequence number of the first non-parsed record */ lsn_t lsn; + /** log sequence number of the last parsed mini-transaction */ + lsn_t scanned_lsn; /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */ lsn_t file_checkpoint; /** the time when progress was last reported */ @@ -238,6 +232,9 @@ public: map pages; private: + /** iterator to pages, used by parse() */ + map::iterator pages_it; + /** Process a record that indicates that a tablespace size is being shrunk. @param page_id first page that is not in the file @param lsn log sequence number of the shrink operation */ @@ -257,30 +254,42 @@ public: /** The contents of the doublewrite buffer */ recv_dblwr_t dblwr; - /** Last added LSN to pages, before switching to STORE_NO */ - lsn_t last_stored_lsn= 0; - inline void read(os_offset_t offset, span buf); inline size_t files_size(); void close_files() { files.clear(); files.shrink_to_fit(); } + /** Advance pages_it if it matches the iterator */ + void pages_it_invalidate(const map::iterator &p) + { + mysql_mutex_assert_owner(&mutex); + if (pages_it == p) + pages_it++; + } + /** Invalidate pages_it if it points to the given tablespace */ + void pages_it_invalidate(uint32_t space_id) + { + mysql_mutex_assert_owner(&mutex); + if (pages_it != pages.end() && pages_it->first.space() == space_id) + pages_it= pages.end(); + } + private: /** Attempt to initialize a page based on redo log records. - @param page_id page identifier - @param p iterator pointing to page_id + @param p iterator @param mtr mini-transaction @param b pre-allocated buffer pool block + @param init page initialization @return the recovered block @retval nullptr if the page cannot be initialized based on log records @retval -1 if the page cannot be recovered due to corruption */ - inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p, - mtr_t &mtr, buf_block_t *b); + inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr, + buf_block_t *b, init &init); /** Attempt to initialize a page based on redo log records. @param page_id page identifier @return the recovered block @retval nullptr if the page cannot be initialized based on log records @retval -1 if the page cannot be recovered due to corruption */ - buf_block_t *recover_low(const page_id_t page_id); + ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id); /** All found log files (multiple ones are possible if we are upgrading from before MariaDB Server 10.5.1) */ @@ -289,10 +298,27 @@ private: /** Base node of the redo block list. List elements are linked via buf_block_t::unzip_LRU. */ UT_LIST_BASE_NODE_T(buf_block_t) blocks; + + /** Allocate a block from the buffer pool for recv_sys.pages */ + ATTRIBUTE_COLD buf_block_t *add_block(); + + /** Wait for buffer pool to become available. 
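recv_sys_t::pages_it_invalidate() above keeps a cached iterator into the pages map usable by advancing or resetting it before entries are erased. A standalone sketch of the same idea with a plain std::map; the key type and structure are simplified and not the recv_sys layout:

#include <map>
#include <cstdint>

struct RecvMap {
  std::map<uint64_t, int> pages;                         // page id -> buffered log
  std::map<uint64_t, int>::iterator it = pages.end();    // cached cursor

  // Call before erasing p so the cached cursor never dangles.
  void invalidate(std::map<uint64_t, int>::iterator p) {
    if (it == p)
      ++it;                    // step past the entry that is about to go away
  }

  void erase(std::map<uint64_t, int>::iterator p) {
    invalidate(p);
    pages.erase(p);
  }
};

int main() {
  RecvMap m;
  m.pages = {{1, 10}, {2, 20}, {3, 30}};
  m.it = m.pages.find(2);
  m.erase(m.pages.find(2));    // cached cursor now points at key 3, not freed memory
  return m.it->first == 3 ? 0 : 1;
}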
+ @param pages number of buffer pool pages needed */ + ATTRIBUTE_COLD void wait_for_pool(size_t pages); + + /** Free log for processed pages. */ + void garbage_collect(); + + /** Apply a recovery batch. + @param space_id current tablespace identifier + @param space current tablespace + @param free_block spare buffer block + @param last_batch whether it is possible to write more redo log + @return whether the caller must provide a new free_block */ + bool apply_batch(uint32_t space_id, fil_space_t *&space, + buf_block_t *&free_block, bool last_batch); + public: - /** Check whether the number of read redo log blocks exceeds the maximum. - @return whether the memory is exhausted */ - inline bool is_memory_exhausted(); /** Apply buffered log to persistent data pages. @param last_batch whether it is possible to write more redo log */ void apply(bool last_batch); @@ -310,7 +336,7 @@ public: /** Clean up after create() */ void close(); - bool is_initialised() const { return last_stored_lsn != 0; } + bool is_initialised() const { return scanned_lsn != 0; } /** Find the latest checkpoint. @return error code or DB_SUCCESS */ @@ -321,60 +347,76 @@ public: @param start_lsn start LSN of the mini-transaction @param lsn @see mtr_t::commit_lsn() @param l redo log snippet - @param len length of l, in bytes */ - inline void add(map::iterator it, lsn_t start_lsn, lsn_t lsn, - const byte *l, size_t len); + @param len length of l, in bytes + @return whether we ran out of memory */ + bool add(map::iterator it, lsn_t start_lsn, lsn_t lsn, + const byte *l, size_t len); - enum parse_mtr_result { OK, PREMATURE_EOF, GOT_EOF }; + /** Parsing result */ + enum parse_mtr_result { + /** a record was successfully parsed */ + OK, + /** the log ended prematurely (need to read more) */ + PREMATURE_EOF, + /** the end of the log was reached */ + GOT_EOF, + /** parse(l, false) ran out of memory */ + GOT_OOM + }; private: /** Parse and register one log_t::FORMAT_10_8 mini-transaction. - @param store whether to store the records - @param l log data source */ + @tparam store whether to store the records + @param l log data source + @param if_exists if store: whether to check if the tablespace exists */ + template + inline parse_mtr_result parse(source &l, bool if_exists) noexcept; + + /** Rewind a mini-transaction when parse() runs out of memory. + @param l log data source + @param begin start of the mini-transaction */ template - inline parse_mtr_result parse(store_t store, source& l) noexcept; + ATTRIBUTE_COLD void rewind(source &l, source &begin) noexcept; + + /** Report progress in terms of LSN or pages remaining */ + ATTRIBUTE_COLD void report_progress() const; public: /** Parse and register one log_t::FORMAT_10_8 mini-transaction, handling log_sys.is_pmem() buffer wrap-around. - @param store whether to store the records */ - static parse_mtr_result parse_mtr(store_t store) noexcept; + @tparam store whether to store the records + @param if_exists if store: whether to check if the tablespace exists */ + template + static parse_mtr_result parse_mtr(bool if_exists) noexcept; /** Parse and register one log_t::FORMAT_10_8 mini-transaction, handling log_sys.is_pmem() buffer wrap-around. 
- @param store whether to store the records */ - static parse_mtr_result parse_pmem(store_t store) noexcept + @tparam store whether to store the records + @param if_exists if store: whether to check if the tablespace exists */ + template + static parse_mtr_result parse_pmem(bool if_exists) noexcept #ifdef HAVE_PMEM ; #else - { return parse_mtr(store); } + { return parse_mtr(if_exists); } #endif + /** Erase log records for a page. */ + void erase(map::iterator p); + /** Clear a fully processed set of stored redo log records. */ - inline void clear(); + void clear(); /** Determine whether redo log recovery progress should be reported. @param time the current time @return whether progress should be reported (the last report was at least 15 seconds ago) */ - bool report(time_t time) - { - if (time - progress_time < 15) - return false; - - progress_time= time; - return true; - } + bool report(time_t time); /** The alloc() memory alignment, in bytes */ static constexpr size_t ALIGNMENT= sizeof(size_t); - /** Allocate memory for log_rec_t - @param len allocation size, in bytes - @return pointer to len bytes of memory (never NULL) */ - inline void *alloc(size_t len); - /** Free a redo log snippet. - @param data buffer returned by alloc() */ + @param data buffer allocated in add() */ inline void free(const void *data); /** Remove records for a corrupted page. @@ -386,8 +428,6 @@ public: ATTRIBUTE_COLD void set_corrupt_fs(); /** Flag log file corruption during recovery. */ ATTRIBUTE_COLD void set_corrupt_log(); - /** Possibly finish a recovery batch. */ - inline void maybe_finish_batch(); /** @return whether data file corruption was found */ bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); } @@ -405,13 +445,14 @@ public: } /** Try to recover a tablespace that was not readable earlier - @param p iterator, initially pointing to page_id_t{space_id,0}; - the records will be freed and the iterator advanced + @param p iterator @param name tablespace file name @param free_block spare buffer block - @return whether recovery failed */ - bool recover_deferred(map::iterator &p, const std::string &name, - buf_block_t *&free_block); + @return recovered tablespace + @retval nullptr if recovery failed */ + fil_space_t *recover_deferred(const map::iterator &p, + const std::string &name, + buf_block_t *&free_block); }; /** The recovery system */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 13f9d3de3f8..54f7ceeb4c0 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -212,6 +212,10 @@ public: bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; } bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; } + void write_complete() const; + void read_complete() const; + void fake_read_complete(os_offset_t offset) const; + /** If requested, free storage space associated with a section of the file. @param off byte offset from the start (SEEK_SET) @param len size of the hole in bytes @@ -1040,6 +1044,11 @@ int os_aio_init(); Frees the asynchronous io system. */ void os_aio_free(); +/** Submit a fake read request during crash recovery. +@param type fake read request +@param offset additional context */ +void os_fake_read(const IORequest &type, os_offset_t offset); + /** Request a read or write. 
@param type I/O request @param buf buffer diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 0f02e307a47..e14e6b1b1a6 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -617,6 +617,8 @@ struct export_var_t{ #ifdef UNIV_DEBUG ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */ #endif /* UNIV_DEBUG */ + /** buf_pool.stat.n_page_gets (a sharded counter) */ + ulint innodb_buffer_pool_read_requests; ulint innodb_checkpoint_age; ulint innodb_checkpoint_max_age; ulint innodb_data_pending_reads; /*!< Pending reads */ diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index bbfed2490e9..4fc4f240444 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -1055,7 +1055,7 @@ public: void close(); /** @return total number of active (non-prepared) transactions */ - ulint any_active_transactions(); + size_t any_active_transactions(size_t *prepared= nullptr); /** diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index 670fe00c25b..c1435930551 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -216,14 +216,6 @@ buf_block_t* trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo, mtr_t *mtr, dberr_t *err) MY_ATTRIBUTE((nonnull, warn_unused_result)); -/******************************************************************//** -Sets the state of the undo log segment at a transaction finish. -@return undo log segment header page, x-latched */ -buf_block_t* -trx_undo_set_state_at_finish( -/*=========================*/ - trx_undo_t* undo, /*!< in: undo log memory copy */ - mtr_t* mtr); /*!< in: mtr */ /** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK. @param[in,out] trx transaction diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 3c7c3d348af..08547f169f3 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -5732,13 +5732,14 @@ static void lock_release_autoinc_locks(trx_t *trx) } /** Cancel a waiting lock request and release possibly waiting transactions */ -template +template void lock_cancel_waiting_and_release(lock_t *lock) { lock_sys.assert_locked(*lock); mysql_mutex_assert_owner(&lock_sys.wait_mutex); trx_t *trx= lock->trx; - trx->mutex_lock(); + if (inner_trx_lock) + trx->mutex_lock(); ut_d(const auto trx_state= trx->state); ut_ad(trx_state == TRX_STATE_COMMITTED_IN_MEMORY || trx_state == TRX_STATE_ACTIVE); @@ -5762,7 +5763,8 @@ void lock_cancel_waiting_and_release(lock_t *lock) lock_wait_end(trx); - trx->mutex_unlock(); + if (inner_trx_lock) + trx->mutex_unlock(); } void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx) @@ -5779,6 +5781,19 @@ void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx) mysql_mutex_unlock(&lock_sys.wait_mutex); } +#ifdef WITH_WSREP +void lock_sys_t::cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx) +{ + lock_sys.assert_locked(); + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + ut_ad(trx->mutex_is_owner()); + ut_ad(trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED); + trx->lock.set_wsrep_victim(); + if (lock_t *lock= trx->lock.wait_lock) + lock_cancel_waiting_and_release(lock); +} +#endif /* WITH_WSREP */ + /** Cancel a waiting lock request. 
@tparam check_victim whether to check for DB_DEADLOCK @param trx active transaction diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 37a496725fc..1c77c3cdeec 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -738,7 +738,7 @@ static struct { retry: log_sys.latch.wr_unlock(); - bool fail= false; + fil_space_t *space= fil_system.sys_space; buf_block_t *free_block= buf_LRU_get_free_block(false); log_sys.latch.wr_lock(SRW_LOCK_CALL); mysql_mutex_lock(&recv_sys.mutex); @@ -755,11 +755,12 @@ retry: there were no buffered records. Either way, we must create a dummy tablespace with the latest known name, for dict_drop_index_tree(). */ + recv_sys.pages_it_invalidate(space_id); while (p != recv_sys.pages.end() && p->first.space() == space_id) { + ut_ad(!p->second.being_processed); recv_sys_t::map::iterator r= p++; - r->second.log.clear(); - recv_sys.pages.erase(r); + recv_sys.erase(r); } recv_spaces_t::iterator it{recv_spaces.find(space_id)}; if (it != recv_spaces.end()) @@ -782,11 +783,14 @@ retry: } } else - fail= recv_sys.recover_deferred(p, d->second.file_name, free_block); + space= recv_sys.recover_deferred(p, d->second.file_name, free_block); processed: - defers.erase(d++); - if (fail) + auto e= d++; + defers.erase(e); + if (!space) break; + if (space != fil_system.sys_space) + space->release(); if (free_block) continue; mysql_mutex_unlock(&recv_sys.mutex); @@ -797,7 +801,7 @@ processed: mysql_mutex_unlock(&recv_sys.mutex); if (free_block) buf_pool.free_block(free_block); - return fail; + return !space; } /** Create tablespace metadata for a data file that was initially @@ -905,28 +909,191 @@ free_space: } deferred_spaces; +/** Report an operation to create, delete, or rename a file during backup. +@param[in] space_id tablespace identifier +@param[in] type redo log type +@param[in] name file name (not NUL-terminated) +@param[in] len length of name, in bytes +@param[in] new_name new file name (NULL if not rename) +@param[in] new_len length of new_name, in bytes (0 if NULL) */ +void (*log_file_op)(uint32_t space_id, int type, + const byte* name, ulint len, + const byte* new_name, ulint new_len); + +void (*undo_space_trunc)(uint32_t space_id); + +void (*first_page_init)(uint32_t space_id); + +/** Information about initializing page contents during redo log processing. +FIXME: Rely on recv_sys.pages! */ +class mlog_init_t +{ + using map= std::map, + ut_allocator>>; + /** Map of page initialization operations. + FIXME: Merge this to recv_sys.pages! */ + map inits; + + /** Iterator to the last add() or will_avoid_read(), for speeding up + will_avoid_read(). */ + map::iterator i; +public: + /** Constructor */ + mlog_init_t() : i(inits.end()) {} + + /** Record that a page will be initialized by the redo log. + @param page_id page identifier + @param lsn log sequence number + @return whether the state was changed */ + bool add(const page_id_t page_id, lsn_t lsn) + { + mysql_mutex_assert_owner(&recv_sys.mutex); + const recv_init init = { lsn, false }; + std::pair p= + inits.insert(map::value_type(page_id, init)); + ut_ad(!p.first->second.created); + if (p.second) return true; + if (p.first->second.lsn >= lsn) return false; + p.first->second = init; + i = p.first; + return true; + } + + /** Get the last stored lsn of the page id and its respective + init/load operation. + @param page_id page identifier + @return the latest page initialization; + not valid after releasing recv_sys.mutex. 
*/ + recv_init &last(page_id_t page_id) + { + mysql_mutex_assert_owner(&recv_sys.mutex); + return inits.find(page_id)->second; + } + + /** Determine if a page will be initialized or freed after a time. + @param page_id page identifier + @param lsn log sequence number + @return whether page_id will be freed or initialized after lsn */ + bool will_avoid_read(page_id_t page_id, lsn_t lsn) + { + mysql_mutex_assert_owner(&recv_sys.mutex); + if (i != inits.end() && i->first == page_id) + return i->second.lsn > lsn; + i = inits.lower_bound(page_id); + return i != inits.end() && i->first == page_id && i->second.lsn > lsn; + } + + /** At the end of each recovery batch, reset the 'created' flags. */ + void reset() + { + mysql_mutex_assert_owner(&recv_sys.mutex); + ut_ad(recv_no_ibuf_operations); + for (map::value_type &i : inits) + i.second.created= false; + } + + /** During the last recovery batch, mark whether there exist + buffered changes for the pages that were initialized + by buf_page_create() and still reside in the buffer pool. */ + void mark_ibuf_exist() + { + mysql_mutex_assert_owner(&recv_sys.mutex); + + for (const map::value_type &i : inits) + if (i.second.created) + { + auto &chain= buf_pool.page_hash.cell_get(i.first.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + + hash_lock.lock_shared(); + buf_block_t *block= reinterpret_cast + (buf_pool.page_hash.get(i.first, chain)); + bool got_latch= block && block->page.lock.x_lock_try(); + hash_lock.unlock_shared(); + + if (!block) + continue; + + uint32_t state; + + if (!got_latch) + { + mysql_mutex_lock(&buf_pool.mutex); + block= reinterpret_cast + (buf_pool.page_hash.get(i.first, chain)); + if (!block) + { + mysql_mutex_unlock(&buf_pool.mutex); + continue; + } + + state= block->page.fix(); + mysql_mutex_unlock(&buf_pool.mutex); + if (state < buf_page_t::UNFIXED) + { + block->page.unfix(); + continue; + } + block->page.lock.x_lock(); + state= block->page.unfix(); + ut_ad(state < buf_page_t::READ_FIX); + if (state >= buf_page_t::UNFIXED && block->page.id() == i.first) + goto check_ibuf; + } + else + { + state= block->page.state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state < buf_page_t::READ_FIX); + + if (state >= buf_page_t::UNFIXED) + { + check_ibuf: + mysql_mutex_unlock(&recv_sys.mutex); + if (ibuf_page_exists(block->page.id(), block->zip_size())) + block->page.set_ibuf_exist(); + mysql_mutex_lock(&recv_sys.mutex); + } + } + + block->page.lock.x_unlock(); + } + } + + /** Clear the data structure */ + void clear() { inits.clear(); i = inits.end(); } +}; + +static mlog_init_t mlog_init; + /** Try to recover a tablespace that was not readable earlier -@param p iterator, initially pointing to page_id_t{space_id,0}; - the records will be freed and the iterator advanced +@param p iterator to the page @param name tablespace file name @param free_block spare buffer block -@return whether recovery failed */ -bool recv_sys_t::recover_deferred(recv_sys_t::map::iterator &p, - const std::string &name, - buf_block_t *&free_block) +@return recovered tablespace +@retval nullptr if recovery failed */ +fil_space_t *recv_sys_t::recover_deferred(const recv_sys_t::map::iterator &p, + const std::string &name, + buf_block_t *&free_block) { mysql_mutex_assert_owner(&mutex); - const page_id_t first{p->first}; - ut_ad(first.space()); + ut_ad(p->first.space()); - recv_spaces_t::iterator it{recv_spaces.find(first.space())}; + recv_spaces_t::iterator it{recv_spaces.find(p->first.space())}; ut_ad(it != recv_spaces.end()); - if 
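As a rough standalone model of the mlog_init_t map above (hypothetical names): add() keeps only the newest initialization LSN per page, and will_avoid_read() answers whether the page will be rewritten after a given LSN, so recovery can avoid reading its old contents from disk:

#include <cassert>
#include <cstdint>
#include <map>

using page_no_t= uint64_t;
using lsn_t= uint64_t;

struct init_map
{
  std::map<page_no_t, lsn_t> inits;

  // return whether the stored LSN was raised
  bool add(page_no_t page, lsn_t lsn)
  {
    auto p= inits.emplace(page, lsn);
    if (p.second)
      return true;
    if (p.first->second >= lsn)
      return false;       // keep the later initialization
    p.first->second= lsn;
    return true;
  }

  // whether the page will be initialized again after 'lsn'
  bool will_avoid_read(page_no_t page, lsn_t lsn) const
  {
    auto i= inits.find(page);
    return i != inits.end() && i->second > lsn;
  }
};

int main()
{
  init_map m;
  m.add(7, 100);
  m.add(7, 90);                       // ignored: older than the stored LSN
  assert(m.will_avoid_read(7, 50));   // page 7 is re-created at LSN 100 > 50
  assert(!m.will_avoid_read(7, 100)); // nothing newer than LSN 100
}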
(!first.page_no() && p->second.state == page_recv_t::RECV_WILL_NOT_READ) + if (!p->first.page_no() && p->second.skip_read) { mtr_t mtr; - buf_block_t *block= recover_low(first, p, mtr, free_block); + ut_ad(!p->second.being_processed); + p->second.being_processed= 1; + init &init= mlog_init.last(p->first); + mysql_mutex_unlock(&mutex); + buf_block_t *block= recover_low(p, mtr, free_block, init); + mysql_mutex_lock(&mutex); + p->second.being_processed= -1; ut_ad(block == free_block || block == reinterpret_cast(-1)); free_block= nullptr; if (UNIV_UNLIKELY(!block || block == reinterpret_cast(-1))) @@ -939,10 +1106,7 @@ bool recv_sys_t::recover_deferred(recv_sys_t::map::iterator &p, const uint32_t page_no= mach_read_from_4(page + FIL_PAGE_OFFSET); const uint32_t size= fsp_header_get_field(page, FSP_SIZE); - ut_ad(it != recv_spaces.end()); - - if (page_id_t{space_id, page_no} == first && size >= 4 && - it != recv_spaces.end() && + if (page_id_t{space_id, page_no} == p->first && size >= 4 && fil_space_t::is_valid_flags(flags, space_id) && fil_space_t::logical_size(flags) == srv_page_size) { @@ -996,10 +1160,10 @@ bool recv_sys_t::recover_deferred(recv_sys_t::map::iterator &p, } size_set: node->deferred= false; - space->release(); it->second.space= space; block->page.lock.x_unlock(); - return false; + p->second.being_processed= -1; + return space; } release_and_fail: @@ -1007,179 +1171,34 @@ bool recv_sys_t::recover_deferred(recv_sys_t::map::iterator &p, } fail: - ib::error() << "Cannot apply log to " << first + ib::error() << "Cannot apply log to " << p->first << " of corrupted file '" << name << "'"; - return true; + return nullptr; } -/** Report an operation to create, delete, or rename a file during backup. -@param[in] space_id tablespace identifier -@param[in] type redo log type -@param[in] name file name (not NUL-terminated) -@param[in] len length of name, in bytes -@param[in] new_name new file name (NULL if not rename) -@param[in] new_len length of new_name, in bytes (0 if NULL) */ -void (*log_file_op)(uint32_t space_id, int type, - const byte* name, ulint len, - const byte* new_name, ulint new_len); - -void (*undo_space_trunc)(uint32_t space_id); - -void (*first_page_init)(uint32_t space_id); - -/** Information about initializing page contents during redo log processing. -FIXME: Rely on recv_sys.pages! */ -class mlog_init_t -{ -public: - /** A page initialization operation that was parsed from - the redo log */ - struct init { - /** log sequence number of the page initialization */ - lsn_t lsn; - /** Whether btr_page_create() avoided a read of the page. - - At the end of the last recovery batch, mark_ibuf_exist() - will mark pages for which this flag is set. */ - bool created; - }; - -private: - typedef std::map, - ut_allocator > > - map; - /** Map of page initialization operations. - FIXME: Merge this to recv_sys.pages! */ - map inits; -public: - /** Record that a page will be initialized by the redo log. - @param[in] page_id page identifier - @param[in] lsn log sequence number - @return whether the state was changed */ - bool add(const page_id_t page_id, lsn_t lsn) - { - mysql_mutex_assert_owner(&recv_sys.mutex); - const init init = { lsn, false }; - std::pair p = inits.insert( - map::value_type(page_id, init)); - ut_ad(!p.first->second.created); - if (p.second) return true; - if (p.first->second.lsn >= init.lsn) return false; - p.first->second = init; - return true; - } - - /** Get the last stored lsn of the page id and its respective - init/load operation. 
- @param[in] page_id page id - @param[in,out] init initialize log or load log - @return the latest page initialization; - not valid after releasing recv_sys.mutex. */ - init& last(page_id_t page_id) - { - mysql_mutex_assert_owner(&recv_sys.mutex); - return inits.find(page_id)->second; - } - - /** Determine if a page will be initialized or freed after a time. - @param page_id page identifier - @param lsn log sequence number - @return whether page_id will be freed or initialized after lsn */ - bool will_avoid_read(page_id_t page_id, lsn_t lsn) const - { - mysql_mutex_assert_owner(&recv_sys.mutex); - auto i= inits.find(page_id); - return i != inits.end() && i->second.lsn > lsn; - } - - /** At the end of each recovery batch, reset the 'created' flags. */ - void reset() - { - mysql_mutex_assert_owner(&recv_sys.mutex); - ut_ad(recv_no_ibuf_operations); - for (map::value_type& i : inits) { - i.second.created = false; - } - } - - /** On the last recovery batch, mark whether there exist - buffered changes for the pages that were initialized - by buf_page_create() and still reside in the buffer pool. - @param[in,out] mtr dummy mini-transaction */ - void mark_ibuf_exist(mtr_t& mtr) - { - mysql_mutex_assert_owner(&recv_sys.mutex); - mtr.start(); - - for (const map::value_type& i : inits) { - if (!i.second.created) { - continue; - } - if (buf_block_t* block = buf_page_get_low( - i.first, 0, RW_X_LATCH, nullptr, - BUF_GET_IF_IN_POOL, - &mtr, nullptr, false)) { - if (UNIV_LIKELY_NULL(block->page.zip.data)) { - switch (fil_page_get_type( - block->page.zip.data)) { - case FIL_PAGE_INDEX: - case FIL_PAGE_RTREE: - if (page_zip_decompress( - &block->page.zip, - block->page.frame, - true)) { - break; - } - ib::error() << "corrupted " - << block->page.id(); - } - } - if (recv_no_ibuf_operations) { - mtr.commit(); - mtr.start(); - continue; - } - mysql_mutex_unlock(&recv_sys.mutex); - if (ibuf_page_exists(block->page.id(), - block->zip_size())) { - block->page.set_ibuf_exist(); - } - mtr.commit(); - mtr.start(); - mysql_mutex_lock(&recv_sys.mutex); - } - } - - mtr.commit(); - clear(); - } - - /** Clear the data structure */ - void clear() { inits.clear(); } -}; - -static mlog_init_t mlog_init; - /** Process a record that indicates that a tablespace is being shrunk in size. 
@param page_id first page identifier that is not in the file @param lsn log sequence number of the shrink operation */ inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn) { - DBUG_ENTER("recv_sys_t::trim"); - DBUG_LOG("ib_log", - "discarding log beyond end of tablespace " - << page_id << " before LSN " << lsn); - mysql_mutex_assert_owner(&mutex); - for (recv_sys_t::map::iterator p = pages.lower_bound(page_id); - p != pages.end() && p->first.space() == page_id.space();) { - recv_sys_t::map::iterator r = p++; - if (r->second.trim(lsn)) { - pages.erase(r); - } - } - DBUG_VOID_RETURN; + DBUG_ENTER("recv_sys_t::trim"); + DBUG_LOG("ib_log", "discarding log beyond end of tablespace " + << page_id << " before LSN " << lsn); + mysql_mutex_assert_owner(&mutex); + if (pages_it != pages.end() && pages_it->first.space() == page_id.space()) + pages_it= pages.end(); + for (recv_sys_t::map::iterator p = pages.lower_bound(page_id); + p != pages.end() && p->first.space() == page_id.space();) + { + recv_sys_t::map::iterator r = p++; + if (r->second.trim(lsn)) + { + ut_ad(!r->second.being_processed); + pages.erase(r); + } + } + DBUG_VOID_RETURN; } inline void recv_sys_t::read(os_offset_t total_offset, span buf) @@ -1202,15 +1221,10 @@ inline size_t recv_sys_t::files_size() @param[in] space_id the tablespace ID @param[in] ftype FILE_MODIFY, FILE_DELETE, or FILE_RENAME @param[in] lsn lsn of the redo log -@param[in] store whether the redo log has to be stored */ +@param[in] if_exists whether to check if the tablespace exists */ static void fil_name_process(const char *name, ulint len, uint32_t space_id, - mfile_type_t ftype, lsn_t lsn, store_t store) + mfile_type_t ftype, lsn_t lsn, bool if_exists) { - if (srv_operation == SRV_OPERATION_BACKUP - || srv_operation == SRV_OPERATION_BACKUP_NO_DEFER) { - return; - } - ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || srv_operation == SRV_OPERATION_RESTORE || srv_operation == SRV_OPERATION_RESTORE_EXPORT); @@ -1321,7 +1335,7 @@ same_space: case FIL_LOAD_DEFER: /** Skip the deferred spaces when lsn is already processed */ - if (store != store_t::STORE_IF_EXISTS) { + if (!if_exists) { deferred_spaces.add( space_id, fname.name.c_str(), lsn); } @@ -1364,9 +1378,8 @@ void recv_sys_t::close() deferred_spaces.clear(); ut_d(mysql_mutex_unlock(&mutex)); - last_stored_lsn= 0; + scanned_lsn= 0; mysql_mutex_destroy(&mutex); - pthread_cond_destroy(&cond); } recv_spaces.clear(); @@ -1381,34 +1394,34 @@ void recv_sys_t::create() ut_ad(this == &recv_sys); ut_ad(!is_initialised()); mysql_mutex_init(recv_sys_mutex_key, &mutex, nullptr); - pthread_cond_init(&cond, nullptr); apply_log_recs = false; - apply_batch_on = false; len = 0; offset = 0; lsn = 0; + scanned_lsn = 1; found_corrupt_log = false; found_corrupt_fs = false; file_checkpoint = 0; progress_time = time(NULL); + ut_ad(pages.empty()); + pages_it = pages.end(); recv_max_page_lsn = 0; memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces); - last_stored_lsn = 1; UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU); } /** Clear a fully processed set of stored redo log records. 
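A simplified standalone sketch of the trim() idea (hypothetical names, ignoring the per-record LSN checks the real code performs): once a tablespace is known to shrink, buffered log for pages at or beyond the new end is discarded so recovery never writes past the truncated file:

#include <cassert>
#include <cstdint>
#include <map>

using page_no_t= uint32_t;

// drop buffered records for every page that no longer exists in the file
void trim(std::map<page_no_t, int> &pages, page_no_t new_size)
{
  pages.erase(pages.lower_bound(new_size), pages.end());
}

int main()
{
  std::map<page_no_t, int> pages= {{0, 1}, {3, 1}, {7, 1}, {9, 1}};
  trim(pages, 4);                 // the tablespace now ends before page 4
  assert(pages.size() == 2);      // only pages 0 and 3 keep their log
}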
*/ -inline void recv_sys_t::clear() +void recv_sys_t::clear() { mysql_mutex_assert_owner(&mutex); apply_log_recs= false; - apply_batch_on= false; ut_ad(!after_apply || found_corrupt_fs || !UT_LIST_GET_LAST(blocks)); pages.clear(); + pages_it= pages.end(); for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; ) { @@ -1419,8 +1432,6 @@ inline void recv_sys_t::clear() buf_block_free(block); block= prev_block; } - - pthread_cond_broadcast(&cond); } /** Free most recovery data structures. */ @@ -1432,52 +1443,14 @@ void recv_sys_t::debug_free() recovery_on= false; pages.clear(); + pages_it= pages.end(); mysql_mutex_unlock(&mutex); } -inline void *recv_sys_t::alloc(size_t len) -{ - mysql_mutex_assert_owner(&mutex); - ut_ad(len); - ut_ad(len <= srv_page_size); - - buf_block_t *block= UT_LIST_GET_FIRST(blocks); - if (UNIV_UNLIKELY(!block)) - { -create_block: - block= buf_block_alloc(); - block->page.access_time= 1U << 16 | - ut_calc_align(static_cast(len), ALIGNMENT); - static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2"); - UT_LIST_ADD_FIRST(blocks, block); - MEM_MAKE_ADDRESSABLE(block->page.frame, len); - MEM_NOACCESS(block->page.frame + len, srv_page_size - len); - return my_assume_aligned(block->page.frame); - } - - size_t free_offset= static_cast(block->page.access_time); - ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT)); - if (UNIV_UNLIKELY(!free_offset)) - { - ut_ad(srv_page_size == 65536); - goto create_block; - } - ut_ad(free_offset <= srv_page_size); - free_offset+= len; - - if (free_offset > srv_page_size) - goto create_block; - - block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 | - ut_calc_align(static_cast(free_offset), ALIGNMENT); - MEM_MAKE_ADDRESSABLE(block->page.frame + free_offset - len, len); - return my_assume_aligned(block->page.frame + free_offset - len); -} - /** Free a redo log snippet. 
-@param data buffer returned by alloc() */ +@param data buffer allocated in add() */ inline void recv_sys_t::free(const void *data) { ut_ad(!ut_align_offset(data, ALIGNMENT)); @@ -1502,8 +1475,11 @@ inline void recv_sys_t::free(const void *data) ut_ad(block->page.state() == buf_page_t::MEMORY); ut_ad(static_cast(block->page.access_time - 1) < srv_page_size); - ut_ad(block->page.access_time >= 1U << 16); - if (!((block->page.access_time -= 1U << 16) >> 16)) + unsigned a= block->page.access_time; + ut_ad(a >= 1U << 16); + a-= 1U << 16; + block->page.access_time= a; + if (!(a >> 16)) { UT_LIST_REMOVE(blocks, block); MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); @@ -1689,6 +1665,9 @@ dberr_t recv_sys_t::find_checkpoint() bool wrong_size= false; byte *buf; + ut_ad(pages.empty()); + pages_it= pages.end(); + if (files.empty()) { file_checkpoint= 0; @@ -1965,7 +1944,31 @@ inline bool page_recv_t::trim(lsn_t start_lsn) } -inline void page_recv_t::recs_t::clear() +void page_recv_t::recs_t::rewind(lsn_t start_lsn) +{ + mysql_mutex_assert_owner(&recv_sys.mutex); + log_phys_t *trim= static_cast(head); + ut_ad(trim); + while (log_phys_t *next= static_cast(trim->next)) + { + ut_ad(trim->start_lsn < start_lsn); + if (next->start_lsn == start_lsn) + break; + trim= next; + } + tail= trim; + log_rec_t *l= tail->next; + tail->next= nullptr; + while (l) + { + log_rec_t *next= l->next; + recv_sys.free(l); + l= next; + } +} + + +void page_recv_t::recs_t::clear() { mysql_mutex_assert_owner(&recv_sys.mutex); for (const log_rec_t *l= head; l; ) @@ -1977,33 +1980,99 @@ inline void page_recv_t::recs_t::clear() head= tail= nullptr; } - /** Ignore any earlier redo log records for this page. */ inline void page_recv_t::will_not_read() { - ut_ad(state == RECV_NOT_PROCESSED || state == RECV_WILL_NOT_READ); - state= RECV_WILL_NOT_READ; + ut_ad(!being_processed); + skip_read= true; log.clear(); } +void recv_sys_t::erase(map::iterator p) +{ + ut_ad(p->second.being_processed <= 0); + p->second.log.clear(); + pages.erase(p); +} + +/** Free log for processed pages. */ +void recv_sys_t::garbage_collect() +{ + mysql_mutex_assert_owner(&mutex); + + if (pages_it != pages.end() && pages_it->second.being_processed < 0) + pages_it= pages.end(); + + for (map::iterator p= pages.begin(); p != pages.end(); ) + { + if (p->second.being_processed < 0) + { + map::iterator r= p++; + erase(r); + } + else + p++; + } +} + +/** Allocate a block from the buffer pool for recv_sys.pages */ +ATTRIBUTE_COLD buf_block_t *recv_sys_t::add_block() +{ + for (bool freed= false;;) + { + const auto rs= UT_LIST_GET_LEN(blocks) * 2; + mysql_mutex_lock(&buf_pool.mutex); + const auto bs= + UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU); + if (UNIV_LIKELY(bs > BUF_LRU_MIN_LEN || rs < bs)) + { + buf_block_t *block= buf_LRU_get_free_block(true); + mysql_mutex_unlock(&buf_pool.mutex); + return block; + } + /* out of memory: redo log occupies more than 1/3 of buf_pool + and there are fewer than BUF_LRU_MIN_LEN pages left */ + mysql_mutex_unlock(&buf_pool.mutex); + if (freed) + return nullptr; + freed= true; + garbage_collect(); + } +} + +/** Wait for buffer pool to become available. 
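The recs_t::rewind() above cuts a partially stored mini-transaction off the tail of a page's record list. A standalone sketch of that operation, using a plain std::list instead of the intrusive list (hypothetical types):

#include <cassert>
#include <cstdint>
#include <list>

using lsn_t= uint64_t;
struct snippet { lsn_t start_lsn; };

// drop every snippet appended for the mini-transaction that starts at start_lsn
void rewind(std::list<snippet> &log, lsn_t start_lsn)
{
  auto i= log.begin();
  while (i != log.end() && i->start_lsn != start_lsn)
    ++i;
  log.erase(i, log.end());
}

int main()
{
  std::list<snippet> log= {{100}, {150}, {200}, {200}};
  rewind(log, 200);          // the mini-transaction starting at LSN 200 is undone
  assert(log.size() == 2 && log.back().start_lsn == 150);
}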
*/ +ATTRIBUTE_COLD void recv_sys_t::wait_for_pool(size_t pages) +{ + mysql_mutex_unlock(&mutex); + os_aio_wait_until_no_pending_reads(false); + mysql_mutex_lock(&mutex); + garbage_collect(); + mysql_mutex_lock(&buf_pool.mutex); + bool need_more= UT_LIST_GET_LEN(buf_pool.free) < pages; + mysql_mutex_unlock(&buf_pool.mutex); + if (need_more) + buf_flush_sync_batch(lsn); +} /** Register a redo log snippet for a page. @param it page iterator @param start_lsn start LSN of the mini-transaction @param lsn @see mtr_t::commit_lsn() @param l redo log snippet -@param len length of l, in bytes */ -inline void recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn, - const byte *l, size_t len) +@param len length of l, in bytes +@return whether we ran out of memory */ +ATTRIBUTE_NOINLINE +bool recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn, + const byte *l, size_t len) { mysql_mutex_assert_owner(&mutex); - page_id_t page_id = it->first; page_recv_t &recs= it->second; + buf_block_t *block; switch (*l & 0x70) { case FREE_PAGE: case INIT_PAGE: recs.will_not_read(); - mlog_init.add(page_id, start_lsn); /* FIXME: remove this! */ + mlog_init.add(it->first, start_lsn); /* FIXME: remove this! */ /* fall through */ default: log_phys_t *tail= static_cast(recs.log.last()); @@ -2012,7 +2081,7 @@ inline void recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn, if (tail->start_lsn != start_lsn) break; ut_ad(tail->lsn == lsn); - buf_block_t *block= UT_LIST_GET_LAST(blocks); + block= UT_LIST_GET_LAST(blocks); ut_ad(block); const size_t used= static_cast(block->page.access_time - 1) + 1; ut_ad(used >= ALIGNMENT); @@ -2025,7 +2094,7 @@ append: MEM_MAKE_ADDRESSABLE(end + 1, len); /* Append to the preceding record for the page */ tail->append(l, len); - return; + return false; } if (end <= &block->page.frame[used - ALIGNMENT] || &block->page.frame[used] >= end) @@ -2039,8 +2108,49 @@ append: ut_calc_align(static_cast(new_used), ALIGNMENT); goto append; } - recs.log.append(new (alloc(log_phys_t::alloc_size(len))) + + const size_t size{log_phys_t::alloc_size(len)}; + ut_ad(size <= srv_page_size); + void *buf; + block= UT_LIST_GET_FIRST(blocks); + if (UNIV_UNLIKELY(!block)) + { + create_block: + block= add_block(); + if (UNIV_UNLIKELY(!block)) + return true; + block->page.access_time= 1U << 16 | + ut_calc_align(static_cast(size), ALIGNMENT); + static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2"); + UT_LIST_ADD_FIRST(blocks, block); + MEM_MAKE_ADDRESSABLE(block->page.frame, size); + MEM_NOACCESS(block->page.frame + size, srv_page_size - size); + buf= block->page.frame; + } + else + { + size_t free_offset= static_cast(block->page.access_time); + ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT)); + if (UNIV_UNLIKELY(!free_offset)) + { + ut_ad(srv_page_size == 65536); + goto create_block; + } + ut_ad(free_offset <= srv_page_size); + free_offset+= size; + + if (free_offset > srv_page_size) + goto create_block; + + block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 | + ut_calc_align(static_cast(free_offset), ALIGNMENT); + MEM_MAKE_ADDRESSABLE(block->page.frame + free_offset - size, size); + buf= block->page.frame + free_offset - size; + } + + recs.log.append(new (my_assume_aligned(buf)) log_phys_t{start_lsn, lsn, l, len}); + return false; } /** Store/remove the freed pages in fil_name_t of recv_spaces. @@ -2304,13 +2414,84 @@ struct recv_ring : public recv_buf }; #endif -/** Parse and register one log_t::FORMAT_10_8 mini-transaction. 
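The add()/free() pair above packs a live-snippet count and the aligned free offset of each backing block into one 32-bit word. A standalone sketch of that bump-allocator bookkeeping (hypothetical names, fixed page size, and without the 64KiB special case):

#include <cassert>
#include <cstddef>
#include <cstdint>

static constexpr size_t PAGE_SIZE= 16384;
static constexpr size_t ALIGNMENT= sizeof(size_t);

struct arena_page
{
  uint32_t used= 0;               // count << 16 | aligned free offset
  unsigned char frame[PAGE_SIZE];
};

inline size_t align_up(size_t n) { return (n + ALIGNMENT - 1) & ~(ALIGNMENT - 1); }

void *alloc(arena_page &p, size_t len)
{
  size_t offset= p.used & 0xffff;
  len= align_up(len);
  if (offset + len > PAGE_SIZE)
    return nullptr;               // caller must grab a fresh page
  p.used= ((p.used >> 16) + 1) << 16 | uint32_t(offset + len);
  return p.frame + offset;
}

// returns true when the page holds no more snippets and can be recycled
bool free_one(arena_page &p)
{
  assert(p.used >> 16);
  p.used-= 1U << 16;
  return !(p.used >> 16);
}

int main()
{
  arena_page p;
  void *a= alloc(p, 100);
  void *b= alloc(p, 200);          // both snippets share one backing page
  assert(a && b);
  bool can_recycle= free_one(p);   // first free: one snippet still live
  can_recycle= free_one(p);        // second free: the page is empty again
  assert(can_recycle);
}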
-@param store whether to store the records -@param l log data source */ template -inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) +void recv_sys_t::rewind(source &l, source &begin) noexcept +{ + ut_ad(srv_operation != SRV_OPERATION_BACKUP); + mysql_mutex_assert_owner(&mutex); + + const source end= l; + uint32_t rlen; + for (l= begin; !(l == end); l+= rlen) + { + const source recs{l}; + ++l; + const byte b= *recs; + + ut_ad(b > 1); + ut_ad(UNIV_LIKELY((b & 0x70) != RESERVED) || srv_force_recovery); + + rlen= b & 0xf; + if (!rlen) + { + const uint32_t lenlen= mlog_decode_varint_length(*l); + const uint32_t addlen= mlog_decode_varint(l); + ut_ad(addlen != MLOG_DECODE_ERROR); + rlen= addlen + 15 - lenlen; + l+= lenlen; + } + ut_ad(!l.is_eof(rlen)); + if (b & 0x80) + continue; + + uint32_t idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen)) + continue; + const uint32_t space_id= mlog_decode_varint(l); + if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR)) + continue; + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen)) + continue; + const uint32_t page_no= mlog_decode_varint(l); + if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR)) + continue; + const page_id_t id{space_id, page_no}; + if (pages_it == pages.end() || pages_it->first != id) + { + pages_it= pages.find(id); + if (pages_it == pages.end()) + continue; + } + + ut_ad(!pages_it->second.being_processed); + const log_phys_t *head= + static_cast(*pages_it->second.log.begin()); + if (!head || head->start_lsn == lsn) + { + erase(pages_it); + pages_it= pages.end(); + } + else + pages_it->second.log.rewind(lsn); + } + + l= begin; + pages_it= pages.end(); +} + +/** Parse and register one log_t::FORMAT_10_8 mini-transaction. +@tparam store whether to store the records +@param l log data source +@param if_exists if store: whether to check if the tablespace exists */ +template +inline +recv_sys_t::parse_mtr_result recv_sys_t::parse(source &l, bool if_exists) noexcept { +restart: #ifndef SUX_LOCK_GENERIC ut_ad(log_sys.latch.is_write_locked() || srv_operation == SRV_OPERATION_BACKUP || @@ -2319,12 +2500,15 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) mysql_mutex_assert_owner(&mutex); ut_ad(log_sys.next_checkpoint_lsn); ut_ad(log_sys.is_latest()); + ut_ad(store || !if_exists); + ut_ad(store || + srv_operation != SRV_OPERATION_BACKUP || + srv_operation != SRV_OPERATION_BACKUP_NO_DEFER); alignas(8) byte iv[MY_AES_BLOCK_SIZE]; byte *decrypt_buf= static_cast(alloca(srv_page_size)); const lsn_t start_lsn{lsn}; - map::iterator cached_pages_it{pages.end()}; /* Check that the entire mini-transaction is included within the buffer */ if (l.is_eof(0)) @@ -2333,7 +2517,7 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) if (*l <= 1) return GOT_EOF; /* We should never write an empty mini-transaction. 
*/ - const source begin{l}; + source begin{l}; uint32_t rlen; for (uint32_t total_len= 0; !l.is_eof(); l+= rlen, total_len+= rlen) { @@ -2433,7 +2617,6 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) sql_print_error("InnoDB: Unknown log record at LSN " LSN_PF, lsn); corrupted: found_corrupt_log= true; - pthread_cond_broadcast(&cond); return GOT_EOF; } @@ -2510,13 +2693,13 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) mach_write_to_4(iv + 12, page_no); got_page_op= !(b & 0x80); if (!got_page_op); - else if (srv_operation == SRV_OPERATION_BACKUP) + else if (!store && srv_operation == SRV_OPERATION_BACKUP) { if (page_no == 0 && first_page_init && (b & 0x10)) first_page_init(space_id); continue; } - else if (file_checkpoint && !is_predefined_tablespace(space_id)) + else if (store && file_checkpoint && !is_predefined_tablespace(space_id)) { recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id); if (i != recv_spaces.end() && i->first == space_id); @@ -2585,7 +2768,7 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) trim({space_id, 0}, lsn); truncated_undo_spaces[space_id - srv_undo_space_id_start]= { lsn, page_no }; - if (undo_space_trunc) + if (!store && undo_space_trunc) undo_space_trunc(space_id); #endif last_offset= 1; /* the next record must not be same_page */ @@ -2626,7 +2809,7 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) { if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size)) goto record_corrupted; - if (UNIV_UNLIKELY(!page_no) && file_checkpoint) + if (store && UNIV_UNLIKELY(!page_no) && file_checkpoint) { const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE && last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4; @@ -2705,38 +2888,57 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE); } #endif - const bool is_init= (b & 0x70) <= INIT_PAGE; - switch (store) { - case STORE_IF_EXISTS: - if (fil_space_t *space= fil_space_t::get(space_id)) + if (store) + { + if (if_exists) { - const auto size= space->get_size(); - space->release(); - if (!size) + if (fil_space_t *space= fil_space_t::get(space_id)) + { + const auto size= space->get_size(); + space->release(); + if (!size) + continue; + } + else if (!deferred_spaces.find(space_id)) continue; } - else if (!deferred_spaces.find(space_id)) - continue; - /* fall through */ - case STORE_YES: if (!mlog_init.will_avoid_read(id, start_lsn)) { - if (cached_pages_it == pages.end() || - cached_pages_it->first != id) - cached_pages_it= pages.emplace(id, page_recv_t{}).first; - add(cached_pages_it, start_lsn, lsn, - l.get_buf(cl, recs, decrypt_buf), l - recs + rlen); + if (pages_it == pages.end() || pages_it->first != id) + pages_it= pages.emplace(id, page_recv_t{}).first; + if (UNIV_UNLIKELY(add(pages_it, start_lsn, lsn, + l.get_buf(cl, recs, decrypt_buf), + l - recs + rlen))) + { + lsn= start_lsn; + log_sys.set_recovered_lsn(start_lsn); + l+= rlen; + offset= begin.ptr - log_sys.buf; + rewind(l, begin); + if (if_exists) + { + apply(false); + if (is_corrupt_fs()) + return GOT_EOF; + goto restart; + } + sql_print_information("InnoDB: Multi-batch recovery needed at LSN " + LSN_PF, lsn); + return GOT_OOM; + } } - continue; - case STORE_NO: - if (!is_init) - continue; + } + else if ((b & 0x70) <= INIT_PAGE) + { mlog_init.add(id, start_lsn); - map::iterator i= pages.find(id); - if (i == pages.end()) - 
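A simplified standalone model of the out-of-memory handling above (hypothetical names): when storing a record fails, the final pass applies the partial batch in place and keeps parsing, while earlier passes hand the condition back to the caller so it can run a batch and rescan:

#include <cstdio>

enum parse_result { OK, PREMATURE_EOF, GOT_EOF, GOT_OOM };

struct parser
{
  int stored= 0, capacity= 3, pos= 0, end= 10;

  parse_result parse_one(bool if_exists)
  {
    if (pos == end)
      return GOT_EOF;
    if (stored == capacity)
    {
      if (!if_exists)
        return GOT_OOM;     // caller must apply a batch and rescan
      apply();              // final pass: apply what fits, then continue
    }
    ++stored, ++pos;
    return OK;
  }

  void apply() { std::printf("applying %d records\n", stored); stored= 0; }
};

int main()
{
  parser p;
  // earlier pass: stop at the first out-of-memory condition
  while (p.parse_one(false) == OK) {}
  p.apply();
  // final pass: apply partial batches inline until the log is exhausted
  while (p.parse_one(true) == OK) {}
  p.apply();
}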
continue; - i->second.log.clear(); - pages.erase(i); + if (pages_it == pages.end() || pages_it->first != id) + { + pages_it= pages.find(id); + if (pages_it == pages.end()) + continue; + } + map::iterator r= pages_it++; + erase(r); } } else if (rlen) @@ -2749,6 +2951,11 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) if (rlen < UNIV_PAGE_SIZE_MAX && !l.is_zero(rlen)) continue; } + else if (store) + { + ut_ad(file_checkpoint); + continue; + } else if (const lsn_t c= l.read8()) { if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) @@ -2830,21 +3037,27 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) if (UNIV_UNLIKELY(!recv_needed_recovery && srv_read_only_mode)) continue; + if (!store && + (srv_operation == SRV_OPERATION_BACKUP || + srv_operation == SRV_OPERATION_BACKUP_NO_DEFER)) + { + if ((b & 0xf0) < FILE_CHECKPOINT && log_file_op) + log_file_op(space_id, b & 0xf0, + reinterpret_cast(fn), + static_cast(fnend - fn), + reinterpret_cast(fn2), + fn2 ? static_cast(fn2end - fn2) : 0); + continue; + } + fil_name_process(fn, fnend - fn, space_id, (b & 0xf0) == FILE_DELETE ? FILE_DELETE : FILE_MODIFY, - start_lsn, store); - - if ((b & 0xf0) < FILE_CHECKPOINT && log_file_op) - log_file_op(space_id, b & 0xf0, - reinterpret_cast(fn), - static_cast(fnend - fn), - reinterpret_cast(fn2), - fn2 ? static_cast(fn2end - fn2) : 0); + start_lsn, if_exists); if (fn2) { fil_name_process(fn2, fn2end - fn2, space_id, - FILE_RENAME, start_lsn, store); + FILE_RENAME, start_lsn, if_exists); if (file_checkpoint) { const size_t len= fn2end - fn2; @@ -2868,18 +3081,23 @@ inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l) return OK; } -ATTRIBUTE_NOINLINE -recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(store_t store) noexcept +template +recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool if_exists) noexcept { recv_buf s{&log_sys.buf[recv_sys.offset]}; - return recv_sys.parse(store, s); + return recv_sys.parse(s, if_exists); } +/** for mariadb-backup; @see xtrabackup_copy_logfile() */ +template +recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool) noexcept; + #ifdef HAVE_PMEM -recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(store_t store) noexcept +template +recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept { - recv_sys_t::parse_mtr_result r{parse_mtr(store)}; - if (r != PREMATURE_EOF || !log_sys.is_pmem()) + recv_sys_t::parse_mtr_result r{parse_mtr(if_exists)}; + if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_pmem()) return r; ut_ad(recv_sys.len == log_sys.file_size); ut_ad(recv_sys.offset >= log_sys.START_OFFSET); @@ -2888,7 +3106,7 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(store_t store) noexcept {recv_sys.offset == recv_sys.len ? &log_sys.buf[log_sys.START_OFFSET] : &log_sys.buf[recv_sys.offset]}; - return recv_sys.parse(store, s); + return recv_sys.parse(s, if_exists); } #endif @@ -2896,23 +3114,22 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(store_t store) noexcept lsn of a log record. 
@param[in,out] block buffer pool page @param[in,out] mtr mini-transaction -@param[in,out] p recovery address +@param[in,out] recs log records to apply @param[in,out] space tablespace, or NULL if not looked up yet @param[in,out] init page initialization operation, or NULL @return the recovered page @retval nullptr on failure */ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr, - const recv_sys_t::map::iterator &p, - fil_space_t *space= nullptr, - mlog_init_t::init *init= nullptr) + page_recv_t &recs, + fil_space_t *space, + recv_init *init) { - mysql_mutex_assert_owner(&recv_sys.mutex); + mysql_mutex_assert_not_owner(&recv_sys.mutex); ut_ad(recv_sys.apply_log_recs); ut_ad(recv_needed_recovery); ut_ad(!init || init->created); ut_ad(!init || init->lsn); - ut_ad(block->page.id() == p->first); - ut_ad(!p->second.is_being_processed()); + ut_ad(recs.being_processed == 1); ut_ad(!space || space->id == block->page.id().space()); ut_ad(log_sys.is_latest()); @@ -2924,10 +3141,6 @@ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr, block->page.id().space(), block->page.id().page_no())); - p->second.state = page_recv_t::RECV_BEING_PROCESSED; - - mysql_mutex_unlock(&recv_sys.mutex); - byte *frame = UNIV_LIKELY_NULL(block->page.zip.data) ? block->page.zip.data : block->page.frame; @@ -2941,7 +3154,7 @@ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr, bool skipped_after_init = false; - for (const log_rec_t* recv : p->second.log) { + for (const log_rec_t* recv : recs.log) { const log_phys_t* l = static_cast(recv); ut_ad(l->lsn); ut_ad(end_lsn <= l->lsn); @@ -2999,8 +3212,7 @@ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr, block->page.id().space(), block->page.id().page_no())); - log_phys_t::apply_status a= l->apply(*block, - p->second.last_offset); + log_phys_t::apply_status a= l->apply(*block, recs.last_offset); switch (a) { case log_phys_t::APPLIED_NO: @@ -3067,9 +3279,6 @@ set_start_lsn: || recv_sys.is_corrupt_log()) && !srv_force_recovery) { if (init) { init->created = false; - if (space || block->page.id().page_no()) { - block->page.lock.x_lock_recursive(); - } } mtr.discard_modifications(); @@ -3123,26 +3332,11 @@ set_start_lsn: mtr.commit(); done: - time_t now = time(NULL); - - mysql_mutex_lock(&recv_sys.mutex); - + /* FIXME: do this in page read, protected with recv_sys.mutex! 
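recv_recover_page() walks the buffered records in LSN order and, as a general rule of InnoDB recovery, applies only those whose end LSN is newer than the LSN already stamped on the page, which makes replay idempotent. A standalone sketch of that rule (hypothetical names):

#include <cassert>
#include <cstdint>
#include <vector>

using lsn_t= uint64_t;

struct log_rec { lsn_t start_lsn, lsn; };   // lsn = end of the mini-transaction

struct page { lsn_t lsn= 0; int changes= 0; };

void recover(page &p, const std::vector<log_rec> &recs)
{
  for (const log_rec &r : recs)
    if (r.lsn > p.lsn)       // skip records already reflected in the page
    {
      ++p.changes;           // ... apply the record to the page frame ...
      p.lsn= r.lsn;          // advance the page LSN to the record's end LSN
    }
}

int main()
{
  page p;
  p.lsn= 150;                                  // page already flushed at LSN 150
  std::vector<log_rec> recs= {{100, 120}, {120, 160}, {160, 200}};
  recover(p, recs);
  assert(p.changes == 2 && p.lsn == 200);      // the record ending at 120 is skipped
}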
*/ if (recv_max_page_lsn < page_lsn) { recv_max_page_lsn = page_lsn; } - ut_ad(!block || p->second.is_being_processed()); - ut_ad(!block || !recv_sys.pages.empty()); - - if (recv_sys.report(now)) { - const size_t n = recv_sys.pages.size(); - sql_print_information("InnoDB: To recover: %zu pages from log", - n); - service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, - "To recover: %zu pages" - " from log", n); - } - return block; } @@ -3156,146 +3350,347 @@ ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id) mysql_mutex_lock(&mutex); map::iterator p= pages.find(page_id); - if (p != pages.end()) + if (p == pages.end()) { - p->second.log.clear(); - pages.erase(p); - if (!srv_force_recovery) - { - set_corrupt_fs(); - ib::error() << "Unable to apply log to corrupted page " << page_id - << "; set innodb_force_recovery to ignore"; - } - else - ib::warn() << "Discarding log for corrupted page " << page_id; + mysql_mutex_unlock(&mutex); + return; } - if (pages.empty()) - pthread_cond_broadcast(&cond); + p->second.being_processed= -1; + if (!srv_force_recovery) + set_corrupt_fs(); mysql_mutex_unlock(&mutex); -} -/** Possibly finish a recovery batch. */ -inline void recv_sys_t::maybe_finish_batch() -{ - mysql_mutex_assert_owner(&mutex); - ut_ad(recovery_on); - if (!apply_batch_on || pages.empty() || is_corrupt_log() || is_corrupt_fs()) - pthread_cond_broadcast(&cond); + ib::error_or_warn(!srv_force_recovery) + << "Unable to apply log to corrupted page " << page_id; } ATTRIBUTE_COLD void recv_sys_t::set_corrupt_log() { mysql_mutex_lock(&mutex); found_corrupt_log= true; - pthread_cond_broadcast(&cond); mysql_mutex_unlock(&mutex); } ATTRIBUTE_COLD void recv_sys_t::set_corrupt_fs() { mysql_mutex_assert_owner(&mutex); + if (!srv_force_recovery) + sql_print_information("InnoDB: Set innodb_force_recovery=1" + " to ignore corrupted pages."); found_corrupt_fs= true; - pthread_cond_broadcast(&cond); } -/** Apply any buffered redo log to a page that was just read from a data file. -@param[in,out] space tablespace -@param[in,out] bpage buffer pool page +/** Apply any buffered redo log to a page. +@param space tablespace +@param bpage buffer pool page @return whether the page was recovered correctly */ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage) { - mtr_t mtr; - mtr.start(); - mtr.set_log_mode(MTR_LOG_NO_REDO); + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); - ut_ad(bpage->frame); - /* Move the ownership of the x-latch on the page to - this OS thread, so that we can acquire a second - x-latch on it. This is needed for the operations to - the page to pass the debug checks. */ - bpage->lock.claim_ownership(); - bpage->lock.x_lock_recursive(); - bpage->fix_on_recovery(); - mtr.memo_push(reinterpret_cast(bpage), - MTR_MEMO_PAGE_X_FIX); + ut_ad(bpage->frame); + /* Move the ownership of the x-latch on the page to this OS thread, + so that we can acquire a second x-latch on it. This is needed for + the operations to the page to pass the debug checks. 
*/ + bpage->lock.claim_ownership(); + bpage->lock.x_lock_recursive(); + bpage->fix_on_recovery(); + mtr.memo_push(reinterpret_cast(bpage), MTR_MEMO_PAGE_X_FIX); - buf_block_t* success = reinterpret_cast(bpage); + buf_block_t *success= reinterpret_cast(bpage); - mysql_mutex_lock(&recv_sys.mutex); - if (recv_sys.apply_log_recs) { - recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id()); - if (p != recv_sys.pages.end() - && !p->second.is_being_processed()) { - success = recv_recover_page(success, mtr, p, space); - if (UNIV_LIKELY(!!success)) { - p->second.log.clear(); - recv_sys.pages.erase(p); - } - recv_sys.maybe_finish_batch(); - goto func_exit; - } - } - - mtr.commit(); -func_exit: - mysql_mutex_unlock(&recv_sys.mutex); - ut_ad(mtr.has_committed()); - return success; -} - -/** Read pages for which log needs to be applied. -@param page_id first page identifier to read -@param i iterator to recv_sys.pages */ -TRANSACTIONAL_TARGET -static void recv_read_in_area(page_id_t page_id, recv_sys_t::map::iterator i) -{ - uint32_t page_nos[32]; - ut_ad(page_id == i->first); - page_id.set_page_no(ut_2pow_round(page_id.page_no(), 32U)); - const page_id_t up_limit{page_id + 31}; - uint32_t* p= page_nos; - - for (; i != recv_sys.pages.end() && i->first <= up_limit; i++) + mysql_mutex_lock(&recv_sys.mutex); + if (recv_sys.apply_log_recs) { - if (i->second.state == page_recv_t::RECV_NOT_PROCESSED) + const page_id_t id{bpage->id()}; + recv_sys_t::map::iterator p= recv_sys.pages.find(id); + if (p == recv_sys.pages.end()); + else if (p->second.being_processed < 0) { - i->second.state= page_recv_t::RECV_BEING_READ; - *p++= i->first.page_no(); + recv_sys.pages_it_invalidate(p); + recv_sys.erase(p); + } + else + { + p->second.being_processed= 1; + recv_sys_t::init *init= nullptr; + if (p->second.skip_read) + (init= &mlog_init.last(id))->created= true; + mysql_mutex_unlock(&recv_sys.mutex); + success= recv_recover_page(success, mtr, p->second, space, init); + p->second.being_processed= -1; + goto func_exit; } } - if (p != page_nos) + mysql_mutex_unlock(&recv_sys.mutex); + mtr.commit(); +func_exit: + ut_ad(mtr.has_committed()); + return success; +} + +void IORequest::fake_read_complete(os_offset_t offset) const +{ + ut_ad(node); + ut_ad(is_read()); + ut_ad(bpage); + ut_ad(bpage->frame); + ut_ad(recv_recovery_is_on()); + ut_ad(offset); + + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + ut_ad(bpage->frame); + /* Move the ownership of the x-latch on the page to this OS thread, + so that we can acquire a second x-latch on it. This is needed for + the operations to the page to pass the debug checks. 
*/ + bpage->lock.claim_ownership(); + bpage->lock.x_lock_recursive(); + bpage->fix_on_recovery(); + mtr.memo_push(reinterpret_cast(bpage), MTR_MEMO_PAGE_X_FIX); + + page_recv_t &recs= *reinterpret_cast(slot); + ut_ad(recs.being_processed == 1); + recv_init &init= *reinterpret_cast(offset); + ut_ad(init.lsn > 1); + init.created= true; + + if (recv_recover_page(reinterpret_cast(bpage), + mtr, recs, node->space, &init)) { - mysql_mutex_unlock(&recv_sys.mutex); - buf_read_recv_pages(page_id.space(), {page_nos, p}); - mysql_mutex_lock(&recv_sys.mutex); + ut_ad(bpage->oldest_modification() || bpage->is_freed()); + bpage->lock.x_unlock(true); + } + recs.being_processed= -1; + ut_ad(mtr.has_committed()); + + node->space->release(); +} + +/** @return whether a page has been freed */ +inline bool fil_space_t::is_freed(uint32_t page) +{ + std::lock_guard freed_lock(freed_range_mutex); + return freed_ranges.contains(page); +} + +bool recv_sys_t::report(time_t time) +{ + if (time - progress_time < 15) + return false; + progress_time= time; + return true; +} + +ATTRIBUTE_COLD +void recv_sys_t::report_progress() const +{ + mysql_mutex_assert_owner(&mutex); + const size_t n{pages.size()}; + if (recv_sys.scanned_lsn == recv_sys.lsn) + { + sql_print_information("InnoDB: To recover: %zu pages", n); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "To recover: %zu pages", n); + } + else + { + sql_print_information("InnoDB: To recover: LSN " LSN_PF + "/" LSN_PF "; %zu pages", + recv_sys.lsn, recv_sys.scanned_lsn, n); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "To recover: LSN " LSN_PF + "/" LSN_PF "; %zu pages", + recv_sys.lsn, recv_sys.scanned_lsn, n); } } +/** Apply a recovery batch. +@param space_id current tablespace identifier +@param space current tablespace +@param free_block spare buffer block +@param last_batch whether it is possible to write more redo log +@return whether the caller must provide a new free_block */ +bool recv_sys_t::apply_batch(uint32_t space_id, fil_space_t *&space, + buf_block_t *&free_block, bool last_batch) +{ + mysql_mutex_assert_owner(&mutex); + ut_ad(pages_it != pages.end()); + ut_ad(!pages_it->second.log.empty()); + + mysql_mutex_lock(&buf_pool.mutex); + size_t n= 0, max_n= std::min(BUF_LRU_MIN_LEN, + UT_LIST_GET_LEN(buf_pool.LRU) + + UT_LIST_GET_LEN(buf_pool.free)); + mysql_mutex_unlock(&buf_pool.mutex); + + map::iterator begin= pages.end(); + page_id_t begin_id{~0ULL}; + + while (pages_it != pages.end() && n < max_n) + { + ut_ad(!buf_dblwr.is_inside(pages_it->first)); + if (!pages_it->second.being_processed) + { + if (space_id != pages_it->first.space()) + { + space_id= pages_it->first.space(); + if (space) + space->release(); + space= fil_space_t::get(space_id); + if (!space) + { + auto d= deferred_spaces.defers.find(space_id); + if (d == deferred_spaces.defers.end() || d->second.deleted) + /* For deleted files we preserve the deferred_spaces entry */; + else if (!free_block) + return true; + else + { + space= recover_deferred(pages_it, d->second.file_name, free_block); + deferred_spaces.defers.erase(d); + if (!space && !srv_force_recovery) + { + set_corrupt_fs(); + return false; + } + } + } + } + if (!space || space->is_freed(pages_it->first.page_no())) + pages_it->second.being_processed= -1; + else if (!n++) + { + begin= pages_it; + begin_id= pages_it->first; + } + } + pages_it++; + } + + if (!last_batch) + log_sys.latch.wr_unlock(); + + pages_it= begin; + + if (report(time(nullptr))) + report_progress(); + + if (!n) + goto wait; 
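The being_processed field driving apply_batch() above acts as a small per-page state machine: 0 means log buffered but unclaimed, 1 means some thread is currently applying it, -1 means fully applied and awaiting garbage collection. A standalone sketch (hypothetical names):

#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>

struct page_recs { int being_processed= 0; /* plus the list of log snippets */ };

using page_map= std::map<uint64_t, page_recs>;

bool claim(page_map &pages, uint64_t id)
{
  auto it= pages.find(id);
  if (it == pages.end() || it->second.being_processed)
    return false;                 // absent, busy, or already done
  it->second.being_processed= 1;  // this thread now owns the page's log
  return true;
}

void finish(page_map &pages, uint64_t id) { pages[id].being_processed= -1; }

void garbage_collect(page_map &pages)
{
  for (auto it= pages.begin(); it != pages.end(); )
    it= it->second.being_processed < 0 ? pages.erase(it) : std::next(it);
}

int main()
{
  page_map pages;
  pages[1]; pages[2];                  // two pages have buffered log
  bool first= claim(pages, 1);         // this thread takes ownership of page 1
  bool second= claim(pages, 1);        // a second claim is rejected
  finish(pages, 1);                    // page 1 is fully applied
  garbage_collect(pages);              // its entry is removed, page 2 stays
  assert(first && !second && pages.size() == 1 && pages.count(2));
}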
+ + mysql_mutex_lock(&buf_pool.mutex); + + if (UNIV_UNLIKELY(UT_LIST_GET_LEN(buf_pool.free) < n)) + { + mysql_mutex_unlock(&buf_pool.mutex); + wait: + wait_for_pool(n); + if (n); + else if (!last_batch) + goto unlock_relock; + else + goto get_last; + pages_it= pages.lower_bound(begin_id); + ut_ad(pages_it != pages.end()); + } + else + mysql_mutex_unlock(&buf_pool.mutex); + + while (pages_it != pages.end()) + { + ut_ad(!buf_dblwr.is_inside(pages_it->first)); + if (!pages_it->second.being_processed) + { + const page_id_t id{pages_it->first}; + + if (space_id != id.space()) + { + space_id= id.space(); + if (space) + space->release(); + space= fil_space_t::get(space_id); + } + if (!space) + { + const auto it= deferred_spaces.defers.find(space_id); + if (it != deferred_spaces.defers.end() && !it->second.deleted) + /* The records must be processed after recover_deferred(). */ + goto next; + goto space_not_found; + } + else if (space->is_freed(id.page_no())) + { + space_not_found: + pages_it->second.being_processed= -1; + goto next; + } + else + { + page_recv_t &recs= pages_it->second; + ut_ad(!recs.log.empty()); + recs.being_processed= 1; + init *init= recs.skip_read ? &mlog_init.last(id) : nullptr; + mysql_mutex_unlock(&mutex); + buf_read_recover(space, id, recs, init); + } + + if (!--n) + { + if (last_batch) + goto relock_last; + goto relock; + } + mysql_mutex_lock(&mutex); + pages_it= pages.lower_bound(id); + } + else + next: + pages_it++; + } + + if (!last_batch) + { + unlock_relock: + mysql_mutex_unlock(&mutex); + relock: + log_sys.latch.wr_lock(SRW_LOCK_CALL); + relock_last: + mysql_mutex_lock(&mutex); + get_last: + pages_it= pages.lower_bound(begin_id); + } + + return false; +} + /** Attempt to initialize a page based on redo log records. -@param page_id page identifier -@param p iterator pointing to page_id +@param p iterator @param mtr mini-transaction @param b pre-allocated buffer pool block +@param init page initialization @return the recovered block @retval nullptr if the page cannot be initialized based on log records @retval -1 if the page cannot be recovered due to corruption */ -inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id, - map::iterator &p, mtr_t &mtr, - buf_block_t *b) +inline buf_block_t *recv_sys_t::recover_low(const map::iterator &p, mtr_t &mtr, + buf_block_t *b, init &init) { - mysql_mutex_assert_owner(&mutex); - ut_ad(p->first == page_id); + mysql_mutex_assert_not_owner(&mutex); page_recv_t &recs= p->second; - ut_ad(recs.state == page_recv_t::RECV_WILL_NOT_READ); + ut_ad(recs.skip_read); + ut_ad(recs.being_processed == 1); buf_block_t* block= nullptr; - mlog_init_t::init &i= mlog_init.last(page_id); const lsn_t end_lsn= recs.log.last()->lsn; - if (end_lsn < i.lsn) - DBUG_LOG("ib_log", "skip log for page " << page_id - << " LSN " << end_lsn << " < " << i.lsn); - fil_space_t *space= fil_space_t::get(page_id.space()); + if (end_lsn < init.lsn) + DBUG_LOG("ib_log", "skip log for page " << p->first + << " LSN " << end_lsn << " < " << init.lsn); + fil_space_t *space= fil_space_t::get(p->first.space()); mtr.start(); mtr.set_log_mode(MTR_LOG_NO_REDO); @@ -3304,82 +3699,77 @@ inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id, if (!space) { - if (page_id.page_no() != 0) + if (p->first.page_no() != 0) { nothing_recoverable: mtr.commit(); return nullptr; } - auto it= recv_spaces.find(page_id.space()); + auto it= recv_spaces.find(p->first.space()); ut_ad(it != recv_spaces.end()); uint32_t flags= it->second.flags; zip_size= 
fil_space_t::zip_size(flags); - block= buf_page_create_deferred(page_id.space(), zip_size, &mtr, b); + block= buf_page_create_deferred(p->first.space(), zip_size, &mtr, b); ut_ad(block == b); block->page.lock.x_lock_recursive(); } else { - block= buf_page_create(space, page_id.page_no(), zip_size, &mtr, b); + block= buf_page_create(space, p->first.page_no(), zip_size, &mtr, b); if (UNIV_UNLIKELY(block != b)) { /* The page happened to exist in the buffer pool, or it was just being read in. Before the exclusive page latch was acquired by buf_page_create(), all changes to the page must have been applied. */ - ut_ad(pages.find(page_id) == pages.end()); + ut_d(mysql_mutex_lock(&mutex)); + ut_ad(pages.find(p->first) == pages.end()); + ut_d(mysql_mutex_unlock(&mutex)); space->release(); goto nothing_recoverable; } } - ut_ad(&recs == &pages.find(page_id)->second); - i.created= true; - map::iterator r= p++; - block= recv_recover_page(block, mtr, r, space, &i); + ut_d(mysql_mutex_lock(&mutex)); + ut_ad(&recs == &pages.find(p->first)->second); + ut_d(mysql_mutex_unlock(&mutex)); + init.created= true; + block= recv_recover_page(block, mtr, recs, space, &init); ut_ad(mtr.has_committed()); - if (block) - { - recs.log.clear(); - pages.erase(r); - } - else - block= reinterpret_cast(-1); - - if (pages.empty()) - pthread_cond_signal(&cond); - if (space) space->release(); - return block; + return block ? block : reinterpret_cast(-1); } /** Attempt to initialize a page based on redo log records. @param page_id page identifier @return recovered block @retval nullptr if the page cannot be initialized based on log records */ -buf_block_t *recv_sys_t::recover_low(const page_id_t page_id) +ATTRIBUTE_COLD buf_block_t *recv_sys_t::recover_low(const page_id_t page_id) { - buf_block_t *free_block= buf_LRU_get_free_block(false); - buf_block_t *block= nullptr; - mysql_mutex_lock(&mutex); map::iterator p= pages.find(page_id); - if (p != pages.end() && p->second.state == page_recv_t::RECV_WILL_NOT_READ) + if (p != pages.end() && !p->second.being_processed && p->second.skip_read) { + p->second.being_processed= 1; + init &init= mlog_init.last(page_id); + mysql_mutex_unlock(&mutex); + buf_block_t *free_block= buf_LRU_get_free_block(false); mtr_t mtr; - block= recover_low(page_id, p, mtr, free_block); + buf_block_t *block= recover_low(p, mtr, free_block, init); + p->second.being_processed= -1; ut_ad(!block || block == reinterpret_cast(-1) || block == free_block); + if (UNIV_UNLIKELY(!block)) + buf_pool.free_block(free_block); + return block; } mysql_mutex_unlock(&mutex); - if (UNIV_UNLIKELY(!block)) - buf_pool.free_block(free_block); - return block; + return nullptr; } inline fil_space_t *fil_system_t::find(const char *path) const @@ -3427,45 +3817,18 @@ void recv_sys_t::apply(bool last_batch) mysql_mutex_assert_owner(&mutex); - timespec abstime; - - while (apply_batch_on) - { - if (is_corrupt_log()) - return; - if (last_batch) - my_cond_wait(&cond, &mutex.m_mutex); - else - { -#ifndef SUX_LOCK_GENERIC - ut_ad(log_sys.latch.is_write_locked()); -#endif - log_sys.latch.wr_unlock(); - set_timespec_nsec(abstime, 500000000ULL); /* 0.5s */ - my_cond_timedwait(&cond, &mutex.m_mutex, &abstime); - mysql_mutex_unlock(&mutex); - log_sys.latch.wr_lock(SRW_LOCK_CALL); - mysql_mutex_lock(&mutex); - } - } - - recv_no_ibuf_operations = !last_batch || - srv_operation == SRV_OPERATION_RESTORE || - srv_operation == SRV_OPERATION_RESTORE_EXPORT; - - mtr_t mtr; + garbage_collect(); if (!pages.empty()) { - const char *msg= last_batch - ? 
"Starting final batch to recover" - : "Starting a batch to recover"; - const size_t n= pages.size(); - sql_print_information("InnoDB: %s %zu pages from redo log.", msg, n); - sd_notifyf(0, "STATUS=%s %zu pages from redo log", msg, n); + recv_no_ibuf_operations = !last_batch || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT; + ut_ad(!last_batch || lsn == scanned_lsn); + progress_time= time(nullptr); + report_progress(); apply_log_recs= true; - apply_batch_on= true; for (auto id= srv_undo_tablespaces_open; id--;) { @@ -3481,142 +3844,83 @@ void recv_sys_t::apply(bool last_batch) if (fil_space_t *space = fil_space_get(id + srv_undo_space_id_start)) { ut_ad(UT_LIST_GET_LEN(space->chain) == 1); + ut_ad(space->recv_size >= t.pages); fil_node_t *file= UT_LIST_GET_FIRST(space->chain); ut_ad(file->is_open()); os_file_truncate(file->name, file->handle, - os_offset_t{t.pages} << srv_page_size_shift, true); + os_offset_t{space->recv_size} << + srv_page_size_shift, true); } } } fil_system.extend_to_recv_size(); - /* We must release log_sys.latch and recv_sys.mutex before - invoking buf_LRU_get_free_block(). Allocating a block may initiate - a redo log write and therefore acquire log_sys.latch. To avoid - deadlocks, log_sys.latch must not be acquired while holding - recv_sys.mutex. */ - mysql_mutex_unlock(&mutex); - if (!last_batch) - log_sys.latch.wr_unlock(); + fil_space_t *space= nullptr; + uint32_t space_id= ~0; + buf_block_t *free_block= nullptr; - buf_block_t *free_block= buf_LRU_get_free_block(false); - - if (!last_batch) - log_sys.latch.wr_lock(SRW_LOCK_CALL); - mysql_mutex_lock(&mutex); - - for (map::iterator p= pages.begin(); p != pages.end(); ) + for (pages_it= pages.begin(); pages_it != pages.end(); + pages_it= pages.begin()) { - const page_id_t page_id= p->first; - ut_ad(!p->second.log.empty()); - - const uint32_t space_id= page_id.space(); - auto d= deferred_spaces.defers.find(space_id); - if (d != deferred_spaces.defers.end()) + if (!free_block) { - if (d->second.deleted) - { - /* For deleted files we must preserve the entry in deferred_spaces */ -erase_for_space: - while (p != pages.end() && p->first.space() == space_id) - { - map::iterator r= p++; - r->second.log.clear(); - pages.erase(r); - } - } - else if (recover_deferred(p, d->second.file_name, free_block)) - { - if (!srv_force_recovery) - set_corrupt_fs(); - deferred_spaces.defers.erase(d); - goto erase_for_space; - } - else - deferred_spaces.defers.erase(d); - if (!free_block) - goto next_free_block; - p= pages.lower_bound(page_id); - continue; + if (!last_batch) + log_sys.latch.wr_unlock(); + wait_for_pool(1); + pages_it= pages.begin(); + mysql_mutex_unlock(&mutex); + /* We must release log_sys.latch and recv_sys.mutex before + invoking buf_LRU_get_free_block(). Allocating a block may initiate + a redo log write and therefore acquire log_sys.latch. To avoid + deadlocks, log_sys.latch must not be acquired while holding + recv_sys.mutex. 
*/ + free_block= buf_LRU_get_free_block(false); + if (!last_batch) + log_sys.latch.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&mutex); + pages_it= pages.begin(); } - switch (p->second.state) { - case page_recv_t::RECV_BEING_READ: - case page_recv_t::RECV_BEING_PROCESSED: - p++; - continue; - case page_recv_t::RECV_WILL_NOT_READ: - if (UNIV_LIKELY(!!recover_low(page_id, p, mtr, free_block))) - { -next_free_block: - mysql_mutex_unlock(&mutex); - if (!last_batch) - log_sys.latch.wr_unlock(); - free_block= buf_LRU_get_free_block(false); - if (!last_batch) - log_sys.latch.wr_lock(SRW_LOCK_CALL); - mysql_mutex_lock(&mutex); - break; - } - ut_ad(p == pages.end() || p->first > page_id); - continue; - case page_recv_t::RECV_NOT_PROCESSED: - recv_read_in_area(page_id, p); - } - p= pages.lower_bound(page_id); - /* Ensure that progress will be made. */ - ut_ad(p == pages.end() || p->first > page_id || - p->second.state >= page_recv_t::RECV_BEING_READ); - } - - buf_pool.free_block(free_block); - - /* Wait until all the pages have been processed */ - for (;;) - { - const bool empty= pages.empty(); - if (empty && !os_aio_pending_reads()) - break; - - if (!is_corrupt_fs() && !is_corrupt_log()) + while (pages_it != pages.end()) { - if (last_batch) + if (is_corrupt_fs() || is_corrupt_log()) { - if (!empty) - my_cond_wait(&cond, &mutex.m_mutex); - else + if (space) + space->release(); + if (free_block) { mysql_mutex_unlock(&mutex); - os_aio_wait_until_no_pending_reads(false); + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_block_free_non_file_page(free_block); + mysql_mutex_unlock(&buf_pool.mutex); mysql_mutex_lock(&mutex); - ut_ad(pages.empty()); } + return; } - else - { -#ifndef SUX_LOCK_GENERIC - ut_ad(log_sys.latch.is_write_locked()); -#endif - log_sys.latch.wr_unlock(); - set_timespec_nsec(abstime, 500000000ULL); /* 0.5s */ - my_cond_timedwait(&cond, &mutex.m_mutex, &abstime); - mysql_mutex_unlock(&mutex); - log_sys.latch.wr_lock(SRW_LOCK_CALL); - mysql_mutex_lock(&mutex); - } - continue; + if (apply_batch(space_id, space, free_block, last_batch)) + break; } - if (is_corrupt_fs() && !srv_force_recovery) - sql_print_information("InnoDB: Set innodb_force_recovery=1" - " to ignore corrupted pages."); - return; + } + + if (space) + space->release(); + + if (free_block) + { + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_block_free_non_file_page(free_block); + mysql_mutex_unlock(&buf_pool.mutex); } } if (last_batch) - /* We skipped this in buf_page_create(). */ - mlog_init.mark_ibuf_exist(mtr); + { + if (!recv_no_ibuf_operations) + /* We skipped this in buf_page_create(). */ + mlog_init.mark_ibuf_exist(); + mlog_init.clear(); + } else { mlog_init.reset(); @@ -3625,21 +3929,22 @@ next_free_block: mysql_mutex_unlock(&mutex); - if (last_batch && srv_operation != SRV_OPERATION_RESTORE && - srv_operation != SRV_OPERATION_RESTORE_EXPORT) - /* Instead of flushing, last_batch sorts the buf_pool.flush_list - in ascending order of buf_page_t::oldest_modification. */ - log_sort_flush_list(); - else - buf_flush_sync_batch(lsn); - if (!last_batch) { + buf_flush_sync_batch(lsn); buf_pool_invalidate(); log_sys.latch.wr_lock(SRW_LOCK_CALL); } + else if (srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT) + buf_flush_sync_batch(lsn); + else + /* Instead of flushing, last_batch sorts the buf_pool.flush_list + in ascending order of buf_page_t::oldest_modification. 
*/ + log_sort_flush_list(); + #ifdef HAVE_PMEM - else if (log_sys.is_pmem()) + if (last_batch && log_sys.is_pmem()) mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE); #endif @@ -3649,35 +3954,24 @@ next_free_block: clear(); } -/** Check whether the number of read redo log blocks exceeds the maximum. -@return whether the memory is exhausted */ -inline bool recv_sys_t::is_memory_exhausted() -{ - if (UT_LIST_GET_LEN(blocks) * 3 < buf_pool.get_n_pages()) - return false; - DBUG_PRINT("ib_log",("Ran out of memory and last stored lsn " LSN_PF - " last stored offset %zu\n", lsn, offset)); - return true; -} - /** Scan log_t::FORMAT_10_8 log store records to the parsing buffer. @param last_phase whether changes can be applied to the tablespaces @return whether rescan is needed (not everything was stored) */ static bool recv_scan_log(bool last_phase) { DBUG_ENTER("recv_scan_log"); - DBUG_ASSERT(!last_phase || recv_sys.file_checkpoint); ut_ad(log_sys.is_latest()); const size_t block_size_1{log_sys.get_block_size() - 1}; mysql_mutex_lock(&recv_sys.mutex); - recv_sys.clear(); ut_d(recv_sys.after_apply= last_phase); - ut_ad(!last_phase || recv_sys.file_checkpoint); + if (!last_phase) + recv_sys.clear(); + else + ut_ad(recv_sys.file_checkpoint); - store_t store= last_phase - ? STORE_IF_EXISTS : recv_sys.file_checkpoint ? STORE_YES : STORE_NO; + bool store{recv_sys.file_checkpoint != 0}; size_t buf_size= log_sys.buf_size; #ifdef HAVE_PMEM if (log_sys.is_pmem()) @@ -3694,6 +3988,7 @@ static bool recv_scan_log(bool last_phase) recv_sys.len= 0; } + lsn_t rewound_lsn= 0; for (ut_d(lsn_t source_offset= 0);;) { #ifndef SUX_LOCK_GENERIC @@ -3741,27 +4036,29 @@ static bool recv_scan_log(bool last_phase) if (UNIV_UNLIKELY(!recv_needed_recovery)) { - ut_ad(store == (recv_sys.file_checkpoint ? 
STORE_YES : STORE_NO)); + ut_ad(!last_phase); ut_ad(recv_sys.lsn >= log_sys.next_checkpoint_lsn); - for (;;) + if (!store) { - const byte& b{log_sys.buf[recv_sys.offset]}; - r= recv_sys.parse_pmem(store); - if (r == recv_sys_t::OK) + ut_ad(!recv_sys.file_checkpoint); + for (;;) { - if (store == STORE_NO && - (b == FILE_CHECKPOINT + 2 + 8 || (b & 0xf0) == FILE_MODIFY)) - continue; - } - else if (r == recv_sys_t::PREMATURE_EOF) - goto read_more; - else if (store != STORE_NO) - break; + const byte& b{log_sys.buf[recv_sys.offset]}; + r= recv_sys.parse_pmem(false); + switch (r) { + case recv_sys_t::PREMATURE_EOF: + goto read_more; + default: + ut_ad(r == recv_sys_t::GOT_EOF); + break; + case recv_sys_t::OK: + if (b == FILE_CHECKPOINT + 2 + 8 || (b & 0xf0) == FILE_MODIFY) + continue; + } - if (store == STORE_NO) - { const lsn_t end{recv_sys.file_checkpoint}; + ut_ad(!end || end == recv_sys.lsn); mysql_mutex_unlock(&recv_sys.mutex); if (!end) @@ -3771,45 +4068,73 @@ static bool recv_scan_log(bool last_phase) ") at " LSN_PF, log_sys.next_checkpoint_lsn, recv_sys.lsn); } - else - ut_ad(end == recv_sys.lsn); DBUG_RETURN(true); } - - recv_needed_recovery= true; - if (srv_read_only_mode) - { - mysql_mutex_unlock(&recv_sys.mutex); - DBUG_RETURN(false); + } + else + { + ut_ad(recv_sys.file_checkpoint != 0); + switch ((r= recv_sys.parse_pmem(false))) { + case recv_sys_t::PREMATURE_EOF: + goto read_more; + case recv_sys_t::GOT_EOF: + break; + default: + ut_ad(r == recv_sys_t::OK); + recv_needed_recovery= true; + if (srv_read_only_mode) + { + mysql_mutex_unlock(&recv_sys.mutex); + DBUG_RETURN(false); + } + sql_print_information("InnoDB: Starting crash recovery from" + " checkpoint LSN=" LSN_PF, + log_sys.next_checkpoint_lsn); } - sql_print_information("InnoDB: Starting crash recovery from" - " checkpoint LSN=" LSN_PF, - log_sys.next_checkpoint_lsn); - break; } } - while ((r= recv_sys.parse_pmem(store)) == recv_sys_t::OK) + if (!store) + skip_the_rest: + while ((r= recv_sys.parse_pmem(false)) == recv_sys_t::OK); + else { - if (store != STORE_NO && recv_sys.is_memory_exhausted()) + uint16_t count= 0; + while ((r= recv_sys.parse_pmem(last_phase)) == recv_sys_t::OK) + if (!++count && recv_sys.report(time(nullptr))) + { + const size_t n= recv_sys.pages.size(); + sql_print_information("InnoDB: Parsed redo log up to LSN=" LSN_PF + "; to recover: %zu pages", recv_sys.lsn, n); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Parsed redo log up to LSN=" LSN_PF + "; to recover: %zu pages", + recv_sys.lsn, n); + } + if (r == recv_sys_t::GOT_OOM) { - ut_ad(last_phase == (store == STORE_IF_EXISTS)); - if (store == STORE_YES) - { - store= STORE_NO; - recv_sys.last_stored_lsn= recv_sys.lsn; - } - else - { - ut_ad(store == STORE_IF_EXISTS); - recv_sys.apply(false); - } + ut_ad(!last_phase); + rewound_lsn= recv_sys.lsn; + store= false; + if (recv_sys.scanned_lsn <= 1) + goto skip_the_rest; + ut_ad(recv_sys.file_checkpoint); + goto func_exit; } } if (r != recv_sys_t::PREMATURE_EOF) { ut_ad(r == recv_sys_t::GOT_EOF); + got_eof: + ut_ad(recv_sys.is_initialised()); + if (recv_sys.scanned_lsn > 1) + { + ut_ad(recv_sys.scanned_lsn == recv_sys.lsn); + break; + } + recv_sys.scanned_lsn= recv_sys.lsn; + sql_print_information("InnoDB: End of log at LSN=" LSN_PF, recv_sys.lsn); break; } @@ -3822,7 +4147,7 @@ static bool recv_scan_log(bool last_phase) break; if (recv_sys.offset < log_sys.get_block_size()) - break; + goto got_eof; if (recv_sys.offset > buf_size / 4 || (recv_sys.offset > block_size_1 && @@ -3835,21 
+4160,21 @@ static bool recv_scan_log(bool last_phase) } } - const bool corrupt= recv_sys.is_corrupt_log() || recv_sys.is_corrupt_fs(); - recv_sys.maybe_finish_batch(); if (last_phase) + { + ut_ad(!rewound_lsn); + ut_ad(recv_sys.lsn >= recv_sys.file_checkpoint); log_sys.set_recovered_lsn(recv_sys.lsn); + } + else if (rewound_lsn) + { + ut_ad(!store); + ut_ad(recv_sys.file_checkpoint); + recv_sys.lsn= rewound_lsn; + } +func_exit: mysql_mutex_unlock(&recv_sys.mutex); - - if (corrupt) - DBUG_RETURN(false); - - DBUG_PRINT("ib_log", - ("%s " LSN_PF " completed", last_phase ? "rescan" : "scan", - recv_sys.lsn)); - ut_ad(!last_phase || recv_sys.lsn >= recv_sys.file_checkpoint); - - DBUG_RETURN(store == STORE_NO); + DBUG_RETURN(!store); } /** Report a missing tablespace for which page-redo log exists. @@ -3945,8 +4270,8 @@ next: /* fall through */ case file_name_t::DELETED: recv_sys_t::map::iterator r = p++; - r->second.log.clear(); - recv_sys.pages.erase(r); + recv_sys.pages_it_invalidate(r); + recv_sys.erase(r); continue; } ut_ad(0); @@ -3970,8 +4295,6 @@ func_exit: continue; } - missing_tablespace = true; - if (srv_force_recovery) { sql_print_warning("InnoDB: Tablespace " UINT32PF " was not found at %.*s," @@ -3991,14 +4314,11 @@ func_exit: rs.first, int(rs.second.name.size()), rs.second.name.data()); + } else { + missing_tablespace = true; } } - if (!rescan || srv_force_recovery > 0) { - missing_tablespace = false; - } - - err = DB_SUCCESS; goto func_exit; } @@ -4232,35 +4552,41 @@ read_only_recovery: goto early_exit; } - /* If there is any missing tablespace and rescan is needed - then there is a possiblity that hash table will not contain - all space ids redo logs. Rescan the remaining unstored - redo logs for the validation of missing tablespace. */ - ut_ad(rescan || !missing_tablespace); + if (missing_tablespace) { + ut_ad(rescan); + /* If any tablespaces seem to be missing, + validate the remaining log records. */ - while (missing_tablespace) { - recv_sys.lsn = recv_sys.last_stored_lsn; - DBUG_PRINT("ib_log", ("Rescan of redo log to validate " - "the missing tablespace. Scan " - "from last stored LSN " LSN_PF, - recv_sys.lsn)); - rescan = recv_scan_log(false); - ut_ad(!recv_sys.is_corrupt_fs()); + do { + rescan = recv_scan_log(false); + ut_ad(!recv_sys.is_corrupt_fs()); - missing_tablespace = false; + if (recv_sys.is_corrupt_log()) { + goto err_exit; + } - if (recv_sys.is_corrupt_log()) { - goto err_exit; - } + missing_tablespace = false; - err = recv_validate_tablespace( - rescan, missing_tablespace); + err = recv_validate_tablespace( + rescan, missing_tablespace); - if (err != DB_SUCCESS) { - goto early_exit; - } + if (err != DB_SUCCESS) { + goto early_exit; + } + } while (missing_tablespace); rescan = true; + /* Because in the loop above we overwrote the + initially stored recv_sys.pages, we must + restart parsing the log from the very beginning. */ + + /* FIXME: Use a separate loop for checking for + tablespaces (not individual pages), while retaining + the initial recv_sys.pages. 
*/
+		mysql_mutex_lock(&recv_sys.mutex);
+		recv_sys.clear();
+		recv_sys.lsn = log_sys.next_checkpoint_lsn;
+		mysql_mutex_unlock(&recv_sys.mutex);
 	}
 	if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) {
@@ -4271,8 +4597,7 @@ read_only_recovery:
 		ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN);
 		if (rescan) {
-			recv_sys.lsn = log_sys.next_checkpoint_lsn;
-			rescan = recv_scan_log(true);
+			recv_scan_log(true);
 			if ((recv_sys.is_corrupt_log() && !srv_force_recovery) || recv_sys.is_corrupt_fs()) {
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 1354de3cef2..0a915f9c168 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -3427,15 +3427,12 @@ os_file_get_status(
 	return(ret);
 }
-
-extern void fil_aio_callback(const IORequest &request);
-
-static void io_callback(tpool::aiocb *cb)
+static void io_callback_errorcheck(const tpool::aiocb *cb)
 {
-  const IORequest &request= *static_cast<const IORequest*>
-    (static_cast<void*>(cb->m_userdata));
   if (cb->m_err != DB_SUCCESS)
   {
+    const IORequest &request= *static_cast<const IORequest*>
+      (static_cast<void*>(cb->m_userdata));
    ib::fatal() << "IO Error: " << cb->m_err << " during " <<
      (request.is_async() ? "async " : "sync ") <<
      (request.is_LRU() ? "lru " : "") <<
@@ -3443,19 +3440,36 @@ static void io_callback(tpool::aiocb *cb)
      " of " << cb->m_len << " bytes, for file " << cb->m_fh << ", returned " << cb->m_ret_len;
   }
-  /* Return cb back to cache*/
-  if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
-  {
-    ut_ad(read_slots->contains(cb));
-    fil_aio_callback(request);
-    read_slots->release(cb);
-  }
-  else
-  {
-    ut_ad(write_slots->contains(cb));
-    fil_aio_callback(request);
-    write_slots->release(cb);
-  }
+}
+
+static void fake_io_callback(void *c)
+{
+  tpool::aiocb *cb= static_cast<tpool::aiocb*>(c);
+  ut_ad(read_slots->contains(cb));
+  static_cast<const IORequest*>(static_cast<void*>(cb->m_userdata))->
+    fake_read_complete(cb->m_offset);
+  read_slots->release(cb);
+}
+
+static void read_io_callback(void *c)
+{
+  tpool::aiocb *cb= static_cast<tpool::aiocb*>(c);
+  ut_ad(cb->m_opcode == tpool::aio_opcode::AIO_PREAD);
+  io_callback_errorcheck(cb);
+  ut_ad(read_slots->contains(cb));
+  static_cast<const IORequest*>
+    (static_cast<void*>(cb->m_userdata))->read_complete();
+  read_slots->release(cb);
+}
+
+static void write_io_callback(void *c)
+{
+  tpool::aiocb *cb= static_cast<tpool::aiocb*>(c);
+  ut_ad(cb->m_opcode == tpool::aio_opcode::AIO_PWRITE);
+  ut_ad(write_slots->contains(cb));
+  static_cast<const IORequest*>
+    (static_cast<void*>(cb->m_userdata))->write_complete();
+  write_slots->release(cb);
 }
 #ifdef LINUX_NATIVE_AIO
@@ -3758,6 +3772,28 @@ void os_aio_wait_until_no_pending_reads(bool declare)
 	tpool::tpool_wait_end();
 }
+/** Submit a fake read request during crash recovery.
+@param type fake read request
+@param offset additional context */
+void os_fake_read(const IORequest &type, os_offset_t offset)
+{
+  tpool::aiocb *cb= read_slots->acquire();
+
+  cb->m_group= read_slots->get_task_group();
+  cb->m_fh= type.node->handle.m_file;
+  cb->m_buffer= nullptr;
+  cb->m_len= 0;
+  cb->m_offset= offset;
+  cb->m_opcode= tpool::aio_opcode::AIO_PREAD;
+  new (cb->m_userdata) IORequest{type};
+  cb->m_internal_task.m_func= fake_io_callback;
+  cb->m_internal_task.m_arg= cb;
+  cb->m_internal_task.m_group= cb->m_group;
+
+  srv_thread_pool->submit_task(&cb->m_internal_task);
+}
+
+
 /** Request a read or write.
@param type I/O request @param buf buffer @@ -3803,23 +3839,32 @@ func_exit: return err; } + io_slots* slots; + tpool::callback_func callback; + tpool::aio_opcode opcode; + if (type.is_read()) { ++os_n_file_reads; + slots = read_slots; + callback = read_io_callback; + opcode = tpool::aio_opcode::AIO_PREAD; } else { ++os_n_file_writes; + slots = write_slots; + callback = write_io_callback; + opcode = tpool::aio_opcode::AIO_PWRITE; } compile_time_assert(sizeof(IORequest) <= tpool::MAX_AIO_USERDATA_LEN); - io_slots* slots= type.is_read() ? read_slots : write_slots; tpool::aiocb* cb = slots->acquire(); cb->m_buffer = buf; - cb->m_callback = (tpool::callback_func)io_callback; + cb->m_callback = callback; cb->m_group = slots->get_task_group(); cb->m_fh = type.node->handle.m_file; cb->m_len = (int)n; cb->m_offset = offset; - cb->m_opcode = type.is_read() ? tpool::aio_opcode::AIO_PREAD : tpool::aio_opcode::AIO_PWRITE; + cb->m_opcode = opcode; new (cb->m_userdata) IORequest{type}; if (srv_thread_pool->submit_io(cb)) { @@ -3827,6 +3872,7 @@ func_exit: os_file_handle_error(type.node->name, type.is_read() ? "aio read" : "aio write"); err = DB_IO_ERROR; + type.node->space->release(); } goto func_exit; diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index d177cc3a129..1f46da67943 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -214,14 +214,14 @@ row_ins_sec_index_entry_by_modify( made to the clustered index, and completed the secondary index creation before we got here. In this case, the change would already be there. The CREATE - INDEX should be waiting for a MySQL meta-data lock - upgrade at least until this INSERT or UPDATE - returns. After that point, set_committed(true) - would be invoked in commit_inplace_alter_table(). */ + INDEX should be in wait_while_table_is_used() at least + until this INSERT or UPDATE returns. After that point, + set_committed(true) would be invoked in + commit_inplace_alter_table(). */ ut_a(update->n_fields == 0); - ut_a(!cursor->index()->is_committed()); ut_ad(!dict_index_is_online_ddl(cursor->index())); - return(DB_SUCCESS); + return cursor->index()->is_committed() + ? 
DB_CORRUPTION : DB_SUCCESS; } if (mode == BTR_MODIFY_LEAF) { diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index cfb42ec46b2..3f62eb79aed 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -615,6 +615,8 @@ row_purge_del_mark( const auto type= node->index->type; if (type & (DICT_FTS | DICT_CORRUPT)) continue; + if (node->index->online_status > ONLINE_INDEX_CREATION) + continue; if (UNIV_UNLIKELY(DICT_VIRTUAL & type) && !node->index->is_committed() && node->index->has_new_v_col()) continue; @@ -767,6 +769,11 @@ row_purge_upd_exist_or_extern_func( continue; } + if (node->index->online_status + > ONLINE_INDEX_CREATION) { + continue; + } + if (row_upd_changes_ord_field_binary(node->index, node->update, thr, NULL, NULL)) { /* Build the older version of the index entry */ diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index 83a38829f60..03b707e8a4e 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -674,7 +674,7 @@ static monitor_info_t innodb_counter_info[] = {"trx_rseg_history_len", "transaction", "Length of the TRX_RSEG_HISTORY list", static_cast( - MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_RSEG_HISTORY_LEN}, {"trx_undo_slots_used", "transaction", "Number of undo slots used", diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index cf31a2e6190..9150916fabc 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1409,7 +1409,7 @@ void srv_master_callback(void*) } /** @return whether purge should exit due to shutdown */ -static bool srv_purge_should_exit() +static bool srv_purge_should_exit(size_t old_history_size) { ut_ad(srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP); @@ -1420,8 +1420,12 @@ static bool srv_purge_should_exit() return true; /* Slow shutdown was requested. */ + size_t prepared, active= trx_sys.any_active_transactions(&prepared); const size_t history_size= trx_sys.history_size(); - if (history_size) + + if (!history_size); + else if (!active && history_size == old_history_size && prepared); + else { static time_t progress_time; time_t now= time(NULL); @@ -1438,7 +1442,7 @@ static bool srv_purge_should_exit() return false; } - return !trx_sys.any_active_transactions(); + return !active; } /*********************************************************************//** @@ -1581,7 +1585,7 @@ fewer_threads: break; } - if (!srv_purge_should_exit()) + if (!srv_purge_should_exit(history_size)) goto loop; } @@ -1777,15 +1781,19 @@ ulint srv_get_task_queue_length() /** Shut down the purge threads. 
*/ void srv_purge_shutdown() { - if (purge_sys.enabled()) { - if (!srv_fast_shutdown && !opt_bootstrap) - srv_update_purge_thread_count(innodb_purge_threads_MAX); - while(!srv_purge_should_exit()) { - ut_a(!purge_sys.paused()); - srv_wake_purge_thread_if_not_active(); - purge_coordinator_task.wait(); - } - purge_sys.coordinator_shutdown(); - srv_shutdown_purge_tasks(); - } + if (purge_sys.enabled()) + { + if (!srv_fast_shutdown && !opt_bootstrap) + srv_update_purge_thread_count(innodb_purge_threads_MAX); + size_t history_size= trx_sys.history_size(); + while (!srv_purge_should_exit(history_size)) + { + history_size= trx_sys.history_size(); + ut_a(!purge_sys.paused()); + srv_wake_purge_thread_if_not_active(); + purge_coordinator_task.wait(); + } + purge_sys.coordinator_shutdown(); + srv_shutdown_purge_tasks(); + } } diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 985b414041a..7d812fb97fa 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -246,142 +246,122 @@ Remove the undo log segment from the rseg slot if it is too big for reuse. void trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) { - DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")", - trx->id, trx_id_t{trx->rw_trx_hash_element->no})); - ut_ad(undo == trx->rsegs.m_redo.undo); - trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; - ut_ad(undo->rseg == rseg); - buf_block_t* rseg_header = rseg->get(mtr, nullptr); - /* We are in transaction commit; we cannot return an error. If the - database is corrupted, it is better to crash it than to - intentionally violate ACID by committing something that is known to - be corrupted. */ - ut_ad(rseg_header); - buf_block_t* undo_page = trx_undo_set_state_at_finish( - undo, mtr); - trx_ulogf_t* undo_header = undo_page->page.frame - + undo->hdr_offset; + DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")", + trx->id, trx_id_t{trx->rw_trx_hash_element->no})); + ut_ad(undo->id < TRX_RSEG_N_SLOTS); + ut_ad(undo == trx->rsegs.m_redo.undo); + trx_rseg_t *rseg= trx->rsegs.m_redo.rseg; + ut_ad(undo->rseg == rseg); + buf_block_t *rseg_header= rseg->get(mtr, nullptr); + /* We are in transaction commit; we cannot return an error. If the + database is corrupted, it is better to crash it than to + intentionally violate ACID by committing something that is known to + be corrupted. */ + ut_ad(rseg_header); + buf_block_t *undo_page= + buf_page_get(page_id_t(rseg->space->id, undo->hdr_page_no), 0, + RW_X_LATCH, mtr); + /* This function is invoked during transaction commit, which is not + allowed to fail. If we get a corrupted undo header, we will crash here. */ + ut_a(undo_page); + trx_ulogf_t *undo_header= undo_page->page.frame + undo->hdr_offset; - ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1); - ut_ad(rseg->needs_purge > trx->id); + ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1); + ut_ad(rseg->needs_purge > trx->id); - if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT - + rseg_header->page.frame))) { - /* This database must have been upgraded from - before MariaDB 10.3.5. 
*/ - trx_rseg_format_upgrade(rseg_header, mtr); - } + if (rseg->last_page_no == FIL_NULL) + { + rseg->last_page_no= undo->hdr_page_no; + rseg->set_last_commit(undo->hdr_offset, trx->rw_trx_hash_element->no); + } - if (undo->state != TRX_UNDO_CACHED) { - /* The undo log segment will not be reused */ - ut_a(undo->id < TRX_RSEG_N_SLOTS); - static_assert(FIL_NULL == 0xffffffff, ""); - mtr->memset(rseg_header, - TRX_RSEG + TRX_RSEG_UNDO_SLOTS - + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff); + rseg->history_size++; - uint32_t hist_size = mach_read_from_4( - TRX_RSEG_HISTORY_SIZE + TRX_RSEG - + rseg_header->page.frame); + if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + + rseg_header->page.frame))) + /* This database must have been upgraded from before MariaDB 10.3.5. */ + trx_rseg_format_upgrade(rseg_header, mtr); - ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR - + TRX_UNDO_PAGE_LIST - + undo_page->page.frame)); + uint16_t undo_state; - mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE - + rseg_header->page.frame, - hist_size + undo->size); - mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID - + rseg_header->page.frame, - trx_sys.get_max_trx_id()); - } + if (undo->size == 1 && + TRX_UNDO_PAGE_REUSE_LIMIT > + mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_page->page.frame)) + { + undo->state= undo_state= TRX_UNDO_CACHED; + UT_LIST_ADD_FIRST(rseg->undo_cached, undo); + } + else + { + ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + + undo_page->page.frame)); + /* The undo log segment will not be reused */ + static_assert(FIL_NULL == 0xffffffff, ""); + mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_UNDO_SLOTS + + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff); + uint32_t hist_size= mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG + + rseg_header->page.frame); + mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE + + rseg_header->page.frame, hist_size + undo->size); + mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + + rseg_header->page.frame, trx_sys.get_max_trx_id()); + ut_free(undo); + undo_state= TRX_UNDO_TO_PURGE; + } - /* After the purge thread has been given permission to exit, - we may roll back transactions (trx->undo_no==0) - in THD::cleanup() invoked from unlink_thd() in fast shutdown, - or in trx_rollback_recovered() in slow shutdown. + undo= nullptr; - Before any transaction-generating background threads or the - purge have been started, we can - start transactions in row_merge_drop_temp_indexes(), - and roll back recovered transactions. + /* After the purge thread has been given permission to exit, + we may roll back transactions (trx->undo_no==0) + in THD::cleanup() invoked from unlink_thd() in fast shutdown, + or in trx_rollback_recovered() in slow shutdown. - Arbitrary user transactions may be executed when all the undo log - related background processes (including purge) are disabled due to - innodb_force_recovery=2 or innodb_force_recovery=3. - DROP TABLE may be executed at any innodb_force_recovery level. + Before any transaction-generating background threads or the purge + have been started, we can start transactions in + row_merge_drop_temp_indexes(), and roll back recovered transactions. - During fast shutdown, we may also continue to execute - user transactions. 
*/ - ut_ad(srv_undo_sources - || trx->undo_no == 0 - || (!purge_sys.enabled() - && (srv_is_being_started - || trx_rollback_is_active - || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND)) - || srv_fast_shutdown); + Arbitrary user transactions may be executed when all the undo log + related background processes (including purge) are disabled due to + innodb_force_recovery=2 or innodb_force_recovery=3. DROP TABLE may + be executed at any innodb_force_recovery level. -#ifdef WITH_WSREP - if (wsrep_is_wsrep_xid(&trx->xid)) { - trx_rseg_update_wsrep_checkpoint(rseg_header, &trx->xid, mtr); - } + During fast shutdown, we may also continue to execute user + transactions. */ + ut_ad(srv_undo_sources || trx->undo_no == 0 || + (!purge_sys.enabled() && + (srv_is_being_started || + trx_rollback_is_active || + srv_force_recovery >= SRV_FORCE_NO_BACKGROUND)) || + srv_fast_shutdown); + +#ifdef WITH_WSREP + if (wsrep_is_wsrep_xid(&trx->xid)) + trx_rseg_update_wsrep_checkpoint(rseg_header, &trx->xid, mtr); #endif - if (trx->mysql_log_file_name && *trx->mysql_log_file_name) { - /* Update the latest MySQL binlog name and offset info - in rollback segment header if MySQL binlogging is on - or the database server is a MySQL replication save. */ - trx_rseg_update_binlog_offset( - rseg_header, trx->mysql_log_file_name, - trx->mysql_log_offset, mtr); - } + if (trx->mysql_log_file_name && *trx->mysql_log_file_name) + /* Update the latest binlog name and offset if log_bin=ON or this + is a replica. */ + trx_rseg_update_binlog_offset(rseg_header, trx->mysql_log_file_name, + trx->mysql_log_offset, mtr); - /* Add the log as the first in the history list */ + /* Add the log as the first in the history list */ - /* We are in transaction commit; we cannot return an error - when detecting corruption. It is better to crash the server - than to intentionally violate ACID by committing something - that is known to be corrupted. */ - ut_a(flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page, - static_cast(undo->hdr_offset - + TRX_UNDO_HISTORY_NODE), - mtr) == DB_SUCCESS); + /* We are in transaction commit; we cannot return an error + when detecting corruption. It is better to crash the server + than to intentionally violate ACID by committing something + that is known to be corrupted. */ + ut_a(flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page, + uint16_t(page_offset(undo_header) + + TRX_UNDO_HISTORY_NODE), mtr) == DB_SUCCESS); - mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, - undo_header + TRX_UNDO_TRX_NO, - trx->rw_trx_hash_element->no); - mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header - + TRX_UNDO_NEEDS_PURGE, 1U); - - if (rseg->last_page_no == FIL_NULL) { - rseg->last_page_no = undo->hdr_page_no; - rseg->set_last_commit(undo->hdr_offset, - trx->rw_trx_hash_element->no); - } - - rseg->history_size++; - - if (undo->state == TRX_UNDO_CACHED) { - UT_LIST_ADD_FIRST(rseg->undo_cached, undo); - } else { - ut_ad(undo->state == TRX_UNDO_TO_PURGE); - ut_free(undo); - } - - undo = NULL; -} - -MY_ATTRIBUTE((nonnull, warn_unused_result)) -/** Remove undo log header from the history list. 
-@param[in,out] rseg rollback segment header page -@param[in] log undo log segment header page -@param[in] offset byte offset in the undo log segment header page -@param[in,out] mtr mini-transaction */ -static dberr_t trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log, - uint16_t offset, mtr_t *mtr) -{ - return flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY, log, - uint16_t(offset + TRX_UNDO_HISTORY_NODE), mtr); + mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + + undo_page->page.frame, undo_state); + mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, undo_header + TRX_UNDO_TRX_NO, + trx->rw_trx_hash_element->no); + mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header + + TRX_UNDO_NEEDS_PURGE, 1U); } /** Free an undo log segment. @@ -393,7 +373,7 @@ static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr) block->page.frame, &mtr)) { block->fix(); - const page_id_t id{block->page.id()}; + ut_d(const page_id_t id{block->page.id()}); mtr.commit(); /* NOTE: If the server is killed after the log that was produced up to this point was written, and before the log from the mtr.commit() @@ -405,16 +385,8 @@ static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr) log_free_check(); mtr.start(); block->page.lock.x_lock(); - if (UNIV_UNLIKELY(block->page.id() != id)) - { - block->unfix(); - block->page.lock.x_unlock(); - block= buf_page_get_gen(id, 0, RW_X_LATCH, nullptr, BUF_GET, &mtr); - if (!block) - return; - } - else - mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY); + ut_ad(block->page.id() == id); + mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY); } while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + @@ -422,12 +394,13 @@ static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr) } /** Remove unnecessary history data from a rollback segment. -@param[in,out] rseg rollback segment -@param[in] limit truncate anything before this +@param rseg rollback segment +@param limit truncate anything before this +@param all whether everything can be truncated @return error code */ static dberr_t -trx_purge_truncate_rseg_history(trx_rseg_t& rseg, - const purge_sys_t::iterator& limit) +trx_purge_truncate_rseg_history(trx_rseg_t &rseg, + const purge_sys_t::iterator &limit, bool all) { fil_addr_t hdr_addr; mtr_t mtr; @@ -436,7 +409,6 @@ trx_purge_truncate_rseg_history(trx_rseg_t& rseg, mtr.start(); dberr_t err; -reget: buf_block_t *rseg_hdr= rseg.get(&mtr, &err); if (!rseg_hdr) { @@ -471,23 +443,24 @@ loop: goto func_exit; } + if (!all) + goto func_exit; + fil_addr_t prev_hdr_addr= flst_get_prev_addr(b->page.frame + hdr_addr.boffset + TRX_UNDO_HISTORY_NODE); prev_hdr_addr.boffset= static_cast(prev_hdr_addr.boffset - TRX_UNDO_HISTORY_NODE); - err= trx_purge_remove_log_hdr(rseg_hdr, b, hdr_addr.boffset, &mtr); + + err= flst_remove(rseg_hdr, TRX_RSEG + TRX_RSEG_HISTORY, b, + uint16_t(hdr_addr.boffset + TRX_UNDO_HISTORY_NODE), &mtr); if (UNIV_UNLIKELY(err != DB_SUCCESS)) goto func_exit; rseg_hdr->fix(); - if (mach_read_from_2(b->page.frame + hdr_addr.boffset + TRX_UNDO_NEXT_LOG) || - rseg.is_referenced() || - rseg.needs_purge > (purge_sys.head.trx_no - ? purge_sys.head.trx_no - : purge_sys.tail.trx_no)) - /* We cannot free the entire undo page. */; + if (mach_read_from_2(b->page.frame + hdr_addr.boffset + TRX_UNDO_NEXT_LOG)) + /* We cannot free the entire undo log segment. 
*/; else { const uint32_t seg_size= @@ -512,9 +485,9 @@ loop: if (undo->hdr_page_no == hdr_addr.page) goto found_cached; ut_ad("inconsistent undo logs" == 0); - break; - found_cached: - UT_LIST_REMOVE(rseg.undo_cached, undo); + if (false) + found_cached: + UT_LIST_REMOVE(rseg.undo_cached, undo); static_assert(FIL_NULL == 0xffffffff, ""); if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->page.frame))) @@ -537,12 +510,7 @@ loop: log_free_check(); mtr.start(); rseg_hdr->page.lock.x_lock(); - if (UNIV_UNLIKELY(rseg_hdr->page.id() != rseg.page_id())) - { - rseg_hdr->unfix(); - rseg_hdr->page.lock.x_unlock(); - goto reget; - } + ut_ad(rseg_hdr->page.id() == rseg.page_id()); mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_MODIFY); goto loop; @@ -615,7 +583,10 @@ TRANSACTIONAL_TARGET static void trx_purge_truncate_history() { ut_ad(rseg.is_persistent()); rseg.latch.wr_lock(SRW_LOCK_CALL); - if (dberr_t e= trx_purge_truncate_rseg_history(rseg, head)) + if (dberr_t e= + trx_purge_truncate_rseg_history(rseg, head, + !rseg.is_referenced() && + rseg.needs_purge <= head.trx_no)) err= e; rseg.latch.wr_unlock(); } @@ -694,7 +665,8 @@ not_free: } ut_ad(rseg.curr_size > cached); - if (rseg.curr_size > cached + 1) + if (rseg.curr_size > cached + 1 && + (rseg.history_size || srv_fast_shutdown || srv_undo_sources)) goto not_free; rseg.latch.rd_unlock(); @@ -708,6 +680,7 @@ not_free: mtr_t mtr; mtr.start(); mtr.x_lock_space(&space); + const auto space_id= space.id; /* Lock all modified pages of the tablespace. @@ -717,8 +690,8 @@ not_free: mini-transaction commit and the server was killed, then discarding the to-be-trimmed pages without flushing would break crash recovery. */ + rescan: mysql_mutex_lock(&buf_pool.flush_list_mutex); - for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; ) { ut_ad(bpage->oldest_modification()); @@ -726,46 +699,47 @@ not_free: buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - if (bpage->id().space() == space.id && - bpage->oldest_modification() != 1) + if (bpage->oldest_modification() > 2 && bpage->id().space() == space_id) { ut_ad(bpage->frame); - auto block= reinterpret_cast(bpage); - if (!bpage->lock.x_lock_try()) + bpage->fix(); { - rescan: - /* Let buf_pool_t::release_freed_page() proceed. */ + /* Try to acquire an exclusive latch while the cache line is + fresh after fix(). */ + const bool got_lock{bpage->lock.x_lock_try()}; + buf_pool.flush_hp.set(prev); mysql_mutex_unlock(&buf_pool.flush_list_mutex); - mysql_mutex_lock(&buf_pool.mutex); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - mysql_mutex_unlock(&buf_pool.mutex); - bpage= UT_LIST_GET_LAST(buf_pool.flush_list); - continue; + if (!got_lock) + bpage->lock.x_lock(); } - buf_pool.flush_hp.set(prev); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); #ifdef BTR_CUR_HASH_ADAPT - ut_ad(!block->index); /* There is no AHI on undo tablespaces. */ + /* There is no AHI on undo tablespaces. 
*/
+      ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
 #endif
-        bpage->fix();
         ut_ad(!bpage->is_io_fixed());
-        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      ut_ad(bpage->id().space() == space_id);
-        if (bpage->oldest_modification() > 1)
+      if (bpage->oldest_modification() > 2)
        {
+        mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage),
+                      MTR_MEMO_PAGE_X_FIX);
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        ut_ad(bpage->oldest_modification() > 2);
          bpage->reset_oldest_modification();
-          mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
        }
        else
        {
          bpage->unfix();
          bpage->lock.x_unlock();
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
        }
        if (prev != buf_pool.flush_hp.get())
-          /* Rescan, because we may have lost the position. */
+        {
+          mysql_mutex_unlock(&buf_pool.flush_list_mutex);
          goto rescan;
+        }
      }
      bpage= prev;
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
index 374a9d724bc..319ba99afa0 100644
--- a/storage/innobase/trx/trx0sys.cc
+++ b/storage/innobase/trx/trx0sys.cc
@@ -342,15 +342,29 @@ trx_sys_t::close()
 }
 /** @return total number of active (non-prepared) transactions */
-ulint trx_sys_t::any_active_transactions()
+size_t trx_sys_t::any_active_transactions(size_t *prepared)
 {
-  uint32_t total_trx= 0;
+  size_t total_trx= 0, prepared_trx= 0;

-  trx_sys.trx_list.for_each([&total_trx](const trx_t &trx) {
-    if (trx.state == TRX_STATE_COMMITTED_IN_MEMORY ||
-        (trx.state == TRX_STATE_ACTIVE && trx.id))
+  trx_sys.trx_list.for_each([&](const trx_t &trx) {
+    switch (trx.state) {
+    case TRX_STATE_NOT_STARTED:
+      break;
+    case TRX_STATE_ACTIVE:
+      if (!trx.id)
+        break;
+      /* fall through */
+    case TRX_STATE_COMMITTED_IN_MEMORY:
       total_trx++;
+      break;
+    case TRX_STATE_PREPARED:
+    case TRX_STATE_PREPARED_RECOVERED:
+      prepared_trx++;
+    }
   });
+  if (prepared)
+    *prepared= prepared_trx;
+
   return total_trx;
 }
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index 20434d9fb9c..4811d2380aa 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -1463,37 +1463,6 @@ template buf_block_t*
 trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo, mtr_t *mtr, dberr_t *err);
-/******************************************************************//**
-Sets the state of the undo log segment at a transaction finish.
-@return undo log segment header page, x-latched */
-buf_block_t*
-trx_undo_set_state_at_finish(
-/*=========================*/
-	trx_undo_t*	undo,	/*!< in: undo log memory copy */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-  ut_ad(undo->id < TRX_RSEG_N_SLOTS);
-  ut_ad(undo->rseg->is_persistent());
-
-  buf_block_t *block=
-    buf_page_get(page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0,
-                 RW_X_LATCH, mtr);
-  /* This function is invoked during transaction commit, which is not
-  allowed to fail. If we get a corrupted undo header, we will crash here. */
-  ut_a(block);
-  const uint16_t state = undo->size == 1 &&
-    TRX_UNDO_PAGE_REUSE_LIMIT >
-    mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
-                     block->page.frame)
-    ? TRX_UNDO_CACHED
-    : TRX_UNDO_TO_PURGE;
-
-  undo->state= state;
-  mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->page.frame,
-                state);
-  return block;
-}
-
 /** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
@param[in,out] trx transaction @param[in,out] undo undo log diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c index b80c2b69f16..8b82a71ff7c 100644 --- a/storage/myisam/mi_open.c +++ b/storage/myisam/mi_open.c @@ -518,6 +518,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) share->kfile=kfile; share->this_process=(ulong) getpid(); share->last_process= share->state.process; + share->base.base_key_parts= base_key_parts; share->base.key_parts=key_parts; share->base.all_key_parts=key_parts+unique_key_parts; if (!(share->last_version=share->state.version)) diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h index c90d989c975..f84ad6fa184 100644 --- a/storage/myisam/myisamdef.h +++ b/storage/myisam/myisamdef.h @@ -132,7 +132,7 @@ typedef struct st_mi_base_info uint extra_alloc_bytes; uint extra_alloc_procent; /* The following are from the header */ - uint key_parts, all_key_parts; + uint key_parts, all_key_parts, base_key_parts; } MI_BASE_INFO; diff --git a/storage/myisammrg/myrg_open.c b/storage/myisammrg/myrg_open.c index d9ea4b754f2..4a983684394 100644 --- a/storage/myisammrg/myrg_open.c +++ b/storage/myisammrg/myrg_open.c @@ -432,17 +432,20 @@ int myrg_attach_children(MYRG_INFO *m_info, int handle_locking, first_child= FALSE; m_info->reclength= myisam->s->base.reclength; min_keys= myisam->s->base.keys; - key_parts= myisam->s->base.key_parts; + key_parts= myisam->s->base.base_key_parts; if (*need_compat_check && m_info->rec_per_key_part) { my_free(m_info->rec_per_key_part); m_info->rec_per_key_part= NULL; } - if (!m_info->rec_per_key_part) + if (!m_info->rec_per_key_part || m_info->key_parts != key_parts) { - if(!(m_info->rec_per_key_part= (ulong*) - my_malloc(rg_key_memory_MYRG_INFO, - key_parts * sizeof(long), MYF(MY_WME)))) + m_info->key_parts= key_parts; + /* The +1 is because by my_realloc() don't allow zero length */ + if (!(m_info->rec_per_key_part= (ulong*) + my_realloc(rg_key_memory_MYRG_INFO, m_info->rec_per_key_part, + key_parts * sizeof(long) +1, + MYF(MY_WME | MY_ALLOW_ZERO_PTR | MY_FREE_ON_ERROR)))) goto err; /* purecov: inspected */ errpos= 1; } @@ -457,7 +460,8 @@ int myrg_attach_children(MYRG_INFO *m_info, int handle_locking, myisam->open_flag|= HA_OPEN_MERGE_TABLE; /* Check table definition match. 
*/ - if (m_info->reclength != myisam->s->base.reclength) + if (m_info->reclength != myisam->s->base.reclength || + key_parts != myisam->s->base.base_key_parts) { DBUG_PRINT("error", ("definition mismatch table: '%s' repair: %d", myisam->filename, diff --git a/storage/spider/mysql-test/spider/bugfix/t/mdev_30370.test b/storage/spider/mysql-test/spider/bugfix/t/mdev_30370.test index 788ea2323f7..99e56ab062a 100644 --- a/storage/spider/mysql-test/spider/bugfix/t/mdev_30370.test +++ b/storage/spider/mysql-test/spider/bugfix/t/mdev_30370.test @@ -2,4 +2,7 @@ --echo # MDEV-30370 mariadbd hangs when running with --wsrep-recover and --plugin-load-add=ha_spider.so --echo # ---exec $MYSQLD_BOOTSTRAP_CMD --wsrep-recover --plugin-load-add=ha_spider.so +let $MYSQLD_DATADIR=$MYSQLTEST_VARDIR/mdev_30370; +--mkdir $MYSQLD_DATADIR +--exec $MYSQLD_BOOTSTRAP_CMD --wsrep-recover --plugin-load-add=ha_spider.so --datadir=$MYSQLD_DATADIR +--rmdir $MYSQLD_DATADIR diff --git a/strings/json_lib.c b/strings/json_lib.c index 903dec978b4..920fb1d4a89 100644 --- a/strings/json_lib.c +++ b/strings/json_lib.c @@ -1324,7 +1324,7 @@ int json_skip_key(json_engine_t *j) } -#define SKIPPED_STEP_MARK ((int) ~0) +#define SKIPPED_STEP_MARK INT_MAX32 /* Current step of the patch matches the JSON construction. diff --git a/wsrep-lib b/wsrep-lib index 4951c383577..e238c0d240c 160000 --- a/wsrep-lib +++ b/wsrep-lib @@ -1 +1 @@ -Subproject commit 4951c38357737d568b554402bc5b6abe88a38fe1 +Subproject commit e238c0d240c2557229b0523a4a032f3cf8b41639