diff --git a/mysql-test/include/maria_empty_logs.inc b/mysql-test/include/maria_empty_logs.inc new file mode 100644 index 00000000000..7613cea2a52 --- /dev/null +++ b/mysql-test/include/maria_empty_logs.inc @@ -0,0 +1,33 @@ +# Maria help script. +# Cleans up all logs to give recovery a fresh start. + +# API: none, just uses vardir, port and socket. + +connection admin; + +-- echo * shut down mysqld, removed logs, restarted it +append_file $MYSQLTEST_VARDIR/tmp/master0.expect; +wait-maria_empty_logs.inc +EOF + +--exec $MYSQLADMIN --no-defaults -S $MASTER_MYSOCK -P $MASTER_MYPORT -u root --password= shutdown 2>&1; + +remove_file $MYSQLTEST_VARDIR/master-data/maria_log_control; +remove_file $MYSQLTEST_VARDIR/master-data/maria_log.00000001; +-- error 0,1 # maybe there is just one log +remove_file $MYSQLTEST_VARDIR/master-data/maria_log.00000002; +# Hope there were not more than these logs. + +-- error 0,1 +remove_file $MYSQLTEST_VARDIR/master-data/maria_recovery.trace; + +append_file $MYSQLTEST_VARDIR/tmp/master0.expect; +restart-maria_empty_logs.inc +EOF + +--source include/wait_until_connected_again.inc + +connection default; +# the effect of "use" is lost after a restart so we are back into db "test", +# because connection 'default' was created with db "test". +use mysqltest; diff --git a/mysql-test/include/maria_make_snapshot.inc b/mysql-test/include/maria_make_snapshot.inc new file mode 100644 index 00000000000..0096aca2377 --- /dev/null +++ b/mysql-test/include/maria_make_snapshot.inc @@ -0,0 +1,58 @@ +# Maria helper script +# Copies table' data and index file to other directory, or back, or compares. +# The other directory looks like a database directory, so that we can +# read copies from inside mysqld, that's also why we copy the frm. + +# "mms" is a namespace for Maria_Make_Snapshot + +# API: +# 1) set one of +# $mms_copy : to copy table from database to spare directory +# $mms_reverse : to copy it back +# $mms_compare : to compare both +# 2) set $mms_table_to_use to a number N: table will be mysqltest.tN +# 3) set $mms_purpose to say what this copy is for (influences the naming +# of the spare directory). + +if (!$mms_copy) +{ + if (!$mms_reverse_copy) + { + if (!$mms_compare) + { + --die misuse of maria_make_snapshot.inc: no command + } + } +} + +if ($mms_copy) +{ + --echo * copied t$mms_table_to_use for $mms_purpose + copy_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAD; + copy_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAI; + copy_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.frm $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.frm; +} + +if ($mms_reverse_copy) +{ + # do not call this without flushing target table first! + --echo * copied t$mms_table_to_use back for $mms_purpose + -- error 0,1 + remove_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD; + copy_file $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAD $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD; + -- error 0,1 + remove_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI; + copy_file $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAI $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI; +} + +if ($mms_compare) +{ + # this was meant to do a physical file compare (diff_files) + # but after the UNDO phase this is normally impossible + # (UNDO execution has created new log records => pages have new LSNs). + # So for now it does nothing. +# --echo * compared t$mms_table_to_use +# diff_files $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAD; +# index file not yet recovered +# diff_files $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAI; +} diff --git a/mysql-test/include/maria_make_snapshot_for_comparison.inc b/mysql-test/include/maria_make_snapshot_for_comparison.inc new file mode 100644 index 00000000000..71b821b5212 --- /dev/null +++ b/mysql-test/include/maria_make_snapshot_for_comparison.inc @@ -0,0 +1,30 @@ +# Maria helper script +# Copies clean tables' data and index file to other directory +# Tables are t1...t[$mms_tables] +# They are later used as a reference to see if recovery works. + +# API: +# set $mms_tables to N, the script will cover tables mysqltest.t1,...tN + +connection admin; + +let $mms_table_to_use=$mms_tables; +let $mms_purpose=comparison; +let $mms_copy=1; + +--disable_query_log +--disable_warnings +eval drop database if exists mysqltest_for_$mms_purpose; +--enable_warnings +eval create database mysqltest_for_$mms_purpose; +--enable_query_log + +while ($mms_table_to_use) +{ + # to serve as a reference, table must be in a clean state + eval flush table t$mms_table_to_use; + -- source include/maria_make_snapshot.inc + dec $mms_table_to_use; +} +let $mms_copy=0; +connection default; diff --git a/mysql-test/include/maria_make_snapshot_for_feeding_recovery.inc b/mysql-test/include/maria_make_snapshot_for_feeding_recovery.inc new file mode 100644 index 00000000000..2ca6fb07808 --- /dev/null +++ b/mysql-test/include/maria_make_snapshot_for_feeding_recovery.inc @@ -0,0 +1,35 @@ +# Maria helper script +# Copies tables' data and index file to other directory, and control file. +# Tables are t1...t[$mms_tables]. +# Later, mysqld is shutdown, and that snapshot is put back into the +# datadir, control file too ("flashing recovery's brain"), and recovery is let +# to run on it (see maria_verify_recovery.inc). + +# API: +# set $mms_tables to N, the script will cover tables mysqltest.t1,...tN + +connection admin; + +let $mms_table_to_use=$mms_tables; +let $mms_purpose=feeding_recovery; +let $mms_copy=1; + +--disable_query_log +--disable_warnings +eval drop database if exists mysqltest_for_$mms_purpose; +--enable_warnings +eval create database mysqltest_for_$mms_purpose; +--enable_query_log + +while ($mms_table_to_use) +{ + -- source include/maria_make_snapshot.inc + dec $mms_table_to_use; +} +let $mms_copy=0; + +-- error 0,1 +remove_file $MYSQLTEST_VARDIR/tmp/mms_for_$mms_purpose.maria_log_control; +copy_file $MYSQLTEST_VARDIR/master-data/maria_log_control $MYSQLTEST_VARDIR/tmp/mms_for_$mms_purpose.maria_log_control; + +connection default; diff --git a/mysql-test/include/maria_verify_recovery.inc b/mysql-test/include/maria_verify_recovery.inc new file mode 100644 index 00000000000..51d7500800e --- /dev/null +++ b/mysql-test/include/maria_verify_recovery.inc @@ -0,0 +1,87 @@ +# Maria helper script. +# Runs recovery, compare with expected table data. + +# API: +# 1) set $mms_tables to N, the script will cover tables mysqltest.t1,...tN +# 2) set $mvr_debug_option to the crash way +# 3) set $mvr_restore_old_snapshot to 1 if you want recovery to run on +# an old copy of tables and of the control file, 0 for normal recovery. + +# "mvr" is a namespace for Maria_Verify_Recovery + +connection admin; + +# warn mtr that mysqld is going to die and should not be restarted immediately +#append_file $MYSQLTEST_VARDIR/tmp/master0.expect; +#wait-maria_verify_recovery.inc +#EOF +# todo: remove this "system" and uncomment above when BUG#32296 is fixed +system echo wait-maria_verify_recovery.inc >> $MYSQLTEST_VARDIR/tmp/master0.expect; + +# flush page cache and log, only log, or nothing, and kill mysqld with +# abort(). +# When we restore an old snapshot, we could just kill mysqld nicely, +# but that would implicitely commit all work, which the tester may +# not want (tester may want to observe rollback happening). + +eval SET SESSION debug=$mvr_debug_option; +--echo * crashing mysqld intentionally +--error 2013 +set global maria_checkpoint_interval=1; # this will crash (DBUG magic) + +if ($mvr_restore_old_snapshot) +{ + + # copy snapshot made by maria_make_snapshot_for_feeding_recovery back + # into datadir. + + let $mms_table_to_use=$mms_tables; + let $mms_purpose=feeding_recovery; + let $mms_reverse_copy=1; + while ($mms_table_to_use) + { + -- source include/maria_make_snapshot.inc + dec $mms_table_to_use; + } + let $mms_reverse_copy=0; + + # also copy back control file, to force recovery to start from an early + # point, ignoring further checkpoints. + -- error 0,1 + remove_file $MYSQLTEST_VARDIR/master-data/maria_log_control; + copy_file $MYSQLTEST_VARDIR/tmp/mms_for_$mms_purpose.maria_log_control $MYSQLTEST_VARDIR/master-data/maria_log_control; +} + +--echo * recovery happens +# let mtr restart mysqld (and thus execute the maria log) +#append_file $MYSQLTEST_VARDIR/tmp/master0.expect; +#restart-maria_verify_recovery.inc +#EOF +system echo restart-maria_verify_recovery.inc >> $MYSQLTEST_VARDIR/tmp/master0.expect; + +--source include/wait_until_connected_again.inc + +# compare that tables of $mms_tables are identical to old. +let $mms_table_to_use=$mms_tables; +let $mms_purpose=comparison; +let $mms_compare=1; +while ($mms_table_to_use) +{ + # Todo: remove this REPAIR when we have index recovery working. + # It is a quick repair, so that it will fail if data file is corrupted. + --echo * rebuilding index (until we have recovery of index) + eval repair table t$mms_table_to_use quick; + --echo * testing that checksum after recovery is as expected + let $new_checksum=`CHECKSUM TABLE t$mms_table_to_use`; + let $old_checksum=`CHECKSUM TABLE mysqltest_for_$mms_purpose.t$mms_table_to_use`; + # the $ text variables above are of the form "db.tablename\tchecksum", + # as db differs, we use substring(). + eval select if(substring("$new_checksum",instr("$new_checksum",".t1")) = substring("$old_checksum",instr("$old_checksum",".t1")),"ok","failure"); + -- source include/maria_make_snapshot.inc + dec $mms_table_to_use; +} +let $mms_compare=0; + +connection default; +# the effect of "use" is lost after a restart so we are back into db "test" +use mysqltest; diff --git a/mysql-test/include/wait_until_connected_again.inc b/mysql-test/include/wait_until_connected_again.inc index dc96f646cb3..d803dd2f699 100644 --- a/mysql-test/include/wait_until_connected_again.inc +++ b/mysql-test/include/wait_until_connected_again.inc @@ -1,12 +1,14 @@ # # Include this script to wait until the connection to the -# server has been restored or timeout occurs +# server has been restored or timeout occurs. +# You should have done --enable_reconnect first --disable_result_log --disable_query_log let $counter= 500; +let $mysql_errno= 1; while ($mysql_errno) { - --error 0,2002,2006 + --error 0,2002,2003,2006 show status; dec $counter; diff --git a/mysql-test/lib/mtr_process.pl b/mysql-test/lib/mtr_process.pl index 8fd900330da..945a9af6d2a 100644 --- a/mysql-test/lib/mtr_process.pl +++ b/mysql-test/lib/mtr_process.pl @@ -886,15 +886,33 @@ sub check_expected_crash_and_restart($) mtr_verbose("$mysqld->{'type'} $mysqld->{'idx'} exited, pid: $ret_pid"); $mysqld->{'pid'}= 0; - # Check if crash expected and restart if it was + # Check if crash expected, and restart if it was my $expect_file= "$::opt_vardir/tmp/" . "$mysqld->{'type'}" . "$mysqld->{'idx'}" . ".expect"; - if ( -f $expect_file ) + while ( 1 ) { - mtr_verbose("Crash was expected, file $expect_file exists"); - mysqld_start($mysqld, $mysqld->{'start_opts'}, - $mysqld->{'start_slave_master_info'}); - unlink($expect_file); + if ( -f $expect_file ) + { + mtr_verbose("Crash was expected, file $expect_file exists"); + my $expect_file_handler; + open($expect_file_handler, "<$expect_file") or die; + my @expect_lines= <$expect_file_handler>; + close $expect_file_handler; + # look at most recent order by the test + my $expect_content= pop @expect_lines; + chomp $expect_content; + if ( $expect_content =~ /^wait/ ) + { + mtr_verbose("Test asks that we wait before restart"); + # Millisceond sleep emulated with select + select(undef, undef, undef, (0.1)); + next; + } + unlink($expect_file); + mysqld_start($mysqld, $mysqld->{'start_opts'}, + $mysqld->{'start_slave_master_info'}); + } + last; } return; @@ -914,8 +932,8 @@ sub check_expected_crash_and_restart($) if ( -f $expect_file ) { mtr_verbose("Crash was expected, file $expect_file exists"); - ndbmgmd_start($cluster); unlink($expect_file); + ndbmgmd_start($cluster); } return; } @@ -933,9 +951,9 @@ sub check_expected_crash_and_restart($) if ( -f $expect_file ) { mtr_verbose("Crash was expected, file $expect_file exists"); + unlink($expect_file); ndbd_start($cluster, $ndbd->{'idx'}, $ndbd->{'start_extra_args'}); - unlink($expect_file); } return; } diff --git a/mysql-test/r/maria-recovery.result b/mysql-test/r/maria-recovery.result new file mode 100644 index 00000000000..0749e9b0826 --- /dev/null +++ b/mysql-test/r/maria-recovery.result @@ -0,0 +1,168 @@ +drop database if exists mysqltest; +create database mysqltest; +use mysqltest; +* shut down mysqld, removed logs, restarted it +use mysqltest; +create table t1 (a varchar(1000)) engine=maria; +* TEST: see if recovery can reconstruct if we give it an old table +* copied t1 for feeding_recovery +insert into t1 values ("00000000"); +flush table t1; +* copied t1 for comparison +lock tables t1 write; +insert into t1 values ("aaaaaaaaa"); +SET SESSION debug="+d,maria_flush_whole_log,maria_crash"; +* crashing mysqld intentionally +set global maria_checkpoint_interval=1; +ERROR HY000: Lost connection to MySQL server during query +* copied t1 back for feeding_recovery +* recovery happens +* rebuilding index (until we have recovery of index) +repair table t1 quick; +Table Op Msg_type Msg_text +mysqltest.t1 repair status OK +* testing that checksum after recovery is as expected +select if(substring("mysqltest.t1 488070860",instr("mysqltest.t1 488070860",".t1")) = substring("mysqltest_for_comparison.t1 488070860",instr("mysqltest_for_comparison.t1 488070860",".t1")),"ok","failure"); +if(substring("mysqltest.t1 488070860",instr("mysqltest.t1 488070860",".t1")) = substring("mysqltest_for_comparison.t1 488070860",instr("mysqltest_for_comparison.t1 488070860",".t1")),"ok","failure") +ok +use mysqltest; +select * from t1; +a +00000000 +* TEST: normal recovery test (no moving tables under its feet) +insert into t1 values ("00000000"); +flush table t1; +* copied t1 for comparison +lock tables t1 write; +insert into t1 values ("aaaaaaaaa"); +SET SESSION debug="+d,maria_crash"; +* crashing mysqld intentionally +set global maria_checkpoint_interval=1; +ERROR HY000: Lost connection to MySQL server during query +* recovery happens +* rebuilding index (until we have recovery of index) +repair table t1 quick; +Table Op Msg_type Msg_text +mysqltest.t1 repair status OK +* testing that checksum after recovery is as expected +select if(substring("mysqltest.t1 976141720",instr("mysqltest.t1 976141720",".t1")) = substring("mysqltest_for_comparison.t1 976141720",instr("mysqltest_for_comparison.t1 976141720",".t1")),"ok","failure"); +if(substring("mysqltest.t1 976141720",instr("mysqltest.t1 976141720",".t1")) = substring("mysqltest_for_comparison.t1 976141720",instr("mysqltest_for_comparison.t1 976141720",".t1")),"ok","failure") +ok +use mysqltest; +select * from t1; +a +00000000 +00000000 +insert into t1 values ("00000000"); +flush table t1; +* copied t1 for comparison +lock tables t1 write; +insert into t1 values ("aaaaaaaaa"); +SET SESSION debug="+d,maria_flush_whole_page_cache,maria_crash"; +* crashing mysqld intentionally +set global maria_checkpoint_interval=1; +ERROR HY000: Lost connection to MySQL server during query +* recovery happens +* rebuilding index (until we have recovery of index) +repair table t1 quick; +Table Op Msg_type Msg_text +mysqltest.t1 repair status OK +* testing that checksum after recovery is as expected +select if(substring("mysqltest.t1 1464212580",instr("mysqltest.t1 1464212580",".t1")) = substring("mysqltest_for_comparison.t1 1464212580",instr("mysqltest_for_comparison.t1 1464212580",".t1")),"ok","failure"); +if(substring("mysqltest.t1 1464212580",instr("mysqltest.t1 1464212580",".t1")) = substring("mysqltest_for_comparison.t1 1464212580",instr("mysqltest_for_comparison.t1 1464212580",".t1")),"ok","failure") +ok +use mysqltest; +select * from t1; +a +00000000 +00000000 +00000000 +insert into t1 values ("00000000"); +flush table t1; +* copied t1 for comparison +lock tables t1 write; +insert into t1 values ("aaaaaaaaa"); +SET SESSION debug="+d,maria_flush_states,maria_flush_whole_log,maria_crash"; +* crashing mysqld intentionally +set global maria_checkpoint_interval=1; +ERROR HY000: Lost connection to MySQL server during query +* recovery happens +* rebuilding index (until we have recovery of index) +repair table t1 quick; +Table Op Msg_type Msg_text +mysqltest.t1 repair status OK +* testing that checksum after recovery is as expected +select if(substring("mysqltest.t1 1952283440",instr("mysqltest.t1 1952283440",".t1")) = substring("mysqltest_for_comparison.t1 1952283440",instr("mysqltest_for_comparison.t1 1952283440",".t1")),"ok","failure"); +if(substring("mysqltest.t1 1952283440",instr("mysqltest.t1 1952283440",".t1")) = substring("mysqltest_for_comparison.t1 1952283440",instr("mysqltest_for_comparison.t1 1952283440",".t1")),"ok","failure") +ok +use mysqltest; +select * from t1; +a +00000000 +00000000 +00000000 +00000000 +insert into t1 values ("00000000"); +flush table t1; +* copied t1 for comparison +lock tables t1 write; +insert into t1 values ("aaaaaaaaa"); +SET SESSION debug="+d,maria_flush_whole_log,maria_crash"; +* crashing mysqld intentionally +set global maria_checkpoint_interval=1; +ERROR HY000: Lost connection to MySQL server during query +* recovery happens +* rebuilding index (until we have recovery of index) +repair table t1 quick; +Table Op Msg_type Msg_text +mysqltest.t1 repair status OK +* testing that checksum after recovery is as expected +select if(substring("mysqltest.t1 2440354300",instr("mysqltest.t1 2440354300",".t1")) = substring("mysqltest_for_comparison.t1 2440354300",instr("mysqltest_for_comparison.t1 2440354300",".t1")),"ok","failure"); +if(substring("mysqltest.t1 2440354300",instr("mysqltest.t1 2440354300",".t1")) = substring("mysqltest_for_comparison.t1 2440354300",instr("mysqltest_for_comparison.t1 2440354300",".t1")),"ok","failure") +ok +use mysqltest; +select * from t1; +a +00000000 +00000000 +00000000 +00000000 +00000000 +drop table t1; +* shut down mysqld, removed logs, restarted it +use mysqltest; +CREATE TABLE t1 ( +i int, +b blob default NULL, +c varchar(6000) default NULL +) ENGINE=MARIA CHECKSUM=1; +* copied t1 for feeding_recovery +INSERT INTO t1 VALUES (1, REPEAT('a', 5000), REPEAT('b', 5000)); +UPDATE t1 SET i=3, b=CONCAT(b,'c') WHERE i=1; +SELECT LENGTH(b) FROM t1 WHERE i=3; +LENGTH(b) +5001 +flush table t1; +* copied t1 for comparison +SET SESSION debug="+d,maria_flush_whole_log,maria_crash"; +* crashing mysqld intentionally +set global maria_checkpoint_interval=1; +ERROR HY000: Lost connection to MySQL server during query +* copied t1 back for feeding_recovery +* recovery happens +* rebuilding index (until we have recovery of index) +repair table t1 quick; +Table Op Msg_type Msg_text +mysqltest.t1 repair status OK +* testing that checksum after recovery is as expected +select if(substring("mysqltest.t1 3472399915",instr("mysqltest.t1 3472399915",".t1")) = substring("mysqltest_for_comparison.t1 3472399915",instr("mysqltest_for_comparison.t1 3472399915",".t1")),"ok","failure"); +if(substring("mysqltest.t1 3472399915",instr("mysqltest.t1 3472399915",".t1")) = substring("mysqltest_for_comparison.t1 3472399915",instr("mysqltest_for_comparison.t1 3472399915",".t1")),"ok","failure") +ok +use mysqltest; +SELECT LENGTH(b) FROM t1 WHERE i=3; +LENGTH(b) +5001 +drop table t1; +drop database mysqltest_for_feeding_recovery; +drop database mysqltest_for_comparison; +drop database mysqltest; diff --git a/mysql-test/t/maria-recovery-master.opt b/mysql-test/t/maria-recovery-master.opt new file mode 100644 index 00000000000..a745693594e --- /dev/null +++ b/mysql-test/t/maria-recovery-master.opt @@ -0,0 +1,2 @@ +--skip-stack-trace --skip-core-file + diff --git a/mysql-test/t/maria-recovery.test b/mysql-test/t/maria-recovery.test new file mode 100644 index 00000000000..4b03c15592d --- /dev/null +++ b/mysql-test/t/maria-recovery.test @@ -0,0 +1,141 @@ +--source include/not_embedded.inc +# Don't test this under valgrind, memory leaks will occur as we crash +--source include/not_valgrind.inc +# Binary must be compiled with debug for crash to occur +--source include/have_debug.inc +--source include/have_maria.inc + +--disable_warnings +drop database if exists mysqltest; +--enable_warnings +create database mysqltest; + +# Include scripts can perform SQL. For it to not influence the main test +# they use a separate connection. This way if they use a DDL it would +# not autocommit in the main test. +connect (admin, 127.0.0.1, root,,mysqltest,,); +--enable_reconnect + +connection default; +use mysqltest; +--enable_reconnect + +# A sample test +-- source include/maria_empty_logs.inc +let $mms_tables=1; +create table t1 (a varchar(1000)) engine=maria; + +--echo * TEST: see if recovery can reconstruct if we give it an old table + +-- source include/maria_make_snapshot_for_feeding_recovery.inc +# Your committed statements here, which we expect to +# be reconstructed from the log +insert into t1 values ("00000000"); +-- source include/maria_make_snapshot_for_comparison.inc +# Your statements which should not commit and we expect to be rolled back +lock tables t1 write; +insert into t1 values ("aaaaaaaaa"); +# we want recovery to run on the first snapshot made above +let $mvr_restore_old_snapshot=1; +# the script below will trigger recovery and compare checksums +let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash"; +-- source include/maria_verify_recovery.inc +# so a SELECT like this is pure visual effect, brings nothing. +select * from t1; + +--echo * TEST: normal recovery test (no moving tables under its feet) + +# different types of crash => a loop; here are loop control variables +let $crash_no_flush=1; +let $crash_flush_whole_page_cache=0; +let $crash_flush_states=0; +let $crash_flush_whole_log=0; +let $crash_loop=1; + +# we want recovery to use the tables as they were at time of crash +let $mvr_restore_old_snapshot=0; + +# Note that we don't remove logs between iterations. Test is +# cumulative (each new recovery processes more log records than the previous). + +while ($crash_loop) +{ + if ($crash_flush_whole_log) + { + let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash"; + # set up what next iteration should do: + let $crash_flush_whole_log=0; + let $crash_loop=0; + } + if ($crash_flush_states) + { + let $mvr_debug_option="+d,maria_flush_states,maria_flush_whole_log,maria_crash"; + let $crash_flush_states=0; + let $crash_flush_whole_log=1; + } + if ($crash_flush_whole_page_cache) + { + let $mvr_debug_option="+d,maria_flush_whole_page_cache,maria_crash"; + let $crash_flush_whole_page_cache=0; + let $crash_flush_states=1; + } + if ($crash_no_flush) + { + let $mvr_debug_option="+d,maria_crash"; + let $crash_no_flush=0; + let $crash_flush_whole_page_cache=1; + } + # Your committed statements here + insert into t1 values ("00000000"); + -- source include/maria_make_snapshot_for_comparison.inc + # Your statements which we expect to be rolled back + lock tables t1 write; + insert into t1 values ("aaaaaaaaa"); + -- source include/maria_verify_recovery.inc + select * from t1; +} + +drop table t1; + +# what did we compare above: +# - checksum: tells that the tables contain the same amount of rows +# and same data in rows +# - index: no, neither state nor pages were compared +# - bitmap pages: the REPAIR QUICK done above very probably checks +# that bitmap reflects page occupation; do we need to do physical +# compare? +# - page LSN: not compared; we should compare that page's LSN in new +# table is >= page's LSN in old table (it can be >, due to UNDO phase) +# we had a bug where new page's LSN was 0... todo. + +# +# Test problem where an UPDATE purges and rewrites a tail page, and +# recovery applied the purge, stamped page with UNDO's LSN, thus +# the rewrite was ignored. +# + +-- source include/maria_empty_logs.inc +let $mms_tables=1; +CREATE TABLE t1 ( + i int, + b blob default NULL, + c varchar(6000) default NULL +) ENGINE=MARIA CHECKSUM=1; +-- source include/maria_make_snapshot_for_feeding_recovery.inc +INSERT INTO t1 VALUES (1, REPEAT('a', 5000), REPEAT('b', 5000)); +UPDATE t1 SET i=3, b=CONCAT(b,'c') WHERE i=1; +SELECT LENGTH(b) FROM t1 WHERE i=3; +-- source include/maria_make_snapshot_for_comparison.inc +# we want recovery to run on the first snapshot made above +let $mvr_restore_old_snapshot=1; +let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash"; +-- source include/maria_verify_recovery.inc +SELECT LENGTH(b) FROM t1 WHERE i=3; +drop table t1; + +# clean up everything +let $mms_purpose=feeding_recovery; +eval drop database mysqltest_for_$mms_purpose; +let $mms_purpose=comparison; +eval drop database mysqltest_for_$mms_purpose; +drop database mysqltest; diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c index 38032e4cdfc..6bd708e4738 100644 --- a/storage/maria/ma_blockrec.c +++ b/storage/maria/ma_blockrec.c @@ -773,11 +773,13 @@ void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn) DBUG_ENTER("_ma_unpin_all_pages"); DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn)); - /* True if not disk error */ - DBUG_ASSERT((undo_lsn != LSN_IMPOSSIBLE) || !info->s->now_transactional); - - if (!info->s->now_transactional) - undo_lsn= LSN_IMPOSSIBLE; /* don't try to set a LSN on pages */ + if (info->s->now_transactional) + DBUG_ASSERT(undo_lsn != LSN_IMPOSSIBLE); /* true except if disk error */ + else + { + /* In recovery, we stamp page though table has transactions disabled */ + DBUG_ASSERT((undo_lsn == LSN_IMPOSSIBLE) || maria_in_recovery); + } while (pinned_page-- != page_link) pagecache_unlock_by_link(info->s->pagecache, pinned_page->link, @@ -2380,7 +2382,7 @@ static my_bool write_block_record(MARIA_HA *info, } /* Write UNDO or CLR record */ - lsn= 0; + lsn= LSN_IMPOSSIBLE; if (share->now_transactional) { LEX_STRING *log_array= info->log_row_parts; @@ -2546,7 +2548,7 @@ disk_err: Unpin all pinned pages to not cause problems for disk cache. This is safe to call even if we already called _ma_unpin_all_pages() above. */ - _ma_unpin_all_pages_and_finalize_row(info, 0); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); DBUG_RETURN(1); } @@ -2879,7 +2881,7 @@ static my_bool _ma_update_block_record2(MARIA_HA *info, DBUG_RETURN(res); err: - _ma_unpin_all_pages_and_finalize_row(info, 0); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); DBUG_RETURN(1); } @@ -3141,6 +3143,7 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record) ulonglong page; uint record_number; MARIA_SHARE *share= info->s; + LSN lsn= LSN_IMPOSSIBLE; DBUG_ENTER("_ma_delete_block_record"); page= ma_recordpos_to_page(info->cur_row.lastpos); @@ -3157,7 +3160,6 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record) if (share->now_transactional) { - LSN lsn; uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + HA_CHECKSUM_STORE_SIZE]; size_t row_length; @@ -3195,11 +3197,11 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record) } - _ma_unpin_all_pages_and_finalize_row(info, info->trn->undo_lsn); + _ma_unpin_all_pages_and_finalize_row(info, lsn); DBUG_RETURN(0); err: - _ma_unpin_all_pages_and_finalize_row(info, 0); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); DBUG_RETURN(1); } @@ -4953,8 +4955,8 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, empty_space= (block_size - PAGE_OVERHEAD_SIZE); rec_offset= PAGE_HEADER_SIZE; dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE; - unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED; - unpin_method= PAGECACHE_PIN_LEFT_UNPINNED; + unlock_method= PAGECACHE_LOCK_WRITE; + unpin_method= PAGECACHE_PIN; } else { @@ -4976,8 +4978,8 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, DBUG_RETURN(my_errno); DBUG_RETURN(0); } - unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK; - unpin_method= PAGECACHE_UNPIN; + unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; + unpin_method= PAGECACHE_PIN_LEFT_PINNED; max_entry= (uint) ((uchar*) buff)[DIR_COUNT_OFFSET]; if (((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type)) @@ -5043,16 +5045,23 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, empty_space-= data_length; int2store(buff + EMPTY_SPACE_OFFSET, empty_space); - /* Write modified page */ - lsn_store(buff, lsn); + /* + Write modified page. We don't update its LSN, and keep it pinned. When we + have processed all REDOs for this page in the current REDO's group, we + will stamp page with UNDO's LSN (if we stamped it now, a next REDO, in + this group, for this page, would be skipped) and unpin then. + */ if (pagecache_write(share->pagecache, &info->dfile, page, 0, buff, PAGECACHE_PLAIN_PAGE, unlock_method, unpin_method, - PAGECACHE_WRITE_DELAY, 0, + PAGECACHE_WRITE_DELAY, &page_link.link, LSN_IMPOSSIBLE)) DBUG_RETURN(my_errno); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + push_dynamic(&info->pinned_pages, (void*) &page_link); + /* Fix bitmap */ if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) DBUG_RETURN(my_errno); @@ -5070,7 +5079,7 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, DBUG_RETURN(0); err: - if (unlock_method == PAGECACHE_LOCK_WRITE_UNLOCK) + if (unlock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED) pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, @@ -5151,19 +5160,23 @@ uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn, if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0) goto err; - lsn_store(buff, lsn); result= 0; if (pagecache_write(share->pagecache, &info->dfile, page, 0, buff, PAGECACHE_PLAIN_PAGE, - PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_PIN_LEFT_PINNED, PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE)) result= my_errno; - - /* This will work even if the page was marked as UNALLOCATED_PAGE */ - if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) - result= my_errno; + else + { + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + push_dynamic(&info->pinned_pages, (void*) &page_link); + /* This will work even if the page was marked as UNALLOCATED_PAGE */ + if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) + result= my_errno; + } DBUG_RETURN(result); @@ -5210,7 +5223,12 @@ uint _ma_apply_redo_purge_blocks(MARIA_HA *info, uint i; page= page_korr(header); header+= PAGE_STORE_SIZE; - page_range= pagerange_korr(header); + /* Page range may have this bit set to indicate a tail page */ + page_range= pagerange_korr(header) & ~TAIL_BIT; + /** @todo RECOVERY BUG enable this assertion when newer tree pulled */ +#if 0 + DBUG_ASSERT(page_range > 0); +#endif header+= PAGERANGE_STORE_SIZE; for (i= 0; i < page_range ; i++) @@ -5233,14 +5251,23 @@ uint _ma_apply_redo_purge_blocks(MARIA_HA *info, continue; } buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; - lsn_store(buff, lsn); + /** + @todo RECOVERY BUG + we need to distinguish between blob page (too big, can't pin, and + needn't pin because in a group there is a single REDO touching the + page) and head/tail. + When merged with newer code this should become possible. + */ if (pagecache_write(share->pagecache, &info->dfile, page+i, 0, buff, PAGECACHE_PLAIN_PAGE, - PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_PIN_LEFT_PINNED, PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE)) DBUG_RETURN(my_errno); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + push_dynamic(&info->pinned_pages, (void*) &page_link); } /** @todo leave bitmap lock to the bitmap code... */ pthread_mutex_lock(&share->bitmap.bitmap_lock); diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c index 058e86d8106..680dc73be35 100644 --- a/storage/maria/ma_checkpoint.c +++ b/storage/maria/ma_checkpoint.c @@ -346,6 +346,43 @@ int ma_checkpoint_init(ulong interval) } +#ifndef DBUG_OFF +/** + Function used to test recovery: flush some table pieces and then caller + crashes. + + @param what_to_flush 0: current bitmap and all data pages + 1: state +*/ +static void flush_all_tables(int what_to_flush) +{ + LIST *pos; /**< to iterate over open tables */ + pthread_mutex_lock(&THR_LOCK_maria); + for (pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + if (info->s->now_transactional) + { + switch (what_to_flush) + { + case 0: + _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP); + break; + case 1: + _ma_state_info_write(info->s, 1|4); + DBUG_PRINT("maria_flush_states", + ("is_of_horizon: LSN (%lu,0x%lx)", + LSN_IN_PARTS(info->s->state.is_of_horizon))); + break; + } + } + } + pthread_mutex_unlock(&THR_LOCK_maria); +} +#endif + + /** @brief Destroys the checkpoint module */ @@ -353,6 +390,32 @@ int ma_checkpoint_init(ulong interval) void ma_checkpoint_end(void) { DBUG_ENTER("ma_checkpoint_end"); + DBUG_EXECUTE_IF("maria_flush_whole_page_cache", + { + DBUG_PRINT("maria_flush_whole_page_cache", ("now")); + flush_all_tables(0); + }); + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + /* + Note that for WAL reasons, maria_flush_states requires + maria_flush_whole_log. + */ + DBUG_EXECUTE_IF("maria_flush_states", + { + DBUG_PRINT("maria_flush_states", ("now")); + flush_all_tables(1); + }); + DBUG_EXECUTE_IF("maria_crash", + { + DBUG_PRINT("maria_crash", ("now")); + fflush(DBUG_FILE); + abort(); + }); + if (checkpoint_inited) { pthread_mutex_lock(&LOCK_checkpoint); diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c index 39d2c1c7422..26acb682b75 100644 --- a/storage/maria/ma_open.c +++ b/storage/maria/ma_open.c @@ -1014,6 +1014,8 @@ uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite) is too new). Recovery does it by itself. */ share->state.is_of_horizon= translog_get_horizon(); + DBUG_PRINT("info", ("is_of_horizon set to LSN (%lu,0x%lx)", + LSN_IN_PARTS(share->state.is_of_horizon))); } res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite); if (pWrite & 4) diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c index a74d20a6b70..9db9aecd399 100755 --- a/storage/maria/ma_pagecache.c +++ b/storage/maria/ma_pagecache.c @@ -2504,7 +2504,12 @@ static void check_and_set_lsn(PAGECACHE *pagecache, { LSN old; DBUG_ENTER("check_and_set_lsn"); - DBUG_ASSERT(block->type == PAGECACHE_LSN_PAGE); + /* + In recovery, we can _ma_unpin_all_pages() to put a LSN on page, though + page would be PAGECACHE_PLAIN_PAGE (transactionality temporarily disabled + to not log REDOs). + */ + DBUG_ASSERT((block->type == PAGECACHE_LSN_PAGE) || maria_in_recovery); old= lsn_korr(block->buffer + PAGE_LSN_OFFSET); DBUG_PRINT("info", ("old lsn: (%lu, 0x%lx) new lsn: (%lu, 0x%lx)", LSN_IN_PARTS(old), LSN_IN_PARTS(lsn))); diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c index bc959138083..6f8f55b0784 100644 --- a/storage/maria/ma_recovery.c +++ b/storage/maria/ma_recovery.c @@ -48,6 +48,10 @@ static HASH all_dirty_pages; static struct st_dirty_page *dirty_pages_pool; static LSN current_group_end_lsn, checkpoint_start= LSN_IMPOSSIBLE; +#ifndef DBUG_OFF +/** Current group of REDOs is about this table and only this one */ +static MARIA_HA *current_group_table; +#endif static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */ static FILE *tracef; /**< trace file for debugging */ static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */ @@ -169,7 +173,7 @@ int maria_recover(void) maria_in_recovery= TRUE; #ifdef EXTRA_DEBUG - trace_file= fopen("maria_recovery.trace", "w"); + trace_file= fopen("maria_recovery.trace", "a+"); #else trace_file= NULL; /* no trace file for being fast */ #endif @@ -1040,8 +1044,12 @@ static int new_table(uint16 sid, const char *name, if (share->reopen != 1) { tprint(tracef, ", is already open (reopen=%u)\n", share->reopen); - ALERT_USER(); - goto end; + /* + It could be that we have in the log + FILE_ID(t1,10) ... (t1 was flushed) ... FILE_ID(t1,12); + */ + if (close_one_table(share->open_file_name, lsn_of_file_id)) + goto end; } DBUG_ASSERT(share->now_transactional == share->base.born_transactional); if (!share->base.born_transactional) @@ -1062,6 +1070,8 @@ static int new_table(uint16 sid, const char *name, } /* don't log any records for this work */ _ma_tmp_disable_logging_for_table(share); + /* _ma_unpin_all_pages() reads info->trn: */ + info->trn= &dummy_transaction_object; /* execution of some REDO records relies on data_file_length */ my_off_t dfile_len= my_seek(info->dfile.file, 0, SEEK_END, MYF(MY_WME)); my_off_t kfile_len= my_seek(info->s->kfile.file, 0, SEEK_END, MYF(MY_WME)); @@ -1316,7 +1326,8 @@ prototype_redo_exec_hook(UNDO_ROW_INSERT) set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) { - tprint(tracef, " state older than record, updating rows' count\n"); + tprint(tracef, " state has LSN (%lu,0x%lx) older than record, updating" + " rows' count\n", LSN_IN_PARTS(share->state.is_of_horizon)); share->state.state.records++; if (share->calc_checksum) { @@ -1339,6 +1350,8 @@ prototype_redo_exec_hook(UNDO_ROW_INSERT) STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; } tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); + /* Unpin all pages, stamp them with UNDO's LSN */ + _ma_unpin_all_pages(info, rec->lsn); return 0; } @@ -1371,6 +1384,7 @@ prototype_redo_exec_hook(UNDO_ROW_DELETE) STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; } tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records); + _ma_unpin_all_pages(info, rec->lsn); return 0; } @@ -1400,6 +1414,7 @@ prototype_redo_exec_hook(UNDO_ROW_UPDATE) share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; } + _ma_unpin_all_pages(info, rec->lsn); return 0; } @@ -1488,6 +1503,7 @@ prototype_redo_exec_hook(CLR_END) STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; } tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records); + _ma_unpin_all_pages(info, rec->lsn); return 0; } @@ -1659,6 +1675,9 @@ static int run_redo_phase(LSN lsn, enum maria_apply_log_way apply) install_undo_exec_hook(UNDO_ROW_UPDATE); current_group_end_lsn= LSN_IMPOSSIBLE; +#ifndef DBUG_OFF + current_group_table= NULL; +#endif if (unlikely(lsn == LSN_IMPOSSIBLE || lsn == translog_get_horizon())) { @@ -1774,6 +1793,9 @@ static int run_redo_phase(LSN lsn, enum maria_apply_log_way apply) if (apply == MARIA_LOG_APPLY && display_and_apply_record(log_desc, &rec)) goto err; +#ifndef DBUG_OFF + current_group_table= NULL; +#endif } else /* record does not end group */ { @@ -1983,6 +2005,7 @@ static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon) _ma_state_info_write_sub(share->kfile.file, &share->state, 1); } _ma_reenable_logging_for_table(share); + info->trn= NULL; /* safety */ } @@ -2012,6 +2035,10 @@ static MARIA_HA *get_MARIA_HA_from_REDO_record(const break; } info= all_tables[sid].info; +#ifndef DBUG_OFF + DBUG_ASSERT(current_group_table == NULL || current_group_table == info); + current_group_table= info; +#endif if (info == NULL) { tprint(tracef, ", table skipped, so skipping record\n"); @@ -2074,6 +2101,10 @@ static MARIA_HA *get_MARIA_HA_from_UNDO_record(const sid= fileid_korr(rec->header + LSN_STORE_SIZE); tprint(tracef, " For table of short id %u", sid); info= all_tables[sid].info; +#ifndef DBUG_OFF + DBUG_ASSERT(current_group_table == NULL || current_group_table == info); + current_group_table= info; +#endif if (info == NULL) { tprint(tracef, ", table skipped, so skipping record\n");