diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc index 220534335e1..65d217d7e09 100644 --- a/storage/maria/ha_maria.cc +++ b/storage/maria/ha_maria.cc @@ -1288,6 +1288,11 @@ int ha_maria::repair(THD *thd, HA_CHECK ¶m, bool do_optimize) thd->proc_info= "Repair with keycache"; param.testflag &= ~T_REP_BY_SORT; error= maria_repair(¶m, file, fixed_name, param.testflag & T_QUICK); + /** + @todo RECOVERY BUG we do things with the index file + (maria_sort_index() after the above which already has logged the + record and bumped create_rename_lsn. Is it ok? + */ } param.testflag= testflag; optimize_done= 1; @@ -1356,6 +1361,11 @@ int ha_maria::repair(THD *thd, HA_CHECK ¶m, bool do_optimize) thd->proc_info= old_proc_info; if (!thd->locked_tables) { + /** + @todo RECOVERY BUG find why this is needed. Monty says it's because a + new non-transactional table is created by maria_repair(): find how this + new table's state influences the old one's. + */ _ma_reenable_logging_for_table(file->s); maria_lock_database(file, F_UNLCK); } diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c index e0c1cbd73b0..304b7751aee 100644 --- a/storage/maria/ma_bitmap.c +++ b/storage/maria/ma_bitmap.c @@ -533,6 +533,17 @@ static my_bool _ma_read_bitmap_page(MARIA_SHARE *share, Inexistent or half-created page (could be crash in the middle of _ma_bitmap_create_first(), before appending maria_bitmap_marker). */ + /* + We are updating data_file_length before writing any log record for the + row operation. What if now state is flushed by a checkpoint with the + new value, and crash before the checkpoint record is written, recovery + may not even open the table (no log records) so not fix + data_file_length ("WAL violation")? In fact this is ok: + - checkpoint flushes state only if share->id!=0 + - so if state was flushed, table had share->id!=0, so had a + LOGREC_FILE_ID (or was in previous checkpoint record), so recovery will + meet and open it and fix data_file_length. + */ share->state.state.data_file_length= end_of_page; bzero(bitmap->map, bitmap->block_size); memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker), diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c index a159756f509..b9702b22149 100644 --- a/storage/maria/ma_blockrec.c +++ b/storage/maria/ma_blockrec.c @@ -1441,7 +1441,23 @@ static my_bool write_tail(MARIA_HA *info, /* Increase data file size, if extended */ position= (my_off_t) block->page * block_size; if (info->state->data_file_length <= position) + { + /* + We are modifying a state member before writing the UNDO; this is a WAL + violation: assume this setting is made, checkpoint flushes new state, + and crash happens before the UNDO is written: how to undo the bad state? + Fortunately for data_file_length this is ok: as long as we change + data_file_length after writing any REDO or UNDO we are safe: + - checkpoint flushes state only if it's older than + checkpoint_start_log_horizon, and flushes log up to that horizon first + - so if checkpoint flushed state with new data_file_length, REDO is in + log so LOGREC_FILE_ID too, recovery will meet and open the table thus + fix data_file_length to be the file's physical size. + Same property is currently true in all places of this file which change + data_file_length. + */ info->state->data_file_length= position + block_size; + } DBUG_ASSERT(share->pagecache->block_size == block_size); if (!(res= pagecache_write(share->pagecache, diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c index 76f1723b053..8eac0908d19 100644 --- a/storage/maria/ma_checkpoint.c +++ b/storage/maria/ma_checkpoint.c @@ -176,7 +176,27 @@ static int really_execute_checkpoint(void) DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)", LSN_IN_PARTS(checkpoint_start_log_horizon))); lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon); - + /* + We are going to flush the state of some tables (in collect_tables()) if + it's older than checkpoint_start_log_horizon. Before, all records + describing how to undo this flushed state must be in the log + (WAL). Usually this means UNDOs. In the special case of data_file_length, + recovery just needs to open the table, so any LOGREC_FILE_ID/REDO/UNDO + allowing recovery to understand it must open a table, is enough. + */ + /** + Apart from data|key_file_length which are easily recoverable from the OS, + all other state members must be updated only when writing the UNDO; + otherwise, if updated before, if their new value is flushed by a + checkpoint and there is a crash before UNDO is written, their REDO group + will be missing or at least incomplete and skipped by recovery, so bad + state value will stay. For example, setting key_root before writing the + UNDO: the table would have old index page (they were pinned at time of + crash) and a new, thus wrong, key_root. + @todo RECOVERY BUG check that all code honours that. + */ + if (translog_flush(checkpoint_start_log_horizon)) + goto err; /* STEP 2: fetch information about transactions. @@ -891,11 +911,15 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) dfile= share->bitmap.file; /* Ignore table which has no logged writes (all its future log records will - be found naturally by Recovery). Ignore obsolete shares (_before_ - setting themselves to last_version=0 they already did all flush and - sync; if we flush their state now we may be flushing an obsolete state - onto a newer one (assuming the table has been reopened with a different - share but of course same physical index file). + be found naturally by Recovery). This also avoids flushing + a data_file_length changed too early by a client (before any log record + was written, giving no chance to recovery to meet and open the table, + see _ma_read_bitmap_page()). + Ignore obsolete shares (_before_ setting themselves to last_version=0 + they already did all flush and sync; if we flush their state now we may + be flushing an obsolete state onto a newer one (assuming the table has + been reopened with a different share but of course same physical index + file). */ if ((share->id != 0) && (share->last_version != 0)) { @@ -950,6 +974,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) It may also be a share which got last_version==0 since we checked last_version; in this case, it flushed its state and the LSN test above will catch it. + Last, see comments at start of really_execute_checkpoint(). */ } else diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c index 9b654803945..728c92efdd2 100644 --- a/storage/maria/ma_close.c +++ b/storage/maria/ma_close.c @@ -86,8 +86,9 @@ int maria_close(register MARIA_HA *info) may be using the file at this point IF using --external-locking, which does not apply to Maria. */ - if ((share->changed && share->base.born_transactional) || - (share->mode != O_RDONLY && maria_is_crashed(info))) + if (share->mode != O_RDONLY && + ((share->changed && share->base.born_transactional) || + maria_is_crashed(info))) { /* State must be written to file as it was not done at table's diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c index 97c9b56d9e8..bf16fb8a022 100644 --- a/storage/maria/ma_recovery.c +++ b/storage/maria/ma_recovery.c @@ -1066,8 +1066,16 @@ static int new_table(uint16 sid, const char *name, tprint(tracef, ", length unknown\n"); goto end; } - share->state.state.data_file_length= dfile_len; - share->state.state.key_file_length= kfile_len; + if (share->state.state.data_file_length != dfile_len) + { + tprint(tracef, ", has wrong state.data_file_length (fixing it)"); + share->state.state.data_file_length= dfile_len; + } + if (share->state.state.key_file_length != kfile_len) + { + tprint(tracef, ", has wrong state.key_file_length (fixing it)"); + share->state.state.key_file_length= kfile_len; + } if ((dfile_len % share->block_size) > 0) { tprint(tracef, ", has too short last page\n"); @@ -1145,6 +1153,19 @@ prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD) goto end; } buff= log_record_buffer.str; + /** + @todo RECOVERY BUG + we stamp page with UNDO's LSN. Assume an operation logs REDO-REDO-UNDO + where the two REDOs are about the same page. Then recovery applies first + REDO and skips second REDO which is wrong. Solutions: + a) when applying REDO, keep page pinned, don't stamp it, remember it; + when seeing UNDO, unpin pages and stamp them; for BLOB pages we cannot + pin them (too large for memory) so need an additional pass in REDO phase: + - find UNDO + - execute all REDOs about this UNDO but skipping REDOs for + or b) when applying REDO, stamp page with REDO's LSN (=> difference in + 'cmp' between run-time and recovery, need a special 'cmp'...). + */ if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, HEAD_PAGE, buff + FILEID_STORE_SIZE,