mirror of
https://github.com/MariaDB/server.git
synced 2025-12-10 19:44:09 +03:00
WL#3071 - Maria checkpoint
Observe WAL for the table's state: all log records needed for undoing uncommitted state must be in the log before we flush state. storage/maria/ha_maria.cc: comments storage/maria/ma_bitmap.c: Comment for why there is no bug storage/maria/ma_blockrec.c: comment for why there is no bug storage/maria/ma_checkpoint.c: Observe WAL for the table's state: all log records needed for undoing uncommitted state must be in the log before we flush state. I tested by hand that the bug existed (create table, insert one row into it but let that insert pause after increasing data_file_length, let checkpoint start but kill it after it has flushed state). Log contains nothing, table is not recovered though it has a too big data_file_length. With this bugfix, the log contains REDO so table is opened so data_file_length is corrected. storage/maria/ma_close.c: If table is read-only we must never write to it. Should be a no-change in fact, as if read-only, share->changed is normally always false. storage/maria/ma_recovery.c: documenting bug found by Monty. Print when fixing data_file_length.
This commit is contained in:
@@ -1288,6 +1288,11 @@ int ha_maria::repair(THD *thd, HA_CHECK ¶m, bool do_optimize)
|
||||
thd->proc_info= "Repair with keycache";
|
||||
param.testflag &= ~T_REP_BY_SORT;
|
||||
error= maria_repair(¶m, file, fixed_name, param.testflag & T_QUICK);
|
||||
/**
|
||||
@todo RECOVERY BUG we do things with the index file
|
||||
(maria_sort_index() after the above which already has logged the
|
||||
record and bumped create_rename_lsn. Is it ok?
|
||||
*/
|
||||
}
|
||||
param.testflag= testflag;
|
||||
optimize_done= 1;
|
||||
@@ -1356,6 +1361,11 @@ int ha_maria::repair(THD *thd, HA_CHECK ¶m, bool do_optimize)
|
||||
thd->proc_info= old_proc_info;
|
||||
if (!thd->locked_tables)
|
||||
{
|
||||
/**
|
||||
@todo RECOVERY BUG find why this is needed. Monty says it's because a
|
||||
new non-transactional table is created by maria_repair(): find how this
|
||||
new table's state influences the old one's.
|
||||
*/
|
||||
_ma_reenable_logging_for_table(file->s);
|
||||
maria_lock_database(file, F_UNLCK);
|
||||
}
|
||||
|
||||
@@ -533,6 +533,17 @@ static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
|
||||
Inexistent or half-created page (could be crash in the middle of
|
||||
_ma_bitmap_create_first(), before appending maria_bitmap_marker).
|
||||
*/
|
||||
/*
|
||||
We are updating data_file_length before writing any log record for the
|
||||
row operation. What if now state is flushed by a checkpoint with the
|
||||
new value, and crash before the checkpoint record is written, recovery
|
||||
may not even open the table (no log records) so not fix
|
||||
data_file_length ("WAL violation")? In fact this is ok:
|
||||
- checkpoint flushes state only if share->id!=0
|
||||
- so if state was flushed, table had share->id!=0, so had a
|
||||
LOGREC_FILE_ID (or was in previous checkpoint record), so recovery will
|
||||
meet and open it and fix data_file_length.
|
||||
*/
|
||||
share->state.state.data_file_length= end_of_page;
|
||||
bzero(bitmap->map, bitmap->block_size);
|
||||
memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker),
|
||||
|
||||
@@ -1441,7 +1441,23 @@ static my_bool write_tail(MARIA_HA *info,
|
||||
/* Increase data file size, if extended */
|
||||
position= (my_off_t) block->page * block_size;
|
||||
if (info->state->data_file_length <= position)
|
||||
{
|
||||
/*
|
||||
We are modifying a state member before writing the UNDO; this is a WAL
|
||||
violation: assume this setting is made, checkpoint flushes new state,
|
||||
and crash happens before the UNDO is written: how to undo the bad state?
|
||||
Fortunately for data_file_length this is ok: as long as we change
|
||||
data_file_length after writing any REDO or UNDO we are safe:
|
||||
- checkpoint flushes state only if it's older than
|
||||
checkpoint_start_log_horizon, and flushes log up to that horizon first
|
||||
- so if checkpoint flushed state with new data_file_length, REDO is in
|
||||
log so LOGREC_FILE_ID too, recovery will meet and open the table thus
|
||||
fix data_file_length to be the file's physical size.
|
||||
Same property is currently true in all places of this file which change
|
||||
data_file_length.
|
||||
*/
|
||||
info->state->data_file_length= position + block_size;
|
||||
}
|
||||
|
||||
DBUG_ASSERT(share->pagecache->block_size == block_size);
|
||||
if (!(res= pagecache_write(share->pagecache,
|
||||
|
||||
@@ -176,7 +176,27 @@ static int really_execute_checkpoint(void)
|
||||
DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
|
||||
LSN_IN_PARTS(checkpoint_start_log_horizon)));
|
||||
lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
|
||||
|
||||
/*
|
||||
We are going to flush the state of some tables (in collect_tables()) if
|
||||
it's older than checkpoint_start_log_horizon. Before, all records
|
||||
describing how to undo this flushed state must be in the log
|
||||
(WAL). Usually this means UNDOs. In the special case of data_file_length,
|
||||
recovery just needs to open the table, so any LOGREC_FILE_ID/REDO/UNDO
|
||||
allowing recovery to understand it must open a table, is enough.
|
||||
*/
|
||||
/**
|
||||
Apart from data|key_file_length which are easily recoverable from the OS,
|
||||
all other state members must be updated only when writing the UNDO;
|
||||
otherwise, if updated before, if their new value is flushed by a
|
||||
checkpoint and there is a crash before UNDO is written, their REDO group
|
||||
will be missing or at least incomplete and skipped by recovery, so bad
|
||||
state value will stay. For example, setting key_root before writing the
|
||||
UNDO: the table would have old index page (they were pinned at time of
|
||||
crash) and a new, thus wrong, key_root.
|
||||
@todo RECOVERY BUG check that all code honours that.
|
||||
*/
|
||||
if (translog_flush(checkpoint_start_log_horizon))
|
||||
goto err;
|
||||
|
||||
/*
|
||||
STEP 2: fetch information about transactions.
|
||||
@@ -891,11 +911,15 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
|
||||
dfile= share->bitmap.file;
|
||||
/*
|
||||
Ignore table which has no logged writes (all its future log records will
|
||||
be found naturally by Recovery). Ignore obsolete shares (_before_
|
||||
setting themselves to last_version=0 they already did all flush and
|
||||
sync; if we flush their state now we may be flushing an obsolete state
|
||||
onto a newer one (assuming the table has been reopened with a different
|
||||
share but of course same physical index file).
|
||||
be found naturally by Recovery). This also avoids flushing
|
||||
a data_file_length changed too early by a client (before any log record
|
||||
was written, giving no chance to recovery to meet and open the table,
|
||||
see _ma_read_bitmap_page()).
|
||||
Ignore obsolete shares (_before_ setting themselves to last_version=0
|
||||
they already did all flush and sync; if we flush their state now we may
|
||||
be flushing an obsolete state onto a newer one (assuming the table has
|
||||
been reopened with a different share but of course same physical index
|
||||
file).
|
||||
*/
|
||||
if ((share->id != 0) && (share->last_version != 0))
|
||||
{
|
||||
@@ -950,6 +974,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
|
||||
It may also be a share which got last_version==0 since we checked
|
||||
last_version; in this case, it flushed its state and the LSN test
|
||||
above will catch it.
|
||||
Last, see comments at start of really_execute_checkpoint().
|
||||
*/
|
||||
}
|
||||
else
|
||||
|
||||
@@ -86,8 +86,9 @@ int maria_close(register MARIA_HA *info)
|
||||
may be using the file at this point
|
||||
IF using --external-locking, which does not apply to Maria.
|
||||
*/
|
||||
if ((share->changed && share->base.born_transactional) ||
|
||||
(share->mode != O_RDONLY && maria_is_crashed(info)))
|
||||
if (share->mode != O_RDONLY &&
|
||||
((share->changed && share->base.born_transactional) ||
|
||||
maria_is_crashed(info)))
|
||||
{
|
||||
/*
|
||||
State must be written to file as it was not done at table's
|
||||
|
||||
@@ -1066,8 +1066,16 @@ static int new_table(uint16 sid, const char *name,
|
||||
tprint(tracef, ", length unknown\n");
|
||||
goto end;
|
||||
}
|
||||
share->state.state.data_file_length= dfile_len;
|
||||
share->state.state.key_file_length= kfile_len;
|
||||
if (share->state.state.data_file_length != dfile_len)
|
||||
{
|
||||
tprint(tracef, ", has wrong state.data_file_length (fixing it)");
|
||||
share->state.state.data_file_length= dfile_len;
|
||||
}
|
||||
if (share->state.state.key_file_length != kfile_len)
|
||||
{
|
||||
tprint(tracef, ", has wrong state.key_file_length (fixing it)");
|
||||
share->state.state.key_file_length= kfile_len;
|
||||
}
|
||||
if ((dfile_len % share->block_size) > 0)
|
||||
{
|
||||
tprint(tracef, ", has too short last page\n");
|
||||
@@ -1145,6 +1153,19 @@ prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD)
|
||||
goto end;
|
||||
}
|
||||
buff= log_record_buffer.str;
|
||||
/**
|
||||
@todo RECOVERY BUG
|
||||
we stamp page with UNDO's LSN. Assume an operation logs REDO-REDO-UNDO
|
||||
where the two REDOs are about the same page. Then recovery applies first
|
||||
REDO and skips second REDO which is wrong. Solutions:
|
||||
a) when applying REDO, keep page pinned, don't stamp it, remember it;
|
||||
when seeing UNDO, unpin pages and stamp them; for BLOB pages we cannot
|
||||
pin them (too large for memory) so need an additional pass in REDO phase:
|
||||
- find UNDO
|
||||
- execute all REDOs about this UNDO but skipping REDOs for
|
||||
or b) when applying REDO, stamp page with REDO's LSN (=> difference in
|
||||
'cmp' between run-time and recovery, need a special 'cmp'...).
|
||||
*/
|
||||
if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
|
||||
HEAD_PAGE,
|
||||
buff + FILEID_STORE_SIZE,
|
||||
|
||||
Reference in New Issue
Block a user