diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 220534335e1..65d217d7e09 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -1288,6 +1288,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize)
       thd->proc_info= "Repair with keycache";
       param.testflag &= ~T_REP_BY_SORT;
       error= maria_repair(&param, file, fixed_name, param.testflag & T_QUICK);
+      /**
+         @todo RECOVERY BUG we do things with the index file
+         (maria_sort_index() after the above which already has logged the
+         record and bumped create_rename_lsn. Is it ok?
+      */
     }
     param.testflag= testflag;
     optimize_done= 1;
@@ -1356,6 +1361,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize)
   thd->proc_info= old_proc_info;
   if (!thd->locked_tables)
   {
+    /**
+       @todo RECOVERY BUG find why this is needed. Monty says it's because a
+       new non-transactional table is created by maria_repair(): find how this
+       new table's state influences the old one's.
+    */
     _ma_reenable_logging_for_table(file->s);
     maria_lock_database(file, F_UNLCK);
   }
diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c
index e0c1cbd73b0..304b7751aee 100644
--- a/storage/maria/ma_bitmap.c
+++ b/storage/maria/ma_bitmap.c
@@ -533,6 +533,17 @@ static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
       Inexistent or half-created page (could be crash in the middle of
       _ma_bitmap_create_first(), before appending maria_bitmap_marker).
     */
+    /*
+      We are updating data_file_length before writing any log record for the
+      row operation. What if now state is flushed by a checkpoint with the
+      new value, and crash before the checkpoint record is written, recovery
+      may not even open the table (no log records) so not fix
+      data_file_length ("WAL violation")? In fact this is ok:
+      - checkpoint flushes state only if share->id!=0
+      - so if state was flushed, table had share->id!=0, so had a
+      LOGREC_FILE_ID (or was in previous checkpoint record), so recovery will
+      meet and open it and fix data_file_length.
+    */
     share->state.state.data_file_length= end_of_page;
     bzero(bitmap->map, bitmap->block_size);
     memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker),
diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c
index a159756f509..b9702b22149 100644
--- a/storage/maria/ma_blockrec.c
+++ b/storage/maria/ma_blockrec.c
@@ -1441,7 +1441,23 @@ static my_bool write_tail(MARIA_HA *info,
   /* Increase data file size, if extended */
   position= (my_off_t) block->page * block_size;
   if (info->state->data_file_length <= position)
+  {
+    /*
+      We are modifying a state member before writing the UNDO; this is a WAL
+      violation: assume this setting is made, checkpoint flushes new state,
+      and crash happens before the UNDO is written: how to undo the bad state?
+      Fortunately for data_file_length this is ok: as long as we change
+      data_file_length after writing any REDO or UNDO we are safe:
+      - checkpoint flushes state only if it's older than
+      checkpoint_start_log_horizon, and flushes log up to that horizon first
+      - so if checkpoint flushed state with new data_file_length, REDO is in
+      log so LOGREC_FILE_ID too, recovery will meet and open the table thus
+      fix data_file_length to be the file's physical size.
+      Same property is currently true in all places of this file which change
+      data_file_length.
+    */
     info->state->data_file_length= position + block_size;
+  }
 
   DBUG_ASSERT(share->pagecache->block_size == block_size);
   if (!(res= pagecache_write(share->pagecache,
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
index 76f1723b053..8eac0908d19 100644
--- a/storage/maria/ma_checkpoint.c
+++ b/storage/maria/ma_checkpoint.c
@@ -176,7 +176,27 @@ static int really_execute_checkpoint(void)
   DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
                      LSN_IN_PARTS(checkpoint_start_log_horizon)));
   lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
-
+  /*
+    We are going to flush the state of some tables (in collect_tables()) if
+    it's older than checkpoint_start_log_horizon. Before, all records
+    describing how to undo this flushed state must be in the log
+    (WAL). Usually this means UNDOs. In the special case of data_file_length,
+    recovery just needs to open the table, so any LOGREC_FILE_ID/REDO/UNDO
+    allowing recovery to understand it must open a table, is enough.
+  */
+  /**
+     Apart from data|key_file_length which are easily recoverable from the OS,
+     all other state members must be updated only when writing the UNDO;
+     otherwise, if updated before, if their new value is flushed by a
+     checkpoint and there is a crash before UNDO is written, their REDO group
+     will be missing or at least incomplete and skipped by recovery, so bad
+     state value will stay. For example, setting key_root before writing the
+     UNDO: the table would have old index page (they were pinned at time of
+     crash) and a new, thus wrong, key_root.
+     @todo RECOVERY BUG check that all code honours that.
+  */
+  if (translog_flush(checkpoint_start_log_horizon))
+    goto err;
 
   /*
     STEP 2: fetch information about transactions.
@@ -891,11 +911,15 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
     dfile= share->bitmap.file;
     /*
       Ignore table which has no logged writes (all its future log records will
-      be found naturally by Recovery). Ignore obsolete shares (_before_
-      setting themselves to last_version=0 they already did all flush and
-      sync; if we flush their state now we may be flushing an obsolete state
-      onto a newer one (assuming the table has been reopened with a different
-      share but of course same physical index file).
+      be found naturally by Recovery). This also avoids flushing
+      a data_file_length changed too early by a client (before any log record
+      was written, giving no chance to recovery to meet and open the table,
+      see _ma_read_bitmap_page()).
+      Ignore obsolete shares (_before_ setting themselves to last_version=0
+      they already did all flush and sync; if we flush their state now we may
+      be flushing an obsolete state onto a newer one (assuming the table has
+      been reopened with a different share but of course same physical index
+      file).
     */
     if ((share->id != 0) && (share->last_version != 0))
     {
@@ -950,6 +974,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
           It may also be a share which got last_version==0 since we checked
           last_version; in this case, it flushed its state and the LSN test
           above will catch it.
+          Last, see comments at start of really_execute_checkpoint().
         */
       }
       else
diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c
index 9b654803945..728c92efdd2 100644
--- a/storage/maria/ma_close.c
+++ b/storage/maria/ma_close.c
@@ -86,8 +86,9 @@ int maria_close(register MARIA_HA *info)
         may be using the file at this point
         IF using --external-locking, which does not apply to Maria.
       */
-      if ((share->changed && share->base.born_transactional) ||
-          (share->mode != O_RDONLY && maria_is_crashed(info)))
+      if (share->mode != O_RDONLY &&
+          ((share->changed && share->base.born_transactional) ||
+           maria_is_crashed(info)))
       {
         /*
           State must be written to file as it was not done at table's
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
index 97c9b56d9e8..bf16fb8a022 100644
--- a/storage/maria/ma_recovery.c
+++ b/storage/maria/ma_recovery.c
@@ -1066,8 +1066,16 @@ static int new_table(uint16 sid, const char *name,
     tprint(tracef, ", length unknown\n");
     goto end;
   }
-  share->state.state.data_file_length= dfile_len;
-  share->state.state.key_file_length=  kfile_len;
+  if (share->state.state.data_file_length != dfile_len)
+  {
+    tprint(tracef, ", has wrong state.data_file_length (fixing it)");
+    share->state.state.data_file_length= dfile_len;
+  }
+  if (share->state.state.key_file_length != kfile_len)
+  {
+    tprint(tracef, ", has wrong state.key_file_length (fixing it)");
+    share->state.state.key_file_length= kfile_len;
+  }
   if ((dfile_len % share->block_size) > 0)
   {
     tprint(tracef, ", has too short last page\n");
@@ -1145,6 +1153,19 @@ prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD)
     goto end;
   }
   buff= log_record_buffer.str;
+  /**
+     @todo RECOVERY BUG
+     we stamp page with UNDO's LSN. Assume an operation logs REDO-REDO-UNDO
+     where the two REDOs are about the same page. Then recovery applies first
+     REDO and skips second REDO which is wrong. Solutions:
+     a) when applying REDO, keep page pinned, don't stamp it, remember it;
+     when seeing UNDO, unpin pages and stamp them; for BLOB pages we cannot
+     pin them (too large for memory) so need an additional pass in REDO phase:
+      - find UNDO
+      - execute all REDOs about this UNDO but skipping REDOs for
+     or b) when applying REDO, stamp page with REDO's LSN (=> difference in
+     'cmp' between run-time and recovery, need a special 'cmp'...).
+  */
   if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
                                              HEAD_PAGE,
                                              buff + FILEID_STORE_SIZE,