1
0
mirror of https://github.com/MariaDB/server.git synced 2025-12-24 11:21:21 +03:00
Files
mariadb/storage/maria/ma_checkpoint.c
unknown fa05e9c9f4 WL#3071 - Maria checkpoint
Adding rec_lsn to Maria's page cache. Misc fixes to Checkpoint.


mysys/mf_pagecache.c:
  adding rec_lsn, the LSN when a page first became dirty.
  It is set when unlocking a page (TODO: should also be set when
  the unlocking is an implicit part of pagecache_write()).
  It is reset in link_to_file_list() and free_block()
  (one of which is used every time we flush a block).
  It is a ulonglong and not LSN, because its destination is comparisons
  for which ulonglong is better than a struct.
storage/maria/ma_checkpoint.c:
  misc fixes to Checkpoint (updates now that the transaction manager
  and the page cache are more known)
storage/maria/ma_close.c:
  an important note for the future.
storage/maria/ma_least_recently_dirtied.c:
  comment
2006-12-16 18:10:47 +01:00

543 lines
18 KiB
C

/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
/*
WL#3071 Maria checkpoint
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* Here is the implementation of this module */
/*
Summary:
- there are asynchronous checkpoints (a writer to the log notices that it's
been a long time since we last checkpoint-ed, so posts a request for a
background thread to do a checkpoint; does not care about the success of the
checkpoint). Then the checkpoint is done by the checkpoint thread, at an
unspecified moment ("later") (==soon, of course).
- there are synchronous checkpoints: a thread requests a checkpoint to
happen now and wants to know when it finishes and if it succeeded; then the
checkpoint is done by that same thread.
*/
#include "page_cache.h"
#include "least_recently_dirtied.h"
#include "transaction.h"
#include "share.h"
#include "log.h"
#define LSN_IMPOSSIBLE ((LSN)0) /* could also be called LSN_ERROR */
#define LSN_MAX ((LSN)ULONGLONG_MAX)
/*
this transaction is used for any system work (purge, checkpoint writing
etc), that is, background threads. It will not be declared/initialized here
in the final version.
*/
st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,...};
/* those three are protected by the log's mutex */
/*
The maximum rec_lsn in the LRD when last checkpoint was run, serves for the
MEDIUM checkpoint.
*/
LSN max_rec_lsn_at_last_checkpoint= 0;
/* last submitted checkpoint request; cleared only when executed */
CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
static inline ulonglong read_non_atomic(ulonglong volatile *x);
/*
Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA
ENGINE DO CHECKPOINT"), and probably by maria_panic(), and at the end of the
UNDO recovery phase.
*/
my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
{
my_bool result;
DBUG_ENTER("execute_synchronous_checkpoint");
DBUG_ASSERT(level > NONE);
lock(log_mutex);
while ((synchronous_checkpoint_in_progress != NONE) ||
(next_asynchronous_checkpoint_to_do != NONE))
wait_on_checkpoint_done_cond();
synchronous_checkpoint_in_progress= level;
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
synchronous_checkpoint_in_progress= NONE;
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
/*
If no checkpoint is running, and there is a pending asynchronous checkpoint
request, executes it.
Is safe if multiple threads call it, though in first version only one will.
It's intended to be used by a thread which regularly calls this function;
this is why, if there is a request,it does not wait in a loop for
synchronous checkpoints to be finished, but just exits (because the thread
may want to do something useful meanwhile (flushing dirty pages for example)
instead of waiting).
*/
my_bool execute_asynchronous_checkpoint_if_any()
{
my_bool result;
CHECKPOINT_LEVEL level;
DBUG_ENTER("execute_asynchronous_checkpoint");
lock(log_mutex);
if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
(synchronous_checkpoint_in_progress != NONE)))
{
unlock(log_mutex);
DBUG_RETURN(FALSE);
}
level= next_asynchronous_checkpoint_to_do;
DBUG_ASSERT(level > NONE);
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
/* If only one thread calls this function, "<" can never happen below */
if (next_asynchronous_checkpoint_to_do <= level)
{
/* it's our request or weaker/equal ones, all work is done */
next_asynchronous_checkpoint_to_do= NONE;
}
/* otherwise if it is a stronger request, we'll deal with it at next call */
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
/*
Does the actual checkpointing. Called by
execute_synchronous_checkpoint() and
execute_asynchronous_checkpoint_if_any().
*/
my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
{
DBUG_ENTER("execute_checkpoint");
safemutex_assert_owner(log_mutex);
if (unlikely(level > INDIRECT))
{
LSN copy_of_max_rec_lsn_at_last_checkpoint=
max_rec_lsn_at_last_checkpoint;
/* much I/O work to do, release log mutex */
unlock(log_mutex);
switch (level)
{
case FULL:
/* flush all pages up to the current end of the LRD */
flush_all_LRD_to_lsn(LSN_MAX);
/* this will go full speed (normal scheduling, no sleep) */
break;
case MEDIUM:
/*
flush all pages which were already dirty at last checkpoint:
ensures that recovery will never start from before the next-to-last
checkpoint (two-checkpoint rule).
*/
flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint);
/* this will go full speed (normal scheduling, no sleep) */
break;
}
lock(log_mutex);
}
/*
keep mutex locked upon exit because callers will want to clear
mutex-protected status variables
*/
DBUG_RETURN(execute_checkpoint_indirect());
}
/*
Does an indirect checpoint (collects data from data structures, writes into
a checkpoint log record).
Starts and ends while having log's mutex (released in the middle).
*/
my_bool execute_checkpoint_indirect()
{
int error= 0;
/* checkpoint record data: */
LSN checkpoint_start_lsn;
LEX_STRING string1={0,0}, string2={0,0}, string3={0,0};
LEX_STRING *string_array[4];
char *ptr;
LSN checkpoint_lsn;
LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
DBUG_ENTER("execute_checkpoint_indirect");
DBUG_ASSERT(sizeof(byte *) <= 8);
DBUG_ASSERT(sizeof(LSN) <= 8);
safemutex_assert_owner(log_mutex);
checkpoint_start_lsn= log_read_end_lsn();
if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */
DBUG_RETURN(TRUE);
unlock(log_mutex);
DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));
/* STEP 1: fetch information about dirty pages */
/* note: this piece will move into mysys/mf_pagecache.c */
{
ulong stored_LRD_size= 0;
/*
We lock the entire cache but will be quick, just reading/writing a few MBs
of memory at most.
When we enter here, we must be sure that no "first_in_switch" situation
is happening or will happen (either we have to get rid of
first_in_switch in the code or, first_in_switch has to increment a
"danger" counter for Checkpoint to know it has to wait.
*/
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
/*
This is an over-estimation, as in theory blocks_changed may contain
non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the
checkpoint record; the true number of page-LRD-info we'll store into the
record is stored_LRD_size.
*/
/*
TODO: Ingo says blocks_changed is not a reliable number (see his
document); ask him.
*/
string1.length= 8+8+(8+8+8)*pagecache->blocks_changed;
if (NULL == (string1.str= my_malloc(string1.length)))
goto err;
ptr= string1.str;
int8store(ptr, checkpoint_start_lsn);
ptr+= 8+8; /* don't store stored_LRD_size now, wait */
if (pagecache->blocks_changed > 0)
{
uint file_hash;
for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
{
PAGECACHE_BLOCK_LINK *block;
for (block= pagecache->changed_blocks[file_hash] ;
block;
block= block->next_changed)
{
DBUG_ASSERT(block->hash_link != NULL);
DBUG_ASSERT(block->status & BLOCK_CHANGED);
if (block->type != PAGECACHE_LSN_PAGE)
{
continue; /* no need to store it in the checkpoint record */
}
/*
In the current pagecache, rec_lsn is not set correctly:
1) it is set on pagecache_unlock(), too late (a page is dirty
(BLOCK_CHANGED) since the first pagecache_write()). It may however
be not too late, because until unlock(), the page's update is not
committed, so it's ok that REDOs for it be skipped at Recovery
(which is what happens with an unset rec_lsn). Note that this
relies on the assumption that a transaction never commits while
holding locks on pages.
2) sometimes the unlocking can be an implicit action of
pagecache_write(), without any call to pagecache_unlock(), then
rec_lsn is not set. That one is a critical problem.
TODO: fix this when Monty has explained how he writes BLOB pages.
*/
if (0 == block->rec_lsn)
abort(); /* always fail in all builds, in case it's problem 2) */
int8store(ptr, block->hash_link->file.file);
ptr+= 8;
int8store(ptr, block->hash_link->pageno);
ptr+= 8;
int8store(ptr, block->rec_lsn);
ptr+= 8;
stored_LRD_size++;
DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
block->rec_lsn);
}
}
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
int8store(string1.str+8, stored_LRD_size);
string1.length= 8+8+(8+8+8)*stored_LRD_size;
}
/* STEP 2: fetch information about transactions */
/* note: this piece will move into trnman.c */
/*
Transactions are in the "active list" (protected by a mutex) and in a
lock-free hash of "committed" (insertion protected by the same mutex,
deletion lock-free).
*/
{
TRN *trn;
ulong stored_trn_size= 0;
/* First, the active transactions */
pthread_mutex_lock(LOCK_trn_list);
string2.length= 8+(7+2+8+8+8)*trnman_active_transactions;
if (NULL == (string2.str= my_malloc(string2.length)))
goto err;
ptr= string2.str;
ptr+= 8;
for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
{
/* we would latch trn.rwlock if it existed */
if (0 == trn->short_trid) /* trn is not inited, skip */
continue;
/* state is not needed for now (only when we have prepared trx) */
/* int7store does not exist but mi_int7store does */
int7store(ptr, trn->trid);
ptr+= 7;
int2store(ptr, trn->short_trid);
ptr+= 2;
int8store(ptr, trn->undo_lsn); /* is an LSN 7 or 8 bytes really? */
ptr+= 8;
int8store(ptr, trn->undo_purge_lsn);
ptr+= 8;
int8store(ptr, read_non_atomic(&trn->first_undo_lsn));
ptr+= 8;
/* possibly unlatch el.rwlock */
stored_trn_size++;
}
pthread_mutex_unlock(LOCK_trn_list);
/*
Now the committed ones.
We need a function which scans the hash's list of elements in a
lock-free manner (a bit like lfind(), starting from bucket 0), and for
each node (committed transaction) stores the transaction's
information (trid, undo_purge_lsn, first_undo_lsn) into a buffer.
This big buffer is malloc'ed at the start, so the number of elements (or
an upper bound of it) found in the hash needs to be known in advance
(one solution is to keep LOCK_trn_list locked, ensuring that nodes are
only deleted).
*/
/*
TODO: if we see there exists no transaction (active and committed) we can
tell the lock-free structures to do some freeing (my_free()).
*/
int8store(string1.str, stored_trn_size);
string2.length= 8+(7+2+8+8+8)*stored_trn_size;
}
/* STEP 3: fetch information about table files */
{
/* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
lock(global_share_list_mutex);
string3.length= 8+(8+8)*share_list->count;
if (NULL == (string3.str= my_malloc(string3.length)))
goto err;
ptr= string3.str;
/* possibly latch each MARIA_SHARE, one by one, like this: */
pthread_mutex_lock(&share->intern_lock);
/*
We'll copy the file id (a bit like share->kfile), the file name
(like share->unique_file_name[_length]).
*/
make_copy_of_global_share_list_to_array;
pthread_mutex_unlock(&share->intern_lock);
unlock(global_share_list_mutex);
/* work on copy */
int8store(ptr, elements_in_array);
ptr+= 8;
for (el in array)
{
int8store(ptr, array[...].file_id);
ptr+= 8;
memcpy(ptr, array[...].file_name, ...);
ptr+= ...;
/*
these two are long ops (involving disk I/O) that's why we copied the
list, to not keep the list locked for long:
*/
/* TODO: what if the table pointer is gone/reused now? */
flush_bitmap_pages(el);
/* TODO: and also autoinc counter, logical file end, free page list */
/*
fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
second, so if you have touched 1000 files it's 7 seconds).
*/
force_file(el);
}
}
/* LAST STEP: now write the checkpoint log record */
string_array[0]= string1;
string_array[1]= string2;
string_array[2]= string3;
string_array[3]= NULL;
checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT,
&system_trans, string_array);
/*
Do nothing between the log write and the control file write, for the
"repair control file" tool to be possible one day.
*/
if (LSN_IMPOSSIBLE == checkpoint_lsn)
goto err;
if (0 != control_file_write_and_force(checkpoint_lsn, NULL))
goto err;
goto end;
err:
print_error_to_error_log(the_error_message);
candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE;
end:
my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR));
/*
this portion cannot be done as a hook in write_log_record() for the
LOGREC_CHECKPOINT type because:
- at that moment we still have not written to the control file so cannot
mark the request as done; this could be solved by writing to the control
file in the hook but that would be an I/O under the log's mutex, bad.
- it would not be nice organisation of code (I tried it :).
*/
if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
{
/* checkpoint succeeded */
/*
TODO: compute log's low water mark (how to do that with our fuzzy
ARIES-like reads of data structures? TODO think about it :).
*/
lock(log_mutex);
/* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */
maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
written_since_last_checkpoint= (my_off_t)0;
DBUG_RETURN(FALSE);
}
lock(log_mutex);
DBUG_RETURN(TRUE);
/*
keep mutex locked upon exit because callers will want to clear
mutex-protected status variables
*/
}
/*
Here's what should be put in log_write_record() in the log handler:
*/
log_write_record(...)
{
...;
lock(log_mutex);
...;
write_to_log(length);
written_since_last_checkpoint+= length;
if (written_since_last_checkpoint >
MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS)
{
/*
ask one system thread (the "LRD background flusher and checkpointer
thread" WL#3261) to do a checkpoint
*/
request_asynchronous_checkpoint(INDIRECT);
}
...;
unlock(log_mutex);
...;
}
/*
Requests a checkpoint from the background thread, *asynchronously*
(requestor does not wait for completion, and does not even later check the
result).
In real life it will be called by log_write_record().
*/
void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
{
safemutex_assert_owner(log_mutex);
DBUG_ASSERT(level > NONE);
if (next_asynchronous_checkpoint_to_do < level)
{
/* no equal or stronger running or to run, we post request */
/*
note that thousands of requests for checkpoints are going to come all
at the same time (when the log bound
MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS is passed), so it may not be a
good idea for each of them to broadcast a cond to wake up the background
checkpoint thread. We just don't broacast a cond, the checkpoint thread
(see least_recently_dirtied.c) will notice our request in max a few
seconds.
*/
next_asynchronous_checkpoint_to_do= level; /* post request */
}
/*
If there was an error, only an error
message to the error log will say it; normal, for a checkpoint triggered
by a log write, we probably don't want the client's log write to throw an
error, as the log write succeeded and a checkpoint failure is not
critical: the failure in this case is more for the DBA to know than for
the end user.
*/
}
/*
If a 64-bit variable transitions from both halves being zero to both halves
being non-zero, and never changes after that (like the transaction's
first_undo_lsn), this function can be used to do a read of it (without
mutex, without atomic load) which always produces a correct (though maybe
slightly old) value (even on 32-bit CPUs).
*/
static inline ulonglong read_non_atomic(ulonglong volatile *x)
{
#if ( SIZEOF_CHARP >= 8 )
/* 64-bit CPU (right?), 64-bit reads are atomic */
return *x;
#else
/*
32-bit CPU, 64-bit reads may give a mixed of old half and new half (old
low bits and new high bits, or the contrary).
As the variable we read transitions from both halves being zero to both
halves being non-zero, and never changes then, we can detect atomicity
problems:
*/
ulonglong y;
for (;;) /* loop until no atomicity problems */
{
y= *x;
if (likely(((0 == y) ||
((0 != (y >> 32)) && (0 != (y << 32)))))
return y;
/* Worth seeing it! */
DBUG_PRINT("info",("atomicity problem"));
}
#endif
}