From 48a596edac4a85d3fd49f6e60c215287a485df40 Mon Sep 17 00:00:00 2001 From: "istruewing@chilla.local" <> Date: Wed, 31 Jan 2007 18:49:07 +0100 Subject: [PATCH 01/40] Bug#17332 - changing key_buffer_size on a running server can crash under load Resizing a key cache while it was in heavy use could crash the server. There were several race conditions. I reworked some of the algorithms to fix the race conditions. No test case. Repeating the crashes requires heavy concurrent load on the key cache. A test script is attached to the bug report. More explanations to the changes are contained in a text file attached to the bug report. --- include/keycache.h | 6 + mysys/mf_keycache.c | 2566 ++++++++++++++++++++++++++++------- mysys/my_static.c | 3 - sql/handler.cc | 4 +- sql/sql_table.cc | 7 +- storage/myisam/ha_myisam.cc | 2 +- storage/myisam/mi_preload.c | 27 + 7 files changed, 2136 insertions(+), 479 deletions(-) diff --git a/include/keycache.h b/include/keycache.h index dc763b8cc08..4d99c68844f 100644 --- a/include/keycache.h +++ b/include/keycache.h @@ -44,6 +44,7 @@ typedef struct st_keycache_wqueue typedef struct st_key_cache { my_bool key_cache_inited; + my_bool in_resize; /* true during resize operation */ my_bool resize_in_flush; /* true during flush of resize operation */ my_bool can_be_used; /* usage of cache for read/write is allowed */ uint key_cache_shift; @@ -72,6 +73,11 @@ typedef struct st_key_cache BLOCK_LINK *used_ins; /* ptr to the insertion block in LRU chain */ pthread_mutex_t cache_lock; /* to lock access to the cache structure */ KEYCACHE_WQUEUE resize_queue; /* threads waiting during resize operation */ + /* + Waiting for a zero resize count. Using a queue for symmetry though + only one thread can wait here. + */ + KEYCACHE_WQUEUE waiting_for_resize_cnt; KEYCACHE_WQUEUE waiting_for_hash_link; /* waiting for a free hash link */ KEYCACHE_WQUEUE waiting_for_block; /* requests waiting for a free block */ BLOCK_LINK *changed_blocks[CHANGED_BLOCKS_HASH]; /* hash for dirty file bl.*/ diff --git a/mysys/mf_keycache.c b/mysys/mf_keycache.c index ff202e7b313..263d5384057 100644 --- a/mysys/mf_keycache.c +++ b/mysys/mf_keycache.c @@ -36,6 +36,64 @@ blocks_unused is the sum of never used blocks in the pool and of currently free blocks. blocks_used is the number of blocks fetched from the pool and as such gives the maximum number of in-use blocks at any time. + + Key Cache Locking + ================= + + All key cache locking is done with a single mutex per key cache: + keycache->cache_lock. This mutex is locked almost all the time + when executing code in this file (mf_keycache.c). + However it is released for I/O and some copy operations. + + The cache_lock is also released when waiting for some event. Waiting + and signalling is done via condition variables. In most cases the + thread waits on its thread->suspend condition variable. Every thread + has a my_thread_var structure, which contains this variable and a + '*next' and '**prev' pointer. These pointers are used to insert the + thread into a wait queue. + + NOTE: Since there is only one pair of queue pointers per thread, a + thread can be in one wait queue only. + + Before starting to wait on its condition variable with + pthread_cond_wait(), the thread enters itself to a specific wait queue + with link_into_queue() (double linked with '*next' + '**prev') or + wait_on_queue() (single linked with '*next'). + + Another thread, when releasing a resource, looks up the waiting thread + in the related wait queue. 
It sends a signal with
+  pthread_cond_signal() to the waiting thread.
+
+  NOTE: Depending on the particular wait situation, either the sending
+  thread removes the waiting thread from the wait queue with
+  unlink_from_queue() or release_whole_queue() respectively, or the
+  waiting thread removes itself.
+
+  There is one exception to this locking scheme. Each block has a
+  reference to a condition variable (condvar). It holds a reference to
+  the thread->suspend condition variable if that thread is waiting for
+  the block. When that thread is signalled, the reference is cleared.
+  This is similar to the above, but it clearly means that only one
+  thread can wait for a particular block. There is no queue in this
+  case. Strangely enough, block->condvar is used for waiting for the
+  assigned hash_link only. More precisely, it is used to wait for all
+  requests to be unregistered from the assigned hash_link.
+
+  The resize_queue serves two purposes:
+  1. Threads that want to do a resize wait there if in_resize is set.
+     This is not used in the server. The server refuses a second resize
+     request if one is already active. keycache->in_init is used for the
+     synchronization. See set_var.cc.
+  2. Threads that want to access blocks during resize wait here during
+     the re-initialization phase.
+  When the resize is done, all threads on the queue are signalled.
+  Hypothetical resizers can compete for resizing, and read/write
+  requests will restart and request blocks from the freshly resized
+  cache. If the cache has been resized too small, it is disabled and
+  'can_be_used' is false. In this case read/write requests bypass the
+  cache. Since they increment and decrement 'cnt_for_resize_op', the
+  next resizer can wait on the queue 'waiting_for_resize_cnt' until all
+  I/O has finished.
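+
+  For illustration, this is roughly how a read/write request brackets
+  its work with the resize machinery (a simplified sketch only; the
+  real sequence lives in key_cache_read() below, and
+  inc_counter_for_resize_op()/dec_counter_for_resize_op() are the
+  helpers defined in this file):
+
+    keycache_pthread_mutex_lock(&keycache->cache_lock);
+    /* Wait out the re-initialization phase of a resize. */
+    while (keycache->in_resize && !keycache->resize_in_flush)
+      wait_on_queue(&keycache->resize_queue, &keycache->cache_lock);
+    /* Register the I/O so that the next resizer waits for it. */
+    inc_counter_for_resize_op(keycache);
+    /* ... cached or direct I/O, releasing cache_lock around file access ... */
+    /* Unregister; signals 'waiting_for_resize_cnt' when the count drops to 0. */
+    dec_counter_for_resize_op(keycache);
+    keycache_pthread_mutex_unlock(&keycache->cache_lock);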
*/ #include "mysys_priv.h" @@ -111,12 +169,16 @@ struct st_hash_link }; /* simple states of a block */ -#define BLOCK_ERROR 1 /* an error occured when performing disk i/o */ -#define BLOCK_READ 2 /* the is page in the block buffer */ -#define BLOCK_IN_SWITCH 4 /* block is preparing to read new page */ -#define BLOCK_REASSIGNED 8 /* block does not accept requests for old page */ -#define BLOCK_IN_FLUSH 16 /* block is in flush operation */ -#define BLOCK_CHANGED 32 /* block buffer contains a dirty page */ +#define BLOCK_ERROR 1 /* an error occured when performing file i/o */ +#define BLOCK_READ 2 /* file block is in the block buffer */ +#define BLOCK_IN_SWITCH 4 /* block is preparing to read new page */ +#define BLOCK_REASSIGNED 8 /* blk does not accept requests for old page */ +#define BLOCK_IN_FLUSH 16 /* block is selected for flush */ +#define BLOCK_CHANGED 32 /* block buffer contains a dirty page */ +#define BLOCK_IN_USE 64 /* block is not free */ +#define BLOCK_IN_EVICTION 128 /* block is selected for eviction */ +#define BLOCK_IN_FLUSHWRITE 256 /* block is in write to file */ +#define BLOCK_FOR_UPDATE 512 /* block is selected for buffer modification */ /* page status, returned by find_key_block */ #define PAGE_READ 0 @@ -153,14 +215,18 @@ KEY_CACHE *dflt_key_cache= &dflt_key_cache_var; static int flush_all_key_blocks(KEY_CACHE *keycache); #ifdef THREAD -static void link_into_queue(KEYCACHE_WQUEUE *wqueue, - struct st_my_thread_var *thread); -static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue, - struct st_my_thread_var *thread); +static void wait_on_queue(KEYCACHE_WQUEUE *wqueue, + pthread_mutex_t *mutex); +static void release_whole_queue(KEYCACHE_WQUEUE *wqueue); +#else +#define wait_on_queue(wqueue, mutex) KEYCACHE_DBUG_ASSERT(0); +#define release_whole_queue(wqueue) /* release_whole_queue() */ #endif static void free_block(KEY_CACHE *keycache, BLOCK_LINK *block); +#if !defined(DBUG_OFF) static void test_key_cache(KEY_CACHE *keycache, const char *where, my_bool lock); +#endif #define KEYCACHE_HASH(f, pos) \ (((ulong) ((pos) >> keycache->key_cache_shift)+ \ @@ -253,6 +319,13 @@ static int keycache_pthread_cond_signal(pthread_cond_t *cond); #define keycache_pthread_cond_signal pthread_cond_signal #endif /* defined(KEYCACHE_DEBUG) */ +#if !defined(DBUG_OFF) +#define inline /* disabled inline for easier debugging */ +static int fail_block(BLOCK_LINK *block); +static int fail_hlink(HASH_LINK *hlink); +static int cache_empty(KEY_CACHE *keycache); +#endif + static inline uint next_power(uint value) { return (uint) my_round_up_to_next_power((uint32) value) << 1; @@ -305,10 +378,19 @@ int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size, keycache->disk_blocks= -1; if (! keycache->key_cache_inited) { - keycache->key_cache_inited= 1; + /* + Initialize these variables once only. + Their value must survive re-initialization during resizing. + */ + keycache->in_resize= 0; + keycache->resize_in_flush= 0; + keycache->cnt_for_resize_op= 0; + keycache->waiting_for_resize_cnt.last_thread= NULL; keycache->in_init= 0; pthread_mutex_init(&keycache->cache_lock, MY_MUTEX_INIT_FAST); keycache->resize_queue.last_thread= NULL; + /* Initialize this after the mutex. It is read asynchronously. 
*/ + keycache->key_cache_inited= 1; } keycache->key_cache_mem_size= use_mem; @@ -320,7 +402,8 @@ int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size, blocks= (uint) (use_mem / (sizeof(BLOCK_LINK) + 2 * sizeof(HASH_LINK) + sizeof(HASH_LINK*) * 5/4 + key_cache_block_size)); /* It doesn't make sense to have too few blocks (less than 8) */ - if (blocks >= 8 && keycache->disk_blocks < 0) + /* Comment to be deleted: disk_blocks is set to -1 above unconditionally. */ + if (blocks >= 8) { for ( ; ; ) { @@ -394,8 +477,6 @@ int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size, blocks * age_threshold / 100 : blocks); - keycache->cnt_for_resize_op= 0; - keycache->resize_in_flush= 0; keycache->can_be_used= 1; keycache->waiting_for_hash_link.last_thread= NULL; @@ -411,6 +492,11 @@ int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size, bzero((gptr) keycache->file_blocks, sizeof(keycache->file_blocks[0]) * CHANGED_BLOCKS_HASH); } + else + { + /* key_buffer_size is specified too small. Disable the cache. */ + keycache->can_be_used= 0; + } keycache->blocks= keycache->disk_blocks > 0 ? keycache->disk_blocks : 0; DBUG_RETURN((int) keycache->disk_blocks); @@ -469,10 +555,6 @@ int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size, uint age_threshold) { int blocks; -#ifdef THREAD - struct st_my_thread_var *thread; - KEYCACHE_WQUEUE *wqueue; -#endif DBUG_ENTER("resize_key_cache"); if (!keycache->key_cache_inited) @@ -488,54 +570,89 @@ int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size, keycache_pthread_mutex_lock(&keycache->cache_lock); #ifdef THREAD - wqueue= &keycache->resize_queue; - thread= my_thread_var; - link_into_queue(wqueue, thread); - - while (wqueue->last_thread->next != thread) + /* + We may need to wait for another thread which is doing a resize + already. This cannot happen in the MySQL server though. It allows + one resizer only. In set_var.cc keycache->in_init is used to block + multiple attempts. + */ + while (keycache->in_resize) { - keycache_pthread_cond_wait(&thread->suspend, &keycache->cache_lock); + /* purecov: begin inspected */ + wait_on_queue(&keycache->resize_queue, &keycache->cache_lock); + /* purecov: end */ } #endif - keycache->resize_in_flush= 1; - if (flush_all_key_blocks(keycache)) + /* + Mark the operation in progress. This blocks other threads from doing + a resize in parallel. It prohibits new blocks to enter the cache. + Read/write requests can bypass the cache during the flush phase. + */ + keycache->in_resize= 1; + + /* Need to flush only if keycache is enabled. */ + if (keycache->can_be_used) { - /* TODO: if this happens, we should write a warning in the log file ! */ + /* Start the flush phase. */ + keycache->resize_in_flush= 1; + + if (flush_all_key_blocks(keycache)) + { + /* TODO: if this happens, we should write a warning in the log file ! */ + keycache->resize_in_flush= 0; + blocks= 0; + keycache->can_be_used= 0; + goto finish; + } + + /* End the flush phase. */ keycache->resize_in_flush= 0; - blocks= 0; - keycache->can_be_used= 0; - goto finish; } - keycache->resize_in_flush= 0; - keycache->can_be_used= 0; + #ifdef THREAD + /* + Some direct read/write operations (bypassing the cache) may still be + unfinished. Wait until they are done. If the key cache can be used, + direct I/O is done in increments of key_cache_block_size. That is, + every block is checked if it is in the cache. We need to wait for + pending I/O before re-initializing the cache, because we may change + the block size. 
Otherwise they could check for blocks at file + positions where the new block division has none. We do also want to + wait for I/O done when (if) the cache was disabled. It must not + run in parallel with normal cache operation. + */ while (keycache->cnt_for_resize_op) { - KEYCACHE_DBUG_PRINT("resize_key_cache: wait", - ("suspend thread %ld", thread->id)); - keycache_pthread_cond_wait(&thread->suspend, &keycache->cache_lock); + wait_on_queue(&keycache->waiting_for_resize_cnt, &keycache->cache_lock); } #else KEYCACHE_DBUG_ASSERT(keycache->cnt_for_resize_op == 0); #endif + /* + Free old cache structures, allocate new structures, and initialize + them. Note that the cache_lock mutex and the resize_queue are left + untouched. We do not lose the cache_lock and will release it only at + the end of this function. + */ end_key_cache(keycache, 0); /* Don't free mutex */ /* The following will work even if use_mem is 0 */ blocks= init_key_cache(keycache, key_cache_block_size, use_mem, division_limit, age_threshold); finish: + /* + Mark the resize finished. This allows other threads to start a + resize or to request new cache blocks. + */ + keycache->in_resize= 0; + #ifdef THREAD - unlink_from_queue(wqueue, thread); - /* Signal for the next resize request to proceeed if any */ - if (wqueue->last_thread) - { - KEYCACHE_DBUG_PRINT("resize_key_cache: signal", - ("thread %ld", wqueue->last_thread->next->id)); - keycache_pthread_cond_signal(&wqueue->last_thread->next->suspend); - } + /* Signal waiting threads. */ + release_whole_queue(&keycache->resize_queue); #endif + keycache_pthread_mutex_unlock(&keycache->cache_lock); DBUG_RETURN(blocks); } @@ -557,14 +674,8 @@ static inline void inc_counter_for_resize_op(KEY_CACHE *keycache) static inline void dec_counter_for_resize_op(KEY_CACHE *keycache) { #ifdef THREAD - struct st_my_thread_var *last_thread; - if (!--keycache->cnt_for_resize_op && - (last_thread= keycache->resize_queue.last_thread)) - { - KEYCACHE_DBUG_PRINT("dec_counter_for_resize_op: signal", - ("thread %ld", last_thread->next->id)); - keycache_pthread_cond_signal(&last_thread->next->suspend); - } + if (!--keycache->cnt_for_resize_op) + release_whole_queue(&keycache->waiting_for_resize_cnt); #else keycache->cnt_for_resize_op--; #endif @@ -658,6 +769,7 @@ void end_key_cache(KEY_CACHE *keycache, my_bool cleanup) #ifdef THREAD + /* Link a thread into double-linked queue of waiting threads. @@ -673,12 +785,17 @@ void end_key_cache(KEY_CACHE *keycache, my_bool cleanup) Queue is represented by a circular list of the thread structures The list is double-linked of the type (**prev,*next), accessed by a pointer to the last element. + + Since there is only one pair of queue pointers per thread, a + thread can be part of one wait queue only. */ static void link_into_queue(KEYCACHE_WQUEUE *wqueue, struct st_my_thread_var *thread) { struct st_my_thread_var *last; + + DBUG_ASSERT(!thread->next && !thread->prev); if (! (last= wqueue->last_thread)) { /* Queue is empty */ @@ -714,6 +831,7 @@ static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue, struct st_my_thread_var *thread) { KEYCACHE_DBUG_PRINT("unlink_from_queue", ("thread %ld", thread->id)); + DBUG_ASSERT(thread->next && thread->prev); if (thread->next == thread) /* The queue contains only one member */ wqueue->last_thread= NULL; @@ -726,6 +844,13 @@ static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue, thread->prev); } thread->next= NULL; +#if !defined(DBUG_OFF) + /* + This makes it easier to see it's not in a chain during debugging. 
+ And some DBUG_ASSERT() rely on it. + */ + thread->prev= NULL; +#endif } @@ -733,9 +858,9 @@ static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue, Add a thread to single-linked queue of waiting threads SYNOPSIS - add_to_queue() - wqueue pointer to the queue structure - thread pointer to the thread to be added to the queue + wait_on_queue() + wqueue Pointer to the queue structure. + mutex Cache_lock to acquire after awake. RETURN VALUE none @@ -744,12 +869,26 @@ static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue, Queue is represented by a circular list of the thread structures The list is single-linked of the type (*next), accessed by a pointer to the last element. + + Since there is only one pair of queue pointers per thread, a + thread can be part of one wait queue only. + + The function protects against stray signals by verifying that the + current thread is unlinked from the queue when awaking. However, + since several threads can wait for the same event, it might be + necessary for the caller of the function to check again if the + condition for awake is indeed matched. */ -static inline void add_to_queue(KEYCACHE_WQUEUE *wqueue, - struct st_my_thread_var *thread) +static void wait_on_queue(KEYCACHE_WQUEUE *wqueue, + pthread_mutex_t *mutex) { struct st_my_thread_var *last; + struct st_my_thread_var *thread= my_thread_var; + + /* Add to queue. */ + DBUG_ASSERT(!thread->next); + DBUG_ASSERT(!thread->prev); /* Not required, but must be true anyway. */ if (! (last= wqueue->last_thread)) thread->next= thread; else @@ -758,6 +897,17 @@ static inline void add_to_queue(KEYCACHE_WQUEUE *wqueue, last->next= thread; } wqueue->last_thread= thread; + + /* + Wait until thread is removed from queue by the signalling thread. + The loop protects against stray signals. + */ + do + { + KEYCACHE_DBUG_PRINT("wait", ("suspend thread %ld", thread->id)); + keycache_pthread_cond_wait(&thread->suspend, mutex); + } + while(thread->next); } @@ -765,36 +915,47 @@ static inline void add_to_queue(KEYCACHE_WQUEUE *wqueue, Remove all threads from queue signaling them to proceed SYNOPSIS - realease_queue() - wqueue pointer to the queue structure - thread pointer to the thread to be added to the queue + release_whole_queue() + wqueue pointer to the queue structure RETURN VALUE none NOTES. - See notes for add_to_queue + See notes for wait_on_queue(). When removed from the queue each thread is signaled via condition variable thread->suspend. */ -static void release_queue(KEYCACHE_WQUEUE *wqueue) +static void release_whole_queue(KEYCACHE_WQUEUE *wqueue) { - struct st_my_thread_var *last= wqueue->last_thread; - struct st_my_thread_var *next= last->next; + struct st_my_thread_var *last; + struct st_my_thread_var *next; struct st_my_thread_var *thread; + + /* Queue may be empty. */ + if (!(last= wqueue->last_thread)) + return; + + next= last->next; do { thread=next; - KEYCACHE_DBUG_PRINT("release_queue: signal", ("thread %ld", thread->id)); + KEYCACHE_DBUG_PRINT("release_whole_queue: signal", + ("thread %ld", thread->id)); + /* Signal the thread. */ keycache_pthread_cond_signal(&thread->suspend); + /* Take thread from queue. */ next=thread->next; thread->next= NULL; } while (thread != last); + + /* Now queue is definitely empty. 
*/
   wqueue->last_thread= NULL;
 }
-#endif
+
+#endif /* THREAD */
 
 
 /*
@@ -803,9 +964,19 @@ static void release_queue(KEYCACHE_WQUEUE *wqueue)
 
 static inline void unlink_changed(BLOCK_LINK *block)
 {
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
   if (block->next_changed)
     block->next_changed->prev_changed= block->prev_changed;
   *block->prev_changed= block->next_changed;
+
+#if !defined(DBUG_OFF)
+  /*
+    This makes it easier to see it's not in a chain during debugging.
+    And some DBUG_ASSERT() rely on it.
+  */
+  block->next_changed= NULL;
+  block->prev_changed= NULL;
+#endif
 }
 
 
@@ -815,6 +986,8 @@ static inline void unlink_changed(BLOCK_LINK *block)
 
 static inline void link_changed(BLOCK_LINK *block, BLOCK_LINK **phead)
 {
+  DBUG_ASSERT(!block->next_changed);
+  DBUG_ASSERT(!block->prev_changed);
   block->prev_changed= phead;
   if ((block->next_changed= *phead))
     (*phead)->prev_changed= &block->next_changed;
@@ -823,13 +996,36 @@
 
 /*
-  Unlink a block from the chain of dirty/clean blocks, if it's asked for,
-  and link it to the chain of clean blocks for the specified file
+  Link a block into the chain of clean blocks of a file.
+
+  SYNOPSIS
+    link_to_file_list()
+      keycache          Key cache handle
+      block             Block to relink
+      file              File to be linked to
+      unlink            Whether to unlink the block first
+
+  DESCRIPTION
+    Unlink a block from whichever chain it is linked in, if it's
+    asked for, and link it to the chain of clean blocks of the
+    specified file.
+
+  NOTE
+    Never set or clear BLOCK_CHANGED outside of
+    link_to_file_list() or link_to_changed_list().
+    You would risk damaging the correct counting of changed blocks
+    and finding blocks in the wrong hash.
+
+  RETURN
+    void
 */
 
 static void link_to_file_list(KEY_CACHE *keycache,
                               BLOCK_LINK *block, int file,
                               my_bool unlink)
 {
+  DBUG_ASSERT(block->status & BLOCK_IN_USE);
+  DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+  DBUG_ASSERT(block->hash_link->file == file);
   if (unlink)
     unlink_changed(block);
   link_changed(block, &keycache->file_blocks[FILE_HASH(file)]);
@@ -843,13 +1039,34 @@
 
 /*
-  Unlink a block from the chain of clean blocks for the specified
-  file and link it to the chain of dirty blocks for this file
+  Re-link a block from the clean chain to the dirty chain of a file.
+
+  SYNOPSIS
+    link_to_changed_list()
+      keycache          Key cache handle
+      block             Block to relink
+
+  DESCRIPTION
+    Unlink a block from the chain of clean blocks of a file
+    and link it to the chain of dirty blocks of the same file.
+
+  NOTE
+    Never set or clear BLOCK_CHANGED outside of
+    link_to_file_list() or link_to_changed_list().
+    You would risk damaging the correct counting of changed blocks
+    and finding blocks in the wrong hash.
+
+  RETURN
+    void
 */
 
-static inline void link_to_changed_list(KEY_CACHE *keycache,
-                                        BLOCK_LINK *block)
+static void link_to_changed_list(KEY_CACHE *keycache,
+                                 BLOCK_LINK *block)
 {
+  DBUG_ASSERT(block->status & BLOCK_IN_USE);
+  DBUG_ASSERT(!(block->status & BLOCK_CHANGED));
+  DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+
   unlink_changed(block);
   link_changed(block,
                &keycache->changed_blocks[FILE_HASH(block->hash_link->file)]);
@@ -874,13 +1091,13 @@ static inline void link_to_changed_list(KEY_CACHE *keycache,
     none
 
   NOTES.
-    The LRU chain is represented by a curcular list of block structures.
+    The LRU ring is represented by a circular list of block structures.
The list is double-linked of the type (**prev,*next) type.
-    The LRU chain is divided into two parts - hot and warm.
+    The LRU ring is divided into two parts - hot and warm.
     There are two pointers to access the last blocks of these two
     parts. The beginning of the warm part follows right after the
     end of the hot part.
-    Only blocks of the warm part can be used for replacement.
+    Only blocks of the warm part can be used for eviction.
     The first block from the beginning of this subchain is always
     taken for eviction (keycache->last_used->next)
 
     LRU chain:       +------+   H O T    +------+
                      +----| beg |---->...----| end |----+
                      |    +------+            +------+   |
                      v<-link in latest hot (new end)     |
                      |     link in latest warm (new end)->^
                      |     +------+  W A R M  +------+   |
                      +----| beg |---->...----| end |----+
                           +------+            +------+ins
                        first for eviction
+
+    It is also possible that the block is selected for eviction and thus
+    not linked in the LRU ring.
 */
 
 static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot,
@@ -901,7 +1121,12 @@ static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot,
   BLOCK_LINK *ins;
   BLOCK_LINK **pins;
 
-  KEYCACHE_DBUG_ASSERT(! (block->hash_link && block->hash_link->requests));
+  DBUG_ASSERT((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
+  DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
+  DBUG_ASSERT(!block->requests);
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  DBUG_ASSERT(!block->next_used);
+  DBUG_ASSERT(!block->prev_used);
 #ifdef THREAD
   if (!hot && keycache->waiting_for_block.last_thread)
   {
@@ -930,6 +1155,29 @@ static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot,
     }
     while (thread != last_thread);
     hash_link->block= block;
+    /*
+      NOTE: We assigned the block to the hash_link and signalled the
+      requesting thread(s). But it is possible that other threads run
+      first. These threads see the hash_link assigned to a block which
+      is assigned to another hash_link and not marked BLOCK_IN_SWITCH.
+      This can be a problem for functions that do not select the block
+      via its hash_link: flush and free. They only see a block which
+      is in a "normal" state and don't know that it will be evicted soon.
+
+      We cannot set BLOCK_IN_SWITCH here because only one of the
+      requesting threads must handle the eviction. All others must wait
+      for it to complete. If we set the flag here, the threads would not
+      know who is in charge of the eviction. Without the flag, the first
+      thread takes the stick and sets the flag.
+
+      But we need to note in the block that it has been selected for
+      eviction. It must not be freed. The evicting thread will not
+      expect the block in the free list. Before freeing we could also
+      check if block->requests > 1. But I think including another flag
+      in the check of block->status is slightly more efficient and
+      probably easier to read.
+    */
+    block->status|= BLOCK_IN_EVICTION;
     KEYCACHE_THREAD_TRACE("link_block: after signaling");
 #if defined(KEYCACHE_DEBUG)
     KEYCACHE_DBUG_PRINT("link_block",
@@ -956,7 +1204,7 @@ static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot,
   }
   else
   {
-    /* The LRU chain is empty */
+    /* The LRU ring is empty. Let the block point to itself. 
*/ keycache->used_last= keycache->used_ins= block->next_used= block; block->prev_used= &block->next_used; } @@ -990,6 +1238,13 @@ static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot, static void unlink_block(KEY_CACHE *keycache, BLOCK_LINK *block) { + DBUG_ASSERT((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE)); + DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/ + DBUG_ASSERT(!block->requests); + DBUG_ASSERT(block->prev_changed && *block->prev_changed == block); + DBUG_ASSERT(block->next_used && block->prev_used && + (block->next_used->prev_used == &block->next_used) && + (*block->prev_used == block)); if (block->next_used == block) /* The list contains only one member */ keycache->used_last= keycache->used_ins= NULL; @@ -1003,6 +1258,13 @@ static void unlink_block(KEY_CACHE *keycache, BLOCK_LINK *block) keycache->used_ins=STRUCT_PTR(BLOCK_LINK, next_used, block->prev_used); } block->next_used= NULL; +#if !defined(DBUG_OFF) + /* + This makes it easier to see it's not in a chain during debugging. + And some DBUG_ASSERT() rely on it. + */ + block->prev_used= NULL; +#endif KEYCACHE_THREAD_TRACE("unlink_block"); #if defined(KEYCACHE_DEBUG) @@ -1017,12 +1279,27 @@ static void unlink_block(KEY_CACHE *keycache, BLOCK_LINK *block) /* - Register requests for a block + Register requests for a block. + + SYNOPSIS + reg_requests() + keycache Pointer to a key cache data structure. + block Pointer to the block to register a request on. + count Number of requests. Always 1. + + NOTE + The first request unlinks the block from the LRU ring. This means + that it is protected against eveiction. + + RETURN + void */ static void reg_requests(KEY_CACHE *keycache, BLOCK_LINK *block, int count) { - if (! block->requests) - /* First request for the block unlinks it */ + DBUG_ASSERT(block->status & BLOCK_IN_USE); + DBUG_ASSERT(block->hash_link); + + if (!block->requests) unlink_block(keycache, block); block->requests+=count; } @@ -1042,7 +1319,7 @@ static void reg_requests(KEY_CACHE *keycache, BLOCK_LINK *block, int count) none NOTES. - Every linking to the LRU chain decrements by one a special block + Every linking to the LRU ring decrements by one a special block counter (if it's positive). If the at_end parameter is TRUE the block is added either at the end of warm sub-chain or at the end of hot sub-chain. It is added to the hot subchain if its counter is zero and number of @@ -1055,11 +1332,20 @@ static void reg_requests(KEY_CACHE *keycache, BLOCK_LINK *block, int count) At the same time the block at the very beginning of the hot subchain might be moved to the beginning of the warm subchain if it stays untouched for a too long time (this time is determined by parameter age_threshold). + + It is also possible that the block is selected for eviction and thus + not linked in the LRU ring. */ static void unreg_request(KEY_CACHE *keycache, BLOCK_LINK *block, int at_end) { + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/ + DBUG_ASSERT(block->requests); + DBUG_ASSERT(block->prev_changed && *block->prev_changed == block); + DBUG_ASSERT(!block->next_used); + DBUG_ASSERT(!block->prev_used); if (! 
--block->requests) { my_bool hot; @@ -1078,9 +1364,22 @@ static void unreg_request(KEY_CACHE *keycache, link_block(keycache, block, hot, (my_bool)at_end); block->last_hit_time= keycache->keycache_time; keycache->keycache_time++; + /* + At this place, the block might be in the LRU ring or not. If an + evicter was waiting for a block, it was selected for eviction and + not linked in the LRU ring. + */ + /* + Check if we should link a hot block to the warm block sub-chain. + It is possible that we select the same block as above. But it can + also be another block. In any case a block from the LRU ring is + selected. In other words it works even if the above block was + selected for eviction and not linked in the LRU ring. Since this + happens only if the LRU ring is empty, the block selected below + would be NULL and the rest of the function skipped. + */ block= keycache->used_ins; - /* Check if we should link a hot block to the warm block */ if (block && keycache->keycache_time - block->last_hit_time > keycache->age_threshold) { @@ -1101,8 +1400,14 @@ static void unreg_request(KEY_CACHE *keycache, Remove a reader of the page in block */ -static inline void remove_reader(BLOCK_LINK *block) +static void remove_reader(BLOCK_LINK *block) { + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + DBUG_ASSERT(block->hash_link && block->hash_link->block == block); + DBUG_ASSERT(block->prev_changed && *block->prev_changed == block); + DBUG_ASSERT(!block->next_used); + DBUG_ASSERT(!block->prev_used); + DBUG_ASSERT(block->hash_link->requests); #ifdef THREAD if (! --block->hash_link->requests && block->condvar) keycache_pthread_cond_signal(block->condvar); @@ -1117,19 +1422,34 @@ static inline void remove_reader(BLOCK_LINK *block) signals on its termination */ -static inline void wait_for_readers(KEY_CACHE *keycache __attribute__((unused)), - BLOCK_LINK *block) +static void wait_for_readers(KEY_CACHE *keycache, + BLOCK_LINK *block) { #ifdef THREAD struct st_my_thread_var *thread= my_thread_var; + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + DBUG_ASSERT(!(block->status & (BLOCK_ERROR | BLOCK_IN_FLUSH | + BLOCK_CHANGED))); + DBUG_ASSERT(block->hash_link); + DBUG_ASSERT(block->hash_link->block == block); + /* Linked in file_blocks or changed_blocks hash. */ + DBUG_ASSERT(block->prev_changed && *block->prev_changed == block); + /* Not linked in LRU ring. */ + DBUG_ASSERT(!block->next_used); + DBUG_ASSERT(!block->prev_used); while (block->hash_link->requests) { KEYCACHE_DBUG_PRINT("wait_for_readers: wait", ("suspend thread %ld block %u", thread->id, BLOCK_NUMBER(block))); + /* There must be no other waiter. We have no queue here. */ + DBUG_ASSERT(!block->condvar); block->condvar= &thread->suspend; keycache_pthread_cond_wait(&thread->suspend, &keycache->cache_lock); block->condvar= NULL; + /* The other thread might have freed the block in between. */ + if (!block->hash_link) + break; } #else KEYCACHE_DBUG_ASSERT(block->hash_link->requests == 0); @@ -1355,77 +1675,312 @@ static BLOCK_LINK *find_key_block(KEY_CACHE *keycache, #endif restart: - /* Find the hash link for the requested page (file, filepos) */ + /* + If the flush phase of a resize operation fails, the cache is left + unusable. This will be detected only after "goto restart". + */ + if (!keycache->can_be_used) + DBUG_RETURN(0); + + /* + Find the hash_link for the requested file block (file, filepos). We + do always get a hash_link here. 
It has registered our request so + that no other thread can use it for another file block until we + release the request (which is done by remove_reader() usually). The + hash_link can have a block assigned to it or not. If there is a + block, it may be assigned to this hash_link or not. In cases where a + block is evicted from the cache, it is taken from the LRU ring and + referenced by the new hash_link. But the block can still be assigned + to its old hash_link for some time if it needs to be flushed first, + or if there are other threads still reading it. + + Summary: + hash_link is always returned. + hash_link->block can be: + - NULL or + - not assigned to this hash_link or + - assigned to this hash_link. If assigned, the block can have + - invalid data (when freshly assigned) or + - valid data. Valid data can be + - changed over the file contents (dirty) or + - not changed (clean). + */ hash_link= get_hash_link(keycache, file, filepos); + DBUG_ASSERT((hash_link->file == file) && (hash_link->diskpos == filepos)); page_status= -1; if ((block= hash_link->block) && block->hash_link == hash_link && (block->status & BLOCK_READ)) - page_status= PAGE_READ; - - if (wrmode && keycache->resize_in_flush) { - /* This is a write request during the flush phase of a resize operation */ + /* Assigned block with valid (changed or unchanged) contents. */ + page_status= PAGE_READ; + } + /* + else (page_status == -1) + - block == NULL or + - block not assigned to this hash_link or + - block assigned but not yet read from file (invalid data). + */ - if (page_status != PAGE_READ) + if (keycache->in_resize) + { + /* This is a request during a resize operation */ + + if (!block) { - /* We don't need the page in the cache: we are going to write on disk */ - hash_link->requests--; - unlink_hash(keycache, hash_link); - return 0; - } - if (!(block->status & BLOCK_IN_FLUSH)) - { - hash_link->requests--; + struct st_my_thread_var *thread; + /* - Remove block to invalidate the page in the block buffer - as we are going to write directly on disk. - Although we have an exlusive lock for the updated key part - the control can be yieded by the current thread as we might - have unfinished readers of other key parts in the block - buffer. Still we are guaranteed not to have any readers - of the key part we are writing into until the block is - removed from the cache as we set the BLOCL_REASSIGNED - flag (see the code below that handles reading requests). + The file block is not in the cache. We don't need it in the + cache: we are going to read or write directly to file. Cancel + the request. We can simply decrement hash_link->requests because + we did not release cache_lock since increasing it. So no other + thread can wait for our request to become released. */ - free_block(keycache, block); - return 0; - } - /* Wait intil the page is flushed on disk */ - hash_link->requests--; - { -#ifdef THREAD - struct st_my_thread_var *thread= my_thread_var; - add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + if (!--hash_link->requests) + { + /* + We are the only one to request this hash_link (this file/pos). + Free the hash_link. + */ + unlink_hash(keycache, hash_link); + DBUG_RETURN(0); + } + + /* + More requests on the hash_link. Someone tries to evict a block + for this hash_link (could have started before resizing started). + This means that the LRU ring is empty. Otherwise a block could + be assigned immediately. Behave like a thread that wants to + evict a block for this file/pos. 
Add to the queue of threads
+        waiting for a block. Wait until there is one assigned.
+
+        Refresh the request on the hash-link so that it cannot be reused
+        for another file/pos.
+      */
+      hash_link->requests++;
+      thread= my_thread_var;
+      thread->opt_info= (void *) hash_link;
+      link_into_queue(&keycache->waiting_for_block, thread);
       do
       {
         KEYCACHE_DBUG_PRINT("find_key_block: wait",
                             ("suspend thread %ld", thread->id));
         keycache_pthread_cond_wait(&thread->suspend,
                                    &keycache->cache_lock);
-      }
-      while(thread->next);
-#else
-      KEYCACHE_DBUG_ASSERT(0);
+      } while (thread->next);
+      thread->opt_info= NULL;
       /*
-        Given the use of "resize_in_flush", it seems impossible
-        that this whole branch is ever entered in single-threaded case
-        because "(wrmode && keycache->resize_in_flush)" cannot be true.
-        TODO: Check this, and then put the whole branch into the
-        "#ifdef THREAD" guard.
+        A block should now be assigned to the hash_link. But it may
+        still need to be evicted. Anyway, we should re-check the
+        situation. page_status must be set correctly.
       */
-#endif
+      hash_link->requests--;
+      goto restart;
+    } /* end of if (!block) */
+
+    /*
+      There is a block for this file/pos in the cache. Register a
+      request on it. This unlinks it from the LRU ring (if it is there)
+      and hence protects it against eviction (if not already in
+      eviction). We need this for returning the block to the caller, for
+      calling remove_reader() (for debugging purposes), and for calling
+      free_block(). The only case where we don't need the request is if
+      the block is in eviction. In that case we have to unregister the
+      request later.
+    */
+    reg_requests(keycache, block, 1);
+
+    if (page_status != PAGE_READ)
+    {
+      /*
+        - block not assigned to this hash_link or
+        - block assigned but not yet read from file (invalid data).
+
+        This must be a block in eviction. It will be read soon. We need
+        to wait here until this has happened. Otherwise the caller could
+        access a wrong block or a block which is in read. While waiting
+        we cannot lose hash_link nor block. We have registered a request
+        on the hash_link. Everything can happen to the block but changes
+        in the hash_link -> block relationship. In other words:
+        everything can happen to the block but free or another completed
+        eviction.
+
+        Note that we behave like a secondary requestor here. We just
+        cannot return with PAGE_WAIT_TO_BE_READ. This would work for
+        read requests and writes on dirty blocks that are not in flush
+        only. Waiting here on COND_FOR_REQUESTED works in all
+        situations.
+      */
+      DBUG_ASSERT(((block->hash_link != hash_link) &&
+                   (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) ||
+                  ((block->hash_link == hash_link) &&
+                   !(block->status & BLOCK_READ)));
+      wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock);
+      /*
+        Here we can trust that the block has been assigned to this
+        hash_link (block->hash_link == hash_link) and read into the
+        buffer (BLOCK_READ). The worst thing possible here is that the
+        block is in free (BLOCK_REASSIGNED). But the block is still
+        assigned to the hash_link. The freeing thread waits until we
+        release our request on the hash_link. The block must not be
+        again in eviction because we registered a request on it before
+        starting to wait. 
+ */ + DBUG_ASSERT(block->hash_link == hash_link); + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + DBUG_ASSERT(!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))); } - /* Invalidate page in the block if it has not been done yet */ - if (block->status) + /* + The block is in the cache. Assigned to the hash_link. Valid data. + Note that in case of page_st == PAGE_READ, the block can be marked + for eviction. In any case it can be marked for freeing. + */ + + if (!wrmode) + { + /* A reader can just read the block. */ + *page_st= PAGE_READ; + DBUG_ASSERT((hash_link->file == file) && + (hash_link->diskpos == filepos) && + (block->hash_link == hash_link)); + DBUG_RETURN(block); + } + + /* + This is a writer. No two writers for the same block can exist. + This must be assured by locks outside of the key cache. + */ + DBUG_ASSERT(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block)); + + while (block->status & BLOCK_IN_FLUSH) + { + /* + Wait until the block is flushed to file. Do not release the + request on the hash_link yet to prevent that the block is freed + or reassigned while we wait. While we wait, several things can + happen to the block, including another flush. But the block + cannot be reassigned to another hash_link until we release our + request on it. But it can be marked BLOCK_REASSIGNED from free + or eviction, while they wait for us to release the hash_link. + */ + wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock); + /* + If the flush phase failed, the resize could have finished while + we waited here. + */ + if (!keycache->in_resize) + { + remove_reader(block); + unreg_request(keycache, block, 1); + goto restart; + } + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + DBUG_ASSERT(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block)); + DBUG_ASSERT(block->hash_link == hash_link); + } + + if (block->status & BLOCK_CHANGED) + { + /* + We want to write a block with changed contents. If the cache + block size is bigger than the callers block size (e.g. MyISAM), + the caller may replace part of the block only. Changes of the + other part of the block must be preserved. Since the block has + not yet been selected for flush, we can still add our changes. + */ + *page_st= PAGE_READ; + DBUG_ASSERT((hash_link->file == file) && + (hash_link->diskpos == filepos) && + (block->hash_link == hash_link)); + DBUG_RETURN(block); + } + + /* + This is a write request for a clean block. We do not want to have + new dirty blocks in the cache while resizing. We will free the + block and write directly to file. If the block is in eviction or + in free, we just let it go. + + Unregister from the hash_link. This must be done before freeing + the block. And it must be done if not freeing the block. Because + we could have waited above, we need to call remove_reader(). Other + threads could wait for us to release our request on the hash_link. + */ + remove_reader(block); + + /* If the block is not in eviction and not in free, we can free it. */ + if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH | + BLOCK_REASSIGNED))) + { + /* + Free block as we are going to write directly to file. + Although we have an exlusive lock for the updated key part, + the control can be yielded by the current thread as we might + have unfinished readers of other key parts in the block + buffer. 
Still we are guaranteed not to have any readers
+        of the key part we are writing into until the block is
+        removed from the cache as we set the BLOCK_REASSIGNED
+        flag (see the code below that handles reading requests).
+      */
       free_block(keycache, block);
-      return 0;
+    }
+    else
+    {
+      /*
+        The block will be evicted/freed soon. Don't touch it in any way.
+        Unregister the request that we registered above.
+      */
+      unreg_request(keycache, block, 1);
+
+      /*
+        The block is still assigned to the hash_link (the file/pos that
+        we are going to write to). Wait until the eviction/free is
+        complete. Otherwise the direct write could complete before all
+        readers are done with the block. So they could read outdated
+        data.
+
+        Comment to be deleted: This was the reason why I experienced
+        index corruptions during resize. Since I introduced the wait
+        loop here, they are gone.
+
+        Since we released our request on the hash_link, it can be reused
+        for another file/pos. Hence we cannot just check for
+        block->hash_link == hash_link. As long as the resize is
+        proceeding the block cannot be reassigned to the same file/pos
+        again. So we can terminate the loop when the block is no longer
+        assigned to this file/pos.
+      */
+      do
+      {
+        wait_on_queue(&block->wqueue[COND_FOR_SAVED],
+                      &keycache->cache_lock);
+        /*
+          If the flush phase failed, the resize could have finished
+          while we waited here.
+        */
+        if (!keycache->in_resize)
+          goto restart;
+      } while (block->hash_link &&
+               (block->hash_link->file == file) &&
+               (block->hash_link->diskpos == filepos));
+    }
+    DBUG_RETURN(0);
   }
 
   if (page_status == PAGE_READ &&
-      (block->status & (BLOCK_IN_SWITCH | BLOCK_REASSIGNED)))
+      (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                        BLOCK_REASSIGNED)))
   {
-    /* This is a request for a page to be removed from cache */
+    /*
+      This is a request for a block to be removed from cache. The block
+      is assigned to this hash_link and contains valid data, but is
+      marked for eviction or to be freed. Possible reasons why it has
+      not yet been evicted/freed can be a flush before reassignment
+      (BLOCK_IN_SWITCH), readers of the block have not finished yet
+      (BLOCK_REASSIGNED), or the evicting thread has not yet awoken after
+      the block was selected for it (BLOCK_IN_EVICTION).
+    */
 
     KEYCACHE_DBUG_PRINT("find_key_block",
                         ("request for old page in block %u "
                          "wrmode: %d  block->status: %d",
                          BLOCK_NUMBER(block), wrmode, block->status));
     /*
        Only reading requests can proceed until the old dirty page is flushed,
       all others are to be suspended, then resubmitted
    */
    if (!wrmode && !(block->status & BLOCK_REASSIGNED))
+    {
+      /*
+        This is a read request and the block is not yet reassigned. We
+        can register our request and proceed. This unlinks the block
+        from the LRU ring and protects it against eviction.
+      */
       reg_requests(keycache, block, 1);
+    }
     else
     {
+      /*
+        Either this is a write request for a block that is in eviction
+        or in free. We must not use it any more. Instead we must evict
+        another block. But we cannot do this before the eviction/free is
+        done. Otherwise we would find the same hash_link + block again
+        and again.
+
+        Or this is a read request for a block in eviction/free that does
+        not require a flush, but waits for readers to finish with the
+        block. We do not read this block to let the eviction/free happen
+        as soon as possible. Again we must wait so that we don't find
+        the same hash_link + block again and again. 
+ */ + DBUG_ASSERT(hash_link->requests); hash_link->requests--; KEYCACHE_DBUG_PRINT("find_key_block", ("request waiting for old page to be saved")); - { -#ifdef THREAD - struct st_my_thread_var *thread= my_thread_var; - /* Put the request into the queue of those waiting for the old page */ - add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); - /* Wait until the request can be resubmitted */ - do - { - KEYCACHE_DBUG_PRINT("find_key_block: wait", - ("suspend thread %ld", thread->id)); - keycache_pthread_cond_wait(&thread->suspend, - &keycache->cache_lock); - } - while(thread->next); -#else - KEYCACHE_DBUG_ASSERT(0); - /* No parallel requests in single-threaded case */ -#endif - } + wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock); KEYCACHE_DBUG_PRINT("find_key_block", ("request for old page resubmitted")); - /* Resubmit the request */ + /* + The block is no longer assigned to this hash_link. + Get another one. + */ goto restart; } } else { - /* This is a request for a new page or for a page not to be removed */ + /* + This is a request for a new block or for a block not to be removed. + Either + - block == NULL or + - block not assigned to this hash_link or + - block assigned but not yet read from file, + or + - block assigned with valid (changed or unchanged) data and + - it will not be reassigned/freed. + */ if (! block) { - /* No block is assigned for the page yet */ + /* No block is assigned to the hash_link yet. */ if (keycache->blocks_unused) { if (keycache->free_block_list) @@ -1481,28 +2051,42 @@ restart: block= keycache->free_block_list; keycache->free_block_list= block->next_used; block->next_used= NULL; + DBUG_ASSERT(!block->prev_used); + DBUG_ASSERT(!block->next_changed); + DBUG_ASSERT(!block->prev_changed); + DBUG_ASSERT(!block->hash_link); + DBUG_ASSERT(!block->status); + DBUG_ASSERT(!block->requests); } else { /* There are some never used blocks, take first of them */ + DBUG_ASSERT(keycache->blocks_used < (ulong) keycache->disk_blocks); block= &keycache->block_root[keycache->blocks_used]; block->buffer= ADD_TO_PTR(keycache->block_mem, ((ulong) keycache->blocks_used* keycache->key_cache_block_size), byte*); keycache->blocks_used++; + DBUG_ASSERT(!block->next_used); + DBUG_ASSERT(!block->prev_used); + DBUG_ASSERT(!block->next_changed); + DBUG_ASSERT(!block->prev_changed); + DBUG_ASSERT(!block->hash_link); + DBUG_ASSERT(!block->status); + DBUG_ASSERT(!block->requests); } keycache->blocks_unused--; - block->status= 0; + block->status= BLOCK_IN_USE; block->length= 0; block->offset= keycache->key_cache_block_size; block->requests= 1; block->temperature= BLOCK_COLD; block->hits_left= init_hits_left; block->last_hit_time= 0; - link_to_file_list(keycache, block, file, 0); block->hash_link= hash_link; hash_link->block= block; + link_to_file_list(keycache, block, file, 0); page_status= PAGE_TO_BE_READ; KEYCACHE_DBUG_PRINT("find_key_block", ("got free or never used block %u", @@ -1510,17 +2094,26 @@ restart: } else { - /* There are no never used blocks, use a block from the LRU chain */ - - /* - Wait until a new block is added to the LRU chain; - several threads might wait here for the same page, - all of them must get the same block + /* + There are no free blocks and no never used blocks, use a block + from the LRU ring. */ #ifdef THREAD if (! keycache->used_last) { + /* + The LRU ring is empty. Wait until a new block is added to + it. Several threads might wait here for the same hash_link, + all of them must get the same block. 
While waiting for a
+          block, after a block is selected for this hash_link, other
+          threads can run first before this one awakes. During this
+          time interval other threads find this hash_link pointing to
+          the block, which is still assigned to another hash_link. In
+          this case the block is not marked BLOCK_IN_SWITCH yet, but
+          it is marked BLOCK_IN_EVICTION.
+        */
+
+        struct st_my_thread_var *thread= my_thread_var;
         thread->opt_info= (void *) hash_link;
         link_into_queue(&keycache->waiting_for_block, thread);
         do
         {
           KEYCACHE_DBUG_PRINT("find_key_block: wait",
                               ("suspend thread %ld", thread->id));
           keycache_pthread_cond_wait(&thread->suspend,
                                      &keycache->cache_lock);
         }
         while (thread->next);
         thread->opt_info= NULL;
+        /* Assert that block has a request registered. */
+        DBUG_ASSERT(hash_link->block->requests);
+        /* Assert that block is not in LRU ring. */
+        DBUG_ASSERT(!hash_link->block->next_used);
+        DBUG_ASSERT(!hash_link->block->prev_used);
       }
 #else
       KEYCACHE_DBUG_ASSERT(keycache->used_last);
 #endif
+      /*
+        If we waited above, hash_link->block has been assigned by
+        link_block(). Otherwise it is still NULL. In the latter case
+        we need to grab a block from the LRU ring ourselves.
+      */
       block= hash_link->block;
       if (! block)
       {
-        /*
-          Take the first block from the LRU chain
-          unlinking it from the chain
-        */
+        /* Select the last block from the LRU ring. */
         block= keycache->used_last->next_used;
         block->hits_left= init_hits_left;
         block->last_hit_time= 0;
-        reg_requests(keycache, block,1);
         hash_link->block= block;
+        /*
+          Register a request on the block. This unlinks it from the
+          LRU ring and protects it against eviction.
+        */
+        DBUG_ASSERT(!block->requests);
+        reg_requests(keycache, block,1);
+        /*
+          We do not need to set block->status|= BLOCK_IN_EVICTION here
+          because we will set block->status|= BLOCK_IN_SWITCH
+          immediately without releasing the lock in between. This also
+          supports debugging. When looking at the block, one can
+          see if the block has been selected by link_block() after the
+          LRU ring was empty, or if it was grabbed directly from the
+          LRU ring in this branch.
+        */
       }
+      /*
+        If we had to wait above, there is a small chance that another
+        thread grabbed this block for the same file block already. But
+        in most cases the first condition is true.
+      */
       if (block->hash_link != hash_link &&
           ! (block->status & BLOCK_IN_SWITCH) )
       {
@@ -1565,46 +2184,117 @@ restart:
           /* The block contains a dirty page - push it out of the cache */
 
           KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
 
+          if (block->status & BLOCK_IN_FLUSH)
+          {
+            /*
+              The block is marked for flush. If we do not wait here,
+              it could happen that we write the block, reassign it to
+              another file block, then, before the new owner can read
+              the new file block, the flusher writes the cache block
+              (which still has the old contents) to the new file block!
+            */
+            wait_on_queue(&block->wqueue[COND_FOR_SAVED],
+                          &keycache->cache_lock);
+            /*
+              The block is marked BLOCK_IN_SWITCH. It should be left
+              alone except for reading. No free, no write.
+            */
+            DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+            DBUG_ASSERT(!(block->status & (BLOCK_REASSIGNED |
+                                           BLOCK_CHANGED |
+                                           BLOCK_FOR_UPDATE)));
+          }
+          else
+          {
+            block->status|= BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE;
+            /*
+              BLOCK_IN_EVICTION may be true or not. Other flags must
+              have a fixed value. 
+ */ + DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) == + (BLOCK_READ | BLOCK_IN_SWITCH | + BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE | + BLOCK_CHANGED | BLOCK_IN_USE)); + DBUG_ASSERT(block->hash_link); - keycache_pthread_mutex_unlock(&keycache->cache_lock); - /* - The call is thread safe because only the current - thread might change the block->hash_link value - */ - error= my_pwrite(block->hash_link->file, - block->buffer+block->offset, - block->length - block->offset, - block->hash_link->diskpos+ block->offset, - MYF(MY_NABP | MY_WAIT_IF_FULL)); - keycache_pthread_mutex_lock(&keycache->cache_lock); - keycache->global_cache_write++; + keycache_pthread_mutex_unlock(&keycache->cache_lock); + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + error= my_pwrite(block->hash_link->file, + block->buffer+block->offset, + block->length - block->offset, + block->hash_link->diskpos+ block->offset, + MYF(MY_NABP | MY_WAIT_IF_FULL)); + keycache_pthread_mutex_lock(&keycache->cache_lock); + + /* Block status must not have changed. */ + DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) == + (BLOCK_READ | BLOCK_IN_SWITCH | + BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE | + BLOCK_CHANGED | BLOCK_IN_USE) || fail_block(block)); + keycache->global_cache_write++; + } } block->status|= BLOCK_REASSIGNED; + /* + The block comes from the LRU ring. It must have a hash_link + assigned. + */ + DBUG_ASSERT(block->hash_link); if (block->hash_link) { /* + All pending requests for this page must be resubmitted. + This must be done before waiting for readers. They could + wait for the flush to complete. And we must also do it + after the wait. Flushers might try to free the block while + we wait. They would wait until the reassignment is + complete. Also the block status must reflect the correct + situation: The block is not changed nor in flush any more. + Note that we must not change the BLOCK_CHANGED flag + outside of link_to_file_list() so that it is always in the + correct queue and the *blocks_changed counters are + correct. + */ + block->status&= ~(BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE); + link_to_file_list(keycache, block, block->hash_link->file, 1); + release_whole_queue(&block->wqueue[COND_FOR_SAVED]); + /* + The block is still assigned to its old hash_link. Wait until all pending read requests for this page are executed (we could have avoided this waiting, if we had read a page in the cache in a sweep, without yielding control) */ wait_for_readers(keycache, block); + DBUG_ASSERT(block->hash_link && block->hash_link->block == block && + block->prev_changed); + /* The reader must not have been a writer. */ + DBUG_ASSERT(!(block->status & BLOCK_CHANGED)); - /* Remove the hash link for this page from the hash table */ + /* Wake flushers that might have found the block in between. */ + release_whole_queue(&block->wqueue[COND_FOR_SAVED]); + + /* Remove the hash link for the old file block from the hash. */ unlink_hash(keycache, block->hash_link); - /* All pending requests for this page must be resubmitted */ -#ifdef THREAD - if (block->wqueue[COND_FOR_SAVED].last_thread) - release_queue(&block->wqueue[COND_FOR_SAVED]); -#endif + + /* + For sanity checks link_to_file_list() asserts that block + and hash_link refer to each other. Hence we need to assign + the hash_link first, but then we would not know if it was + linked before. Hence we would not know if to unlink it. So + unlink it here and call link_to_file_list(..., FALSE). 
+ */ + unlink_changed(block); } - link_to_file_list(keycache, block, file, - (my_bool)(block->hash_link ? 1 : 0)); - block->status= error? BLOCK_ERROR : 0; + block->status= error ? BLOCK_ERROR : BLOCK_IN_USE ; block->length= 0; block->offset= keycache->key_cache_block_size; block->hash_link= hash_link; + link_to_file_list(keycache, block, file, 0); page_status= PAGE_TO_BE_READ; KEYCACHE_DBUG_ASSERT(block->hash_link->block == block); @@ -1612,7 +2302,20 @@ restart: } else { - /* This is for secondary requests for a new page only */ + /* + Either (block->hash_link == hash_link), + or (block->status & BLOCK_IN_SWITCH). + + This is for secondary requests for a new file block only. + Either it is already assigned to the new hash_link meanwhile + (if we had to wait due to empty LRU), or it is already in + eviction by another thread. Since this block has been + grabbed from the LRU ring and attached to this hash_link, + another thread cannot grab the same block from the LRU ring + anymore. If the block is in eviction already, it must become + attached to the same hash_link and as such destined for the + same file block. + */ KEYCACHE_DBUG_PRINT("find_key_block", ("block->hash_link: %p hash_link: %p " "block->status: %u", block->hash_link, @@ -1622,10 +2325,40 @@ restart: PAGE_READ : PAGE_WAIT_TO_BE_READ); } } - keycache->global_cache_read++; + /* + Comment to be deleted: keycache->global_cache_read++; moved to + read_block(). At this place it was counted for primary and + secondary requests. Better count it where the actual read is done. + */ } else { + /* + Block is not NULL. This hash_link points to a block. + Either + - block not assigned to this hash_link (yet) or + - block assigned but not yet read from file, + or + - block assigned with valid (changed or unchanged) data and + - it will not be reassigned/freed. + + The first condition means hash_link points to a block in + eviction. This is not necessarily marked by BLOCK_IN_SWITCH yet. + But then it is marked BLOCK_IN_EVICTION. See the NOTE in + link_block(). In both cases it is destined for this hash_link + and its file block address. When this hash_link got its block + address, the block was removed from the LRU ring and cannot be + selected for eviction (for another hash_link) again. + + Register a request on the block. This is another protection + against eviction. + */ + DBUG_ASSERT(((block->hash_link != hash_link) && + (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) || + ((block->hash_link == hash_link) && + !(block->status & BLOCK_READ)) || + ((block->status & BLOCK_READ) && + !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH)))); reg_requests(keycache, block, 1); KEYCACHE_DBUG_PRINT("find_key_block", ("block->hash_link: %p hash_link: %p " @@ -1638,6 +2371,16 @@ restart: } KEYCACHE_DBUG_ASSERT(page_status != -1); + /* Same assert basically, but be very sure. */ + KEYCACHE_DBUG_ASSERT(block); + /* Assert that block has a request and is not in LRU ring. */ + DBUG_ASSERT(block->requests); + DBUG_ASSERT(!block->next_used); + DBUG_ASSERT(!block->prev_used); + /* Assert that we return the correct block. */ + DBUG_ASSERT((page_status == PAGE_WAIT_TO_BE_READ) || + ((block->hash_link->file == file) && + (block->hash_link->diskpos == filepos))); *page_st=page_status; KEYCACHE_DBUG_PRINT("find_key_block", ("fd: %d pos: %lu block->status: %u page_status: %u", @@ -1677,7 +2420,7 @@ restart: portion is less than read_length, but not less than min_length. 
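
    As an illustration of how callers are expected to drive this (a
    simplified sketch only; the real sequence is in key_cache_read()
    below, and 'page_st', 'read_length' and 'offset' are the caller's
    locals as used there):

      block= find_key_block(keycache, file, filepos, level, 0, &page_st);
      if (page_st != PAGE_READ)
      {
        /*
          The requested page is not in the buffer yet. Either read it
          (primary request) or wait until the primary requestor has
          read it (secondary request).
        */
        read_block(keycache, block,
                   keycache->key_cache_block_size, read_length+offset,
                   (my_bool)(page_st == PAGE_TO_BE_READ));
      }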
 */
-static void read_block(KEY_CACHE *keycache __attribute__((unused)),
+static void read_block(KEY_CACHE *keycache,
                        BLOCK_LINK *block, uint read_length,
                        uint min_length, my_bool primary)
 {
@@ -1689,13 +2432,27 @@ static void read_block(KEY_CACHE *keycache __attribute__((unused)),
   if (primary)
   {
     /*
-      This code is executed only by threads
-      that submitted primary requests
+      This code is executed only by threads that submitted primary
+      requests. Until block->status contains BLOCK_READ, all other
+      requests for the block become secondary requests. For a primary
+      request the block must be properly initialized.
     */
+    DBUG_ASSERT(((block->status & ~BLOCK_FOR_UPDATE) == BLOCK_IN_USE) ||
+                fail_block(block));
+    DBUG_ASSERT((block->length == 0) || fail_block(block));
+    DBUG_ASSERT((block->offset == keycache->key_cache_block_size) ||
+                fail_block(block));
+    DBUG_ASSERT((block->requests > 0) || fail_block(block));
     KEYCACHE_DBUG_PRINT("read_block",
                         ("page to be read by primary request"));
 
+    /*
+      Comment to be deleted: keycache->global_cache_read++; moved here
+      from find_key_block(). At this place it counts primary requests
+      only.
+    */
+    keycache->global_cache_read++;
     /* Page is not in buffer yet, is to be read from disk */
     keycache_pthread_mutex_unlock(&keycache->cache_lock);
     /*
@@ -1705,47 +2462,51 @@ static void read_block(KEY_CACHE *keycache __attribute__((unused)),
     got_length= my_pread(block->hash_link->file, block->buffer,
                          read_length, block->hash_link->diskpos, MYF(0));
     keycache_pthread_mutex_lock(&keycache->cache_lock);
+    /*
+      The block may meanwhile have been marked to be freed (in case of
+      FLUSH_RELEASE). Otherwise the state must be unchanged.
+    */
+    DBUG_ASSERT(((block->status & ~(BLOCK_REASSIGNED |
+                                    BLOCK_FOR_UPDATE)) == BLOCK_IN_USE) ||
+                fail_block(block));
+    DBUG_ASSERT((block->length == 0) || fail_block(block));
+    DBUG_ASSERT((block->offset == keycache->key_cache_block_size) ||
+                fail_block(block));
+    DBUG_ASSERT((block->requests > 0) || fail_block(block));
+
     if (got_length < min_length)
       block->status|= BLOCK_ERROR;
     else
     {
-      block->status= BLOCK_READ;
+      /* Comment to be deleted: Do not kill other block status flags. */
+      block->status|= BLOCK_READ;
       block->length= got_length;
+      /*
+        Do not set block->offset here. If this block is marked
+        BLOCK_CHANGED later, we want to flush only the modified part. So
+        only a writer may set block->offset down from
+        keycache->key_cache_block_size.
+      */
     }
     KEYCACHE_DBUG_PRINT("read_block",
                         ("primary request: new page in cache"));
     /* Signal that all pending requests for this page now can be processed */
-#ifdef THREAD
-    if (block->wqueue[COND_FOR_REQUESTED].last_thread)
-      release_queue(&block->wqueue[COND_FOR_REQUESTED]);
-#endif
+    release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
   }
   else
   {
     /*
-      This code is executed only by threads
-      that submitted secondary requests
+      This code is executed only by threads that submitted secondary
+      requests. At this point it could happen that the cache block is
+      not yet assigned to the hash_link for the requested file block.
+      But upon waking from the wait this should be the case. Unfortunately
+      we cannot assert this here because we do not know the hash_link
+      for the requested file block nor the file and position. So we have
+      to assert this in the caller.
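
[Editorial sketch, not part of the patch] The primary/secondary split described above is a classic "one thread fills, all others wait" scheme. A minimal illustration with hypothetical names (struct page, PG_READ), using a plain condition variable where the patch uses the COND_FOR_REQUESTED wait queue:

#include <pthread.h>

#define PG_READ 1                  /* plays the role of BLOCK_READ */

struct page
{
  pthread_mutex_t lk;
  pthread_cond_t filled;
  unsigned status;
};

/* Secondary request: wait until the primary reader publishes the page. */
static void page_wait_read(struct page *p)
{
  pthread_mutex_lock(&p->lk);
  while (!(p->status & PG_READ))
    pthread_cond_wait(&p->filled, &p->lk);
  pthread_mutex_unlock(&p->lk);
}

/* Primary request: publish the page and wake all secondary waiters. */
static void page_mark_read(struct page *p)
{
  pthread_mutex_lock(&p->lk);
  p->status|= PG_READ;             /* or the flag in; keep other flags */
  pthread_cond_broadcast(&p->filled);
  pthread_mutex_unlock(&p->lk);
}
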
*/ KEYCACHE_DBUG_PRINT("read_block", ("secondary request waiting for new page to be read")); - { -#ifdef THREAD - struct st_my_thread_var *thread= my_thread_var; - /* Put the request into a queue and wait until it can be processed */ - add_to_queue(&block->wqueue[COND_FOR_REQUESTED], thread); - do - { - KEYCACHE_DBUG_PRINT("read_block: wait", - ("suspend thread %ld", thread->id)); - keycache_pthread_cond_wait(&thread->suspend, - &keycache->cache_lock); - } - while (thread->next); -#else - KEYCACHE_DBUG_ASSERT(0); - /* No parallel requests in single-threaded case */ -#endif - } + wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock); KEYCACHE_DBUG_PRINT("read_block", ("secondary request: new page in cache")); } @@ -1786,32 +2547,59 @@ byte *key_cache_read(KEY_CACHE *keycache, uint block_length __attribute__((unused)), int return_buffer __attribute__((unused))) { + my_bool incremented= FALSE; int error=0; - uint offset= 0; byte *start= buff; DBUG_ENTER("key_cache_read"); DBUG_PRINT("enter", ("fd: %u pos: %lu length: %u", (uint) file, (ulong) filepos, length)); - if (keycache->can_be_used) + if (keycache->key_cache_inited) { /* Key cache is used */ reg1 BLOCK_LINK *block; uint read_length; + uint offset; uint status; int page_st; + /* + When the key cache is once initialized, we use the cache_lock to + reliably distinguish the cases of normal operation, resizing, and + disabled cache. We always increment and decrement + 'cnt_for_resize_op' so that a resizer can wait for pending I/O. + */ + keycache_pthread_mutex_lock(&keycache->cache_lock); + /* + Cache resizing has two phases: Flushing and re-initializing. In + the flush phase read requests are allowed to bypass the cache for + blocks not in the cache. find_key_block() returns NULL in this + case. + + After the flush phase new I/O requests must wait until the + re-initialization is done. The re-initialization can be done only + if no I/O request is in progress. The reason is that + key_cache_block_size can change. With enabled cache, I/O is done + in chunks of key_cache_block_size. Every chunk tries to use a + cache block first. If the block size changes in the middle, a + block could be missed and old data could be read. + */ + while (keycache->in_resize && !keycache->resize_in_flush) + wait_on_queue(&keycache->resize_queue, &keycache->cache_lock); + /* Register the I/O for the next resize. */ + inc_counter_for_resize_op(keycache); + incremented= TRUE; + /* Requested data may not always be aligned to cache blocks. */ offset= (uint) (filepos & (keycache->key_cache_block_size-1)); /* Read data in key_cache_block_size increments */ do { - keycache_pthread_mutex_lock(&keycache->cache_lock); + /* Cache could be disabled in a later iteration. */ if (!keycache->can_be_used) - { - keycache_pthread_mutex_unlock(&keycache->cache_lock); goto no_key_cache; - } + /* Start reading at the beginning of the cache block. */ filepos-= offset; + /* Do not read beyond the end of the cache block. */ read_length= length; set_if_smaller(read_length, keycache->key_cache_block_size-offset); KEYCACHE_DBUG_ASSERT(read_length > 0); @@ -1821,34 +2609,64 @@ byte *key_cache_read(KEY_CACHE *keycache, return_buffer=0; #endif - inc_counter_for_resize_op(keycache); + /* Request the cache block that matches file/pos. 
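
[Editorial sketch, not part of the patch] The offset arithmetic above drives the whole do/while loop: only the first chunk starts at a non-zero offset within its cache block. A self-contained demonstration of the splitting (block_size must be a power of two for the mask to work):

#include <stdio.h>

int main(void)
{
  unsigned block_size= 1024;     /* keycache->key_cache_block_size */
  unsigned long pos= 1500;       /* filepos */
  unsigned length= 3000;
  unsigned offset= (unsigned) (pos & (block_size - 1));
  do
  {
    unsigned chunk= length;
    pos-= offset;                        /* start of the cache block */
    if (chunk > block_size - offset)     /* stay inside the block */
      chunk= block_size - offset;
    printf("block at %lu: %u bytes at offset %u\n", pos, chunk, offset);
    pos+= chunk + offset;                /* advance to the next block */
    length-= chunk;
    offset= 0;                           /* only the first chunk is offset */
  } while (length);
  return 0;
}
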
*/ keycache->global_cache_r_requests++; block=find_key_block(keycache, file, filepos, level, 0, &page_st); - if (block->status != BLOCK_ERROR && page_st != PAGE_READ) - { - /* The requested page is to be read into the block buffer */ - read_block(keycache, block, - keycache->key_cache_block_size, read_length+offset, - (my_bool)(page_st == PAGE_TO_BE_READ)); - } - else if (! (block->status & BLOCK_ERROR) && - block->length < read_length + offset) + if (!block) { /* - Impossible if nothing goes wrong: - this could only happen if we are using a file with - small key blocks and are trying to read outside the file + This happens only for requests submitted during key cache + resize. The block is not in the cache and shall not go in. + Read directly from file. */ - my_errno= -1; - block->status|= BLOCK_ERROR; + keycache->global_cache_read++; + keycache_pthread_mutex_unlock(&keycache->cache_lock); + if (my_pread(file, (byte*) buff, read_length, + filepos + offset, MYF(MY_NABP))) + { + error= 1; + } + keycache_pthread_mutex_lock(&keycache->cache_lock); + goto next_block; + } + if (block->status != BLOCK_ERROR) + { + if (page_st != PAGE_READ) + { + /* The requested page is to be read into the block buffer */ + read_block(keycache, block, + keycache->key_cache_block_size, read_length+offset, + (my_bool)(page_st == PAGE_TO_BE_READ)); + /* + A secondary request must now have the block assigned to the + requested file block. It does not hurt to check it for + primary requests too. + */ + DBUG_ASSERT(keycache->can_be_used); + DBUG_ASSERT(block->hash_link->file == file); + DBUG_ASSERT(block->hash_link->diskpos == filepos); + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + } + else if (block->length < read_length + offset) + { + /* + Impossible if nothing goes wrong: + this could only happen if we are using a file with + small key blocks and are trying to read outside the file + */ + my_errno= -1; + block->status|= BLOCK_ERROR; + } } + /* block status may have added BLOCK_ERROR in the above 'if'. */ if (! ((status= block->status) & BLOCK_ERROR)) { #ifndef THREAD if (! return_buffer) #endif { + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); #if !defined(SERIALIZED_READ_FROM_CACHE) keycache_pthread_mutex_unlock(&keycache->cache_lock); #endif @@ -1861,44 +2679,63 @@ byte *key_cache_read(KEY_CACHE *keycache, #if !defined(SERIALIZED_READ_FROM_CACHE) keycache_pthread_mutex_lock(&keycache->cache_lock); + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); #endif } } remove_reader(block); + /* - Link the block into the LRU chain - if it's the last submitted request for the block + Link the block into the LRU ring if it's the last submitted + request for the block. This enables eviction for the block. 
*/ unreg_request(keycache, block, 1); - dec_counter_for_resize_op(keycache); - - keycache_pthread_mutex_unlock(&keycache->cache_lock); - if (status & BLOCK_ERROR) - DBUG_RETURN((byte *) 0); + { + error= 1; + break; + } #ifndef THREAD /* This is only true if we where able to read everything in one block */ if (return_buffer) DBUG_RETURN(block->buffer); #endif + next_block: buff+= read_length; filepos+= read_length+offset; offset= 0; } while ((length-= read_length)); - DBUG_RETURN(start); + goto end; } -no_key_cache: /* Key cache is not used */ +no_key_cache: + /* Key cache is not used */ - /* We can't use mutex here as the key cache may not be initialized */ + if (keycache->key_cache_inited && !incremented) + { + keycache_pthread_mutex_lock(&keycache->cache_lock); + inc_counter_for_resize_op(keycache); + incremented= TRUE; + } keycache->global_cache_r_requests++; keycache->global_cache_read++; - if (my_pread(file, (byte*) buff, length, filepos+offset, MYF(MY_NABP))) + if (incremented) + keycache_pthread_mutex_unlock(&keycache->cache_lock); + if (my_pread(file, (byte*) buff, length, filepos, MYF(MY_NABP))) error= 1; + if (incremented) + keycache_pthread_mutex_lock(&keycache->cache_lock); + +end: + if (incremented) + { + dec_counter_for_resize_op(keycache); + keycache_pthread_mutex_unlock(&keycache->cache_lock); + } DBUG_RETURN(error ? (byte*) 0 : start); } @@ -1927,92 +2764,219 @@ int key_cache_insert(KEY_CACHE *keycache, File file, my_off_t filepos, int level, byte *buff, uint length) { + int error= 0; DBUG_ENTER("key_cache_insert"); DBUG_PRINT("enter", ("fd: %u pos: %lu length: %u", (uint) file,(ulong) filepos, length)); - if (keycache->can_be_used) + if (keycache->key_cache_inited) { /* Key cache is used */ reg1 BLOCK_LINK *block; uint read_length; - int page_st; - int error; uint offset; + int page_st; + my_bool incremented= FALSE; + /* + When the keycache is once initialized, we use the cache_lock to + reliably distinguish the cases of normal operation, resizing, and + disabled cache. We always increment and decrement + 'cnt_for_resize_op' so that a resizer can wait for pending I/O. + */ + keycache_pthread_mutex_lock(&keycache->cache_lock); + /* + We do not load index data into a disabled cache nor into an + ongoing resize. + */ + if (!keycache->can_be_used || keycache->in_resize) + goto no_key_cache; + /* Register the pseudo I/O for the next resize. */ + inc_counter_for_resize_op(keycache); + incremented= TRUE; + /* Loaded data may not always be aligned to cache blocks. */ offset= (uint) (filepos & (keycache->key_cache_block_size-1)); + /* Load data in key_cache_block_size increments. */ do { - keycache_pthread_mutex_lock(&keycache->cache_lock); - if (!keycache->can_be_used) - { - keycache_pthread_mutex_unlock(&keycache->cache_lock); - DBUG_RETURN(0); - } - /* Read data into key cache from buff in key_cache_block_size incr. */ + /* Cache could be disabled or resizing in a later iteration. */ + if (!keycache->can_be_used || keycache->in_resize) + goto no_key_cache; + /* Start loading at the beginning of the cache block. */ filepos-= offset; + /* Do not load beyond the end of the cache block. */ read_length= length; set_if_smaller(read_length, keycache->key_cache_block_size-offset); KEYCACHE_DBUG_ASSERT(read_length > 0); - inc_counter_for_resize_op(keycache); + /* The block has been read by the caller already. */ + keycache->global_cache_read++; + /* Request the cache block that matches file/pos. 
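
[Editorial sketch, not part of the patch] The inc/dec pairing around raw file I/O above implements a simple "pending I/O" counter that a resizer can drain before re-initializing. A minimal illustration with hypothetical names (io_begin, io_end, resize_wait):

#include <pthread.h>

struct cache_ctr
{
  pthread_mutex_t lk;
  pthread_cond_t idle;      /* counterpart of waiting_for_resize_cnt */
  unsigned cnt;             /* counterpart of cnt_for_resize_op */
};

static void io_begin(struct cache_ctr *c)
{
  pthread_mutex_lock(&c->lk);
  c->cnt++;
  pthread_mutex_unlock(&c->lk);
}

static void io_end(struct cache_ctr *c)
{
  pthread_mutex_lock(&c->lk);
  if (!--c->cnt)
    pthread_cond_signal(&c->idle);    /* at most one resizer waits */
  pthread_mutex_unlock(&c->lk);
}

static void resize_wait(struct cache_ctr *c)
{
  pthread_mutex_lock(&c->lk);
  while (c->cnt)
    pthread_cond_wait(&c->idle, &c->lk);
  /* ... safe to re-initialize: no I/O is in flight ... */
  pthread_mutex_unlock(&c->lk);
}
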
 */
      keycache->global_cache_r_requests++;
      block= find_key_block(keycache, file, filepos, level, 0, &page_st);
-      if (block->status != BLOCK_ERROR && page_st != PAGE_READ)
+      if (!block)
       {
-        /* The requested page is to be read into the block buffer */
-#if !defined(SERIALIZED_READ_FROM_CACHE)
-        keycache_pthread_mutex_unlock(&keycache->cache_lock);
         /*
-          Here other threads may step in and register as secondary readers.
-          They will register in block->wqueue[COND_FOR_REQUESTED].
+          This happens only for requests submitted during key cache
+          resize. The block is not in the cache and shall not go in.
+          Stop loading index data.
         */
+        goto no_key_cache;
+      }
+      if (block->status != BLOCK_ERROR)
+      {
+        if ((page_st == PAGE_WAIT_TO_BE_READ) ||
+            ((page_st == PAGE_TO_BE_READ) &&
+             (offset || (read_length < keycache->key_cache_block_size))))
+        {
+          /*
+            Either
+
+            this is a secondary request for a block to be read into the
+            cache. The block is in eviction. It is not yet assigned to
+            the requested file block (it does not point to the right
+            hash_link). So we cannot call remove_reader() on the block.
+            And we cannot access the hash_link directly here. We need to
+            wait until the assignment is complete. read_block() executes
+            the correct wait when called with primary == FALSE.
+
+            Or
+
+            this is a primary request for a block to be read into the
+            cache and the supplied data does not fill the whole block.
+
+            This function is called on behalf of a LOAD INDEX INTO CACHE
+            statement, which is a read-only task and allows other
+            readers. It is possible that a parallel running reader tries
+            to access this block. If it needs more data than has been
+            supplied here, it would report an error. To be sure that we
+            have all data in the block that is available in the file, we
+            read the block ourselves.
+
+            Though reading again what the caller has already read is an
+            expensive operation, we need to do this for correctness.
+          */
+#if !defined(INGO_TEST_LOADIDX_OFF)
+          /*
+            Note that this happens only for key_cache_block_size >
+            MI_MIN_KEY_BLOCK_LENGTH *and* LOAD INDEX INTO CACHE ...
+            IGNORE LEAVES. Otherwise mi_preload() supplies this function
+            with aligned blocks.
+          */
+#endif
+          read_block(keycache, block, keycache->key_cache_block_size,
+                     read_length + offset, (page_st == PAGE_TO_BE_READ));
+          /*
+            A secondary request must now have the block assigned to the
+            requested file block. It does not hurt to check it for
+            primary requests too.
+          */
+          DBUG_ASSERT(keycache->can_be_used);
+          DBUG_ASSERT(block->hash_link->file == file);
+          DBUG_ASSERT(block->hash_link->diskpos == filepos);
+          DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+        }
+        else if (page_st == PAGE_TO_BE_READ)
+        {
+          /*
+            This is a new block in the cache. If we come here, we have
+            data for the whole block.
+          */
+          DBUG_ASSERT(block->hash_link->requests);
+          DBUG_ASSERT(block->status & BLOCK_IN_USE);
+          DBUG_ASSERT((page_st == PAGE_TO_BE_READ) ||
+                      (block->status & BLOCK_READ));
+#if !defined(SERIALIZED_READ_FROM_CACHE)
+          keycache_pthread_mutex_unlock(&keycache->cache_lock);
+          /*
+            Here other threads may step in and register as secondary readers.
+            They will register in block->wqueue[COND_FOR_REQUESTED].
+ */ #endif - /* Copy data from buff */ - if (!(read_length & 511)) - bmove512(block->buffer+offset, buff, read_length); - else - memcpy(block->buffer+offset, buff, (size_t) read_length); + /* Copy data from buff */ + if (!(read_length & 511)) + bmove512(block->buffer+offset, buff, read_length); + else + memcpy(block->buffer+offset, buff, (size_t) read_length); #if !defined(SERIALIZED_READ_FROM_CACHE) - keycache_pthread_mutex_lock(&keycache->cache_lock); - /* Here we are alone again. */ + keycache_pthread_mutex_lock(&keycache->cache_lock); + DBUG_ASSERT(block->status & BLOCK_IN_USE); + DBUG_ASSERT((page_st == PAGE_TO_BE_READ) || + (block->status & BLOCK_READ)); #endif - block->status= BLOCK_READ; - block->length= read_length+offset; - KEYCACHE_DBUG_PRINT("key_cache_insert", - ("primary request: new page in cache")); -#ifdef THREAD - /* Signal that all pending requests for this now can be processed. */ - if (block->wqueue[COND_FOR_REQUESTED].last_thread) - release_queue(&block->wqueue[COND_FOR_REQUESTED]); -#endif - } + /* + After the data is in the buffer, we can declare the block + valid. Now other threads do not need to register as + secondary readers any more. They can immediately access the + block. + */ + block->status|= BLOCK_READ; + block->length= read_length+offset; + /* + Do not set block->offset here. If this block is marked + BLOCK_CHANGED later, we want to flush only the modified part. So + only a writer may set block->offset down from + keycache->key_cache_block_size. + */ + KEYCACHE_DBUG_PRINT("key_cache_insert", + ("primary request: new page in cache")); + /* Signal all pending requests. */ + release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]); + } + else + { + /* + page_st == PAGE_READ. The block is in the buffer. All data + must already be present. Blocks are always read with all + data available on file. Assert that the block does not have + less contents than the preloader supplies. If the caller has + data beyond block->length, it means that a file write has + been done while this block was in cache and not extended + with the new data. If the condition is met, we can simply + ignore the block. + */ + DBUG_ASSERT((page_st == PAGE_READ) && + (read_length + offset <= block->length)); + } + + /* + A secondary request must now have the block assigned to the + requested file block. It does not hurt to check it for primary + requests too. + */ + DBUG_ASSERT(block->hash_link->file == file); + DBUG_ASSERT(block->hash_link->diskpos == filepos); + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + } /* end of if (block->status != BLOCK_ERROR) */ remove_reader(block); + /* - Link the block into the LRU chain - if it's the last submitted request for the block + Link the block into the LRU ring if it's the last submitted + request for the block. This enables eviction for the block. */ unreg_request(keycache, block, 1); error= (block->status & BLOCK_ERROR); - dec_counter_for_resize_op(keycache); - - keycache_pthread_mutex_unlock(&keycache->cache_lock); - if (error) - DBUG_RETURN(1); + break; buff+= read_length; filepos+= read_length+offset; offset= 0; } while ((length-= read_length)); + + no_key_cache: + if (incremented) + dec_counter_for_resize_op(keycache); + keycache_pthread_mutex_unlock(&keycache->cache_lock); } - DBUG_RETURN(0); + + DBUG_RETURN(error); } @@ -2041,6 +3005,8 @@ int key_cache_insert(KEY_CACHE *keycache, It ensures that this data is flushed to the file if dont_write is FALSE. 
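
[Editorial sketch, not part of the patch] The copy above picks between bmove512() and memcpy() by testing the low nine bits of the length. A condensed form of that choice; the bmove512() prototype is reproduced here from mysys as an assumption:

#include <string.h>

extern void bmove512(unsigned char *to, const unsigned char *from,
                     unsigned length);     /* assumed mysys prototype */

static void copy_into_block(unsigned char *dst, const unsigned char *src,
                            unsigned len)
{
  if (!(len & 511))          /* len is a whole number of 512-byte chunks */
    bmove512(dst, src, len); /* unrolled fast path */
  else
    memcpy(dst, src, len);   /* general fallback */
}
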
Filepos must be a multiple of 'block_length', but it doesn't have to be a multiple of key_cache_block_size; + + dont_write is always TRUE in the server (info->lock_type is never F_UNLCK). */ int key_cache_write(KEY_CACHE *keycache, @@ -2049,7 +3015,7 @@ int key_cache_write(KEY_CACHE *keycache, uint block_length __attribute__((unused)), int dont_write) { - reg1 BLOCK_LINK *block; + my_bool incremented= FALSE; int error=0; DBUG_ENTER("key_cache_write"); DBUG_PRINT("enter", @@ -2059,10 +3025,14 @@ int key_cache_write(KEY_CACHE *keycache, if (!dont_write) { - /* Force writing from buff into disk */ + /* purecov: begin inspected */ + /* Not used in the server. */ + /* Force writing from buff into disk. */ + keycache->global_cache_w_requests++; keycache->global_cache_write++; if (my_pwrite(file, buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL))) DBUG_RETURN(1); + /* purecov: end */ } #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) @@ -2070,92 +3040,222 @@ int key_cache_write(KEY_CACHE *keycache, test_key_cache(keycache, "start of key_cache_write", 1);); #endif - if (keycache->can_be_used) + if (keycache->key_cache_inited) { /* Key cache is used */ + reg1 BLOCK_LINK *block; uint read_length; - int page_st; uint offset; + int page_st; + /* + When the key cache is once initialized, we use the cache_lock to + reliably distinguish the cases of normal operation, resizing, and + disabled cache. We always increment and decrement + 'cnt_for_resize_op' so that a resizer can wait for pending I/O. + */ + keycache_pthread_mutex_lock(&keycache->cache_lock); + /* + Cache resizing has two phases: Flushing and re-initializing. In + the flush phase write requests can modify dirty blocks that are + not yet in flush. Otherwise they are allowed to bypass the cache. + find_key_block() returns NULL in both cases (clean blocks and + non-cached blocks). + + After the flush phase new I/O requests must wait until the + re-initialization is done. The re-initialization can be done only + if no I/O request is in progress. The reason is that + key_cache_block_size can change. With enabled cache I/O is done in + chunks of key_cache_block_size. Every chunk tries to use a cache + block first. If the block size changes in the middle, a block + could be missed and data could be written below a cached block. + */ + while (keycache->in_resize && !keycache->resize_in_flush) + wait_on_queue(&keycache->resize_queue, &keycache->cache_lock); + /* Register the I/O for the next resize. */ + inc_counter_for_resize_op(keycache); + incremented= TRUE; + /* Requested data may not always be aligned to cache blocks. */ offset= (uint) (filepos & (keycache->key_cache_block_size-1)); + /* Write data in key_cache_block_size increments. */ do { - keycache_pthread_mutex_lock(&keycache->cache_lock); + /* Cache could be disabled in a later iteration. */ if (!keycache->can_be_used) - { - keycache_pthread_mutex_unlock(&keycache->cache_lock); goto no_key_cache; - } - /* Write data in key_cache_block_size increments */ + /* Start writing at the beginning of the cache block. */ filepos-= offset; + /* Do not write beyond the end of the cache block. */ read_length= length; set_if_smaller(read_length, keycache->key_cache_block_size-offset); KEYCACHE_DBUG_ASSERT(read_length > 0); - inc_counter_for_resize_op(keycache); + /* Request the cache block that matches file/pos. 
 */
      keycache->global_cache_w_requests++;
      block= find_key_block(keycache, file, filepos, level, 1, &page_st);
      if (!block)
      {
-        /* It happens only for requests submitted during resize operation */
-        dec_counter_for_resize_op(keycache);
-        keycache_pthread_mutex_unlock(&keycache->cache_lock);
-        if (dont_write)
+        /*
+          This happens only for requests submitted during key cache
+          resize. The block is not in the cache and shall not go in.
+          Write directly to file.
+        */
+        if (dont_write)
        {
-          keycache->global_cache_w_requests++;
+          /* Used in the server. */
           keycache->global_cache_write++;
-          if (my_pwrite(file, (byte*) buff, length, filepos,
-                        MYF(MY_NABP | MY_WAIT_IF_FULL)))
+          keycache_pthread_mutex_unlock(&keycache->cache_lock);
+          if (my_pwrite(file, (byte*) buff, read_length, filepos + offset,
+                        MYF(MY_NABP | MY_WAIT_IF_FULL)))
             error=1;
-        }
+          keycache_pthread_mutex_lock(&keycache->cache_lock);
+        }
         goto next_block;
       }
+      /*
+        Prevent block from flushing and from being selected to be
+        freed. This must be set when we release the cache_lock.
+        However, we must not set the status of the block before it is
+        assigned to this file/pos.
+      */
+      if (page_st != PAGE_WAIT_TO_BE_READ)
+        block->status|= BLOCK_FOR_UPDATE;
+      /*
+        We must read the file block first if it is not yet in the cache
+        and we do not replace all of its contents.
 
-      if (block->status != BLOCK_ERROR && page_st != PAGE_READ &&
-          (offset || read_length < keycache->key_cache_block_size))
+        In cases where the cache block is big enough to contain (parts
+        of) index blocks of different indexes, our request can be
+        secondary (PAGE_WAIT_TO_BE_READ). In this case another thread is
+        reading the file block. If the read completes after us, it
+        overwrites our new contents with the old contents. So we have to
+        wait for the other thread to complete the read of this block.
+        read_block() takes care of the wait.
+      */
+      if (block->status != BLOCK_ERROR &&
+          ((page_st == PAGE_TO_BE_READ &&
+            (offset || read_length < keycache->key_cache_block_size)) ||
+           (page_st == PAGE_WAIT_TO_BE_READ)))
+      {
        read_block(keycache, block,
                   offset + read_length >= keycache->key_cache_block_size?
                   offset : keycache->key_cache_block_size,
-                   offset,(my_bool)(page_st == PAGE_TO_BE_READ));
+                   offset, (page_st == PAGE_TO_BE_READ));
+        DBUG_ASSERT(keycache->can_be_used);
+        DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+        /*
+          Prevent block from flushing and from being selected to be
+          freed. This must be set when we release the cache_lock.
+          Here we set it in case we could not set it above.
+        */
+        block->status|= BLOCK_FOR_UPDATE;
+      }
+      /*
+        The block should always be assigned to the requested file block
+        here. It need not be BLOCK_READ when overwriting the whole block.
+      */
+      DBUG_ASSERT(block->hash_link->file == file);
+      DBUG_ASSERT(block->hash_link->diskpos == filepos);
+      DBUG_ASSERT(block->status & BLOCK_IN_USE);
+      DBUG_ASSERT((page_st == PAGE_TO_BE_READ) || (block->status & BLOCK_READ));
+      /*
+        The block to be written must not be marked BLOCK_REASSIGNED.
+        Otherwise it could be freed in dirty state or reused without
+        another flush during eviction. It must also not be in flush.
+        Otherwise the old contents may have been flushed already and
+        the flusher could clear BLOCK_CHANGED without flushing the
+        new changes again.
+      */
+      DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED));
+
+      while (block->status & BLOCK_IN_FLUSHWRITE)
+      {
+        /*
+          Another thread is flushing the block. It was dirty already.
+          Wait until the block is flushed to file.
Otherwise we could + modify the buffer contents just while it is written to file. + An unpredictable file block contents would be the result. + While we wait, several things can happen to the block, + including another flush. But the block cannot be reassigned to + another hash_link until we release our request on it. + */ + wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock); + DBUG_ASSERT(keycache->can_be_used); + DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); + /* Still must not be marked for free. */ + DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED)); + DBUG_ASSERT(block->hash_link && (block->hash_link->block == block)); + } + + /* + We could perhaps release the cache_lock during access of the + data like in the other functions. Locks outside of the key cache + assure that readers and a writer do not access the same range of + data. Parallel accesses should happen only if the cache block + contains multiple index block(fragment)s. So different parts of + the buffer would be read/written. An attempt to flush during + memcpy() is prevented with BLOCK_FOR_UPDATE. + */ + if (! (block->status & BLOCK_ERROR)) + { +#if !defined(SERIALIZED_READ_FROM_CACHE) + keycache_pthread_mutex_unlock(&keycache->cache_lock); +#endif + if (!(read_length & 511)) + bmove512(block->buffer+offset, buff, read_length); + else + memcpy(block->buffer+offset, buff, (size_t) read_length); + +#if !defined(SERIALIZED_READ_FROM_CACHE) + keycache_pthread_mutex_lock(&keycache->cache_lock); +#endif + } if (!dont_write) { - /* buff has been written to disk at start */ + /* Not used in the server. buff has been written to disk at start. */ if ((block->status & BLOCK_CHANGED) && (!offset && read_length >= keycache->key_cache_block_size)) link_to_file_list(keycache, block, block->hash_link->file, 1); } else if (! (block->status & BLOCK_CHANGED)) link_to_changed_list(keycache, block); + block->status|=BLOCK_READ; + /* + Allow block to be selected for to be freed. Since it is marked + BLOCK_CHANGED too, it won't be selected for to be freed without + a flush. + */ + block->status&= ~BLOCK_FOR_UPDATE; set_if_smaller(block->offset, offset); set_if_bigger(block->length, read_length+offset); - if (! (block->status & BLOCK_ERROR)) - { - if (!(read_length & 511)) - bmove512(block->buffer+offset, buff, read_length); - else - memcpy(block->buffer+offset, buff, (size_t) read_length); - } + /* Threads may be waiting for the changes to be complete. */ + release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]); - block->status|=BLOCK_READ; + /* + If only a part of the cache block is to be replaced, and the + rest has been read from file, then the cache lock has been + released for I/O and it could be possible that another thread + wants to evict or free the block and waits for it to be + released. So we must not just decrement hash_link->requests, but + also wake a waiting thread. + */ + remove_reader(block); - /* Unregister the request */ - block->hash_link->requests--; + /* + Link the block into the LRU ring if it's the last submitted + request for the block. This enables eviction for the block. + */ unreg_request(keycache, block, 1); if (block->status & BLOCK_ERROR) { - keycache_pthread_mutex_unlock(&keycache->cache_lock); error= 1; break; } - dec_counter_for_resize_op(keycache); - - keycache_pthread_mutex_unlock(&keycache->cache_lock); - next_block: buff+= read_length; filepos+= read_length+offset; @@ -2169,14 +3269,30 @@ no_key_cache: /* Key cache is not used */ if (dont_write) { + /* Used in the server. 
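
[Editorial sketch, not part of the patch] The comments above insist that BLOCK_CHANGED only moves together with the dirty/clean list membership. That invariant is easiest to keep when flag and counter are touched in exactly one place, as in this hypothetical miniature:

enum { CHANGED= 4 };
struct blk { unsigned status; struct blk *next; };
struct lists { struct blk *clean, *dirty; unsigned blocks_changed; };

static void to_dirty_list(struct lists *c, struct blk *b)
{
  if (!(b->status & CHANGED))
  {
    b->status|= CHANGED;
    c->blocks_changed++;      /* counter moves only with the flag */
  }
  b->next= c->dirty;
  c->dirty= b;
}

static void to_clean_list(struct lists *c, struct blk *b)
{
  if (b->status & CHANGED)
  {
    b->status&= ~CHANGED;
    c->blocks_changed--;      /* and only here on the way back */
  }
  b->next= c->clean;
  c->clean= b;
}
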
*/ + if (keycache->key_cache_inited && !incremented) + { + keycache_pthread_mutex_lock(&keycache->cache_lock); + inc_counter_for_resize_op(keycache); + incremented= TRUE; + } keycache->global_cache_w_requests++; keycache->global_cache_write++; + if (incremented) + keycache_pthread_mutex_unlock(&keycache->cache_lock); if (my_pwrite(file, (byte*) buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL))) error=1; + if (incremented) + keycache_pthread_mutex_lock(&keycache->cache_lock); } end: + if (incremented) + { + dec_counter_for_resize_op(keycache); + keycache_pthread_mutex_unlock(&keycache->cache_lock); + } #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) DBUG_EXECUTE("exec", test_key_cache(keycache, "end of key_cache_write", 1);); @@ -2186,9 +3302,30 @@ end: /* - Free block: remove reference to it from hash table, - remove it from the chain file of dirty/clean blocks - and add it to the free list. + Free block. + + SYNOPSIS + free_block() + keycache Pointer to a key cache data structure + block Pointer to the block to free + + DESCRIPTION + Remove reference to block from hash table. + Remove block from the chain of clean blocks. + Add block to the free list. + + NOTE + Block must not be free (status == 0). + Block must not be in free_block_list. + Block must not be in the LRU ring. + Block must not be in eviction (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH). + Block must not be in free (BLOCK_REASSIGNED). + Block must not be in flush (BLOCK_IN_FLUSH). + Block must not be dirty (BLOCK_CHANGED). + Block must not be in changed_blocks (dirty) hash. + Block must be in file_blocks (clean) hash. + Block must refer to a hash_link. + Block must have a request registered on it. */ static void free_block(KEY_CACHE *keycache, BLOCK_LINK *block) @@ -2197,6 +3334,31 @@ static void free_block(KEY_CACHE *keycache, BLOCK_LINK *block) KEYCACHE_DBUG_PRINT("free_block", ("block %u to be freed, hash_link %p", BLOCK_NUMBER(block), block->hash_link)); + /* + Assert that the block is not free already. And that it is in a clean + state. Note that the block might just be assigned to a hash_link and + not yet read (BLOCK_READ may not be set here). In this case a reader + is registered in the hash_link and free_block() will wait for it + below. + */ + DBUG_ASSERT((block->status & BLOCK_IN_USE) && + !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH | + BLOCK_REASSIGNED | BLOCK_IN_FLUSH | + BLOCK_CHANGED | BLOCK_FOR_UPDATE))); + /* Assert that the block is in a file_blocks chain. */ + DBUG_ASSERT(block->prev_changed && *block->prev_changed == block); + /* Assert that the block is not in the LRU ring. */ + DBUG_ASSERT(!block->next_used && !block->prev_used); + /* + IMHO the below condition (if()) makes no sense. I can't see how it + could be possible that free_block() is entered with a NULL hash_link + pointer. The only place where it can become NULL is in free_block() + (or before its first use ever, but for those blocks free_block() is + not called). I don't remove the conditional as it cannot harm, but + place an DBUG_ASSERT to confirm my hypothesis. Eventually the + condition (if()) can be removed. + */ + DBUG_ASSERT(block->hash_link && block->hash_link->block == block); if (block->hash_link) { /* @@ -2207,35 +3369,85 @@ static void free_block(KEY_CACHE *keycache, BLOCK_LINK *block) */ block->status|= BLOCK_REASSIGNED; wait_for_readers(keycache, block); - unlink_hash(keycache, block->hash_link); + /* + The block must not have been freed by another thread. Repeat some + checks. 
An additional requirement is that it must be read now
+      (BLOCK_READ).
+    */
+    DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+    DBUG_ASSERT((block->status & (BLOCK_READ | BLOCK_IN_USE |
+                                  BLOCK_REASSIGNED)) &&
+                !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                                   BLOCK_IN_FLUSH | BLOCK_CHANGED |
+                                   BLOCK_FOR_UPDATE)));
+    DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+    DBUG_ASSERT(!block->prev_used);
+    /*
+      Unset BLOCK_REASSIGNED again. If we hand the block to an evicting
+      thread (through unreg_request() below), other threads must not see
+      this flag. They could become confused.
+    */
+    block->status&= ~BLOCK_REASSIGNED;
+    /*
+      Do not release the hash_link until the block is off all lists.
+      At least not if we hand it over for eviction in unreg_request().
+    */
   }
-  unlink_changed(block);
-  block->status= 0;
-  block->length= 0;
-  block->offset= keycache->key_cache_block_size;
-  KEYCACHE_THREAD_TRACE("free block");
-  KEYCACHE_DBUG_PRINT("free_block",
-                      ("block is freed"));
+  /*
+    Unregister the block request and link the block into the LRU ring.
+    This enables eviction for the block. If the LRU ring was empty and
+    threads are waiting for a block, then the block will be handed over
+    for eviction immediately. Otherwise we will unlink it from the LRU
+    ring again, without releasing the lock in between. So decrementing
+    the request counter and updating statistics are the only relevant
+    operations in this case. Assert that there are no other requests
+    registered.
+  */
+  DBUG_ASSERT(block->requests == 1);
   unreg_request(keycache, block, 0);
-  block->hash_link= NULL;
+  /*
+    Note that even without releasing the cache lock it is possible that
+    the block is immediately selected for eviction by link_block() and
+    thus not added to the LRU ring. In this case we must not touch the
+    block any more.
+  */
+  if (block->status & BLOCK_IN_EVICTION)
+    return;
 
-  /* Remove the free block from the LRU ring. */
+  /* Here the block must be in the LRU ring. Unlink it again. */
+  DBUG_ASSERT(block->next_used && block->prev_used &&
+              *block->prev_used == block);
   unlink_block(keycache, block);
   if (block->temperature == BLOCK_WARM)
     keycache->warm_blocks--;
   block->temperature= BLOCK_COLD;
+
+  /* Remove from file_blocks hash. */
+  unlink_changed(block);
+
+  /* Remove reference to block from hash table. */
+  unlink_hash(keycache, block->hash_link);
+  block->hash_link= NULL;
+
+  block->status= 0;
+  block->length= 0;
+  block->offset= keycache->key_cache_block_size;
+  KEYCACHE_THREAD_TRACE("free block");
+  KEYCACHE_DBUG_PRINT("free_block", ("block is freed"));
+
+  /* Enforced by unlink_changed(), but just to be sure. */
+  DBUG_ASSERT(!block->next_changed && !block->prev_changed);
+  /* Enforced by unlink_block(): not in LRU ring nor in free_block_list. */
+  DBUG_ASSERT(!block->next_used && !block->prev_used);
   /* Insert the free block in the free list. */
   block->next_used= keycache->free_block_list;
   keycache->free_block_list= block;
   /* Keep track of the number of currently unused blocks. */
   keycache->blocks_unused++;
-#ifdef THREAD
   /* All pending requests for this page must be resubmitted. */
-  if (block->wqueue[COND_FOR_SAVED].last_thread)
-    release_queue(&block->wqueue[COND_FOR_SAVED]);
-#endif
+  release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
 }
 
 
@@ -2269,56 +3481,99 @@ static int flush_cached_blocks(KEY_CACHE *keycache,
   qsort((byte*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);
 
   keycache_pthread_mutex_lock(&keycache->cache_lock);
+  /*
+    Note: Do not break the loop.
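
[Editorial sketch, not part of the patch] The tail of free_block() above is a plain LIFO push: the free list is chained through next_used and counted in blocks_unused. Reduced to its core with hypothetical names:

struct link { struct link *next_used; };
struct kc { struct link *free_block_list; unsigned long blocks_unused; };

static void push_free(struct kc *k, struct link *b)
{
  b->next_used= k->free_block_list;   /* O(1) prepend */
  k->free_block_list= b;
  k->blocks_unused++;
}
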
We have registered a request on every + block in 'cache'. These must be unregistered by free_block() or + unreg_request(). + */ for ( ; cache != end ; cache++) { BLOCK_LINK *block= *cache; KEYCACHE_DBUG_PRINT("flush_cached_blocks", ("block %u to be flushed", BLOCK_NUMBER(block))); - keycache_pthread_mutex_unlock(&keycache->cache_lock); - error= my_pwrite(file, - block->buffer+block->offset, - block->length - block->offset, - block->hash_link->diskpos+ block->offset, - MYF(MY_NABP | MY_WAIT_IF_FULL)); - keycache_pthread_mutex_lock(&keycache->cache_lock); - keycache->global_cache_write++; - if (error) + /* + If the block contents is going to be changed, we abandon the flush + for this block. flush_key_blocks_int() will restart its search and + handle the block properly. + */ + if (!(block->status & BLOCK_FOR_UPDATE)) { - block->status|= BLOCK_ERROR; - if (!last_errno) - last_errno= errno ? errno : -1; + /* Blocks coming here must have a certain status. */ + DBUG_ASSERT(block->hash_link); + DBUG_ASSERT(block->hash_link->block == block); + DBUG_ASSERT(block->hash_link->file == file); + DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) == + (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE)); + block->status|= BLOCK_IN_FLUSHWRITE; + keycache_pthread_mutex_unlock(&keycache->cache_lock); + error= my_pwrite(file, + block->buffer+block->offset, + block->length - block->offset, + block->hash_link->diskpos+ block->offset, + MYF(MY_NABP | MY_WAIT_IF_FULL)); + keycache_pthread_mutex_lock(&keycache->cache_lock); + keycache->global_cache_write++; + if (error) + { + block->status|= BLOCK_ERROR; + if (!last_errno) + last_errno= errno ? errno : -1; + } + block->status&= ~BLOCK_IN_FLUSHWRITE; + /* Block must not have changed status except BLOCK_FOR_UPDATE. */ + DBUG_ASSERT(block->hash_link); + DBUG_ASSERT(block->hash_link->block == block); + DBUG_ASSERT(block->hash_link->file == file); + DBUG_ASSERT((block->status & ~(BLOCK_FOR_UPDATE | BLOCK_IN_EVICTION)) == + (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE)); + /* + Set correct status and link in right queue for free or later use. + free_block() must not see BLOCK_CHANGED and it may need to wait + for readers of the block. These should not see the block in the + wrong hash. If not freeing the block, we need to have it in the + right queue anyway. + */ + link_to_file_list(keycache, block, file, 1); } - #ifdef THREAD + block->status&= ~BLOCK_IN_FLUSH; /* Let to proceed for possible waiting requests to write to the block page. It might happen only during an operation to resize the key cache. */ - if (block->wqueue[COND_FOR_SAVED].last_thread) - release_queue(&block->wqueue[COND_FOR_SAVED]); -#endif + release_whole_queue(&block->wqueue[COND_FOR_SAVED]); /* type will never be FLUSH_IGNORE_CHANGED here */ - if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) + if (!(type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE) && + !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH | + BLOCK_FOR_UPDATE))) { - keycache->blocks_changed--; - keycache->global_blocks_changed--; + /* + Note that a request has been registered against the block in + flush_key_blocks_int(). + */ free_block(keycache, block); } else { - block->status&= ~BLOCK_IN_FLUSH; - link_to_file_list(keycache, block, file, 1); + /* + Link the block into the LRU ring if it's the last submitted + request for the block. This enables eviction for the block. + Note that a request has been registered against the block in + flush_key_blocks_int(). 
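
[Editorial sketch, not part of the patch] The write burst above is ordered by the qsort() call at the top of flush_cached_blocks(), so the writes hit the file sequentially. A comparator in the spirit of cmp_sec_link(), ordering block pointers by disk position:

#include <stdlib.h>

struct fblk { unsigned long diskpos; };

static int by_diskpos(const void *a, const void *b)
{
  const struct fblk *x= *(const struct fblk *const *) a;
  const struct fblk *y= *(const struct fblk *const *) b;
  /* Return -1/0/1 without risking overflow in a subtraction. */
  return (x->diskpos > y->diskpos) - (x->diskpos < y->diskpos);
}

/* usage: qsort(cache, count, sizeof(struct fblk *), by_diskpos); */
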
+ */ unreg_request(keycache, block, 1); } - } + } /* end of for ( ; cache != end ; cache++) */ + return last_errno; } /* - flush all key blocks for a file to disk, but don't do any mutex locks + Flush all key blocks for a file to disk, but don't do any mutex locks. + SYNOPSIS flush_key_blocks_int() keycache pointer to a key cache data structure file handler for the file to flush to @@ -2339,6 +3594,7 @@ static int flush_key_blocks_int(KEY_CACHE *keycache, { BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache; int last_errno= 0; + int last_errcnt= 0; DBUG_ENTER("flush_key_blocks_int"); DBUG_PRINT("enter",("file: %d blocks_used: %lu blocks_changed: %lu", file, keycache->blocks_used, keycache->blocks_changed)); @@ -2354,9 +3610,11 @@ static int flush_key_blocks_int(KEY_CACHE *keycache, { /* Key cache exists and flush is not disabled */ int error= 0; - uint count= 0; + uint count= FLUSH_CACHE; BLOCK_LINK **pos,**end; BLOCK_LINK *first_in_switch= NULL; + BLOCK_LINK *last_in_flush; + BLOCK_LINK *last_for_update; BLOCK_LINK *block, *next; #if defined(KEYCACHE_DEBUG) uint cnt=0; @@ -2368,20 +3626,27 @@ static int flush_key_blocks_int(KEY_CACHE *keycache, Count how many key blocks we have to cache to be able to flush all dirty pages with minimum seek moves */ + count= 0; for (block= keycache->changed_blocks[FILE_HASH(file)] ; block ; block= block->next_changed) { - if (block->hash_link->file == file) + if ((block->hash_link->file == file) && + !(block->status & BLOCK_IN_FLUSH)) { count++; KEYCACHE_DBUG_ASSERT(count<= keycache->blocks_used); } } - /* Allocate a new buffer only if its bigger than the one we have */ - if (count > FLUSH_CACHE && - !(cache= (BLOCK_LINK**) my_malloc(sizeof(BLOCK_LINK*)*count, - MYF(0)))) + /* + Allocate a new buffer only if its bigger than the one we have. + Assure that we always have some entries for the case that new + changed blocks appear while we need to wait for something. + */ + if ((count <= FLUSH_CACHE) || + ((count > FLUSH_CACHE) && + !(cache= (BLOCK_LINK**) my_malloc(sizeof(BLOCK_LINK*)*count, + MYF(0))))) { cache= cache_buff; count= FLUSH_CACHE; @@ -2390,6 +3655,8 @@ static int flush_key_blocks_int(KEY_CACHE *keycache, /* Retrieve the blocks and write them to a buffer to be flushed */ restart: + last_in_flush= NULL; + last_for_update= NULL; end= (pos= cache)+count; for (block= keycache->changed_blocks[FILE_HASH(file)] ; block ; @@ -2402,121 +3669,328 @@ restart: next= block->next_changed; if (block->hash_link->file == file) { - /* - Mark the block with BLOCK_IN_FLUSH in order not to let - other threads to use it for new pages and interfere with - our sequence ot flushing dirty file pages - */ - block->status|= BLOCK_IN_FLUSH; - - if (! (block->status & BLOCK_IN_SWITCH)) + if (!(block->status & (BLOCK_IN_FLUSH | BLOCK_FOR_UPDATE))) { - /* - We care only for the blocks for which flushing was not - initiated by other threads as a result of page swapping + /* + Note: The special handling of BLOCK_IN_SWITCH is obsolete + since we set BLOCK_IN_FLUSH if the eviction includes a + flush. It can be removed in a later version. */ - reg_requests(keycache, block, 1); - if (type != FLUSH_IGNORE_CHANGED) + if (!(block->status & BLOCK_IN_SWITCH)) { - /* It's not a temporary file */ - if (pos == end) + /* + We care only for the blocks for which flushing was not + initiated by another thread and which are not in eviction. + Registering a request on the block unlinks it from the LRU + ring and protects against eviction. 
+ */ + reg_requests(keycache, block, 1); + if (type != FLUSH_IGNORE_CHANGED) { - /* - This happens only if there is not enough - memory for the big block - */ - if ((error= flush_cached_blocks(keycache, file, cache, - end,type))) - last_errno=error; + /* It's not a temporary file */ + if (pos == end) + { + /* + This should happen relatively seldom. Remove the + request because we won't do anything with the block + but restart and pick it again in the next iteration. + */ + unreg_request(keycache, block, 0); + /* + This happens only if there is not enough + memory for the big block + */ + if ((error= flush_cached_blocks(keycache, file, cache, + end,type))) + { + /* Do not loop infnitely trying to flush in vain. */ + if ((last_errno == error) && (++last_errcnt > 5)) + goto err; + last_errno= error; + } + /* + Restart the scan as some other thread might have changed + the changed blocks chain: the blocks that were in switch + state before the flush started have to be excluded + */ + goto restart; + } /* - Restart the scan as some other thread might have changed - the changed blocks chain: the blocks that were in switch - state before the flush started have to be excluded + Mark the block with BLOCK_IN_FLUSH in order not to let + other threads to use it for new pages and interfere with + our sequence of flushing dirty file pages. We must not + set this flag before actually putting the block on the + write burst array called 'cache'. */ - goto restart; + block->status|= BLOCK_IN_FLUSH; + /* Add block to the array for a write burst. */ + *pos++= block; + } + else + { + /* It's a temporary file */ + DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED)); + /* + free_block() must not be called with BLOCK_CHANGED. Note + that we must not change the BLOCK_CHANGED flag outside of + link_to_file_list() so that it is always in the correct + queue and the *blocks_changed counters are correct. + */ + link_to_file_list(keycache, block, file, 1); + if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) + { + /* A request has been registered against the block above. */ + free_block(keycache, block); + } + else + { + /* + Link the block into the LRU ring if it's the last + submitted request for the block. This enables eviction + for the block. A request has been registered against + the block above. + */ + unreg_request(keycache, block, 1); + } } - *pos++= block; } else { - /* It's a temporary file */ - keycache->blocks_changed--; - keycache->global_blocks_changed--; - free_block(keycache, block); + /* + Link the block into a list of blocks 'in switch'. + + WARNING: Here we introduce a place where a changed block + is not in the changed_blocks hash! This is acceptable for + a BLOCK_IN_SWITCH. Never try this for another situation. + Other parts of the key cache code rely on changed blocks + being in the changed_blocks hash. + */ + unlink_changed(block); + link_changed(block, &first_in_switch); } } else { - /* Link the block into a list of blocks 'in switch' */ - unlink_changed(block); - link_changed(block, &first_in_switch); + if (block->status & BLOCK_IN_FLUSH) + { + /* Remember the last block found to be in flush. */ + last_in_flush= block; + } + else + { + /* Remember the last block found to be selected for update. */ + last_for_update= block; + } } } } if (pos != cache) { if ((error= flush_cached_blocks(keycache, file, cache, pos, type))) + { + /* Do not loop inifnitely trying to flush in vain. 
*/ + if ((last_errno == error) && (++last_errcnt > 5)) + goto err; last_errno= error; + } + /* + While releasing the lock for writing, new blocks may be changed. + This should not happen during resize as no new changed blocks + are accepted. But it can happen during other flushes. Anyway + check again. + */ + goto restart; } - /* Wait until list of blocks in switch is empty */ + if (last_in_flush) + { + /* + There are no blocks to be flushed by this thread, but blocks in + flush by other threads. Wait until one of the blocks is flushed. + */ + wait_on_queue(&last_in_flush->wqueue[COND_FOR_SAVED], + &keycache->cache_lock); + /* Be sure not to lose a block. They may be flushed in random order. */ + goto restart; + } + if (last_for_update) + { + /* + There are no blocks to be flushed by this thread, but blocks for + update by other threads. Wait until one of the blocks is updated. + */ + wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED], + &keycache->cache_lock); + /* The block is now changed. Flush it. */ + goto restart; + } + + /* + Wait until the list of blocks in switch is empty. The threads that + are switching these blocks will relink them to clean file chains + while we wait and thus empty the 'first_in_switch' chain. + */ while (first_in_switch) { #if defined(KEYCACHE_DEBUG) cnt= 0; #endif - block= first_in_switch; - { -#ifdef THREAD - struct st_my_thread_var *thread= my_thread_var; - add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); - do - { - KEYCACHE_DBUG_PRINT("flush_key_blocks_int: wait", - ("suspend thread %ld", thread->id)); - keycache_pthread_cond_wait(&thread->suspend, - &keycache->cache_lock); - } - while (thread->next); -#else - KEYCACHE_DBUG_ASSERT(0); - /* No parallel requests in single-threaded case */ -#endif - } + wait_on_queue(&first_in_switch->wqueue[COND_FOR_SAVED], + &keycache->cache_lock); #if defined(KEYCACHE_DEBUG) cnt++; KEYCACHE_DBUG_ASSERT(cnt <= keycache->blocks_used); #endif + /* While waiting here, we might have got another changed block. */ + goto restart; } - /* The following happens very seldom */ + if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) { -#if defined(KEYCACHE_DEBUG) - cnt=0; -#endif - for (block= keycache->file_blocks[FILE_HASH(file)] ; - block ; - block= next) + BLOCK_LINK *last_for_update= NULL; + BLOCK_LINK *last_in_switch= NULL; + uint total_found= 0; + uint found; + + /* + Finally free all clean blocks for this file. + During resize this may be run by two threads in parallel. + */ + do { -#if defined(KEYCACHE_DEBUG) - cnt++; - KEYCACHE_DBUG_ASSERT(cnt <= keycache->blocks_used); -#endif - next= block->next_changed; - if (block->hash_link->file == file && - (! (block->status & BLOCK_CHANGED) - || type == FLUSH_IGNORE_CHANGED)) + found= 0; + for (block= keycache->file_blocks[FILE_HASH(file)] ; + block ; + block= next) { - reg_requests(keycache, block, 1); - free_block(keycache, block); - } + /* Remember the next block. After freeing we cannot get at it. */ + next= block->next_changed; + + /* Changed blocks cannot appear in the file_blocks hash. */ + DBUG_ASSERT(!(block->status & BLOCK_CHANGED)); + if (block->hash_link->file == file) + { + /* We must skip blocks that will be changed. */ + if (block->status & BLOCK_FOR_UPDATE) + { + last_for_update= block; + continue; + } + + /* + We must not free blocks in eviction (BLOCK_IN_EVICTION | + BLOCK_IN_SWITCH) or blocks intended to be freed + (BLOCK_REASSIGNED). 
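
[Editorial sketch, not part of the patch] last_in_flush and last_for_update above implement a scan-wait-restart idiom: if nothing is left for this thread but other threads still work on the chain, wait on one representative block and rescan from the top. A hypothetical miniature (nodes are assumed to stay allocated while someone waits on them):

#include <pthread.h>

struct node { unsigned busy; struct node *next; pthread_cond_t done; };

static void drain(struct node **head, pthread_mutex_t *lk)
{
  pthread_mutex_lock(lk);
restart:
  {
    struct node *waiter= 0;
    struct node *n;
    for (n= *head; n; n= n->next)
    {
      if (n->busy)
        waiter= n;          /* remember one representative */
      /* else: handle n ourselves ... */
    }
    if (waiter)
    {
      /* Completion order is not deterministic: rescan after each wake. */
      pthread_cond_wait(&waiter->done, lk);
      goto restart;
    }
  }
  pthread_mutex_unlock(lk);
}
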
+ */ + if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH | + BLOCK_REASSIGNED))) + { + struct st_hash_link *next_hash_link; + my_off_t next_diskpos; + File next_file; + uint next_status; + uint hash_requests; + + total_found++; + found++; + KEYCACHE_DBUG_ASSERT(found <= keycache->blocks_used); + + /* + Register a request. This unlinks the block from the LRU + ring and protects it against eviction. This is required + by free_block(). + */ + reg_requests(keycache, block, 1); + + /* + free_block() may need to wait for readers of the block. + This is the moment where the other thread can move the + 'next' block from the chain. free_block() needs to wait + if there are requests for the block pending. + */ + if (next && (hash_requests= block->hash_link->requests)) + { + /* Copy values from the 'next' block and its hash_link. */ + next_status= next->status; + next_hash_link= next->hash_link; + next_diskpos= next_hash_link->diskpos; + next_file= next_hash_link->file; + DBUG_ASSERT(next == next_hash_link->block); + } + + free_block(keycache, block); + /* + If we had to wait and the state of the 'next' block + changed, break the inner loop. 'next' may no longer be + part of the current chain. + + We do not want to break the loop after every free_block(), + not even only after waits. The chain might be quite long + and contain blocks for many files. Traversing it again and + again to find more blocks for this file could become quite + inefficient. + */ + if (next && hash_requests && + ((next_status != next->status) || + (next_hash_link != next->hash_link) || + (next_file != next_hash_link->file) || + (next_diskpos != next_hash_link->diskpos) || + (next != next_hash_link->block))) + break; + } + else + { + last_in_switch= block; + } + } + } /* end for block in file_blocks */ + } while (found); + + /* + If any clean block has been found, we may have waited for it to + become free. In this case it could be possible that another clean + block became dirty. This is possible if the write request existed + before the flush started (BLOCK_FOR_UPDATE). Re-check the hashes. + */ + if (total_found) + goto restart; + + /* + To avoid an infinite loop wait until one of the blocks marked + for update is updated. + */ + if (last_for_update) + { + /* We did not wait. Block must not have changed status. */ + DBUG_ASSERT(last_for_update->status & BLOCK_FOR_UPDATE); + wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED], + &keycache->cache_lock); + goto restart; } - } - } + + /* + To avoid an infinite loop wait until one of the blocks marked + for eviction is switched. + */ + if (last_in_switch) + { + /* We did not wait. Block must not have changed status. */ + DBUG_ASSERT(last_in_switch->status & (BLOCK_IN_EVICTION | + BLOCK_IN_SWITCH | + BLOCK_REASSIGNED)); + wait_on_queue(&last_in_switch->wqueue[COND_FOR_SAVED], + &keycache->cache_lock); + goto restart; + } + + } /* if (! 
(type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) */ + + } /* if (keycache->disk_blocks > 0 */ #ifndef DBUG_OFF DBUG_EXECUTE("check_keycache", test_key_cache(keycache, "end of flush_key_blocks", 0);); #endif +err: if (cache != cache_buff) my_free((gptr) cache, MYF(0)); if (last_errno) @@ -2543,16 +4017,21 @@ restart: int flush_key_blocks(KEY_CACHE *keycache, File file, enum flush_type type) { - int res; + int res= 0; DBUG_ENTER("flush_key_blocks"); DBUG_PRINT("enter", ("keycache: 0x%lx", (long) keycache)); - if (keycache->disk_blocks <= 0) + if (!keycache->key_cache_inited) DBUG_RETURN(0); + keycache_pthread_mutex_lock(&keycache->cache_lock); - inc_counter_for_resize_op(keycache); - res= flush_key_blocks_int(keycache, file, type); - dec_counter_for_resize_op(keycache); + /* While waiting for lock, keycache could have been ended. */ + if (keycache->disk_blocks > 0) + { + inc_counter_for_resize_op(keycache); + res= flush_key_blocks_int(keycache, file, type); + dec_counter_for_resize_op(keycache); + } keycache_pthread_mutex_unlock(&keycache->cache_lock); DBUG_RETURN(res); } @@ -2564,30 +4043,101 @@ int flush_key_blocks(KEY_CACHE *keycache, static int flush_all_key_blocks(KEY_CACHE *keycache) { -#if defined(KEYCACHE_DEBUG) - uint cnt=0; -#endif - while (keycache->blocks_changed > 0) + BLOCK_LINK *block; + uint total_found; + uint found; + uint idx; + DBUG_ENTER("flush_all_key_blocks"); + + safe_mutex_assert_owner(&keycache->cache_lock); + + do { - BLOCK_LINK *block; - for (block= keycache->used_last->next_used ; ; block=block->next_used) + total_found= 0; + + /* Flush all changed blocks first. */ + do { - if (block->hash_link) + found= 0; + /* Step over the whole changed_blocks hash array. */ + for (idx= 0; idx < CHANGED_BLOCKS_HASH; idx++) { -#if defined(KEYCACHE_DEBUG) - cnt++; - KEYCACHE_DBUG_ASSERT(cnt <= keycache->blocks_used); -#endif - if (flush_key_blocks_int(keycache, block->hash_link->file, - FLUSH_RELEASE)) - return 1; - break; + /* + If an array element is non-empty, use the first block from its + chain to find a file for flush. All blocks for this file are + flushed. So the same block will not appear at this place again + with the next iteration. New writes for blocks are not accepted + during the flush. + */ + if ((block= keycache->changed_blocks[idx])) + { + /* A block in the changed_blocks hash must have a hash_link. */ + DBUG_ASSERT(block->hash_link); + DBUG_ASSERT(block->hash_link->block == block); + + found++; + /* + Flush dirty blocks but do not free them yet. They can be used + for reading until all other blocks are flushed too. + */ + if (flush_key_blocks_int(keycache, block->hash_link->file, + FLUSH_FORCE_WRITE)) + DBUG_RETURN(1); + } } - if (block == keycache->used_last) - break; - } + + } while (found); + + /* Now flush (free) all clean blocks. */ + do + { + found= 0; + /* Step over the whole file_blocks hash array. */ + for (idx= 0; idx < CHANGED_BLOCKS_HASH; idx++) + { + /* + If an array element is non-empty, use the first block from its + chain to find a file for flush. All blocks for this file are + freed. So the same block will not appear at this place again + with the next iteration. Unless it has been read into the cache + anew. In this case readers and the flusher fight against each + other. But since the flusher does not need to do I/O for clean + blocks, and writes for blocks are not accepted during the flush, + it will win finally. + */ + if ((block= keycache->file_blocks[idx])) + { + /* A block in the file_blocks hash must have a hash_link. 
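
[Editorial sketch, not part of the patch] flush_all_key_blocks() above is a two-phase sweep over the two bucket arrays, repeated until a full pass finds nothing. The control flow, reduced to a hypothetical miniature (flush_file() stands in for flush_key_blocks_int()):

#define BUCKETS 128
struct hb { int file; struct hb *next; };
extern struct hb *changed[BUCKETS], *clean[BUCKETS];
extern int flush_file(int file, int release);

static int flush_everything(void)
{
  unsigned total, found, i;
  do
  {
    total= 0;
    do        /* phase 1: write out dirty blocks, do not free them yet */
    {
      found= 0;
      for (i= 0; i < BUCKETS; i++)
        if (changed[i] && (found++, flush_file(changed[i]->file, 0)))
          return 1;
    } while (found);
    do        /* phase 2: free the clean blocks */
    {
      found= 0;
      for (i= 0; i < BUCKETS; i++)
        if (clean[i] && (found++, total++, flush_file(clean[i]->file, 1)))
          return 1;
    } while (found);
  } while (total);    /* a freed block may have turned dirty: re-check */
  return 0;
}
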
*/
+          DBUG_ASSERT(block->hash_link);
+
+          total_found++;
+          found++;
+          if (flush_key_blocks_int(keycache, block->hash_link->file,
+                                   FLUSH_RELEASE))
+            DBUG_RETURN(1);
+        }
+      }
+
+    } while (found);
+
+    /*
+      If any clean block has been found, we may have waited for it to
+      become free. In this case it could be possible that another clean
+      block became dirty. This is possible if the write request existed
+      before the resize started (BLOCK_FOR_UPDATE). Re-check the hashes.
+    */
+  } while (total_found);
+
+#ifndef DBUG_OFF
+  /* Now no block should exist any more. */
+  for (idx= 0; idx < CHANGED_BLOCKS_HASH; idx++)
+  {
+    DBUG_ASSERT(!keycache->changed_blocks[idx]);
+    DBUG_ASSERT(!keycache->file_blocks[idx]);
  }
-  return 0;
+#endif
+
+  DBUG_RETURN(0);
}
@@ -2607,7 +4157,8 @@ static int flush_all_key_blocks(KEY_CACHE *keycache)
     0 on success (always because it can't fail)
 */
-int reset_key_cache_counters(const char *name, KEY_CACHE *key_cache)
+int reset_key_cache_counters(const char *name __attribute__((unused)),
+                             KEY_CACHE *key_cache)
{
  DBUG_ENTER("reset_key_cache_counters");
  if (!key_cache->key_cache_inited)
@@ -2855,3 +4406,74 @@ void keycache_debug_log_close(void)
 #endif /* defined(KEYCACHE_DEBUG_LOG) */
 #endif /* defined(KEYCACHE_DEBUG) */
+
+#if !defined(DBUG_OFF)
+#define F_B_PRT(_f_, _v_) fprintf(stderr, "Assert fails: " _f_, _v_)
+
+static int fail_block(BLOCK_LINK *block)
+{
+  fprintf(stderr, "\n");
+  F_B_PRT("block->next_used: %lx\n", (ulong) block->next_used);
+  F_B_PRT("block->prev_used: %lx\n", (ulong) block->prev_used);
+  F_B_PRT("block->next_changed: %lx\n", (ulong) block->next_changed);
+  F_B_PRT("block->prev_changed: %lx\n", (ulong) block->prev_changed);
+  F_B_PRT("block->hash_link: %lx\n", (ulong) block->hash_link);
+  F_B_PRT("block->status: %u\n", block->status);
+  F_B_PRT("block->length: %u\n", block->length);
+  F_B_PRT("block->offset: %u\n", block->offset);
+  F_B_PRT("block->requests: %u\n", block->requests);
+  F_B_PRT("block->temperature: %u\n", block->temperature);
+  fprintf(stderr, "\n");
+  return 0; /* Let the assert fail. */
+}
+
+static int fail_hlink(HASH_LINK *hlink)
+{
+  fprintf(stderr, "\n");
+  F_B_PRT("hlink->next: %lx\n", (ulong) hlink->next);
+  F_B_PRT("hlink->prev: %lx\n", (ulong) hlink->prev);
+  F_B_PRT("hlink->block: %lx\n", (ulong) hlink->block);
+  F_B_PRT("hlink->diskpos: %lu\n", (ulong) hlink->diskpos);
+  F_B_PRT("hlink->file: %d\n", hlink->file);
+  fprintf(stderr, "\n");
+  return 0; /* Let the assert fail. 
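+     Returning zero lets these helpers be used in boolean context,
+     e.g. DBUG_ASSERT(cond || fail_hlink(hlink)), so that the state
+     is dumped right before the assertion aborts.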
*/ +} + +static int cache_empty(KEY_CACHE *keycache) +{ + int errcnt= 0; + int idx; + if (keycache->disk_blocks <= 0) + return 1; + for (idx= 0; idx < keycache->disk_blocks; idx++) + { + BLOCK_LINK *block= keycache->block_root + idx; + if (block->status || block->requests || block->hash_link) + { + fprintf(stderr, "block index: %u\n", idx); + fail_block(block); + errcnt++; + } + } + for (idx= 0; idx < keycache->hash_links; idx++) + { + HASH_LINK *hash_link= keycache->hash_link_root + idx; + if (hash_link->block || hash_link->file || hash_link->diskpos) + { + fprintf(stderr, "hash_link index: %u\n", idx); + fail_hlink(hash_link); + errcnt++; + } + } + if (errcnt) + { + fprintf(stderr, "blocks: %d used: %lu\n", + keycache->disk_blocks, keycache->blocks_used); + fprintf(stderr, "hash_links: %d used: %d\n", + keycache->hash_links, keycache->hash_links_used); + fprintf(stderr, "\n"); + } + return !errcnt; +} +#endif + diff --git a/mysys/my_static.c b/mysys/my_static.c index 95521c49ab7..cb2dd84b804 100644 --- a/mysys/my_static.c +++ b/mysys/my_static.c @@ -48,9 +48,6 @@ struct st_remember _my_sig_remember[MAX_SIGNALS]={{0,0}}; sigset_t my_signals; /* signals blocked by mf_brkhant */ #endif - /* from mf_keycache.c */ -my_bool key_cache_inited=0; - /* from mf_reccache.c */ ulong my_default_record_cache_size=RECORD_CACHE_SIZE; diff --git a/sql/handler.cc b/sql/handler.cc index 360d528f0ad..a9a0c3794a4 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -2716,8 +2716,8 @@ int ha_init_key_cache(const char *name, KEY_CACHE *key_cache) if (!key_cache->key_cache_inited) { pthread_mutex_lock(&LOCK_global_system_variables); - long tmp_buff_size= (long) key_cache->param_buff_size; - long tmp_block_size= (long) key_cache->param_block_size; + ulong tmp_buff_size= (ulong) key_cache->param_buff_size; + uint tmp_block_size= (uint) key_cache->param_block_size; uint division_limit= key_cache->param_division_limit; uint age_threshold= key_cache->param_age_threshold; pthread_mutex_unlock(&LOCK_global_system_variables); diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 35a5547a730..7ecb199a50b 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -4575,8 +4575,13 @@ int reassign_keycache_tables(THD *thd, KEY_CACHE *src_cache, bool mysql_preload_keys(THD* thd, TABLE_LIST* tables) { DBUG_ENTER("mysql_preload_keys"); + /* + We cannot allow concurrent inserts. The storage engine reads + directly from the index file, bypassing the cache. It could read + outdated information if parallel inserts into cache blocks happen. 
+ */ DBUG_RETURN(mysql_admin_table(thd, tables, 0, - "preload_keys", TL_READ, 0, 0, 0, 0, + "preload_keys", TL_READ_NO_INSERT, 0, 0, 0, 0, &handler::preload_keys, 0)); } diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc index 397856a4a4e..9f9fce98b43 100644 --- a/storage/myisam/ha_myisam.cc +++ b/storage/myisam/ha_myisam.cc @@ -858,6 +858,7 @@ int ha_myisam::preload_keys(THD* thd, HA_CHECK_OPT *check_opt) ulonglong map= ~(ulonglong) 0; TABLE_LIST *table_list= table->pos_in_table_list; my_bool ignore_leaves= table_list->ignore_leaves; + char buf[ERRMSGSIZE+20]; DBUG_ENTER("ha_myisam::preload_keys"); @@ -889,7 +890,6 @@ int ha_myisam::preload_keys(THD* thd, HA_CHECK_OPT *check_opt) errmsg= "Failed to allocate buffer"; break; default: - char buf[ERRMSGSIZE+20]; my_snprintf(buf, ERRMSGSIZE, "Failed to read from index file (errno: %d)", my_errno); errmsg= buf; diff --git a/storage/myisam/mi_preload.c b/storage/myisam/mi_preload.c index 78729f18424..fd6e99c6bc3 100644 --- a/storage/myisam/mi_preload.c +++ b/storage/myisam/mi_preload.c @@ -58,12 +58,39 @@ int mi_preload(MI_INFO *info, ulonglong key_map, my_bool ignore_leaves) /* Check whether all indexes use the same block size */ for (i= 1 ; i < keys ; i++) { +#if !defined(INGO_TEST_LOADIDX_OFF) + /* Allow non-IGNORE-LEAVES index loading even with different block sizes. */ + if (ignore_leaves && (keyinfo[i].block_length != block_length)) + DBUG_RETURN(my_errno= HA_ERR_NON_UNIQUE_BLOCK_SIZE); + set_if_bigger(block_length, keyinfo[i].block_length); +#else if (keyinfo[i].block_length != block_length) DBUG_RETURN(my_errno= HA_ERR_NON_UNIQUE_BLOCK_SIZE); +#endif } +#if !defined(INGO_TEST_LOADIDX_OFF) + /* Align non-IGNORE-LEAVES index loads. */ + if (!ignore_leaves) + { + /* Round up to the next multiple of key_cache_block_size. */ + length= ((info->preload_buff_size + + share->key_cache->key_cache_block_size - 1) / + share->key_cache->key_cache_block_size * + share->key_cache->key_cache_block_size); + /* Round down to the next multiple of key_cache_block_size. */ + pos= (share->base.keystart / share->key_cache->key_cache_block_size * + share->key_cache->key_cache_block_size); + } + else + { + length= info->preload_buff_size/block_length * block_length; + set_if_bigger(length, block_length); + } +#else length= info->preload_buff_size/block_length * block_length; set_if_bigger(length, block_length); +#endif if (!(buff= (uchar *) my_malloc(length, MYF(MY_WME)))) DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM); From 942cd7d42448e9ad0128fb99ac1267638511899d Mon Sep 17 00:00:00 2001 From: "istruewing@chilla.local" <> Date: Fri, 23 Mar 2007 11:52:45 +0100 Subject: [PATCH 02/40] Bug#17332 - changing key_buffer_size on a running server can crash under load After review fixes --- mysys/mf_keycache.c | 219 ++++++++++++++---------------------- storage/myisam/mi_preload.c | 40 ++----- 2 files changed, 95 insertions(+), 164 deletions(-) diff --git a/mysys/mf_keycache.c b/mysys/mf_keycache.c index 332b2b29469..651a2b1070a 100644 --- a/mysys/mf_keycache.c +++ b/mysys/mf_keycache.c @@ -52,8 +52,8 @@ '*next' and '**prev' pointer. These pointers are used to insert the thread into a wait queue. - NOTE: Since there is only one pair of queue pointers per thread, a - thread can be in one wait queue only. + A thread can wait for one block and thus be in one wait queue at a + time only. 
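+
+  A minimal sketch of the pattern, using the queue functions defined
+  further down in this file (illustration only):
+
+    wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock);
+      - links my_thread_var into the queue, then waits on
+        thread->suspend; cache_lock is released while waiting
+
+    release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
+      - signals thread->suspend for every queued thread and
+        unlinks it from the queue
+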
Before starting to wait on its condition variable with
   pthread_cond_wait(), the thread enters itself to a specific wait queue
@@ -69,13 +69,18 @@
   unlink_from_queue() or release_whole_queue() respectively, or the waiting
   thread removes itself.
 
-  There is one exception from this locking scheme. Each block has a
+  There is one exception from this locking scheme when one thread wants
+  to reuse a block for some other address. This works by first marking
+  the block reserved (status= BLOCK_IN_SWITCH) and then waiting for all
+  threads that are reading the block to finish. Each block has a
   reference to a condition variable (condvar). It holds a reference to
-  the thread->suspend condition variable, if that thread is waiting for
-  the block. When that thread is signalled, the reference is cleared.
-  This is similar to the above, but it clearly means that only one
-  thread can wait for a particular block. There is no queue in this
-  case. Strangely enough block->convar is used for waiting for the
+  the thread->suspend condition variable for the waiting thread (if such
+  a thread exists). When that thread is signaled, the reference is
+  cleared. The number of readers of a block is registered in
+  block->hash_link->requests. See wait_for_readers() / remove_reader()
+  for details. This is similar to the above, but it clearly means that
+  only one thread can wait for a particular block. There is no queue in
+  this case. Strangely enough block->condvar is used for waiting for the
   assigned hash_link only. More precisely it is used to wait for all
   requests to be unregistered from the assigned hash_link.
 
@@ -219,8 +224,8 @@ static void wait_on_queue(KEYCACHE_WQUEUE *wqueue, pthread_mutex_t *mutex);
 static void release_whole_queue(KEYCACHE_WQUEUE *wqueue);
 #else
-#define wait_on_queue(wqueue, mutex)    KEYCACHE_DBUG_ASSERT(0);
-#define release_whole_queue(wqueue)     /* release_whole_queue() */
+#define wait_on_queue(wqueue, mutex)    do {} while (0)
+#define release_whole_queue(wqueue)     do {} while (0)
 #endif
 static void free_block(KEY_CACHE *keycache, BLOCK_LINK *block);
 #if !defined(DBUG_OFF)
@@ -378,6 +383,7 @@ int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
   keycache->disk_blocks= -1;
   if (! keycache->key_cache_inited)
   {
+    keycache->key_cache_inited= 1;
     /*
       Initialize these variables once only.
      Their value must survive re-initialization during resizing.
@@ -389,8 +395,6 @@ int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
     keycache->in_init= 0;
     pthread_mutex_init(&keycache->cache_lock, MY_MUTEX_INIT_FAST);
     keycache->resize_queue.last_thread= NULL;
-    /* Initialize this after the mutex. It is read asynchronously. */
-    keycache->key_cache_inited= 1;
   }
   keycache->key_cache_mem_size= use_mem;
@@ -402,7 +406,6 @@ int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
   blocks= (uint) (use_mem / (sizeof(BLOCK_LINK) + 2 * sizeof(HASH_LINK) +
                              sizeof(HASH_LINK*) * 5/4 + key_cache_block_size));
   /* It doesn't make sense to have too few blocks (less than 8) */
-  /* Comment to be deleted: disk_blocks is set to -1 above unconditionally. */
   if (blocks >= 8)
   {
     for ( ; ; )
@@ -623,9 +626,7 @@ int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
     run in parallel with normal cache operation.
   */
   while (keycache->cnt_for_resize_op)
-  {
     wait_on_queue(&keycache->waiting_for_resize_cnt, &keycache->cache_lock);
-  }
 #else
   KEYCACHE_DBUG_ASSERT(keycache->cnt_for_resize_op == 0);
 #endif
@@ -648,10 +649,8 @@ finish:
   */
   keycache->in_resize= 0;
 
-#ifdef THREAD
   /* Signal waiting threads. 
*/ release_whole_queue(&keycache->resize_queue); -#endif keycache_pthread_mutex_unlock(&keycache->cache_lock); DBUG_RETURN(blocks); @@ -673,12 +672,8 @@ static inline void inc_counter_for_resize_op(KEY_CACHE *keycache) */ static inline void dec_counter_for_resize_op(KEY_CACHE *keycache) { -#ifdef THREAD if (!--keycache->cnt_for_resize_op) release_whole_queue(&keycache->waiting_for_resize_cnt); -#else - keycache->cnt_for_resize_op--; -#endif } /* @@ -785,9 +780,6 @@ void end_key_cache(KEY_CACHE *keycache, my_bool cleanup) Queue is represented by a circular list of the thread structures The list is double-linked of the type (**prev,*next), accessed by a pointer to the last element. - - Since there is only one pair of queue pointers per thread, a - thread can be part of one wait queue only. */ static void link_into_queue(KEYCACHE_WQUEUE *wqueue, @@ -870,9 +862,6 @@ static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue, The list is single-linked of the type (*next), accessed by a pointer to the last element. - Since there is only one pair of queue pointers per thread, a - thread can be part of one wait queue only. - The function protects against stray signals by verifying that the current thread is unlinked from the queue when awaking. However, since several threads can wait for the same event, it might be @@ -907,7 +896,7 @@ static void wait_on_queue(KEYCACHE_WQUEUE *wqueue, KEYCACHE_DBUG_PRINT("wait", ("suspend thread %ld", thread->id)); keycache_pthread_cond_wait(&thread->suspend, mutex); } - while(thread->next); + while (thread->next); } @@ -1448,9 +1437,6 @@ static void wait_for_readers(KEY_CACHE *keycache, block->condvar= &thread->suspend; keycache_pthread_cond_wait(&thread->suspend, &keycache->cache_lock); block->condvar= NULL; - /* The other thread might have freed the block in between. */ - if (!block->hash_link) - break; } #else KEYCACHE_DBUG_ASSERT(block->hash_link->requests == 0); @@ -1738,12 +1724,13 @@ restart: we did not release cache_lock since increasing it. So no other thread can wait for our request to become released. */ - if (!--hash_link->requests) + if (hash_link->requests == 1) { /* We are the only one to request this hash_link (this file/pos). Free the hash_link. */ + hash_link->requests--; unlink_hash(keycache, hash_link); DBUG_RETURN(0); } @@ -1759,7 +1746,6 @@ restart: Refresh the request on the hash-link so that it cannot be reused for another file/pos. */ - hash_link->requests++; thread= my_thread_var; thread->opt_info= (void *) hash_link; link_into_queue(&keycache->waiting_for_block, thread); @@ -1936,15 +1922,11 @@ restart: /* The block is still assigned to the hash_link (the file/pos that - we are goig to write to). Wait until the eviction/free is + we are going to write to). Wait until the eviction/free is complete. Otherwise the direct write could complete before all readers are done with the block. So they could read outdated data. - Comment to be deleted: This was the reason why I experienced - index corruptions during resize. Since I introduced the wait - loop here, they are gone. - Since we released our request on the hash_link, it can be reused for another file/pos. Hence we cannot just check for block->hash_link == hash_link. 
As long as the resize is @@ -2052,12 +2034,6 @@ restart: block= keycache->free_block_list; keycache->free_block_list= block->next_used; block->next_used= NULL; - DBUG_ASSERT(!block->prev_used); - DBUG_ASSERT(!block->next_changed); - DBUG_ASSERT(!block->prev_changed); - DBUG_ASSERT(!block->hash_link); - DBUG_ASSERT(!block->status); - DBUG_ASSERT(!block->requests); } else { @@ -2070,13 +2046,13 @@ restart: byte*); keycache->blocks_used++; DBUG_ASSERT(!block->next_used); - DBUG_ASSERT(!block->prev_used); - DBUG_ASSERT(!block->next_changed); - DBUG_ASSERT(!block->prev_changed); - DBUG_ASSERT(!block->hash_link); - DBUG_ASSERT(!block->status); - DBUG_ASSERT(!block->requests); } + DBUG_ASSERT(!block->prev_used); + DBUG_ASSERT(!block->next_changed); + DBUG_ASSERT(!block->prev_changed); + DBUG_ASSERT(!block->hash_link); + DBUG_ASSERT(!block->status); + DBUG_ASSERT(!block->requests); keycache->blocks_unused--; block->status= BLOCK_IN_USE; block->length= 0; @@ -2192,7 +2168,7 @@ restart: it could happen that we write the block, reassign it to another file block, then, before the new owner can read the new file block, the flusher writes the cache block - (wich still has the old contents) to the new file block! + (which still has the old contents) to the new file block! */ wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock); @@ -2326,11 +2302,6 @@ restart: PAGE_READ : PAGE_WAIT_TO_BE_READ); } } - /* - Comment to be deleted: keycache->global_cache_read++; moved to - read_block(). At this place it was counted for primary and - secondary requests. Better count it where the actual read is done. - */ } else { @@ -2448,11 +2419,6 @@ static void read_block(KEY_CACHE *keycache, KEYCACHE_DBUG_PRINT("read_block", ("page to be read by primary request")); - /* - Comment to be deleted: keycache->global_cache_read++; moved here - from find_key_block(). At this place it counts primary requests - only. - */ keycache->global_cache_read++; /* Page is not in buffer yet, is to be read from disk */ keycache_pthread_mutex_unlock(&keycache->cache_lock); @@ -2479,7 +2445,6 @@ static void read_block(KEY_CACHE *keycache, block->status|= BLOCK_ERROR; else { - /* Comment to be deleted: Do not kill other block status flags. */ block->status|= BLOCK_READ; block->length= got_length; /* @@ -2548,7 +2513,7 @@ byte *key_cache_read(KEY_CACHE *keycache, uint block_length __attribute__((unused)), int return_buffer __attribute__((unused))) { - my_bool incremented= FALSE; + my_bool locked_and_incremented= FALSE; int error=0; byte *start= buff; DBUG_ENTER("key_cache_read"); @@ -2589,7 +2554,7 @@ byte *key_cache_read(KEY_CACHE *keycache, wait_on_queue(&keycache->resize_queue, &keycache->cache_lock); /* Register the I/O for the next resize. */ inc_counter_for_resize_op(keycache); - incremented= TRUE; + locked_and_incremented= TRUE; /* Requested data may not always be aligned to cache blocks. 
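     E.g. with a 1024 byte key_cache_block_size, filepos 1536 gives
     offset 512, so the first iteration covers only the remainder of
     that block.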
*/ offset= (uint) (filepos & (keycache->key_cache_block_size-1)); /* Read data in key_cache_block_size increments */ @@ -2622,15 +2587,12 @@ byte *key_cache_read(KEY_CACHE *keycache, */ keycache->global_cache_read++; keycache_pthread_mutex_unlock(&keycache->cache_lock); - if (my_pread(file, (byte*) buff, read_length, - filepos + offset, MYF(MY_NABP))) - { - error= 1; - } + error= (my_pread(file, (byte*) buff, read_length, + filepos + offset, MYF(MY_NABP)) != 0); keycache_pthread_mutex_lock(&keycache->cache_lock); goto next_block; } - if (block->status != BLOCK_ERROR) + if (!(block->status & BLOCK_ERROR)) { if (page_st != PAGE_READ) { @@ -2661,7 +2623,7 @@ byte *key_cache_read(KEY_CACHE *keycache, } /* block status may have added BLOCK_ERROR in the above 'if'. */ - if (! ((status= block->status) & BLOCK_ERROR)) + if (!((status= block->status) & BLOCK_ERROR)) { #ifndef THREAD if (! return_buffer) @@ -2716,23 +2678,17 @@ byte *key_cache_read(KEY_CACHE *keycache, no_key_cache: /* Key cache is not used */ - if (keycache->key_cache_inited && !incremented) - { - keycache_pthread_mutex_lock(&keycache->cache_lock); - inc_counter_for_resize_op(keycache); - incremented= TRUE; - } keycache->global_cache_r_requests++; keycache->global_cache_read++; - if (incremented) + if (locked_and_incremented) keycache_pthread_mutex_unlock(&keycache->cache_lock); if (my_pread(file, (byte*) buff, length, filepos, MYF(MY_NABP))) error= 1; - if (incremented) + if (locked_and_incremented) keycache_pthread_mutex_lock(&keycache->cache_lock); end: - if (incremented) + if (locked_and_incremented) { dec_counter_for_resize_op(keycache); keycache_pthread_mutex_unlock(&keycache->cache_lock); @@ -2777,7 +2733,7 @@ int key_cache_insert(KEY_CACHE *keycache, uint read_length; uint offset; int page_st; - my_bool incremented= FALSE; + my_bool locked_and_incremented= FALSE; /* When the keycache is once initialized, we use the cache_lock to @@ -2794,7 +2750,7 @@ int key_cache_insert(KEY_CACHE *keycache, goto no_key_cache; /* Register the pseudo I/O for the next resize. */ inc_counter_for_resize_op(keycache); - incremented= TRUE; + locked_and_incremented= TRUE; /* Loaded data may not always be aligned to cache blocks. */ offset= (uint) (filepos & (keycache->key_cache_block_size-1)); /* Load data in key_cache_block_size increments. */ @@ -2824,7 +2780,7 @@ int key_cache_insert(KEY_CACHE *keycache, */ goto no_key_cache; } - if (block->status != BLOCK_ERROR) + if (!(block->status & BLOCK_ERROR)) { if ((page_st == PAGE_WAIT_TO_BE_READ) || ((page_st == PAGE_TO_BE_READ) && @@ -2857,14 +2813,6 @@ int key_cache_insert(KEY_CACHE *keycache, Though reading again what the caller did read already is an expensive operation, we need to do this for correctness. */ -#if !defined(INGO_TEST_LOADIDX_OFF) - /* - Note that this happen only for key_cache_block_size > - MI_MIN_KEY_BLOCK_LENGTH *and* LOAD INDEX INTO CACHE ... - IGNORE LEAVES. Otherwise mi_preload() supplies this function - with aligned blocks. 
- */ -#endif read_block(keycache, block, keycache->key_cache_block_size, read_length + offset, (page_st == PAGE_TO_BE_READ)); /* @@ -2950,7 +2898,7 @@ int key_cache_insert(KEY_CACHE *keycache, DBUG_ASSERT(block->hash_link->file == file); DBUG_ASSERT(block->hash_link->diskpos == filepos); DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE)); - } /* end of if (block->status != BLOCK_ERROR) */ + } /* end of if (!(block->status & BLOCK_ERROR)) */ remove_reader(block); @@ -2972,7 +2920,7 @@ int key_cache_insert(KEY_CACHE *keycache, } while ((length-= read_length)); no_key_cache: - if (incremented) + if (locked_and_incremented) dec_counter_for_resize_op(keycache); keycache_pthread_mutex_unlock(&keycache->cache_lock); } @@ -3016,7 +2964,7 @@ int key_cache_write(KEY_CACHE *keycache, uint block_length __attribute__((unused)), int dont_write) { - my_bool incremented= FALSE; + my_bool locked_and_incremented= FALSE; int error=0; DBUG_ENTER("key_cache_write"); DBUG_PRINT("enter", @@ -3075,7 +3023,7 @@ int key_cache_write(KEY_CACHE *keycache, wait_on_queue(&keycache->resize_queue, &keycache->cache_lock); /* Register the I/O for the next resize. */ inc_counter_for_resize_op(keycache); - incremented= TRUE; + locked_and_incremented= TRUE; /* Requested data may not always be aligned to cache blocks. */ offset= (uint) (filepos & (keycache->key_cache_block_size-1)); /* Write data in key_cache_block_size increments. */ @@ -3133,7 +3081,7 @@ int key_cache_write(KEY_CACHE *keycache, wait for the other thread to complete the read of this block. read_block() takes care for the wait. */ - if (block->status != BLOCK_ERROR && + if (!(block->status & BLOCK_ERROR) && ((page_st == PAGE_TO_BE_READ && (offset || read_length < keycache->key_cache_block_size)) || (page_st == PAGE_WAIT_TO_BE_READ))) @@ -3197,7 +3145,7 @@ int key_cache_write(KEY_CACHE *keycache, the buffer would be read/written. An attempt to flush during memcpy() is prevented with BLOCK_FOR_UPDATE. */ - if (! (block->status & BLOCK_ERROR)) + if (!(block->status & BLOCK_ERROR)) { #if !defined(SERIALIZED_READ_FROM_CACHE) keycache_pthread_mutex_unlock(&keycache->cache_lock); @@ -3271,25 +3219,19 @@ no_key_cache: if (dont_write) { /* Used in the server. */ - if (keycache->key_cache_inited && !incremented) - { - keycache_pthread_mutex_lock(&keycache->cache_lock); - inc_counter_for_resize_op(keycache); - incremented= TRUE; - } keycache->global_cache_w_requests++; keycache->global_cache_write++; - if (incremented) + if (locked_and_incremented) keycache_pthread_mutex_unlock(&keycache->cache_lock); if (my_pwrite(file, (byte*) buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL))) error=1; - if (incremented) + if (locked_and_incremented) keycache_pthread_mutex_lock(&keycache->cache_lock); } end: - if (incremented) + if (locked_and_incremented) { dec_counter_for_resize_op(keycache); keycache_pthread_mutex_unlock(&keycache->cache_lock); @@ -3585,6 +3527,10 @@ static int flush_cached_blocks(KEY_CACHE *keycache, from flush_key_blocks and flush_all_key_blocks (the later one does the mutex lock in the resize_key_cache() function). + We do only care about changed blocks that exist when the function is + entered. We do not guarantee that all changed blocks of the file are + flushed if more blocks change while this function is running. + RETURN 0 ok 1 error @@ -3644,14 +3590,16 @@ static int flush_key_blocks_int(KEY_CACHE *keycache, Assure that we always have some entries for the case that new changed blocks appear while we need to wait for something. 
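      (cache_buff is the small fixed array with FLUSH_CACHE entries; the
      heap allocation just below is needed only when more blocks than
      that are changed, and it is freed again at the 'err:' label.)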
*/
-  if ((count <= FLUSH_CACHE) ||
-      ((count > FLUSH_CACHE) &&
-       !(cache= (BLOCK_LINK**) my_malloc(sizeof(BLOCK_LINK*)*count,
-                                         MYF(0)))))
-  {
+    if ((count > FLUSH_CACHE) &&
+        !(cache= (BLOCK_LINK**) my_malloc(sizeof(BLOCK_LINK*)*count,
+                                          MYF(0))))
       cache= cache_buff;
+    /*
+      After a restart there could be more changed blocks than now.
+      So we should not let count become smaller than the fixed buffer.
+    */
+    if (cache == cache_buff)
+      count= FLUSH_CACHE;
+  }
 
   /* Retrieve the blocks and write them to a buffer to be flushed */
@@ -3704,7 +3652,7 @@ restart:
 
       if ((error= flush_cached_blocks(keycache, file, cache,
                                       end,type)))
       {
-        /* Do not loop infnitely trying to flush in vain. */
+        /* Do not loop infinitely trying to flush in vain. */
         if ((last_errno == error) && (++last_errcnt > 5))
           goto err;
         last_errno= error;
@@ -3795,21 +3743,22 @@ restart:
         last_errno= error;
       }
       /*
-        While releasing the lock for writing, new blocks may be changed.
-        This should not happen during resize as no new changed blocks
-        are accepted. But it can happen during other flushes. Anyway
-        check again.
+        Do not restart here. We have now flushed at least all blocks
+        that were changed when entering this function.
       */
-      goto restart;
     }
     if (last_in_flush)
     {
      /*
        There are no blocks to be flushed by this thread, but blocks in
        flush by other threads. Wait until one of the blocks is flushed.
-      */
-      wait_on_queue(&last_in_flush->wqueue[COND_FOR_SAVED],
-                    &keycache->cache_lock);
+       Re-check the condition for last_in_flush. We may have unlocked
+       the cache_lock in flush_cached_blocks(). The state of the block
+       could have changed.
+      */
+      if (last_in_flush->status & BLOCK_IN_FLUSH)
+        wait_on_queue(&last_in_flush->wqueue[COND_FOR_SAVED],
+                      &keycache->cache_lock);
      /* Be sure not to lose a block. They may be flushed in random order. */
      goto restart;
    }
@@ -3818,9 +3767,13 @@ restart:
      /*
        There are no blocks to be flushed by this thread, but blocks for
        update by other threads. Wait until one of the blocks is updated.
-      */
-      wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
-                    &keycache->cache_lock);
+       Re-check the condition for last_for_update. We may have unlocked
+       the cache_lock in flush_cached_blocks(). The state of the block
+       could have changed.
+      */
+      if (last_for_update->status & BLOCK_FOR_UPDATE)
+        wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
+                      &keycache->cache_lock);
      /* The block is now changed. Flush it. */
      goto restart;
    }
@@ -3841,8 +3794,12 @@ restart:
       cnt++;
       KEYCACHE_DBUG_ASSERT(cnt <= keycache->blocks_used);
 #endif
-    /* While waiting here, we might have got another changed block. */
-    goto restart;
+    /*
+      Do not restart here. We have flushed all blocks that were
+      changed when entering this function and were not marked for
+      eviction. Other threads have now flushed all remaining blocks in
+      the course of their eviction.
+    */
   }
 
   if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE))
@@ -3956,7 +3913,7 @@ restart:
     goto restart;
 
   /*
-    To avoid an infinite loop wait until one of the blocks marked
+    To avoid an infinite loop, wait until one of the blocks marked
    for update is updated. 
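    A writer holds BLOCK_FOR_UPDATE only while it copies its data
    into the block, so this wait should be short.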
*/ if (last_for_update) @@ -4409,11 +4366,10 @@ void keycache_debug_log_close(void) #endif /* defined(KEYCACHE_DEBUG) */ #if !defined(DBUG_OFF) -#define F_B_PRT(_f_, _v_) fprintf(stderr, "Assert fails: " _f_, _v_) +#define F_B_PRT(_f_, _v_) DBUG_PRINT("assert_fail", (_f_, _v_)) static int fail_block(BLOCK_LINK *block) { - fprintf(stderr, "\n"); F_B_PRT("block->next_used: %lx\n", (ulong) block->next_used); F_B_PRT("block->prev_used: %lx\n", (ulong) block->prev_used); F_B_PRT("block->next_changed: %lx\n", (ulong) block->next_changed); @@ -4424,19 +4380,16 @@ static int fail_block(BLOCK_LINK *block) F_B_PRT("block->offset: %u\n", block->offset); F_B_PRT("block->requests: %u\n", block->requests); F_B_PRT("block->temperature: %u\n", block->temperature); - fprintf(stderr, "\n"); return 0; /* Let the assert fail. */ } static int fail_hlink(HASH_LINK *hlink) { - fprintf(stderr, "\n"); F_B_PRT("hlink->next: %lx\n", (ulong) hlink->next); F_B_PRT("hlink->prev: %lx\n", (ulong) hlink->prev); F_B_PRT("hlink->block: %lx\n", (ulong) hlink->block); F_B_PRT("hlink->diskpos: %lu\n", (ulong) hlink->diskpos); F_B_PRT("hlink->file: %d\n", hlink->file); - fprintf(stderr, "\n"); return 0; /* Let the assert fail. */ } diff --git a/storage/myisam/mi_preload.c b/storage/myisam/mi_preload.c index fd6e99c6bc3..06c66c06bf4 100644 --- a/storage/myisam/mi_preload.c +++ b/storage/myisam/mi_preload.c @@ -55,42 +55,20 @@ int mi_preload(MI_INFO *info, ulonglong key_map, my_bool ignore_leaves) block_length= keyinfo[0].block_length; - /* Check whether all indexes use the same block size */ - for (i= 1 ; i < keys ; i++) + if (ignore_leaves) { -#if !defined(INGO_TEST_LOADIDX_OFF) - /* Allow non-IGNORE-LEAVES index loading even with different block sizes. */ - if (ignore_leaves && (keyinfo[i].block_length != block_length)) - DBUG_RETURN(my_errno= HA_ERR_NON_UNIQUE_BLOCK_SIZE); - set_if_bigger(block_length, keyinfo[i].block_length); -#else - if (keyinfo[i].block_length != block_length) - DBUG_RETURN(my_errno= HA_ERR_NON_UNIQUE_BLOCK_SIZE); -#endif - } - -#if !defined(INGO_TEST_LOADIDX_OFF) - /* Align non-IGNORE-LEAVES index loads. */ - if (!ignore_leaves) - { - /* Round up to the next multiple of key_cache_block_size. */ - length= ((info->preload_buff_size + - share->key_cache->key_cache_block_size - 1) / - share->key_cache->key_cache_block_size * - share->key_cache->key_cache_block_size); - /* Round down to the next multiple of key_cache_block_size. */ - pos= (share->base.keystart / share->key_cache->key_cache_block_size * - share->key_cache->key_cache_block_size); + /* Check whether all indexes use the same block size */ + for (i= 1 ; i < keys ; i++) + { + if (keyinfo[i].block_length != block_length) + DBUG_RETURN(my_errno= HA_ERR_NON_UNIQUE_BLOCK_SIZE); + } } else - { - length= info->preload_buff_size/block_length * block_length; - set_if_bigger(length, block_length); - } -#else + block_length= share->key_cache->key_cache_block_size; + length= info->preload_buff_size/block_length * block_length; set_if_bigger(length, block_length); -#endif if (!(buff= (uchar *) my_malloc(length, MYF(MY_WME)))) DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM); From 0c7631e76312c5e8872adfd02a14c9ab2a27700b Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/june.mysql.com" <> Date: Sat, 28 Apr 2007 14:37:40 +0500 Subject: [PATCH 03/40] BUG#27998 - mysqld crashed when executing INSERT DELAYED on a BLACKHOLE table Using INSERT DELAYED on BLACKHOLE tables could lead to server crash. 
This happens because the delayed thread wants to upgrade a lock, but
BLACKHOLE tables do not have locks at all.

This patch rejects attempts to use INSERT DELAYED on BLACKHOLE tables.
---
 mysql-test/r/blackhole.result |  4 ++++
 mysql-test/t/blackhole.test   |  9 +++++++++
 sql/ha_blackhole.h            |  3 +--
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/mysql-test/r/blackhole.result b/mysql-test/r/blackhole.result
index 4b779094376..2d020e0eed7 100644
--- a/mysql-test/r/blackhole.result
+++ b/mysql-test/r/blackhole.result
@@ -123,3 +123,7 @@ master-bin.000001 # Query 1 # use `test`; create table t3 like t1
 master-bin.000001 # Query 1 # use `test`; insert into t1 select * from t3
 master-bin.000001 # Query 1 # use `test`; replace into t1 select * from t3
 drop table t1,t2,t3;
+CREATE TABLE t1(a INT) ENGINE=BLACKHOLE;
+INSERT DELAYED INTO t1 VALUES(1);
+ERROR HY000: Table storage engine for 't1' doesn't have this option
+DROP TABLE t1;
diff --git a/mysql-test/t/blackhole.test b/mysql-test/t/blackhole.test
index 257770d311c..4375f1c13ce 100644
--- a/mysql-test/t/blackhole.test
+++ b/mysql-test/t/blackhole.test
@@ -126,4 +126,13 @@ show binlog events;
 
 drop table t1,t2,t3;
 
+#
+# BUG#27998 - mysqld crashed when executing INSERT DELAYED on a BLACKHOLE
+#             table
+#
+CREATE TABLE t1(a INT) ENGINE=BLACKHOLE;
+--error 1031
+INSERT DELAYED INTO t1 VALUES(1);
+DROP TABLE t1;
+
 # End of 4.1 tests
diff --git a/sql/ha_blackhole.h b/sql/ha_blackhole.h
index 177b59fa970..e5f5ee69a82 100644
--- a/sql/ha_blackhole.h
+++ b/sql/ha_blackhole.h
@@ -46,8 +46,7 @@ public:
   {
     return(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER |
            HA_DUPP_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY |
-           HA_FILE_BASED | HA_CAN_GEOMETRY | HA_READ_RND_SAME |
-           HA_CAN_INSERT_DELAYED);
+           HA_FILE_BASED | HA_CAN_GEOMETRY | HA_READ_RND_SAME);
   }
   ulong index_flags(uint inx, uint part, bool all_parts) const
   {

From 4cdf0fab489814cd3cf4516dc18708d405f27be5 Mon Sep 17 00:00:00 2001
From: "mikron@mikael-ronstr-ms-dator.local" <>
Date: Mon, 7 May 2007 15:25:24 +0200
Subject: [PATCH 04/40] Fix SCI Transporter

---
 config/ac-macros/ha_ndbcluster.m4             |   2 +-
 libmysqld/Makefile.am                         |   2 +-
 libmysqld/examples/Makefile.am                |   3 +-
 .../common/transporter/SCI_Transporter.cpp    | 216 ++++--------------
 .../common/transporter/SCI_Transporter.hpp    |  18 +-
 5 files changed, 54 insertions(+), 187 deletions(-)

diff --git a/config/ac-macros/ha_ndbcluster.m4 b/config/ac-macros/ha_ndbcluster.m4
index a4963a5e20e..55fe6ad8350 100644
--- a/config/ac-macros/ha_ndbcluster.m4
+++ b/config/ac-macros/ha_ndbcluster.m4
@@ -22,7 +22,7 @@ AC_DEFUN([MYSQL_CHECK_NDB_OPTIONS], [
     if test -f "$mysql_sci_dir/lib/libsisci.a" -a \
             -f "$mysql_sci_dir/include/sisci_api.h"; then
       NDB_SCI_INCLUDES="-I$mysql_sci_dir/include"
-      NDB_SCI_LIBS="-L$mysql_sci_dir/lib -lsisci"
+      NDB_SCI_LIBS="$mysql_sci_dir/lib/libsisci.a"
       AC_MSG_RESULT([-- including sci transporter])
       AC_DEFINE([NDB_SCI_TRANSPORTER], [1],
                 [Including Ndb Cluster DB sci transporter])
diff --git a/libmysqld/Makefile.am b/libmysqld/Makefile.am
index 95e3e539eee..81da1e43cc9 100644
--- a/libmysqld/Makefile.am
+++ b/libmysqld/Makefile.am
@@ -81,7 +81,7 @@ INC_LIB= $(top_builddir)/regex/libregex.a \
          $(top_builddir)/mysys/libmysys.a \
          $(top_builddir)/strings/libmystrings.a \
          $(top_builddir)/dbug/libdbug.a \
-         $(top_builddir)/vio/libvio.a
+         $(top_builddir)/vio/libvio.a @NDB_SCI_LIBS@
 
 #
 
diff --git a/libmysqld/examples/Makefile.am b/libmysqld/examples/Makefile.am
index f30951a5d81..e0dd8491688 100644
--- a/libmysqld/examples/Makefile.am
+++ 
b/libmysqld/examples/Makefile.am @@ -35,7 +35,8 @@ INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include -I$(srcdir) \ -I$(top_srcdir) -I$(top_srcdir)/client -I$(top_srcdir)/regex \ $(openssl_includes) LIBS = @LIBS@ @WRAPLIBS@ @CLIENT_LIBS@ $(yassl_libs) -LDADD = @CLIENT_EXTRA_LDFLAGS@ ../libmysqld.a @innodb_system_libs@ @LIBDL@ $(CXXLDFLAGS) +LDADD = @CLIENT_EXTRA_LDFLAGS@ ../libmysqld.a @innodb_system_libs@ @LIBDL@ $(CXXLDFLAGS) \ + @NDB_SCI_LIBS@ mysqltest_embedded_LINK = $(CXXLINK) mysqltest_embedded_SOURCES = mysqltest.c diff --git a/ndb/src/common/transporter/SCI_Transporter.cpp b/ndb/src/common/transporter/SCI_Transporter.cpp index 138b79acb51..0720fe84973 100644 --- a/ndb/src/common/transporter/SCI_Transporter.cpp +++ b/ndb/src/common/transporter/SCI_Transporter.cpp @@ -65,13 +65,10 @@ SCI_Transporter::SCI_Transporter(TransporterRegistry &t_reg, m_initLocal=false; - m_swapCounter=0; m_failCounter=0; m_remoteNodes[0]=remoteSciNodeId0; m_remoteNodes[1]=remoteSciNodeId1; m_adapters = nAdapters; - // The maximum number of times to try and create, - // start and destroy a sequence m_ActiveAdapterId=0; m_StandbyAdapterId=1; @@ -102,8 +99,6 @@ SCI_Transporter::SCI_Transporter(TransporterRegistry &t_reg, DBUG_VOID_RETURN; } - - void SCI_Transporter::disconnectImpl() { DBUG_ENTER("SCI_Transporter::disconnectImpl"); @@ -129,7 +124,8 @@ void SCI_Transporter::disconnectImpl() if(err != SCI_ERR_OK) { report_error(TE_SCI_UNABLE_TO_CLOSE_CHANNEL); - DBUG_PRINT("error", ("Cannot close channel to the driver. Error code 0x%x", + DBUG_PRINT("error", + ("Cannot close channel to the driver. Error code 0x%x", err)); } } @@ -164,19 +160,18 @@ bool SCI_Transporter::initTransporter() { m_sendBuffer.m_buffer = new Uint32[m_sendBuffer.m_sendBufferSize / 4]; m_sendBuffer.m_dataSize = 0; - DBUG_PRINT("info", ("Created SCI Send Buffer with buffer size %d and packet size %d", + DBUG_PRINT("info", + ("Created SCI Send Buffer with buffer size %d and packet size %d", m_sendBuffer.m_sendBufferSize, m_PacketSize * 4)); if(!getLinkStatus(m_ActiveAdapterId) || (m_adapters > 1 && !getLinkStatus(m_StandbyAdapterId))) { - DBUG_PRINT("error", ("The link is not fully operational. Check the cables and the switches")); - //reportDisconnect(remoteNodeId, 0); - //doDisconnect(); + DBUG_PRINT("error", + ("The link is not fully operational. Check the cables and the switches")); //NDB should terminate report_error(TE_SCI_LINK_ERROR); DBUG_RETURN(false); } - DBUG_RETURN(true); } // initTransporter() @@ -235,7 +230,8 @@ sci_error_t SCI_Transporter::initLocalSegment() { DBUG_PRINT("info", ("SCInode iD %d adapter %d\n", sciAdapters[i].localSciNodeId, i)); if(err != SCI_ERR_OK) { - DBUG_PRINT("error", ("Cannot open an SCI virtual device. Error code 0x%x", + DBUG_PRINT("error", + ("Cannot open an SCI virtual device. Error code 0x%x", err)); DBUG_RETURN(err); } @@ -269,7 +265,8 @@ sci_error_t SCI_Transporter::initLocalSegment() { &err); if(err != SCI_ERR_OK) { - DBUG_PRINT("error", ("Local Segment is not accessible by an SCI adapter. Error code 0x%x\n", + DBUG_PRINT("error", + ("Local Segment is not accessible by an SCI adapter. Error code 0x%x\n", err)); DBUG_RETURN(err); } @@ -303,15 +300,13 @@ sci_error_t SCI_Transporter::initLocalSegment() { &err); if(err != SCI_ERR_OK) { - DBUG_PRINT("error", ("Local Segment is not available for remote connections. Error code 0x%x\n", + DBUG_PRINT("error", + ("Local Segment is not available for remote connections. 
Error code 0x%x\n", err)); DBUG_RETURN(err); } } - - setupLocalSegment(); - DBUG_RETURN(err); } // initLocalSegment() @@ -343,12 +338,6 @@ bool SCI_Transporter::doSend() { if(sizeToSend==4097) i4097++; #endif - if(startSequence(m_ActiveAdapterId)!=SCI_ERR_OK) { - DBUG_PRINT("error", ("Start sequence failed")); - report_error(TE_SCI_UNABLE_TO_START_SEQUENCE); - return false; - } - tryagain: retry++; @@ -374,119 +363,36 @@ bool SCI_Transporter::doSend() { SCI_FLAG_ERROR_CHECK, &err); - if (err != SCI_ERR_OK) { - if(err == SCI_ERR_OUT_OF_RANGE) { - DBUG_PRINT("error", ("Data transfer : out of range error")); - goto tryagain; - } - if(err == SCI_ERR_SIZE_ALIGNMENT) { - DBUG_PRINT("error", ("Data transfer : alignment error")); - DBUG_PRINT("info", ("sendPtr 0x%x, sizeToSend = %d", sendPtr, sizeToSend)); - goto tryagain; - } - if(err == SCI_ERR_OFFSET_ALIGNMENT) { - DBUG_PRINT("error", ("Data transfer : offset alignment")); - goto tryagain; - } - if(err == SCI_ERR_TRANSFER_FAILED) { - //(m_TargetSegm[m_StandbyAdapterId].writer)->heavyLock(); - if(getLinkStatus(m_ActiveAdapterId)) { - goto tryagain; - } - if (m_adapters == 1) { - DBUG_PRINT("error", ("SCI Transfer failed")); + if (err == SCI_ERR_OUT_OF_RANGE || + err == SCI_ERR_SIZE_ALIGNMENT || + err == SCI_ERR_OFFSET_ALIGNMENT) { + DBUG_PRINT("error", ("Data transfer error = %d", err)); report_error(TE_SCI_UNRECOVERABLE_DATA_TFX_ERROR); return false; - } - m_failCounter++; - Uint32 temp=m_ActiveAdapterId; - switch(m_swapCounter) { - case 0: - /**swap from active (0) to standby (1)*/ - if(getLinkStatus(m_StandbyAdapterId)) { - DBUG_PRINT("error", ("Swapping from adapter 0 to 1")); + } + if(err == SCI_ERR_TRANSFER_FAILED) { + if(getLinkStatus(m_ActiveAdapterId)) + goto tryagain; + if (m_adapters == 1) { + DBUG_PRINT("error", ("SCI Transfer failed")); + report_error(TE_SCI_UNRECOVERABLE_DATA_TFX_ERROR); + return false; + } + m_failCounter++; + Uint32 temp=m_ActiveAdapterId; + if (getLinkStatus(m_StandbyAdapterId)) { failoverShmWriter(); SCIStoreBarrier(m_TargetSegm[m_StandbyAdapterId].sequence,0); m_ActiveAdapterId=m_StandbyAdapterId; m_StandbyAdapterId=temp; - SCIRemoveSequence((m_TargetSegm[m_StandbyAdapterId].sequence), - FLAGS, - &err); - if(err!=SCI_ERR_OK) { - report_error(TE_SCI_UNABLE_TO_REMOVE_SEQUENCE); - DBUG_PRINT("error", ("Unable to remove sequence")); - return false; - } - if(startSequence(m_ActiveAdapterId)!=SCI_ERR_OK) { - DBUG_PRINT("error", ("Start sequence failed")); - report_error(TE_SCI_UNABLE_TO_START_SEQUENCE); - return false; - } - m_swapCounter++; - DBUG_PRINT("info", ("failover complete")); - goto tryagain; - } else { + DBUG_PRINT("error", ("Swapping from adapter %u to %u", + m_StandbyAdapterId, m_ActiveAdapterId)); + } else { report_error(TE_SCI_UNRECOVERABLE_DATA_TFX_ERROR); DBUG_PRINT("error", ("SCI Transfer failed")); - return false; } - return false; - break; - case 1: - /** swap back from 1 to 0 - must check that the link is up */ - - if(getLinkStatus(m_StandbyAdapterId)) { - failoverShmWriter(); - m_ActiveAdapterId=m_StandbyAdapterId; - m_StandbyAdapterId=temp; - DBUG_PRINT("info", ("Swapping from 1 to 0")); - if(createSequence(m_ActiveAdapterId)!=SCI_ERR_OK) { - DBUG_PRINT("error", ("Unable to create sequence")); - report_error(TE_SCI_UNABLE_TO_CREATE_SEQUENCE); - return false; - } - if(startSequence(m_ActiveAdapterId)!=SCI_ERR_OK) { - DBUG_PRINT("error", ("startSequence failed... 
disconnecting")); - report_error(TE_SCI_UNABLE_TO_START_SEQUENCE); - return false; - } - - SCIRemoveSequence((m_TargetSegm[m_StandbyAdapterId].sequence) - , FLAGS, - &err); - if(err!=SCI_ERR_OK) { - DBUG_PRINT("error", ("Unable to remove sequence")); - report_error(TE_SCI_UNABLE_TO_REMOVE_SEQUENCE); - return false; - } - - if(createSequence(m_StandbyAdapterId)!=SCI_ERR_OK) { - DBUG_PRINT("error", ("Unable to create sequence on standby")); - report_error(TE_SCI_UNABLE_TO_CREATE_SEQUENCE); - return false; - } - - m_swapCounter=0; - - DBUG_PRINT("info", ("failover complete..")); - goto tryagain; - - } else { - DBUG_PRINT("error", ("Unrecoverable data transfer error")); - report_error(TE_SCI_UNRECOVERABLE_DATA_TFX_ERROR); - return false; - } - - break; - default: - DBUG_PRINT("error", ("Unrecoverable data transfer error")); - report_error(TE_SCI_UNRECOVERABLE_DATA_TFX_ERROR); - return false; - break; - } - } + } } else { SHM_Writer * writer = (m_TargetSegm[m_ActiveAdapterId].writer); writer->updateWritePtr(sizeToSend); @@ -497,7 +403,6 @@ bool SCI_Transporter::doSend() { m_sendBuffer.m_dataSize = 0; m_sendBuffer.m_forceSendLimit = sendLimit; } - } else { /** * If we end up here, the SCI segment is full. @@ -552,15 +457,12 @@ void SCI_Transporter::setupLocalSegment() DBUG_VOID_RETURN; } //setupLocalSegment - - void SCI_Transporter::setupRemoteSegment() { DBUG_ENTER("SCI_Transporter::setupRemoteSegment"); Uint32 sharedSize = 0; sharedSize =4096; //start of the buffer is page aligned - Uint32 sizeOfBuffer = m_BufferSize; const Uint32 slack = MAX_MESSAGE_SIZE; sizeOfBuffer -= sharedSize; @@ -666,7 +568,6 @@ SCI_Transporter::init_remote() DBUG_PRINT("error", ("Error connecting segment, err 0x%x", err)); DBUG_RETURN(false); } - } // Map the remote memory segment into program space for(Uint32 i=0; i < m_adapters ; i++) { @@ -679,13 +580,14 @@ SCI_Transporter::init_remote() FLAGS, &err); - - if(err!= SCI_ERR_OK) { - DBUG_PRINT("error", ("Cannot map a segment to the remote node %d. Error code 0x%x",m_RemoteSciNodeId, err)); - //NDB SHOULD TERMINATE AND COMPUTER REBOOTED! - report_error(TE_SCI_CANNOT_MAP_REMOTESEGMENT); - DBUG_RETURN(false); - } + if(err!= SCI_ERR_OK) { + DBUG_PRINT("error", + ("Cannot map a segment to the remote node %d. Error code 0x%x", + m_RemoteSciNodeId, err)); + //NDB SHOULD TERMINATE AND COMPUTER REBOOTED! + report_error(TE_SCI_CANNOT_MAP_REMOTESEGMENT); + DBUG_RETURN(false); + } } m_mapped=true; setupRemoteSegment(); @@ -713,7 +615,6 @@ SCI_Transporter::connect_client_impl(NDB_SOCKET_TYPE sockfd) NDB_CLOSE_SOCKET(sockfd); DBUG_RETURN(false); } - if (!init_local()) { NDB_CLOSE_SOCKET(sockfd); DBUG_RETURN(false); @@ -788,29 +689,9 @@ sci_error_t SCI_Transporter::createSequence(Uint32 adapterid) { &(m_TargetSegm[adapterid].sequence), SCI_FLAG_FAST_BARRIER, &err); - - return err; } // createSequence() - -sci_error_t SCI_Transporter::startSequence(Uint32 adapterid) { - - sci_error_t err; - /** Perform preliminary error check on an SCI adapter before starting a - * sequence of read and write operations on the mapped segment. 
- */ - m_SequenceStatus = SCIStartSequence( - (m_TargetSegm[adapterid].sequence), - FLAGS, &err); - - - // If there still is an error then data cannot be safely send - return err; -} // startSequence() - - - bool SCI_Transporter::disconnectLocal() { DBUG_ENTER("SCI_Transporter::disconnectLocal"); @@ -878,9 +759,6 @@ SCI_Transporter::~SCI_Transporter() { DBUG_VOID_RETURN; } // ~SCI_Transporter() - - - void SCI_Transporter::closeSCI() { // Termination of SCI sci_error_t err; @@ -897,8 +775,9 @@ void SCI_Transporter::closeSCI() { SCIClose(activeSCIDescriptor, FLAGS, &err); if(err != SCI_ERR_OK) { - DBUG_PRINT("error", ("Cannot close SCI channel to the driver. Error code 0x%x", - err)); + DBUG_PRINT("error", + ("Cannot close SCI channel to the driver. Error code 0x%x", + err)); } SCITerminate(); DBUG_VOID_RETURN; @@ -973,7 +852,6 @@ SCI_Transporter::getConnectionStatus() { return false; } - void SCI_Transporter::setConnected() { *m_remoteStatusFlag = SCICONNECTED; @@ -983,7 +861,6 @@ SCI_Transporter::setConnected() { *m_localStatusFlag = SCICONNECTED; } - void SCI_Transporter::setDisconnect() { if(getLinkStatus(m_ActiveAdapterId)) @@ -994,7 +871,6 @@ SCI_Transporter::setDisconnect() { } } - bool SCI_Transporter::checkConnected() { if (*m_localStatusFlag == SCIDISCONNECT) { @@ -1015,8 +891,9 @@ SCI_Transporter::initSCI() { SCIInitialize(0, &error); if(error != SCI_ERR_OK) { DBUG_PRINT("error", ("Cannot initialize SISCI library.")); - DBUG_PRINT("error", ("Inconsistency between SISCI library and SISCI driver. Error code 0x%x", - error)); + DBUG_PRINT("error", + ("Inconsistency between SISCI library and SISCI driver. Error code 0x%x", + error)); DBUG_RETURN(false); } init = true; @@ -1029,3 +906,4 @@ SCI_Transporter::get_free_buffer() const { return (m_TargetSegm[m_ActiveAdapterId].writer)->get_free_buffer(); } + diff --git a/ndb/src/common/transporter/SCI_Transporter.hpp b/ndb/src/common/transporter/SCI_Transporter.hpp index fbba2ac4516..f774186f238 100644 --- a/ndb/src/common/transporter/SCI_Transporter.hpp +++ b/ndb/src/common/transporter/SCI_Transporter.hpp @@ -54,12 +54,12 @@ * local segment, the SCI transporter connects to a segment created by another * transporter at a remote node, and the maps the remote segment into its * virtual address space. However, since NDB Cluster relies on redundancy - * at the network level, by using dual SCI adapters communica - * + * at the network level, by using dual SCI adapters communication can be + * maintained even if one of the adapter cards fails (or anything on the + * network this adapter card exists in e.g. an SCI switch failure). * */ - /** * class SCITransporter * @brief - main class for the SCI transporter. @@ -84,16 +84,6 @@ public: sci_error_t createSequence(Uint32 adapterid); - /** - * starts a sequence for error checking. - * The actual checking that a sequence is correct is done implicitly - * in SCIMemCpy (in doSend). - * @param adapterid the adapter on which to start the sequence. - * @return SCI_ERR_OK if ok, otherwize something else. - */ - sci_error_t startSequence(Uint32 adapterid); - - /** Initiate Local Segment: create a memory segment, * prepare a memory segment, map the local segment * into memory space and make segment available. 
@@ -159,7 +149,6 @@ private: bool m_mapped; bool m_initLocal; bool m_sciinit; - Uint32 m_swapCounter; Uint32 m_failCounter; /** * For statistics on transfered packets @@ -195,7 +184,6 @@ private: */ Uint32 m_reportFreq; - Uint32 m_adapters; Uint32 m_numberOfRemoteNodes; From ad11366a2fc42f7cda0d2502c7adac45d599ecd7 Mon Sep 17 00:00:00 2001 From: "mikron@mikael-ronstr-ms-dator.local" <> Date: Mon, 7 May 2007 15:33:27 +0200 Subject: [PATCH 05/40] New SCI Transporter Build scripts --- BUILD/Makefile.am | 2 ++ BUILD/compile-amd64-max-sci | 8 ++++++++ BUILD/compile-pentium64-max-sci | 9 +++++++++ 3 files changed, 19 insertions(+) create mode 100644 BUILD/compile-amd64-max-sci create mode 100644 BUILD/compile-pentium64-max-sci diff --git a/BUILD/Makefile.am b/BUILD/Makefile.am index 3fd61790903..d06106d4431 100644 --- a/BUILD/Makefile.am +++ b/BUILD/Makefile.am @@ -28,6 +28,7 @@ EXTRA_DIST = FINISH.sh \ compile-alpha-debug \ compile-amd64-debug-max \ compile-amd64-max \ + compile-amd64-max-sci \ compile-darwin-mwcc \ compile-dist \ compile-hpux11-parisc2-aCC \ @@ -53,6 +54,7 @@ EXTRA_DIST = FINISH.sh \ compile-pentium-valgrind-max \ compile-pentium64-debug \ compile-pentium64-debug-max \ + compile-pentium64-max-sci \ compile-pentium64-valgrind-max \ compile-ppc \ compile-ppc-debug \ diff --git a/BUILD/compile-amd64-max-sci b/BUILD/compile-amd64-max-sci new file mode 100644 index 00000000000..4afa9004742 --- /dev/null +++ b/BUILD/compile-amd64-max-sci @@ -0,0 +1,8 @@ +#! /bin/sh + +path=`dirname $0` +. "$path/SETUP.sh" +extra_flags="$amd64_cflags $fast_cflags -g" +extra_configs="$amd64_configs $max_configs --with-ndb-sci=/opt/DIS" + +. "$path/FINISH.sh" diff --git a/BUILD/compile-pentium64-max-sci b/BUILD/compile-pentium64-max-sci new file mode 100644 index 00000000000..9ebb1988475 --- /dev/null +++ b/BUILD/compile-pentium64-max-sci @@ -0,0 +1,9 @@ +#! /bin/sh + +path=`dirname $0` +. "$path/SETUP.sh" + +extra_flags="$pentium64_cflags $fast_cflags -g" +extra_configs="$pentium_configs $max_configs --with-ndb-sci=/opt/DIS" + +. 
"$path/FINISH.sh" From 6e2633e9e405aa20bb3b7100674ebdf7a421ae41 Mon Sep 17 00:00:00 2001 From: "mikron@mikael-ronstr-ms-dator.local" <> Date: Mon, 7 May 2007 15:46:29 +0200 Subject: [PATCH 06/40] Manual merge --- sql/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/Makefile.am b/sql/Makefile.am index 465e5c843f4..d280b22f493 100644 --- a/sql/Makefile.am +++ b/sql/Makefile.am @@ -33,7 +33,7 @@ SUPPORTING_LIBS = $(top_builddir)/vio/libvio.a \ $(top_builddir)/regex/libregex.a \ $(top_builddir)/strings/libmystrings.a mysqld_DEPENDENCIES= @mysql_plugin_libs@ $(SUPPORTING_LIBS) -LDADD = $(SUPPORTING_LIBS) @ZLIB_LIBS@ +LDADD = $(SUPPORTING_LIBS) @ZLIB_LIBS@ @NDB_SCI_LIBS@ mysqld_LDADD = @MYSQLD_EXTRA_LDFLAGS@ \ @pstack_libs@ \ @mysql_plugin_libs@ \ From f1a7d7ddedbd68b228b93ffb2e77a88c09ff5a19 Mon Sep 17 00:00:00 2001 From: "mikron@mikael-ronstr-ms-dator.local" <> Date: Mon, 7 May 2007 16:07:04 +0200 Subject: [PATCH 07/40] Jamming --- .../debugger/signaldata/SignalNames.cpp | 8 + .../ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 682 +++++++++++------- storage/ndb/src/kernel/blocks/lgman.cpp | 7 +- 3 files changed, 442 insertions(+), 255 deletions(-) diff --git a/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp b/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp index 884a49b3a94..0d31cd5de7f 100644 --- a/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp +++ b/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp @@ -621,6 +621,14 @@ const GsnName SignalNames [] = { ,{ GSN_LCP_PREPARE_REF, "LCP_PREPARE_REF" } ,{ GSN_LCP_PREPARE_CONF, "LCP_PREPARE_CONF" } + ,{ GSN_DICT_ABORT_REQ, "DICT_ABORT_REQ" } + ,{ GSN_DICT_ABORT_REF, "DICT_ABORT_REF" } + ,{ GSN_DICT_ABORT_CONF, "DICT_ABORT_CONF" } + + ,{ GSN_DICT_COMMIT_REQ, "DICT_COMMIT_REQ" } + ,{ GSN_DICT_COMMIT_REF, "DICT_COMMIT_REF" } + ,{ GSN_DICT_COMMIT_CONF, "DICT_COMMIT_CONF" } + /* DICT LOCK */ ,{ GSN_DICT_LOCK_REQ, "DICT_LOCK_REQ" } ,{ GSN_DICT_LOCK_CONF, "DICT_LOCK_CONF" } diff --git a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index 66cd523f333..de365e886a0 100644 --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -13968,7 +13968,8 @@ Dbdict::getTableEntry(XSchemaFile * xsf, Uint32 tableId) //****************************************** void -Dbdict::execCREATE_FILE_REQ(Signal* signal){ +Dbdict::execCREATE_FILE_REQ(Signal* signal) +{ jamEntry(); if(!assembleFragments(signal)){ @@ -14013,13 +14014,14 @@ Dbdict::execCREATE_FILE_REQ(Signal* signal){ Ptr trans_ptr; if (! 
c_Trans.seize(trans_ptr)){ + jam(); ref->errorCode = CreateFileRef::Busy; ref->status = 0; ref->errorKey = 0; ref->errorLine = __LINE__; break; } - + jam(); const Uint32 trans_key = ++c_opRecordSequence; trans_ptr.p->key = trans_key; trans_ptr.p->m_senderRef = senderRef; @@ -14048,6 +14050,7 @@ Dbdict::execCREATE_FILE_REQ(Signal* signal){ { Uint32 objId = getFreeObjId(0); if (objId == RNIL) { + jam(); ref->errorCode = CreateFileRef::NoMoreObjectRecords; ref->status = 0; ref->errorKey = 0; @@ -14072,7 +14075,6 @@ Dbdict::execCREATE_FILE_REQ(Signal* signal){ CreateObjReq::SignalLength, JBB); c_blockState = BS_CREATE_TAB; - return; } while(0); @@ -14083,7 +14085,8 @@ Dbdict::execCREATE_FILE_REQ(Signal* signal){ } void -Dbdict::execCREATE_FILEGROUP_REQ(Signal* signal){ +Dbdict::execCREATE_FILEGROUP_REQ(Signal* signal) +{ jamEntry(); if(!assembleFragments(signal)){ @@ -14127,13 +14130,14 @@ Dbdict::execCREATE_FILEGROUP_REQ(Signal* signal){ Ptr trans_ptr; if (! c_Trans.seize(trans_ptr)){ + jam(); ref->errorCode = CreateFilegroupRef::Busy; ref->status = 0; ref->errorKey = 0; ref->errorLine = __LINE__; break; } - + jam(); const Uint32 trans_key = ++c_opRecordSequence; trans_ptr.p->key = trans_key; trans_ptr.p->m_senderRef = senderRef; @@ -14159,6 +14163,7 @@ Dbdict::execCREATE_FILEGROUP_REQ(Signal* signal){ { Uint32 objId = getFreeObjId(0); if (objId == RNIL) { + jam(); ref->errorCode = CreateFilegroupRef::NoMoreObjectRecords; ref->status = 0; ref->errorKey = 0; @@ -14183,7 +14188,6 @@ Dbdict::execCREATE_FILEGROUP_REQ(Signal* signal){ CreateObjReq::SignalLength, JBB); c_blockState = BS_CREATE_TAB; - return; } while(0); @@ -14219,7 +14223,8 @@ Dbdict::execDROP_FILE_REQ(Signal* signal) break; } - if (c_blockState != BS_IDLE){ + if (c_blockState != BS_IDLE) + { jam(); ref->errorCode = DropFileRef::Busy; ref->errorKey = 0; @@ -14229,6 +14234,7 @@ Dbdict::execDROP_FILE_REQ(Signal* signal) if (checkSingleUserMode(senderRef)) { + jam(); ref->errorCode = DropFileRef::SingleUser; ref->errorKey = 0; ref->errorLine = __LINE__; @@ -14238,6 +14244,7 @@ Dbdict::execDROP_FILE_REQ(Signal* signal) Ptr file_ptr; if (!c_file_hash.find(file_ptr, objId)) { + jam(); ref->errorCode = DropFileRef::NoSuchFile; ref->errorLine = __LINE__; break; @@ -14245,6 +14252,7 @@ Dbdict::execDROP_FILE_REQ(Signal* signal) if (file_ptr.p->m_version != version) { + jam(); ref->errorCode = DropFileRef::InvalidSchemaObjectVersion; ref->errorLine = __LINE__; break; @@ -14253,10 +14261,12 @@ Dbdict::execDROP_FILE_REQ(Signal* signal) Ptr trans_ptr; if (! 
c_Trans.seize(trans_ptr)) { + jam(); ref->errorCode = DropFileRef::Busy; ref->errorLine = __LINE__; break; } + jam(); const Uint32 trans_key = ++c_opRecordSequence; trans_ptr.p->key = trans_key; @@ -14292,7 +14302,6 @@ Dbdict::execDROP_FILE_REQ(Signal* signal) DropObjReq::SignalLength, JBB); c_blockState = BS_CREATE_TAB; - return; } while(0); @@ -14320,7 +14329,8 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) Uint32 version = req->filegroup_version; do { - if(getOwnNodeId() != c_masterNodeId){ + if(getOwnNodeId() != c_masterNodeId) + { jam(); ref->errorCode = DropFilegroupRef::NotMaster; ref->errorKey = 0; @@ -14328,7 +14338,8 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) break; } - if (c_blockState != BS_IDLE){ + if (c_blockState != BS_IDLE) + { jam(); ref->errorCode = DropFilegroupRef::Busy; ref->errorKey = 0; @@ -14338,6 +14349,7 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) if (checkSingleUserMode(senderRef)) { + jam(); ref->errorCode = DropFilegroupRef::SingleUser; ref->errorKey = 0; ref->errorLine = __LINE__; @@ -14347,6 +14359,7 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) Ptr filegroup_ptr; if (!c_filegroup_hash.find(filegroup_ptr, objId)) { + jam(); ref->errorCode = DropFilegroupRef::NoSuchFilegroup; ref->errorLine = __LINE__; break; @@ -14354,6 +14367,7 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) if (filegroup_ptr.p->m_version != version) { + jam(); ref->errorCode = DropFilegroupRef::InvalidSchemaObjectVersion; ref->errorLine = __LINE__; break; @@ -14362,10 +14376,12 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) Ptr trans_ptr; if (! c_Trans.seize(trans_ptr)) { + jam(); ref->errorCode = DropFilegroupRef::Busy; ref->errorLine = __LINE__; break; } + jam(); const Uint32 trans_key = ++c_opRecordSequence; trans_ptr.p->key = trans_key; @@ -14401,7 +14417,6 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) DropObjReq::SignalLength, JBB); c_blockState = BS_CREATE_TAB; - return; } while(0); @@ -14412,15 +14427,15 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* signal) } void -Dbdict::execCREATE_OBJ_REF(Signal* signal){ - jamEntry(); - +Dbdict::execCREATE_OBJ_REF(Signal* signal) +{ CreateObjRef * const ref = (CreateObjRef*)signal->getDataPtr(); - Ptr trans_ptr; + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, ref->senderData)); - if(ref->errorCode != CreateObjRef::NF_FakeErrorREF){ + jam(); trans_ptr.p->setErrorCode(ref->errorCode); } Uint32 node = refToNode(ref->senderRef); @@ -14428,12 +14443,12 @@ Dbdict::execCREATE_OBJ_REF(Signal* signal){ } void -Dbdict::execCREATE_OBJ_CONF(Signal* signal){ - jamEntry(); - - CreateObjConf * const conf = (CreateObjConf*)signal->getDataPtr(); - +Dbdict::execCREATE_OBJ_CONF(Signal* signal) +{ Ptr trans_ptr; + CreateObjConf * const conf = (CreateObjConf*)signal->getDataPtr(); + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, conf->senderData)); schemaOp_reply(signal, trans_ptr.p, refToNode(conf->senderRef)); } @@ -14443,6 +14458,7 @@ Dbdict::schemaOp_reply(Signal* signal, SchemaTransaction * trans_ptr_p, Uint32 nodeId) { + jam(); { SafeCounter tmp(c_counterMgr, trans_ptr_p->m_counter); if(!tmp.clearWaitingFor(nodeId)){ @@ -14453,10 +14469,8 @@ Dbdict::schemaOp_reply(Signal* signal, switch(trans_ptr_p->m_op.m_state){ case DictObjOp::Preparing:{ - if(trans_ptr_p->m_errorCode != 0) { - jam(); /** * Failed to prepare on atleast one node -> abort on all */ @@ -14466,10 +14480,16 @@ Dbdict::schemaOp_reply(Signal* signal, safe_cast(&Dbdict::trans_abort_start_done); if(f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_abort_start) + { + jam(); 
(this->*f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_abort_start) (signal, trans_ptr_p); + } else + { + jam(); execute(signal, trans_ptr_p->m_callback, 0); + } return; } @@ -14479,14 +14499,19 @@ Dbdict::schemaOp_reply(Signal* signal, safe_cast(&Dbdict::trans_commit_start_done); if(f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_commit_start) + { + jam(); (this->*f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_commit_start) (signal, trans_ptr_p); + } else + { + jam(); execute(signal, trans_ptr_p->m_callback, 0); + } return; } case DictObjOp::Committing: { - jam(); ndbrequire(trans_ptr_p->m_errorCode == 0); trans_ptr_p->m_op.m_state = DictObjOp::Committed; @@ -14495,31 +14520,42 @@ Dbdict::schemaOp_reply(Signal* signal, safe_cast(&Dbdict::trans_commit_complete_done); if(f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_commit_complete) + { + jam(); (this->*f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_commit_complete) (signal, trans_ptr_p); + } else - execute(signal, trans_ptr_p->m_callback, 0); + { + jam(); + execute(signal, trans_ptr_p->m_callback, 0); + } return; } case DictObjOp::Aborting:{ - jam(); - trans_ptr_p->m_op.m_state = DictObjOp::Committed; trans_ptr_p->m_callback.m_callbackData = trans_ptr_p->key; trans_ptr_p->m_callback.m_callbackFunction= safe_cast(&Dbdict::trans_abort_complete_done); if(f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_abort_complete) + { + jam(); (this->*f_dict_op[trans_ptr_p->m_op.m_vt_index].m_trans_abort_complete) (signal, trans_ptr_p); + } else - execute(signal, trans_ptr_p->m_callback, 0); + { + jam(); + execute(signal, trans_ptr_p->m_callback, 0); + } return; } case DictObjOp::Defined: case DictObjOp::Prepared: case DictObjOp::Committed: case DictObjOp::Aborted: + jam(); break; } ndbrequire(false); @@ -14528,14 +14564,13 @@ Dbdict::schemaOp_reply(Signal* signal, void Dbdict::trans_commit_start_done(Signal* signal, Uint32 callbackData, - Uint32 retValue){ - jamEntry(); - - ndbrequire(retValue == 0); - + Uint32 retValue) +{ Ptr trans_ptr; + + jam(); + ndbrequire(retValue == 0); ndbrequire(c_Trans.find(trans_ptr, callbackData)); - NodeReceiverGroup rg(DBDICT, trans_ptr.p->m_nodes); SafeCounter tmp(c_counterMgr, trans_ptr.p->m_counter); tmp.init(rg, GSN_DICT_COMMIT_REF, trans_ptr.p->key); @@ -14546,27 +14581,26 @@ Dbdict::trans_commit_start_done(Signal* signal, req->op_key = trans_ptr.p->m_op.m_key; sendSignal(rg, GSN_DICT_COMMIT_REQ, signal, DictCommitReq::SignalLength, JBB); - trans_ptr.p->m_op.m_state = DictObjOp::Committing; } void Dbdict::trans_commit_complete_done(Signal* signal, Uint32 callbackData, - Uint32 retValue){ - jamEntry(); - - ndbrequire(retValue == 0); - + Uint32 retValue) +{ Ptr trans_ptr; + + jam(); + ndbrequire(retValue == 0); ndbrequire(c_Trans.find(trans_ptr, callbackData)); switch(f_dict_op[trans_ptr.p->m_op.m_vt_index].m_gsn_user_req){ case GSN_CREATE_FILEGROUP_REQ:{ FilegroupPtr fg_ptr; + jam(); ndbrequire(c_filegroup_hash.find(fg_ptr, trans_ptr.p->m_op.m_obj_id)); - // CreateFilegroupConf * conf = (CreateFilegroupConf*)signal->getDataPtr(); conf->senderRef = reference(); conf->senderData = trans_ptr.p->m_senderData; @@ -14576,11 +14610,11 @@ Dbdict::trans_commit_complete_done(Signal* signal, //@todo check api failed sendSignal(trans_ptr.p->m_senderRef, GSN_CREATE_FILEGROUP_CONF, signal, CreateFilegroupConf::SignalLength, JBB); - break; } case GSN_CREATE_FILE_REQ:{ FilePtr f_ptr; + jam(); ndbrequire(c_file_hash.find(f_ptr, trans_ptr.p->m_op.m_obj_id)); CreateFileConf * conf = (CreateFileConf*)signal->getDataPtr(); 
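/*
  All commit/abort rounds in this file follow the broadcast/collect
  scheme visible in trans_commit_start_done() above; sketched here for
  clarity, using only names that appear in this file:

    // Broadcast one request to every participating DICT instance and
    // arm a SafeCounter with the same node set.
    NodeReceiverGroup rg(DBDICT, trans_ptr.p->m_nodes);
    SafeCounter tmp(c_counterMgr, trans_ptr.p->m_counter);
    tmp.init(rg, GSN_DICT_COMMIT_REF, trans_ptr.p->key);
    sendSignal(rg, GSN_DICT_COMMIT_REQ, signal,
               DictCommitReq::SignalLength, JBB);

  Each CONF/REF then funnels into schemaOp_reply(), which clears the
  answering node from the counter; only when the last node has replied
  does the switch on m_op.m_state advance the transaction state machine.
*/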
conf->senderRef = reference(); @@ -14591,11 +14625,11 @@ Dbdict::trans_commit_complete_done(Signal* signal, //@todo check api failed sendSignal(trans_ptr.p->m_senderRef, GSN_CREATE_FILE_CONF, signal, CreateFileConf::SignalLength, JBB); - break; } case GSN_DROP_FILE_REQ:{ DropFileConf * conf = (DropFileConf*)signal->getDataPtr(); + jam(); conf->senderRef = reference(); conf->senderData = trans_ptr.p->m_senderData; conf->fileId = trans_ptr.p->m_op.m_obj_id; @@ -14607,6 +14641,7 @@ Dbdict::trans_commit_complete_done(Signal* signal, } case GSN_DROP_FILEGROUP_REQ:{ DropFilegroupConf * conf = (DropFilegroupConf*)signal->getDataPtr(); + jam(); conf->senderRef = reference(); conf->senderData = trans_ptr.p->m_senderData; conf->filegroupId = trans_ptr.p->m_op.m_obj_id; @@ -14629,12 +14664,12 @@ Dbdict::trans_commit_complete_done(Signal* signal, void Dbdict::trans_abort_start_done(Signal* signal, Uint32 callbackData, - Uint32 retValue){ - jamEntry(); - - ndbrequire(retValue == 0); - + Uint32 retValue) +{ Ptr trans_ptr; + + jam(); + ndbrequire(retValue == 0); ndbrequire(c_Trans.find(trans_ptr, callbackData)); NodeReceiverGroup rg(DBDICT, trans_ptr.p->m_nodes); @@ -14652,12 +14687,12 @@ Dbdict::trans_abort_start_done(Signal* signal, void Dbdict::trans_abort_complete_done(Signal* signal, Uint32 callbackData, - Uint32 retValue){ - jamEntry(); - - ndbrequire(retValue == 0); - + Uint32 retValue) +{ Ptr trans_ptr; + + jam(); + ndbrequire(retValue == 0); ndbrequire(c_Trans.find(trans_ptr, callbackData)); switch(f_dict_op[trans_ptr.p->m_op.m_vt_index].m_gsn_user_req){ @@ -14665,6 +14700,7 @@ Dbdict::trans_abort_complete_done(Signal* signal, { // CreateFilegroupRef * ref = (CreateFilegroupRef*)signal->getDataPtr(); + jam(); ref->senderRef = reference(); ref->senderData = trans_ptr.p->m_senderData; ref->masterNodeId = c_masterNodeId; @@ -14676,12 +14712,12 @@ Dbdict::trans_abort_complete_done(Signal* signal, //@todo check api failed sendSignal(trans_ptr.p->m_senderRef, GSN_CREATE_FILEGROUP_REF, signal, CreateFilegroupRef::SignalLength, JBB); - break; } case GSN_CREATE_FILE_REQ: { CreateFileRef * ref = (CreateFileRef*)signal->getDataPtr(); + jam(); ref->senderRef = reference(); ref->senderData = trans_ptr.p->m_senderData; ref->masterNodeId = c_masterNodeId; @@ -14693,12 +14729,12 @@ Dbdict::trans_abort_complete_done(Signal* signal, //@todo check api failed sendSignal(trans_ptr.p->m_senderRef, GSN_CREATE_FILE_REF, signal, CreateFileRef::SignalLength, JBB); - break; } case GSN_DROP_FILE_REQ: { DropFileRef * ref = (DropFileRef*)signal->getDataPtr(); + jam(); ref->senderRef = reference(); ref->senderData = trans_ptr.p->m_senderData; ref->masterNodeId = c_masterNodeId; @@ -14709,13 +14745,13 @@ Dbdict::trans_abort_complete_done(Signal* signal, //@todo check api failed sendSignal(trans_ptr.p->m_senderRef, GSN_DROP_FILE_REF, signal, DropFileRef::SignalLength, JBB); - break; } case GSN_DROP_FILEGROUP_REQ: { // DropFilegroupRef * ref = (DropFilegroupRef*)signal->getDataPtr(); + jam(); ref->senderRef = reference(); ref->senderData = trans_ptr.p->m_senderData; ref->masterNodeId = c_masterNodeId; @@ -14726,7 +14762,6 @@ Dbdict::trans_abort_complete_done(Signal* signal, //@todo check api failed sendSignal(trans_ptr.p->m_senderRef, GSN_DROP_FILEGROUP_REF, signal, DropFilegroupRef::SignalLength, JBB); - break; } default: @@ -14740,7 +14775,8 @@ Dbdict::trans_abort_complete_done(Signal* signal, } void -Dbdict::execCREATE_OBJ_REQ(Signal* signal){ +Dbdict::execCREATE_OBJ_REQ(Signal* signal) +{ jamEntry(); 
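/*
  Most hunks in this patch only add jam()/jamEntry() calls. These are
  NDB's jump-address-memory trace macros: jamEntry() records arrival in
  a signal handler, and jam() records the current source line in a small
  per-thread ring buffer that is written to the trace file on crash, so
  the dump shows which branches were taken last. That is why every new
  if/else arm and switch case above gains a jam(). A rough sketch of the
  mechanism (buffer name and size are illustrative, not the real
  implementation):

    static Uint32 jam_lines[1024];                 // ring buffer
    static Uint32 jam_index= 0;
    #define JAM_SKETCH() \
      (jam_lines[jam_index++ & 1023]= (Uint32)__LINE__)
*/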
if(!assembleFragments(signal)){ @@ -14785,6 +14821,7 @@ Dbdict::execCREATE_OBJ_REQ(Signal* signal){ switch(objType){ case DictTabInfo::Tablespace: case DictTabInfo::LogfileGroup: + jam(); createObjPtr.p->m_vt_index = 0; break; case DictTabInfo::Datafile: @@ -14793,7 +14830,11 @@ Dbdict::execCREATE_OBJ_REQ(Signal* signal){ * Use restart code to impl. ForceCreateFile */ if (requestInfo & CreateFileReq::ForceCreateFile) - createObjPtr.p->m_restart= 2; + { + jam(); + createObjPtr.p->m_restart= 2; + } + jam(); createObjPtr.p->m_vt_index = 1; break; default: @@ -14809,10 +14850,10 @@ void Dbdict::execDICT_COMMIT_REQ(Signal* signal) { DictCommitReq* req = (DictCommitReq*)signal->getDataPtr(); - Ptr op; - ndbrequire(c_schemaOp.find(op, req->op_key)); + jamEntry(); + ndbrequire(c_schemaOp.find(op, req->op_key)); (this->*f_dict_op[op.p->m_vt_index].m_commit)(signal, op.p); } @@ -14820,23 +14861,23 @@ void Dbdict::execDICT_ABORT_REQ(Signal* signal) { DictAbortReq* req = (DictAbortReq*)signal->getDataPtr(); - Ptr op; - ndbrequire(c_schemaOp.find(op, req->op_key)); + jamEntry(); + ndbrequire(c_schemaOp.find(op, req->op_key)); (this->*f_dict_op[op.p->m_vt_index].m_abort)(signal, op.p); } void -Dbdict::execDICT_COMMIT_REF(Signal* signal){ - jamEntry(); - +Dbdict::execDICT_COMMIT_REF(Signal* signal) +{ DictCommitRef * const ref = (DictCommitRef*)signal->getDataPtr(); - Ptr trans_ptr; + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, ref->senderData)); - if(ref->errorCode != DictCommitRef::NF_FakeErrorREF){ + jam(); trans_ptr.p->setErrorCode(ref->errorCode); } Uint32 node = refToNode(ref->senderRef); @@ -14844,26 +14885,26 @@ Dbdict::execDICT_COMMIT_REF(Signal* signal){ } void -Dbdict::execDICT_COMMIT_CONF(Signal* signal){ - jamEntry(); - +Dbdict::execDICT_COMMIT_CONF(Signal* signal) +{ + Ptr trans_ptr; DictCommitConf * const conf = (DictCommitConf*)signal->getDataPtr(); - - Ptr trans_ptr; + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, conf->senderData)); schemaOp_reply(signal, trans_ptr.p, refToNode(conf->senderRef)); } void -Dbdict::execDICT_ABORT_REF(Signal* signal){ - jamEntry(); - +Dbdict::execDICT_ABORT_REF(Signal* signal) +{ DictAbortRef * const ref = (DictAbortRef*)signal->getDataPtr(); - Ptr trans_ptr; + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, ref->senderData)); - if(ref->errorCode != DictAbortRef::NF_FakeErrorREF){ + jam(); trans_ptr.p->setErrorCode(ref->errorCode); } Uint32 node = refToNode(ref->senderRef); @@ -14871,31 +14912,28 @@ Dbdict::execDICT_ABORT_REF(Signal* signal){ } void -Dbdict::execDICT_ABORT_CONF(Signal* signal){ - jamEntry(); - +Dbdict::execDICT_ABORT_CONF(Signal* signal) +{ DictAbortConf * const conf = (DictAbortConf*)signal->getDataPtr(); - Ptr trans_ptr; + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, conf->senderData)); schemaOp_reply(signal, trans_ptr.p, refToNode(conf->senderRef)); } - - void Dbdict::createObj_prepare_start_done(Signal* signal, Uint32 callbackData, - Uint32 returnCode){ + Uint32 returnCode) +{ + CreateObjRecordPtr createObjPtr; + SegmentedSectionPtr objInfoPtr; ndbrequire(returnCode == 0); - - CreateObjRecordPtr createObjPtr; ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); - - SegmentedSectionPtr objInfoPtr; + jam(); getSection(objInfoPtr, createObjPtr.p->m_obj_info_ptr_i); - if(createObjPtr.p->m_errorCode != 0){ jam(); createObjPtr.p->m_obj_info_ptr_i= RNIL; @@ -14923,19 +14961,19 @@ Dbdict::createObj_prepare_start_done(Signal* signal, void Dbdict::createObj_writeSchemaConf1(Signal* signal, Uint32 callbackData, - Uint32 
returnCode){ - jam(); - - ndbrequire(returnCode == 0); - + Uint32 returnCode) +{ CreateObjRecordPtr createObjPtr; + Callback callback; + SegmentedSectionPtr objInfoPtr; + + jam(); + ndbrequire(returnCode == 0); ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); - Callback callback; callback.m_callbackData = createObjPtr.p->key; callback.m_callbackFunction = safe_cast(&Dbdict::createObj_writeObjConf); - SegmentedSectionPtr objInfoPtr; getSection(objInfoPtr, createObjPtr.p->m_obj_info_ptr_i); writeTableFile(signal, createObjPtr.p->m_obj_id, objInfoPtr, &callback); @@ -14947,14 +14985,13 @@ Dbdict::createObj_writeSchemaConf1(Signal* signal, void Dbdict::createObj_writeObjConf(Signal* signal, Uint32 callbackData, - Uint32 returnCode){ + Uint32 returnCode) +{ + CreateObjRecordPtr createObjPtr; + jam(); - ndbrequire(returnCode == 0); - - CreateObjRecordPtr createObjPtr; ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); - createObjPtr.p->m_callback.m_callbackFunction = safe_cast(&Dbdict::createObj_prepare_complete_done); (this->*f_dict_op[createObjPtr.p->m_vt_index].m_prepare_complete) @@ -14964,12 +15001,12 @@ Dbdict::createObj_writeObjConf(Signal* signal, void Dbdict::createObj_prepare_complete_done(Signal* signal, Uint32 callbackData, - Uint32 returnCode){ + Uint32 returnCode) +{ + CreateObjRecordPtr createObjPtr; + jam(); - ndbrequire(returnCode == 0); - - CreateObjRecordPtr createObjPtr; ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); //@todo check for master failed @@ -14998,28 +15035,33 @@ Dbdict::createObj_prepare_complete_done(Signal* signal, } void -Dbdict::createObj_commit(Signal * signal, SchemaOp * op){ - jam(); - +Dbdict::createObj_commit(Signal * signal, SchemaOp * op) +{ OpCreateObj * createObj = (OpCreateObj*)op; + createObj->m_callback.m_callbackFunction = safe_cast(&Dbdict::createObj_commit_start_done); if (f_dict_op[createObj->m_vt_index].m_commit_start) + { + jam(); (this->*f_dict_op[createObj->m_vt_index].m_commit_start)(signal, createObj); + } else + { + jam(); execute(signal, createObj->m_callback, 0); + } } void Dbdict::createObj_commit_start_done(Signal* signal, Uint32 callbackData, - Uint32 returnCode){ + Uint32 returnCode) +{ + CreateObjRecordPtr createObjPtr; jam(); - ndbrequire(returnCode == 0); - - CreateObjRecordPtr createObjPtr; ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); Uint32 objId = createObjPtr.p->m_obj_id; @@ -15039,29 +15081,35 @@ Dbdict::createObj_commit_start_done(Signal* signal, void Dbdict::createObj_writeSchemaConf2(Signal* signal, Uint32 callbackData, - Uint32 returnCode){ - jam(); - - CreateObjRecordPtr createObjPtr; - ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); + Uint32 returnCode) +{ + CreateObjRecordPtr createObjPtr; + ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); createObjPtr.p->m_callback.m_callbackFunction = safe_cast(&Dbdict::createObj_commit_complete_done); if (f_dict_op[createObjPtr.p->m_vt_index].m_commit_complete) + { + jam(); (this->*f_dict_op[createObjPtr.p->m_vt_index].m_commit_complete) (signal, createObjPtr.p); + } else + { + jam(); execute(signal, createObjPtr.p->m_callback, 0); + } } void Dbdict::createObj_commit_complete_done(Signal* signal, Uint32 callbackData, - Uint32 returnCode){ + Uint32 returnCode) +{ + CreateObjRecordPtr createObjPtr; + jam(); - - CreateObjRecordPtr createObjPtr; ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); //@todo check error @@ -15079,27 +15127,31 @@ Dbdict::createObj_commit_complete_done(Signal* signal, void 
Dbdict::createObj_abort(Signal* signal, SchemaOp* op) { - jam(); - OpCreateObj * createObj = (OpCreateObj*)op; createObj->m_callback.m_callbackFunction = safe_cast(&Dbdict::createObj_abort_start_done); if (f_dict_op[createObj->m_vt_index].m_abort_start) + { + jam(); (this->*f_dict_op[createObj->m_vt_index].m_abort_start)(signal, createObj); + } else + { + jam(); execute(signal, createObj->m_callback, 0); + } } void Dbdict::createObj_abort_start_done(Signal* signal, Uint32 callbackData, - Uint32 returnCode){ + Uint32 returnCode) +{ + CreateObjRecordPtr createObjPtr; + jam(); - - CreateObjRecordPtr createObjPtr; ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); - XSchemaFile * xsf = &c_schemaFile[c_schemaRecord.schemaPage != 0]; SchemaFile::TableEntry objEntry = * getTableEntry(xsf, createObjPtr.p->m_obj_id); @@ -15118,19 +15170,23 @@ Dbdict::createObj_abort_writeSchemaConf(Signal* signal, Uint32 callbackData, Uint32 returnCode) { - jam(); + CreateObjRecordPtr createObjPtr; - CreateObjRecordPtr createObjPtr; ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); - createObjPtr.p->m_callback.m_callbackFunction = safe_cast(&Dbdict::createObj_abort_complete_done); if (f_dict_op[createObjPtr.p->m_vt_index].m_abort_complete) + { + jam(); (this->*f_dict_op[createObjPtr.p->m_vt_index].m_abort_complete) (signal, createObjPtr.p); + } else + { + jam(); execute(signal, createObjPtr.p->m_callback, 0); + } } void @@ -15138,9 +15194,9 @@ Dbdict::createObj_abort_complete_done(Signal* signal, Uint32 callbackData, Uint32 returnCode) { - jam(); + CreateObjRecordPtr createObjPtr; - CreateObjRecordPtr createObjPtr; + jam(); ndbrequire(c_opCreateObj.find(createObjPtr, callbackData)); DictAbortConf * const conf = (DictAbortConf*)signal->getDataPtr(); @@ -15153,7 +15209,8 @@ Dbdict::createObj_abort_complete_done(Signal* signal, } void -Dbdict::execDROP_OBJ_REQ(Signal* signal){ +Dbdict::execDROP_OBJ_REQ(Signal* signal) +{ jamEntry(); if(!assembleFragments(signal)){ @@ -15191,8 +15248,9 @@ Dbdict::execDROP_OBJ_REQ(Signal* signal){ case DictTabInfo::Tablespace: case DictTabInfo::LogfileGroup: { - dropObjPtr.p->m_vt_index = 3; Ptr fg_ptr; + jam(); + dropObjPtr.p->m_vt_index = 3; ndbrequire(c_filegroup_hash.find(fg_ptr, objId)); dropObjPtr.p->m_obj_ptr_i = fg_ptr.i; break; @@ -15200,15 +15258,19 @@ Dbdict::execDROP_OBJ_REQ(Signal* signal){ } case DictTabInfo::Datafile: { - dropObjPtr.p->m_vt_index = 2; Ptr file_ptr; + jam(); + dropObjPtr.p->m_vt_index = 2; ndbrequire(c_file_hash.find(file_ptr, objId)); dropObjPtr.p->m_obj_ptr_i = file_ptr.i; break; } case DictTabInfo::Undofile: + { + jam(); dropObjPtr.p->m_vt_index = 4; return; + } default: ndbrequire(false); } @@ -15223,12 +15285,12 @@ Dbdict::dropObj_prepare_start_done(Signal* signal, Uint32 callbackData, Uint32 returnCode) { - ndbrequire(returnCode == 0); + DropObjRecordPtr dropObjPtr; + Callback cb; - DropObjRecordPtr dropObjPtr; + ndbrequire(returnCode == 0); ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); - Callback cb; cb.m_callbackData = callbackData; cb.m_callbackFunction = safe_cast(&Dbdict::dropObj_prepare_writeSchemaConf); @@ -15239,7 +15301,7 @@ Dbdict::dropObj_prepare_start_done(Signal* signal, dropObj_prepare_complete_done(signal, callbackData, 0); return; } - + jam(); Uint32 objId = dropObjPtr.p->m_obj_id; XSchemaFile * xsf = &c_schemaFile[c_schemaRecord.schemaPage != 0]; SchemaFile::TableEntry objEntry = *getTableEntry(xsf, objId); @@ -15252,19 +15314,23 @@ Dbdict::dropObj_prepare_writeSchemaConf(Signal* signal, Uint32 
callbackData, Uint32 returnCode) { - ndbrequire(returnCode == 0); + DropObjRecordPtr dropObjPtr; - DropObjRecordPtr dropObjPtr; + ndbrequire(returnCode == 0); ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); - dropObjPtr.p->m_callback.m_callbackFunction = safe_cast(&Dbdict::dropObj_prepare_complete_done); - if(f_dict_op[dropObjPtr.p->m_vt_index].m_prepare_complete) + { + jam(); (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_prepare_complete) (signal, dropObjPtr.p); + } else + { + jam(); execute(signal, dropObjPtr.p->m_callback, 0); + } } void @@ -15272,10 +15338,11 @@ Dbdict::dropObj_prepare_complete_done(Signal* signal, Uint32 callbackData, Uint32 returnCode) { + DropObjRecordPtr dropObjPtr; + ndbrequire(returnCode == 0); - - DropObjRecordPtr dropObjPtr; ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); + jam(); //@todo check for master failed @@ -15301,16 +15368,22 @@ Dbdict::dropObj_prepare_complete_done(Signal* signal, } void -Dbdict::dropObj_commit(Signal * signal, SchemaOp * op){ - jam(); - +Dbdict::dropObj_commit(Signal * signal, SchemaOp * op) +{ OpDropObj * dropObj = (OpDropObj*)op; + dropObj->m_callback.m_callbackFunction = safe_cast(&Dbdict::dropObj_commit_start_done); if (f_dict_op[dropObj->m_vt_index].m_commit_start) + { + jam(); (this->*f_dict_op[dropObj->m_vt_index].m_commit_start)(signal, dropObj); + } else + { + jam(); execute(signal, dropObj->m_callback, 0); + } } void @@ -15318,10 +15391,10 @@ Dbdict::dropObj_commit_start_done(Signal* signal, Uint32 callbackData, Uint32 returnCode) { + DropObjRecordPtr dropObjPtr; + jam(); ndbrequire(returnCode == 0); - - DropObjRecordPtr dropObjPtr; ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); Uint32 objId = dropObjPtr.p->m_obj_id; @@ -15342,20 +15415,25 @@ Dbdict::dropObj_commit_writeSchemaConf(Signal* signal, Uint32 callbackData, Uint32 returnCode) { + DropObjRecordPtr dropObjPtr; + jam(); ndbrequire(returnCode == 0); - - DropObjRecordPtr dropObjPtr; ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); - dropObjPtr.p->m_callback.m_callbackFunction = safe_cast(&Dbdict::dropObj_commit_complete_done); if(f_dict_op[dropObjPtr.p->m_vt_index].m_commit_complete) + { + jam(); (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_commit_complete) (signal, dropObjPtr.p); + } else + { + jam(); execute(signal, dropObjPtr.p->m_callback, 0); + } } void @@ -15363,7 +15441,9 @@ Dbdict::dropObj_commit_complete_done(Signal* signal, Uint32 callbackData, Uint32 returnCode) { - DropObjRecordPtr dropObjPtr; + DropObjRecordPtr dropObjPtr; + + jam(); ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); //@todo check error @@ -15374,22 +15454,26 @@ Dbdict::dropObj_commit_complete_done(Signal* signal, conf->senderData = dropObjPtr.p->m_senderData; sendSignal(dropObjPtr.p->m_senderRef, GSN_DICT_COMMIT_CONF, signal, DictCommitConf::SignalLength, JBB); - c_opDropObj.release(dropObjPtr); } void -Dbdict::dropObj_abort(Signal * signal, SchemaOp * op){ - jam(); - +Dbdict::dropObj_abort(Signal * signal, SchemaOp * op) +{ OpDropObj * dropObj = (OpDropObj*)op; + dropObj->m_callback.m_callbackFunction = safe_cast(&Dbdict::dropObj_abort_start_done); - if (f_dict_op[dropObj->m_vt_index].m_abort_start) + { + jam(); (this->*f_dict_op[dropObj->m_vt_index].m_abort_start)(signal, dropObj); + } else + { + jam(); execute(signal, dropObj->m_callback, 0); + } } void @@ -15397,10 +15481,10 @@ Dbdict::dropObj_abort_start_done(Signal* signal, Uint32 callbackData, Uint32 returnCode) { + DropObjRecordPtr dropObjPtr; + jam(); ndbrequire(returnCode == 0); - - 
DropObjRecordPtr dropObjPtr; ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); XSchemaFile * xsf = &c_schemaFile[c_schemaRecord.schemaPage != 0]; @@ -15421,6 +15505,7 @@ Dbdict::dropObj_abort_start_done(Signal* signal, } else { + jam(); execute(signal, callback, 0); } } @@ -15430,20 +15515,24 @@ Dbdict::dropObj_abort_writeSchemaConf(Signal* signal, Uint32 callbackData, Uint32 returnCode) { - jam(); + DropObjRecordPtr dropObjPtr; + ndbrequire(returnCode == 0); - - DropObjRecordPtr dropObjPtr; ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); - dropObjPtr.p->m_callback.m_callbackFunction = safe_cast(&Dbdict::dropObj_abort_complete_done); if(f_dict_op[dropObjPtr.p->m_vt_index].m_abort_complete) + { + jam(); (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_abort_complete) (signal, dropObjPtr.p); + } else + { + jam(); execute(signal, dropObjPtr.p->m_callback, 0); + } } void @@ -15451,24 +15540,26 @@ Dbdict::dropObj_abort_complete_done(Signal* signal, Uint32 callbackData, Uint32 returnCode) { - DropObjRecordPtr dropObjPtr; - ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); - + DropObjRecordPtr dropObjPtr; DictAbortConf * const conf = (DictAbortConf*)signal->getDataPtr(); + + ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); + jam(); conf->senderRef = reference(); conf->senderData = dropObjPtr.p->m_senderData; sendSignal(dropObjPtr.p->m_senderRef, GSN_DICT_ABORT_CONF, signal, DictAbortConf::SignalLength, JBB); - c_opDropObj.release(dropObjPtr); } void -Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ +Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op) +{ /** * Put data into table record */ SegmentedSectionPtr objInfoPtr; + jam(); getSection(objInfoPtr, ((OpCreateObj*)op)->m_obj_info_ptr_i); SimplePropertiesSectionReader it(objInfoPtr, getSectionSegmentPool()); @@ -15485,6 +15576,7 @@ Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ if(status != SimpleProperties::Eof) { + jam(); op->m_errorCode = CreateTableRef::InvalidFormat; break; } @@ -15493,6 +15585,7 @@ Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ { if(!fg.TS_ExtentSize) { + jam(); op->m_errorCode = CreateFilegroupRef::InvalidExtentSize; break; } @@ -15504,6 +15597,7 @@ Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ */ if(fg.LF_UndoBufferSize < 3 * File_formats::NDB_PAGE_SIZE) { + jam(); op->m_errorCode = CreateFilegroupRef::InvalidUndoBufferSize; break; } @@ -15512,16 +15606,19 @@ Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ Uint32 len = strlen(fg.FilegroupName) + 1; Uint32 hash = Rope::hash(fg.FilegroupName, len); if(get_object(fg.FilegroupName, len, hash) != 0){ + jam(); op->m_errorCode = CreateTableRef::TableAlreadyExist; break; } if(!c_obj_pool.seize(obj_ptr)){ + jam(); op->m_errorCode = CreateTableRef::NoMoreTableRecords; break; } if(!c_filegroup_pool.seize(fg_ptr)){ + jam(); op->m_errorCode = CreateTableRef::NoMoreTableRecords; break; } @@ -15531,6 +15628,7 @@ Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ { Rope name(c_rope_pool, obj_ptr.p->m_name); if(!name.assign(fg.FilegroupName, len, hash)){ + jam(); op->m_errorCode = CreateTableRef::OutOfStringBuffer; break; } @@ -15544,6 +15642,7 @@ Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ switch(fg.FilegroupType){ case DictTabInfo::Tablespace: + { //fg.TS_DataGrow = group.m_grow_spec; fg_ptr.p->m_tablespace.m_extent_size = fg.TS_ExtentSize; fg_ptr.p->m_tablespace.m_default_logfile_group_id = fg.TS_LogfileGroupId; @@ -15551,22 +15650,28 @@ 
Dbdict::create_fg_prepare_start(Signal* signal, SchemaOp* op){ Ptr lg_ptr; if (!c_filegroup_hash.find(lg_ptr, fg.TS_LogfileGroupId)) { + jam(); op->m_errorCode = CreateFilegroupRef::NoSuchLogfileGroup; goto error; } if (lg_ptr.p->m_version != fg.TS_LogfileGroupVersion) { + jam(); op->m_errorCode = CreateFilegroupRef::InvalidFilegroupVersion; goto error; } increase_ref_count(lg_ptr.p->m_obj_ptr_i); break; + } case DictTabInfo::LogfileGroup: + { + jam(); fg_ptr.p->m_logfilegroup.m_undo_buffer_size = fg.LF_UndoBufferSize; fg_ptr.p->m_logfilegroup.m_files.init(); //fg.LF_UndoGrow = ; break; + } default: ndbrequire(false); } @@ -15601,13 +15706,14 @@ error: } void -Dbdict::create_fg_prepare_complete(Signal* signal, SchemaOp* op){ +Dbdict::create_fg_prepare_complete(Signal* signal, SchemaOp* op) +{ /** * CONTACT TSMAN LGMAN PGMAN */ CreateFilegroupImplReq* req = (CreateFilegroupImplReq*)signal->getDataPtrSend(); - + jam(); req->senderData = op->key; req->senderRef = reference(); req->filegroup_id = op->m_obj_id; @@ -15620,18 +15726,24 @@ Dbdict::create_fg_prepare_complete(Signal* signal, SchemaOp* op){ Uint32 len= 0; switch(op->m_obj_type){ case DictTabInfo::Tablespace: + { + jam(); ref = TSMAN_REF; len = CreateFilegroupImplReq::TablespaceLength; req->tablespace.extent_size = fg_ptr.p->m_tablespace.m_extent_size; req->tablespace.logfile_group_id = fg_ptr.p->m_tablespace.m_default_logfile_group_id; break; + } case DictTabInfo::LogfileGroup: + { + jam(); ref = LGMAN_REF; len = CreateFilegroupImplReq::LogfileGroupLength; req->logfile_group.buffer_size = fg_ptr.p->m_logfilegroup.m_undo_buffer_size; break; + } default: ndbrequire(false); } @@ -15640,12 +15752,11 @@ Dbdict::create_fg_prepare_complete(Signal* signal, SchemaOp* op){ } void -Dbdict::execCREATE_FILEGROUP_REF(Signal* signal){ - jamEntry(); - +Dbdict::execCREATE_FILEGROUP_REF(Signal* signal) +{ CreateFilegroupImplRef * ref = (CreateFilegroupImplRef*)signal->getDataPtr(); - CreateObjRecordPtr op_ptr; + jamEntry(); ndbrequire(c_opCreateObj.find(op_ptr, ref->senderData)); op_ptr.p->m_errorCode = ref->errorCode; @@ -15653,13 +15764,12 @@ Dbdict::execCREATE_FILEGROUP_REF(Signal* signal){ } void -Dbdict::execCREATE_FILEGROUP_CONF(Signal* signal){ - jamEntry(); - +Dbdict::execCREATE_FILEGROUP_CONF(Signal* signal) +{ CreateFilegroupImplConf * rep = (CreateFilegroupImplConf*)signal->getDataPtr(); - CreateObjRecordPtr op_ptr; + jamEntry(); ndbrequire(c_opCreateObj.find(op_ptr, rep->senderData)); execute(signal, op_ptr.p->m_callback, 0); @@ -15675,13 +15785,13 @@ Dbdict::create_fg_abort_start(Signal* signal, SchemaOp* op){ send_drop_fg(signal, op, DropFilegroupImplReq::Commit); return; } - + jam(); execute(signal, op->m_callback, 0); } void -Dbdict::create_fg_abort_complete(Signal* signal, SchemaOp* op){ - +Dbdict::create_fg_abort_complete(Signal* signal, SchemaOp* op) +{ if (op->m_obj_ptr_i != RNIL) { jam(); @@ -15691,12 +15801,13 @@ Dbdict::create_fg_abort_complete(Signal* signal, SchemaOp* op){ release_object(fg_ptr.p->m_obj_ptr_i); c_filegroup_hash.release(fg_ptr); } - + jam(); execute(signal, op->m_callback, 0); } void -Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op){ +Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op) +{ /** * Put data into table record */ @@ -15716,6 +15827,7 @@ Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op){ do { if(status != SimpleProperties::Eof){ + jam(); op->m_errorCode = CreateFileRef::InvalidFormat; break; } @@ -15723,34 +15835,53 @@ Dbdict::create_file_prepare_start(Signal* 
signal, SchemaOp* op){ // Get Filegroup FilegroupPtr fg_ptr; if(!c_filegroup_hash.find(fg_ptr, f.FilegroupId)){ + jam(); op->m_errorCode = CreateFileRef::NoSuchFilegroup; break; } if(fg_ptr.p->m_version != f.FilegroupVersion){ + jam(); op->m_errorCode = CreateFileRef::InvalidFilegroupVersion; break; } switch(f.FileType){ case DictTabInfo::Datafile: + { if(fg_ptr.p->m_type != DictTabInfo::Tablespace) + { + jam(); op->m_errorCode = CreateFileRef::InvalidFileType; + } + jam(); break; + } case DictTabInfo::Undofile: + { if(fg_ptr.p->m_type != DictTabInfo::LogfileGroup) + { + jam(); op->m_errorCode = CreateFileRef::InvalidFileType; + } + jam(); break; + } default: + jam(); op->m_errorCode = CreateFileRef::InvalidFileType; } if(op->m_errorCode) + { + jam(); break; + } Uint32 len = strlen(f.FileName) + 1; Uint32 hash = Rope::hash(f.FileName, len); if(get_object(f.FileName, len, hash) != 0){ + jam(); op->m_errorCode = CreateFileRef::FilenameAlreadyExists; break; } @@ -15761,6 +15892,7 @@ Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op){ m_ctx.m_config.getOwnConfigIterator(); if(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &dl) && dl) { + jam(); op->m_errorCode = CreateFileRef::NotSupportedWhenDiskless; break; } @@ -15768,11 +15900,13 @@ Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op){ // Loop through all filenames... if(!c_obj_pool.seize(obj_ptr)){ + jam(); op->m_errorCode = CreateTableRef::NoMoreTableRecords; break; } if (! c_file_pool.seize(filePtr)){ + jam(); op->m_errorCode = CreateFileRef::OutOfFileRecords; break; } @@ -15782,6 +15916,7 @@ Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op){ { Rope name(c_rope_pool, obj_ptr.p->m_name); if(!name.assign(f.FileName, len, hash)){ + jam(); op->m_errorCode = CreateTableRef::OutOfStringBuffer; break; } @@ -15789,10 +15924,14 @@ Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op){ switch(fg_ptr.p->m_type){ case DictTabInfo::Tablespace: + { + jam(); increase_ref_count(fg_ptr.p->m_obj_ptr_i); break; + } case DictTabInfo::LogfileGroup: { + jam(); Local_file_list list(c_file_pool, fg_ptr.p->m_logfilegroup.m_files); list.add(filePtr); break; @@ -15836,37 +15975,46 @@ Dbdict::create_file_prepare_start(Signal* signal, SchemaOp* op){ c_obj_pool.release(obj_ptr); } } - execute(signal, op->m_callback, 0); } void -Dbdict::create_file_prepare_complete(Signal* signal, SchemaOp* op){ +Dbdict::create_file_prepare_complete(Signal* signal, SchemaOp* op) +{ /** * CONTACT TSMAN LGMAN PGMAN */ CreateFileImplReq* req = (CreateFileImplReq*)signal->getDataPtrSend(); - FilePtr f_ptr; - c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); - FilegroupPtr fg_ptr; + + jam(); + c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); req->senderData = op->key; req->senderRef = reference(); switch(((OpCreateObj*)op)->m_restart){ case 0: + { + jam(); req->requestInfo = CreateFileImplReq::Create; break; + } case 1: + { + jam(); req->requestInfo = CreateFileImplReq::Open; break; + } case 2: + { + jam(); req->requestInfo = CreateFileImplReq::CreateForce; break; } + } req->file_id = f_ptr.p->key; req->filegroup_id = f_ptr.p->m_filegroup_id; @@ -15878,14 +16026,20 @@ Dbdict::create_file_prepare_complete(Signal* signal, SchemaOp* op){ Uint32 len= 0; switch(op->m_obj_type){ case DictTabInfo::Datafile: + { + jam(); ref = TSMAN_REF; len = CreateFileImplReq::DatafileLength; req->tablespace.extent_size = fg_ptr.p->m_tablespace.m_extent_size; break; + } case DictTabInfo::Undofile: + { + 
jam(); ref = LGMAN_REF; len = CreateFileImplReq::UndofileLength; break; + } default: ndbrequire(false); } @@ -15900,42 +16054,41 @@ Dbdict::create_file_prepare_complete(Signal* signal, SchemaOp* op){ } void -Dbdict::execCREATE_FILE_REF(Signal* signal){ - jamEntry(); - +Dbdict::execCREATE_FILE_REF(Signal* signal) +{ CreateFileImplRef * ref = (CreateFileImplRef*)signal->getDataPtr(); - CreateObjRecordPtr op_ptr; + + jamEntry(); ndbrequire(c_opCreateObj.find(op_ptr, ref->senderData)); op_ptr.p->m_errorCode = ref->errorCode; - execute(signal, op_ptr.p->m_callback, 0); } void -Dbdict::execCREATE_FILE_CONF(Signal* signal){ - jamEntry(); - +Dbdict::execCREATE_FILE_CONF(Signal* signal) +{ CreateFileImplConf * rep = (CreateFileImplConf*)signal->getDataPtr(); - CreateObjRecordPtr op_ptr; + + jamEntry(); ndbrequire(c_opCreateObj.find(op_ptr, rep->senderData)); - execute(signal, op_ptr.p->m_callback, 0); } void -Dbdict::create_file_commit_start(Signal* signal, SchemaOp* op){ +Dbdict::create_file_commit_start(Signal* signal, SchemaOp* op) +{ /** * CONTACT TSMAN LGMAN PGMAN */ CreateFileImplReq* req = (CreateFileImplReq*)signal->getDataPtrSend(); - FilePtr f_ptr; - c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); - FilegroupPtr fg_ptr; + + jam(); + c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); req->senderData = op->key; @@ -15949,15 +16102,20 @@ Dbdict::create_file_commit_start(Signal* signal, SchemaOp* op){ Uint32 ref= 0; switch(op->m_obj_type){ case DictTabInfo::Datafile: + { + jam(); ref = TSMAN_REF; break; + } case DictTabInfo::Undofile: + { + jam(); ref = LGMAN_REF; break; + } default: ndbrequire(false); } - sendSignal(ref, GSN_CREATE_FILE_REQ, signal, CreateFileImplReq::CommitLength, JBB); } @@ -15970,9 +16128,11 @@ Dbdict::create_file_abort_start(Signal* signal, SchemaOp* op) if (op->m_obj_ptr_i != RNIL) { FilePtr f_ptr; + FilegroupPtr fg_ptr; + + jam(); c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); - FilegroupPtr fg_ptr; ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); req->senderData = op->key; @@ -15986,20 +16146,24 @@ Dbdict::create_file_abort_start(Signal* signal, SchemaOp* op) Uint32 ref= 0; switch(op->m_obj_type){ case DictTabInfo::Datafile: + { + jam(); ref = TSMAN_REF; break; + } case DictTabInfo::Undofile: + { + jam(); ref = LGMAN_REF; break; + } default: ndbrequire(false); } - sendSignal(ref, GSN_CREATE_FILE_REQ, signal, CreateFileImplReq::AbortLength, JBB); return; } - execute(signal, op->m_callback, 0); } @@ -16009,17 +16173,21 @@ Dbdict::create_file_abort_complete(Signal* signal, SchemaOp* op) if (op->m_obj_ptr_i != RNIL) { FilePtr f_ptr; - c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); - FilegroupPtr fg_ptr; + + jam(); + c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); - switch(fg_ptr.p->m_type){ case DictTabInfo::Tablespace: + { + jam(); decrease_ref_count(fg_ptr.p->m_obj_ptr_i); break; + } case DictTabInfo::LogfileGroup: { + jam(); Local_file_list list(c_file_pool, fg_ptr.p->m_logfilegroup.m_files); list.remove(f_ptr); break; @@ -16031,19 +16199,20 @@ Dbdict::create_file_abort_complete(Signal* signal, SchemaOp* op) release_object(f_ptr.p->m_obj_ptr_i); c_file_hash.release(f_ptr); } - execute(signal, op->m_callback, 0); } void Dbdict::drop_file_prepare_start(Signal* signal, SchemaOp* op) { + jam(); send_drop_file(signal, op, DropFileImplReq::Prepare); } void Dbdict::drop_undofile_prepare_start(Signal* signal, SchemaOp* op) { + jam(); 
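/*
  The switch on m_obj_type repeated in the functions above routes file
  operations to the kernel block that owns the object: datafiles belong
  to the tablespace manager (TSMAN), undofiles to the logfile group
  manager (LGMAN). Condensed sketch (pick_block_ref is an illustrative
  name):

    static Uint32 pick_block_ref(Uint32 obj_type)
    {
      switch (obj_type) {
      case DictTabInfo::Datafile: return TSMAN_REF;  // tablespace mgr
      case DictTabInfo::Undofile: return LGMAN_REF;  // logfile grp mgr
      default: ndbrequire(false); return 0;          // unreachable
      }
    }
*/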
op->m_errorCode = DropFileRef::DropUndoFileNotSupported; execute(signal, op->m_callback, 0); } @@ -16051,6 +16220,7 @@ Dbdict::drop_undofile_prepare_start(Signal* signal, SchemaOp* op) void Dbdict::drop_file_commit_start(Signal* signal, SchemaOp* op) { + jam(); send_drop_file(signal, op, DropFileImplReq::Commit); } @@ -16058,21 +16228,21 @@ void Dbdict::drop_file_commit_complete(Signal* signal, SchemaOp* op) { FilePtr f_ptr; - c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); - FilegroupPtr fg_ptr; - ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); + jam(); + c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); + ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); decrease_ref_count(fg_ptr.p->m_obj_ptr_i); release_object(f_ptr.p->m_obj_ptr_i); c_file_hash.release(f_ptr); - execute(signal, op->m_callback, 0); } void Dbdict::drop_file_abort_start(Signal* signal, SchemaOp* op) { + jam(); send_drop_file(signal, op, DropFileImplReq::Abort); } @@ -16081,11 +16251,11 @@ Dbdict::send_drop_file(Signal* signal, SchemaOp* op, DropFileImplReq::RequestInfo type) { DropFileImplReq* req = (DropFileImplReq*)signal->getDataPtrSend(); - FilePtr f_ptr; - c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); - FilegroupPtr fg_ptr; + + jam(); + c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); req->senderData = op->key; @@ -16099,29 +16269,34 @@ Dbdict::send_drop_file(Signal* signal, SchemaOp* op, Uint32 ref= 0; switch(op->m_obj_type){ case DictTabInfo::Datafile: + { + jam(); ref = TSMAN_REF; break; + } case DictTabInfo::Undofile: + { + jam(); ref = LGMAN_REF; break; + } default: ndbrequire(false); } - sendSignal(ref, GSN_DROP_FILE_REQ, signal, DropFileImplReq::SignalLength, JBB); } void -Dbdict::execDROP_OBJ_REF(Signal* signal){ - jamEntry(); - +Dbdict::execDROP_OBJ_REF(Signal* signal) +{ DropObjRef * const ref = (DropObjRef*)signal->getDataPtr(); - Ptr trans_ptr; + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, ref->senderData)); - if(ref->errorCode != DropObjRef::NF_FakeErrorREF){ + jam(); trans_ptr.p->setErrorCode(ref->errorCode); } Uint32 node = refToNode(ref->senderRef); @@ -16129,65 +16304,61 @@ Dbdict::execDROP_OBJ_REF(Signal* signal){ } void -Dbdict::execDROP_OBJ_CONF(Signal* signal){ - jamEntry(); - +Dbdict::execDROP_OBJ_CONF(Signal* signal) +{ DropObjConf * const conf = (DropObjConf*)signal->getDataPtr(); - Ptr trans_ptr; + + jamEntry(); ndbrequire(c_Trans.find(trans_ptr, conf->senderData)); schemaOp_reply(signal, trans_ptr.p, refToNode(conf->senderRef)); } void -Dbdict::execDROP_FILE_REF(Signal* signal){ - jamEntry(); - +Dbdict::execDROP_FILE_REF(Signal* signal) +{ DropFileImplRef * ref = (DropFileImplRef*)signal->getDataPtr(); - DropObjRecordPtr op_ptr; + + jamEntry(); ndbrequire(c_opDropObj.find(op_ptr, ref->senderData)); op_ptr.p->m_errorCode = ref->errorCode; - execute(signal, op_ptr.p->m_callback, 0); } void -Dbdict::execDROP_FILE_CONF(Signal* signal){ - jamEntry(); - +Dbdict::execDROP_FILE_CONF(Signal* signal) +{ DropFileImplConf * rep = (DropFileImplConf*)signal->getDataPtr(); - DropObjRecordPtr op_ptr; + + jamEntry(); ndbrequire(c_opDropObj.find(op_ptr, rep->senderData)); - execute(signal, op_ptr.p->m_callback, 0); } void -Dbdict::execDROP_FILEGROUP_REF(Signal* signal){ - jamEntry(); - +Dbdict::execDROP_FILEGROUP_REF(Signal* signal) +{ DropFilegroupImplRef * ref = (DropFilegroupImplRef*)signal->getDataPtr(); - DropObjRecordPtr op_ptr; + + jamEntry(); ndbrequire(c_opDropObj.find(op_ptr, ref->senderData)); 
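/*
  The surrounding REF/CONF handlers all share one shape: senderData in
  the reply is the key of the operation record, which must still exist;
  a REF additionally deposits its errorCode on the record before the
  stored callback chain is resumed. Sketched (handle_reply is an
  illustrative name):

    void Dbdict::handle_reply(Signal* signal, Uint32 senderData,
                              Uint32 errorCode)
    {
      DropObjRecordPtr op_ptr;
      ndbrequire(c_opDropObj.find(op_ptr, senderData)); // must exist
      if (errorCode)
        op_ptr.p->m_errorCode= errorCode;      // remember the failure
      execute(signal, op_ptr.p->m_callback, 0);  // resume state machine
    }
*/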
op_ptr.p->m_errorCode = ref->errorCode; - execute(signal, op_ptr.p->m_callback, 0); } void -Dbdict::execDROP_FILEGROUP_CONF(Signal* signal){ - jamEntry(); - +Dbdict::execDROP_FILEGROUP_CONF(Signal* signal) +{ DropFilegroupImplConf * rep = (DropFilegroupImplConf*)signal->getDataPtr(); - DropObjRecordPtr op_ptr; + + jamEntry(); ndbrequire(c_opDropObj.find(op_ptr, rep->senderData)); - execute(signal, op_ptr.p->m_callback, 0); } @@ -16200,11 +16371,13 @@ Dbdict::drop_fg_prepare_start(Signal* signal, SchemaOp* op) DictObject * obj = c_obj_pool.getPtr(fg_ptr.p->m_obj_ptr_i); if (obj->m_ref_count) { + jam(); op->m_errorCode = DropFilegroupRef::FilegroupInUse; execute(signal, op->m_callback, 0); } else { + jam(); send_drop_fg(signal, op, DropFilegroupImplReq::Prepare); } } @@ -16216,7 +16389,7 @@ Dbdict::drop_fg_commit_start(Signal* signal, SchemaOp* op) c_filegroup_pool.getPtr(fg_ptr, op->m_obj_ptr_i); if (op->m_obj_type == DictTabInfo::LogfileGroup) { - + jam(); /** * Mark all undofiles as dropped */ @@ -16225,6 +16398,7 @@ Dbdict::drop_fg_commit_start(Signal* signal, SchemaOp* op) XSchemaFile * xsf = &c_schemaFile[c_schemaRecord.schemaPage != 0]; for(list.first(filePtr); !filePtr.isNull(); list.next(filePtr)) { + jam(); Uint32 objId = filePtr.p->key; SchemaFile::TableEntry * tableEntry = getTableEntry(xsf, objId); tableEntry->m_tableState = SchemaFile::DROP_TABLE_COMMITTED; @@ -16237,13 +16411,14 @@ Dbdict::drop_fg_commit_start(Signal* signal, SchemaOp* op) else if(op->m_obj_type == DictTabInfo::Tablespace) { FilegroupPtr lg_ptr; + jam(); ndbrequire(c_filegroup_hash. find(lg_ptr, fg_ptr.p->m_tablespace.m_default_logfile_group_id)); decrease_ref_count(lg_ptr.p->m_obj_ptr_i); } - + jam(); send_drop_fg(signal, op, DropFilegroupImplReq::Commit); } @@ -16252,16 +16427,17 @@ Dbdict::drop_fg_commit_complete(Signal* signal, SchemaOp* op) { FilegroupPtr fg_ptr; c_filegroup_pool.getPtr(fg_ptr, op->m_obj_ptr_i); - + + jam(); release_object(fg_ptr.p->m_obj_ptr_i); c_filegroup_hash.release(fg_ptr); - execute(signal, op->m_callback, 0); } void Dbdict::drop_fg_abort_start(Signal* signal, SchemaOp* op) { + jam(); send_drop_fg(signal, op, DropFilegroupImplReq::Abort); } diff --git a/storage/ndb/src/kernel/blocks/lgman.cpp b/storage/ndb/src/kernel/blocks/lgman.cpp index 82fed94f62e..4af27e25124 100644 --- a/storage/ndb/src/kernel/blocks/lgman.cpp +++ b/storage/ndb/src/kernel/blocks/lgman.cpp @@ -462,7 +462,8 @@ Lgman::drop_filegroup_drop_files(Signal* signal, } void -Lgman::execCREATE_FILE_REQ(Signal* signal){ +Lgman::execCREATE_FILE_REQ(Signal* signal) +{ jamEntry(); CreateFileImplReq* req= (CreateFileImplReq*)signal->getDataPtr(); @@ -491,6 +492,7 @@ Lgman::execCREATE_FILE_REQ(Signal* signal){ switch(requestInfo){ case CreateFileImplReq::Commit: { + jam(); ndbrequire(find_file_by_id(file_ptr, ptr.p->m_meta_files, req->file_id)); file_ptr.p->m_create.m_senderRef = req->senderRef; file_ptr.p->m_create.m_senderData = req->senderData; @@ -503,6 +505,7 @@ Lgman::execCREATE_FILE_REQ(Signal* signal){ Uint32 senderData = req->senderData; if (find_file_by_id(file_ptr, ptr.p->m_meta_files, req->file_id)) { + jam(); file_ptr.p->m_create.m_senderRef = senderRef; file_ptr.p->m_create.m_senderData = senderData; create_file_abort(signal, ptr, file_ptr); @@ -510,11 +513,11 @@ Lgman::execCREATE_FILE_REQ(Signal* signal){ else { CreateFileImplConf* conf= (CreateFileImplConf*)signal->getDataPtr(); + jam(); conf->senderData = senderData; conf->senderRef = reference(); sendSignal(senderRef, GSN_CREATE_FILE_CONF, signal, 
CreateFileImplConf::SignalLength, JBB); - return; } return; } From 23b1ce1e06a550ad42f29dcfd46ad77216562148 Mon Sep 17 00:00:00 2001 From: "acurtis/antony@xiphis.org/ltamd64.xiphis.org" <> Date: Tue, 8 May 2007 17:16:34 -0700 Subject: [PATCH 08/40] Bug#26241 "Blackhole tables don't honor table locks" Implement necessary shared lock structure for table locks. Imported test case created by Giuseppe Maxia --- storage/blackhole/ha_blackhole.cc | 124 +++++++++++++++++++++++++++--- storage/blackhole/ha_blackhole.h | 14 +++- 2 files changed, 126 insertions(+), 12 deletions(-) diff --git a/storage/blackhole/ha_blackhole.cc b/storage/blackhole/ha_blackhole.cc index 6f07c4183f1..23fdc014114 100644 --- a/storage/blackhole/ha_blackhole.cc +++ b/storage/blackhole/ha_blackhole.cc @@ -31,6 +31,14 @@ static handler *blackhole_create_handler(handlerton *hton, } + +/* Static declarations for shared structures */ + +static pthread_mutex_t blackhole_mutex; +static HASH blackhole_open_tables; + +static st_blackhole_share *get_share(const char *table_name); +static void free_share(st_blackhole_share *share); + /***************************************************************************** ** BLACKHOLE tables *****************************************************************************/ @@ -53,15 +61,18 @@ const char **ha_blackhole::bas_ext() const int ha_blackhole::open(const char *name, int mode, uint test_if_locked) { DBUG_ENTER("ha_blackhole::open"); - thr_lock_init(&thr_lock); - thr_lock_data_init(&thr_lock,&lock,NULL); + + if (!(share= get_share(name))) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + thr_lock_data_init(&share->lock, &lock, NULL); DBUG_RETURN(0); } int ha_blackhole::close(void) { DBUG_ENTER("ha_blackhole::close"); - thr_lock_delete(&thr_lock); + free_share(share); DBUG_RETURN(0); } @@ -136,17 +147,39 @@ int ha_blackhole::external_lock(THD *thd, int lock_type) } -uint ha_blackhole::lock_count(void) const -{ - DBUG_ENTER("ha_blackhole::lock_count"); - DBUG_RETURN(0); -} - THR_LOCK_DATA **ha_blackhole::store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type) { DBUG_ENTER("ha_blackhole::store_lock"); + if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) + { + /* + Here is where we get into the guts of a row level lock. + If TL_UNLOCK is set + If we are not doing a LOCK TABLE or DISCARD/IMPORT + TABLESPACE, then allow multiple writers + */ + + if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && + lock_type <= TL_WRITE) && !thd_in_lock_tables(thd) + && !thd_tablespace_op(thd)) + lock_type = TL_WRITE_ALLOW_WRITE; + + /* + In queries of type INSERT INTO t1 SELECT ... FROM t2 ... + MySQL would use the lock TL_READ_NO_INSERT on t2, and that + would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts + to t2. Convert the lock to a normal read lock to allow + concurrent inserts to t2.
+ */ + + if (lock_type == TL_READ_NO_INSERT && !thd_in_lock_tables(thd)) + lock_type = TL_READ; + + lock.type= lock_type; + } + *to++= &lock; DBUG_RETURN(to); } @@ -204,6 +237,63 @@ int ha_blackhole::index_last(byte * buf) DBUG_RETURN(HA_ERR_END_OF_FILE); } + +static st_blackhole_share *get_share(const char *table_name) +{ + st_blackhole_share *share; + uint length; + + length= (uint) strlen(table_name); + pthread_mutex_lock(&blackhole_mutex); + + if (!(share= (st_blackhole_share*) hash_search(&blackhole_open_tables, + (byte*) table_name, length))) + { + if (!(share= (st_blackhole_share*) my_malloc(sizeof(st_blackhole_share) + + length, + MYF(MY_WME | MY_ZEROFILL)))) + goto error; + + share->table_name_length= length; + strmov(share->table_name, table_name); + + if (my_hash_insert(&blackhole_open_tables, (byte*) share)) + { + my_free((gptr) share, MYF(0)); + share= NULL; + goto error; + } + + thr_lock_init(&share->lock); + } + share->use_count++; + +error: + pthread_mutex_unlock(&blackhole_mutex); + return share; +} + +static void free_share(st_blackhole_share *share) +{ + pthread_mutex_lock(&blackhole_mutex); + if (!--share->use_count) + hash_delete(&blackhole_open_tables, (byte*) share); + pthread_mutex_unlock(&blackhole_mutex); +} + +static void blackhole_free_key(st_blackhole_share *share) +{ + thr_lock_delete(&share->lock); + my_free((gptr) share, MYF(0)); +} + +static byte* blackhole_get_key(st_blackhole_share *share, uint *length, + my_bool not_used __attribute__((unused))) +{ + *length= share->table_name_length; + return (byte*) share->table_name; +} + static int blackhole_init(void *p) { handlerton *blackhole_hton; @@ -212,6 +302,20 @@ static int blackhole_init(void *p) blackhole_hton->db_type= DB_TYPE_BLACKHOLE_DB; blackhole_hton->create= blackhole_create_handler; blackhole_hton->flags= HTON_CAN_RECREATE; + + VOID(pthread_mutex_init(&blackhole_mutex, MY_MUTEX_INIT_FAST)); + (void) hash_init(&blackhole_open_tables, system_charset_info,32,0,0, + (hash_get_key) blackhole_get_key, + (hash_free_key) blackhole_free_key, 0); + + return 0; +} + +static int blackhole_fini(void *p) +{ + hash_free(&blackhole_open_tables); + pthread_mutex_destroy(&blackhole_mutex); + return 0; } @@ -227,7 +331,7 @@ mysql_declare_plugin(blackhole) "/dev/null storage engine (anything you write to it disappears)", PLUGIN_LICENSE_GPL, blackhole_init, /* Plugin Init */ - NULL, /* Plugin Deinit */ + blackhole_fini, /* Plugin Deinit */ 0x0100 /* 1.0 */, NULL, /* status variables */ NULL, /* system variables */ diff --git a/storage/blackhole/ha_blackhole.h b/storage/blackhole/ha_blackhole.h index 2af12b33077..1fd4df7ea78 100644 --- a/storage/blackhole/ha_blackhole.h +++ b/storage/blackhole/ha_blackhole.h @@ -17,6 +17,17 @@ #pragma interface /* gcc class implementation */ #endif +/* + Shared structure for correct LOCK operation +*/ +struct st_blackhole_share { + THR_LOCK lock; + uint use_count; + uint table_name_length; + char table_name[1]; +}; + + /* Class definition for the blackhole storage engine "Dumbest named feature ever" @@ -24,7 +35,7 @@ class ha_blackhole: public handler { THR_LOCK_DATA lock; /* MySQL lock */ - THR_LOCK thr_lock; + st_blackhole_share *share; public: ha_blackhole(handlerton *hton, TABLE_SHARE *table_arg); @@ -76,7 +87,6 @@ public: void position(const byte *record); int info(uint flag); int external_lock(THD *thd, int lock_type); - uint lock_count(void) const; int create(const char *name, TABLE *table_arg, HA_CREATE_INFO *create_info); THR_LOCK_DATA **store_lock(THD *thd, From 
10ecf5855bc5754a296796ab14bb20a80cc19067 Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/april.(none)" <> Date: Thu, 10 May 2007 20:30:49 +0500 Subject: [PATCH 09/40] After merge fix. --- mysql-test/r/binlog_row_blackhole.result | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mysql-test/r/binlog_row_blackhole.result b/mysql-test/r/binlog_row_blackhole.result index 8e90ac4f30b..e58f4648470 100644 --- a/mysql-test/r/binlog_row_blackhole.result +++ b/mysql-test/r/binlog_row_blackhole.result @@ -121,6 +121,9 @@ master-bin.000001 # Query # # use `test`; alter table t1 add b int master-bin.000001 # Query # # use `test`; alter table t1 drop b master-bin.000001 # Query # # use `test`; create table t3 like t1 drop table t1,t2,t3; +CREATE TABLE t1(a INT) ENGINE=BLACKHOLE; +INSERT DELAYED INTO t1 VALUES(1); +DROP TABLE t1; CREATE TABLE t1(a INT, b INT) ENGINE=BLACKHOLE; DELETE FROM t1 WHERE a=10; ALTER TABLE t1 ADD INDEX(a); From 7a02c71f77a9f4f56b53b8e9545328bbcbe9732a Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Fri, 11 May 2007 08:07:42 +0200 Subject: [PATCH 10/40] Bug#25818 No return of NDB share object in failures in open method - make sure resources are released properly on error --- sql/ha_ndbcluster.cc | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc index 9b48e0d4f38..0e9dfcef5fb 100644 --- a/sql/ha_ndbcluster.cc +++ b/sql/ha_ndbcluster.cc @@ -5011,27 +5011,36 @@ int ha_ndbcluster::open(const char *name, int mode, uint test_if_locked) set_dbname(name); set_tabname(name); - if (check_ndb_connection()) { - free_share(m_share); m_share= 0; - DBUG_RETURN(HA_ERR_NO_CONNECTION); + if ((res= check_ndb_connection()) || + (res= get_metadata(name))) + { + free_share(m_share); + m_share= 0; + DBUG_RETURN(res); } - - res= get_metadata(name); - if (!res) + while (1) { Ndb *ndb= get_ndb(); if (ndb->setDatabaseName(m_dbname)) { - ERR_RETURN(ndb->getNdbError()); + res= ndb_to_mysql_error(&ndb->getNdbError()); + break; } struct Ndb_statistics stat; res= ndb_get_table_statistics(NULL, false, ndb, m_tabname, &stat); records= stat.row_count; if(!res) res= info(HA_STATUS_CONST); + break; } - - DBUG_RETURN(res); + if (res) + { + free_share(m_share); + m_share= 0; + release_metadata(); + DBUG_RETURN(res); + } + DBUG_RETURN(0); } From 99923bfa764a95d49921d2e1800c35f897d85f39 Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Fri, 11 May 2007 09:19:03 +0200 Subject: [PATCH 11/40] Bug#25818 No return of NDB share object in failures in open method - correct manual/auto merge to 5.1 --- sql/ha_ndbcluster.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc index e9c634011f0..315b3c56eb3 100644 --- a/sql/ha_ndbcluster.cc +++ b/sql/ha_ndbcluster.cc @@ -6292,9 +6292,9 @@ int ha_ndbcluster::open(const char *name, int mode, uint test_if_locked) } if (res) { - free_share(m_share); + free_share(&m_share); m_share= 0; - release_metadata(); + release_metadata(current_thd, get_ndb()); DBUG_RETURN(res); } #ifdef HAVE_NDB_BINLOG From 1305016bdc54728ef81cd7f788391cfe9c1f42da Mon Sep 17 00:00:00 2001 From: "jonas@perch.ndb.mysql.com" <> Date: Mon, 14 May 2007 10:34:21 +0200 Subject: [PATCH 12/40] ndb - bug#28348 remove LCP files when dropping table --- .../include/kernel/signaldata/FsOpenReq.hpp | 2 + storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp | 11 ++- .../ndb/src/kernel/blocks/dbtup/DbtupGen.cpp | 4 +
.../ndb/src/kernel/blocks/dbtup/DbtupMeta.cpp | 87 +++++++++++++++++++ 4 files changed, 103 insertions(+), 1 deletion(-) diff --git a/storage/ndb/include/kernel/signaldata/FsOpenReq.hpp b/storage/ndb/include/kernel/signaldata/FsOpenReq.hpp index 8d438f79259..8126267f946 100644 --- a/storage/ndb/include/kernel/signaldata/FsOpenReq.hpp +++ b/storage/ndb/include/kernel/signaldata/FsOpenReq.hpp @@ -44,6 +44,8 @@ class FsOpenReq { friend class Restore; friend class Dblqh; + friend class Dbtup; + /** * For printing */ diff --git a/storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp b/storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp index 6d14b714be0..d59d5cd79f2 100644 --- a/storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp +++ b/storage/ndb/src/kernel/blocks/dbtup/Dbtup.hpp @@ -972,6 +972,8 @@ ArrayPool c_triggerPool; struct { Uint32 tabUserPtr; Uint32 tabUserRef; + Uint32 m_lcpno; + Uint32 m_fragPtrI; } m_dropTable; State tableStatus; }; @@ -1533,6 +1535,11 @@ private: void execACCKEYREF(Signal* signal); void execACC_ABORTCONF(Signal* signal); + + // Drop table + void execFSREMOVEREF(Signal*); + void execFSREMOVECONF(Signal*); + //------------------------------------------------------------------ //------------------------------------------------------------------ // Methods to handle execution of TUPKEYREQ + ATTRINFO. @@ -2423,7 +2430,9 @@ private: void drop_fragment_free_extent_log_buffer_callback(Signal*, Uint32, Uint32); void drop_fragment_unmap_pages(Signal*, TablerecPtr, FragrecordPtr, Uint32); void drop_fragment_unmap_page_callback(Signal* signal, Uint32, Uint32); - + void drop_fragment_fsremove(Signal*, TablerecPtr, FragrecordPtr); + void drop_fragment_fsremove_done(Signal*, TablerecPtr, FragrecordPtr); + // Initialisation void initData(); void initRecords(); diff --git a/storage/ndb/src/kernel/blocks/dbtup/DbtupGen.cpp b/storage/ndb/src/kernel/blocks/dbtup/DbtupGen.cpp index 7563712d481..f4fd80a482a 100644 --- a/storage/ndb/src/kernel/blocks/dbtup/DbtupGen.cpp +++ b/storage/ndb/src/kernel/blocks/dbtup/DbtupGen.cpp @@ -102,6 +102,10 @@ Dbtup::Dbtup(Block_context& ctx, Pgman* pgman) addRecSignal(GSN_ACCKEYREF, &Dbtup::execACCKEYREF); addRecSignal(GSN_ACC_ABORTCONF, &Dbtup::execACC_ABORTCONF); + // Drop table + addRecSignal(GSN_FSREMOVEREF, &Dbtup::execFSREMOVEREF, true); + addRecSignal(GSN_FSREMOVECONF, &Dbtup::execFSREMOVECONF, true); + attrbufrec = 0; fragoperrec = 0; fragrecord = 0; diff --git a/storage/ndb/src/kernel/blocks/dbtup/DbtupMeta.cpp b/storage/ndb/src/kernel/blocks/dbtup/DbtupMeta.cpp index 040a43d3dcd..3c2d521c1f9 100644 --- a/storage/ndb/src/kernel/blocks/dbtup/DbtupMeta.cpp +++ b/storage/ndb/src/kernel/blocks/dbtup/DbtupMeta.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1282,6 +1283,24 @@ Dbtup::drop_fragment_free_var_pages(Signal* signal) sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); return; } + + /** + * Remove LCP's for fragment + */ + tabPtr.p->m_dropTable.m_lcpno = 0; + tabPtr.p->m_dropTable.m_fragPtrI = fragPtr.i; + drop_fragment_fsremove(signal, tabPtr, fragPtr); +} + +void +Dbtup::drop_fragment_fsremove_done(Signal* signal, + TablerecPtr tabPtr, + FragrecordPtr fragPtr) +{ + /** + * LCP's removed... 
+ * now continue with "next" + */ Uint32 logfile_group_id = fragPtr.p->m_logfile_group_id ; releaseFragPages(fragPtr.p); Uint32 i; @@ -1301,6 +1320,74 @@ Dbtup::drop_fragment_free_var_pages(Signal* signal) return; } +// Remove LCP + +void +Dbtup::drop_fragment_fsremove(Signal* signal, + TablerecPtr tabPtr, + FragrecordPtr fragPtr) +{ + FsRemoveReq* req = (FsRemoveReq*)signal->getDataPtrSend(); + req->userReference = reference(); + req->userPointer = tabPtr.i; + req->directory = 0; + req->ownDirectory = 0; + + Uint32 lcpno = tabPtr.p->m_dropTable.m_lcpno; + Uint32 fragId = fragPtr.p->fragmentId; + Uint32 tableId = fragPtr.p->fragTableId; + + FsOpenReq::setVersion(req->fileNumber, 5); + FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA); + FsOpenReq::v5_setLcpNo(req->fileNumber, lcpno); + FsOpenReq::v5_setTableId(req->fileNumber, tableId); + FsOpenReq::v5_setFragmentId(req->fileNumber, fragId); + sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal, + FsRemoveReq::SignalLength, JBB); +} + +void +Dbtup::execFSREMOVEREF(Signal* signal) +{ + jamEntry(); + FsRef* ref = (FsRef*)signal->getDataPtr(); + Uint32 userPointer = ref->userPointer; + FsConf* conf = (FsConf*)signal->getDataPtrSend(); + conf->userPointer = userPointer; + execFSREMOVECONF(signal); +} + +void +Dbtup::execFSREMOVECONF(Signal* signal) +{ + jamEntry(); + FsConf* conf = (FsConf*)signal->getDataPtrSend(); + + TablerecPtr tabPtr; + FragrecordPtr fragPtr; + + tabPtr.i = conf->userPointer; + ptrCheckGuard(tabPtr, cnoOfTablerec, tablerec); + + ndbrequire(tabPtr.p->tableStatus == DROPPING); + + fragPtr.i = tabPtr.p->m_dropTable.m_fragPtrI; + ptrCheckGuard(fragPtr, cnoOfFragrec, fragrecord); + + tabPtr.p->m_dropTable.m_lcpno++; + if (tabPtr.p->m_dropTable.m_lcpno < 3) + { + jam(); + drop_fragment_fsremove(signal, tabPtr, fragPtr); + } + else + { + jam(); + drop_fragment_fsremove_done(signal, tabPtr, fragPtr); + } +} +// End remove LCP + void Dbtup::start_restore_lcp(Uint32 tableId, Uint32 fragId) { From 3ed42fbb2d4f768660b6b95076546eba8712f728 Mon Sep 17 00:00:00 2001 From: "istruewing@chilla.local" <> Date: Mon, 14 May 2007 11:33:47 +0200 Subject: [PATCH 13/40] Bug#17332 - changing key_buffer_size on a running server can crash under load Post-post-review fixes. Fixed a typo == -> = Optimized normal flush at end of statement (FLUSH_KEEP), but let other flush types be stringent. Added comments. Fixed debugging. --- mysys/mf_keycache.c | 96 +++++++++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 26 deletions(-) diff --git a/mysys/mf_keycache.c b/mysys/mf_keycache.c index 651a2b1070a..7ca07016823 100644 --- a/mysys/mf_keycache.c +++ b/mysys/mf_keycache.c @@ -608,6 +608,7 @@ int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size, keycache->can_be_used= 0; goto finish; } + DBUG_ASSERT(cache_empty(keycache)); /* End the flush phase. */ keycache->resize_in_flush= 0; @@ -3599,7 +3600,7 @@ static int flush_key_blocks_int(KEY_CACHE *keycache, So we should not let count become smaller than the fixed buffer. */ if (cache == cache_buff) - count == FLUSH_CACHE; + count= FLUSH_CACHE; } /* Retrieve the blocks and write them to a buffer to be flushed */ @@ -3718,8 +3719,16 @@ restart: link_changed(block, &first_in_switch); } } - else + else if (type != FLUSH_KEEP) { + /* + During the normal flush at end of statement (FLUSH_KEEP) we + do not need to ensure that blocks in flush or update by + other threads are flushed. They will be flushed by them + later. 
In all other cases we must assure that we do not have + any changed block of this file in the cache when this + function returns. + */ if (block->status & BLOCK_IN_FLUSH) { /* Remember the last block found to be in flush. */ @@ -3743,9 +3752,14 @@ restart: last_errno= error; } /* - Do not restart here. We have now flushed at least all blocks - that were changed when entering this function. + Do not restart here during the normal flush at end of statement + (FLUSH_KEEP). We have now flushed at least all blocks that were + changed when entering this function. In all other cases we must + assure that we do not have any changed block of this file in the + cache when this function returns. */ + if (type != FLUSH_KEEP) + goto restart; } if (last_in_flush) { @@ -3996,7 +4010,35 @@ int flush_key_blocks(KEY_CACHE *keycache, /* - Flush all blocks in the key cache to disk + Flush all blocks in the key cache to disk. + + SYNOPSIS + flush_all_key_blocks() + keycache pointer to key cache root structure + + DESCRIPTION + + Flushing of the whole key cache is done in two phases. + + 1. Flush all changed blocks, waiting for them if necessary. Loop + until there is no changed block left in the cache. + + 2. Free all clean blocks. Normally this means free all blocks. The + changed blocks were flushed in phase 1 and became clean. However we + may need to wait for blocks that are read by other threads. While we + wait, a clean block could become changed if that operation started + before the resize operation started. To be safe we must restart at + phase 1. + + When we can run through the changed_blocks and file_blocks hashes + without finding a block any more, then we are done. + + Note that we hold keycache->cache_lock all the time unless we need + to wait for something. + + RETURN + 0 OK + != 0 Error */ static int flush_all_key_blocks(KEY_CACHE *keycache) @@ -4007,13 +4049,15 @@ static int flush_all_key_blocks(KEY_CACHE *keycache) uint idx; DBUG_ENTER("flush_all_key_blocks"); - safe_mutex_assert_owner(&keycache->cache_lock); - do { + safe_mutex_assert_owner(&keycache->cache_lock); total_found= 0; - /* Flush all changed blocks first. */ + /* + Phase1: Flush all changed blocks, waiting for them if necessary. + Loop until there is no changed block left in the cache. + */ do { found= 0; @@ -4022,17 +4066,15 @@ static int flush_all_key_blocks(KEY_CACHE *keycache) { /* If an array element is non-empty, use the first block from its - chain to find a file for flush. All blocks for this file are - flushed. So the same block will not appear at this place again - with the next iteration. New writes for blocks are not accepted - during the flush. + chain to find a file for flush. All changed blocks for this + file are flushed. So the same block will not appear at this + place again with the next iteration. New writes for blocks are + not accepted during the flush. If multiple files share the + same hash bucket, one of them will be flushed per iteration + of the outer loop of phase 1. */ if ((block= keycache->changed_blocks[idx])) { - /* A block in the changed_blocks hash must have a hash_link. */ - DBUG_ASSERT(block->hash_link); - DBUG_ASSERT(block->hash_link->block == block); - found++; /* Flush dirty blocks but do not free them yet. They can be used @@ -4046,7 +4088,14 @@ static int flush_all_key_blocks(KEY_CACHE *keycache) } while (found); - /* Now flush (free) all clean blocks. */ + /* + Phase 2: Free all clean blocks. Normally this means free all + blocks. 
The changed blocks were flushed in phase 1 and became + clean. However we may need to wait for blocks that are read by + other threads. While we wait, a clean block could become changed + if that operation started before the resize operation started. To + be safe we must restart at phase 1. + */ do { found= 0; @@ -4057,17 +4106,12 @@ If an array element is non-empty, use the first block from its chain to find a file for flush. All blocks for this file are freed. So the same block will not appear at this place again - with the next iteration. Unless it has been read into the cache - anew. In this case readers and the flusher fight against each - other. But since the flusher does not need to do I/O for clean - blocks, and writes for blocks are not accepted during the flush, - it will win finally. + with the next iteration. If multiple files share the + same hash bucket, one of them will be flushed per iteration + of the outer loop of phase 2. */ if ((block= keycache->file_blocks[idx])) { - /* A block in the file_blocks hash must have a hash_link. */ - DBUG_ASSERT(block->hash_link); - total_found++; found++; if (flush_key_blocks_int(keycache, block->hash_link->file, @@ -4412,7 +4456,7 @@ static int cache_empty(KEY_CACHE *keycache) for (idx= 0; idx < keycache->hash_links; idx++) { HASH_LINK *hash_link= keycache->hash_link_root + idx; - if (hash_link->block || hash_link->file || hash_link->diskpos) + if (hash_link->requests || hash_link->block) { fprintf(stderr, "hash_link index: %u\n", idx); fail_hlink(hash_link); From 1e7974a0be3c0b679450997ee2fae266ca0d9a85 Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Mon, 14 May 2007 12:15:27 +0200 Subject: [PATCH 14/40] Bug #28410 ndb: no retry sleep when getting autoincrement - add retry sleep to allow temporary error to go away --- sql/ha_ndbcluster.cc | 57 +++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc index 0e9dfcef5fb..0f3a42bbce7 100644 --- a/sql/ha_ndbcluster.cc +++ b/sql/ha_ndbcluster.cc @@ -2309,16 +2309,24 @@ int ha_ndbcluster::write_row(byte *record) { // Table has hidden primary key Ndb *ndb= get_ndb(); - int ret; Uint64 auto_value; uint retries= NDB_AUTO_INCREMENT_RETRIES; - do { - ret= ndb->getAutoIncrementValue((const NDBTAB *) m_table, auto_value, 1); - } while (ret == -1 && - --retries && - ndb->getNdbError().status == NdbError::TemporaryError); - if (ret == -1) - ERR_RETURN(ndb->getNdbError()); + int retry_sleep= 30; /* 30 milliseconds, transaction */ + for (;;) + { + if (ndb->getAutoIncrementValue((const NDBTAB *) m_table, + auto_value, 1) == -1) + { + if (--retries && + ndb->getNdbError().status == NdbError::TemporaryError) + { + my_sleep(retry_sleep); + continue; + } + ERR_RETURN(ndb->getNdbError()); + } + break; + } if (set_hidden_key(op, table->s->fields, (const byte*)&auto_value)) ERR_RETURN(op->getNdbError()); } @@ -4855,22 +4863,27 @@ ulonglong ha_ndbcluster::get_auto_increment() m_rows_to_insert - m_rows_inserted : ((m_rows_to_insert > m_autoincrement_prefetch) ? m_rows_to_insert : m_autoincrement_prefetch)); - int ret; uint retries= NDB_AUTO_INCREMENT_RETRIES; - do { - ret= - m_skip_auto_increment ?
- ndb->readAutoIncrementValue((const NDBTAB *) m_table, auto_value) : - ndb->getAutoIncrementValue((const NDBTAB *) m_table, auto_value, cache_size); - } while (ret == -1 && - --retries && - ndb->getNdbError().status == NdbError::TemporaryError); - if (ret == -1) + int retry_sleep= 30; /* 30 milliseconds, transaction */ + for (;;) { - const NdbError err= ndb->getNdbError(); - sql_print_error("Error %lu in ::get_auto_increment(): %s", - (ulong) err.code, err.message); - DBUG_RETURN(~(ulonglong) 0); + if ((m_skip_auto_increment ? + ndb->readAutoIncrementValue((const NDBTAB *) m_table, auto_value) : + ndb->getAutoIncrementValue((const NDBTAB *) m_table, + auto_value, cache_size)) == -1) + { + if (--retries && + ndb->getNdbError().status == NdbError::TemporaryError) + { + my_sleep(retry_sleep); + continue; + } + const NdbError err= ndb->getNdbError(); + sql_print_error("Error %lu in ::get_auto_increment(): %s", + (ulong) err.code, err.message); + DBUG_RETURN(~(ulonglong) 0); + } + break; } DBUG_RETURN((longlong)auto_value); } From e8159fd003cc1c97da8fafbc4534ae56bee56478 Mon Sep 17 00:00:00 2001 From: "df@pippilotta.erinye.com" <> Date: Mon, 14 May 2007 13:20:18 +0200 Subject: [PATCH 15/40] bug#28358 libmysql.dll cannot be dynamically loaded on Windows --- libmysql/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libmysql/CMakeLists.txt b/libmysql/CMakeLists.txt index 647f6bd5e33..3b18531f6c0 100755 --- a/libmysql/CMakeLists.txt +++ b/libmysql/CMakeLists.txt @@ -18,6 +18,10 @@ INCLUDE("${PROJECT_SOURCE_DIR}/win/mysql_manifest.cmake") # storage does not work properly in DLLs. SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX -DUSE_TLS") SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX -DUSE_TLS") +SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DSAFEMALLOC -DSAFE_MUTEX -DUSE_TLS") +SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DSAFEMALLOC -DSAFE_MUTEX -DUSE_TLS") +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DSAFEMALLOC -DSAFE_MUTEX -DUSE_TLS") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DSAFEMALLOC -DSAFE_MUTEX -DUSE_TLS") INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/zlib From 391265636dd021db9b5b65d23c4527fe2dad36de Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Mon, 14 May 2007 14:38:50 +0200 Subject: [PATCH 16/40] files to check if log bin is turned on --- mysql-test/include/have_log_bin.inc | 4 ++++ mysql-test/r/have_log_bin.require | 2 ++ 2 files changed, 6 insertions(+) create mode 100644 mysql-test/include/have_log_bin.inc create mode 100644 mysql-test/r/have_log_bin.require diff --git a/mysql-test/include/have_log_bin.inc b/mysql-test/include/have_log_bin.inc new file mode 100644 index 00000000000..11530dc953e --- /dev/null +++ b/mysql-test/include/have_log_bin.inc @@ -0,0 +1,4 @@ +-- require r/have_log_bin.require +disable_query_log; +show variables like "log_bin"; +enable_query_log; diff --git a/mysql-test/r/have_log_bin.require b/mysql-test/r/have_log_bin.require new file mode 100644 index 00000000000..cacdf8df0ce --- /dev/null +++ b/mysql-test/r/have_log_bin.require @@ -0,0 +1,2 @@ +Variable_name Value +have_log_bin ON From 7d20986660326f03b82de1e75e6f16e3451a0fdd Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Mon, 14 May 2007 14:39:37 +0200 Subject: [PATCH 17/40] disable test if log bin is not turned on --- mysql-test/t/ndb_binlog_basic2.test | 1 + 1 file changed, 1
insertion(+) diff --git a/mysql-test/t/ndb_binlog_basic2.test b/mysql-test/t/ndb_binlog_basic2.test index 9fa9f2f965a..bcc6b503320 100644 --- a/mysql-test/t/ndb_binlog_basic2.test +++ b/mysql-test/t/ndb_binlog_basic2.test @@ -1,4 +1,5 @@ -- source include/have_ndb.inc +-- source include/have_log_bin.inc --error ER_NDB_CANT_SWITCH_BINLOG_FORMAT set session binlog_format=row; From 6dceef95f74794d6fee4506db23a880b97baaf06 Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Mon, 14 May 2007 14:43:07 +0200 Subject: [PATCH 18/40] corrected manual merge --- sql/ha_ndbcluster.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc index 7ab7de6ab81..304b8fc510d 100644 --- a/sql/ha_ndbcluster.cc +++ b/sql/ha_ndbcluster.cc @@ -6089,7 +6089,8 @@ void ha_ndbcluster::get_auto_increment(ulonglong offset, ulonglong increment, const NdbError err= ndb->getNdbError(); sql_print_error("Error %lu in ::get_auto_increment(): %s", (ulong) err.code, err.message); - DBUG_RETURN(~(ulonglong) 0); + *first_value= ~(ulonglong) 0; + DBUG_VOID_RETURN; } break; } From fd8f890ad7daaec6dbc3c71a3069ad38c130efd9 Mon Sep 17 00:00:00 2001 From: "df@pippilotta.erinye.com" <> Date: Mon, 14 May 2007 15:11:29 +0200 Subject: [PATCH 19/40] import of fix for bug#28240 --- configure.in | 18 +++++++++++++----- include/my_global.h | 11 +++++------ sql/item_func.cc | 4 ++-- strings/strtod.c | 2 +- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/configure.in b/configure.in index 1026c855bf2..e788419eb4f 100644 --- a/configure.in +++ b/configure.in @@ -2006,12 +2006,20 @@ case "$target" in ;; esac -# isinf() could be a function or a macro (HPUX) -AC_MSG_CHECKING(for isinf with <math.h>) +# Check that isinf() is available in math.h and can be used in both C and C++ +# code +AC_MSG_CHECKING(for isinf in <math.h>) AC_TRY_LINK([#include <math.h>], [float f = 0.0; int r = isinf(f); return r], - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_ISINF, [1], [isinf() macro or function]), - AC_MSG_RESULT(no)) + AC_MSG_RESULT(yes) + AC_MSG_CHECKING(whether isinf() can be used in C++ code) + AC_LANG_SAVE + AC_LANG_CPLUSPLUS + AC_TRY_LINK([#include <math.h>], [float f = 0.0; int r = isinf(f); return r], + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ISINF, [1], [isinf() macro or function]), + AC_MSG_RESULT(no)) + AC_LANG_RESTORE, + AC_MSG_RESULT(no)) CFLAGS="$ORG_CFLAGS" diff --git a/include/my_global.h b/include/my_global.h index e9b371d8d30..f32a987ffb1 100644 --- a/include/my_global.h +++ b/include/my_global.h @@ -792,12 +792,11 @@ typedef SOCKET_SIZE_TYPE size_socket; #define isnan(x) ((x) != (x)) #endif -#if !defined(HAVE_ISINF) -/* The configure check for "isinf with math.h" has failed */ -#ifdef isinf -#undef isinf -#endif -#define isinf(X) (!finite(X) && !isnan(X)) +#ifdef HAVE_ISINF +/* isinf() can be used in both C and C++ code */ +#define my_isinf(X) isinf(X) +#else +#define my_isinf(X) (!finite(X) && !isnan(X)) #endif /* Define missing math constants. */ diff --git a/sql/item_func.cc b/sql/item_func.cc index c0a9647e382..cdd5bcb1677 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -2025,9 +2025,9 @@ double my_double_round(double value, longlong dec, bool dec_unsigned, tmp=(abs_dec < array_elements(log_10) ?
log_10[abs_dec] : pow(10.0,(double) abs_dec)); - if (dec_negative && isinf(tmp)) + if (dec_negative && my_isinf(tmp)) tmp2= 0; - else if (!dec_negative && isinf(value * tmp)) + else if (!dec_negative && my_isinf(value * tmp)) tmp2= value; else if (truncate) { diff --git a/strings/strtod.c b/strings/strtod.c index 15707a9b944..7196cafb2c9 100644 --- a/strings/strtod.c +++ b/strings/strtod.c @@ -194,7 +194,7 @@ double my_strtod(const char *str, char **end_ptr, int *error) done: *end_ptr= (char*) str; /* end of number */ - if (overflow || isinf(result)) + if (overflow || my_isinf(result)) { result= DBL_MAX; *error= EOVERFLOW; From e53efba0e5ce2e256457fa1a99deec377e1ac19b Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Tue, 15 May 2007 08:34:39 +0200 Subject: [PATCH 20/40] #26906 No message slogan found - added errormessage and code for "declaring node dead" --- ndb/include/mgmapi/ndbd_exit_codes.h | 1 + ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 2 +- ndb/src/kernel/error/ndbd_exit_codes.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ndb/include/mgmapi/ndbd_exit_codes.h b/ndb/include/mgmapi/ndbd_exit_codes.h index 874bf0aa253..b8a65b54672 100644 --- a/ndb/include/mgmapi/ndbd_exit_codes.h +++ b/ndb/include/mgmapi/ndbd_exit_codes.h @@ -79,6 +79,7 @@ typedef ndbd_exit_classification_enum ndbd_exit_classification; #define NDBD_EXIT_NO_MORE_UNDOLOG 2312 #define NDBD_EXIT_SR_UNDOLOG 2313 #define NDBD_EXIT_SINGLE_USER_MODE 2314 +#define NDBD_EXIT_NODE_DECLARED_DEAD 2315 #define NDBD_EXIT_MEMALLOC 2327 #define NDBD_EXIT_BLOCK_JBUFCONGESTION 2334 #define NDBD_EXIT_TIME_QUEUE_SHORT 2335 diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index a0a19620a05..a76838f7007 100644 --- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -2816,7 +2816,7 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, if (failedNodePtr.i == getOwnNodeId()) { jam(); - Uint32 code = 0; + Uint32 code = NDBD_EXIT_NODE_DECLARED_DEAD; const char * msg = 0; char extra[100]; switch(aFailCause){ diff --git a/ndb/src/kernel/error/ndbd_exit_codes.c b/ndb/src/kernel/error/ndbd_exit_codes.c index 37a54e33350..68d8f22f158 100644 --- a/ndb/src/kernel/error/ndbd_exit_codes.c +++ b/ndb/src/kernel/error/ndbd_exit_codes.c @@ -57,6 +57,8 @@ static const ErrStruct errArray[] = "error(s) on other node(s)"}, {NDBD_EXIT_PARTITIONED_SHUTDOWN, XAE, "Partitioned cluster detected. " "Please check if cluster is already running"}, + {NDBD_EXIT_NODE_DECLARED_DEAD, XAE, + "Node declared dead. 
See error log for details"}, {NDBD_EXIT_POINTER_NOTINRANGE, XIE, "Pointer too large"}, {NDBD_EXIT_SR_OTHERNODEFAILED, XRE, "Another node failed during system " "restart, please investigate error(s) on other node(s)"}, From b027b84afada9d6339171fb278312bf431f36411 Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Tue, 15 May 2007 09:03:00 +0200 Subject: [PATCH 21/40] Bug #26386 ndbd wont start after changing schema backported error code from 5.1 split error check when reading schema file --- ndb/include/mgmapi/ndbd_exit_codes.h | 1 + ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 26 +++++++++++++++++-------- ndb/src/kernel/error/ndbd_exit_codes.c | 1 + 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ndb/include/mgmapi/ndbd_exit_codes.h b/ndb/include/mgmapi/ndbd_exit_codes.h index b8a65b54672..1051fd9e394 100644 --- a/ndb/include/mgmapi/ndbd_exit_codes.h +++ b/ndb/include/mgmapi/ndbd_exit_codes.h @@ -80,6 +80,7 @@ typedef ndbd_exit_classification_enum ndbd_exit_classification; #define NDBD_EXIT_SR_UNDOLOG 2313 #define NDBD_EXIT_SINGLE_USER_MODE 2314 #define NDBD_EXIT_NODE_DECLARED_DEAD 2315 +#define NDBD_EXIT_SR_SCHEMAFILE 2316 #define NDBD_EXIT_MEMALLOC 2327 #define NDBD_EXIT_BLOCK_JBUFCONGESTION 2334 #define NDBD_EXIT_TIME_QUEUE_SHORT 2335 diff --git a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index a039c1bdbe7..fd7aabc8b67 100644 --- a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -1069,14 +1069,24 @@ void Dbdict::readSchemaConf(Signal* signal, for (Uint32 n = 0; n < xsf->noOfPages; n++) { SchemaFile * sf = &xsf->schemaPage[n]; - bool ok = - memcmp(sf->Magic, NDB_SF_MAGIC, sizeof(sf->Magic)) == 0 && - sf->FileSize != 0 && - sf->FileSize % NDB_SF_PAGE_SIZE == 0 && - sf->FileSize == sf0->FileSize && - sf->PageNumber == n && - computeChecksum((Uint32*)sf, NDB_SF_PAGE_SIZE_IN_WORDS) == 0; - ndbrequire(ok || !crashInd); + bool ok = false; + if (memcmp(sf->Magic, NDB_SF_MAGIC, sizeof(sf->Magic)) != 0) + { jam(); } + else if (sf->FileSize == 0) + { jam(); } + else if (sf->FileSize % NDB_SF_PAGE_SIZE != 0) + { jam(); } + else if (sf->FileSize != sf0->FileSize) + { jam(); } + else if (sf->PageNumber != n) + { jam(); } + else if (computeChecksum((Uint32*)sf, NDB_SF_PAGE_SIZE_IN_WORDS) != 0) + { jam(); } + else if (crashInd) + { jam(); } + else + ok = true; + ndbrequireErr(ok, NDBD_EXIT_SR_SCHEMAFILE); if (! ok) { jam(); ndbrequire(fsPtr.p->fsState == FsConnectRecord::READ_SCHEMA1); diff --git a/ndb/src/kernel/error/ndbd_exit_codes.c b/ndb/src/kernel/error/ndbd_exit_codes.c index 68d8f22f158..92bee522d24 100644 --- a/ndb/src/kernel/error/ndbd_exit_codes.c +++ b/ndb/src/kernel/error/ndbd_exit_codes.c @@ -65,6 +65,7 @@ static const ErrStruct errArray[] = {NDBD_EXIT_NODE_NOT_DEAD, XRE, "Internal node state conflict, " "most probably resolved by restarting node again"}, {NDBD_EXIT_SR_REDOLOG, XFI, "Error while reading the REDO log"}, + {NDBD_EXIT_SR_SCHEMAFILE, XFI, "Error while reading the schema file"}, /* Currently unused? 
*/ {2311, XIE, "Conflict when selecting restart type"}, {NDBD_EXIT_NO_MORE_UNDOLOG, XCR, From 7e8cc53abec2b7ad381b8dd61914a0d410bc9675 Mon Sep 17 00:00:00 2001 From: "jonas@perch.ndb.mysql.com" <> Date: Tue, 15 May 2007 09:08:16 +0200 Subject: [PATCH 22/40] ndb - bug#24631 add Dbdict::restartDropObj* --- .../ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 189 +++++++++++++++++- .../ndb/src/kernel/blocks/dbdict/Dbdict.hpp | 9 +- storage/ndb/src/kernel/blocks/lgman.cpp | 13 +- storage/ndb/src/kernel/blocks/tsman.cpp | 6 + storage/ndb/test/ndbapi/testDict.cpp | 157 +++++++++++++++ .../ndb/test/run-test/daily-basic-tests.txt | 4 + 6 files changed, 372 insertions(+), 6 deletions(-) diff --git a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index de365e886a0..e5ed9e49642 100644 --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -188,7 +188,7 @@ struct { 0, 0, 0, 0, &Dbdict::drop_undofile_prepare_start, 0, 0, - 0, 0, + 0, &Dbdict::drop_undofile_commit_complete, 0, 0, 0 } }; @@ -3209,9 +3209,7 @@ Dbdict::restartDropTab(Signal* signal, Uint32 tableId, case DictTabInfo::LogfileGroup: case DictTabInfo::Datafile: case DictTabInfo::Undofile: - warningEvent("Dont drop object: %d", tableId); - c_restartRecord.activeTable++; - checkSchemaStatus(signal); + restartDropObj(signal, tableId, old_entry); return; } @@ -3254,6 +3252,9 @@ Dbdict::restartDropTab_complete(Signal* signal, checkSchemaStatus(signal); } +/** + * Create Obj during NR/SR + */ void Dbdict::restartCreateObj(Signal* signal, Uint32 tableId, @@ -3482,6 +3483,170 @@ Dbdict::restartCreateObj_commit_complete_done(Signal* signal, checkSchemaStatus(signal); } +/** + * Drop object during NR/SR + */ +void +Dbdict::restartDropObj(Signal* signal, + Uint32 tableId, + const SchemaFile::TableEntry * entry) +{ + jam(); + + DropObjRecordPtr dropObjPtr; + ndbrequire(c_opDropObj.seize(dropObjPtr)); + + const Uint32 key = ++c_opRecordSequence; + dropObjPtr.p->key = key; + c_opDropObj.add(dropObjPtr); + dropObjPtr.p->m_errorCode = 0; + dropObjPtr.p->m_senderRef = reference(); + dropObjPtr.p->m_senderData = tableId; + dropObjPtr.p->m_clientRef = reference(); + dropObjPtr.p->m_clientData = tableId; + + dropObjPtr.p->m_obj_id = tableId; + dropObjPtr.p->m_obj_type = entry->m_tableType; + dropObjPtr.p->m_obj_version = entry->m_tableVersion; + + dropObjPtr.p->m_callback.m_callbackData = key; + dropObjPtr.p->m_callback.m_callbackFunction= + safe_cast(&Dbdict::restartDropObj_prepare_start_done); + + ndbout_c("Dropping %d %d", tableId, entry->m_tableType); + switch(entry->m_tableType){ + case DictTabInfo::Tablespace: + case DictTabInfo::LogfileGroup:{ + jam(); + Ptr fg_ptr; + ndbrequire(c_filegroup_hash.find(fg_ptr, tableId)); + dropObjPtr.p->m_obj_ptr_i = fg_ptr.i; + dropObjPtr.p->m_vt_index = 3; + break; + } + case DictTabInfo::Datafile:{ + jam(); + Ptr file_ptr; + dropObjPtr.p->m_vt_index = 2; + ndbrequire(c_file_hash.find(file_ptr, tableId)); + dropObjPtr.p->m_obj_ptr_i = file_ptr.i; + break; + } + case DictTabInfo::Undofile:{ + jam(); + Ptr file_ptr; + dropObjPtr.p->m_vt_index = 4; + ndbrequire(c_file_hash.find(file_ptr, tableId)); + dropObjPtr.p->m_obj_ptr_i = file_ptr.i; + + /** + * Undofiles are only removed from logfile groups file list + * as drop undofile is currently not supported... 
+ * file will be dropped by lgman when dropping filegroup + */ + dropObjPtr.p->m_callback.m_callbackFunction= + safe_cast(&Dbdict::restartDropObj_commit_complete_done); + + if (f_dict_op[dropObjPtr.p->m_vt_index].m_commit_complete) + (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_commit_complete) + (signal, dropObjPtr.p); + else + execute(signal, dropObjPtr.p->m_callback, 0); + return; + } + default: + jamLine(entry->m_tableType); + ndbrequire(false); + } + + if (f_dict_op[dropObjPtr.p->m_vt_index].m_prepare_start) + (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_prepare_start) + (signal, dropObjPtr.p); + else + execute(signal, dropObjPtr.p->m_callback, 0); +} + +void +Dbdict::restartDropObj_prepare_start_done(Signal* signal, + Uint32 callbackData, + Uint32 returnCode) +{ + jam(); + ndbrequire(returnCode == 0); + DropObjRecordPtr dropObjPtr; + ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); + ndbrequire(dropObjPtr.p->m_errorCode == 0); + + dropObjPtr.p->m_callback.m_callbackFunction = + safe_cast(&Dbdict::restartDropObj_prepare_complete_done); + + if (f_dict_op[dropObjPtr.p->m_vt_index].m_prepare_complete) + (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_prepare_complete) + (signal, dropObjPtr.p); + else + execute(signal, dropObjPtr.p->m_callback, 0); +} + +void +Dbdict::restartDropObj_prepare_complete_done(Signal* signal, + Uint32 callbackData, + Uint32 returnCode) +{ + jam(); + ndbrequire(returnCode == 0); + DropObjRecordPtr dropObjPtr; + ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); + ndbrequire(dropObjPtr.p->m_errorCode == 0); + + dropObjPtr.p->m_callback.m_callbackFunction = + safe_cast(&Dbdict::restartDropObj_commit_start_done); + + if (f_dict_op[dropObjPtr.p->m_vt_index].m_commit_start) + (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_commit_start) + (signal, dropObjPtr.p); + else + execute(signal, dropObjPtr.p->m_callback, 0); +} + +void +Dbdict::restartDropObj_commit_start_done(Signal* signal, + Uint32 callbackData, + Uint32 returnCode) +{ + jam(); + ndbrequire(returnCode == 0); + DropObjRecordPtr dropObjPtr; + ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); + ndbrequire(dropObjPtr.p->m_errorCode == 0); + + dropObjPtr.p->m_callback.m_callbackFunction = + safe_cast(&Dbdict::restartDropObj_commit_complete_done); + + if (f_dict_op[dropObjPtr.p->m_vt_index].m_commit_complete) + (this->*f_dict_op[dropObjPtr.p->m_vt_index].m_commit_complete) + (signal, dropObjPtr.p); + else + execute(signal, dropObjPtr.p->m_callback, 0); +} + + +void +Dbdict::restartDropObj_commit_complete_done(Signal* signal, + Uint32 callbackData, + Uint32 returnCode) +{ + jam(); + ndbrequire(returnCode == 0); + DropObjRecordPtr dropObjPtr; + ndbrequire(c_opDropObj.find(dropObjPtr, callbackData)); + ndbrequire(dropObjPtr.p->m_errorCode == 0); + + c_opDropObj.release(dropObjPtr); + + c_restartRecord.activeTable++; + checkSchemaStatus(signal); +} + /* **************************************************************** */ /* ---------------------------------------------------------------- */ /* MODULE: NODE FAILURE HANDLING ------------------------- */ @@ -16239,6 +16404,22 @@ Dbdict::drop_file_commit_complete(Signal* signal, SchemaOp* op) execute(signal, op->m_callback, 0); } +void +Dbdict::drop_undofile_commit_complete(Signal* signal, SchemaOp* op) +{ + FilePtr f_ptr; + FilegroupPtr fg_ptr; + + jam(); + c_file_pool.getPtr(f_ptr, op->m_obj_ptr_i); + ndbrequire(c_filegroup_hash.find(fg_ptr, f_ptr.p->m_filegroup_id)); + Local_file_list list(c_file_pool, fg_ptr.p->m_logfilegroup.m_files); + 
list.remove(f_ptr); + release_object(f_ptr.p->m_obj_ptr_i); + c_file_hash.release(f_ptr); + execute(signal, op->m_callback, 0); +} + void Dbdict::drop_file_abort_start(Signal* signal, SchemaOp* op) { diff --git a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp index e5b918ca270..3fff330d699 100644 --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp @@ -2565,6 +2565,12 @@ private: const SchemaFile::TableEntry *, const SchemaFile::TableEntry *); void restartDropTab_complete(Signal*, Uint32 callback, Uint32); + + void restartDropObj(Signal*, Uint32, const SchemaFile::TableEntry *); + void restartDropObj_prepare_start_done(Signal*, Uint32, Uint32); + void restartDropObj_prepare_complete_done(Signal*, Uint32, Uint32); + void restartDropObj_commit_start_done(Signal*, Uint32, Uint32); + void restartDropObj_commit_complete_done(Signal*, Uint32, Uint32); void restart_checkSchemaStatusComplete(Signal*, Uint32 callback, Uint32); void restart_writeSchemaConf(Signal*, Uint32 callbackData, Uint32); @@ -2657,7 +2663,8 @@ public: void send_drop_fg(Signal*, SchemaOp*, DropFilegroupImplReq::RequestInfo); void drop_undofile_prepare_start(Signal* signal, SchemaOp*); - + void drop_undofile_commit_complete(Signal* signal, SchemaOp*); + int checkSingleUserMode(Uint32 senderRef); }; diff --git a/storage/ndb/src/kernel/blocks/lgman.cpp b/storage/ndb/src/kernel/blocks/lgman.cpp index 4af27e25124..25cdac89737 100644 --- a/storage/ndb/src/kernel/blocks/lgman.cpp +++ b/storage/ndb/src/kernel/blocks/lgman.cpp @@ -436,7 +436,6 @@ Lgman::drop_filegroup_drop_files(Signal* signal, { jam(); ndbrequire(! (ptr.p->m_state & Logfile_group::LG_THREAD_MASK)); - ndbrequire(ptr.p->m_meta_files.isEmpty()); ndbrequire(ptr.p->m_outstanding_fs == 0); Local_undofile_list list(m_file_pool, ptr.p->m_files); @@ -452,6 +451,18 @@ Lgman::drop_filegroup_drop_files(Signal* signal, return; } + Local_undofile_list metalist(m_file_pool, ptr.p->m_meta_files); + if (metalist.first(file_ptr)) + { + jam(); + metalist.remove(file_ptr); + list.add(file_ptr); + file_ptr.p->m_create.m_senderRef = ref; + file_ptr.p->m_create.m_senderData = data; + create_file_abort(signal, ptr, file_ptr); + return; + } + free_logbuffer_memory(ptr); m_logfile_group_hash.release(ptr); DropFilegroupImplConf *conf = (DropFilegroupImplConf*)signal->getDataPtr(); diff --git a/storage/ndb/src/kernel/blocks/tsman.cpp b/storage/ndb/src/kernel/blocks/tsman.cpp index 62aa80a67fe..8f61ec0cf7b 100644 --- a/storage/ndb/src/kernel/blocks/tsman.cpp +++ b/storage/ndb/src/kernel/blocks/tsman.cpp @@ -1309,6 +1309,12 @@ Tsman::execDROP_FILE_REQ(Signal* signal) Local_datafile_list free(m_file_pool, fg_ptr.p->m_free_files); free.remove(file_ptr); } + else if(find_file_by_id(file_ptr, fg_ptr.p->m_meta_files, req.file_id)) + { + jam(); + Local_datafile_list meta(m_file_pool, fg_ptr.p->m_meta_files); + meta.remove(file_ptr); + } else { errorCode = DropFileImplRef::NoSuchFile; diff --git a/storage/ndb/test/ndbapi/testDict.cpp b/storage/ndb/test/ndbapi/testDict.cpp index 9828cb768df..13c071f968e 100644 --- a/storage/ndb/test/ndbapi/testDict.cpp +++ b/storage/ndb/test/ndbapi/testDict.cpp @@ -2204,6 +2204,159 @@ runBug21755(NDBT_Context* ctx, NDBT_Step* step) return NDBT_OK; } +static +int +create_tablespace(NdbDictionary::Dictionary* pDict, + const char * lgname, + const char * tsname, + const char * dfname) +{ + NdbDictionary::Tablespace ts; + ts.setName(tsname); + ts.setExtentSize(1024*1024); 
+ ts.setDefaultLogfileGroup(lgname); + + if(pDict->createTablespace(ts) != 0) + { + g_err << "Failed to create tablespace:" + << endl << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + + NdbDictionary::Datafile df; + df.setPath(dfname); + df.setSize(1*1024*1024); + df.setTablespace(tsname); + + if(pDict->createDatafile(df) != 0) + { + g_err << "Failed to create datafile:" + << endl << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + return 0; +} + +int +runBug24631(NDBT_Context* ctx, NDBT_Step* step) +{ + char tsname[256]; + char dfname[256]; + char lgname[256]; + char ufname[256]; + NdbRestarter res; + + if (res.getNumDbNodes() < 2) + return NDBT_OK; + + Ndb* pNdb = GETNDB(step); + NdbDictionary::Dictionary* pDict = pNdb->getDictionary(); + + NdbDictionary::Dictionary::List list; + if (pDict->listObjects(list) == -1) + return NDBT_FAILED; + + const char * lgfound = 0; + + for (Uint32 i = 0; icreateLogfileGroup(lg) != 0) + { + g_err << "Failed to create logfilegroup:" + << endl << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + + NdbDictionary::Undofile uf; + BaseString::snprintf(ufname, sizeof(ufname), "%s-%u", lgname, rand()); + uf.setPath(ufname); + uf.setSize(2*1024*1024); + uf.setLogfileGroup(lgname); + + if(pDict->createUndofile(uf) != 0) + { + g_err << "Failed to create undofile:" + << endl << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + } + else + { + BaseString::snprintf(lgname, sizeof(lgname), "%s", lgfound); + } + + BaseString::snprintf(tsname, sizeof(tsname), "TS-%u", rand()); + BaseString::snprintf(dfname, sizeof(dfname), "%s-%u.dat", tsname, rand()); + + if (create_tablespace(pDict, lgname, tsname, dfname)) + return NDBT_FAILED; + + + int node = res.getRandomNotMasterNodeId(rand()); + res.restartOneDbNode(node, false, true, true); + NdbSleep_SecSleep(3); + + if (pDict->dropDatafile(pDict->getDatafile(0, dfname)) != 0) + { + g_err << "Failed to drop datafile: " << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + + if (pDict->dropTablespace(pDict->getTablespace(tsname)) != 0) + { + g_err << "Failed to drop tablespace: " << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + + if (res.waitNodesNoStart(&node, 1)) + return NDBT_FAILED; + + res.startNodes(&node, 1); + if (res.waitClusterStarted()) + return NDBT_FAILED; + + if (create_tablespace(pDict, lgname, tsname, dfname)) + return NDBT_FAILED; + + if (pDict->dropDatafile(pDict->getDatafile(0, dfname)) != 0) + { + g_err << "Failed to drop datafile: " << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + + if (pDict->dropTablespace(pDict->getTablespace(tsname)) != 0) + { + g_err << "Failed to drop tablespace: " << pDict->getNdbError() << endl; + return NDBT_FAILED; + } + + if (lgfound == 0) + { + if (pDict->dropLogfileGroup(pDict->getLogfileGroup(lgname)) != 0) + return NDBT_FAILED; + } + + return NDBT_OK; +} + struct RandSchemaOp { struct Obj @@ -2707,6 +2860,10 @@ TESTCASE("DictRestart", ""){ INITIALIZER(runDictRestart); } +TESTCASE("Bug24631", + ""){ + INITIALIZER(runBug24631); +} NDBT_TESTSUITE_END(testDict); int main(int argc, const char** argv){ diff --git a/storage/ndb/test/run-test/daily-basic-tests.txt b/storage/ndb/test/run-test/daily-basic-tests.txt index e080536dad9..8f24e8826f9 100644 --- a/storage/ndb/test/run-test/daily-basic-tests.txt +++ b/storage/ndb/test/run-test/daily-basic-tests.txt @@ -619,6 +619,10 @@ max-time: 1500 cmd: testDict args: -l 25 -n DictRestart T1 +max-time: 500 +cmd: testDict +args: -n Bug24631 T1 + # # TEST NDBAPI # From 
3083443262383a9111795fe9de8d0f34dd4ff7d5 Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Tue, 15 May 2007 12:02:58 +0200 Subject: [PATCH 23/40] improve error message on corrupt schema file --- ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 31 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index fd7aabc8b67..a8db352e705 100644 --- a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -79,6 +79,9 @@ #include #include +#include +extern EventLogger g_eventLogger; + #define ZNOT_FOUND 626 #define ZALREADYEXIST 630 @@ -1070,26 +1073,34 @@ void Dbdict::readSchemaConf(Signal* signal, for (Uint32 n = 0; n < xsf->noOfPages; n++) { SchemaFile * sf = &xsf->schemaPage[n]; bool ok = false; + const char *reason; if (memcmp(sf->Magic, NDB_SF_MAGIC, sizeof(sf->Magic)) != 0) - { jam(); } + { jam(); reason = "magic code"; } else if (sf->FileSize == 0) - { jam(); } + { jam(); reason = "file size == 0"; } else if (sf->FileSize % NDB_SF_PAGE_SIZE != 0) - { jam(); } + { jam(); reason = "invalid size multiple"; } else if (sf->FileSize != sf0->FileSize) - { jam(); } + { jam(); reason = "invalid size"; } else if (sf->PageNumber != n) - { jam(); } + { jam(); reason = "invalid page number"; } else if (computeChecksum((Uint32*)sf, NDB_SF_PAGE_SIZE_IN_WORDS) != 0) - { jam(); } - else if (crashInd) - { jam(); } + { jam(); reason = "invalid checksum"; } else ok = true; - ndbrequireErr(ok, NDBD_EXIT_SR_SCHEMAFILE); - if (! ok) { + + if (!ok) + { + char reason_msg[128]; + snprintf(reason_msg, sizeof(reason_msg), + "schema file corrupt, page %u (%s, " + "sz=%u sz0=%u pn=%u)", + n, reason, sf->FileSize, sf0->FileSize, sf->PageNumber); + if (crashInd) + progError(__LINE__, NDBD_EXIT_SR_SCHEMAFILE, reason_msg); jam(); ndbrequire(fsPtr.p->fsState == FsConnectRecord::READ_SCHEMA1); + infoEvent("primary %s, trying backup", reason_msg); readSchemaRef(signal, fsPtr); return; } From 17944acee241df94f19d6312dc17ea916b549ff4 Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Tue, 15 May 2007 12:07:09 +0200 Subject: [PATCH 24/40] add error code to ndb require --- ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index a8db352e705..d86f32dc8d1 100644 --- a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -1098,8 +1098,9 @@ void Dbdict::readSchemaConf(Signal* signal, n, reason, sf->FileSize, sf0->FileSize, sf->PageNumber); if (crashInd) progError(__LINE__, NDBD_EXIT_SR_SCHEMAFILE, reason_msg); + ndbrequireErr(fsPtr.p->fsState == FsConnectRecord::READ_SCHEMA1, + NDBD_EXIT_SR_SCHEMAFILE); jam(); - ndbrequire(fsPtr.p->fsState == FsConnectRecord::READ_SCHEMA1); infoEvent("primary %s, trying backup", reason_msg); readSchemaRef(signal, fsPtr); return; From 3ea2ee357b03d909566965046a9a2c5f32f91b3b Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/april.(none)" <> Date: Wed, 16 May 2007 23:42:32 +0500 Subject: [PATCH 25/40] BUG#25712 - insert delayed and check table run together report crashed tables In case the system doesn't have native pread/pwrite calls (e.g. Windows) and CHECK TABLE runs concurrently with another statement that reads from a table, the table may be reported as crashed.
This is fixed by locking file descriptor when my_seek is executed on MyISAM index file and emulated pread/pwrite may be executed concurrently. Affects MyISAM tables on platforms that do not have native pread/pwrite calls (e.g. Windows). No deterministic test case for this bug. --- myisam/mi_check.c | 13 +++++++------ mysys/my_seek.c | 15 +++++++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/myisam/mi_check.c b/myisam/mi_check.c index 3a7817b7f03..ce8fb04874e 100644 --- a/myisam/mi_check.c +++ b/myisam/mi_check.c @@ -336,7 +336,7 @@ int chk_size(MI_CHECK *param, register MI_INFO *info) flush_key_blocks(info->s->key_cache, info->s->kfile, FLUSH_FORCE_WRITE); - size=my_seek(info->s->kfile,0L,MY_SEEK_END,MYF(0)); + size= my_seek(info->s->kfile, 0L, MY_SEEK_END, MYF(MY_THREADSAFE)); if ((skr=(my_off_t) info->state->key_file_length) != size) { if (skr > size) @@ -595,7 +595,8 @@ static int chk_index_down(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo, { /* purecov: begin tested */ /* Give it a chance to fit in the real file size. */ - my_off_t max_length= my_seek(info->s->kfile, 0L, MY_SEEK_END, MYF(0)); + my_off_t max_length= my_seek(info->s->kfile, 0L, MY_SEEK_END, + MYF(MY_THREADSAFE)); mi_check_print_error(param, "Invalid key block position: %s " "key block size: %u file_length: %s", llstr(page, llbuff), keyinfo->block_length, @@ -4039,10 +4040,10 @@ int test_if_almost_full(MI_INFO *info) { if (info->s->options & HA_OPTION_COMPRESS_RECORD) return 0; - return (my_seek(info->s->kfile,0L,MY_SEEK_END,MYF(0))/10*9 > - (my_off_t) (info->s->base.max_key_file_length) || - my_seek(info->dfile,0L,MY_SEEK_END,MYF(0))/10*9 > - (my_off_t) info->s->base.max_data_file_length); + return my_seek(info->s->kfile, 0L, MY_SEEK_END, MYF(MY_THREADSAFE)) / 10 * 9 > + (my_off_t) info->s->base.max_key_file_length || + my_seek(info->dfile, 0L, MY_SEEK_END, MYF(0)) / 10 * 9 > + (my_off_t) info->s->base.max_data_file_length; } /* Recreate table with bigger more alloced record-data */ diff --git a/mysys/my_seek.c b/mysys/my_seek.c index a9ae68cd5f0..cb0fe75d7e5 100644 --- a/mysys/my_seek.c +++ b/mysys/my_seek.c @@ -24,7 +24,9 @@ my_off_t pos The expected position (absolute or relative) int whence A direction parameter and one of {SEEK_SET, SEEK_CUR, SEEK_END} - myf MyFlags Not used. + myf MyFlags MY_THREADSAFE must be set in case my_seek may be mixed + with my_pread/my_pwrite calls and fd is shared among + threads. 
DESCRIPTION The my_seek function is a wrapper around the system call lseek and @@ -51,7 +53,16 @@ my_off_t my_seek(File fd, my_off_t pos, int whence, whence, MyFlags)); DBUG_ASSERT(pos != MY_FILEPOS_ERROR); /* safety check */ - newpos=lseek(fd, pos, whence); +#if defined(THREAD) && !defined(HAVE_PREAD) + if (MyFlags & MY_THREADSAFE) + { + pthread_mutex_lock(&my_file_info[fd].mutex); + newpos= lseek(fd, pos, whence); + pthread_mutex_lock(&my_file_info[fd].mutex); + } + else +#endif + newpos= lseek(fd, pos, whence); if (newpos == (os_off_t) -1) { my_errno=errno; From 1aec91bd1c4b5787f5df63c351526c3c0a46e292 Mon Sep 17 00:00:00 2001 From: "jonas@perch.ndb.mysql.com" <> Date: Thu, 17 May 2007 08:54:30 +0200 Subject: [PATCH 26/40] ndb - bug#28491 disable "disable expand check" as it does not solve problems anyway --- storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp b/storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp index 3b27446d3a9..d34cfb159a4 100644 --- a/storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp +++ b/storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp @@ -5202,9 +5202,9 @@ void Dbacc::execEXPANDCHECK2(Signal* signal) { jamEntry(); - if(refToBlock(signal->getSendersBlockRef()) == DBLQH){ + if(refToBlock(signal->getSendersBlockRef()) == DBLQH) + { jam(); - reenable_expand_after_redo_log_exection_complete(signal); return; } From 4f11124c485ccfed76e50b19eeaf18101d3bdf53 Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/june.mysql.com" <> Date: Thu, 17 May 2007 12:43:52 +0500 Subject: [PATCH 27/40] Addition to fix for BUG#25712 - insert delayed and check table run together report crashed tables Let MY_THREADSAFE have distinct value. Some functions call my_seek passing MyFlags argument directly to it. This may cause unnecessary locks, which may finally lead to a dead-lock (specifically see my_lock). --- include/my_sys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/my_sys.h b/include/my_sys.h index 4c9a7a7964c..759531fa649 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -73,8 +73,8 @@ extern int NEAR my_errno; /* Last error in mysys */ #define MY_ALLOW_ZERO_PTR 64 /* my_realloc() ; zero ptr -> malloc */ #define MY_FREE_ON_ERROR 128 /* my_realloc() ; Free old ptr on error */ #define MY_HOLD_ON_ERROR 256 /* my_realloc() ; Return old ptr on error */ -#define MY_THREADSAFE 128 /* pread/pwrite: Don't allow interrupts */ #define MY_DONT_OVERWRITE_FILE 1024 /* my_copy; Don't overwrite file */ +#define MY_THREADSAFE 2048 /* my_seek(): lock fd mutex */ #define MY_CHECK_ERROR 1 /* Params to my_end; Check open-close */ #define MY_GIVE_INFO 2 /* Give time info about process*/ From f6a111dfef4ef1c442a4954d31628eed92a0f357 Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/june.mysql.com" <> Date: Thu, 17 May 2007 15:23:59 +0500 Subject: [PATCH 28/40] Addition to fix for BUG#25712 - insert delayed and check table run together report crashed tables Fixed wrongly applied patch. 
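Note: taken together, patches 25, 27 and 28 leave my_seek() with the locking pattern condensed below. This is an illustrative sketch only, using the identifiers that appear in the diffs (my_file_info, MY_THREADSAFE, HAVE_PREAD); the wrapper name my_seek_sketch is invented for the example.

/*
  Sketch: serialize lseek() against emulated pread/pwrite on the same fd.
  The mutex is taken only when pread/pwrite are emulated (no HAVE_PREAD)
  and the caller explicitly asks for it with MY_THREADSAFE.
*/
my_off_t my_seek_sketch(File fd, my_off_t pos, int whence, myf MyFlags)
{
  os_off_t newpos;
#if defined(THREAD) && !defined(HAVE_PREAD)
  if (MyFlags & MY_THREADSAFE)
  {
    pthread_mutex_lock(&my_file_info[fd].mutex);
    newpos= lseek(fd, pos, whence);
    /* Patch 25 locked twice by mistake; patch 28 makes this an unlock. */
    pthread_mutex_unlock(&my_file_info[fd].mutex);
  }
  else
#endif
    newpos= lseek(fd, pos, whence);
  return (my_off_t) newpos;
}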
--- mysys/my_seek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mysys/my_seek.c b/mysys/my_seek.c index cb0fe75d7e5..805a5654ffc 100644 --- a/mysys/my_seek.c +++ b/mysys/my_seek.c @@ -58,7 +58,7 @@ my_off_t my_seek(File fd, my_off_t pos, int whence, { pthread_mutex_lock(&my_file_info[fd].mutex); newpos= lseek(fd, pos, whence); - pthread_mutex_lock(&my_file_info[fd].mutex); + pthread_mutex_unlock(&my_file_info[fd].mutex); } else #endif From b26bbbb8ff4e72797ad13c4b7aa15301a7dd3a6a Mon Sep 17 00:00:00 2001 From: "jonas@perch.ndb.mysql.com" <> Date: Fri, 18 May 2007 09:48:52 +0200 Subject: [PATCH 29/40] ndb - bug#28443 Make sure that data cannot be left lingering in receive buffer --- ndb/src/common/transporter/Packer.cpp | 5 +++ .../common/transporter/TCP_Transporter.hpp | 4 ++ .../transporter/TransporterRegistry.cpp | 39 ++++++++++--------- ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp | 17 ++++++++ ndb/test/ndbapi/testNdbApi.cpp | 34 ++++++++++++++++ ndb/test/run-test/daily-basic-tests.txt | 4 ++ 6 files changed, 85 insertions(+), 18 deletions(-) diff --git a/ndb/src/common/transporter/Packer.cpp b/ndb/src/common/transporter/Packer.cpp index 9eba335330d..503ff453e7e 100644 --- a/ndb/src/common/transporter/Packer.cpp +++ b/ndb/src/common/transporter/Packer.cpp @@ -21,7 +21,12 @@ #include #include +#ifdef ERROR_INSERT +Uint32 MAX_RECEIVED_SIGNALS = 1024; +#else #define MAX_RECEIVED_SIGNALS 1024 +#endif + Uint32 TransporterRegistry::unpack(Uint32 * readPtr, Uint32 sizeOfData, diff --git a/ndb/src/common/transporter/TCP_Transporter.hpp b/ndb/src/common/transporter/TCP_Transporter.hpp index 151ec261506..d081c6175a0 100644 --- a/ndb/src/common/transporter/TCP_Transporter.hpp +++ b/ndb/src/common/transporter/TCP_Transporter.hpp @@ -100,6 +100,10 @@ private: virtual void updateReceiveDataPtr(Uint32 bytesRead); virtual Uint32 get_free_buffer() const; + + inline bool hasReceiveData () const { + return receiveBuffer.sizeOfData > 0; + } protected: /** * Setup client/server and perform connect/accept diff --git a/ndb/src/common/transporter/TransporterRegistry.cpp b/ndb/src/common/transporter/TransporterRegistry.cpp index 3f190d16264..c0ee93a8489 100644 --- a/ndb/src/common/transporter/TransporterRegistry.cpp +++ b/ndb/src/common/transporter/TransporterRegistry.cpp @@ -807,6 +807,7 @@ TransporterRegistry::poll_OSE(Uint32 timeOutMillis) Uint32 TransporterRegistry::poll_TCP(Uint32 timeOutMillis) { + bool hasdata = false; if (false && nTCPTransporters == 0) { tcpReadSelectReply = 0; @@ -851,6 +852,7 @@ TransporterRegistry::poll_TCP(Uint32 timeOutMillis) // Put the connected transporters in the socket read-set FD_SET(socket, &tcpReadset); } + hasdata |= t->hasReceiveData(); } // The highest socket value plus one @@ -867,7 +869,7 @@ TransporterRegistry::poll_TCP(Uint32 timeOutMillis) } #endif - return tcpReadSelectReply; + return tcpReadSelectReply || hasdata; } #endif @@ -902,25 +904,26 @@ TransporterRegistry::performReceive() #endif #ifdef NDB_TCP_TRANSPORTER - if(tcpReadSelectReply > 0) + for (int i=0; i<nTCPTransporters; i++) { - checkJobBuffer(); - TCP_Transporter *t = theTCPTransporters[i]; - const NodeId nodeId = t->getRemoteNodeId(); - const NDB_SOCKET_TYPE socket = t->getSocket(); - if(is_connected(nodeId)){ - if(t->isConnected() && FD_ISSET(socket, &tcpReadset)) + checkJobBuffer(); + TCP_Transporter *t = theTCPTransporters[i]; + const NodeId nodeId = t->getRemoteNodeId(); + const NDB_SOCKET_TYPE socket = t->getSocket(); + if(is_connected(nodeId)){ + if(t->isConnected()) + { + if (FD_ISSET(socket, &tcpReadset)) { - const int receiveSize = t->doReceive(); - if(receiveSize > 0) - { - Uint32 * ptr; -
Uint32 sz = t->getReceiveData(&ptr); - Uint32 szUsed = unpack(ptr, sz, nodeId, ioStates[nodeId]); - t->updateReceiveDataPtr(szUsed); - } + t->doReceive(); + } + + if (t->hasReceiveData()) + { + Uint32 * ptr; + Uint32 sz = t->getReceiveData(&ptr); + Uint32 szUsed = unpack(ptr, sz, nodeId, ioStates[nodeId]); + t->updateReceiveDataPtr(szUsed); } } } diff --git a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp index 6519444c364..c5f1ba2575a 100644 --- a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp +++ b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp @@ -135,6 +135,7 @@ Cmvmi::~Cmvmi() #ifdef ERROR_INSERT NodeBitmask c_error_9000_nodes_mask; +extern Uint32 MAX_RECEIVED_SIGNALS; #endif void Cmvmi::execNDB_TAMPER(Signal* signal) @@ -164,6 +165,22 @@ void Cmvmi::execNDB_TAMPER(Signal* signal) kill(getpid(), SIGABRT); } #endif + +#ifdef ERROR_INSERT + if (signal->theData[0] == 9003) + { + if (MAX_RECEIVED_SIGNALS < 1024) + { + MAX_RECEIVED_SIGNALS = 1024; + } + else + { + MAX_RECEIVED_SIGNALS = rand() % 128; + } + ndbout_c("MAX_RECEIVED_SIGNALS: %d", MAX_RECEIVED_SIGNALS); + CLEAR_ERROR_INSERT_VALUE; + } +#endif }//execNDB_TAMPER() void Cmvmi::execSET_LOGLEVELORD(Signal* signal) diff --git a/ndb/test/ndbapi/testNdbApi.cpp b/ndb/test/ndbapi/testNdbApi.cpp index 3a06269f8dc..1ef8c628dd4 100644 --- a/ndb/test/ndbapi/testNdbApi.cpp +++ b/ndb/test/ndbapi/testNdbApi.cpp @@ -1131,7 +1131,36 @@ int runBug_11133(NDBT_Context* ctx, NDBT_Step* step){ return result; } +int +runBug28443(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int records = ctx->getNumRecords(); + + NdbRestarter restarter; + restarter.insertErrorInAllNodes(9003); + + for (Uint32 i = 0; i<ctx->getNumLoops(); i++) + { + HugoTransactions hugoTrans(*ctx->getTab()); + if (hugoTrans.loadTable(GETNDB(step), records, 2048) != 0) + { + result = NDBT_FAILED; + goto done; + } + if (runClearTable(ctx, step) != 0) + { + result = NDBT_FAILED; + goto done; + } + } + +done: + restarter.insertErrorInAllNodes(9003); + + return result; +} NDBT_TESTSUITE(testNdbApi); TESTCASE("MaxNdb", @@ -1212,6 +1241,11 @@ TESTCASE("Bug_11133", INITIALIZER(runBug_11133); FINALIZER(runClearTable); } +TESTCASE("Bug28443", + ""){ + INITIALIZER(runBug28443); + FINALIZER(runClearTable); +} NDBT_TESTSUITE_END(testNdbApi); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 51ee6d14f00..fffe1ac9046 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -606,6 +606,10 @@ max-time: 500 cmd: testNdbApi args: -n Bug_11133 T1 +max-time: 1000 +cmd: testNdbApi +args: -n Bug28443 + #max-time: 500 #cmd: testInterpreter #args: T1 From b9fd34a9ff46daeeb71ddff6aea3b98cf49f50e7 Mon Sep 17 00:00:00 2001 From: "jonas@perch.ndb.mysql.com" <> Date: Fri, 18 May 2007 11:06:03 +0200 Subject: [PATCH 30/40] ndb - bug#28443 review comment if some tcp-transporter has data, then do select with timeout 0 --- .../transporter/TransporterRegistry.cpp | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/ndb/src/common/transporter/TransporterRegistry.cpp b/ndb/src/common/transporter/TransporterRegistry.cpp index c0ee93a8489..c459b5640dc 100644 --- a/ndb/src/common/transporter/TransporterRegistry.cpp +++ b/ndb/src/common/transporter/TransporterRegistry.cpp @@ -814,22 +814,6 @@ TransporterRegistry::poll_TCP(Uint32 timeOutMillis) return 0; } - struct timeval timeout; -#ifdef NDB_OSE - // Return directly if there are
no TCP transporters configured - - if(timeOutMillis <= 1){ - timeout.tv_sec = 0; - timeout.tv_usec = 1025; - } else { - timeout.tv_sec = timeOutMillis / 1000; - timeout.tv_usec = (timeOutMillis % 1000) * 1000; - } -#else - timeout.tv_sec = timeOutMillis / 1000; - timeout.tv_usec = (timeOutMillis % 1000) * 1000; -#endif - NDB_SOCKET_TYPE maxSocketValue = -1; // Needed for TCP/IP connections @@ -855,6 +839,24 @@ TransporterRegistry::poll_TCP(Uint32 timeOutMillis) hasdata |= t->hasReceiveData(); } + timeOutMillis = hasdata ? 0 : timeOutMillis; + + struct timeval timeout; +#ifdef NDB_OSE + // Return directly if there are no TCP transporters configured + + if(timeOutMillis <= 1){ + timeout.tv_sec = 0; + timeout.tv_usec = 1025; + } else { + timeout.tv_sec = timeOutMillis / 1000; + timeout.tv_usec = (timeOutMillis % 1000) * 1000; + } +#else + timeout.tv_sec = timeOutMillis / 1000; + timeout.tv_usec = (timeOutMillis % 1000) * 1000; +#endif + // The highest socket value plus one maxSocketValue++; From 3019b64801f865bcbabcbc1e8aea1871809d3e0f Mon Sep 17 00:00:00 2001 From: "jonas@perch.ndb.mysql.com" <> Date: Fri, 18 May 2007 11:34:57 +0200 Subject: [PATCH 31/40] ndb - bug#28443 review comment 2, atleast 1 signal need for test prg --- ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp index c5f1ba2575a..75a6117ce08 100644 --- a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp +++ b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp @@ -175,7 +175,7 @@ void Cmvmi::execNDB_TAMPER(Signal* signal) } else { - MAX_RECEIVED_SIGNALS = rand() % 128; + MAX_RECEIVED_SIGNALS = 1 + (rand() % 128); } ndbout_c("MAX_RECEIVED_SIGNALS: %d", MAX_RECEIVED_SIGNALS); CLEAR_ERROR_INSERT_VALUE; From e3f7947da028887ec1932a7e43caf8ea5b91e06b Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/june.mysql.com" <> Date: Fri, 18 May 2007 16:23:46 +0500 Subject: [PATCH 32/40] BUG#28341 - Security issue still in library loading UDF can be created from any library in any part of the server LD_LIBRARY_PATH. Allow to load udfs only from plugin_dir. On windows, refuse to open udf in case it's path contains a slash. No good test case for this bug because of imperfect error message that includes error code and error string when it fails to dlopen a library. 
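Note: the sql_udf.cc hunks that follow enforce one rule, condensed in the sketch below: reject any pathname component in the dl name, then resolve the library strictly under opt_plugin_dir before dlopen() ever sees it. Identifiers are the ones from the diff; udf_dlopen_sketch is invented for the example, and the patch's IF_WIN() macro is written as a plain #ifdef here.

/*
  Sketch: load a UDF library from plugin_dir only. A name containing
  FN_LIBCHAR (or '/' on Windows) never reaches dlopen().
*/
static void *udf_dlopen_sketch(const char *dl_name)
{
  char dlpath[FN_REFLEN];
  size_t len= strlen(dl_name);
  if (my_strchr(files_charset_info, dl_name, dl_name + len, FN_LIBCHAR)
#ifdef __WIN__
      || my_strchr(files_charset_info, dl_name, dl_name + len, '/')
#endif
     )
    return NULL;                       /* caller raises ER_UDF_NO_PATHS */
  strxnmov(dlpath, sizeof(dlpath) - 1, opt_plugin_dir, "/", dl_name, NullS);
  return dlopen(dlpath, RTLD_NOW);     /* fails unless it lives in plugin_dir */
}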
--- mysql-test/mysql-test-run.pl | 23 ++++------------------- mysql-test/r/plugin.result | 3 ++- mysql-test/t/plugin-master.opt | 1 + mysql-test/t/udf-master.opt | 1 + sql/sql_udf.cc | 30 ++++++++++++++++++++++-------- 5 files changed, 30 insertions(+), 28 deletions(-) create mode 100644 mysql-test/t/plugin-master.opt create mode 100644 mysql-test/t/udf-master.opt diff --git a/mysql-test/mysql-test-run.pl b/mysql-test/mysql-test-run.pl index a305b1d052d..b8d750c6d15 100755 --- a/mysql-test/mysql-test-run.pl +++ b/mysql-test/mysql-test-run.pl @@ -1766,22 +1766,6 @@ sub environment_setup () { push(@ld_library_paths, "$glob_basedir/storage/ndb/src/.libs"); } - # -------------------------------------------------------------------------- - # Add the path where mysqld will find udf_example.so - # -------------------------------------------------------------------------- - if ( $lib_udf_example ) - { - push(@ld_library_paths, dirname($lib_udf_example)); - } - - # -------------------------------------------------------------------------- - # Add the path where mysqld will find ha_example.so - # -------------------------------------------------------------------------- - if ( $lib_example_plugin ) - { - push(@ld_library_paths, dirname($lib_example_plugin)); - } - # -------------------------------------------------------------------------- # Valgrind need to be run with debug libraries otherwise it's almost # impossible to add correct supressions, that means if "/usr/lib/debug" @@ -2060,12 +2044,16 @@ sub environment_setup () { # ---------------------------------------------------- $ENV{'UDF_EXAMPLE_LIB'}= ($lib_udf_example ? basename($lib_udf_example) : ""); + $ENV{'UDF_EXAMPLE_LIB_OPT'}= + ($lib_udf_example ? "--plugin_dir=" . dirname($lib_udf_example) : ""); # ---------------------------------------------------- # Add the path where mysqld will find ha_example.so # ---------------------------------------------------- $ENV{'EXAMPLE_PLUGIN'}= ($lib_example_plugin ? basename($lib_example_plugin) : ""); + $ENV{'EXAMPLE_PLUGIN_OPT'}= + ($lib_example_plugin ? "--plugin_dir=" . 
dirname($lib_example_plugin) : ""); # ---------------------------------------------------- # We are nice and report a bit about our settings @@ -3821,9 +3809,6 @@ sub mysqld_arguments ($$$$) { mtr_add_arg($args, "%s--ndb-extra-logging", $prefix); } } - - mtr_add_arg($args, "%s--plugin_dir=%s", $prefix, - dirname($lib_example_plugin)); } else { diff --git a/mysql-test/r/plugin.result b/mysql-test/r/plugin.result index 44641858fca..6005bdcae8d 100644 --- a/mysql-test/r/plugin.result +++ b/mysql-test/r/plugin.result @@ -1,6 +1,7 @@ CREATE TABLE t1(a int) ENGINE=EXAMPLE; Warnings: -Error 1286 Unknown table engine 'EXAMPLE' +Warning 1286 Unknown table engine 'EXAMPLE' +Warning 1266 Using storage engine MyISAM for table 't1' DROP TABLE t1; INSTALL PLUGIN example SONAME 'ha_example.so'; INSTALL PLUGIN EXAMPLE SONAME 'ha_example.so'; diff --git a/mysql-test/t/plugin-master.opt b/mysql-test/t/plugin-master.opt new file mode 100644 index 00000000000..367d5233e0e --- /dev/null +++ b/mysql-test/t/plugin-master.opt @@ -0,0 +1 @@ +$EXAMPLE_PLUGIN_OPT diff --git a/mysql-test/t/udf-master.opt b/mysql-test/t/udf-master.opt new file mode 100644 index 00000000000..7d8786c156a --- /dev/null +++ b/mysql-test/t/udf-master.opt @@ -0,0 +1 @@ +$UDF_EXAMPLE_LIB_OPT diff --git a/sql/sql_udf.cc b/sql/sql_udf.cc index 89084c21c0c..505f7a9a765 100644 --- a/sql/sql_udf.cc +++ b/sql/sql_udf.cc @@ -169,11 +169,15 @@ void udf_init() Ensure that the .dll doesn't have a path This is done to ensure that only approved dll from the system directories are used (to make this even remotely secure). + + On windows we must check both FN_LIBCHAR and '/'. */ if (my_strchr(files_charset_info, dl_name, - dl_name + strlen(dl_name), FN_LIBCHAR) || - check_string_char_length(&name, "", NAME_CHAR_LEN, - system_charset_info, 1)) + dl_name + strlen(dl_name), FN_LIBCHAR) || + IF_WIN(my_strchr(files_charset_info, dl_name, + dl_name + strlen(dl_name), '/'), 0) || + check_string_char_length(&name, "", NAME_CHAR_LEN, + system_charset_info, 1)) { sql_print_error("Invalid row in mysql.func table for function '%.64s'", name.str); @@ -190,10 +194,13 @@ void udf_init() void *dl = find_udf_dl(tmp->dl); if (dl == NULL) { - if (!(dl= dlopen(tmp->dl, RTLD_NOW))) + char dlpath[FN_REFLEN]; + strxnmov(dlpath, sizeof(dlpath) - 1, opt_plugin_dir, "/", tmp->dl, + NullS); + if (!(dl= dlopen(dlpath, RTLD_NOW))) { /* Print warning to log */ - sql_print_error(ER(ER_CANT_OPEN_LIBRARY), tmp->dl, errno, dlerror()); + sql_print_error(ER(ER_CANT_OPEN_LIBRARY), tmp->dl, errno, dlerror()); /* Keep the udf in the hash so that we can remove it later */ continue; } @@ -394,8 +401,13 @@ int mysql_create_function(THD *thd,udf_func *udf) Ensure that the .dll doesn't have a path This is done to ensure that only approved dll from the system directories are used (to make this even remotely secure). + + On windows we must check both FN_LIBCHAR and '/'. 
*/ - if (my_strchr(files_charset_info, udf->dl, udf->dl + strlen(udf->dl), FN_LIBCHAR)) + if (my_strchr(files_charset_info, udf->dl, + udf->dl + strlen(udf->dl), FN_LIBCHAR) || + IF_WIN(my_strchr(files_charset_info, udf->dl, + udf->dl + strlen(udf->dl), '/'), 0)) { my_message(ER_UDF_NO_PATHS, ER(ER_UDF_NO_PATHS), MYF(0)); DBUG_RETURN(1); @@ -422,10 +434,12 @@ int mysql_create_function(THD *thd,udf_func *udf) } if (!(dl = find_udf_dl(udf->dl))) { - if (!(dl = dlopen(udf->dl, RTLD_NOW))) + char dlpath[FN_REFLEN]; + strxnmov(dlpath, sizeof(dlpath) - 1, opt_plugin_dir, "/", udf->dl, NullS); + if (!(dl = dlopen(dlpath, RTLD_NOW))) { DBUG_PRINT("error",("dlopen of %s failed, error: %d (%s)", - udf->dl, errno, dlerror())); + udf->dl, errno, dlerror())); my_error(ER_CANT_OPEN_LIBRARY, MYF(0), udf->dl, errno, dlerror()); goto err; From 725d728f0e9bf93491e8b940f733cc825e656224 Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/june.mysql.com" <> Date: Mon, 21 May 2007 11:34:39 +0500 Subject: [PATCH 33/40] Addition to fix for BUG#28341 - Security issue still in library loading Added required option files to rpl_udf test. --- mysql-test/t/rpl_udf-master.opt | 1 + mysql-test/t/rpl_udf-slave.opt | 1 + 2 files changed, 2 insertions(+) create mode 100644 mysql-test/t/rpl_udf-master.opt create mode 100644 mysql-test/t/rpl_udf-slave.opt diff --git a/mysql-test/t/rpl_udf-master.opt b/mysql-test/t/rpl_udf-master.opt new file mode 100644 index 00000000000..7d8786c156a --- /dev/null +++ b/mysql-test/t/rpl_udf-master.opt @@ -0,0 +1 @@ +$UDF_EXAMPLE_LIB_OPT diff --git a/mysql-test/t/rpl_udf-slave.opt b/mysql-test/t/rpl_udf-slave.opt new file mode 100644 index 00000000000..7d8786c156a --- /dev/null +++ b/mysql-test/t/rpl_udf-slave.opt @@ -0,0 +1 @@ +$UDF_EXAMPLE_LIB_OPT From fbf65c86ea533f1e51097a96a2e3943e9441eeca Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/june.mysql.com" <> Date: Mon, 21 May 2007 11:48:05 +0500 Subject: [PATCH 34/40] BUG#25659 - memory leak via "plugins" test Re-enabled plugin test to check if it still leaks memory. --- mysql-test/t/disabled.def | 1 - 1 file changed, 1 deletion(-) diff --git a/mysql-test/t/disabled.def b/mysql-test/t/disabled.def index f054cf26813..79191233f53 100644 --- a/mysql-test/t/disabled.def +++ b/mysql-test/t/disabled.def @@ -35,7 +35,6 @@ synchronization : Bug#24529 Test 'synchronization' fails on Mac pushb #ndb_binlog_discover : bug#21806 2006-08-24 #ndb_autodiscover3 : bug#21806 -plugin : Bug#25659 memory leak via "plugins" test #rpl_ndb_dd_advance : Bug#25913 rpl_ndb_dd_advance fails randomly rpl_ndb_stm_innodb : Bug#26783 From fbb5c31aa968172fdafb58cafe98fdf03da69a69 Mon Sep 17 00:00:00 2001 From: "svoj@mysql.com/june.mysql.com" <> Date: Mon, 21 May 2007 17:48:29 +0500 Subject: [PATCH 35/40] BUG#25659 - memory leak via "plugins" test - Added suppressions for dlopen to make plugin test pass. - Do not pass empty string to mysqld, since my_getopt is not capable to handle it. - Re-enabled trailing UNINSTALL PLUGIN statement of plugin.test. The memory leak described in the bug report happens in libdl, not in mysqld. On some valgrind installations this error is suppressed by default, no idea why it isn't suppressed on pb-valgrind. If library remains open after thread has finished, and is closed by another thread, we get memory leak. But in case library is opened and closed by the same thread no leak occurs. 
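Note: the same-thread behaviour described above can be demonstrated with a minimal standalone program (illustrative only; the library name is just an example, link with -ldl).

#include <dlfcn.h>

int main(void)
{
  /*
    Open and close the library in the same thread: the loader's
    bookkeeping is released and valgrind reports no leak. Handing the
    handle to a different thread for dlclose() is what leaves the
    allocation behind.
  */
  void *handle= dlopen("ha_example.so", RTLD_NOW);
  if (handle)
    dlclose(handle);
  return 0;
}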
--- mysql-test/lib/mtr_io.pl | 6 +++++- mysql-test/r/plugin.result | 3 +++ mysql-test/t/plugin.test | 6 +++--- mysql-test/valgrind.supp | 17 +++++++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/mysql-test/lib/mtr_io.pl b/mysql-test/lib/mtr_io.pl index 570a58875c2..aa671c0f4f7 100644 --- a/mysql-test/lib/mtr_io.pl +++ b/mysql-test/lib/mtr_io.pl @@ -120,7 +120,11 @@ sub mtr_get_opts_from_file ($) { $arg =~ s/\$(\w+)/envsubst($1)/ge; # print STDERR "ARG: $arg\n"; - push(@args, $arg); + # Do not pass empty string since my_getopt is not capable to handle it. + if (length($arg)) + { + push(@args, $arg) + } } } close FILE; diff --git a/mysql-test/r/plugin.result b/mysql-test/r/plugin.result index 6005bdcae8d..8628acecf55 100644 --- a/mysql-test/r/plugin.result +++ b/mysql-test/r/plugin.result @@ -12,5 +12,8 @@ CREATE TABLE t1(a int) ENGINE=EXAMPLE; SELECT * FROM t1; a DROP TABLE t1; +UNINSTALL PLUGIN example; +UNINSTALL PLUGIN EXAMPLE; +ERROR 42000: PLUGIN EXAMPLE does not exist UNINSTALL PLUGIN non_exist; ERROR 42000: PLUGIN non_exist does not exist diff --git a/mysql-test/t/plugin.test b/mysql-test/t/plugin.test index 80c1de00b8e..fb6d5febe45 100644 --- a/mysql-test/t/plugin.test +++ b/mysql-test/t/plugin.test @@ -18,9 +18,9 @@ SELECT * FROM t1; DROP TABLE t1; -# Waiting for fix to BUG#22694 -#UNINSTALL PLUGIN example; -#UNINSTALL PLUGIN EXAMPLE; +UNINSTALL PLUGIN example; +--error 1305 +UNINSTALL PLUGIN EXAMPLE; --error 1305 UNINSTALL PLUGIN non_exist; diff --git a/mysql-test/valgrind.supp b/mysql-test/valgrind.supp index 17532d2345f..a4fb488a3d6 100644 --- a/mysql-test/valgrind.supp +++ b/mysql-test/valgrind.supp @@ -426,6 +426,23 @@ fun:_dl_map_object } +{ + libc pthread_exit 6 + Memcheck:Leak + fun:malloc + fun:_dl_map_object + fun:openaux + fun:_dl_catch_error +} + +{ + libc pthread_exit 7 + Memcheck:Leak + fun:malloc + fun:dl_open_worker + fun:_dl_catch_error + fun:_dl_open +} # # This is seen internally in the system libraries on 64-bit RHAS3. From b0677f1ddc90119cecefd7d1b665b3eb769d56e9 Mon Sep 17 00:00:00 2001 From: "joerg@trift2." <> Date: Tue, 22 May 2007 17:06:47 +0200 Subject: [PATCH 36/40] scripts/make_binary_distribution.sh : Include all the additional test suites in the binary packages ("tar.gz"). This is the tar.gz part of the fixes for bug#26609; for RPMs it is already done. --- scripts/make_binary_distribution.sh | 72 ++++++++++++++++------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/scripts/make_binary_distribution.sh b/scripts/make_binary_distribution.sh index e8bf39bd016..77f51c51c19 100644 --- a/scripts/make_binary_distribution.sh +++ b/scripts/make_binary_distribution.sh @@ -102,11 +102,41 @@ case $system in ;; esac +# This is needed to prefer GNU tar over platform tar because that can't +# always handle long filenames + +PATH_DIRS=`echo $PATH | \ + sed -e 's/^:/. /' -e 's/:$/ ./' -e 's/::/ . /g' -e 's/:/ /g' ` + +which_1 () +{ + for cmd + do + for d in $PATH_DIRS + do + for file in $d/$cmd + do + if [ -x $file -a ! -d $file ] ; then + echo $file + exit 0 + fi + done + done + done + exit 1 +} + +tar=`which_1 gnutar gtar` +if [ "$?" 
= "1" -o x"$tar" = x"" ] ; then + tar=tar +fi + mkdir $BASE $BASE/bin $BASE/docs \ $BASE/include $BASE/lib $BASE/support-files $BASE/share $BASE/scripts \ $BASE/mysql-test $BASE/mysql-test/t $BASE/mysql-test/r \ - $BASE/mysql-test/include $BASE/mysql-test/std_data $BASE/mysql-test/lib + $BASE/mysql-test/include $BASE/mysql-test/std_data $BASE/mysql-test/lib \ + $BASE/mysql-test/suite if [ $BASE_SYSTEM != "netware" ] ; then mkdir $BASE/share/mysql $BASE/tests $BASE/sql-bench $BASE/man \ @@ -117,8 +147,8 @@ fi # Copy files if they exists, warn for those that don't. # Note that when listing files to copy, we might list the file name -# twice, once in the directory location where it is build, and a -# second time in the ".libs" location. In the case the firs one +# twice, once in the directory location where it is built, and a +# second time in the ".libs" location. In the case the first one # is a wrapper script, the second one will overwrite it with the # binary file. copyfileto() @@ -274,6 +304,13 @@ $CP mysql-test/t/*.test mysql-test/t/*.imtest \ $CP mysql-test/r/*.result mysql-test/r/*.require \ $BASE/mysql-test/r +# Copy the additional suites "as is", they are in flux +$tar cf - mysql-test/suite | ( cd $BASE ; $tar xf - ) +# Clean up if we did this from a bk tree +if [ -d mysql-test/SCCS ] ; then + find $BASE/mysql-test -name SCCS -print | xargs rm -rf +fi + if [ $BASE_SYSTEM != "netware" ] ; then chmod a+x $BASE/bin/* copyfileto $BASE/bin scripts/* @@ -374,41 +411,12 @@ if [ x$DEBUG = x1 ] ; then exit fi -# This is needed to prefere gnu tar instead of tar because tar can't -# always handle long filenames - -PATH_DIRS=`echo $PATH | \ - sed -e 's/^:/. /' -e 's/:$/ ./' -e 's/::/ . /g' -e 's/:/ /g' ` - -which_1 () -{ - for cmd - do - for d in $PATH_DIRS - do - for file in $d/$cmd - do - if [ -x $file -a ! -d $file ] ; then - echo $file - exit 0 - fi - done - done - done - exit 1 -} - if [ $BASE_SYSTEM != "netware" ] ; then # # Create the result tar file # - tar=`which_1 gnutar gtar` - if [ "$?" = "1" -o x"$tar" = x"" ] ; then - tar=tar - fi - echo "Using $tar to create archive" OPT=cvf From 118500441956f36be3b303b1fb9dd11e3e15ac1b Mon Sep 17 00:00:00 2001 From: "joerg@trift2." <> Date: Tue, 22 May 2007 17:21:22 +0200 Subject: [PATCH 37/40] Add the "row_lock" test suite to the (to be) release build test run. 
--- Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.am b/Makefile.am index 992a6f6d755..6cb080aed5a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -157,6 +157,8 @@ test-bt: @PERL@ ./mysql-test-run.pl --force --comment=funcs1_ps --ps-protocol --suite=funcs_1 -cd mysql-test ; MTR_BUILD_THREAD=auto \ @PERL@ ./mysql-test-run.pl --force --comment=funcs2 --suite=funcs_2 + -cd mysql-test ; MTR_BUILD_THREAD=auto \ + @PERL@ ./mysql-test-run.pl --force --comment=rowlock --suite=row_lock test-bt-debug: -cd mysql-test ; MTR_BUILD_THREAD=auto \ From b46c826aee666ef9bc05a287248d157987e51b2c Mon Sep 17 00:00:00 2001 From: "tomas@whalegate.ndb.mysql.com" <> Date: Tue, 22 May 2007 17:53:07 +0200 Subject: [PATCH 38/40] Bug #28593 cluster backup scans in acc index order, bad for disk data - change to scan in tup and disk order (if applicable) --- .../ndb/src/kernel/blocks/backup/Backup.cpp | 2 +- .../ndb/src/kernel/blocks/dblqh/DblqhMain.cpp | 29 +++++++++++++++++-- .../kernel/blocks/dbtup/DbtupDiskAlloc.cpp | 21 ++++++++++++-- .../ndb/src/kernel/blocks/dbtup/DbtupScan.cpp | 2 -- storage/ndb/src/kernel/blocks/pgman.cpp | 2 ++ 5 files changed, 47 insertions(+), 9 deletions(-) diff --git a/storage/ndb/src/kernel/blocks/backup/Backup.cpp b/storage/ndb/src/kernel/blocks/backup/Backup.cpp index a07617f0bfb..57082eaccc8 100644 --- a/storage/ndb/src/kernel/blocks/backup/Backup.cpp +++ b/storage/ndb/src/kernel/blocks/backup/Backup.cpp @@ -3543,10 +3543,10 @@ Backup::execBACKUP_FRAGMENT_REQ(Signal* signal) ScanFragReq::setHoldLockFlag(req->requestInfo, 0); ScanFragReq::setKeyinfoFlag(req->requestInfo, 0); ScanFragReq::setAttrLen(req->requestInfo,attrLen); + ScanFragReq::setTupScanFlag(req->requestInfo, 1); if (ptr.p->is_lcp()) { ScanFragReq::setScanPrio(req->requestInfo, 1); - ScanFragReq::setTupScanFlag(req->requestInfo, 1); ScanFragReq::setNoDiskFlag(req->requestInfo, 1); ScanFragReq::setLcpScanFlag(req->requestInfo, 1); } diff --git a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index a359267f9d9..6c99e8d0e13 100644 --- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -8476,9 +8476,32 @@ void Dblqh::continueAfterReceivingAllAiLab(Signal* signal) AccScanReq::setLockMode(req->requestInfo, scanptr.p->scanLockMode); AccScanReq::setReadCommittedFlag(req->requestInfo, scanptr.p->readCommitted); AccScanReq::setDescendingFlag(req->requestInfo, scanptr.p->descending); - AccScanReq::setNoDiskScanFlag(req->requestInfo, - !tcConnectptr.p->m_disk_table); - AccScanReq::setLcpScanFlag(req->requestInfo, scanptr.p->lcpScan); + + if (refToBlock(tcConnectptr.p->clientBlockref) == BACKUP) + { + if (scanptr.p->lcpScan) + { + AccScanReq::setNoDiskScanFlag(req->requestInfo, 1); + AccScanReq::setLcpScanFlag(req->requestInfo, 1); + } + else + { + /* If backup scan disktables in disk order */ + AccScanReq::setNoDiskScanFlag(req->requestInfo, + !tcConnectptr.p->m_disk_table); + AccScanReq::setLcpScanFlag(req->requestInfo, 0); + } + } + else + { +#if BUG_27776_FIXED + AccScanReq::setNoDiskScanFlag(req->requestInfo, + !tcConnectptr.p->m_disk_table); +#else + AccScanReq::setNoDiskScanFlag(req->requestInfo, 1); +#endif + AccScanReq::setLcpScanFlag(req->requestInfo, 0); + } req->transId1 = tcConnectptr.p->transid[0]; req->transId2 = tcConnectptr.p->transid[1]; diff --git a/storage/ndb/src/kernel/blocks/dbtup/DbtupDiskAlloc.cpp b/storage/ndb/src/kernel/blocks/dbtup/DbtupDiskAlloc.cpp index 
54abbf18664..db336df6652 100644 --- a/storage/ndb/src/kernel/blocks/dbtup/DbtupDiskAlloc.cpp +++ b/storage/ndb/src/kernel/blocks/dbtup/DbtupDiskAlloc.cpp @@ -318,6 +318,7 @@ Dbtup::restart_setup_page(Disk_alloc_info& alloc, PagePtr pagePtr) unsigned uncommitted, committed; uncommitted = committed = ~(unsigned)0; int ret = tsman.get_page_free_bits(&page, &uncommitted, &committed); + jamEntry(); idx = alloc.calc_page_free_bits(real_free); ddassert(idx == committed); @@ -428,12 +429,12 @@ Dbtup::disk_page_prealloc(Signal* signal, c_extent_pool.getPtr(ext); if ((pageBits= tsman.alloc_page_from_extent(&ext.p->m_key, bits)) >= 0) { - jam(); + jamEntry(); found= true; } else { - jam(); + jamEntry(); /** * The current extent is not in a free list * and since it couldn't accomadate the request @@ -490,7 +491,7 @@ Dbtup::disk_page_prealloc(Signal* signal, if ((err= tsman.alloc_extent(&ext.p->m_key)) < 0) { - jam(); + jamEntry(); #if NOT_YET_UNDO_ALLOC_EXTENT c_lgman->free_log_space(logfile_group_id, sizeof(Disk_undo::AllocExtent)>>2); @@ -542,6 +543,7 @@ Dbtup::disk_page_prealloc(Signal* signal, alloc.m_curr_extent_info_ptr_i= ext.i; ext.p->m_free_matrix_pos= RNIL; pageBits= tsman.alloc_page_from_extent(&ext.p->m_key, bits); + jamEntry(); ddassert(pageBits >= 0); } @@ -601,6 +603,7 @@ Dbtup::disk_page_prealloc(Signal* signal, } int res= m_pgman.get_page(signal, preq, flags); + jamEntry(); switch(res) { case 0: @@ -900,6 +903,7 @@ Dbtup::disk_page_set_dirty(PagePtr pagePtr) // Make sure no one will allocate it... tsman.unmap_page(&key, MAX_FREE_LIST - 1); + jamEntry(); } void @@ -951,6 +955,7 @@ Dbtup::disk_page_unmap_callback(Uint32 page_id, Uint32 dirty_count) fragPtr.p->m_tablespace_id); tsman.unmap_page(&key, idx); + jamEntry(); pagePtr.p->list_index = idx | 0x8000; } @@ -999,6 +1004,7 @@ Dbtup::disk_page_alloc(Signal* signal, fragPtrP->m_tablespace_id); tsman.update_page_free_bits(key, new_bits, lsn); + jamEntry(); } } @@ -1051,6 +1057,7 @@ Dbtup::disk_page_free(Signal *signal, fragPtrP->m_tablespace_id); tsman.update_page_free_bits(key, new_bits, lsn); + jamEntry(); } Uint32 ext = pagePtr.p->m_extent_info_ptr; @@ -1104,6 +1111,7 @@ Dbtup::disk_page_abort_prealloc(Signal *signal, Fragrecord* fragPtrP, memcpy(&req.m_page, key, sizeof(Local_key)); int res= m_pgman.get_page(signal, req, flags); + jamEntry(); switch(res) { case 0: @@ -1232,6 +1240,7 @@ Dbtup::disk_page_alloc_extent_log_buffer_callback(Signal* signal, Uint64 lsn= lgman.add_entry(c, 1); tsman.update_lsn(&key, lsn); + jamEntry(); } #endif @@ -1250,6 +1259,7 @@ Dbtup::disk_page_undo_alloc(Page* page, const Local_key* key, Uint64 lsn= lgman.add_entry(c, 1); m_pgman.update_lsn(* key, lsn); + jamEntry(); return lsn; } @@ -1279,6 +1289,7 @@ Dbtup::disk_page_undo_update(Page* page, const Local_key* key, Uint64 lsn= lgman.add_entry(c, 3); m_pgman.update_lsn(* key, lsn); + jamEntry(); return lsn; } @@ -1308,6 +1319,7 @@ Dbtup::disk_page_undo_free(Page* page, const Local_key* key, Uint64 lsn= lgman.add_entry(c, 3); m_pgman.update_lsn(* key, lsn); + jamEntry(); return lsn; } @@ -1402,6 +1414,7 @@ Dbtup::disk_restart_undo(Signal* signal, Uint64 lsn, int flags = 0; int res= m_pgman.get_page(signal, preq, flags); + jamEntry(); switch(res) { case 0: @@ -1545,6 +1558,7 @@ Dbtup::disk_restart_undo_callback(Signal* signal, lsn = undo->m_lsn - 1; // make sure undo isn't run again... 
m_pgman.update_lsn(undo->m_key, lsn); + jamEntry(); } else if (DBG_UNDO) { @@ -1637,6 +1651,7 @@ Dbtup::disk_restart_undo_page_bits(Signal* signal, Apply_undo* undo) fragPtrP->m_tablespace_id); tsman.restart_undo_page_free_bits(&undo->m_key, new_bits, undo->m_lsn, lsn); + jamEntry(); } int diff --git a/storage/ndb/src/kernel/blocks/dbtup/DbtupScan.cpp b/storage/ndb/src/kernel/blocks/dbtup/DbtupScan.cpp index 04e60edfb2e..eb9b9c7acc2 100644 --- a/storage/ndb/src/kernel/blocks/dbtup/DbtupScan.cpp +++ b/storage/ndb/src/kernel/blocks/dbtup/DbtupScan.cpp @@ -62,13 +62,11 @@ Dbtup::execACC_SCANREQ(Signal* signal) break; } -#if BUG_27776_FIXED if (!AccScanReq::getNoDiskScanFlag(req->requestInfo) && tablePtr.p->m_no_of_disk_attributes) { bits |= ScanOp::SCAN_DD; } -#endif bool mm = (bits & ScanOp::SCAN_DD); if (tablePtr.p->m_attributes[mm].m_no_of_varsize > 0) { diff --git a/storage/ndb/src/kernel/blocks/pgman.cpp b/storage/ndb/src/kernel/blocks/pgman.cpp index af648c71253..719b60fa466 100644 --- a/storage/ndb/src/kernel/blocks/pgman.cpp +++ b/storage/ndb/src/kernel/blocks/pgman.cpp @@ -1561,6 +1561,7 @@ Pgman::execFSWRITEREF(Signal* signal) int Pgman::get_page(Signal* signal, Ptr ptr, Page_request page_req) { + jamEntry(); #ifdef VM_TRACE Ptr tmp = { &page_req, RNIL}; debugOut << "PGMAN: >get_page" << endl; @@ -1708,6 +1709,7 @@ Pgman::get_page(Signal* signal, Ptr ptr, Page_request page_req) void Pgman::update_lsn(Ptr ptr, Uint32 block, Uint64 lsn) { + jamEntry(); #ifdef VM_TRACE const char* bname = getBlockName(block, "?"); debugOut << "PGMAN: >update_lsn: block=" << bname << " lsn=" << lsn << endl; From 8592dfd90d455c5fb6a0c8e6b827fc3603e047be Mon Sep 17 00:00:00 2001 From: "joerg@trift2." <> Date: Thu, 24 May 2007 15:20:06 +0200 Subject: [PATCH 39/40] Makefile.am : Add "embedded" tests to the "test-bt" target. --- Makefile.am | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.am b/Makefile.am index 69ea5809e89..b92ee7ae0bc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -123,6 +123,11 @@ test-bt: -cd mysql-test ; MTR_BUILD_THREAD=auto \ @PERL@ ./mysql-test-run.pl --comment=NDB --force --timer \ --with-ndbcluster-only + -if [ -e bin/mysqltest_embedded -o -e libmysqld/examples/mysqltest_embedded ] ; then \ + cd mysql-test ; MTR_BUILD_THREAD=auto \ + @PERL@ ./mysql-test-run.pl --comment=embedded --force --timer \ + --embedded-server --skip-rpl --skip-ndbcluster ; \ + fi -cd mysql-test ; MTR_BUILD_THREAD=auto \ @PERL@ ./mysql-test-run.pl --force --comment=funcs1_ps --ps-protocol --suite=funcs_1 -cd mysql-test ; MTR_BUILD_THREAD=auto \ From 7034295f79cdc1ea942d4afb98950993c64cf503 Mon Sep 17 00:00:00 2001 From: "jbruehe/mysqldev@mysql.com/production.mysql.com" <> Date: Thu, 24 May 2007 17:35:22 +0200 Subject: [PATCH 40/40] Raise version number after cloning 5.1.19-beta --- configure.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.in b/configure.in index e3c6edb9b0f..3ba8f64cad1 100644 --- a/configure.in +++ b/configure.in @@ -10,7 +10,7 @@ AC_CANONICAL_SYSTEM # # When changing major version number please also check switch statement # in mysqlbinlog::check_master_version(). -AM_INIT_AUTOMAKE(mysql, 5.1.19-beta) +AM_INIT_AUTOMAKE(mysql, 5.1.20-beta) AM_CONFIG_HEADER(config.h) PROTOCOL_VERSION=10
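The comment kept in this last hunk ("When changing major version number
please also check switch statement in mysqlbinlog::check_master_version()")
refers to a dispatch on the major version like the following sketch. This
is illustrative only, not the actual mysqlbinlog code; the function name
check_major and the case bodies are assumptions.

  #include <stdio.h>

  static int check_major(const char *version)   /* e.g. "5.1.20-beta" */
  {
    switch (version[0])                         /* major version digit */
    {
    case '3':
    case '4':
    case '5':
      return 0;                                 /* known major version */
    default:
      fprintf(stderr, "unknown major version in '%s'\n", version);
      return 1;                                 /* update this switch! */
    }
  }

  int main(void)
  {
    return check_major("5.1.20-beta");
  }

Bumping 5.1.19 to 5.1.20 stays within the '5' case, so no switch change
is needed here; a jump to a new major version would require one.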