From 5e55d1ced52c52fb2f0508e1346059901a85960f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 19 Dec 2013 14:36:38 +0200 Subject: [PATCH 01/56] Changes for Fusion-io multi-threaded flush, page compressed tables and tables using atomic write/table. This is work in progress and some parts are at most POC quality. --- storage/innobase/CMakeLists.txt | 3 + storage/innobase/buf/buf0buf.cc | 2 + storage/innobase/buf/buf0dblwr.cc | 31 +- storage/innobase/buf/buf0flu.cc | 324 ++++- storage/innobase/buf/buf0mtflu.cc | 1103 +++++++++++++++++ storage/innobase/buf/buf0rea.cc | 5 +- storage/innobase/dict/dict0dict.cc | 7 +- storage/innobase/fil/fil0fil.cc | 175 ++- storage/innobase/fil/fil0pagecompress.cc | 369 ++++++ storage/innobase/handler/ha_innodb.cc | 242 +++- storage/innobase/handler/ha_innodb.h | 15 + storage/innobase/handler/handler0alter.cc | 28 + storage/innobase/include/buf0buf.h | 6 + storage/innobase/include/dict0dict.h | 14 +- storage/innobase/include/dict0dict.ic | 151 ++- storage/innobase/include/dict0mem.h | 56 +- storage/innobase/include/dict0pagecompress.h | 94 ++ storage/innobase/include/dict0pagecompress.ic | 191 +++ storage/innobase/include/fil0fil.h | 43 +- storage/innobase/include/fil0pagecompress.h | 117 ++ storage/innobase/include/fsp0fsp.h | 66 +- storage/innobase/include/fsp0fsp.ic | 17 + storage/innobase/include/fsp0pagecompress.h | 64 + storage/innobase/include/fsp0pagecompress.ic | 61 + storage/innobase/include/fsp0types.h | 1 + storage/innobase/include/os0file.h | 57 +- storage/innobase/include/os0file.ic | 13 +- storage/innobase/include/srv0mon.h | 10 + storage/innobase/include/srv0srv.h | 64 +- storage/innobase/log/log0log.cc | 17 +- storage/innobase/log/log0recv.cc | 19 +- storage/innobase/os/os0file.cc | 561 ++++++++- storage/innobase/srv/srv0mon.cc | 68 + storage/innobase/srv/srv0srv.cc | 41 +- storage/innobase/srv/srv0start.cc | 720 ++++++++++- 35 files changed, 4559 insertions(+), 196 deletions(-) create mode 100644 storage/innobase/buf/buf0mtflu.cc create mode 100644 storage/innobase/fil/fil0pagecompress.cc create mode 100644 storage/innobase/include/dict0pagecompress.h create mode 100644 storage/innobase/include/dict0pagecompress.ic create mode 100644 storage/innobase/include/fil0pagecompress.h create mode 100644 storage/innobase/include/fsp0pagecompress.h create mode 100644 storage/innobase/include/fsp0pagecompress.ic diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index ee8758a08d2..e41d2406bd2 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -278,6 +278,8 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc +# TODO: JAN uncomment +# buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -291,6 +293,7 @@ SET(INNOBASE_SOURCES eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc + fil/fil0pagecompress.cc fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 6efa14e6791..328d5a6f3bf 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -3254,6 +3255,7 @@ buf_page_init_low( bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; + bpage->write_size = 0; HASH_INVALIDATE(bpage, hash); #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG bpage->file_page_was_freed = FALSE; diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index fb853fe1543..933b56eaf88 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -365,8 +366,8 @@ buf_dblwr_init_or_restore_pages( /* Read the trx sys header to check if we are using the doublewrite buffer */ - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, - UNIV_PAGE_SIZE, read_buf, NULL); + fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, + UNIV_PAGE_SIZE, read_buf, NULL, 0); doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) @@ -402,11 +403,11 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - buf, NULL); - fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0, + buf, NULL, 0); + fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - NULL); + NULL, 0); /* Check if any of these pages is half-written in data files, in the intended position */ @@ -433,8 +434,8 @@ buf_dblwr_init_or_restore_pages( + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; } - fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0, - UNIV_PAGE_SIZE, page, NULL); + fil_io(OS_FILE_WRITE, true, 0, 0, source_page_no, 0, + UNIV_PAGE_SIZE, page, NULL, 0); } else { space_id = mach_read_from_4( @@ -476,7 +477,7 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_READ, TRUE, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); + read_buf, NULL, 0); /* Check if the page is corrupt */ @@ -528,7 +529,7 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_WRITE, TRUE, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); ib_logf(IB_LOG_LEVEL_INFO, "Recovered the page from" @@ -714,7 +715,7 @@ buf_dblwr_write_block_to_datafile( buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), (void*) bpage->zip.data, - (void*) bpage); + (void*) bpage, 0); return; } @@ -727,7 +728,7 @@ buf_dblwr_write_block_to_datafile( fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block); + (void*) block->frame, (void*) block, 0); } /********************************************************************//** @@ -820,7 +821,7 @@ try_again: fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. 
*/ @@ -836,7 +837,7 @@ try_again: fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); flush: /* increment the doublewrite flushed pages counter */ @@ -1056,14 +1057,14 @@ retry: fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf - + UNIV_PAGE_SIZE * i), NULL); + + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. Write it directly to the doublewrite buffer */ fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, - NULL); + NULL, 0); } /* Now flush the doublewrite buffer data to disk */ diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 542c1669667..06ae7b5375c 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, Fusion-io. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -673,8 +675,10 @@ buf_flush_write_complete( flush_type = buf_page_get_flush_type(bpage); buf_pool->n_flush[flush_type]--; +#ifdef UNIV_DEBUG /* fprintf(stderr, "n pending flush %lu\n", buf_pool->n_flush[flush_type]); */ +#endif if (buf_pool->n_flush[flush_type] == 0 && buf_pool->init_flush[flush_type] == FALSE) { @@ -938,7 +942,7 @@ buf_flush_write_block_low( FALSE, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - frame, bpage); + frame, bpage, 0); } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { buf_dblwr_write_single_page(bpage); } else { @@ -1213,7 +1217,9 @@ buf_flush_try_neighbors( } } +#ifdef UNIV_DEBUG /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */ +#endif if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); @@ -1655,7 +1661,7 @@ pages: to avoid deadlocks, this function must be written so that it cannot end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! 
@return number of blocks for which the write request was queued */ -static +//static ulint buf_flush_batch( /*============*/ @@ -1712,7 +1718,7 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -static +//static void buf_flush_common( /*=============*/ @@ -1737,7 +1743,7 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -static +//static ibool buf_flush_start( /*============*/ @@ -1766,7 +1772,7 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -static +//static void buf_flush_end( /*==========*/ @@ -1816,11 +1822,55 @@ buf_flush_wait_batch_end( } } else { thd_wait_begin(NULL, THD_WAIT_DISKIO); - os_event_wait(buf_pool->no_flush[type]); + os_event_wait(buf_pool->no_flush[type]); thd_wait_end(NULL); } } +/* JAN: TODO: */ +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list and also +puts replaceable clean pages from the end of the LRU list to the free +list. +NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully. false if another batch +of same type was already running. */ +static +bool +pgcomp_buf_flush_LRU( +/*==========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. 
Ignored if NULL */ +{ + ulint page_count; + + if (n_processed) { + *n_processed = 0; + } + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + return(false); + } + + page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, page_count); + + if (n_processed) { + *n_processed = page_count; + } + + return(true); +} +/* JAN: TODO: END: */ + /*******************************************************************//** This utility flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free @@ -1863,6 +1913,168 @@ buf_flush_LRU( return(true); } +/* JAN: TODO: */ +/*******************************************************************//**/ +extern int is_pgcomp_wrk_init_done(void); +extern int pgcomp_flush_work_items(int buf_pool_inst, int *pages_flushed, + int flush_type, int min_n, unsigned long long lsn_limit); + +#define MT_COMP_WATER_MARK 50 + +#include +int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time) +{ + if (g_time->tv_usec < s_time->tv_usec) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1; + s_time->tv_usec -= 1000000 * nsec; + s_time->tv_sec += nsec; + } + if (g_time->tv_usec - s_time->tv_usec > 1000000) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000; + s_time->tv_usec += 1000000 * nsec; + s_time->tv_sec -= nsec; + } + d_time->tv_sec = g_time->tv_sec - s_time->tv_sec; + d_time->tv_usec = g_time->tv_usec - s_time->tv_usec; + + return 0; +} + +static pthread_mutex_t pgcomp_mtx = PTHREAD_MUTEX_INITIALIZER; +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +UNIV_INTERN +bool +pgcomp_buf_flush_list( +/*==================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + struct timeval p_start_time, p_end_time, d_time; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. 
*/ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + +#ifdef UNIV_DEBUG + gettimeofday(&p_start_time, 0x0); +#endif + if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)) { + int cnt_flush[32]; + + //stack_trace(); + pthread_mutex_lock(&pgcomp_mtx); + //gettimeofday(&p_start_time, 0x0); + //fprintf(stderr, "Calling into wrk-pgcomp [min:%lu]", min_n); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt_flush[i]; + } + if (cnt_flush[i]) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt_flush[i]); + + } + } + + pthread_mutex_unlock(&pgcomp_mtx); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); + } + /* Flush to lsn_limit in all buffer pool instances */ + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + ulint page_count = 0; + + buf_pool = buf_pool_from_array(i); + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ + success = false; + + continue; + } + + page_count = buf_flush_batch( + buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit); + + buf_flush_end(buf_pool, BUF_FLUSH_LIST); + + buf_flush_common(BUF_FLUSH_LIST, page_count); + + if (n_processed) { + *n_processed += page_count; + } + + if (page_count) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + page_count); + } + } + +#if UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); +} +#endif +/* JAN: TODO: END: */ + /*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. @@ -1890,6 +2102,12 @@ buf_flush_list( ulint i; bool success = true; + /* JAN: TODO: */ + if (is_pgcomp_wrk_init_done()) { + return(pgcomp_buf_flush_list(min_n, lsn_limit, n_processed)); + } + /* JAN: TODO: END: */ + if (n_processed) { *n_processed = 0; } @@ -2043,6 +2261,59 @@ buf_flush_single_page_from_LRU( return(freed); } +/* JAN: TODO: */ +/*********************************************************************//** +pgcomp_Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
+@return total pages flushed */ +UNIV_INTERN +ulint +pgcomp_buf_flush_LRU_tail(void) +/*====================*/ +{ + struct timeval p_start_time, p_end_time, d_time; + ulint total_flushed=0, i=0; + int cnt_flush[32]; + +#if UNIV_DEBUG + gettimeofday(&p_start_time, 0x0); +#endif + assert(is_pgcomp_wrk_init_done()); + + pthread_mutex_lock(&pgcomp_mtx); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (cnt_flush[i]) { + total_flushed += cnt_flush[i]; + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + cnt_flush[i]); + } + } + + pthread_mutex_unlock(&pgcomp_mtx); + +#if UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + + return(total_flushed); +} +/* JAN: TODO: END: */ + /*********************************************************************//** Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list @@ -2056,6 +2327,12 @@ buf_flush_LRU_tail(void) /*====================*/ { ulint total_flushed = 0; + /* JAN: TODO: */ + if(is_pgcomp_wrk_init_done()) + { + return(pgcomp_buf_flush_LRU_tail()); + } + /* JAN: TODO: END */ for (ulint i = 0; i < srv_buf_pool_instances; i++) { @@ -2342,6 +2619,8 @@ page_cleaner_sleep_if_needed( } } + + /******************************************************************//** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one instance of this thread. 
@@ -2357,6 +2636,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( ulint next_loop_time = ut_time_ms() + 1000; ulint n_flushed = 0; ulint last_activity = srv_get_activity_count(); + ulint n_lru=0, n_pgc_flush=0, n_pgc_batch=0; ut_ad(!srv_read_only_mode); @@ -2368,7 +2648,6 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n", os_thread_pf(os_thread_get_curr_id())); #endif /* UNIV_DEBUG_THREAD_CREATION */ - buf_page_cleaner_is_active = TRUE; while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { @@ -2388,12 +2667,23 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( last_activity = srv_get_activity_count(); /* Flush pages from end of LRU if required */ - n_flushed = buf_flush_LRU_tail(); + n_lru = n_flushed = buf_flush_LRU_tail(); +#ifdef UNIV_DEBUG + if (n_lru) { + fprintf(stderr,"n_lru:%lu ",n_lru); + } +#endif /* Flush pages from flush_list if required */ - n_flushed += page_cleaner_flush_pages_if_needed(); + n_flushed += n_pgc_flush = page_cleaner_flush_pages_if_needed(); + +#ifdef UNIV_DEBUG + if (n_pgc_flush) { + fprintf(stderr,"n_pgc_flush:%lu ",n_pgc_flush); + } +#endif } else { - n_flushed = page_cleaner_do_flush_batch( + n_pgc_batch = n_flushed = page_cleaner_do_flush_batch( PCT_IO(100), LSN_MAX); @@ -2404,7 +2694,18 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( MONITOR_FLUSH_BACKGROUND_PAGES, n_flushed); } +#ifdef UNIV_DEBUG + if (n_pgc_batch) { + fprintf(stderr,"n_pgc_batch:%lu ",n_pgc_batch); + } +#endif } +#ifdef UNIV_DEBUG + if (n_lru || n_pgc_flush || n_pgc_batch) { + fprintf(stderr,"\n"); + n_lru = n_pgc_flush = n_pgc_batch = 0; + } +#endif } ut_ad(srv_shutdown_state > 0); @@ -2573,8 +2874,9 @@ buf_flush_validate( return(ret); } + #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#endif /* !UNIV_HOTBACKUP */ + #ifdef UNIV_DEBUG /******************************************************************//** diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc new file mode 100644 index 00000000000..7abe0547877 --- /dev/null +++ b/storage/innobase/buf/buf0mtflu.cc @@ -0,0 +1,1103 @@ +/***************************************************************************** + +Copyright (C) 2013 Fusion-io. All Rights Reserved. +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file buf/buf0mtflu.cc +Multi-threaded flush method implementation + +Created 06/11/2013 Dhananjoy Das DDas@fusionio.com +Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include + +#ifdef UNIV_PFS_MUTEX +/* Key to register fil_system_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t mtflush_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/* Mutex to protect critical sections during multi-threaded flush */ +ib_mutex_t mt_flush_mutex; + +#define MT_COMP_WATER_MARK 50 + +/* Work item status */ +typedef enum { + WORK_ITEM_SET=0, /* Work item information set */ + WORK_ITEM_START=1, /* Work item assigned to thread and + execution started */ + WORK_ITEM_DONE=2, /* Work item execution done */ +} mtflu_witem_status_t; + +/* Work thread status */ +typedef enum { + WORK_THREAD_NOT_INIT=0, /* Work thread not initialized */ + WORK_THREAD_INITIALIZED=1, /* Work thread initialized */ + WORK_THREAD_SIG_WAITING=2, /* Work thred signaled */ + WORK_THREAD_RUNNING=3, /* Work thread running */ + WORK_THREAD_NO_WORK=4, /* Work thread has no work to do */ +} mtflu_wthr_status_t; + +/* Structure containing multi-treaded flush thread information */ +typedef struct { + os_thread_t wthread_id; /* Thread id */ + opq_t *wq; /* Write queue ? */ + opq_t *cq; /* Commit queue ?*/ + ib_mutex_t thread_mutex; /* Mutex proecting below + structures */ + mtflu_wthr_status_t thread_status; /* Thread status */ + ib_uint64_t total_num_processed; /* Total number of + pages processed */ + ib_uint64_t cycle_num_processed; /* Numper of pages + processed on last + cycle */ + ulint check_wrk_done_count; /* Number of pages + to process in this + work item ? */ + ulint done_cnt_flag; /* Number of pages + processed in this + work item ?*/ +} mtflu_thread_t; + +struct work_item_t { + /****************************/ + /* Need to group into struct*/ + buf_pool_t* buf_pool; //buffer-pool instance + int flush_type; //flush-type for buffer-pool flush operation + ulint min; //minimum number of pages requested to be flushed + lsn_t lsn_limit; //lsn limit for the buffer-pool flush operation + /****************************/ + + unsigned long result; //flush pages count + unsigned long t_usec; //time-taken in usec + os_thread_t id_usr; /* thread-id + currently working , why ? */ + mtflu_witem_status_t wi_status; /* work item status */ + + UT_LIST_NODE_T(work_node_t) next; +}; + +/* Multi-threaded flush system structure */ +typedef struct { + int pgc_n_threads = 8;// ??? why what this is + + mtflu_thread_t pc_sync[PGCOMP_MAX_WORKER]; + wrk_t work_items[PGCOMP_MAX_WORKER]; + int pgcomp_wrk_initialized = -1; /* ???? */ + opq_t wq; /* write queue ? */ + opq_t cq; /* commit queue ? */ +} mtflu_system_t; + +typedef enum op_q_status { + Q_NOT_INIT=0, + Q_EMPTY=1, + Q_INITIALIZED=2, + Q_PROCESS=3, + Q_DONE=4, + Q_ERROR=5, + Q_STATUS_UNDEFINED +} q_status_t; + +// NOTE: jan: could we use ut/ut0wqueue.(h|cc) +// NOTE: jan: here ????, it would handle waiting, signaling +// and contains simple interface + +typedef struct op_queue +{ + ib_mutex_t mtx; /* Mutex protecting below variables + */ + os_cond_t cv; /* ? 
is waiting here ? */ + q_status_t flag; /* Operation queue status */ + UT_LIST_BASE_NODE_T(work_item_t) work_list; +} opq_t; + + +/*******************************************************************//** +Initialize multi-threaded flush. +*/ +void +buf_mtflu_init(void) +/*================*/ +{ + mutex_create(mtflush_mutex_key, + &mt_flush_mutex, SYNC_ANY_LATCH); +} + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list and also +puts replaceable clean pages from the end of the LRU list to the free +list. +NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully. false if another batch +of same type was already running. */ +bool +buf_mtflu_flush_LRU( +/*================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ +{ + ulint page_count; + + if (n_processed) { + *n_processed = 0; + } + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + return(false); + } + + page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, page_count); + + if (n_processed) { + *n_processed = page_count; + } + + return(true); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Utility function to calculate time difference between start time +and end time. +@return Time difference. +*/ +UNIV_INTERN +void +mtflu_timediff( +/*===========*/ + struct timeval *g_time, /*!< in/out: Start time*/ + struct timeval *s_time, /*!< in/out: End time */ + struct timeval *d_time) /*!< out: Time difference */ +{ + if (g_time->tv_usec < s_time->tv_usec) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1; + s_time->tv_usec -= 1000000 * nsec; + s_time->tv_sec += nsec; + } + if (g_time->tv_usec - s_time->tv_usec > 1000000) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000; + s_time->tv_usec += 1000000 * nsec; + s_time->tv_sec -= nsec; + } + d_time->tv_sec = g_time->tv_sec - s_time->tv_sec; + d_time->tv_usec = g_time->tv_usec - s_time->tv_usec; +} +#endif + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the flush list of +all buffer pool instances. This is multi-threaded version of buf_flush_list. +NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully for each buffer pool +instance. false if another batch of same type was already running in +at least one of the buffer pool instance */ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. 
Ignored if NULL */ + +{ + ulint i; + bool success = true; + struct timeval p_start_time, p_end_time, d_time; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. */ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + +#ifdef UNIV_DEBUG + gettimeofday(&p_start_time, 0x0); +#endif + if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)) { + int cnt_flush[32]; + + mutex_enter(&mt_flush_mutex); + +#ifdef UNIV_DEBUG + fprintf(stderr, "Calling into wrk-pgcomp [min:%lu]", min_n); +#endif + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt_flush[i]; + } + if (cnt_flush[i]) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt_flush[i]); + + } + } + + mutex_exit(&pgcomp_mtx); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); + } + + /* Flush to lsn_limit in all buffer pool instances */ + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + ulint page_count = 0; + + buf_pool = buf_pool_from_array(i); + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ + success = false; + + continue; + } + + page_count = buf_flush_batch( + buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit); + + buf_flush_end(buf_pool, BUF_FLUSH_LIST); + + buf_flush_common(BUF_FLUSH_LIST, page_count); + + if (n_processed) { + *n_processed += page_count; + } + + if (page_count) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + page_count); + } + } + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); +} + +/*********************************************************************//** +Clear up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
+@return total pages flushed */ +ulint +buf_mtflu_flush_LRU_tail(void) +/*==========================*/ +{ + ulint total_flushed=0, i=0; + int cnt_flush[32]; + +#ifdef UNIV_DEBUG + struct timeval p_start_time, p_end_time, d_time; + gettimeofday(&p_start_time, 0x0); +#endif + assert(is_pgcomp_wrk_init_done()); + + mutex_enter(&pgcomp_mtx); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (cnt_flush[i]) { + total_flushed += cnt_flush[i]; + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + cnt_flush[i]); + } + } + + mutex_exit(&pgcomp_mtx); + +#if UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + + return(total_flushed); +} + +/*******************************************************************//** +Set work done count to given count. +@return 1 if still work to do, 0 if no work left */ +int +set_check_done_flag_count(int cnt) +/*================*/ +{ + return(check_wrk_done_count = cnt); +} + +/*******************************************************************//** +? +@return why ? */ +int +set_pgcomp_wrk_init_done(void) +/*================*/ +{ + pgcomp_wrk_initialized = 1; + return 0; +} + +/*******************************************************************//** +? +@return true if work is initialized */ +bool +is_pgcomp_wrk_init_done(void) +/*================*/ +{ + return(pgcomp_wrk_initialized == 1); +} + +/*******************************************************************//** +Set current done pages count to the given value +@return number of pages flushed */ +int +set_done_cnt_flag(int val) +/*================*/ +{ + /* + * Assumption: The thread calling into set_done_cnt_flag + * needs to have "cq.mtx" acquired, else not safe. + */ + done_cnt_flag = val; + return done_cnt_flag; +} + +/*******************************************************************//** +? +@return number of pages flushed */ +int +cv_done_inc_flag_sig(thread_sync_t * ppc) +/*================*/ +{ + mutex_enter(&ppc->cq->mtx); + ppc->stat_universal_num_processed++; + ppc->stat_cycle_num_processed++; + done_cnt_flag++; + if(!(done_cnt_flag <= check_wrk_done_count)) { + fprintf(stderr, "ERROR: done_cnt:%d check_wrk_done_count:%d\n", + done_cnt_flag, check_wrk_done_count); + } + assert(done_cnt_flag <= check_wrk_done_count); + mutex_exit(&ppc->cq->mtx); + if(done_cnt_flag == check_wrk_done_count) { + // why below does not need mutex protection ? 
+ ppc->wq->flag = Q_DONE; + mutex_enter(&ppc->cq->mtx); + ppc->cq->flag = Q_DONE; + os_cond_signal(&ppc->cq->cv); + mutex_exit(&ppc->cq->mtx); + } + return(done_cnt_flag); +} + +/*******************************************************************//** +Remove work item from queue, in my opinion not needed after we use +UT_LIST +@return number of pages flushed */ +int +q_remove_wrk(opq_t *q, wrk_t **wi) +/*================*/ +{ + int ret = 0; + + if(!wi || !q) { + return -1; + } + + mutex_enter(&q->mtx); + assert(!((q->tail == NULL) && (q->head != NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* get the first in the list*/ + *wi = q->head; + if(q->head) { + ret = 0; + q->head = q->head->next; + (*wi)->next = NULL; + if(!q->head) { + q->tail = NULL; + } + } else { + q->tail = NULL; + ret = 1; /* indicating remove from queue failed */ + } + mutex_exit(&q->mtx); + return (ret); +} + +/*******************************************************************//** +Return true if work item has being assigned to a thread or false +if work item is not assigned. +@return true if work is assigned, false if not */ +bool +is_busy_wrk_itm(wrk_t *wi) +/*================*/ +{ + if(!wi) { + return -1; + } + return(!(wi->id_usr == -1)); +} + +/*******************************************************************//** +Initialize work items. +@return why ? */ +int +setup_wrk_itm(int items) +/*================*/ +{ + int i; + for(i=0; imtx = os_mutex_create(); + os_cond_init(&q->cv); + q->flag = Q_INITIALIZED; + q->head = q->tail = NULL; + + return 0; +} + +/// NEEDED ? +#if 0 +int drain_cq(opq_t *cq, int items) +{ + int i=0; + + if(!cq) { + return -1; + } + mutex_enter(&cq->mtx); + for(i=0; ihead = cq->tail = NULL; + mutex_unlock(&cq->mtx); + return 0; +} +#endif + +/*******************************************************************//** +Insert work item list to queue, not needed with UT_LIST +@return why ? */ +int +q_insert_wrk_list(opq_t *q, wrk_t *w_list) +/*================*/ +{ + if((!q) || (!w_list)) { + fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list); + return -1; + } + + mutex_enter(&q->mtx); + + assert(!((q->tail == NULL) && (q->head != NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* list is empty */ + if(!q->tail) { + q->head = q->tail = w_list; + } else { + /* added the first of the node to list */ + assert(q->head != NULL); + q->tail->next = w_list; + } + + /* move tail to the last node */ + while(q->tail->next) { + q->tail = q->tail->next; + } + mutex_exit(&q->mtx); + + return 0; +} + +/*******************************************************************//** +Flush ? +@return why ? */ +int +flush_pool_instance(wrk_t *wi) +/*================*/ +{ + struct timeval p_start_time, p_end_time, d_time; + + if(!wi) { + fprintf(stderr, "work item invalid wi:%p\n", wi); + return -1; + } + + wi->t_usec = 0; + if (!buf_flush_start(wi->buf_pool, (buf_flush_t)wi->flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. 
*/ + fprintf(stderr, "flush_start Failed, flush_type:%d\n", + (buf_flush_t)wi->flush_type); + return -1; + } + +#ifdef UNIV_DEBUG + /* Record time taken for the OP in usec */ + gettimeofday(&p_start_time, 0x0); +#endif + + if((buf_flush_t)wi->flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. + */ + buf_pool_mutex_enter(wi->buf_pool); + wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU); + buf_pool_mutex_exit(wi->buf_pool); + wi->min = ut_min(srv_LRU_scan_depth,wi->min); + } + + wi->result = buf_flush_batch(wi->buf_pool, + (buf_flush_t)wi->flush_type, + wi->min, wi->lsn_limit); + + buf_flush_end(wi->buf_pool, (buf_flush_t)wi->flush_type); + buf_flush_common((buf_flush_t)wi->flush_type, wi->result); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); +#endif + return 0; +} + +/*******************************************************************//** +? +@return why ? */ +int +service_page_comp_io(thread_sync_t * ppc) +/*================*/ +{ + wrk_t *wi = NULL; + int ret=0; + struct timespec ts; + + mutex_enter(&ppc->wq->mtx); + do{ + ppc->wt_status = WTHR_SIG_WAITING; + ret = os_cond_wait(&ppc->wq->cv, &ppc->wq->mtx); + ppc->wt_status = WTHR_RUNNING; + if(ret == ETIMEDOUT) { + fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%d] ret:%d\n", + done_cnt_flag, ret); + } else if(ret == EINVAL || ret == EPERM) { + fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%d] ret:%d\n", + done_cnt_flag, ret); + } + if(ppc->wq->flag == Q_PROCESS) { + break; + } else { + mutex_exit(&ppc->wq->mtx); + return -1; + } + } while (ppc->wq->flag == Q_PROCESS && ret == 0); + + mutex_exit(&ppc->wq->mtx); + + while (ppc->cq->flag == Q_PROCESS) { + wi = NULL; + /* Get the work item */ + if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) { + ppc->wt_status = WTHR_NO_WORK; + return -1; + } + + assert(ret==0); + assert(wi != NULL); + assert(0 == is_busy_wrk_itm(wi)); + assert(wi->id_usr == -1); + + wi->id_usr = ppc->wthread; + wi->wi_status = WRK_ITEM_START; + + /* Process work item */ + if(0 != (ret = flush_pool_instance(wi))) { + fprintf(stderr, "FLUSH op failed ret:%d\n", ret); + wi->wi_status = WRK_ITEM_FAILED; + } + ret = q_insert_wrk_list(ppc->cq, wi); + + assert(0==ret); + assert(check_wrk_done_count >= done_cnt_flag); + wi->wi_status = WRK_ITEM_SUCCESS; + if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) { + break; + } + } + return(0); +} + +/******************************************************************//** +Thread main function for multi-threaded flush +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(page_comp_io_thread)( +/*==========================================*/ + void * arg) +{ + thread_sync_t *ppc_io = ((thread_sync_t *)arg); + + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + service_page_comp_io(ppc_io); + ppc_io->stat_cycle_num_processed = 0; + } + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +/*******************************************************************//** +Print queue work item +@return why ? 
*/ +int +print_queue_wrk_itm(opq_t *q) +/*================*/ +{ +#if UNIV_DEBUG + wrk_t *wi = NULL; + + if(!q) { + fprintf(stderr, "queue NULL\n"); + return -1; + } + + if(!q->head || !q->tail) { + assert(!(((q->tail==NULL) && (q->head!=NULL)) && ((q->tail != NULL) && (q->head == NULL)))); + fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail); + return 0; + } + + mutex_enter(&q->mtx); + for(wi = q->head; (wi != NULL) ; wi = wi->next) { + //fprintf(stderr, "- [%p] %p %lu %luus [%ld] >%p\n", + // wi, wi->buf_pool, wi->result, wi->t_usec, wi->id_usr, wi->next); + fprintf(stderr, "- [%p] [%s] >%p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->next); + } + mutex_exit(&q->mtx); +#endif + return(0); +} + +/*******************************************************************//** +Print work list +@return why ? */ +int +print_wrk_list(wrk_t *wi_list) +/*================*/ +{ + wrk_t *wi = wi_list; + int i=0; + + if(!wi_list) { + fprintf(stderr, "list NULL\n"); + } + + while(wi) { + fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->result, wi->t_usec, wi->next); + wi = wi->next; + i++; + } + fprintf(stderr, "list len: %d\n", i); + return 0; +} + +/*******************************************************************//** +? +@return why ? */ +int +pgcomp_handler(wrk_t *w_list) +/*================*/ +{ + struct timespec ts; + int ret=0, t_flag=0; + opq_t *wrk_q=NULL, *comp_q=NULL; + wrk_t *tw_list=NULL; + + wrk_q=&wq; + comp_q=&cq; + + mutex_enter(&wrk_q->mtx); + /* setup work queue here.. */ + wrk_q->flag = Q_EMPTY; + mutex_exit(&wrk_q->mtx); + + ret = q_insert_wrk_list(wrk_q, w_list); + if(ret != 0) { + fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n", + __FUNCTION__, &wq, w_list); + return -1; + } + +retry_submit: + mutex_enter(&wrk_q->mtx); + /* setup work queue here.. 
*/ + wrk_q->flag = Q_INITIALIZED; + mutex_exit(&wrk_q->mtx); + + + mutex_enter(&comp_q->mtx); + if(0 != set_done_cnt_flag(0)) { + fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__); + mutex_exit(&comp_q->mtx); + return -1; + } + comp_q->flag = Q_PROCESS; + mutex_enter(&comp_q->mtx); + + /* if threads are waiting request them to start */ + mutex_enter(&wrk_q->mtx); + wrk_q->flag = Q_PROCESS; + os_cond_broadcast(&wrk_q->cv); + mutex_exit(&wrk_q->mtx); + + /* Wait on all worker-threads to complete */ + mutex_enter(&comp_q->mtx); + if (comp_q->flag != Q_DONE) { + do { + os_cond_wait(&comp_q->cv, &comp_q->mtx); + if(comp_q->flag != Q_DONE) { + fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%d\n", + comp_q->flag, done_cnt_flag); + if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%d\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + } + continue; + } else if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%d\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + mutex_exit(&comp_q->mtx); + goto retry_submit; + + ut_ad(!done_cnt_flag); + continue; + } + ut_ad(done_cnt_flag == srv_buf_pool_instances); + + if ((comp_q->flag == Q_DONE) && + (done_cnt_flag == srv_buf_pool_instances)) { + break; + } + } while((comp_q->flag == Q_INITIALIZED) && + (done_cnt_flag != srv_buf_pool_instances)); + } else { + fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%d\n", + comp_q->flag, done_cnt_flag); + if (!done_cnt_flag) { + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + mutex_enter(&comp_q->mtx); + goto retry_submit; + ut_ad(!done_cnt_flag); + } + ut_ad(done_cnt_flag == srv_buf_pool_instances); + } + + mutex_exit(&comp_q->mtx); + mutex_enter(&wrk_q->mtx); + wrk_q->flag = Q_DONE; + mutex_exit(&wrk_q->mtx); + + return 0; +} + +/******************************************************************//** +@return a dummy parameter*/ +int +pgcomp_handler_init( + int num_threads, + int wrk_cnt, + opq_t *wq, + opq_t *cq) +/*================*/ +{ + int i=0; + + if(is_pgcomp_wrk_init_done()) { + fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n"); + return -1; + } + + if(!wq || !cq) { + fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq); + return -1; + } + + /* work-item setup */ + setup_wrk_itm(wrk_cnt); + + /* wq & cq setup */ + init_queue(wq); + init_queue(cq); + + /* Mark each of the thread sync entires */ + for(i=0; i < PGCOMP_MAX_WORKER; i++) { + pc_sync[i].wthread_id = i; + } + + /* Create threads for page-compression-flush */ + for(i=0; i < num_threads; i++) { + pc_sync[i].wthread_id = i; + pc_sync[i].wq = wq; + pc_sync[i].cq = cq; + os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), + thread_ids + START_PGCOMP_CNT + i); + //pc_sync[i].wthread = thread_ids[START_PGCOMP_CNT + i]; + pc_sync[i].wthread = (START_PGCOMP_CNT + i); + pc_sync[i].wt_status = WTHR_INITIALIZED; + } + + set_check_done_flag_count(wrk_cnt); + set_pgcomp_wrk_init_done(); + + return 0; +} + + +/*******************************************************************//** +Print work thread status information +@return why ? 
*/ +int +wrk_thread_stat( + thread_sync_t *wthr, + unsigned int num_threads) +/*================*/ +{ + long stat_tot=0; + int i=0; + for(i=0; izip.data, bpage); + bpage->zip.data, bpage, 0); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage); + ((buf_block_t*) bpage)->frame, bpage, 0); } thd_wait_end(NULL); diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index a560dc54eac..a382b211275 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1446,8 +1447,8 @@ dict_table_rename_in_cache( ibool exists; char* filepath; - ut_ad(table->space != TRX_SYS_SPACE); - + ut_ad(table->space != TRX_SYS_SPACE); + if (DICT_TF_HAS_DATA_DIR(table->flags)) { dict_get_and_save_data_dir_path(table, true); @@ -1459,7 +1460,7 @@ dict_table_rename_in_cache( filepath = fil_make_ibd_name(table->name, false); } - fil_delete_tablespace(table->space, BUF_REMOVE_FLUSH_NO_WRITE); + fil_delete_tablespace(table->space, BUF_REMOVE_FLUSH_NO_WRITE); /* Delete any temp file hanging around. */ if (os_file_status(filepath, &exists, &type) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 1779ae86c46..2bf5922e07d 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013 SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,6 +26,8 @@ Created 10/25/1995 Heikki Tuuri *******************************************************/ #include "fil0fil.h" +#include "fil0pagecompress.h" +#include "fsp0pagecompress.h" #include #include @@ -54,6 +58,14 @@ Created 10/25/1995 Heikki Tuuri # include "srv0srv.h" static ulint srv_data_read, srv_data_written; #endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE @@ -428,11 +440,16 @@ fil_read( block size multiple */ void* buf, /*!< in/out: buffer where to store data read; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ { return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /********************************************************************//** @@ -457,18 +474,22 @@ fil_write( be a block size multiple */ void* buf, /*!< in: buffer from which to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ut_ad(!srv_read_only_mode); return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ -UNIV_INLINE fil_space_t* fil_space_get_by_id( /*================*/ @@ -486,6 +507,19 @@ fil_space_get_by_id( return(space); } +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + /*******************************************************************//** Returns the table space by a given name, NULL if not found. */ UNIV_INLINE @@ -704,8 +738,9 @@ fil_node_open_file( byte* buf2; byte* page; ulint space_id; - ulint flags; + ulint flags=0; ulint page_size; + ibool atomic_writes=FALSE; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -722,7 +757,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_ONLY, &success, FALSE); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -774,6 +809,8 @@ fil_node_open_file( space_id = fsp_header_get_space_id(page); flags = fsp_header_get_flags(page); page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); + ut_free(buf2); @@ -824,6 +861,17 @@ fil_node_open_file( ut_error; } + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + if (size_bytes >= 1024 * 1024) { /* Truncate the size to whole megabytes. 
*/ size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); @@ -843,6 +891,8 @@ add_size: space->size += node->size; } + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + /* printf("Opening file %s\n", node->name); */ /* Open the file for reading and writing, in Windows normally in the @@ -853,18 +903,18 @@ add_size: node->handle = os_file_create(innodb_file_log_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, - &ret); + &ret, atomic_writes); } else if (node->is_raw_disk) { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN_RAW, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } else { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } ut_a(ret); @@ -1481,6 +1531,21 @@ fil_space_get_space( if (space->size == 0 && space->purpose == FIL_TABLESPACE) { ut_a(id != 0); + mutex_exit(&fil_system->mutex); + + /* It is possible that the space gets evicted at this point + before the fil_mutex_enter_and_prepare_for_io() acquires + the fil_system->mutex. Check for this after completing the + call to fil_mutex_enter_and_prepare_for_io(). */ + fil_mutex_enter_and_prepare_for_io(id); + + /* We are still holding the fil_system->mutex. Check if + the space is still in memory cache. */ + space = fil_space_get_by_id(id); + if (space == NULL) { + return(NULL); + } + /* The following code must change when InnoDB supports multiple datafiles per tablespace. */ ut_a(1 == UT_LIST_GET_LEN(space->chain)); @@ -1858,12 +1923,12 @@ fil_write_lsn_and_arch_no_to_file( buf = static_cast(ut_align(buf1, UNIV_PAGE_SIZE)); err = fil_read(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); err = fil_write(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); } mem_free(buf1); @@ -3095,7 +3160,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, FALSE); if (!success) { /* The following call will print an error message */ @@ -3111,10 +3176,10 @@ fil_create_link_file( ut_print_filename(stderr, filepath); fputs(" already exists.\n", stderr); err = DB_TABLESPACE_EXISTS; - } else if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; - + } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; } else { err = DB_ERROR; } @@ -3204,8 +3269,9 @@ fil_open_linked_file( /*===============*/ const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ - os_file_t* remote_file) /*!< out: remote file handle */ - + os_file_t* remote_file, /*!< out: remote file handle */ + ibool atomic_writes) /*!< in: should atomic writes be + used */ { ibool success; @@ -3219,7 +3285,7 @@ fil_open_linked_file( *remote_file = os_file_create_simple_no_error_handling( innodb_file_data_key, *remote_filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, atomic_writes); if (!success) { char* link_filepath = fil_make_isl_name(tablename); @@ -3274,6 +3340,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + bool atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 
0); ut_ad(!srv_read_only_mode); @@ -3306,7 +3373,8 @@ fil_create_new_single_table_tablespace( OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + atomic_writes); if (ret == FALSE) { /* The following call will print an error message */ @@ -3333,6 +3401,11 @@ fil_create_new_single_table_tablespace( goto error_exit_3; } + if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + goto error_exit_3; + } + if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; goto error_exit_3; @@ -3371,6 +3444,7 @@ fil_create_new_single_table_tablespace( flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); if (!(fsp_flags_is_compressed(flags))) { buf_flush_init_for_writing(page, NULL, 0); @@ -3547,6 +3621,7 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; + ibool atomic_writes = FALSE; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -3557,6 +3632,8 @@ fil_open_single_table_tablespace( return(DB_CORRUPTION); } + atomic_writes = fsp_flags_get_atomic_writes(flags); + /* If the tablespace was relocated, we do not compare the DATA_DIR flag */ ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; @@ -3581,7 +3658,7 @@ fil_open_single_table_tablespace( } link_file_found = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, atomic_writes); remote.success = link_file_found; if (remote.success) { /* possibility of multiple files. */ @@ -3609,7 +3686,7 @@ fil_open_single_table_tablespace( if (dict.filepath) { dict.file = os_file_create_simple_no_error_handling( innodb_file_data_key, dict.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &dict.success); + OS_FILE_READ_ONLY, &dict.success, atomic_writes); if (dict.success) { /* possibility of multiple files. */ validate = true; @@ -3621,7 +3698,7 @@ fil_open_single_table_tablespace( ut_a(def.filepath); def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, atomic_writes); if (def.success) { tablespaces_found++; } @@ -4020,7 +4097,7 @@ fil_load_single_table_tablespace( /* Check for a link file which locates a remote tablespace. */ remote.success = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, FALSE); /* Read the first page of the remote tablespace */ if (remote.success) { @@ -4035,7 +4112,7 @@ fil_load_single_table_tablespace( /* Try to open the tablespace in the datadir. 
*/ def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, FALSE); /* Read the first page of the remote tablespace */ if (def.success) { @@ -4167,7 +4244,7 @@ will_not_choose: new_path = fil_make_ibbackup_old_name(fsp->filepath); bool success = os_file_rename( - innodb_file_data_key, fsp->filepath, new_path)); + innodb_file_data_key, fsp->filepath, new_path); ut_a(success); @@ -4821,7 +4898,7 @@ retry: success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL); + NULL, NULL, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -4852,6 +4929,7 @@ retry: space->size += pages_added; node->size += pages_added; + node->being_extended = FALSE; #ifdef HAVE_POSIX_FALLOCATE complete_io: @@ -4917,7 +4995,7 @@ fil_extend_tablespaces_to_stored_len(void) single-threaded operation */ error = fil_read(TRUE, space->id, fsp_flags_get_zip_size(space->flags), - 0, 0, UNIV_PAGE_SIZE, buf, NULL); + 0, 0, UNIV_PAGE_SIZE, buf, NULL, 0); ut_a(error == DB_SUCCESS); size_in_header = fsp_get_size_low(buf); @@ -5191,8 +5269,13 @@ fil_io( void* buf, /*!< in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ulint mode; fil_space_t* space; @@ -5255,6 +5338,9 @@ fil_io( } else if (type == OS_FILE_WRITE) { ut_ad(!srv_read_only_mode); srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } } /* Reserve the fil_system mutex and make sure that we can open at @@ -5371,7 +5457,7 @@ fil_io( #else /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message); + offset, len, node, message, write_size); #endif /* UNIV_HOTBACKUP */ ut_a(ret); @@ -5994,7 +6080,7 @@ fil_tablespace_iterate( file = os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, - OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", { @@ -6210,3 +6296,32 @@ fil_mtr_rename_log( mtr_commit(&mtr); } +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) +/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc new file mode 100644 index 00000000000..3926b23c677 --- /dev/null +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -0,0 +1,369 @@ 
+/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" + +/****************************************************************//** +For page compressed pages compress the page before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. 
*/ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint* out_len) /*!< out: actual length of compressed page */ +{ + int err = Z_OK; + int level = 0; + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; + ulint write_size=0; + + ut_a(buf); + ut_a(out_buf); + ut_a(len); + ut_a(out_len); + + level = fil_space_get_page_compression_level(space_id); + ut_a(fil_space_is_page_compressed(space_id)); + + fil_system_enter(); + fil_space_t* space = fil_space_get_by_id(space_id); + fil_system_exit(); + + /* If no compression level was provided to this table, use system + default level */ + if (level == 0) { + level = srv_compress_zlib_level; + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", + space_id, fil_space_name(space), len); +#endif + + write_size = UNIV_PAGE_SIZE - header_len; + err = compress2(out_buf+header_len, &write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } else { + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); +#endif + + write_size+=header_len; + /* Actual write needs to be alligned on block size */ + if (write_size % OS_FILE_LOG_BLOCK_SIZE) { + write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif +#define SECT_SIZE 512 + srv_stats.page_compression_saved.add((len - write_size)); + if ((len - write_size) > 0) { + srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); + } + //srv_stats.page_compressed_trim_op.inc(); + srv_stats.pages_page_compressed.inc(); + *out_len = write_size; + + return(out_buf); + } +} + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. 
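fil_compress_page() above puts a 40-byte header (FIL_PAGE_DATA plus the 2-byte payload length) in front of the zlib output and then rounds the write up to OS_FILE_LOG_BLOCK_SIZE before updating the trim statistics. A standalone sketch of that size arithmetic, assuming a 16K page and 512-byte blocks as the statistics code above does:

#include <cstddef>

static const size_t kPageSize  = 16384;   /* assumes UNIV_PAGE_SIZE = 16K             */
static const size_t kBlockSize = 512;     /* OS_FILE_LOG_BLOCK_SIZE                   */
static const size_t kHeaderLen = 38 + 2;  /* FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE */

/* Bytes that actually have to be written for one page-compressed page. */
size_t aligned_write_size(size_t compressed_payload)
{
	size_t n = kHeaderLen + compressed_payload;
	if (n % kBlockSize) {
		n += kBlockSize - (n % kBlockSize);  /* round up to the I/O block */
	}
	return n;
}

/* Bytes the write path can hand back to the device with a trim. */
size_t trimmable_bytes(size_t compressed_payload)
{
	size_t used = aligned_write_size(compressed_payload);
	return (used < kPageSize) ? kPageSize - used : 0;
}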
*/ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulint len) /*!< in: length of output buffer.*/ +{ + int err = 0; + ulint actual_size = 0; + ulint compression_alg = 0; + byte *in_buf; + + ut_a(buf); + ut_a(len); + + /* Before actual decompress, make sure that page type is correct */ + + if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC || + mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: CRC %lu type %lu.\n" + "InnoDB: len %lu\n", + mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_2(buf+FIL_PAGE_TYPE), len); + + fflush(stderr); + ut_error; + } + + /* Get compression algorithm */ + compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); + + if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { + in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); +#endif + + err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + + + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + + fflush(stderr); + + ut_error; + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } + + srv_stats.pages_page_decompressed.inc(); + } else { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(compression_alg)); + + fflush(stderr); + ut_error; + } +} + +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX); +} + +/*******************************************************************//** +Find out wheather the page is page compressed +@return true if page is page compressed, false if not */ +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); +} + +/*******************************************************************//** +Returns the page compression level of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. 
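fil_decompress_page() above only hands a page to zlib when its type field says FIL_PAGE_PAGE_COMPRESSED and its checksum slot holds BUF_NO_CHECKSUM_MAGIC. A self-contained sketch of that read-side decision; the offsets (type at byte 24, payload length at 38, payload at 40) are assumptions taken from this patch, and the code below is illustrative rather than the patch's own:

#include <zlib.h>
#include <cstddef>
#include <cstring>
#include <stdint.h>

static const size_t   kPageTypeOffset   = 24;     /* FIL_PAGE_TYPE (assumed)       */
static const size_t   kPayloadLenOffset = 38;     /* FIL_PAGE_DATA                 */
static const size_t   kPayloadOffset    = 40;     /* FIL_PAGE_DATA + 2-byte length */
static const uint16_t kPageCompressed   = 34354;  /* FIL_PAGE_PAGE_COMPRESSED      */

static uint16_t read_u16(const unsigned char* p)
{
	return (uint16_t)((p[0] << 8) | p[1]);    /* InnoDB stores fields big-endian */
}

/* Replace a page-compressed image with its plain image; scratch must hold at
least page_size bytes.  Returns false for regular or corrupted pages. */
bool maybe_decompress(unsigned char* page, unsigned char* scratch, size_t page_size)
{
	if (read_u16(page + kPageTypeOffset) != kPageCompressed) {
		return false;                         /* not a page-compressed page */
	}
	uLongf out_len = page_size;
	uLong  payload = read_u16(page + kPayloadLenOffset);
	if (uncompress(scratch, &out_len, page + kPayloadOffset, payload) != Z_OK) {
		return false;                         /* corruption; caller raises the error */
	}
	memcpy(page, scratch, out_len);
	return true;
}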
+@return page compression level, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_page_compression_level(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Extract the page compression from space. +@return true if space is page compressed, false if space is not found +or space is not page compressed. */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_is_page_compressed(flags)); + } + + return(flags); +} + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg) /*!tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); #ifndef DBUG_OFF @@ -3118,8 +3161,6 @@ innobase_change_buffering_inited_ok: srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; - page_compression_level = (ulint) innobase_compression_level; - if (!innobase_use_checksums) { ut_print_timestamp(stderr); fprintf(stderr, @@ -9465,11 +9506,16 @@ innobase_table_flags( enum row_type row_format; rec_format_t innodb_row_format = REC_FORMAT_COMPACT; bool use_data_dir; + ha_table_option_struct *options= form->s->option_struct; /* Cache the value of innodb_file_format, in case it is modified by another thread while the table is being created. */ const ulint file_format_allowed = srv_file_format; + /* Cache the value of innobase_compression_level, in case it is + modified by another thread while the table is being created. */ + const ulint default_compression_level = innobase_compression_level; + *flags = 0; *flags2 = 0; @@ -9513,6 +9559,8 @@ index_bad: } } + row_format = form->s->row_type; + if (create_info->key_block_size) { /* The requested compressed page size (key_block_size) is given in kilobytes. If it is a valid number, store @@ -9522,7 +9570,7 @@ index_bad: ulint kbsize; /* Key Block Size */ for (zssize = kbsize = 1; zssize <= ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX); + PAGE_ZIP_SSIZE_MAX); zssize++, kbsize <<= 1) { if (kbsize == create_info->key_block_size) { zip_ssize = zssize; @@ -9550,8 +9598,8 @@ index_bad: } if (!zip_allowed - || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX)) { + || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, + PAGE_ZIP_SSIZE_MAX)) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, @@ -9560,8 +9608,6 @@ index_bad: } } - row_format = form->s->row_type; - if (zip_ssize && zip_allowed) { /* if ROW_FORMAT is set to default, automatically change it to COMPRESSED.*/ @@ -9598,7 +9644,6 @@ index_bad: case ROW_TYPE_REDUNDANT: innodb_row_format = REC_FORMAT_REDUNDANT; break; - case ROW_TYPE_COMPRESSED: case ROW_TYPE_DYNAMIC: if (!use_tablespace) { @@ -9616,10 +9661,18 @@ index_bad: " innodb_file_format > Antelope.", get_row_format_name(row_format)); } else { - innodb_row_format = (row_format == ROW_TYPE_DYNAMIC - ? 
REC_FORMAT_DYNAMIC - : REC_FORMAT_COMPRESSED); - break; + switch(row_format) { + case ROW_TYPE_COMPRESSED: + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + default: + /* Not possible, avoid compiler warning */ + break; + } + break; /* Correct row_format */ } zip_allowed = FALSE; /* fall through to set row_format = COMPACT */ @@ -9646,7 +9699,15 @@ index_bad: && ((create_info->data_file_name != NULL) && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); - dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + /* Set up table dictionary flags */ + dict_tf_set(flags, + innodb_row_format, + zip_ssize, + use_data_dir, + options->page_compressed, + (ulint)options->page_compression_level == ULINT_UNDEFINED ? + default_compression_level : options->page_compression_level, + options->atomic_writes); if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; @@ -9659,6 +9720,111 @@ index_bad: DBUG_RETURN(true); } + +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. +@return NULL if valid, string if not */ +UNIV_INTERN +const char* +ha_innobase::check_table_options( + THD *thd, /*!< in: thread handle */ + TABLE* table, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: more information of the + created table, contains also the + create statement string */ + const bool use_tablespace, /*!< in: use file par table */ + const ulint file_format) +{ + enum row_type row_format = table->s->row_type;; + ha_table_option_struct *options= table->s->option_struct; + + /* Check page compression requirements */ + if (options->page_compressed) { + if (!srv_compress_pages) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + "innodb_compress_pages not enabled"); + return "PAGE_COMPRESSED"; + } + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_format > Antelope."); + return "PAGE_COMPRESSED"; + } + + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if ((ulint)options->page_compression_level != ULINT_UNDEFINED) { + if (options->page_compressed == false) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if (options->page_compression_level < 0 || options->page_compression_level > 9) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." 
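The KEY_BLOCK_SIZE handling above turns the kilobyte value into a zip shift size by walking powers of two. In isolation the idea is the following; the upper limit of 16K is an assumption standing in for ut_min(UNIV_PAGE_SSIZE_MAX, PAGE_ZIP_SSIZE_MAX):

/* Map a KEY_BLOCK_SIZE given in kilobytes to a shift size. */
unsigned key_block_size_to_ssize(unsigned long kb)
{
	unsigned long kbsize = 1;
	for (unsigned zssize = 1; kbsize <= 16; zssize++, kbsize <<= 1) {
		if (kbsize == kb) {
			return zssize;      /* 1K -> 1, 2K -> 2, ... 16K -> 5 */
		}
	}
	return 0;                           /* invalid value; caller warns */
}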
+ " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", + create_info->key_block_size); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + /* Check atomic writes requirements */ + if (options->atomic_writes) { + if (!srv_use_atomic_writes && !use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ATOMIC_WRITES requires" + " innodb_file_per_table."); + return "ATOMIC_WRITES"; + } + } + + return 0; +} + /*****************************************************************//** Creates a new table to an InnoDB database. @return error number */ @@ -9690,6 +9856,7 @@ ha_innobase::create( while creating the table. So we read the current value here and make all further decisions based on this. */ bool use_tablespace = srv_file_per_table; + const ulint file_format = srv_file_format; /* Zip Shift Size - log2 - 9 of compressed page size, zero for uncompressed */ @@ -9713,6 +9880,12 @@ ha_innobase::create( /* Create the table definition in InnoDB */ + /* Validate table options not handled by the SQL-parser */ + if(check_table_options(thd, form, create_info, use_tablespace, + file_format)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + /* Validate create options if innodb_strict_mode is set. */ if (create_options_are_invalid( thd, form, create_info, use_tablespace)) { @@ -13952,6 +14125,12 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); if (table_changes != IS_EQUAL_YES) { @@ -13978,6 +14157,13 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } + /* Changes on engine specific table options requests a rebuild of the table. 
*/ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level || + param_new->atomic_writes != param_old->atomic_writes) { + return(COMPATIBLE_DATA_NO); + } + return(COMPATIBLE_DATA_YES); } @@ -16447,6 +16633,31 @@ static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, NULL, NULL, FALSE); #endif /* UNIV_DEBUG */ +static MYSQL_SYSVAR_BOOL(compress_pages, srv_compress_pages, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use page compression.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, + PLUGIN_VAR_OPCMDARG , + "How many percent of compressed pages should be trimmed", + NULL, NULL, 100, 0, 100, 0); + +static MYSQL_SYSVAR_LONG(compress_zlib_level, srv_compress_zlib_level, + PLUGIN_VAR_OPCMDARG , + "Default zlib compression level", + NULL, NULL, 6, 0, 9, 0); + +static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, + PLUGIN_VAR_OPCMDARG, + "Use page compression for only index pages.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, + PLUGIN_VAR_OPCMDARG, + "Use trim.", + NULL, NULL, TRUE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16592,6 +16803,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(limit_optimistic_insert_debug), MYSQL_SYSVAR(trx_purge_view_update_only_debug), #endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(compress_pages), + MYSQL_SYSVAR(trim_pct), + MYSQL_SYSVAR(compress_zlib_level), + MYSQL_SYSVAR(compress_index_pages), + MYSQL_SYSVAR(use_trim), NULL }; diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index ece9f7cf58a..5eb460072bb 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -56,6 +57,18 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; +/** Engine specific table options are definined using this struct */ +struct ha_table_option_struct +{ + bool page_compressed; /*!< Table is using page compression + if this option is true. */ + int page_compression_level; /*!< Table page compression level + or UNIV_UNSPECIFIED. */ + bool atomic_writes; /*!< Use atomic writes for this + table if this options is true. 
*/ +}; + + /** The class defining a handle to an Innodb table */ class ha_innobase: public handler { @@ -182,6 +195,8 @@ class ha_innobase: public handler char* norm_name, char* temp_path, char* remote_path); + const char* check_table_options(THD *thd, TABLE* table, + HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index a120534b36d..49f8a05d11a 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -1,6 +1,7 @@ /***************************************************************************** 
 Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -248,6 +249,22 @@ ha_innobase::check_if_supported_inplace_alter( update_thd(); trx_search_latch_release_if_reserved(prebuilt->trx); + /* Changes to engine specific table options require a rebuild of the + table */ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION) { + ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct; + ha_table_option_struct *old_options= table->s->option_struct; + + if (new_options->page_compressed != old_options->page_compressed || + new_options->page_compression_level != old_options->page_compression_level || + new_options->atomic_writes != old_options->atomic_writes) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + if (ha_alter_info->handler_flags & ~(INNOBASE_ONLINE_OPERATIONS | INNOBASE_INPLACE_REBUILD)) { if (ha_alter_info->handler_flags @@ -3331,6 +3348,17 @@ ha_innobase::prepare_inplace_alter_table( if (ha_alter_info->handler_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { + /* Check engine specific table options */ + if (const char* invalid_tbopt = check_table_options( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0, + srv_file_format)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + if (const char* invalid_opt = create_options_are_invalid( user_thd, altered_table, ha_alter_info->create_info, diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 74a6e203808..5e301a27e32 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1,6 +1,7 @@ /***************************************************************************** 
 Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1470,6 +1471,11 @@ struct buf_page_t{ state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ + + ulint write_size; /* Write size is set when this + page is written for the first time and, + on later writes, used to check whether + a TRIM operation is needed. 
*/ #ifndef UNIV_HOTBACKUP buf_page_t* hash; /*!< node used in chaining to buf_pool->page_hash or diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index af0a5b31cc4..0ca64956a2e 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,6 +43,8 @@ Created 1/8/1996 Heikki Tuuri #include "ut0byte.h" #include "trx0types.h" #include "row0types.h" +#include "fsp0fsp.h" +#include "dict0pagecompress.h" #ifndef UNIV_HOTBACKUP # include "sync0sync.h" @@ -878,7 +881,14 @@ dict_tf_set( ulint* flags, /*!< in/out: table */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool remote_path) /*!< in: table uses DATA DIRECTORY */ + bool remote_path, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + bool atomic_writes) /*!< in: table uses atomic + writes */ __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is @@ -906,6 +916,7 @@ dict_tf_get_zip_size( /*=================*/ ulint flags) /*!< in: flags */ __attribute__((const)); + /********************************************************************//** Check whether the table uses the compressed compact page format. @return compressed page size, or 0 if not compressed */ @@ -1779,6 +1790,7 @@ dict_tf_to_row_format_string( #endif /* !UNIV_HOTBACKUP */ + #ifndef UNIV_NONINL #include "dict0dict.ic" #endif diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 83953c9325a..65967552b87 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -537,9 +538,25 @@ dict_tf_is_valid( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); ulint unused = DICT_TF_GET_UNUSED(flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags); /* Make sure there are no bits that we do not know about. 
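The new buf_page_t::write_size field introduced just above records how many bytes the previous write of a page-compressed page actually occupied, so the write path can skip trims that would not free anything. A sketch of the decision this enables, inferred from the field's comment rather than taken from code in this patch:

/* Stand-in for the page descriptor; write_size plays the role of the new
buf_page_t::write_size. */
struct my_page_slot {
	unsigned long write_size;   /* 0 until the first successful write */
};

/* Trim only when the on-disk footprint of the page shrank. */
bool trim_needed(my_page_slot& slot, unsigned long new_write_size)
{
	if (slot.write_size == 0 || new_write_size < slot.write_size) {
		slot.write_size = new_write_size;
		return true;        /* punch out the now unused tail of the slot */
	}
	return false;               /* same size or larger, nothing to give back */
}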
*/ if (unused != 0) { + fprintf(stderr, + "InnoDB: Error: table unused flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + unused, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); @@ -550,12 +567,34 @@ dict_tf_is_valid( data stored off-page in the clustered index. */ if (!compact) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + compact, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } else if (zip_ssize) { /* Antelope does not support COMPRESSED row format. */ + fprintf(stderr, + "InnoDB: Error: table flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } @@ -568,6 +607,40 @@ dict_tf_is_valid( || !atomic_blobs || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + + ); + return(false); + } + } + + if (page_compression || page_compression_level) { + /* Page compression format must have compact and + atomic_blobs and page_compression_level requires + page_compression */ + if (!compact + || !page_compression + || !atomic_blobs) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } @@ -594,6 +667,9 @@ dict_sys_tables_type_validate( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); /* The low order bit of SYS_TABLES.TYPE is always set to 1. 
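dict_tf_is_valid() above now enforces dependencies between the new bits. Restated as a condensed standalone predicate (the real code also validates zip_ssize and prints the diagnostics shown above):

bool new_flag_bits_consistent(bool compact, bool atomic_blobs,
			      bool page_compression, unsigned page_compression_level,
			      bool atomic_writes)
{
	if (page_compression || page_compression_level) {
		if (!compact || !page_compression || !atomic_blobs) {
			return false;   /* level without the compression bit, or pre-Barracuda format */
		}
	}
	if (atomic_writes && !atomic_blobs) {
		return false;           /* also checked in dict_sys_tables_type_validate() */
	}
	return true;
}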
If the format is UNIV_FORMAT_B or higher, this field is the same @@ -647,6 +723,23 @@ dict_sys_tables_type_validate( format, so the DATA_DIR flag is compatible with any other table flags. However, it is not used with TEMPORARY tables.*/ + if (page_compression || page_compression_level) { + /* page compressed row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + + if (!atomic_blobs || !page_compression) { + return(ULINT_UNDEFINED); + } + } + + if (atomic_writes) { + if (!atomic_blobs) { + return(ULINT_UNDEFINED); + } + } + /* Return the validated SYS_TABLES.TYPE. */ return(type); } @@ -719,7 +812,14 @@ dict_tf_set( ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ + bool use_data_dir, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + bool atomic_writes) /*!< in: table uses atomic + writes */ { switch (format) { case REC_FORMAT_REDUNDANT: @@ -742,6 +842,22 @@ dict_tf_set( break; } + if (page_compressed) { + *flags = DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } + + if (atomic_writes) { + *flags |= (1 << DICT_TF_POS_ATOMIC_WRITES); + ut_ad(dict_tf_get_atomic_writes(*flags) == TRUE); + } + if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); } @@ -765,6 +881,9 @@ dict_tf_to_fsp_flags( ulint table_flags) /*!< in: dict_table_t::flags */ { ulint fsp_flags; + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); @@ -783,7 +902,20 @@ dict_tf_to_fsp_flags( fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) ? FSP_FLAGS_MASK_DATA_DIR : 0; + /* In addition, tablespace flags also contain if the page + compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression); + + /* In addition, tablespace flags also contain page compression level + if page compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + + /* In addition, tablespace flags also contain flag if atomic writes + is used for this table */ + fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); + ut_a(fsp_flags_is_valid(fsp_flags)); + ut_a(dict_tf_verify_flags(table_flags, fsp_flags)); return(fsp_flags); } @@ -811,10 +943,15 @@ dict_sys_tables_type_to_tf( /* Adjust bit zero. */ flags = redundant ? 0 : 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. 
*/ flags |= type & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES + ); return(flags); } @@ -842,10 +979,14 @@ dict_tf_to_sys_tables_type( /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ type = 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ type |= flags & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES); return(type); } diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 671f67eb1f8..6cfcb81bcd5 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -120,11 +121,25 @@ This flag prevents older engines from attempting to open the table and allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_WIDTH_DATA_DIR 1 +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +Width of atomic writes flag +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 1 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ + DICT_TF_WIDTH_ATOMIC_BLOBS \ - + DICT_TF_WIDTH_DATA_DIR) + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -140,9 +155,19 @@ allows InnoDB to update_create_info() accordingly. */ /** Zero relative shift position of the DATA_DIR field */ #define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) + /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ - + DICT_TF_WIDTH_DATA_DIR) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -160,6 +185,18 @@ allows InnoDB to update_create_info() accordingly. 
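The two conversions above share every known field between SYS_TABLES.TYPE and dict_table_t::flags, including the new page compression and atomic writes bits, and only adjust bit 0 (always 1 in TYPE, COMPACT in the table flags). Sketch with an illustrative carried-bits mask, not a server constant:

/* Bits 1..12: zip_ssize, atomic_blobs, data_dir, page_compression,
page_compression_level and atomic_writes, per the widths in this patch. */
static const unsigned long kCarriedBits = 0x1FFEUL;

unsigned long sys_tables_type_to_flags(unsigned long type, bool redundant)
{
	unsigned long flags = redundant ? 0 : 1;   /* bit 0 = COMPACT           */
	flags |= type & kCarriedBits;              /* everything else is shared */
	return flags;
}

unsigned long flags_to_sys_tables_type(unsigned long flags)
{
	unsigned long type = 1;                    /* bit 0 is always 1 in TYPE */
	type |= flags & kCarriedBits;
	return type;
}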
*/ #define DICT_TF_MASK_DATA_DIR \ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define DICT_TF_MASK_ATOMIC_WRITES \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ + << DICT_TF_POS_ATOMIC_WRITES) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -177,6 +214,19 @@ allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_HAS_DATA_DIR(flags) \ ((flags & DICT_TF_MASK_DATA_DIR) \ >> DICT_TF_POS_DATA_DIR) +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define DICT_TF_GET_ATOMIC_WRITES(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ + >> DICT_TF_POS_ATOMIC_WRITES) + /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h new file mode 100644 index 00000000000..236924758f1 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. 
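The DICT_TF_POS_* and DICT_TF_MASK_* macros above all follow the same recipe: positions are running sums of the field widths, and a mask is a field-width worth of ones shifted to the field position. Written out with plain constants (widths taken from this patch):

static const unsigned kWidthCompact       = 1;
static const unsigned kWidthZipSsize      = 4;
static const unsigned kWidthAtomicBlobs   = 1;
static const unsigned kWidthDataDir       = 1;
static const unsigned kWidthPageComp      = 1;
static const unsigned kWidthPageCompLevel = 4;

static const unsigned kPosPageComp      = kWidthCompact + kWidthZipSsize
					+ kWidthAtomicBlobs + kWidthDataDir;        /* 7  */
static const unsigned kPosPageCompLevel = kPosPageComp + kWidthPageComp;            /* 8  */
static const unsigned kPosAtomicWrites  = kPosPageCompLevel + kWidthPageCompLevel;  /* 12 */

/* mask = field-width worth of ones, shifted to the field position */
static const unsigned long kMaskPageCompLevel =
	((1UL << kWidthPageCompLevel) - 1) << kPosPageCompLevel;

unsigned long get_page_compression_level(unsigned long flags)
{
	return (flags & kMaskPageCompLevel) >> kPosPageCompLevel;
}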
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ + __attribute__((const)); + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return true if atomic writes are used, false if not used */ +UNIV_INLINE +ibool +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +ibool +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table); /*!< in: table */ + + +#ifndef UNIV_NONINL +#include "dict0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic new file mode 100644 index 00000000000..98b64723542 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.ic @@ -0,0 +1,191 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ +{ + ulint table_unused = DICT_TF_GET_UNUSED(table_flags); + ulint compact = DICT_TF_GET_COMPACT(table_flags); + ulint ssize = DICT_TF_GET_ZIP_SSIZE(table_flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(table_flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); + ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); + ulint fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags); + ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags); + ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags); + ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags); + + DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", + return(ULINT_UNDEFINED);); + + ut_ad(!table_unused); + ut_ad(!fsp_unused); + ut_ad(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_ad(compact == 0 || compact == 1); /* silence compiler */ + ut_ad(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_ad(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + + if (ssize != zip_ssize) { + fprintf(stderr, + "InnoDB: Error: table flags has zip_ssize %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has zip_ssize %ld\n", + ssize, zip_ssize); + return (FALSE); + } + if (atomic_blobs != fsp_atomic_blobs) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic_blobs %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_blobs %ld\n", + atomic_blobs, fsp_atomic_blobs); + + return (FALSE); + } + if (page_compression != fsp_page_compression) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file ahas page_compression %ld\n", + page_compression, fsp_page_compression); + + return (FALSE); + } + if (page_compression_level != fsp_page_compression_level) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression_level %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression_level %ld\n", + page_compression_level, fsp_page_compression_level); + + return (FALSE); + } + + if (atomic_writes != fsp_atomic_writes) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic writes %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_writes %ld\n", + atomic_writes, fsp_atomic_writes); + + return (FALSE); + } + + return(TRUE); +} + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. 
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level >= 0 && page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(dict_tf_get_page_compression(table->flags)); + + return(dict_tf_get_page_compression_level(table->flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*=========================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_table_is_page_compressed( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_page_compression(table->flags)); +} + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return true if atomic writes are used, false if not used */ +UNIV_INLINE +ibool +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_ATOMIC_WRITES(flags)); +} + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +ibool +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_atomic_writes(table->flags)); +} diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 56fda8b39b1..c5edd33f46b 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -128,6 +129,12 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this contains the space id of the page */ #define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* Following are used when page compression is used */ +#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on + compressed pages. */ +#define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compressin algorithm ZLIB. 
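The accessors above let the write path decide, per table, whether to page-compress and at what level; fil_compress_page() earlier in this patch falls back to srv_compress_zlib_level when the table did not specify a level. An illustrative stand-in for that decision:

static unsigned long my_default_zlib_level = 6;     /* srv_compress_zlib_level stand-in */

/* 0 disables compression entirely; otherwise use the table's level or the
global default. */
unsigned long effective_compression_level(bool page_compressed, unsigned long table_level)
{
	if (!page_compressed) {
		return 0;
	}
	return table_level ? table_level : my_default_zlib_level;
}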
*/ + /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -140,6 +147,7 @@ extern fil_addr_t fil_addr_null; #ifndef UNIV_INNOCHECKSUM /** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ @@ -202,6 +210,7 @@ ulint fil_space_get_type( /*===============*/ ulint id); /*!< in: space id */ + #endif /* !UNIV_HOTBACKUP */ /*******************************************************************//** Appends a new file to the chain of files of a space. File must be closed. @@ -742,8 +751,13 @@ fil_io( void* buf, /*!< in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ __attribute__((nonnull(8))); /**********************************************************************//** Waits for an aio operation to complete. This function is used to write the @@ -977,8 +991,33 @@ fil_mtr_rename_log( ulint new_space_id, /*!< in: tablespace id of the new table */ const char* new_name, /*!< in: new table name */ - const char* tmp_name); /*!< in: temp table name used while + const char* tmp_name) /*!< in: temp table name used while swapping */ + __attribute__((nonnull)); #endif /* !UNIV_INNOCHECKSUM */ + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void); +/*==================*/ +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void); +/*==================*/ +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. */ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space); /*!< in: space */ + #endif /* fil0fil_h */ diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h new file mode 100644 index 00000000000..e21eae7a5ee --- /dev/null +++ b/storage/innobase/include/fil0pagecompress.h @@ -0,0 +1,117 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" +#include "fsp0pagecompress.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/*******************************************************************//** +Returns the page compression level flag of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level if page compressed, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. +@return true if space using atomic writes, false if not */ +ibool +fil_space_get_atomic_writes( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf); /*!< in: page */ + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg); /*!> FSP_FLAGS_POS_UNUSED) +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \ + >> FSP_FLAGS_POS_ATOMIC_WRITES) + /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) +/** Set a PAGE_COMPRESSION into the correct bits in a given +tablespace flags. 
*/ +#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \ + (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION)) + +/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ + (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) +/** Set a ATOMIC_WRITES into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ + (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES)) + /* @} */ /* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 0d81e817cc9..0ca02a5652d 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,6 +64,9 @@ fsp_flags_is_valid( ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); + ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); @@ -104,6 +108,18 @@ fsp_flags_is_valid( return(false); } + /* Page compression level requires page compression and atomic blobs + to be set */ + if (page_compression_level || page_compression) { + if (!page_compression || !atomic_blobs) { + return(false); + } + } + + if (atomic_writes && !atomic_blobs) { + return (false); + } + #if UNIV_FORMAT_MAX != UNIV_FORMAT_B # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." #endif @@ -312,3 +328,4 @@ xdes_calc_descriptor_page( } #endif /* !UNIV_INNOCHECKSUM */ + diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h new file mode 100644 index 00000000000..417d4a6879e --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.h @@ -0,0 +1,64 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to file space. 
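To make the new flag helpers concrete, here is a minimal sketch (not part of the patch) of how a tablespace flags word would be composed and read back with the FSP_FLAGS_SET_* / FSP_FLAGS_GET_* macros added to fsp0fsp.h above; it assumes the surrounding InnoDB headers are included and that the PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL and ATOMIC_WRITES bit fields do not overlap:

	ulint flags = 0;

	/* Pack the options: page compression on, zlib level 6, atomic writes on. */
	flags = FSP_FLAGS_SET_PAGE_COMPRESSION(flags, 1);
	flags = FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, 6);
	flags = FSP_FLAGS_SET_ATOMIC_WRITES(flags, 1);

	/* The GET macros mask and shift the same bit positions back out. */
	ut_a(FSP_FLAGS_GET_PAGE_COMPRESSION(flags) == 1);
	ut_a(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) == 6);
	ut_a(FSP_FLAGS_GET_ATOMIC_WRITES(flags) == 1);

Note that fsp_flags_is_valid() above additionally rejects a compression level without the page compression bit, and rejects page compression or atomic writes when atomic blobs are not set.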
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef fsp0pagecompress_h +#define fsp0pagecompress_h + +/**********************************************************************//** +Reads the page compression level from the first page of a tablespace. +@return page compression level, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_compression_level( +/*=============================*/ + const page_t* page); /*!< in: first page of a tablespace */ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Extract the page compression level from tablespace flags. +A tablespace has only one physical page compression level +whether that page is compressed or not. +@return page compression level of the file-per-table tablespace, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic new file mode 100644 index 00000000000..1dffd1bedf1 --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.ic @@ -0,0 +1,61 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.ic +Implementation for helper functions for extracting/storing page +compression and atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not page compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Determine the tablespace is page compression level from dict_table_t::flags. 
+@return page compression level or 0 if not compressed*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); +} + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +ibool +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_ATOMIC_WRITES(flags)); +} diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 94fd908ab0c..e5c1734b842 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -29,6 +29,7 @@ Created May 26, 2009 Vasil Dimov #include "univ.i" #include "fil0fil.h" /* for FIL_PAGE_DATA */ +#include "ut0byte.h" /** @name Flags for inserting records in order If records are inserted in order, there are the following diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 4a744c1b268..3c70f9925fe 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -2,6 +2,7 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -150,6 +151,7 @@ enum os_file_create_t { #define OS_FILE_INSUFFICIENT_RESOURCE 78 #define OS_FILE_AIO_INTERRUPTED 79 #define OS_FILE_OPERATION_ABORTED 80 +#define OS_FILE_OPERATION_NOT_SUPPORTED 125 /* @} */ /** Types for aio operations @{ */ @@ -269,26 +271,26 @@ os_file_write The wrapper functions have the prefix of "innodb_". */ #ifdef UNIV_PFS_IO -# define os_file_create(key, name, create, purpose, type, success) \ +# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \ pfs_os_file_create_func(key, name, create, purpose, type, \ - success, __FILE__, __LINE__) + success, atomic_writes, __FILE__, __LINE__) -# define os_file_create_simple(key, name, create, access, success) \ +# define os_file_create_simple(key, name, create, access, success, atomic_writes) \ pfs_os_file_create_simple_func(key, name, create, access, \ - success, __FILE__, __LINE__) + success, atomic_writes, __FILE__, __LINE__) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success) \ + key, name, create_mode, access, success, atomic_writes) \ pfs_os_file_create_simple_no_error_handling_func( \ - key, name, create_mode, access, success, __FILE__, __LINE__) + key, name, create_mode, access, success, atomic_writes, __FILE__, __LINE__) # define os_file_close(file) \ pfs_os_file_close_func(file, __FILE__, __LINE__) # define os_aio(type, mode, name, file, buf, offset, \ - n, message1, message2) \ + n, message1, message2, write_size) \ pfs_os_aio_func(type, mode, name, file, buf, offset, \ - n, message1, message2, __FILE__, __LINE__) + n, message1, message2, write_size, __FILE__, __LINE__) # define os_file_read(file, buf, offset, n) \ pfs_os_file_read_func(file, buf, offset, n, __FILE__, __LINE__) @@ -310,22 +312,22 @@ The wrapper functions have the prefix of "innodb_". 
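As a rough sketch of how the extra trailing arguments are meant to be supplied by callers (a hypothetical call site, not code from this patch; table, filepath and fh are placeholders), the atomic-writes table option read with dict_table_get_atomic_writes() earlier in this change is simply forwarded to os_file_create() through the updated wrapper:

	ibool ret;
	ibool awrites = dict_table_get_atomic_writes(table);

	os_file_t fh = os_file_create(
		innodb_file_data_key, filepath,
		OS_FILE_CREATE, OS_FILE_NORMAL, OS_DATA_FILE,
		&ret, awrites);

Plain synchronous reads and writes keep passing 0 for the new write_size argument of os_aio()/fil_io(), as the updated callers later in this patch do.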
*/ /* If UNIV_PFS_IO is not defined, these I/O APIs point to original un-instrumented file I/O APIs */ -# define os_file_create(key, name, create, purpose, type, success) \ - os_file_create_func(name, create, purpose, type, success) +# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \ + os_file_create_func(name, create, purpose, type, success, atomic_writes) -# define os_file_create_simple(key, name, create_mode, access, success) \ - os_file_create_simple_func(name, create_mode, access, success) +# define os_file_create_simple(key, name, create_mode, access, success, atomic_writes) \ + os_file_create_simple_func(name, create_mode, access, success, atomic_writes) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success) \ - os_file_create_simple_no_error_handling_func( \ - name, create_mode, access, success) + key, name, create_mode, access, success, atomic_writes) \ + os_file_create_simple_no_error_handling_func( \ + name, create_mode, access, success, atomic_writes) # define os_file_close(file) os_file_close_func(file) -# define os_aio(type, mode, name, file, buf, offset, n, message1, message2) \ +# define os_aio(type, mode, name, file, buf, offset, n, message1, message2, write_size) \ os_aio_func(type, mode, name, file, buf, offset, n, \ - message1, message2) + message1, message2, write_size) # define os_file_read(file, buf, offset, n) \ os_file_read_func(file, buf, offset, n) @@ -468,7 +470,8 @@ os_file_create_simple_func( ulint create_mode,/*!< in: create mode */ ulint access_type,/*!< in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ - ibool* success);/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ibool atomic_writes); /*!space_id, 0, (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + group, 0); srv_stats.os_log_pending_writes.dec(); @@ -1859,7 +1860,7 @@ log_group_checkpoint( write_offset / UNIV_PAGE_SIZE, write_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, - buf, ((byte*) group + 1)); + buf, ((byte*) group + 1), 0); ut_ad(((ulint) group & 0x1UL) == 0); } @@ -1939,7 +1940,7 @@ log_group_read_checkpoint_info( fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->space_id, 0, field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0); } /******************************************************//** @@ -2233,7 +2234,7 @@ loop: fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, (ulint) (source_offset / UNIV_PAGE_SIZE), (ulint) (source_offset % UNIV_PAGE_SIZE), - len, buf, NULL); + len, buf, NULL, 0); start_lsn += len; buf += len; @@ -2298,7 +2299,7 @@ log_group_archive_file_header_write( dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, 2 * OS_FILE_LOG_BLOCK_SIZE, - buf, &log_archive_io); + buf, &log_archive_io, 0); } /******************************************************//** @@ -2334,7 +2335,7 @@ log_group_archive_completed_header_write( dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf + LOG_FILE_ARCH_COMPLETED, - &log_archive_io); + &log_archive_io, 0); } /******************************************************//** @@ -2462,7 +2463,7 @@ loop: (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, - &log_archive_io); + &log_archive_io, 0); start_lsn += len; next_offset += len; diff --git 
a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 8cefa9e4b70..a3df6a8d5bd 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2,6 +2,7 @@ Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2063,7 +2064,7 @@ recv_apply_log_recs_for_backup(void) error = fil_io(OS_FILE_READ, TRUE, recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); if (error == DB_SUCCESS && !buf_zip_decompress(block, TRUE)) { exit(1); @@ -2073,7 +2074,7 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } if (error != DB_SUCCESS) { @@ -2102,13 +2103,13 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); } else { error = fil_io(OS_FILE_WRITE, TRUE, recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); @@ -3074,7 +3075,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, max_cp_group->space_id, 0, 0, 0, LOG_FILE_HDR_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { @@ -3105,7 +3106,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, max_cp_group->space_id, 0, 0, 0, OS_FILE_LOG_BLOCK_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); } #ifdef UNIV_LOG_ARCHIVE @@ -3753,8 +3754,8 @@ ask_again: #endif /* Read the archive file header */ - fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->archive_space_id, 0, 0, - LOG_FILE_HDR_SIZE, buf, NULL); + fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0, + LOG_FILE_HDR_SIZE, buf, NULL, 0); /* Check if the archive file header is consistent */ @@ -3827,7 +3828,7 @@ ask_again: fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->archive_space_id, read_offset / UNIV_PAGE_SIZE, - read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0); ret = recv_scan_log_recs( (buf_pool_get_n_pages() diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index d1b2b12bf59..60331f9c483 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. 
Those modifications are @@ -42,8 +43,14 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#include "linux/falloc.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -60,6 +67,13 @@ Created 10/21/1995 Heikki Tuuri #include #endif +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -175,6 +189,25 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated for + page compressed page and + freed after the write + has been completed */ + + ulint write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + #ifdef WIN_ASYNC_IO HANDLE handle; /*!< handle object we need in the OVERLAPPED struct */ @@ -294,6 +327,79 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len); /*!< in: length of area */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. 
+@return TRUE if success */
+static __attribute__((warn_unused_result))
+ibool
+os_file_set_atomic_writes(
+/*======================*/
+ const char* name /*!< in: name of the file */
+ __attribute__((unused)),
+ os_file_t file /*!< in: handle to the file */
+ __attribute__((unused)))
+{
+#ifdef DFS_IOCTL_ATOMIC_WRITE_SET
+ int atomic_option = 1;
+
+ if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) {
+
+ os_file_handle_error_no_exit(name, "ioctl", FALSE, __FILE__, __LINE__);
+ return(FALSE);
+ }
+
+ return(TRUE);
+#else
+ fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on "
+ "non-supported platform! Please restart with "
+ "innodb_use_atomic_writes disabled.\n");
+ return(FALSE);
+#endif
+}
+
+
 #ifdef UNIV_DEBUG
 # ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
@@ -498,7 +604,17 @@ os_file_get_last_error_low(
 fprintf(stderr,
 "InnoDB: The error means mysqld does not have"
 " the access rights to\n"
- "InnoDB: the directory.\n");
+ "InnoDB: the directory.\n");
+ } else if (err == ECANCELED) {
+ fprintf(stderr,
+ "InnoDB: Operation canceled (%d):%s\n",
+ err, strerror(err));
+
+ if(srv_use_atomic_writes) {
+ fprintf(stderr,
+ "InnoDB: Error trying to enable atomic writes on "
+ "non-supported destination!\n");
+ }
 } else {
 if (strerror(err) != NULL) {
 fprintf(stderr,
@@ -530,6 +646,8 @@ os_file_get_last_error_low(
 case ENOTDIR:
 case EISDIR:
 return(OS_FILE_PATH_ERROR);
+ case ECANCELED:
+ return(OS_FILE_OPERATION_NOT_SUPPORTED);
 case EAGAIN:
 if (srv_use_native_aio) {
 return(OS_FILE_AIO_RESOURCES_RESERVED);
@@ -574,9 +692,11 @@ os_file_handle_error_cond_exit(
 const char* operation, /*!< in: operation */
 ibool should_exit, /*!< in: call exit(3) if unknown error
 and this parameter is TRUE */
- ibool on_error_silent)/*!< in: if TRUE then don't print
+ ibool on_error_silent,/*!< in: if TRUE then don't print
 any message to the log iff it is
 an unknown non-fatal error */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
 {
 ulint err;
@@ -606,6 +726,9 @@ os_file_handle_error_cond_exit(
 " InnoDB: Disk is full. Try to clean the disk"
 " to free space.\n");
 
+ fprintf(stderr,
+ " InnoDB: at file %s and at line %lu\n", file, line);
+
 os_has_said_disk_full = TRUE;
 
 fflush(stderr);
@@ -652,6 +775,9 @@ os_file_handle_error_cond_exit(
 operation, err);
 }
 
+ fprintf(stderr,
+ " InnoDB: at file %s and at line %lu\n", file, line);
+
 if (should_exit) {
 ut_print_timestamp(stderr);
 fprintf(stderr, " InnoDB: Cannot continue "
@@ -675,10 +801,12 @@
 ibool
 os_file_handle_error(
 /*=================*/
 const char* name, /*!< in: name of a file or NULL */
- const char* operation) /*!< in: operation */
+ const char* operation, /*!< in: operation */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
 {
 /* exit in case of unknown error */
- return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
+ return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line));
 }
 
 /****************************************************************//**
@@ -690,12 +818,14 @@ os_file_handle_error_no_exit(
 /*=========================*/
 const char* name, /*!< in: name of a file or NULL */
 const char* operation, /*!< in: operation */
- ibool on_error_silent)/*!< in: if TRUE then don't print
+ ibool on_error_silent,/*!< in: if TRUE then don't print
 any message to the log.
*/ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -835,7 +965,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -846,7 +976,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -868,7 +998,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -880,7 +1010,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -952,7 +1082,7 @@ next_file: return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1038,7 +1168,7 @@ next_file: goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1089,7 +1219,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1102,7 +1232,7 @@ os_file_create_directory( if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1126,7 +1256,8 @@ os_file_create_simple_func( ulint create_mode,/*!< in: create mode */ ulint access_type,/*!< in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ibool atomic_writes) /*!slots = static_cast( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); + #ifdef __WIN__ array->handles = static_cast(ut_malloc(n * sizeof(HANDLE))); #endif /* __WIN__ */ @@ -3803,8 +3996,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { -#ifdef WIN_ASYNC_IO ulint i; +#ifdef WIN_ASYNC_IO for (i = 0; i < array->n_slots; i++) { os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); @@ -3826,6 +4019,14 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4159,7 +4360,12 @@ os_aio_array_reserve_slot( void* buf, /*!< in: buffer where to read or from which to write */ os_offset_t offset, /*!< in: file offset */ - ulint len) /*!< in: length of the block to read or write */ + ulint len, /*!< in: length of the block 
to read or write */ + ulint write_size) /*!< in: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4249,6 +4455,54 @@ found: slot->buf = static_cast(buf); slot->offset = offset; slot->io_already_done = FALSE; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + + /* If the space is page compressed and this is write operation + and if either only index pages compression is disabled or + page is index page and only index pages compression is enabled then + we compress the page */ + if (message1 && + type == OS_FILE_WRITE && + fil_space_is_page_compressed(fil_node_get_space_id(slot->message1)) && + (srv_page_compress_index_pages == false || + (srv_page_compress_index_pages == true && fil_page_is_index_page(slot->buf)))) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + + ut_ad(slot->page_buf); + + /* Write buffer full of zeros, this is needed for trim, + can't really avoid this now. */ + memset(slot->page_buf, 0, len); + + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, &real_len); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back, not sure if this is really needed + below */ + os_mutex_enter(array->mutex); + + } + #ifdef WIN_ASYNC_IO control = &slot->control; @@ -4523,10 +4777,15 @@ os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - void* message2)/*!< in: message for the aio handler + void* message2,/*!< in: message for the aio handler (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4624,7 +4883,8 @@ try_again: } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n); + name, buf, offset, n, write_size); + if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4704,7 +4964,7 @@ err_exit: os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? 
"aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4817,7 +5077,7 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { retry = TRUE; } else { @@ -4847,9 +5107,17 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret = WriteFile(slot->file, slot->buf, + if (slot->message1 && + fil_space_is_page_compressed(fil_node_get_space_id(slot->message1)) && + slot->page_buf) { + ret = WriteFile(slot->file, slot->page_buf, (DWORD) slot->len, &len, &(slot->control)); + } else { + ret = WriteFile(slot->file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + } break; case OS_FILE_READ: @@ -4881,6 +5149,29 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->message1 && + fil_space_is_page_compressed(fil_node_get_space_id(slot->message1))) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len); + } + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + os_aio_array_free_slot(array, slot); return(ret_val); @@ -4970,6 +5261,34 @@ retry: /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. */ + if (slot->message1 && + fil_space_is_page_compressed(fil_node_get_space_id(slot->message1))) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len); + } + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -5113,6 +5432,13 @@ found: } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5120,7 +5446,7 @@ found: windows and linux native AIO. We should probably look into this to transparently re-submit the IO. 
*/ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5323,7 +5649,7 @@ consecutive_loop: if (slot->reserved && slot != aio_slot - && slot->offset == slot->offset + aio_slot->len + && slot->offset == aio_slot->offset + aio_slot->len && slot->type == aio_slot->type && slot->file == aio_slot->file) { @@ -5791,4 +6117,147 @@ os_aio_all_slots_free(void) } #endif /* UNIV_DEBUG */ +#ifdef _WIN32 +#include +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len) /*!< in: length of area */ +{ + + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. + if (trim_len == 0 || + (slot->write_size > 0 && len >= slot->write_size)) { + + if (slot->write_size > 0 && len >= slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + slot->write_size = len; + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lx len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", ret, (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + slot->write_size = 0; + + return (FALSE); + } else { + slot->write_size = len; + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + slot->write_size = 0; + +#endif /* HAVE_FALLOCATE ... 
*/
+
+#elif defined(_WIN32)
+ FILE_LEVEL_TRIM flt;
+ flt.Key = 0;
+ flt.NumRanges = 1;
+ flt.Ranges[0].Offset = off;
+ flt.Ranges[0].Length = trim_len;
+
+ BOOL ret = DeviceIoControl(file,FSCTL_FILE_LEVEL_TRIM,&flt, sizeof(flt), NULL, NULL, NULL, NULL);
+
+ if (!ret) {
+ /* After first failure do not try to trim again */
+ os_fallocate_failed = TRUE;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: [Warning] fallocate call failed with error.\n"
+ " InnoDB: start: %lx len: %lu payload: %lu\n"
+ " InnoDB: Disabling fallocate for now.\n", (slot->offset+len), trim_len, len);
+
+ os_file_handle_error_no_exit(slot->name,
+ " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ",
+ FALSE, __FILE__, __LINE__);
+
+ slot->write_size = 0;
+ return (FALSE);
+ } else {
+ slot->write_size = len;
+ }
+#endif
+
+#define SECT_SIZE 512
+ srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE));
+ srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8)));
+ srv_stats.page_compressed_trim_op.inc();
+
+ return (TRUE);
+
+}
 #endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Allocate memory for a temporary buffer used for page compression. This
+buffer is freed later. */
+UNIV_INTERN
+void
+os_slot_alloc_page_buf(
+/*===================*/
+ os_aio_slot_t* slot) /*!< in: slot structure */
+{
+ byte* cbuf2;
+ byte* cbuf;
+
+ cbuf2 = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE*2));
+ cbuf = static_cast<byte*>(ut_align(cbuf2, UNIV_PAGE_SIZE));
+ slot->page_compression_page = static_cast<byte*>(cbuf2);
+ slot->page_buf = static_cast<byte*>(cbuf);
+}
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index 3b3da2f070f..44a60961110 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -290,6 +290,12 @@ static monitor_info_t innodb_counter_info[] =
 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN},
 
+ {"buffer_index_pages_written", "buffer",
+ "Number of index pages written (innodb_index_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN},
+
 {"buffer_pages_read", "buffer",
 "Number of pages read (innodb_pages_read)",
 static_cast<monitor_type_t>(
 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
@@ -875,6 +881,41 @@ static monitor_info_t innodb_counter_info[] =
 MONITOR_NONE,
 MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS},
 
+ {"compress_saved", "compression",
+ "Number of bytes saved by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED},
+
+ {"compress_trim_sect512", "compression",
+ "Number of sect-512 TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512},
+
+ {"compress_trim_sect4096", "compression",
+ "Number of sect-4K TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096},
+
+ {"compress_pages_page_compressed", "compression",
+ "Number of pages compressed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED},
+
+ {"compress_page_compressed_trim_op", "compression",
+ "Number of TRIM operations performed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP},
+
+ {"compress_page_compressed_trim_op_saved", "compression",
+ "Number of TRIM operations saved by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED},
+
+ {"compress_pages_page_decompressed", "compression",
"Number of pages decompressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1528,6 +1569,11 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; + /* innodb_index_pages_written, the number of page written */ + case MONITOR_OVLD_INDEX_PAGES_WRITTEN: + value = srv_stats.index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); @@ -1769,6 +1815,28 @@ srv_mon_process_existing_counter( value = btr_cur_n_non_sea; break; + case MONITOR_OVLD_PAGE_COMPRESS_SAVED: + value = srv_stats.page_compression_saved; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512: + value = srv_stats.page_compression_trim_sect512; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096: + value = srv_stats.page_compression_trim_sect4096; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSED: + value = srv_stats.pages_page_compressed; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP: + value = srv_stats.page_compressed_trim_op; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED: + value = srv_stats.page_compressed_trim_op_saved; + break; + case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: + value = srv_stats.pages_page_decompressed; + break; + default: ut_error; } diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 4c5753ac40e..90864cee9ef 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -145,6 +145,24 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +/* If this flag is TRUE, then we will use page compression +to the pages */ +UNIV_INTERN my_bool srv_compress_pages = FALSE; +/* If this flag is TRUE, then we will use page compression +only for index pages */ +UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; +UNIV_INTERN long srv_trim_pct = 100; +/* Default compression level if page compression is used and no compression +level is set for the table*/ +UNIV_INTERN long srv_compress_zlib_level = 6; +/* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) +to the pages */ +UNIV_INTERN my_bool srv_use_trim = TRUE; +/* If this flag is TRUE, then we will use posix fallocate for file extentsion */ +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +/* If this flag is TRUE, then we disable doublewrite buffer */ +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; + #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function pointers, because they are not available on Windows Server 2003 and @@ -347,11 +365,6 @@ batch flushing i.e.: LRU flushing and flush_list flushing. The rest of the pages are used for single page flushing. 
*/ UNIV_INTERN ulong srv_doublewrite_batch_size = 120; -UNIV_INTERN ibool srv_use_atomic_writes = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -UNIV_INTERN ibool srv_use_posix_fallocate = TRUE; -#endif - UNIV_INTERN ulong srv_replication_delay = 0; /*-------------------------------------------*/ @@ -375,6 +388,16 @@ static ulint srv_n_rows_read_old = 0; UNIV_INTERN ulint srv_truncated_status_writes = 0; UNIV_INTERN ulint srv_available_undo_logs = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; +UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; +UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; + + /* Set the following to 0 if you want InnoDB to write messages on stderr on startup/shutdown. */ UNIV_INTERN ibool srv_print_verbose_log = TRUE; @@ -1457,6 +1480,14 @@ srv_export_innodb_status(void) srv_truncated_status_writes; export_vars.innodb_available_undo_logs = srv_available_undo_logs; + export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved; + export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; + export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; + export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; + export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; + export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; + export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; #ifdef UNIV_DEBUG if (purge_sys->done.trx_no == 0 diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index efe9f094c0d..0517f4b1468 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -3,6 +3,7 @@ Copyright (c) 1996, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -126,7 +127,10 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32]; +/** pgcomp_thread are 16 total */ +#define START_PGCOMP_CNT (SRV_MAX_N_IO_THREADS + 6 + 32) +#define PGCOMP_MAX_WORKER 16 +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + PGCOMP_MAX_WORKER]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. 
*/ @@ -522,7 +526,7 @@ create_log_file( *file = os_file_create( innodb_file_log_key, name, - OS_FILE_CREATE, OS_FILE_NORMAL, OS_LOG_FILE, &ret); + OS_FILE_CREATE, OS_FILE_NORMAL, OS_LOG_FILE, &ret, FALSE); ib_logf(IB_LOG_LEVEL_INFO, "Setting log file %s size to %lu MB", @@ -715,7 +719,7 @@ open_log_file( *file = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); return(DB_ERROR); @@ -806,7 +810,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode) { @@ -849,7 +853,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -881,17 +885,17 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else if (i == 0) { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -1078,7 +1082,7 @@ srv_undo_tablespace_create( innodb_file_data_key, name, srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode && ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -1159,7 +1163,8 @@ srv_undo_tablespace_open( | OS_FILE_ON_ERROR_SILENT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + FALSE); /* If the file open was successful then load the tablespace. 
*/ @@ -1430,6 +1435,691 @@ srv_start_wait_for_purge_to_start() } } +/* JAN: TODO: */ +/**********************************************************************************/ +extern int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time); +extern ibool buf_flush_start(buf_pool_t* buf_pool, enum buf_flush flush_type); +extern void buf_flush_end(buf_pool_t* buf_pool, enum buf_flush flush_type); +extern void buf_flush_common(enum buf_flush flush_type, ulint page_count); +extern ulint buf_flush_batch(buf_pool_t* buf_pool, enum buf_flush flush_type, ulint min_n, lsn_t lsn_limit); + +typedef enum wrk_status { + WRK_ITEM_SET=0, + WRK_ITEM_START=1, + WRK_ITEM_DONE=2, + WRK_ITEM_SUCCESS=2, + WRK_ITEM_FAILED=3, + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +typedef enum wthr_status { + WTHR_NOT_INIT=0, + WTHR_INITIALIZED=1, + WTHR_SIG_WAITING=2, + WTHR_RUNNING=3, + WTHR_NO_WORK=4, + WTHR_KILL_IT=5, + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +typedef struct wrk_itm +{ + /****************************/ + /* Need to group into struct*/ + buf_pool_t* buf_pool; //buffer-pool instance + int flush_type; //flush-type for buffer-pool flush operation + int min; //minimum number of pages requested to be flushed + unsigned long long lsn_limit; //lsn limit for the buffer-pool flush operation + /****************************/ + + unsigned long result; //flush pages count + unsigned long t_usec; //time-taken in usec + long id_usr; //thread-id currently working + wrk_status_t wi_status; //flag + struct wrk_itm *next; +} wrk_t; + +typedef enum op_q_status { + Q_NOT_INIT=0, + Q_EMPTY=1, + Q_INITIALIZED=2, + Q_PROCESS=3, + Q_DONE=4, + Q_ERROR=5, + Q_STATUS_UNDEFINED +} q_status_t; + +typedef struct op_queue +{ + pthread_mutex_t mtx; + pthread_cond_t cv; + q_status_t flag; + wrk_t *head; + wrk_t *tail; +} opq_t; + +opq_t wq, cq; + +typedef struct thread_sync +{ + int wthread_id; + pthread_t wthread; + opq_t *wq; + opq_t *cq; + wthr_status_t wt_status; + unsigned long stat_universal_num_processed; + unsigned long stat_cycle_num_processed; +} thread_sync_t; + +/* Global XXX:DD needs to be cleaned */ +int exit_flag; +ulint check_wrk_done_count; +static ulint done_cnt_flag; +static int pgc_n_threads = 8; + +thread_sync_t pc_sync[PGCOMP_MAX_WORKER]; +static wrk_t work_items[PGCOMP_MAX_WORKER]; +static int pgcomp_wrk_initialized = -1; + +int set_check_done_flag_count(int cnt) +{ + return(check_wrk_done_count = cnt); +} + +int set_pgcomp_wrk_init_done(void) +{ + pgcomp_wrk_initialized = 1; + return 0; +} + +int is_pgcomp_wrk_init_done(void) +{ + return(pgcomp_wrk_initialized == 1); +} + +ulint set_done_cnt_flag(ulint val) +{ + /* + * Assumption: The thread calling into set_done_cnt_flag + * needs to have "cq.mtx" acquired, else not safe. 
+ */ + done_cnt_flag = val; + return done_cnt_flag; +} + + +ulint cv_done_inc_flag_sig(thread_sync_t * ppc) +{ + pthread_mutex_lock(&ppc->cq->mtx); + ppc->stat_universal_num_processed++; + ppc->stat_cycle_num_processed++; + done_cnt_flag++; + if(!(done_cnt_flag <= check_wrk_done_count)) { + fprintf(stderr, "ERROR: done_cnt:%lu check_wrk_done_count:%lu\n", + done_cnt_flag, check_wrk_done_count); + } + assert(done_cnt_flag <= check_wrk_done_count); + pthread_mutex_unlock(&ppc->cq->mtx); + if(done_cnt_flag == check_wrk_done_count) { + ppc->wq->flag = Q_DONE; + pthread_mutex_lock(&ppc->cq->mtx); + ppc->cq->flag = Q_DONE; + pthread_cond_signal(&ppc->cq->cv); + pthread_mutex_unlock(&ppc->cq->mtx); + } + return(done_cnt_flag); +} + +int q_remove_wrk(opq_t *q, wrk_t **wi) +{ + int ret = 0; + + if(!wi || !q) { + return -1; + } + + pthread_mutex_lock(&q->mtx); + assert(!((q->tail == NULL) && (q->head != NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* get the first in the list*/ + *wi = q->head; + if(q->head) { + ret = 0; + q->head = q->head->next; + (*wi)->next = NULL; + if(!q->head) { + q->tail = NULL; + } + } else { + q->tail = NULL; + ret = 1; /* indicating remove from queue failed */ + } + pthread_mutex_unlock(&q->mtx); + return (ret); +} + +int is_busy_wrk_itm(wrk_t *wi) +{ + if(!wi) { + return -1; + } + return(!(wi->id_usr == -1)); +} + +int setup_wrk_itm(int items) +{ + int i; + for(i=0; imtx, NULL); + pthread_cond_init(&q->cv, NULL); + q->flag = Q_INITIALIZED; + q->head = q->tail = NULL; + + return 0; +} + +#if 0 +int drain_cq(opq_t *cq, int items) +{ + int i=0; + + if(!cq) { + return -1; + } + pthread_mutex_lock(&cq->mtx); + for(i=0; ihead = cq->tail = NULL; + pthread_mutex_unlock(&cq->mtx); + return 0; +} +#endif + +int q_insert_wrk_list(opq_t *q, wrk_t *w_list) +{ + if((!q) || (!w_list)) { + fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list); + return -1; + } + + pthread_mutex_lock(&q->mtx); + + assert(!((q->tail == NULL) && (q->head != NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* list is empty */ + if(!q->tail) { + q->head = q->tail = w_list; + } else { + /* added the first of the node to list */ + assert(q->head != NULL); + q->tail->next = w_list; + } + + /* move tail to the last node */ + while(q->tail->next) { + q->tail = q->tail->next; + } + pthread_mutex_unlock(&q->mtx); + + return 0; +} + +int flush_pool_instance(wrk_t *wi) +{ + struct timeval p_start_time, p_end_time, d_time; + + if(!wi) { + fprintf(stderr, "work item invalid wi:%p\n", wi); + return -1; + } + + wi->t_usec = 0; + if (!buf_flush_start(wi->buf_pool, (buf_flush)wi->flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ + fprintf(stderr, "flush_start Failed, flush_type:%d\n", + (buf_flush)wi->flush_type); + return -1; + } + +#ifdef UNIV_DEBUG + /* Record time taken for the OP in usec */ + gettimeofday(&p_start_time, 0x0); +#endif + + if((buf_flush)wi->flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. 
+ */ + buf_pool_mutex_enter(wi->buf_pool); + wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU); + buf_pool_mutex_exit(wi->buf_pool); + wi->min = ut_min(srv_LRU_scan_depth,wi->min); + } + + wi->result = buf_flush_batch(wi->buf_pool, + (buf_flush)wi->flush_type, + wi->min, wi->lsn_limit); + + buf_flush_end(wi->buf_pool, (buf_flush)wi->flush_type); + buf_flush_common((buf_flush)wi->flush_type, wi->result); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); +#endif + + return 0; +} + +int service_page_comp_io(thread_sync_t * ppc) +{ + wrk_t *wi = NULL; + int ret=0; + + pthread_mutex_lock(&ppc->wq->mtx); + do{ + ppc->wt_status = WTHR_SIG_WAITING; + ret = pthread_cond_wait(&ppc->wq->cv, &ppc->wq->mtx); + ppc->wt_status = WTHR_RUNNING; + if(ret == ETIMEDOUT) { + fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%lu] ret:%d\n", + done_cnt_flag, ret); + } else if(ret == EINVAL || ret == EPERM) { + fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%lu] ret:%d\n", + done_cnt_flag, ret); + } + if(ppc->wq->flag == Q_PROCESS) { + break; + } else { + pthread_mutex_unlock(&ppc->wq->mtx); + return -1; + } + } while (ppc->wq->flag == Q_PROCESS && ret == 0); + + pthread_mutex_unlock(&ppc->wq->mtx); + + while (ppc->cq->flag == Q_PROCESS) { + wi = NULL; + /* Get the work item */ + if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) { + ppc->wt_status = WTHR_NO_WORK; + return -1; + } + + assert(ret==0); + assert(wi != NULL); + assert(0 == is_busy_wrk_itm(wi)); + assert(wi->id_usr == -1); + + wi->id_usr = ppc->wthread; + wi->wi_status = WRK_ITEM_START; + + /* Process work item */ + if(0 != (ret = flush_pool_instance(wi))) { + fprintf(stderr, "FLUSH op failed ret:%d\n", ret); + wi->wi_status = WRK_ITEM_FAILED; + } + + ret = q_insert_wrk_list(ppc->cq, wi); + + assert(0==ret); + assert(check_wrk_done_count >= done_cnt_flag); + wi->wi_status = WRK_ITEM_SUCCESS; + if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) { + break; + } + } + return(0); +} + +/******************************************************************//** +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(page_comp_io_thread)( +/*==========================================*/ + void * arg) +{ + thread_sync_t *ppc_io = ((thread_sync_t *)arg); + + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + service_page_comp_io(ppc_io); + ppc_io->stat_cycle_num_processed = 0; + } + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +int print_queue_wrk_itm(opq_t *q) +{ +#if UNIV_DEBUG + wrk_t *wi = NULL; + + if(!q) { + fprintf(stderr, "queue NULL\n"); + return -1; + } + + if(!q->head || !q->tail) { + assert(!(((q->tail==NULL) && (q->head!=NULL)) && ((q->tail != NULL) && (q->head == NULL)))); + fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail); + return 0; + } + + pthread_mutex_lock(&q->mtx); + for(wi = q->head; (wi != NULL) ; wi = wi->next) { + //fprintf(stderr, "- [%p] %p %lu %luus [%ld] >%p\n", + // wi, wi->buf_pool, wi->result, wi->t_usec, wi->id_usr, wi->next); + fprintf(stderr, "- [%p] [%s] >%p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->next); + } + pthread_mutex_unlock(&q->mtx); +#endif + return(0); +} + +int print_wrk_list(wrk_t *wi_list) +{ + wrk_t *wi = wi_list; + int i=0; + + if(!wi_list) { + fprintf(stderr, "list NULL\n"); + } + + while(wi) { + fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->result, wi->t_usec, wi->next); + wi = 
wi->next; + i++; + } + fprintf(stderr, "list len: %d\n", i); + return 0; +} + +int pgcomp_handler(wrk_t *w_list) +{ + int ret=0; + opq_t *wrk_q=NULL, *comp_q=NULL; + + wrk_q=&wq; + comp_q=&cq; + + pthread_mutex_lock(&wrk_q->mtx); + /* setup work queue here.. */ + wrk_q->flag = Q_EMPTY; + pthread_mutex_unlock(&wrk_q->mtx); + + ret = q_insert_wrk_list(wrk_q, w_list); + if(ret != 0) { + fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n", + __FUNCTION__, &wq, w_list); + return -1; + } + +retry_submit: + pthread_mutex_lock(&wrk_q->mtx); + /* setup work queue here.. */ + wrk_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&wrk_q->mtx); + + + pthread_mutex_lock(&comp_q->mtx); + if(0 != set_done_cnt_flag(0)) { + fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__); + pthread_mutex_unlock(&comp_q->mtx); + return -1; + } + comp_q->flag = Q_PROCESS; + pthread_mutex_unlock(&comp_q->mtx); + + /* if threads are waiting request them to start */ + pthread_mutex_lock(&wrk_q->mtx); + wrk_q->flag = Q_PROCESS; + pthread_cond_broadcast(&wrk_q->cv); + pthread_mutex_unlock(&wrk_q->mtx); + + /* Wait on all worker-threads to complete */ + pthread_mutex_lock(&comp_q->mtx); + if (comp_q->flag != Q_DONE) { + do { + pthread_cond_wait(&comp_q->cv, &comp_q->mtx); + if(comp_q->flag != Q_DONE) { + fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + } + continue; + } else if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&comp_q->mtx); + goto retry_submit; + + assert(!done_cnt_flag); + continue; + } + assert(done_cnt_flag == srv_buf_pool_instances); + + if ((comp_q->flag == Q_DONE) && + (done_cnt_flag == srv_buf_pool_instances)) { + break; + } + } while((comp_q->flag == Q_INITIALIZED) && + (done_cnt_flag != srv_buf_pool_instances)); + } else { + fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + if (!done_cnt_flag) { + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&comp_q->mtx); + goto retry_submit; + assert(!done_cnt_flag); + } + assert(done_cnt_flag == srv_buf_pool_instances); + } + + pthread_mutex_unlock(&comp_q->mtx); + pthread_mutex_lock(&wrk_q->mtx); + wrk_q->flag = Q_DONE; + pthread_mutex_unlock(&wrk_q->mtx); + + return 0; +} + +/******************************************************************//** +@return a dummy parameter*/ +int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq) +{ + int i=0; + + if(is_pgcomp_wrk_init_done()) { + fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n"); + return -1; + } + + if(!wq || !cq) { + fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq); + return -1; + } + + /* work-item setup */ + setup_wrk_itm(wrk_cnt); + + /* wq & cq setup */ + init_queue(wq); + init_queue(cq); + + /* Mark each of the thread sync entires */ + for(i=0; i < PGCOMP_MAX_WORKER; i++) { + pc_sync[i].wthread_id = i; + } + + /* Create threads for page-compression-flush */ 
+ for(i=0; i < num_threads; i++) { + pc_sync[i].wthread_id = i; + pc_sync[i].wq = wq; + pc_sync[i].cq = cq; + os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), + thread_ids + START_PGCOMP_CNT + i); + //pc_sync[i].wthread = thread_ids[START_PGCOMP_CNT + i]; + pc_sync[i].wthread = (START_PGCOMP_CNT + i); + pc_sync[i].wt_status = WTHR_INITIALIZED; + } + + set_check_done_flag_count(wrk_cnt); + set_pgcomp_wrk_init_done(); + + return 0; +} + + +int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads) +{ + long stat_tot=0; + unsigned int i=0; + for(i=0; i< num_threads;i++) { + stat_tot+=wthr[i].stat_universal_num_processed; + fprintf(stderr, "[%d] stat [%lu]\n", wthr[i].wthread_id, + wthr[i].stat_universal_num_processed); + } + fprintf(stderr, "Stat-Total:%lu\n", stat_tot); + return (0); +} + +int reset_wrk_itm(int items) +{ + int i; + + pthread_mutex_lock(&wq.mtx); + wq.head = wq.tail = NULL; + pthread_mutex_unlock(&wq.mtx); + + pthread_mutex_lock(&cq.mtx); + for(i=0;i Date: Thu, 19 Dec 2013 18:04:26 +0200 Subject: [PATCH 02/56] Atomic writes require also atomic_blobs. Add that missing flag to dictionary setting and from there it will be stored to table space. --- storage/innobase/include/dict0dict.ic | 1 + 1 file changed, 1 insertion(+) diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 65967552b87..65c1bfca24f 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -856,6 +856,7 @@ dict_tf_set( if (atomic_writes) { *flags |= (1 << DICT_TF_POS_ATOMIC_WRITES); ut_ad(dict_tf_get_atomic_writes(*flags) == TRUE); + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); } if (use_data_dir) { From f023715fe8c3bc7c60f65cfd58e4980b4cc89560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 20 Dec 2013 06:50:58 +0200 Subject: [PATCH 03/56] Need to disable fast file extension with posix_fallocate for Fusion-io currently. --- storage/innobase/fil/fil0fil.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 2bf5922e07d..0939598d90d 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -4859,6 +4859,8 @@ retry: start_page_no = space->size; file_start_page_no = space->size - node->size; + /* JAN: TODO: Need to disable fast file extension for Fusion-io + currently. #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { ulint n_pages = size_after_extend - start_page_no; @@ -4875,6 +4877,7 @@ retry: goto complete_io; } #endif + */ /* Extend at most 64 pages at a time */ buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; From f6a196555e639489a7e1987eb88c67827f468a9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 20 Dec 2013 08:59:34 +0200 Subject: [PATCH 04/56] Temporally disable posix_fallocate on os_file_set_size because currently Fusion-io SSD drive does not support setting file size without fysically writing pages with zeroes when fallocate with PUCH_HOLE is used. Added additional error message if atomic write setup does not succeed. 
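To make the first point concrete: posix_fallocate() only reserves blocks, so on the Fusion-io setup described above the newly added pages still have to be written out as zeroes before they can be used, which is what the non-fallocate path in os_file_set_size() keeps doing. A minimal stand-alone sketch of that zero-fill fallback follows; the page size constant, the function name and the error handling are illustrative only and are not part of this patch.

#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

#define FAKE_PAGE_SIZE	16384		/* stands in for UNIV_PAGE_SIZE */

/* Extend an open file from cur_size to new_size bytes by physically
writing zero-filled pages, i.e. the path taken when posix_fallocate()
is not used (or, as in this patch, is temporarily disabled). */
static int
extend_with_zeroes(int fd, off_t cur_size, off_t new_size)
{
	char*	zeroes = calloc(1, FAKE_PAGE_SIZE);
	off_t	offset = cur_size;

	if (zeroes == NULL) {
		return(-1);
	}

	while (offset < new_size) {
		if (pwrite(fd, zeroes, FAKE_PAGE_SIZE, offset)
		    != FAKE_PAGE_SIZE) {
			free(zeroes);
			return(-1);
		}
		offset += FAKE_PAGE_SIZE;
	}

	free(zeroes);
	return(fsync(fd));
}

The real os_file_set_size() batches these writes (up to one megabyte at a time, as the comment in the hunk below notes) rather than going page by page, but the observable effect is the same: every page of the extension exists on disk as zeroes.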
--- storage/innobase/os/os0file.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 60331f9c483..4ce5646b379 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -386,6 +386,9 @@ os_file_set_atomic_writes( if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform! Please restart with " + "innodb_use_atomic_writes disabled.\n", name); os_file_handle_error_no_exit(name, "ioctl", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2285,6 +2288,8 @@ os_file_set_size( current_size = 0; + /* JAN: TODO: Disable posix_fallocate file extension for Fusion-io + because currently it assumes that pages are initialized by zeroes #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { @@ -2300,6 +2305,7 @@ os_file_set_size( return(TRUE); } #endif + */ /* Write up to 1 megabyte at a time. */ From e80f2468b468540c27e9b7174769262297bffc13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 9 Jan 2014 08:30:09 +0200 Subject: [PATCH 05/56] Fixed issues with atomic writes and compressed pages. Temporal solution: In directFS using atomic writes we must use posix_fallocate to extend the file because pwrite past end of file fails but when compression is used the file pages must be physically initialized with zeroes, thus after file extend with posix_fallocate we still write empty pages to file. --- storage/innobase/fil/fil0fil.cc | 36 +++++++++++++++++++++++---------- storage/innobase/os/os0file.cc | 4 ---- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 0939598d90d..8a416d09c94 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -2,7 +2,6 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2013 SkySQL Ab. All Rights Reserved. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -4828,6 +4827,7 @@ retry: } page_size = fsp_flags_get_zip_size(space->flags); + if (!page_size) { page_size = UNIV_PAGE_SIZE; } @@ -4859,8 +4859,6 @@ retry: start_page_no = space->size; file_start_page_no = space->size - node->size; - /* JAN: TODO: Need to disable fast file extension for Fusion-io - currently. #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { ulint n_pages = size_after_extend - start_page_no; @@ -4868,16 +4866,37 @@ retry: success = os_file_set_size(node->name, node->handle, n_pages * page_size); + /* Temporal solution: In directFS using atomic writes + we must use posix_fallocate to extend the file because + pwrite past end of file fails but when compression is + used the file pages must be physically initialized with + zeroes, thus after file extend with posix_fallocate + we still write empty pages to file. */ + if (success && + srv_use_atomic_writes && + srv_compress_pages) { + goto extend_file; + } + mutex_enter(&fil_system->mutex); + if (success) { node->size += n_pages; space->size += n_pages; os_has_said_disk_full = FALSE; } + + /* If posix_fallocate was used to extent the file space + we need to complete the io. Because no actual writes were + dispatched read operation is enough here. 
Without this + there will be assertion at shutdown indicating that + all IO is not completed. */ + fil_node_complete_io(node, fil_system, OS_FILE_READ); goto complete_io; } #endif - */ + +extend_file: /* Extend at most 64 pages at a time */ buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; @@ -4932,15 +4951,10 @@ retry: space->size += pages_added; node->size += pages_added; - node->being_extended = FALSE; -#ifdef HAVE_POSIX_FALLOCATE + fil_node_complete_io(node, fil_system, OS_FILE_WRITE); + complete_io: - fil_node_complete_io(node, fil_system, OS_FILE_READ); -#else - fil_node_complete_io(node, fil_system, OS_FILE_WRITE); -#endif - node->being_extended = FALSE; *actual_size = space->size; diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 4ce5646b379..9f12ca86601 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2288,8 +2288,6 @@ os_file_set_size( current_size = 0; - /* JAN: TODO: Disable posix_fallocate file extension for Fusion-io - because currently it assumes that pages are initialized by zeroes #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { @@ -2305,8 +2303,6 @@ os_file_set_size( return(TRUE); } #endif - */ - /* Write up to 1 megabyte at a time. */ buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) From 2b5a0a22802a0069f318f7d23a1071a703930c90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 9 Jan 2014 12:33:29 +0200 Subject: [PATCH 06/56] Feature: In first write if we trim we set write_size to actual bytes written and rest of the page is trimmed. In following writes there is no need to trim again if write_size only increases because rest of the page is already trimmed. If actual write size decreases we need to trim again. Need to research if this can happen frequently enough to make any effect. --- storage/innobase/buf/buf0dblwr.cc | 2 +- storage/innobase/buf/buf0flu.cc | 2 +- storage/innobase/fil/fil0fil.cc | 6 ++--- storage/innobase/include/fil0fil.h | 2 +- storage/innobase/include/os0file.h | 8 +++++-- storage/innobase/include/os0file.ic | 8 +++++-- storage/innobase/os/os0file.cc | 37 ++++++++++++++++++++--------- 7 files changed, 44 insertions(+), 21 deletions(-) diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 933b56eaf88..2ae67d8a41e 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -728,7 +728,7 @@ buf_dblwr_write_block_to_datafile( fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block, 0); + (void*) block->frame, (void*) block, (ulint *)&bpage->write_size); } /********************************************************************//** diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 06ae7b5375c..b5f1aeef597 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -942,7 +942,7 @@ buf_flush_write_block_low( FALSE, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? 
zip_size : UNIV_PAGE_SIZE, - frame, bpage, 0); + frame, bpage, &bpage->write_size); } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { buf_dblwr_write_single_page(bpage); } else { diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 8a416d09c94..0bec85c699a 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -441,7 +441,7 @@ fil_read( in aio this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ - ulint write_size) /*!< in/out: Actual write size initialized + ulint* write_size) /*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if initialized we do not trim again if @@ -475,7 +475,7 @@ fil_write( this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ - ulint write_size) /*!< in/out: Actual write size initialized + ulint* write_size) /*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if initialized we do not trim again if @@ -5288,7 +5288,7 @@ fil_io( appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ - ulint write_size) /*!< in/out: Actual write size initialized + ulint* write_size) /*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if initialized we do not trim again if diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index c5edd33f46b..01084d52365 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -753,7 +753,7 @@ fil_io( appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ - ulint write_size) /*!< in/out: Actual write size initialized + ulint* write_size) /*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if initialized we do not trim again if diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 3c70f9925fe..eb5e1dddaf5 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -724,7 +724,11 @@ pfs_os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - ibool atomic_writes, /*!write_size > 0 && len >= slot->write_size)) { + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { - if (slot->write_size > 0 && len >= slot->write_size) { +#ifdef UNIV_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + *slot->write_size, trim_len, len); +#endif + + if (*slot->write_size > 0 && len >= *slot->write_size) { srv_stats.page_compressed_trim_op_saved.inc(); } - slot->write_size = len; + *slot->write_size = len; return (TRUE); } @@ -6191,11 +6198,15 @@ os_file_trim( " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", FALSE, __FILE__, __LINE__); - slot->write_size = 0; + if (slot->write_size) { + *slot->write_size = 0; + } return (FALSE); } else { - slot->write_size = len; + if (slot->write_size) { + *slot->write_size = len; + } } #else ut_print_timestamp(stderr); @@ -6203,7 +6214,7 @@ os_file_trim( " InnoDB: [Warning] fallocate not supported on this installation." " InnoDB: Disabling fallocate for now."); os_fallocate_failed = TRUE; - slot->write_size = 0; + slot->write_size = NULL; #endif /* HAVE_FALLOCATE ... 
*/ @@ -6229,10 +6240,14 @@ os_file_trim( " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", FALSE, __FILE__, __LINE__); - slot->write_size = 0; + if (slot->write_size) { + slot->write_size = 0; + } return (FALSE); } else { - slot->write_size = len; + if (slot->write_size) { + slot->write_size = len; + } } #endif From ec8257216e5b25ed82d63f074254b9454e0a0df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 10 Jan 2014 12:11:36 +0200 Subject: [PATCH 07/56] Enhancement: Change atomic_writes table option to enum type. Now every file can either use atomic writes, not use it or use default. SYNTAX: ATOMIC_WRITES=['DEFAULT','ON','OFF'] Idea here is to be able to define innodb_doublewrite = 1 but with following rules: ATOMIC_WRITES='DEFAULT' - if innodb_use_atomic_writes = 1, we do not write to doublewrite buffer the changes if innodb_use_atomic_writes = 0, we write to doublewrite buffer ATOMIC_WRITES='ON' - do not write to doublewrite buffer ATOMIC_WRITES='OFF' - write to doublewrite buffer Note that doublewrite buffer can't be used if innodb_doublewrite = 0. --- storage/innobase/buf/buf0flu.cc | 24 +++++++- storage/innobase/fil/fil0fil.cc | 10 ++-- storage/innobase/fil/fil0pagecompress.cc | 8 +-- storage/innobase/handler/ha_innodb.cc | 8 ++- storage/innobase/handler/ha_innodb.h | 8 ++- storage/innobase/handler/handler0alter.cc | 2 +- storage/innobase/include/dict0dict.h | 4 +- storage/innobase/include/dict0dict.ic | 15 +++-- storage/innobase/include/dict0mem.h | 3 +- storage/innobase/include/dict0pagecompress.h | 4 +- storage/innobase/include/dict0pagecompress.ic | 12 ++-- storage/innobase/include/dict0types.h | 8 +++ storage/innobase/include/fil0pagecompress.h | 4 +- storage/innobase/include/fsp0fsp.h | 2 +- storage/innobase/include/fsp0fsp.ic | 5 +- storage/innobase/include/fsp0pagecompress.h | 9 +++ storage/innobase/include/fsp0pagecompress.ic | 4 +- storage/innobase/include/os0file.h | 34 ++++++----- storage/innobase/include/os0file.ic | 9 +-- storage/innobase/os/os0file.cc | 58 ++++++++----------- 20 files changed, 137 insertions(+), 94 deletions(-) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index b5f1aeef597..d159ddbe23f 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -50,6 +50,7 @@ Created 11/11/1995 Heikki Tuuri #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /** Number of pages flushed through non flush_list flushes. */ static ulint buf_lru_flush_page_count = 0; @@ -866,6 +867,8 @@ buf_flush_write_block_low( { ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; + ulint space_id = buf_page_get_space(bpage); + atomic_writes_t awrites = fil_space_get_atomic_writes(space_id); #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -943,10 +946,25 @@ buf_flush_write_block_low( buf_page_get_page_no(bpage), 0, zip_size ? zip_size : UNIV_PAGE_SIZE, frame, bpage, &bpage->write_size); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage); } else { - buf_dblwr_add_to_batch(bpage); + /* InnoDB uses doublewrite buffer and doublewrite buffer + is initialized. User can define do we use atomic writes + on a file space (table) or not. If atomic writes are + not used we should use doublewrite buffer and if + atomic writes should be used, no doublewrite buffer + is used. 
*/ + + if (awrites == ATOMIC_WRITES_ON) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage, &bpage->write_size); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage); + } else { + buf_dblwr_add_to_batch(bpage); + } } } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 0bec85c699a..2f56936ae04 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -756,7 +756,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success, FALSE); + OS_FILE_READ_ONLY, &success, 0); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -3159,7 +3159,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, FALSE); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); if (!success) { /* The following call will print an error message */ @@ -3269,8 +3269,8 @@ fil_open_linked_file( const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ os_file_t* remote_file, /*!< out: remote file handle */ - ibool atomic_writes) /*!< in: should atomic writes be - used */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ { ibool success; @@ -4861,7 +4861,7 @@ retry: #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { - ulint n_pages = size_after_extend - start_page_no; + ulint n_pages = size_after_extend; success = os_file_set_size(node->name, node->handle, n_pages * page_size); diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 3926b23c677..b67f583b53b 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -350,8 +350,8 @@ fil_get_compression_alg_name( /*******************************************************************//** Returns the atomic writes flag of the space, or false if the space is not using atomic writes. The tablespace must be cached in the memory cache. 
-@return true if space using atomic writes, false if not */ -ibool +@return atomic writes table option value */ +atomic_writes_t fil_space_get_atomic_writes( /*========================*/ ulint id) /*!< in: space id */ @@ -362,8 +362,8 @@ fil_space_get_atomic_writes( if (flags && flags != ULINT_UNDEFINED) { - return(fsp_flags_get_atomic_writes(flags)); + return((atomic_writes_t)fsp_flags_get_atomic_writes(flags)); } - return(flags); + return((atomic_writes_t)0); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 085521ac7e5..074f8c3fc2c 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -503,7 +503,7 @@ ha_create_table_option innodb_table_option_list[]= compression for this table*/ HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), /* With this option user can enable atomic writes feature for this table */ - HA_TOPTION_BOOL("ATOMIC_WRITES", atomic_writes, 0), + HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), HA_TOPTION_END }; @@ -9738,6 +9738,7 @@ ha_innobase::check_table_options( { enum row_type row_format = table->s->row_type;; ha_table_option_struct *options= table->s->option_struct; + atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; /* Check page compression requirements */ if (options->page_compressed) { @@ -9811,8 +9812,9 @@ ha_innobase::check_table_options( } /* Check atomic writes requirements */ - if (options->atomic_writes) { - if (!srv_use_atomic_writes && !use_tablespace) { + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!use_tablespace) { push_warning( thd, Sql_condition::WARN_LEVEL_WARN, HA_WRONG_CREATE_OPTION, diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 5eb460072bb..9e133ea1023 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -64,8 +64,12 @@ struct ha_table_option_struct if this option is true. */ int page_compression_level; /*!< Table page compression level or UNIV_UNSPECIFIED. */ - bool atomic_writes; /*!< Use atomic writes for this - table if this options is true. */ + uint atomic_writes; /*!< Use atomic writes for this + table if this options is ON or + in DEFAULT if + srv_use_atomic_writes=1. 
+ Atomic writes are not used if + value OFF.*/ }; diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 49f8a05d11a..244e7d19586 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -258,7 +258,7 @@ ha_innobase::check_if_supported_inplace_alter( if (new_options->page_compressed != old_options->page_compressed || new_options->page_compression_level != old_options->page_compression_level || - new_options->atomic_writes != old_options->page_compression_level) { + new_options->atomic_writes != old_options->atomic_writes) { ha_alter_info->unsupported_reason = innobase_get_err_msg( ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 0ca64956a2e..3208a764fe1 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -887,8 +887,8 @@ dict_tf_set( pages */ ulint page_compression_level, /*!< in: table page compression level */ - bool atomic_writes) /*!< in: table uses atomic - writes */ + ulint atomic_writes) /*!< in: table atomic + writes option value*/ __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 65c1bfca24f..f9d548681a8 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -670,6 +670,7 @@ dict_sys_tables_type_validate( ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; /* The low order bit of SYS_TABLES.TYPE is always set to 1. 
If the format is UNIV_FORMAT_B or higher, this field is the same @@ -734,7 +735,8 @@ dict_sys_tables_type_validate( } } - if (atomic_writes) { + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { if (!atomic_blobs) { return(ULINT_UNDEFINED); } @@ -818,9 +820,10 @@ dict_tf_set( pages */ ulint page_compression_level, /*!< in: table page compression level */ - bool atomic_writes) /*!< in: table uses atomic - writes */ + ulint atomic_writes) /*!< in: table atomic writes setup */ { + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + switch (format) { case REC_FORMAT_REDUNDANT: *flags = 0; @@ -853,9 +856,9 @@ dict_tf_set( ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); } - if (atomic_writes) { - *flags |= (1 << DICT_TF_POS_ATOMIC_WRITES); - ut_ad(dict_tf_get_atomic_writes(*flags) == TRUE); + if (awrites != ATOMIC_WRITES_DEFAULT) { + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); } diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 6cfcb81bcd5..f4e5e558488 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -129,8 +129,9 @@ Width of the page compression flag /** Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 */ -#define DICT_TF_WIDTH_ATOMIC_WRITES 1 +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h index 236924758f1..19a2a6c52f3 100644 --- a/storage/innobase/include/dict0pagecompress.h +++ b/storage/innobase/include/dict0pagecompress.h @@ -71,7 +71,7 @@ dict_tf_verify_flags( Extract the atomic writes flag from table flags. @return true if atomic writes are used, false if not used */ UNIV_INLINE -ibool +atomic_writes_t dict_tf_get_atomic_writes( /*======================*/ ulint flags) /*!< in: flags */ @@ -81,7 +81,7 @@ dict_tf_get_atomic_writes( Check whether the table uses the atomic writes. @return true if atomic writes is used, false if not */ UNIV_INLINE -ibool +atomic_writes_t dict_table_get_atomic_writes( /*=========================*/ const dict_table_t* table); /*!< in: table */ diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic index 98b64723542..fb9581fc657 100644 --- a/storage/innobase/include/dict0pagecompress.ic +++ b/storage/innobase/include/dict0pagecompress.ic @@ -168,24 +168,24 @@ dict_table_is_page_compressed( /********************************************************************//** Extract the atomic writes flag from table flags. -@return true if atomic writes are used, false if not used */ +@return enumerated value of atomic writes */ UNIV_INLINE -ibool +atomic_writes_t dict_tf_get_atomic_writes( /*======================*/ ulint flags) /*!< in: flags */ { - return(DICT_TF_GET_ATOMIC_WRITES(flags)); + return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags)); } /********************************************************************//** Check whether the table uses the atomic writes. 
-@return true if atomic writes is used, false if not */ +@return enumerated value of atomic writes */ UNIV_INLINE -ibool +atomic_writes_t dict_table_get_atomic_writes( /*=========================*/ const dict_table_t* table) /*!< in: table */ { - return (dict_tf_get_atomic_writes(table->flags)); + return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags)); } diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index b7f7c2d9df9..a398ccfe7ea 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -67,4 +68,11 @@ enum ib_quiesce_t { QUIESCE_COMPLETE /*!< All done */ }; +/** Enum values for atomic_writes table option */ +typedef enum { + ATOMIC_WRITES_DEFAULT = 0, + ATOMIC_WRITES_ON = 1, + ATOMIC_WRITES_OFF = 2 +} atomic_writes_t; + #endif diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h index e21eae7a5ee..bf5caf98a75 100644 --- a/storage/innobase/include/fil0pagecompress.h +++ b/storage/innobase/include/fil0pagecompress.h @@ -49,8 +49,8 @@ fil_space_is_page_compressed( /*******************************************************************//** Returns the atomic writes flag of the space, or false if the space is not using atomic writes. The tablespace must be cached in the memory cache. -@return true if space using atomic writes, false if not */ -ibool +@return atomic write table option value */ +atomic_writes_t fil_space_get_atomic_writes( /*=========================*/ ulint id); /*!< in: space id */ diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 31c34cdafca..87f1f5a636d 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -58,7 +58,7 @@ is found in a remote location, not the default data directory. 
*/ #define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1 #define FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL 4 /** Number of flag bits used to indicate atomic writes for this tablespace */ -#define FSP_FLAGS_WIDTH_ATOMIC_WRITES 1 +#define FSP_FLAGS_WIDTH_ATOMIC_WRITES 2 /** Width of all the currently known tablespace flags */ #define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 0ca02a5652d..cb12d556ec4 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -67,6 +67,7 @@ fsp_flags_is_valid( ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); @@ -116,7 +117,9 @@ fsp_flags_is_valid( } } - if (atomic_writes && !atomic_blobs) { + if ((awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) + && !atomic_blobs) { return (false); } diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h index 417d4a6879e..4913f1d6b29 100644 --- a/storage/innobase/include/fsp0pagecompress.h +++ b/storage/innobase/include/fsp0pagecompress.h @@ -57,6 +57,15 @@ fsp_flags_get_page_compression_level( /*=================================*/ ulint flags); /*!< in: tablespace flags */ +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags); /*!< in: tablespace flags */ + #ifndef UNIV_NONINL #include "fsp0pagecompress.ic" #endif diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic index 1dffd1bedf1..4859012428a 100644 --- a/storage/innobase/include/fsp0pagecompress.ic +++ b/storage/innobase/include/fsp0pagecompress.ic @@ -52,10 +52,10 @@ fsp_flags_get_page_compression_level( Determine the tablespace is using atomic writes from dict_table_t::flags. @return true if atomic writes is used, false if not */ UNIV_INLINE -ibool +atomic_writes_t fsp_flags_get_atomic_writes( /*========================*/ ulint flags) /*!< in: tablespace flags */ { - return(FSP_FLAGS_GET_ATOMIC_WRITES(flags)); + return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags)); } diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index eb5e1dddaf5..8b798b6d34f 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -275,12 +275,12 @@ The wrapper functions have the prefix of "innodb_". 
*/ pfs_os_file_create_func(key, name, create, purpose, type, \ success, atomic_writes, __FILE__, __LINE__) -# define os_file_create_simple(key, name, create, access, success, atomic_writes) \ +# define os_file_create_simple(key, name, create, access, success) \ pfs_os_file_create_simple_func(key, name, create, access, \ - success, atomic_writes, __FILE__, __LINE__) + success, __FILE__, __LINE__) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success, atomic_writes) \ + key, name, create_mode, access, success, atomic_writes) \ pfs_os_file_create_simple_no_error_handling_func( \ key, name, create_mode, access, success, atomic_writes, __FILE__, __LINE__) @@ -315,13 +315,13 @@ to original un-instrumented file I/O APIs */ # define os_file_create(key, name, create, purpose, type, success, atomic_writes) \ os_file_create_func(name, create, purpose, type, success, atomic_writes) -# define os_file_create_simple(key, name, create_mode, access, success, atomic_writes) \ - os_file_create_simple_func(name, create_mode, access, success, atomic_writes) +# define os_file_create_simple(key, name, create_mode, access, success) \ + os_file_create_simple_func(name, create_mode, access, success) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success, atomic_writes) \ - os_file_create_simple_no_error_handling_func( \ - name, create_mode, access, success, atomic_writes) + key, name, create_mode, access, success, atomic_writes) \ + os_file_create_simple_no_error_handling_func( \ + name, create_mode, access, success, atomic_writes) # define os_file_close(file) os_file_close_func(file) @@ -470,8 +470,7 @@ os_file_create_simple_func( ulint create_mode,/*!< in: create mode */ ulint access_type,/*!< in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ - ibool* success,/*!< out: TRUE if succeed, FALSE if error */ - ibool atomic_writes); /*! Date: Mon, 13 Jan 2014 15:02:31 +0200 Subject: [PATCH 08/56] Removed some unnecessary assertions to debug build and enhanced the page_compression and page_compression_level fetch. 
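A note on the first half of that sentence: the assertions are not removed outright, they are demoted from ut_a() to ut_ad(), and ut_ad() is only compiled in under UNIV_DEBUG. A simplified illustration of the difference follows; these are not the literal InnoDB definitions, which live in ut0dbg.h and also report file and line.

/* Always-on assertion: checked in both release and debug builds. */
#define ut_a(EXPR)	do { if (!(EXPR)) abort(); } while (0)

#ifdef UNIV_DEBUG
# define ut_ad(EXPR)	ut_a(EXPR)	/* active only in debug builds */
#else
# define ut_ad(EXPR)	((void) 0)	/* compiled out in release builds */
#endif

So turning the argument checks at the top of fil_compress_page() and fil_decompress_page() into ut_ad() makes them free in production builds while keeping them for debug testing.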
--- storage/innobase/fil/fil0fil.cc | 10 +- storage/innobase/fil/fil0pagecompress.cc | 119 ++----------------- storage/innobase/include/fil0pagecompress.h | 1 + storage/innobase/include/fsp0pagecompress.ic | 112 +++++++++++++++++ storage/innobase/include/os0file.h | 18 ++- storage/innobase/include/os0file.ic | 7 +- storage/innobase/os/os0file.cc | 31 +++-- 7 files changed, 171 insertions(+), 127 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 2f56936ae04..1718e68d667 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -4920,7 +4920,7 @@ extend_file: success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL, 0); + NULL, NULL, 0, FALSE, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5302,6 +5302,8 @@ fil_io( ulint wake_later; os_offset_t offset; ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ibool page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5462,6 +5464,9 @@ fil_io( ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + page_compressed = fsp_flags_is_page_compressed(space->flags); + page_compression_level = fsp_flags_get_page_compression_level(space->flags); + #ifdef UNIV_HOTBACKUP /* In ibbackup do normal i/o, not aio */ if (type == OS_FILE_READ) { @@ -5474,7 +5479,8 @@ fil_io( #else /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message, write_size); + offset, len, node, message, write_size, + page_compressed, page_compression_level); #endif /* UNIV_HOTBACKUP */ ut_a(ret); diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index b67f583b53b..2da9d70e197 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -77,6 +77,7 @@ fil_compress_page( this must be appropriately aligned */ byte* out_buf, /*!< out: compressed buffer */ ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /* in: compression level */ ulint* out_len) /*!< out: actual length of compressed page */ { int err = Z_OK; @@ -84,13 +85,13 @@ fil_compress_page( ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; ulint write_size=0; - ut_a(buf); - ut_a(out_buf); - ut_a(len); - ut_a(out_len); + ut_ad(buf); + ut_ad(out_buf); + ut_ad(len); + ut_ad(out_len); - level = fil_space_get_page_compression_level(space_id); - ut_a(fil_space_is_page_compressed(space_id)); + level = compression_level; + ut_ad(fil_space_is_page_compressed(space_id)); fil_system_enter(); fil_space_t* space = fil_space_get_by_id(space_id); @@ -181,8 +182,8 @@ fil_decompress_page( ulint compression_alg = 0; byte *in_buf; - ut_a(buf); - ut_a(len); + ut_ad(buf); + ut_ad(len); /* Before actual decompress, make sure that page type is correct */ @@ -264,106 +265,4 @@ fil_decompress_page( } } -/*******************************************************************//** -Find out wheather the page is index page or not -@return true if page type index page, false if not */ -ibool -fil_page_is_index_page( -/*===================*/ - byte *buf) /*!< in: page */ -{ - return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX); -} -/*******************************************************************//** -Find out wheather the page is page compressed -@return true if page is page compressed, false if not */ -ibool 
-fil_page_is_compressed( -/*===================*/ - byte *buf) /*!< in: page */ -{ - return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); -} - -/*******************************************************************//** -Returns the page compression level of the space, or 0 if the space -is not compressed. The tablespace must be cached in the memory cache. -@return page compression level, ULINT_UNDEFINED if space not found */ -ulint -fil_space_get_page_compression_level( -/*=================================*/ - ulint id) /*!< in: space id */ -{ - ulint flags; - - flags = fil_space_get_flags(id); - - if (flags && flags != ULINT_UNDEFINED) { - - return(fsp_flags_get_page_compression_level(flags)); - } - - return(flags); -} - -/*******************************************************************//** -Extract the page compression from space. -@return true if space is page compressed, false if space is not found -or space is not page compressed. */ -ibool -fil_space_is_page_compressed( -/*=========================*/ - ulint id) /*!< in: space id */ -{ - ulint flags; - - flags = fil_space_get_flags(id); - - if (flags && flags != ULINT_UNDEFINED) { - - return(fsp_flags_is_page_compressed(flags)); - } - - return(flags); -} - -/****************************************************************//** -Get the name of the compression algorithm used for page -compression. -@return compression algorithm name or "UNKNOWN" if not known*/ -const char* -fil_get_compression_alg_name( -/*=========================*/ - ulint comp_alg) /*!io_already_done = FALSE; slot->page_compress_success = FALSE; slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; /* If the space is page compressed and this is write operation and if either only index pages compression is disabled or @@ -4456,7 +4465,7 @@ found: we compress the page */ if (message1 && type == OS_FILE_WRITE && - fil_space_is_page_compressed(fil_node_get_space_id(slot->message1)) && + page_compression && (srv_page_compress_index_pages == false || (srv_page_compress_index_pages == true && fil_page_is_index_page(slot->buf)))) { ulint real_len = len; @@ -4477,7 +4486,7 @@ found: can't really avoid this now. */ memset(slot->page_buf, 0, len); - tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, &real_len); + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len); /* If compression succeeded, set up the length and buffer */ if (tmp != buf) { @@ -4773,11 +4782,15 @@ os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - ulint* write_size)/*!< in/out: Actual write size initialized + ulint* write_size,/*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if initialized we do not trim again if actual page size does not decrease. 
*/ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4875,7 +4888,7 @@ try_again: } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n, write_size); + name, buf, offset, n, write_size, page_compression, page_compression_level); if (type == OS_FILE_READ) { if (srv_use_native_aio) { @@ -5100,7 +5113,7 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: if (slot->message1 && - fil_space_is_page_compressed(fil_node_get_space_id(slot->message1)) && + page_compression && slot->page_buf) { ret = WriteFile(slot->file, slot->page_buf, (DWORD) slot->len, &len, @@ -5141,8 +5154,7 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } - if (slot->message1 && - fil_space_is_page_compressed(fil_node_get_space_id(slot->message1))) { + if (slot->message1 && page_compression) { // We allocate memory for page compressed buffer if and only // if it is not yet allocated. if (slot->page_buf == NULL) { @@ -5256,8 +5268,7 @@ retry: /* If the table is page compressed and this is read, we decompress before we annouce the read is complete. For writes, we free the compressed page. */ - if (slot->message1 && - fil_space_is_page_compressed(fil_node_get_space_id(slot->message1))) { + if (slot->message1 && slot->page_compression) { // We allocate memory for page compressed buffer if and only // if it is not yet allocated. if (slot->page_buf == NULL) { From 8c5d5bc5de135ed143bfe91c99fd53a8c9b4487c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Feb 2014 10:08:15 +0200 Subject: [PATCH 09/56] Fixed merge error on InnoDB page compression level handling. Merged page compression feature to XtraDB storage engine. Added feature where page compression can use lz4 compression method (innodb_use_lz4, default OFF). 
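The lz4 path added below relies on two calls from the bundled library, LZ4_compress_limitedOutput() when writing and LZ4_decompress_fast() when reading, alongside the existing zlib compress2() path. A small self-contained round-trip sketch of those two calls; the buffer sizes and the all-'x' payload are illustrative only, and the header is the copy added here as storage/innobase/fil/lz4.h.

#include <assert.h>
#include <string.h>
#include "lz4.h"

int main(void)
{
	char	page[16384];	/* stands in for one UNIV_PAGE_SIZE page */
	char	comp[16384];
	char	back[16384];
	int	comp_len;
	int	consumed;

	memset(page, 'x', sizeof(page));

	/* Returns the compressed length, or 0 if the output does not fit;
	the patch treats 0 like a zlib error and keeps the page
	uncompressed. */
	comp_len = LZ4_compress_limitedOutput(page, comp,
					      (int) sizeof(page),
					      (int) sizeof(comp));
	assert(comp_len > 0);

	/* Decompression is driven by the known original size; the return
	value is the number of compressed bytes consumed, which the patch
	compares against the payload length stored in the page header. */
	consumed = LZ4_decompress_fast(comp, back, (int) sizeof(page));
	assert(consumed == comp_len);
	assert(memcmp(page, back, sizeof(page)) == 0);

	return(0);
}

LZ4_decompress_fast() is the variant that trusts a caller-supplied original size (always UNIV_PAGE_SIZE here), so the payload length stored at FIL_PAGE_DATA is only needed afterwards to verify how many compressed bytes were actually consumed.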
--- storage/innobase/CMakeLists.txt | 1 + storage/innobase/btr/btr0btr.cc | 4 +- storage/innobase/btr/btr0cur.cc | 4 +- storage/innobase/fil/fil0fil.cc | 2 +- storage/innobase/fil/fil0pagecompress.cc | 186 +++-- storage/innobase/fil/lz4.c | 822 +++++++++++++++++++ storage/innobase/fil/lz4.h | 205 +++++ storage/innobase/handler/ha_innodb.cc | 44 +- storage/innobase/include/fil0fil.h | 1 + storage/innobase/include/fsp0pagecompress.ic | 5 +- storage/innobase/include/page0zip.h | 2 +- storage/innobase/include/srv0srv.h | 7 +- storage/innobase/page/page0cur.cc | 2 +- storage/innobase/page/page0page.cc | 6 +- storage/innobase/page/page0zip.cc | 4 +- storage/innobase/srv/srv0srv.cc | 18 +- storage/xtradb/CMakeLists.txt | 4 + storage/xtradb/buf/buf0buf.cc | 23 + storage/xtradb/buf/buf0dblwr.cc | 26 +- storage/xtradb/buf/buf0flu.cc | 349 +++++++- storage/xtradb/buf/buf0rea.cc | 5 +- storage/xtradb/dict/dict0dict.cc | 1 + storage/xtradb/fil/fil0fil.cc | 152 +++- storage/xtradb/fil/fil0pagecompress.cc | 324 ++++++++ storage/xtradb/fil/lz4.c | 822 +++++++++++++++++++ storage/xtradb/fil/lz4.h | 205 +++++ storage/xtradb/handler/ha_innodb.cc | 246 +++++- storage/xtradb/handler/ha_innodb.h | 18 + storage/xtradb/handler/handler0alter.cc | 28 + storage/xtradb/include/buf0buf.h | 21 + storage/xtradb/include/buf0flu.h | 7 + storage/xtradb/include/dict0dict.h | 12 +- storage/xtradb/include/dict0dict.ic | 164 +++- storage/xtradb/include/dict0mem.h | 56 +- storage/xtradb/include/dict0pagecompress.h | 94 +++ storage/xtradb/include/dict0pagecompress.ic | 191 +++++ storage/xtradb/include/dict0types.h | 9 + storage/xtradb/include/fil0fil.h | 43 +- storage/xtradb/include/fil0pagecompress.h | 118 +++ storage/xtradb/include/fsp0fsp.h | 68 +- storage/xtradb/include/fsp0fsp.ic | 19 + storage/xtradb/include/fsp0pagecompress.h | 73 ++ storage/xtradb/include/fsp0pagecompress.ic | 177 ++++ storage/xtradb/include/os0file.h | 69 +- storage/xtradb/include/os0file.ic | 26 +- storage/xtradb/include/srv0mon.h | 11 + storage/xtradb/include/srv0srv.h | 62 +- storage/xtradb/log/log0log.cc | 20 +- storage/xtradb/log/log0online.cc | 6 +- storage/xtradb/log/log0recv.cc | 19 +- storage/xtradb/os/os0file.cc | 553 +++++++++++-- storage/xtradb/srv/srv0mon.cc | 68 ++ storage/xtradb/srv/srv0srv.cc | 43 +- storage/xtradb/srv/srv0start.cc | 730 +++++++++++++++- 54 files changed, 5847 insertions(+), 328 deletions(-) create mode 100644 storage/innobase/fil/lz4.c create mode 100644 storage/innobase/fil/lz4.h create mode 100644 storage/xtradb/fil/fil0pagecompress.cc create mode 100644 storage/xtradb/fil/lz4.c create mode 100644 storage/xtradb/fil/lz4.h create mode 100644 storage/xtradb/include/dict0pagecompress.h create mode 100644 storage/xtradb/include/dict0pagecompress.ic create mode 100644 storage/xtradb/include/fil0pagecompress.h create mode 100644 storage/xtradb/include/fsp0pagecompress.h create mode 100644 storage/xtradb/include/fsp0pagecompress.ic diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index e41d2406bd2..0b1043bc421 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -294,6 +294,7 @@ SET(INNOBASE_SOURCES eval/eval0proc.cc fil/fil0fil.cc fil/fil0pagecompress.cc + fil/lz4.c fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index e3e127c3ace..3d7dc993146 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -1923,7 +1923,7 @@ btr_page_reorganize( dict_index_t* index, 
/*!< in: record descriptor */ mtr_t* mtr) /*!< in: mtr */ { - return(btr_page_reorganize_low(FALSE, page_compression_level, + return(btr_page_reorganize_low(FALSE, page_zip_level, block, index, mtr)); } #endif /* !UNIV_HOTBACKUP */ @@ -1942,7 +1942,7 @@ btr_parse_page_reorganize( buf_block_t* block, /*!< in: page to be reorganized, or NULL */ mtr_t* mtr) /*!< in: mtr or NULL */ { - ulint level = page_compression_level; + ulint level = page_zip_level; ut_ad(ptr && end_ptr); diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index ecc17188770..5feb1363867 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1844,7 +1844,7 @@ btr_cur_update_alloc_zip( /* Have a local copy of the variables as these can change dynamically. */ bool log_compressed = page_log_compressed_pages; - ulint compression_level = page_compression_level; + ulint compression_level = page_zip_level; page_t* page = buf_block_get_frame(block); ut_a(page_zip == buf_block_get_page_zip(block)); @@ -4334,7 +4334,7 @@ btr_store_big_rec_extern_fields( heap = mem_heap_create(250000); page_zip_set_alloc(&c_stream, heap); - err = deflateInit2(&c_stream, page_compression_level, + err = deflateInit2(&c_stream, page_zip_level, Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); ut_a(err == Z_OK); } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 1718e68d667..3803d0a93aa 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -5303,7 +5303,7 @@ fil_io( os_offset_t offset; ibool ignore_nonexistent_pages; ibool page_compressed = FALSE; - ibool page_compression_level = 0; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 2da9d70e197..10ac273955f 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013 SkySQL Ab. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,6 +63,7 @@ static ulint srv_data_read, srv_data_written; #include #endif #include "row0mysql.h" +#include "lz4.h" /****************************************************************//** For page compressed pages compress the page before actual write @@ -100,7 +101,7 @@ fil_compress_page( /* If no compression level was provided to this table, use system default level */ if (level == 0) { - level = srv_compress_zlib_level; + level = page_zip_level; } #ifdef UNIV_DEBUG @@ -110,60 +111,88 @@ fil_compress_page( #endif write_size = UNIV_PAGE_SIZE - header_len; - err = compress2(out_buf+header_len, &write_size, buf, len, level); - if (err != Z_OK) { - /* If error we leave the actual page as it was */ + if (srv_use_lz4) { + err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); + write_size = err; - fprintf(stderr, - "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", - space_id, fil_space_name(space), len, err, write_size); + if (err == 0) { + /* If error we leave the actual page as it was */ - *out_len = len; - return (buf); + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } } else { - /* Set up the page header */ - memcpy(out_buf, buf, FIL_PAGE_DATA); - /* Set up the checksum */ - mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); - /* Set up the correct page type */ - mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); - /* Set up the flush lsn to be compression algorithm */ - mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); - /* Set up the actual payload lenght */ - mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + err = compress2(out_buf+header_len, &write_size, buf, len, level); -#ifdef UNIV_DEBUG - /* Verify */ - ut_ad(fil_page_is_compressed(out_buf)); - ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); - ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); - ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); -#endif + if (err != Z_OK) { + /* If error we leave the actual page as it was */ - write_size+=header_len; - /* Actual write needs to be alligned on block size */ - if (write_size % OS_FILE_LOG_BLOCK_SIZE) { - write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); } - -#ifdef UNIV_DEBUG - fprintf(stderr, - "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", - space_id, fil_space_name(space), len, write_size); -#endif -#define SECT_SIZE 512 - srv_stats.page_compression_saved.add((len - write_size)); - if ((len - write_size) > 0) { - srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); - srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); - } - //srv_stats.page_compressed_trim_op.inc(); - srv_stats.pages_page_compressed.inc(); - *out_len = write_size; - - return(out_buf); } + + /* Set up the page header */ + memcpy(out_buf, buf, 
FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + if (srv_use_lz4) { + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); + } else { + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); + } + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + if (srv_use_lz4) { + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); + } else { + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); + } +#endif + + write_size+=header_len; + /* Actual write needs to be alligned on block size */ + if (write_size % OS_FILE_LOG_BLOCK_SIZE) { + write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif + +#define SECT_SIZE 512 + + srv_stats.page_compression_saved.add((len - write_size)); + if ((len - write_size) > 0) { + srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); + } + //srv_stats.page_compressed_trim_op.inc(); + srv_stats.pages_page_compressed.inc(); + *out_len = write_size; + + return(out_buf); + } /****************************************************************//** @@ -203,16 +232,30 @@ fil_decompress_page( /* Get compression algorithm */ compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); - if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { - // If no buffer was given, we need to allocate temporal buffer - if (page_buf == NULL) { - in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); - } else { - in_buf = page_buf; - } + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression buffer not given, allocating...\n"); +#endif + in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } - /* Get the actual size of compressed page */ - actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } + + if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { #ifdef UNIV_DEBUG fprintf(stderr, @@ -242,17 +285,19 @@ fil_decompress_page( "InnoDB: Note: Decompression succeeded for len %lu \n", len); #endif + } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); - /* Copy the uncompressed page to the buffer pool, not - 
really any other options. */ - memcpy(buf, in_buf, len); + if (err != actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); - // Need to free temporal buffer if no buffer was given - if (page_buf == NULL) { - ut_free(in_buf); + ut_error; } - - srv_stats.pages_page_decompressed.inc(); } else { fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" @@ -263,6 +308,17 @@ fil_decompress_page( fflush(stderr); ut_error; } + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } } diff --git a/storage/innobase/fil/lz4.c b/storage/innobase/fil/lz4.c new file mode 100644 index 00000000000..4e864de67d3 --- /dev/null +++ b/storage/innobase/fil/lz4.c @@ -0,0 +1,822 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2013, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : http://code.google.com/p/lz4/ + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +//************************************** +// Tuning parameters +//************************************** +// MEMORY_USAGE : +// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +// Increasing memory usage improves compression ratio +// Reduced memory usage can improve speed, due to cache effect +// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache +#define MEMORY_USAGE 14 + +// HEAPMODE : +// Select how default compression functions will allocate memory for their hash table, +// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). +#define HEAPMODE 0 + + +//************************************** +// CPU Feature Detection +//************************************** +// 32 or 64 bits ? 
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ + || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \ + || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \ + || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +// Little Endian or Big Endian ? +// Overwrite the #define below if you know your architecture endianess +#if defined (__GLIBC__) +# include +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define LZ4_BIG_ENDIAN 1 +# endif +#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) +# define LZ4_BIG_ENDIAN 1 +#elif defined(__sparc) || defined(__sparc__) \ + || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ + || defined(__hpux) || defined(__hppa) \ + || defined(_MIPSEB) || defined(__s390__) +# define LZ4_BIG_ENDIAN 1 +#else +// Little Endian assumed. PDP Endian and other very rare endian format are unsupported. +#endif + +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. +// For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +// Define this parameter if your target system or compiler does not support hardware bit count +#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count +# define LZ4_FORCE_SW_BITCOUNT +#endif + +// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : +// This option may provide a small boost to performance for some big endian cpu, although probably modest. +// You may set this option to 1 if data will remain within closed environment. 
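A quick worked example of the MEMORY_USAGE formula quoted above may help; the numbers below simply instantiate the defaults defined later in this file (LZ4_HASHLOG, HASHNBCELLS4) and are an illustration, not part of the imported LZ4 sources:

    /* MEMORY_USAGE = 14 sizes the match-finder hash table at 2^14 bytes. */
    enum {
        memory_usage = 14,                 /* default above                 */
        hash_log     = memory_usage - 2,   /* LZ4_HASHLOG                   */
        hash_cells   = 1 << hash_log,      /* HASHNBCELLS4 = 4096 cells     */
        table_bytes  = hash_cells * 4      /* 4-byte (U32) cells -> 16 KB,
                                              the L1-cache-friendly default */
    };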
+// This option is useless on Little_Endian CPU (such as x86) +//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 + + +//************************************** +// Compiler Options +//************************************** +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99 +/* "restrict" is a known keyword */ +#else +# define restrict // Disable restrict +#endif + +#ifdef _MSC_VER // Visual Studio +# define FORCE_INLINE static __forceinline +# include // For Visual 2005 +# if LZ4_ARCH64 // 64-bits +# pragma intrinsic(_BitScanForward64) // For Visual 2005 +# pragma intrinsic(_BitScanReverse64) // For Visual 2005 +# else // 32-bits +# pragma intrinsic(_BitScanForward) // For Visual 2005 +# pragma intrinsic(_BitScanReverse) // For Visual 2005 +# endif +# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant +#else +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +#endif + +#ifdef _MSC_VER +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +//************************************** +// Memory routines +//************************************** +#include // malloc, calloc, free +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include // memset, memcpy +#define MEM_INIT memset + + +//************************************** +// Includes +//************************************** +#include "lz4.h" + + +//************************************** +// Basic Types +//************************************** +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct { U16 v; } _PACKED U16_S; +typedef struct { U32 v; } _PACKED U32_S; +typedef struct { U64 v; } _PACKED U64_S; +typedef struct {size_t v;} _PACKED size_t_S; + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(0) +# else +# pragma pack(pop) +# endif +#endif + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) +#define AARCH(x) (((size_t_S *)(x))->v) + + +//************************************** +// Constants +//************************************** +#define LZ4_HASHLOG (MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << MEMORY_USAGE) +#define HASHNBCELLS4 (1 << LZ4_HASHLOG) + +#define MINMATCH 4 + +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT 
(COPYLENGTH+MINMATCH) +const int LZ4_minLength = (MFLIMIT+1); + +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1)) +#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<=e; + + +//**************************** +// Private functions +//**************************** +#if LZ4_ARCH64 + +FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +# else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif +# endif +} + +#else + +FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +# else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif +# endif +} + +#endif + + +//**************************** +// Compression functions +//**************************** +FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } + +FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } + } +} + +FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, 
const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } // default, to ensure a return +} + +FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + + +FORCE_INLINE int LZ4_compress_generic( + void* ctx, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + + limitedOutput_directive limitedOutput, + tableType_t tableType, + prefix64k_directive prefix) +{ + const BYTE* ip = (const BYTE*) source; + const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source; + const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source); + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + // Init conditions + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative) + if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block + if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit + if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit) + if (inputSize> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if unlikely(forwardIp > mflimit) { goto _last_literals; } + + forwardH = LZ4_hashPosition(forwardIp, tableType); + ref = LZ4_getPositionOnHash(h, ctx, tableType, base); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip))); + + // Catch up + while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(length<>8) > oend)) return 0; // Check output limit + if (length>=(int)ML_MASK) + { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; } + if (length >= 255) { length-=255; *op++ = 255; } + *op++ = (BYTE)length; + } + else *token += (BYTE)(length); + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + LZ4_putPosition(ip-2, ctx, tableType, base); + + // Test next position + ref = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == 
A32(ip))) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_hashPosition(ip, tableType); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun<hashTable, 0, sizeof(lz4ds->hashTable)); + lz4ds->bufferStart = base; + lz4ds->base = base; + lz4ds->nextBlock = base; +} + + +void* LZ4_create (const char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(1, sizeof(LZ4_Data_Structure)); + LZ4_init ((LZ4_Data_Structure*)lz4ds, (const BYTE*)inputBuffer); + return lz4ds; +} + + +int LZ4_free (void* LZ4_Data) +{ + FREEMEM(LZ4_Data); + return (0); +} + + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data; + size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB); + + if ( (lz4ds->base - delta > lz4ds->base) // underflow control + || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit + { + size_t deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base; + int nH; + + for (nH=0; nH < HASHNBCELLS4; nH++) + { + if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0; + else lz4ds->hashTable[nH] -= (U32)deltaLimit; + } + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->base = lz4ds->bufferStart; + lz4ds->nextBlock = lz4ds->base + 64 KB; + } + else + { + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->nextBlock -= delta; + lz4ds->base -= delta; + } + + return (char*)(lz4ds->nextBlock); +} + + +//**************************** +// Decompression functions +//**************************** + +// This generic decompression function cover all use cases. +// It shall be instanciated several times, using different sets of directives +// Note that it is essential this generic function is really inlined, +// in order to remove useless branches during compilation optimisation. +FORCE_INLINE int LZ4_decompress_generic( + const char* source, + char* dest, + int inputSize, // + int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer. + + int endOnInput, // endOnOutputSize, endOnInputSize + int prefix64k, // noPrefix, withPrefix + int partialDecoding, // full, partial + int targetOutputSize // only used if partialDecoding==partial + ) +{ + // Local Variables + const BYTE* restrict ip = (const BYTE*) source; + const BYTE* ref; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + + const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64 + static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + + // Special cases + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything + if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; // Empty output buffer + if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1); + + + // Main Loop + while (1) + { + unsigned token; + size_t length; + + // get runlength + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s=255; + while (((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer + if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed + } + memcpy(op, ip, length); + ip += length; + op += length; + break; // Necessarily EOF, due to parsing restrictions + } + LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) + { + while ((!endOnInput) || (ipoend-COPYLENGTH-(STEPSIZE-4)) + { + if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals + LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH)); + while(op (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) +static inline int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) + primarily useful for memory allocation of output buffer. + inline function is recommended for the general case, + macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). + + isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ + + +int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); + +/* +LZ4_compress_limitedOutput() : + Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. + If it cannot achieve it, compression will stop, and result of the function will be zero. + This function never writes outside of provided output buffer. + + inputSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxOutputSize : is the size of the destination buffer (which must be already allocated) + return : the number of bytes written in buffer 'dest' + or 0 if the compression fails +*/ + + +int LZ4_decompress_fast (const char* source, char* dest, int outputSize); + +/* +LZ4_decompress_fast() : + outputSize : is the original (uncompressed) size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is malformed, the function will stop decoding and return a negative result. + note : This function is a bit faster than LZ4_decompress_safe() + This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet. + Use this function preferably into a trusted environment (data to decode comes from a trusted source). 
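The two one-shot entry points documented here are exactly the pair the new fil0pagecompress.cc code calls, so a short round-trip sketch may be useful. It is an illustration only: the 16 KB buffers stand in for UNIV_PAGE_SIZE pages and error handling is reduced to early returns.

    #include <string.h>
    #include "lz4.h"

    static int lz4_roundtrip_example(const char* page, int page_size)
    {
        char    compressed[16384];
        char    restored[16384];

        /* Returns 0 when the result does not fit in maxOutputSize; the
        caller then keeps the page uncompressed, as fil_compress_page()
        does. */
        int     c_len = LZ4_compress_limitedOutput(
                page, compressed, page_size, (int) sizeof(compressed));

        if (c_len == 0) {
                return(-1);
        }

        /* outputSize is the original length; the return value is the number
        of compressed bytes consumed, negative for a malformed stream. The
        mismatch check mirrors the one added to fil_decompress_page(). */
        if (LZ4_decompress_fast(compressed, restored, page_size) != c_len) {
                return(-1);
        }

        return(memcmp(page, restored, page_size) == 0 ? 0 : -1);
    }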
+ Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes. +*/ + +int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize); + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'inputSize' at position 'source' + into output buffer 'dest' of size 'maxOutputSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets +*/ + + +//**************************** +// Stream Functions +//**************************** + +void* LZ4_create (const char* inputBuffer); +int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize); +char* LZ4_slideInputBuffer (void* LZ4_Data); +int LZ4_free (void* LZ4_Data); + +/* +These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. +In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function : + +void* LZ4_create (const char* inputBuffer); +The result of the function is the (void*) pointer on the LZ4 Data Structure. +This pointer will be needed in all other functions. +If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. +The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. +The input buffer must be already allocated, and size at least 192KB. +'inputBuffer' will also be the 'const char* source' of the first block. + +All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. +To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(). +Their behavior are identical to LZ4_compress() or LZ4_compress_limitedOutput(), +but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one. +If next block does not begin immediately after the previous one, the compression will fail (return 0). + +When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : +char* LZ4_slideInputBuffer(void* LZ4_Data); +must be performed. It will typically copy the latest 64KB of input at the beginning of input buffer. +Note that, for this function to work properly, minimum size of an input buffer must be 192KB. +==> The memory position where the next input data block must start is provided as the result of the function. + +Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual. + +When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure. 
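Since the block above is a how-to for the streaming interface, a compact usage sketch follows. It is illustrative only: BLOCK_SIZE and the input-filling step are placeholders, and the destination buffer is assumed large enough (LZ4_compressBound() per block).

    #include <stdlib.h>
    #include "lz4.h"

    #define RING_SIZE   (192 * 1024)        /* documented minimum buffer size */
    #define BLOCK_SIZE  (8 * 1024)

    static void lz4_stream_example(char* dst)
    {
        char*   ring = (char*) malloc(RING_SIZE);
        void*   ctx  = ring ? LZ4_create(ring) : NULL;
        char*   src  = ring;                /* first block starts at 'ring'   */

        if (ctx == NULL) {
                free(ring);
                return;
        }

        for (int i = 0; i < 4; i++) {
                /* ... fill BLOCK_SIZE bytes at 'src' with the next input ... */

                /* A return value of 0 means the block did not start right
                after the previous one, which the API treats as an error. */
                dst += LZ4_compress_continue(ctx, src, dst, BLOCK_SIZE);

                src += BLOCK_SIZE;
                if (src + BLOCK_SIZE > ring + RING_SIZE) {
                        /* Recycle the buffer, keeping the last 64 KB of
                        history; continue from the returned position. */
                        src = LZ4_slideInputBuffer(ctx);
                }
        }

        LZ4_free(ctx);
        free(ring);
    }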
+*/ + + +int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize); + +/* +*_withPrefix64k() : + These decoding functions work the same as their "normal name" versions, + but can use up to 64KB of data in front of 'char* dest'. + These functions are necessary to decode inter-dependant blocks. +*/ + + +//**************************** +// Obsolete Functions +//**************************** + +static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + +/* +These functions are deprecated and should no longer be used. +They are provided here for compatibility with existing user programs. +*/ + + + +#if defined (__cplusplus) +} +#endif diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index eda7da81d5c..d4ce4eb9c4f 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4,7 +4,7 @@ Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, SkySQL Ab. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -15429,29 +15429,6 @@ innodb_reset_all_monitor_update( TRUE); } -/****************************************************************//** -Update the system variable innodb_compression_level using the "saved" -value. This function is registered as a callback with MySQL. */ -static -void -innodb_compression_level_update( -/*============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ -{ - /* We have this call back just to avoid confusion between - ulong and ulint datatypes. */ - innobase_compression_level = - (*static_cast(save)); - page_compression_level = - (static_cast(innobase_compression_level)); -} - /****************************************************************//** Parse and enable InnoDB monitor counters during server startup. User can list the monitor counters/groups to be enable by specifying @@ -16140,11 +16117,11 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, "innodb_thread_concurrency is reached (0 by default)", NULL, NULL, 0, 0, ~0UL, 0); -static MYSQL_SYSVAR_ULONG(compression_level, innobase_compression_level, +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" + "Compression level used for zlib compression. 
0 is no compression" ", 1 is fastest, 9 is best compression and default is 6.", - NULL, innodb_compression_level_update, + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size, @@ -16620,11 +16597,6 @@ static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, "How many percent of compressed pages should be trimmed", NULL, NULL, 100, 0, 100, 0); -static MYSQL_SYSVAR_LONG(compress_zlib_level, srv_compress_zlib_level, - PLUGIN_VAR_OPCMDARG , - "Default zlib compression level", - NULL, NULL, 6, 0, 9, 0); - static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, PLUGIN_VAR_OPCMDARG, "Use page compression for only index pages.", @@ -16635,6 +16607,12 @@ static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, "Use trim.", NULL, NULL, TRUE); +static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, + PLUGIN_VAR_OPCMDARG , + "Use LZ4 for page compression", + NULL, NULL, FALSE); + + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16782,9 +16760,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #endif /* UNIV_DEBUG */ MYSQL_SYSVAR(compress_pages), MYSQL_SYSVAR(trim_pct), - MYSQL_SYSVAR(compress_zlib_level), MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(use_lz4), NULL }; diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 01084d52365..918a92fa811 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -134,6 +134,7 @@ extern fil_addr_t fil_addr_null; actual payload data size on compressed pages. */ #define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compressin algorithm ZLIB. */ +#define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compressin algorithm LZ4. */ /* @} */ /** File page trailer @{ */ diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic index 755d91b3cd9..10f9d30d1f8 100644 --- a/storage/innobase/include/fsp0pagecompress.ic +++ b/storage/innobase/include/fsp0pagecompress.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013 SkySQL Ab. All Rights Reserved. +Copyright (C) 2013,2014 SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -144,6 +144,9 @@ fil_get_compression_alg_name( case FIL_PAGE_COMPRESSION_ZLIB: return ("ZLIB"); break; + case FIL_PAGE_COMPRESSION_LZ4: + return ("LZ4"); + break; default: return("UNKNOWN"); break; diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index 12781bd61b8..89260d0984e 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -41,7 +41,7 @@ Created June 2005 by Marko Makela #include "mem0mem.h" /* Compression level to be used by zlib. Settable by user. */ -extern ulint page_compression_level; +extern uint page_zip_level; /* Default compression level. */ #define DEFAULT_COMPRESSION_LEVEL 6 diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index f4fa8b434fe..a11c213d534 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -3,7 +3,7 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009, Google Inc. Copyright (c) 2009, Percona Inc. 
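As a small tie-in between the new FIL_PAGE_COMPRESSION_LZ4 constant and the compression code earlier in this patch: fil_compress_page() records the algorithm id in the FIL_PAGE_FILE_FLUSH_LSN field, and fil_get_compression_alg_name() in fsp0pagecompress.ic maps it back to a name. A hypothetical helper restating that mapping, purely for illustration:

    static const char*
    page_compression_alg_of(const byte* page)
    {
        /* The algorithm id written by fil_compress_page(). */
        switch (mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN)) {
        case FIL_PAGE_COMPRESSION_ZLIB:
                return("ZLIB");
        case FIL_PAGE_COMPRESSION_LZ4:
                return("LZ4");
        default:
                return("UNKNOWN");
        }
    }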
-Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -254,9 +254,8 @@ extern my_bool srv_use_posix_fallocate; /* Use atomic writes i.e disable doublewrite buffer */ extern my_bool srv_use_atomic_writes; -/* Default zlib compression level */ -extern long srv_compress_zlib_level; - +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +extern my_bool srv_use_lz4; #ifdef __WIN__ extern ibool srv_use_native_conditions; diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index f416d38cc35..9d6a62cae8f 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1180,7 +1180,7 @@ page_cur_insert_rec_zip_reorg( /* Make a local copy as the values can change dynamically. */ bool log_compressed = page_log_compressed_pages; - ulint level = page_compression_level; + ulint level = page_zip_level; /* Recompress or reorganize and recompress the page. */ if (page_zip_compress(page_zip, page, index, level, diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index 6b7b8424856..bf73a249f95 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -514,7 +514,7 @@ page_create_zip( mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level); if (!page_zip_compress(page_zip, page, index, - page_compression_level, mtr)) { + page_zip_level, mtr)) { /* The compression of a newly created page should always succeed. */ ut_error; @@ -663,7 +663,7 @@ page_copy_rec_list_end( if (!page_zip_compress(new_page_zip, new_page, index, - page_compression_level, + page_zip_level, mtr)) { /* Before trying to reorganize the page, store the number of preceding records on the page. */ @@ -788,7 +788,7 @@ page_copy_rec_list_start( goto zip_reorganize;); if (!page_zip_compress(new_page_zip, new_page, index, - page_compression_level, mtr)) { + page_zip_level, mtr)) { ulint ret_pos; #ifndef DBUG_OFF diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index dee37580002..3fba6216430 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -69,7 +69,7 @@ UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key; #endif /* !UNIV_HOTBACKUP */ /* Compression level to be used by zlib. Settable by user. */ -UNIV_INTERN ulint page_compression_level = 6; +UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL; /* Whether or not to log compressed page images to avoid possible compression algorithm changes in zlib. */ @@ -4631,7 +4631,7 @@ page_zip_reorganize( mtr_set_log_mode(mtr, log_mode); if (!page_zip_compress(page_zip, page, index, - page_compression_level, mtr)) { + page_zip_level, mtr)) { #ifndef UNIV_HOTBACKUP buf_block_free(temp_block); diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 90864cee9ef..cffd3f928c3 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -3,6 +3,7 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -147,21 +148,20 @@ UNIV_INTERN my_bool srv_use_native_aio = TRUE; /* If this flag is TRUE, then we will use page compression to the pages */ -UNIV_INTERN my_bool srv_compress_pages = FALSE; +UNIV_INTERN my_bool srv_compress_pages = FALSE; /* If this flag is TRUE, then we will use page compression only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; -UNIV_INTERN long srv_trim_pct = 100; -/* Default compression level if page compression is used and no compression -level is set for the table*/ -UNIV_INTERN long srv_compress_zlib_level = 6; +UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; +UNIV_INTERN long srv_trim_pct = 100; /* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) to the pages */ -UNIV_INTERN my_bool srv_use_trim = TRUE; +UNIV_INTERN my_bool srv_use_trim = TRUE; /* If this flag is TRUE, then we will use posix fallocate for file extentsion */ -UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; /* If this flag is TRUE, then we disable doublewrite buffer */ -UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +UNIV_INTERN my_bool srv_use_lz4 = FALSE; #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 282db2ddf31..5050ca34da9 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -284,6 +284,8 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc +# TODO: JAN uncomment +# buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -297,6 +299,8 @@ SET(INNOBASE_SOURCES eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc + fil/fil0pagecompress.cc + fil/lz4.c fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc index d4b170028d9..b995e3ee737 100644 --- a/storage/xtradb/buf/buf0buf.cc +++ b/storage/xtradb/buf/buf0buf.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -3371,6 +3372,7 @@ buf_page_init_low( bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; + bpage->write_size = 0; HASH_INVALIDATE(bpage, hash); bpage->is_corrupt = FALSE; #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG @@ -5501,3 +5503,24 @@ buf_page_init_for_backup_restore( } } #endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Aquire LRU list mutex */ +void +buf_pool_mutex_enter( +/*=================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + mutex_enter(&buf_pool->LRU_list_mutex); +} +/*********************************************************************//** +Exit LRU list mutex */ +void +buf_pool_mutex_exit( +/*================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + mutex_exit(&buf_pool->LRU_list_mutex); +} diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc index 506a5b177ba..30b41dc754e 100644 --- a/storage/xtradb/buf/buf0dblwr.cc +++ b/storage/xtradb/buf/buf0dblwr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -382,7 +383,7 @@ buf_dblwr_init_or_restore_pages( buffer */ fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, - UNIV_PAGE_SIZE, read_buf, NULL); + UNIV_PAGE_SIZE, read_buf, NULL, 0); doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) @@ -418,11 +419,11 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - buf, NULL); + buf, NULL, 0); fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - NULL); + NULL, 0); /* Check if any of these pages is half-written in data files, in the intended position */ @@ -450,7 +451,7 @@ buf_dblwr_init_or_restore_pages( } fil_io(OS_FILE_WRITE, true, 0, 0, source_page_no, 0, - UNIV_PAGE_SIZE, page, NULL); + UNIV_PAGE_SIZE, page, NULL, 0); } else { space_id = mach_read_from_4( @@ -492,7 +493,7 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_READ, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); + read_buf, NULL, 0); /* Check if the page is corrupt */ @@ -544,7 +545,7 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_WRITE, true, space_id, zip_size, page_no, 0, zip_size ? 
zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); ib_logf(IB_LOG_LEVEL_INFO, "Recovered the page from" @@ -763,7 +764,7 @@ buf_dblwr_write_block_to_datafile( buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), (void*) bpage->zip.data, - (void*) bpage); + (void*) bpage, 0); return; } @@ -775,7 +776,8 @@ buf_dblwr_write_block_to_datafile( fil_io(flags, sync, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block); + (void*) block->frame, (void*) block, + (ulint *)&bpage->write_size); } /********************************************************************//** @@ -869,7 +871,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. */ @@ -885,7 +887,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); flush: /* increment the doublewrite flushed pages counter */ @@ -1115,14 +1117,14 @@ retry: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf - + UNIV_PAGE_SIZE * i), NULL); + + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. Write it directly to the doublewrite buffer */ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, - NULL); + NULL, 0); } /* Now flush the doublewrite buffer data to disk */ diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index abcee504d2e..3c030eb60ee 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -48,6 +50,7 @@ Created 11/11/1995 Heikki Tuuri #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /** Number of pages flushed through non flush_list flushes. */ // static ulint buf_lru_flush_page_count = 0; @@ -71,11 +74,6 @@ in thrashing. 
*/ /* @} */ -/** Handled page counters for a single flush */ -struct flush_counters_t { - ulint flushed; /*!< number of dirty pages flushed */ - ulint evicted; /*!< number of clean pages evicted */ -}; /******************************************************************//** Increases flush_list size in bytes with zip_size for compressed page, @@ -721,8 +719,10 @@ buf_flush_write_complete( buf_pool->n_flush[flush_type]--; - /* fprintf(stderr, "n pending flush %lu\n", - buf_pool->n_flush[flush_type]); */ +#ifdef UNIV_DEBUG + fprintf(stderr, "n pending flush %lu\n", + buf_pool->n_flush[flush_type]); +#endif if (buf_pool->n_flush[flush_type] == 0 && buf_pool->init_flush[flush_type] == FALSE) { @@ -880,6 +880,8 @@ buf_flush_write_block_low( { ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; + ulint space_id = buf_page_get_space(bpage); + atomic_writes_t awrites = fil_space_get_atomic_writes(space_id); #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -955,12 +957,26 @@ buf_flush_write_block_low( sync, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - frame, bpage); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage, sync); + frame, bpage, &bpage->write_size); } else { - ut_ad(!sync); - buf_dblwr_add_to_batch(bpage); + /* InnoDB uses doublewrite buffer and doublewrite buffer + is initialized. User can define do we use atomic writes + on a file space (table) or not. If atomic writes are + not used we should use doublewrite buffer and if + atomic writes should be used, no doublewrite buffer + is used. */ + + if (awrites == ATOMIC_WRITES_ON) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage, &bpage->write_size); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage, sync); + } else { + buf_dblwr_add_to_batch(bpage); + } } /* When doing single page flushing the IO is done synchronously @@ -1747,7 +1763,6 @@ end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! @return number of blocks for which the write request was queued */ __attribute__((nonnull)) -static void buf_flush_batch( /*============*/ @@ -1806,7 +1821,6 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -static void buf_flush_common( /*=============*/ @@ -1833,7 +1847,6 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -static ibool buf_flush_start( /*============*/ @@ -1862,7 +1875,6 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -static void buf_flush_end( /*==========*/ @@ -1912,11 +1924,55 @@ buf_flush_wait_batch_end( } } else { thd_wait_begin(NULL, THD_WAIT_DISKIO); - os_event_wait(buf_pool->no_flush[type]); + os_event_wait(buf_pool->no_flush[type]); thd_wait_end(NULL); } } +/* JAN: TODO: */ +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list and also +puts replaceable clean pages from the end of the LRU list to the free +list. 
+NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully. false if another batch +of same type was already running. */ +static +bool +pgcomp_buf_flush_LRU( +/*==========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ +{ + flush_counters_t n; + + if (n_processed) { + *n_processed = 0; + } + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + return(false); + } + + buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, false, &n); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, n.flushed); + + if (n_processed) { + *n_processed = n.flushed; + } + + return(true); +} +/* JAN: TODO: END: */ + /*******************************************************************//** This utility flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free @@ -1954,6 +2010,168 @@ buf_flush_LRU( return(true); } +/* JAN: TODO: */ +/*******************************************************************//**/ +extern int is_pgcomp_wrk_init_done(void); +extern int pgcomp_flush_work_items(int buf_pool_inst, int *pages_flushed, + int flush_type, int min_n, unsigned long long lsn_limit); + +#define MT_COMP_WATER_MARK 50 + +#include +int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time) +{ + if (g_time->tv_usec < s_time->tv_usec) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1; + s_time->tv_usec -= 1000000 * nsec; + s_time->tv_sec += nsec; + } + if (g_time->tv_usec - s_time->tv_usec > 1000000) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000; + s_time->tv_usec += 1000000 * nsec; + s_time->tv_sec -= nsec; + } + d_time->tv_sec = g_time->tv_sec - s_time->tv_sec; + d_time->tv_usec = g_time->tv_usec - s_time->tv_usec; + + return 0; +} + +static pthread_mutex_t pgcomp_mtx = PTHREAD_MUTEX_INITIALIZER; +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +UNIV_INTERN +bool +pgcomp_buf_flush_list( +/*==================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + struct timeval p_start_time, p_end_time, d_time; + flush_counters_t n; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. 
*/ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + +#ifdef UNIV_DEBUG + gettimeofday(&p_start_time, 0x0); +#endif + if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)) { + int cnt_flush[32]; + + //stack_trace(); + pthread_mutex_lock(&pgcomp_mtx); + //gettimeofday(&p_start_time, 0x0); + //fprintf(stderr, "Calling into wrk-pgcomp [min:%lu]", min_n); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt_flush[i]; + } + if (cnt_flush[i]) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt_flush[i]); + + } + } + + pthread_mutex_unlock(&pgcomp_mtx); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); + } + /* Flush to lsn_limit in all buffer pool instances */ + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ + success = false; + + continue; + } + + buf_flush_batch( + buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit, false, &n); + + buf_flush_end(buf_pool, BUF_FLUSH_LIST); + + buf_flush_common(BUF_FLUSH_LIST, n.flushed); + + if (n_processed) { + *n_processed += n.flushed; + } + + if (n.flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + n.flushed); + } + } + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); +} + +/* JAN: TODO: END: */ + /*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. @@ -1986,6 +2204,12 @@ buf_flush_list( bool timeout = false; ulint flush_start_time = 0; + /* JAN: TODO: */ + if (is_pgcomp_wrk_init_done()) { + return(pgcomp_buf_flush_list(min_n, lsn_limit, n_processed)); + } + /* JAN: TODO: END: */ + for (i = 0; i < srv_buf_pool_instances; i++) { requested_pages[i] = 0; active_instance[i] = true; @@ -2179,6 +2403,60 @@ buf_flush_single_page_from_LRU( return(freed); } +/* JAN: TODO: */ +/*********************************************************************//** +pgcomp_Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
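The division at the top of pgcomp_buf_flush_list() above spreads the request over the buffer pool instances; a worked example with assumed figures (they are not measurements) makes the rounding visible:

    /* The multi-threaded path is only taken when min_n exceeds
    MT_COMP_WATER_MARK (50). With srv_buf_pool_instances = 8 and a request
    of min_n = 100 pages, (100 + 8 - 1) / 8 = 13 pages are asked from each
    instance, so 8 * 13 = 104 >= 100 and no part of the request is lost to
    rounding; the per-instance counts returned in cnt_flush[] are then
    summed into *n_processed. */
    ulint   min_n        = 100;
    ulint   instances    = 8;
    ulint   per_instance = (min_n + instances - 1) / instances;    /* 13 */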
+@return total pages flushed */ +UNIV_INTERN +ulint +pgcomp_buf_flush_LRU_tail(void) +/*====================*/ +{ + struct timeval p_start_time, p_end_time, d_time; + ulint total_flushed=0, i=0; + int cnt_flush[32]; + +#ifdef UNIV_DEBUG + gettimeofday(&p_start_time, 0x0); +#endif + assert(is_pgcomp_wrk_init_done()); + + pthread_mutex_lock(&pgcomp_mtx); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (cnt_flush[i]) { + total_flushed += cnt_flush[i]; + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + cnt_flush[i]); + } + } + + pthread_mutex_unlock(&pgcomp_mtx); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + + return(total_flushed); +} +/* JAN: TODO: END: */ + + /*********************************************************************//** Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list @@ -2203,6 +2481,13 @@ buf_flush_LRU_tail(void) ulint free_list_lwm = srv_LRU_scan_depth / 100 * srv_cleaner_free_list_lwm; + /* JAN: TODO: */ + if(is_pgcomp_wrk_init_done()) + { + return(pgcomp_buf_flush_LRU_tail()); + } + /* JAN: TODO: END */ + for (ulint i = 0; i < srv_buf_pool_instances; i++) { const buf_pool_t* buf_pool = buf_pool_from_array(i); @@ -2640,6 +2925,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( ulint n_flushed = 0; ulint last_activity = srv_get_activity_count(); ulint lru_sleep_time = srv_cleaner_max_lru_time; + ulint n_lru=0, n_pgc_flush=0, n_pgc_batch=0; ut_ad(!srv_read_only_mode); @@ -2684,15 +2970,25 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( next_loop_time = ut_time_ms() + page_cleaner_sleep_time; /* Flush pages from end of LRU if required */ - n_flushed = buf_flush_LRU_tail(); + n_lru = n_flushed = buf_flush_LRU_tail(); +#ifdef UNIV_DEBUG + if (n_lru) { + fprintf(stderr,"n_lru:%lu ",n_lru); + } +#endif if (srv_check_activity(last_activity)) { last_activity = srv_get_activity_count(); /* Flush pages from flush_list if required */ - n_flushed += page_cleaner_flush_pages_if_needed(); + n_flushed += n_pgc_flush = page_cleaner_flush_pages_if_needed(); +#ifdef UNIV_DEBUG + if (n_pgc_flush) { + fprintf(stderr,"n_pgc_flush:%lu ",n_pgc_flush); + } +#endif } else { - n_flushed = page_cleaner_do_flush_batch( + n_pgc_batch = n_flushed = page_cleaner_do_flush_batch( PCT_IO(100), LSN_MAX); @@ -2703,7 +2999,20 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( MONITOR_FLUSH_BACKGROUND_PAGES, n_flushed); } +#ifdef UNIV_DEBUG + if (n_pgc_batch) { + fprintf(stderr,"n_pgc_batch:%lu ",n_pgc_batch); + } +#endif } + +#ifdef UNIV_DEBUG + if (n_lru || n_pgc_flush || n_pgc_batch) { + fprintf(stderr,"\n"); + n_lru = n_pgc_flush = n_pgc_batch = 0; + } +#endif + } ut_ad(srv_shutdown_state > 0); diff --git a/storage/xtradb/buf/buf0rea.cc b/storage/xtradb/buf/buf0rea.cc index 6e348bbf004..3dec3df6f2b 100644 --- a/storage/xtradb/buf/buf0rea.cc +++ b/storage/xtradb/buf/buf0rea.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -229,14 +230,14 @@ not_to_recover: *err = _fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, zip_size, offset, 0, zip_size, - bpage->zip.data, bpage, trx); + bpage->zip.data, bpage, 0, trx); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = _fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage, trx); + ((buf_block_t*) bpage)->frame, bpage, 0, trx); } if (sync) { diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc index a20456fe3cf..d6a05d2b214 100644 --- a/storage/xtradb/dict/dict0dict.cc +++ b/storage/xtradb/dict/dict0dict.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index 9861f85b814..f3e952299ff 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013 SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -54,6 +55,15 @@ Created 10/25/1995 Heikki Tuuri # include "srv0srv.h" static ulint srv_data_read, srv_data_written; #endif /* !UNIV_HOTBACKUP */ +#include "fil0pagecompress.h" +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE @@ -434,11 +444,16 @@ fil_read( block size multiple */ void* buf, /*!< in/out: buffer where to store data read; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync - aio used, else ignored */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /********************************************************************//** @@ -463,18 +478,22 @@ fil_write( be a block size multiple */ void* buf, /*!< in: buffer from which to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync - aio used, else ignored */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
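The write_size in/out parameter documented just above carries one piece of per-page state: it is set after the first successful trim, and later writes only trim again when the compressed size actually shrank. A hypothetical restatement of that contract (not the engine code):

    /* Returns nonzero when a trim should be issued for this write and
       updates the remembered size; *write_size == 0 means nothing has been
       trimmed for this page yet. */
    static int
    trim_needed(unsigned long new_size, unsigned long* write_size)
    {
            if (*write_size == 0 || new_size < *write_size) {
                    *write_size = new_size;
                    return(1);
            }
            return(0);      /* size did not decrease: skip the trim */
    }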
*/ { ut_ad(!srv_read_only_mode); return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ -UNIV_INLINE fil_space_t* fil_space_get_by_id( /*================*/ @@ -492,6 +511,19 @@ fil_space_get_by_id( return(space); } +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + /*******************************************************************//** Returns the table space by a given name, NULL if not found. */ UNIV_INLINE @@ -712,8 +744,9 @@ fil_node_open_file( byte* buf2; byte* page; ulint space_id; - ulint flags; + ulint flags=0; ulint page_size; + ibool atomic_writes=FALSE; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -730,7 +763,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_ONLY, &success, 0); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -782,6 +815,7 @@ fil_node_open_file( space_id = fsp_header_get_space_id(page); flags = fsp_header_get_flags(page); page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); ut_free(buf2); @@ -832,6 +866,17 @@ fil_node_open_file( ut_error; } + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + if (size_bytes >= 1024 * 1024) { /* Truncate the size to whole megabytes. 
*/ size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); @@ -851,6 +896,8 @@ add_size: space->size += node->size; } + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + /* printf("Opening file %s\n", node->name); */ /* Open the file for reading and writing, in Windows normally in the @@ -861,18 +908,18 @@ add_size: node->handle = os_file_create(innodb_file_log_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, - &ret); + &ret, atomic_writes); } else if (node->is_raw_disk) { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN_RAW, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } else { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } ut_a(ret); @@ -1932,12 +1979,12 @@ fil_write_lsn_and_arch_no_to_file( buf = static_cast(ut_align(buf1, UNIV_PAGE_SIZE)); err = fil_read(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); err = fil_write(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); } mem_free(buf1); @@ -3222,7 +3269,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); if (!success) { /* The following call will print an error message */ @@ -3331,8 +3378,9 @@ fil_open_linked_file( /*===============*/ const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ - os_file_t* remote_file) /*!< out: remote file handle */ - + os_file_t* remote_file, /*!< out: remote file handle */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ { ibool success; @@ -3346,7 +3394,7 @@ fil_open_linked_file( *remote_file = os_file_create_simple_no_error_handling( innodb_file_data_key, *remote_filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, atomic_writes); if (!success) { char* link_filepath = fil_make_isl_name(tablename); @@ -3401,6 +3449,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + bool atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3433,7 +3482,8 @@ fil_create_new_single_table_tablespace( OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + atomic_writes); if (ret == FALSE) { /* The following call will print an error message */ @@ -3498,6 +3548,7 @@ fil_create_new_single_table_tablespace( flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); if (!(fsp_flags_is_compressed(flags))) { buf_flush_init_for_writing(page, NULL, 0); @@ -3685,6 +3736,7 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; + ibool atomic_writes = FALSE; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -3719,7 +3771,7 @@ fil_open_single_table_tablespace( } link_file_found = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, 
&remote.file, atomic_writes); remote.success = link_file_found; if (remote.success) { /* possibility of multiple files. */ @@ -3747,7 +3799,7 @@ fil_open_single_table_tablespace( if (dict.filepath) { dict.file = os_file_create_simple_no_error_handling( innodb_file_data_key, dict.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &dict.success); + OS_FILE_READ_ONLY, &dict.success, atomic_writes); if (dict.success) { /* possibility of multiple files. */ validate = true; @@ -3759,7 +3811,7 @@ fil_open_single_table_tablespace( ut_a(def.filepath); def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, atomic_writes); if (def.success) { tablespaces_found++; } @@ -4155,7 +4207,7 @@ fil_load_single_table_tablespace( /* Check for a link file which locates a remote tablespace. */ remote.success = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, FALSE); /* Read the first page of the remote tablespace */ if (remote.success) { @@ -4170,7 +4222,7 @@ fil_load_single_table_tablespace( /* Try to open the tablespace in the datadir. */ def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, FALSE); /* Read the first page of the remote tablespace */ if (def.success) { @@ -4938,7 +4990,6 @@ retry: #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { - mutex_exit(&fil_system->mutex); success = os_file_set_size(node->name, node->handle, (size_after_extend - file_start_page_no) * page_size); @@ -4975,7 +5026,7 @@ retry: success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL, space_id, NULL); + NULL, NULL, space_id, NULL, 0, 0, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5361,7 +5412,12 @@ _fil_io( or from where to write; in aio this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync - aio used, else ignored */ + aio used, else ignored */ + ulint* write_size, /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ trx_t* trx) { ulint mode; @@ -5372,6 +5428,8 @@ _fil_io( ulint wake_later; os_offset_t offset; ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5425,6 +5483,9 @@ _fil_io( } else if (type == OS_FILE_WRITE) { ut_ad(!srv_read_only_mode); srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } } /* Reserve the fil_system mutex and make sure that we can open at @@ -5434,6 +5495,8 @@ _fil_io( space = fil_space_get_by_id(space_id); + page_compressed = fsp_flags_is_page_compressed(space->flags); + page_compression_level = fsp_flags_get_page_compression_level(space->flags); /* If we are deleting a tablespace we don't allow any read operations on that. However, we do allow write operations. 
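The write path above increments srv_stats.index_pages_written when fil_page_is_index_page() recognises the buffer as a B-tree page. A self-contained sketch of that kind of check, assuming the standard InnoDB page header layout (page type stored big-endian at byte offset 24, value 17855 for an index page); the names are stand-ins, not the real helpers:

    enum { PAGE_TYPE_OFFSET = 24, PAGE_TYPE_INDEX = 17855 };

    /* Hypothetical stand-in for fil_page_is_index_page(). */
    static int
    page_is_index_page(const unsigned char* frame)
    {
            unsigned type = ((unsigned) frame[PAGE_TYPE_OFFSET] << 8)
                            | frame[PAGE_TYPE_OFFSET + 1];

            return(type == PAGE_TYPE_INDEX);
    }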
*/ if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) { @@ -5579,7 +5642,8 @@ _fil_io( /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message, space_id, trx); + offset, len, node, message, space_id, trx, + page_compressed, page_compression_level, write_size); #else /* In ibbackup do normal i/o, not aio */ @@ -6214,7 +6278,7 @@ fil_tablespace_iterate( file = os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, - OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", { @@ -6501,3 +6565,33 @@ fil_space_set_corrupt( mutex_exit(&fil_system->mutex); } + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) +/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc new file mode 100644 index 00000000000..10ac273955f --- /dev/null +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -0,0 +1,324 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. 
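fil_compress_page() below keeps the 38-byte file page header, reuses the FIL_PAGE_FILE_FLUSH_LSN field to record the compression algorithm, stores the compressed payload length right after the header, and pads the result up to the next I/O block so the unused tail can be trimmed. A minimal sketch of that size rounding, assuming a 512-byte block and a roughly 40-byte compressed-page header (example values, not the exact constants):

    static unsigned long
    aligned_write_size(unsigned long header_len, unsigned long payload_len,
                       unsigned long block_size)
    {
            unsigned long size = header_len + payload_len;

            if (size % block_size) {
                    size += block_size - (size % block_size);
            }
            return(size);
    }

    /* aligned_write_size(40, 3000, 512) == 3072, so on a 16K page roughly
       13K of the original page size never has to be written. */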
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" +#include "lz4.h" + +/****************************************************************//** +For page compressed pages compress the page before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /* in: compression level */ + ulint* out_len) /*!< out: actual length of compressed page */ +{ + int err = Z_OK; + int level = 0; + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; + ulint write_size=0; + + ut_ad(buf); + ut_ad(out_buf); + ut_ad(len); + ut_ad(out_len); + + level = compression_level; + ut_ad(fil_space_is_page_compressed(space_id)); + + fil_system_enter(); + fil_space_t* space = fil_space_get_by_id(space_id); + fil_system_exit(); + + /* If no compression level was provided to this table, use system + default level */ + if (level == 0) { + level = page_zip_level; + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", + space_id, fil_space_name(space), len); +#endif + + write_size = UNIV_PAGE_SIZE - header_len; + + if (srv_use_lz4) { + err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); + write_size = err; + + if (err == 0) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } + } else { + err = compress2(out_buf+header_len, &write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + if (srv_use_lz4) { + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); + } else { + 
mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); + } + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + if (srv_use_lz4) { + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); + } else { + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); + } +#endif + + write_size+=header_len; + /* Actual write needs to be alligned on block size */ + if (write_size % OS_FILE_LOG_BLOCK_SIZE) { + write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif + +#define SECT_SIZE 512 + + srv_stats.page_compression_saved.add((len - write_size)); + if ((len - write_size) > 0) { + srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); + } + //srv_stats.page_compressed_trim_op.inc(); + srv_stats.pages_page_compressed.inc(); + *out_len = write_size; + + return(out_buf); + +} + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. */ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulint len) /*!< in: length of output buffer.*/ +{ + int err = 0; + ulint actual_size = 0; + ulint compression_alg = 0; + byte *in_buf; + + ut_ad(buf); + ut_ad(len); + + /* Before actual decompress, make sure that page type is correct */ + + if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC || + mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: CRC %lu type %lu.\n" + "InnoDB: len %lu\n", + mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_2(buf+FIL_PAGE_TYPE), len); + + fflush(stderr); + ut_error; + } + + /* Get compression algorithm */ + compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); + + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression buffer not given, allocating...\n"); +#endif + in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } + + if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); +#endif + + err= uncompress(in_buf, &len, 
buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + + + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + + fflush(stderr); + + ut_error; + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif + } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); + + if (err != actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); + + ut_error; + } + } else { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(compression_alg)); + + fflush(stderr); + ut_error; + } + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } +} + + diff --git a/storage/xtradb/fil/lz4.c b/storage/xtradb/fil/lz4.c new file mode 100644 index 00000000000..4e864de67d3 --- /dev/null +++ b/storage/xtradb/fil/lz4.c @@ -0,0 +1,822 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2013, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : http://code.google.com/p/lz4/ + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +//************************************** +// Tuning parameters +//************************************** +// MEMORY_USAGE : +// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) 
+// Increasing memory usage improves compression ratio +// Reduced memory usage can improve speed, due to cache effect +// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache +#define MEMORY_USAGE 14 + +// HEAPMODE : +// Select how default compression functions will allocate memory for their hash table, +// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). +#define HEAPMODE 0 + + +//************************************** +// CPU Feature Detection +//************************************** +// 32 or 64 bits ? +#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ + || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \ + || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \ + || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +// Little Endian or Big Endian ? +// Overwrite the #define below if you know your architecture endianess +#if defined (__GLIBC__) +# include +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define LZ4_BIG_ENDIAN 1 +# endif +#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) +# define LZ4_BIG_ENDIAN 1 +#elif defined(__sparc) || defined(__sparc__) \ + || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ + || defined(__hpux) || defined(__hppa) \ + || defined(_MIPSEB) || defined(__s390__) +# define LZ4_BIG_ENDIAN 1 +#else +// Little Endian assumed. PDP Endian and other very rare endian format are unsupported. +#endif + +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. +// For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +// Define this parameter if your target system or compiler does not support hardware bit count +#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count +# define LZ4_FORCE_SW_BITCOUNT +#endif + +// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : +// This option may provide a small boost to performance for some big endian cpu, although probably modest. +// You may set this option to 1 if data will remain within closed environment. 
+// This option is useless on Little_Endian CPU (such as x86) +//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 + + +//************************************** +// Compiler Options +//************************************** +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99 +/* "restrict" is a known keyword */ +#else +# define restrict // Disable restrict +#endif + +#ifdef _MSC_VER // Visual Studio +# define FORCE_INLINE static __forceinline +# include // For Visual 2005 +# if LZ4_ARCH64 // 64-bits +# pragma intrinsic(_BitScanForward64) // For Visual 2005 +# pragma intrinsic(_BitScanReverse64) // For Visual 2005 +# else // 32-bits +# pragma intrinsic(_BitScanForward) // For Visual 2005 +# pragma intrinsic(_BitScanReverse) // For Visual 2005 +# endif +# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant +#else +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +#endif + +#ifdef _MSC_VER +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +//************************************** +// Memory routines +//************************************** +#include // malloc, calloc, free +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include // memset, memcpy +#define MEM_INIT memset + + +//************************************** +// Includes +//************************************** +#include "lz4.h" + + +//************************************** +// Basic Types +//************************************** +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct { U16 v; } _PACKED U16_S; +typedef struct { U32 v; } _PACKED U32_S; +typedef struct { U64 v; } _PACKED U64_S; +typedef struct {size_t v;} _PACKED size_t_S; + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(0) +# else +# pragma pack(pop) +# endif +#endif + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) +#define AARCH(x) (((size_t_S *)(x))->v) + + +//************************************** +// Constants +//************************************** +#define LZ4_HASHLOG (MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << MEMORY_USAGE) +#define HASHNBCELLS4 (1 << LZ4_HASHLOG) + +#define MINMATCH 4 + +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT 
(COPYLENGTH+MINMATCH) +const int LZ4_minLength = (MFLIMIT+1); + +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1)) +#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<=e; + + +//**************************** +// Private functions +//**************************** +#if LZ4_ARCH64 + +FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +# else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif +# endif +} + +#else + +FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +# else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif +# endif +} + +#endif + + +//**************************** +// Compression functions +//**************************** +FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } + +FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } + } +} + +FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, 
const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } // default, to ensure a return +} + +FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + + +FORCE_INLINE int LZ4_compress_generic( + void* ctx, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + + limitedOutput_directive limitedOutput, + tableType_t tableType, + prefix64k_directive prefix) +{ + const BYTE* ip = (const BYTE*) source; + const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source; + const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source); + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + // Init conditions + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative) + if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block + if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit + if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit) + if (inputSize> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if unlikely(forwardIp > mflimit) { goto _last_literals; } + + forwardH = LZ4_hashPosition(forwardIp, tableType); + ref = LZ4_getPositionOnHash(h, ctx, tableType, base); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip))); + + // Catch up + while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(length<>8) > oend)) return 0; // Check output limit + if (length>=(int)ML_MASK) + { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; } + if (length >= 255) { length-=255; *op++ = 255; } + *op++ = (BYTE)length; + } + else *token += (BYTE)(length); + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + LZ4_putPosition(ip-2, ctx, tableType, base); + + // Test next position + ref = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == 
A32(ip))) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_hashPosition(ip, tableType); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun<hashTable, 0, sizeof(lz4ds->hashTable)); + lz4ds->bufferStart = base; + lz4ds->base = base; + lz4ds->nextBlock = base; +} + + +void* LZ4_create (const char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(1, sizeof(LZ4_Data_Structure)); + LZ4_init ((LZ4_Data_Structure*)lz4ds, (const BYTE*)inputBuffer); + return lz4ds; +} + + +int LZ4_free (void* LZ4_Data) +{ + FREEMEM(LZ4_Data); + return (0); +} + + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data; + size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB); + + if ( (lz4ds->base - delta > lz4ds->base) // underflow control + || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit + { + size_t deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base; + int nH; + + for (nH=0; nH < HASHNBCELLS4; nH++) + { + if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0; + else lz4ds->hashTable[nH] -= (U32)deltaLimit; + } + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->base = lz4ds->bufferStart; + lz4ds->nextBlock = lz4ds->base + 64 KB; + } + else + { + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->nextBlock -= delta; + lz4ds->base -= delta; + } + + return (char*)(lz4ds->nextBlock); +} + + +//**************************** +// Decompression functions +//**************************** + +// This generic decompression function cover all use cases. +// It shall be instanciated several times, using different sets of directives +// Note that it is essential this generic function is really inlined, +// in order to remove useless branches during compilation optimisation. +FORCE_INLINE int LZ4_decompress_generic( + const char* source, + char* dest, + int inputSize, // + int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer. + + int endOnInput, // endOnOutputSize, endOnInputSize + int prefix64k, // noPrefix, withPrefix + int partialDecoding, // full, partial + int targetOutputSize // only used if partialDecoding==partial + ) +{ + // Local Variables + const BYTE* restrict ip = (const BYTE*) source; + const BYTE* ref; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + + const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64 + static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + + // Special cases + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything + if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; // Empty output buffer + if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1); + + + // Main Loop + while (1) + { + unsigned token; + size_t length; + + // get runlength + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s=255; + while (((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer + if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed + } + memcpy(op, ip, length); + ip += length; + op += length; + break; // Necessarily EOF, due to parsing restrictions + } + LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) + { + while ((!endOnInput) || (ipoend-COPYLENGTH-(STEPSIZE-4)) + { + if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals + LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH)); + while(op (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) +static inline int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) + primarily useful for memory allocation of output buffer. + inline function is recommended for the general case, + macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). + + isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ + + +int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); + +/* +LZ4_compress_limitedOutput() : + Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. + If it cannot achieve it, compression will stop, and result of the function will be zero. + This function never writes outside of provided output buffer. + + inputSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxOutputSize : is the size of the destination buffer (which must be already allocated) + return : the number of bytes written in buffer 'dest' + or 0 if the compression fails +*/ + + +int LZ4_decompress_fast (const char* source, char* dest, int outputSize); + +/* +LZ4_decompress_fast() : + outputSize : is the original (uncompressed) size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is malformed, the function will stop decoding and return a negative result. + note : This function is a bit faster than LZ4_decompress_safe() + This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet. + Use this function preferably into a trusted environment (data to decode comes from a trusted source). 
+ Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes. +*/ + +int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize); + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'inputSize' at position 'source' + into output buffer 'dest' of size 'maxOutputSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets +*/ + + +//**************************** +// Stream Functions +//**************************** + +void* LZ4_create (const char* inputBuffer); +int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize); +char* LZ4_slideInputBuffer (void* LZ4_Data); +int LZ4_free (void* LZ4_Data); + +/* +These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. +In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function : + +void* LZ4_create (const char* inputBuffer); +The result of the function is the (void*) pointer on the LZ4 Data Structure. +This pointer will be needed in all other functions. +If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. +The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. +The input buffer must be already allocated, and size at least 192KB. +'inputBuffer' will also be the 'const char* source' of the first block. + +All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. +To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(). +Their behavior are identical to LZ4_compress() or LZ4_compress_limitedOutput(), +but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one. +If next block does not begin immediately after the previous one, the compression will fail (return 0). + +When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : +char* LZ4_slideInputBuffer(void* LZ4_Data); +must be performed. It will typically copy the latest 64KB of input at the beginning of input buffer. +Note that, for this function to work properly, minimum size of an input buffer must be 192KB. +==> The memory position where the next input data block must start is provided as the result of the function. + +Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual. + +When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure. 
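For the page compression code in fil0pagecompress.cc only the one-shot entry points matter: LZ4_compress_limitedOutput() with the page as input and LZ4_decompress_fast() with the known uncompressed page size. A hedged round-trip sketch of that pair (the helper name and buffers are illustrative; capacity should follow LZ4_compressBound(), which for a 16K InnoDB page is 16384 + 16384/255 + 16 = 16464 bytes):

    #include <string.h>
    #include "lz4.h"

    static int
    lz4_round_trip(const char* page, int page_size,
                   char* compressed, int compressed_capacity,
                   char* restored)
    {
            /* 0 means the output did not fit; the caller then keeps the
               page uncompressed, as fil_compress_page() does. */
            int csize = LZ4_compress_limitedOutput(page, compressed,
                                                   page_size,
                                                   compressed_capacity);
            if (csize == 0) {
                    return(0);
            }

            /* decompress_fast() takes the original size and returns the
               number of compressed bytes it consumed. */
            int rsize = LZ4_decompress_fast(compressed, restored, page_size);

            return(rsize == csize && !memcmp(page, restored, page_size));
    }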
+*/ + + +int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize); + +/* +*_withPrefix64k() : + These decoding functions work the same as their "normal name" versions, + but can use up to 64KB of data in front of 'char* dest'. + These functions are necessary to decode inter-dependant blocks. +*/ + + +//**************************** +// Obsolete Functions +//**************************** + +static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + +/* +These functions are deprecated and should no longer be used. +They are provided here for compatibility with existing user programs. +*/ + + + +#if defined (__cplusplus) +} +#endif diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 43cfa23a99f..ead0b0fc902 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -4,6 +4,7 @@ Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -558,6 +559,27 @@ ib_cb_t innodb_api_cb[] = { (ib_cb_t) ib_cfg_bk_commit_interval }; +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... ) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option user can enable page compression feature for the + table */ + HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0), + /* With this option user can set zip compression level for page + compression for this table*/ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), + /* With this option user can enable atomic writes feature for this table */ + HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), + HA_TOPTION_END +}; + /*************************************************************//** Check whether valid argument given to innodb_ft_*_stopword_table. This function is registered as a callback with MySQL. 
@@ -873,6 +895,25 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_x_lock_spin_rounds, SHOW_LONGLONG}, {"x_lock_spin_waits", (char*) &export_vars.innodb_x_lock_spin_waits, SHOW_LONGLONG}, + + /* Status variables for page compression */ + {"page_compression_saved", + (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"page_compression_trim_sect512", + (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG}, + {"page_compression_trim_sect4096", + (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, + {"num_index_pages_written", + (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_pages_page_compressed", + (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_page_compressed_trim_op_saved", + (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} }; @@ -3156,6 +3197,8 @@ innobase_init( if (srv_file_per_table) innobase_hton->tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); #ifndef DBUG_OFF @@ -10010,11 +10053,16 @@ innobase_table_flags( enum row_type row_format; rec_format_t innodb_row_format = REC_FORMAT_COMPACT; bool use_data_dir; + ha_table_option_struct *options= form->s->option_struct; /* Cache the value of innodb_file_format, in case it is modified by another thread while the table is being created. */ const ulint file_format_allowed = srv_file_format; + /* Cache the value of innobase_compression_level, in case it is + modified by another thread while the table is being created. */ + const ulint default_compression_level = page_zip_level; + *flags = 0; *flags2 = 0; @@ -10063,6 +10111,8 @@ index_bad: } } + row_format = form->s->row_type; + if (create_info->key_block_size) { /* The requested compressed page size (key_block_size) is given in kilobytes. If it is a valid number, store @@ -10110,8 +10160,6 @@ index_bad: } } - row_format = form->s->row_type; - if (zip_ssize && zip_allowed) { /* if ROW_FORMAT is set to default, automatically change it to COMPRESSED.*/ @@ -10166,10 +10214,18 @@ index_bad: " innodb_file_format > Antelope.", get_row_format_name(row_format)); } else { - innodb_row_format = (row_format == ROW_TYPE_DYNAMIC - ? REC_FORMAT_DYNAMIC - : REC_FORMAT_COMPRESSED); - break; + switch(row_format) { + case ROW_TYPE_COMPRESSED: + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + default: + /* Not possible, avoid compiler warning */ + break; + } + break; /* Correct row_format */ } zip_allowed = FALSE; /* fall through to set row_format = COMPACT */ @@ -10196,7 +10252,15 @@ index_bad: && ((create_info->data_file_name != NULL) && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); - dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + /* Set up table dictionary flags */ + dict_tf_set(flags, + innodb_row_format, + zip_ssize, + use_data_dir, + options->page_compressed, + (ulint)options->page_compression_level == ULINT_UNDEFINED ? 
+ default_compression_level : options->page_compression_level, + options->atomic_writes); if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; @@ -10209,6 +10273,112 @@ index_bad: DBUG_RETURN(true); } +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. +@return NULL if valid, string if not */ +UNIV_INTERN +const char* +ha_innobase::check_table_options( + THD *thd, /*!< in: thread handle */ + TABLE* table, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: more information of the + created table, contains also the + create statement string */ + const bool use_tablespace, /*!< in: use file per table */ + const ulint file_format) +{ + enum row_type row_format = table->s->row_type; + ha_table_option_struct *options= table->s->option_struct; + atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; + + /* Check page compression requirements */ + if (options->page_compressed) { + if (!srv_compress_pages) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_compress_pages to be enabled"); + return "PAGE_COMPRESSED"; + } + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_format > Antelope."); + return "PAGE_COMPRESSED"; + } + + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if ((ulint)options->page_compression_level != ULINT_UNDEFINED) { + if (options->page_compressed == false) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if (options->page_compression_level < 0 || options->page_compression_level > 9) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", + (ulint) options->page_compression_level); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + /* Check atomic writes requirements */ + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ATOMIC_WRITES requires" + " innodb_file_per_table."); + return "ATOMIC_WRITES"; + } + } + + return 0; +} + /*****************************************************************//** Creates a new table to an InnoDB database. @return error number */ @@ -10240,6 +10410,7 @@ ha_innobase::create( while creating the table.
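The push_warning ladder in check_table_options() enforces a small set of rules; the following condensed sketch (not part of the patch, names illustrative) restates the same constraints on plain values, which may be easier to review at a glance.

/* Editorial sketch, not part of the patch: the constraints enforced by
check_table_options(), restated on plain values. */
static bool
example_table_options_are_valid(
	bool	page_compressed,	/* PAGE_COMPRESSED */
	ulint	page_compression_level,	/* PAGE_COMPRESSION_LEVEL, or
					ULINT_UNDEFINED if not given */
	bool	atomic_writes_requested,/* ATOMIC_WRITES=ON, or DEFAULT with
					srv_use_atomic_writes set */
	bool	row_format_compressed,	/* ROW_FORMAT=COMPRESSED requested */
	bool	key_block_size_given,	/* KEY_BLOCK_SIZE given */
	bool	file_per_table,		/* innodb_file_per_table */
	bool	compress_pages_enabled,	/* innodb_compress_pages */
	bool	barracuda)		/* innodb_file_format > Antelope */
{
	if (page_compressed
	    && (!compress_pages_enabled || !file_per_table || !barracuda
		|| row_format_compressed || key_block_size_given)) {
		return(false);
	}

	if (page_compression_level != ULINT_UNDEFINED
	    && (!page_compressed || page_compression_level > 9)) {
		return(false);
	}

	if (atomic_writes_requested && !file_per_table) {
		return(false);
	}

	return(true);
}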
So we read the current value here and make all further decisions based on this. */ bool use_tablespace = srv_file_per_table; + const ulint file_format = srv_file_format; /* Zip Shift Size - log2 - 9 of compressed page size, zero for uncompressed */ @@ -10263,6 +10434,12 @@ ha_innobase::create( /* Create the table definition in InnoDB */ + /* Validate table options not handled by the SQL-parser */ + if(check_table_options(thd, form, create_info, use_tablespace, + file_format)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + /* Validate create options if innodb_strict_mode is set. */ if (create_options_are_invalid( thd, form, create_info, use_tablespace)) { @@ -14578,6 +14755,12 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); if (table_changes != IS_EQUAL_YES) { @@ -14604,6 +14787,13 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } + /* Changes on engine specific table options requests a rebuild of the table. */ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level || + param_new->atomic_writes != param_old->atomic_writes) { + return(COMPATIBLE_DATA_NO); + } + return(COMPATIBLE_DATA_YES); } @@ -17079,12 +17269,6 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, "innodb_thread_concurrency is reached (0 by default)", NULL, NULL, 0, 0, ~0UL, 0); -static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, - PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" - ", 1 is fastest, 9 is best compression and default is 6.", - NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); - static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages, PLUGIN_VAR_OPCMDARG, "Enables/disables the logging of entire compressed page images." @@ -17758,6 +17942,37 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace, "Print stacktrace on long semaphore wait (off by default supported only on linux)", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(compress_pages, srv_compress_pages, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use page compression.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, + PLUGIN_VAR_OPCMDARG , + "How many percent of compressed pages should be trimmed", + NULL, NULL, 100, 0, 100, 0); + +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, + PLUGIN_VAR_RQCMDARG, + "Compression level used for zlib compression. 
0 is no compression" + ", 1 is fastest, 9 is best compression and default is 6.", + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); + +static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, + PLUGIN_VAR_OPCMDARG, + "Use page compression for only index pages.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, + PLUGIN_VAR_OPCMDARG, + "Use trim.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, + PLUGIN_VAR_OPCMDARG , + "Use LZ4 for page compression", + NULL, NULL, FALSE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), MYSQL_SYSVAR(additional_mem_pool_size), @@ -17948,6 +18163,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(fake_changes), MYSQL_SYSVAR(locking_fake_changes), MYSQL_SYSVAR(use_stacktrace), + MYSQL_SYSVAR(compress_pages), + MYSQL_SYSVAR(trim_pct), + MYSQL_SYSVAR(compress_index_pages), + MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(use_lz4), NULL }; diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index 773a9b6b04d..b4df711356c 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ b/storage/xtradb/handler/ha_innodb.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -57,6 +58,21 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; +/** Engine specific table options are definined using this struct */ +struct ha_table_option_struct +{ + bool page_compressed; /*!< Table is using page compression + if this option is true. */ + int page_compression_level; /*!< Table page compression level + or UNIV_UNSPECIFIED. */ + uint atomic_writes; /*!< Use atomic writes for this + table if this options is ON or + in DEFAULT if + srv_use_atomic_writes=1. + Atomic writes are not used if + value OFF.*/ +}; + /** The class defining a handle to an Innodb table */ class ha_innobase: public handler { @@ -184,6 +200,8 @@ class ha_innobase: public handler char* norm_name, char* temp_path, char* remote_path); + const char* check_table_options(THD *thd, TABLE* table, + HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc index 9c535285d1e..24dc1086cc5 100644 --- a/storage/xtradb/handler/handler0alter.cc +++ b/storage/xtradb/handler/handler0alter.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -252,6 +253,22 @@ ha_innobase::check_if_supported_inplace_alter( update_thd(); trx_search_latch_release_if_reserved(prebuilt->trx); + /* Change on engine specific table options require rebuild of the + table */ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION) { + ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct; + ha_table_option_struct *old_options= table->s->option_struct; + + if (new_options->page_compressed != old_options->page_compressed || + new_options->page_compression_level != old_options->page_compression_level || + new_options->atomic_writes != old_options->atomic_writes) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE | INNOBASE_ALTER_NOREBUILD @@ -3372,6 +3389,17 @@ ha_innobase::prepare_inplace_alter_table( if (ha_alter_info->handler_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { + /* Check engine specific table options */ + if (const char* invalid_tbopt = check_table_options( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0, + srv_file_format)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + if (const char* invalid_opt = create_options_are_invalid( user_thd, altered_table, ha_alter_info->create_info, diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h index ba2f413429c..8fedeeaa832 100644 --- a/storage/xtradb/include/buf0buf.h +++ b/storage/xtradb/include/buf0buf.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1489,6 +1490,12 @@ struct buf_page_t{ state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ + + ulint write_size; /* Write size is set when this + page is first time written and then + if written again we check is TRIM + operation needed. */ + #ifndef UNIV_HOTBACKUP buf_page_t* hash; /*!< node used in chaining to buf_pool->page_hash or @@ -2118,6 +2125,20 @@ struct CheckUnzipLRUAndLRUList { }; #endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */ +/*********************************************************************//** +Aquire LRU list mutex */ +void +buf_pool_mutex_enter( +/*=================*/ + buf_pool_t* buf_pool); /*!< in: buffer pool */ +/*********************************************************************//** +Exit LRU list mutex */ +void +buf_pool_mutex_exit( +/*================*/ + buf_pool_t* buf_pool); /*!< in: buffer pool */ + + #ifndef UNIV_NONINL #include "buf0buf.ic" #endif diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h index f4542e7c206..6b2827e77a7 100644 --- a/storage/xtradb/include/buf0flu.h +++ b/storage/xtradb/include/buf0flu.h @@ -36,6 +36,13 @@ Created 11/5/1995 Heikki Tuuri /** Flag indicating if the page_cleaner is in active state. 
*/ extern ibool buf_page_cleaner_is_active; +/** Handled page counters for a single flush */ +struct flush_counters_t { + ulint flushed; /*!< number of dirty pages flushed */ + ulint evicted; /*!< number of clean pages evicted */ +}; + + /********************************************************************//** Remove a block from the flush list of modified blocks. */ UNIV_INTERN diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h index 6669f60b95a..8ab05c50dbd 100644 --- a/storage/xtradb/include/dict0dict.h +++ b/storage/xtradb/include/dict0dict.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,6 +43,8 @@ Created 1/8/1996 Heikki Tuuri #include "ut0byte.h" #include "trx0types.h" #include "row0types.h" +#include "fsp0fsp.h" +#include "dict0pagecompress.h" #ifndef UNIV_HOTBACKUP # include "sync0sync.h" @@ -904,7 +907,14 @@ dict_tf_set( ulint* flags, /*!< in/out: table */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool remote_path) /*!< in: table uses DATA DIRECTORY */ + bool remote_path, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic + writes option value*/ __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index c261d6a3aee..502b1d028d8 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -537,10 +538,27 @@ dict_tf_is_valid( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); ulint unused = DICT_TF_GET_UNUSED(flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags); /* Make sure there are no bits that we do not know about. */ if (unused != 0) { + fprintf(stderr, + "InnoDB: Error: table unused flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + unused, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } else if (atomic_blobs) { @@ -550,12 +568,36 @@ dict_tf_is_valid( data stored off-page in the clustered index. 
*/ if (!compact) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + compact, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } } else if (zip_ssize) { /* Antelope does not support COMPRESSED row format. */ + fprintf(stderr, + "InnoDB: Error: table flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } @@ -568,6 +610,41 @@ dict_tf_is_valid( || !atomic_blobs || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + + ); + return(false); + } + } + + if (page_compression || page_compression_level) { + /* Page compression format must have compact and + atomic_blobs and page_compression_level requires + page_compression */ + if (!compact + || !page_compression + || !atomic_blobs) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } } @@ -594,6 +671,10 @@ dict_sys_tables_type_validate( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same @@ -647,6 +728,24 @@ dict_sys_tables_type_validate( format, so the DATA_DIR flag is compatible with any other table flags. However, it is not used with TEMPORARY tables.*/ + if (page_compression || page_compression_level) { + /* page compressed row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. 
*/ + + if (!atomic_blobs || !page_compression) { + return(ULINT_UNDEFINED); + } + } + + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!atomic_blobs) { + return(ULINT_UNDEFINED); + } + } + /* Return the validated SYS_TABLES.TYPE. */ return(type); } @@ -719,8 +818,16 @@ dict_tf_set( ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ + bool use_data_dir, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic writes setup */ { + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + switch (format) { case REC_FORMAT_REDUNDANT: *flags = 0; @@ -745,6 +852,28 @@ dict_tf_set( if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); } + + if (page_compressed) { + *flags = DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } + + if (awrites != ATOMIC_WRITES_DEFAULT) { + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); + } + + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); + } + } /********************************************************************//** @@ -765,6 +894,9 @@ dict_tf_to_fsp_flags( ulint table_flags) /*!< in: dict_table_t::flags */ { ulint fsp_flags; + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); @@ -783,7 +915,20 @@ dict_tf_to_fsp_flags( fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) ? FSP_FLAGS_MASK_DATA_DIR : 0; + /* In addition, tablespace flags also contain if the page + compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression); + + /* In addition, tablespace flags also contain page compression level + if page compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + + /* In addition, tablespace flags also contain flag if atomic writes + is used for this table */ + fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); + ut_a(fsp_flags_is_valid(fsp_flags)); + ut_a(dict_tf_verify_flags(table_flags, fsp_flags)); return(fsp_flags); } @@ -811,10 +956,15 @@ dict_sys_tables_type_to_tf( /* Adjust bit zero. */ flags = redundant ? 0 : 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. 
*/ flags |= type & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES + ); return(flags); } @@ -842,10 +992,14 @@ dict_tf_to_sys_tables_type( /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ type = 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ type |= flags & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES); return(type); } diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h index bde0ce16094..087fde0ccb7 100644 --- a/storage/xtradb/include/dict0mem.h +++ b/storage/xtradb/include/dict0mem.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -125,11 +126,26 @@ This flag prevents older engines from attempting to open the table and allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_WIDTH_DATA_DIR 1 +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ + DICT_TF_WIDTH_ATOMIC_BLOBS \ - + DICT_TF_WIDTH_DATA_DIR) + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -145,9 +161,18 @@ allows InnoDB to update_create_info() accordingly. */ /** Zero relative shift position of the DATA_DIR field */ #define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ - + DICT_TF_WIDTH_DATA_DIR) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -165,6 +190,18 @@ allows InnoDB to update_create_info() accordingly. 
*/ #define DICT_TF_MASK_DATA_DIR \ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define DICT_TF_MASK_ATOMIC_WRITES \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ + << DICT_TF_POS_ATOMIC_WRITES) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -185,6 +222,19 @@ allows InnoDB to update_create_info() accordingly. */ /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) + +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define DICT_TF_GET_ATOMIC_WRITES(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ + >> DICT_TF_POS_ATOMIC_WRITES) /* @} */ #ifndef UNIV_INNOCHECKSUM diff --git a/storage/xtradb/include/dict0pagecompress.h b/storage/xtradb/include/dict0pagecompress.h new file mode 100644 index 00000000000..19a2a6c52f3 --- /dev/null +++ b/storage/xtradb/include/dict0pagecompress.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. 
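With the widths and positions defined above, the table-flag word lays out as COMPACT (bit 0), ZIP_SSIZE (bits 1-4), ATOMIC_BLOBS (bit 5), DATA_DIR (bit 6), PAGE_COMPRESSION (bit 7), PAGE_COMPRESSION_LEVEL (bits 8-11) and ATOMIC_WRITES (bits 12-13), so DICT_TF_BITS is 14. A minimal round-trip sketch (not part of the patch) using only the macros introduced here:

/* Editorial sketch, not part of the patch: packing and unpacking the new
DICT_TF fields with the shift/mask macros defined above. */
static void
example_dict_tf_round_trip(void)
{
	ulint	flags = DICT_TF_COMPACT
		| (1 << DICT_TF_POS_ATOMIC_BLOBS)
		| (1 << DICT_TF_POS_PAGE_COMPRESSION)
		| (6 << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
		| (ATOMIC_WRITES_ON << DICT_TF_POS_ATOMIC_WRITES);

	ut_ad(DICT_TF_GET_PAGE_COMPRESSION(flags) == 1);
	ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) == 6);
	ut_ad(DICT_TF_GET_ATOMIC_WRITES(flags) == ATOMIC_WRITES_ON);
	ut_ad(DICT_TF_GET_UNUSED(flags) == 0);
}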
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ + __attribute__((const)); + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return true if atomic writes are used, false if not used */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table); /*!< in: table */ + + +#ifndef UNIV_NONINL +#include "dict0pagecompress.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0pagecompress.ic b/storage/xtradb/include/dict0pagecompress.ic new file mode 100644 index 00000000000..fb9581fc657 --- /dev/null +++ b/storage/xtradb/include/dict0pagecompress.ic @@ -0,0 +1,191 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ +{ + ulint table_unused = DICT_TF_GET_UNUSED(table_flags); + ulint compact = DICT_TF_GET_COMPACT(table_flags); + ulint ssize = DICT_TF_GET_ZIP_SSIZE(table_flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(table_flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); + ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); + ulint fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags); + ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags); + ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags); + ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags); + + DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", + return(ULINT_UNDEFINED);); + + ut_ad(!table_unused); + ut_ad(!fsp_unused); + ut_ad(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_ad(compact == 0 || compact == 1); /* silence compiler */ + ut_ad(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_ad(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + + if (ssize != zip_ssize) { + fprintf(stderr, + "InnoDB: Error: table flags has zip_ssize %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has zip_ssize %ld\n", + ssize, zip_ssize); + return (FALSE); + } + if (atomic_blobs != fsp_atomic_blobs) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic_blobs %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_blobs %ld\n", + atomic_blobs, fsp_atomic_blobs); + + return (FALSE); + } + if (page_compression != fsp_page_compression) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file ahas page_compression %ld\n", + page_compression, fsp_page_compression); + + return (FALSE); + } + if (page_compression_level != fsp_page_compression_level) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression_level %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression_level %ld\n", + page_compression_level, fsp_page_compression_level); + + return (FALSE); + } + + if (atomic_writes != fsp_atomic_writes) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic writes %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_writes %ld\n", + atomic_writes, fsp_atomic_writes); + + return (FALSE); + } + + return(TRUE); +} + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. 
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level >= 0 && page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(dict_tf_get_page_compression(table->flags)); + + return(dict_tf_get_page_compression_level(table->flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*=========================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_table_is_page_compressed( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_page_compression(table->flags)); +} + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ +{ + return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags)); +} + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags)); +} diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h index 6acb6a2dcbe..9e210117580 100644 --- a/storage/xtradb/include/dict0types.h +++ b/storage/xtradb/include/dict0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -82,4 +83,12 @@ enum ib_quiesce_t { #define TEMP_TABLE_PREFIX "#sql" #define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX +/** Enum values for atomic_writes table option */ +typedef enum { + ATOMIC_WRITES_DEFAULT = 0, + ATOMIC_WRITES_ON = 1, + ATOMIC_WRITES_OFF = 2 +} atomic_writes_t; + + #endif diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h index 472c57fcbfc..6b69a899690 100644 --- a/storage/xtradb/include/fil0fil.h +++ b/storage/xtradb/include/fil0fil.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -129,6 +130,13 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this contains the space id of the page */ #define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* Following are used when page compression is used */ +#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on + compressed pages. */ +#define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compression algorithm ZLIB. */ +#define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compression algorithm LZ4. */ + /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -139,6 +147,7 @@ extern fil_addr_t fil_addr_null; /* @} */ /** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< Page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ @@ -721,8 +730,8 @@ fil_space_get_n_reserved_extents( Reads or writes data. This operation is asynchronous (aio). @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ -#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message) \ - _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL) +#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size) \ + _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size, NULL) UNIV_INTERN dberr_t _fil_io( @@ -752,7 +761,12 @@ _fil_io( or from where to write; in aio this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync - aio used, else ignored */ + aio used, else ignored */ + ulint* write_size, /*!< in/out: Actual write size initialized + after the first successful trim + operation for this page; once + initialized we do not trim again if + the actual page size does not decrease.
*/ trx_t* trx) __attribute__((nonnull(8))); /**********************************************************************//** @@ -1018,4 +1032,27 @@ fil_space_set_corrupt( /*==================*/ ulint space_id); +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void); +/*==================*/ +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void); +/*==================*/ +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. */ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space); /*!< in: space */ + #endif /* fil0fil_h */ diff --git a/storage/xtradb/include/fil0pagecompress.h b/storage/xtradb/include/fil0pagecompress.h new file mode 100644 index 00000000000..342b105401c --- /dev/null +++ b/storage/xtradb/include/fil0pagecompress.h @@ -0,0 +1,118 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" +#include "fsp0pagecompress.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/*******************************************************************//** +Returns the page compression level flag of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level if page compressed, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. 
+@return atomic write table option value */ +atomic_writes_t +fil_space_get_atomic_writes( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf); /*!< in: page */ + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg); /*!> FSP_FLAGS_POS_UNUSED) +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \ + >> FSP_FLAGS_POS_ATOMIC_WRITES) /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) +/** Set a PAGE_COMPRESSION into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \ + (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION)) + +/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ + (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) +/** Set a ATOMIC_WRITES into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ + (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES)) + /* @} */ /* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic index 0d81e817cc9..bc46967fab0 100644 --- a/storage/xtradb/include/fsp0fsp.ic +++ b/storage/xtradb/include/fsp0fsp.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,6 +64,10 @@ fsp_flags_is_valid( ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); + ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); @@ -108,6 +113,20 @@ fsp_flags_is_valid( # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." 
#endif + /* Page compression level requires page compression and atomic blobs + to be set */ + if (page_compression_level || page_compression) { + if (!page_compression || !atomic_blobs) { + return(false); + } + } + + if ((awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) + && !atomic_blobs) { + return (false); + } + /* The DATA_DIR field can be used for any row type so there is nothing here to validate. */ diff --git a/storage/xtradb/include/fsp0pagecompress.h b/storage/xtradb/include/fsp0pagecompress.h new file mode 100644 index 00000000000..4913f1d6b29 --- /dev/null +++ b/storage/xtradb/include/fsp0pagecompress.h @@ -0,0 +1,73 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef fsp0pagecompress_h +#define fsp0pagecompress_h + +/**********************************************************************//** +Reads the page compression level from the first page of a tablespace. +@return page compression level, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_compression_level( +/*=============================*/ + const page_t* page); /*!< in: first page of a tablespace */ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Extract the page compression level from tablespace flags. +A tablespace has only one physical page compression level +whether that page is compressed or not. +@return page compression level of the file-per-table tablespace, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. 
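On the tablespace side the same information is mirrored into FSP flags: dict_tf_to_fsp_flags() (see dict0dict.ic earlier in this patch) ORs the values in with the FSP_FLAGS_SET_* macros, and fsp_flags_is_valid() above checks the allowed combinations. A minimal round-trip sketch (not part of the patch):

/* Editorial sketch, not part of the patch: storing and reading back page
compression and atomic writes information in tablespace flags. A real
flag word also carries POST_ANTELOPE/ATOMIC_BLOBS, which
dict_tf_to_fsp_flags() sets before these fields. */
static void
example_fsp_flags_round_trip(void)
{
	ulint	fsp_flags = 0;

	fsp_flags = FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, 1);
	fsp_flags = FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, 6);
	fsp_flags = FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, ATOMIC_WRITES_ON);

	ut_ad(FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags) == 1);
	ut_ad(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags) == 6);
	ut_ad(FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags) == ATOMIC_WRITES_ON);
}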
+@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0pagecompress.ic" +#endif + +#endif diff --git a/storage/xtradb/include/fsp0pagecompress.ic b/storage/xtradb/include/fsp0pagecompress.ic new file mode 100644 index 00000000000..873f6cd401d --- /dev/null +++ b/storage/xtradb/include/fsp0pagecompress.ic @@ -0,0 +1,177 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.ic +Implementation for helper functions for extracting/storing page +compression and atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fsp0fsp.h" + + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not page compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Determine the tablespace is page compression level from dict_table_t::flags. +@return page compression level or 0 if not compressed*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); +} + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. 
+@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags)); +} + +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +UNIV_INLINE +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX); +} + +/*******************************************************************//** +Find out wheather the page is page compressed +@return true if page is page compressed, false if not */ +UNIV_INLINE +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); +} + +/*******************************************************************//** +Returns the page compression level of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level, ULINT_UNDEFINED if space not found */ +UNIV_INLINE +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_page_compression_level(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Extract the page compression from space. +@return true if space is page compressed, false if space is not found +or space is not page compressed. */ +UNIV_INLINE +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_is_page_compressed(flags)); + } + + return(flags); +} + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +UNIV_INLINE +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg) /*!space_id, 0, (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + group, 0); srv_stats.os_log_pending_writes.dec(); @@ -1975,7 +1975,7 @@ log_group_checkpoint( write_offset / UNIV_PAGE_SIZE, write_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, - buf, ((byte*) group + 1)); + buf, ((byte*) group + 1), 0); ut_ad(((ulint) group & 0x1UL) == 0); } @@ -2055,7 +2055,7 @@ log_group_read_checkpoint_info( fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0, field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0); } /******************************************************//** @@ -2438,7 +2438,7 @@ loop: fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, (ulint) (source_offset / UNIV_PAGE_SIZE), (ulint) (source_offset % UNIV_PAGE_SIZE), - len, buf, (type == LOG_ARCHIVE) ? &log_archive_io : NULL); + len, buf, (type == LOG_ARCHIVE) ? 
&log_archive_io : NULL, 0); start_lsn += len; buf += len; @@ -2563,7 +2563,7 @@ log_group_archive_file_header_write( dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, 2 * OS_FILE_LOG_BLOCK_SIZE, - buf, &log_archive_io); + buf, &log_archive_io, 0); } /******************************************************//** @@ -2600,7 +2600,7 @@ log_group_archive_completed_header_write( dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf + LOG_FILE_ARCH_COMPLETED, - &log_archive_io); + &log_archive_io, 0); } /******************************************************//** @@ -2663,12 +2663,12 @@ loop: file_handle = os_file_create(innodb_file_log_key, name, open_mode, OS_FILE_AIO, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); if (!ret && (open_mode == OS_FILE_CREATE)) { file_handle = os_file_create( innodb_file_log_key, name, OS_FILE_OPEN, - OS_FILE_AIO, OS_DATA_FILE, &ret); + OS_FILE_AIO, OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -2737,7 +2737,7 @@ loop: (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, - &log_archive_io); + &log_archive_io, 0); start_lsn += len; next_offset += len; diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc index 8c2bc5602a9..2438303043c 100644 --- a/storage/xtradb/log/log0online.cc +++ b/storage/xtradb/log/log0online.cc @@ -547,7 +547,7 @@ log_online_start_bitmap_file(void) log_bmp_sys->out.name, OS_FILE_CREATE, OS_FILE_READ_WRITE, - &success); + &success, FALSE); } if (UNIV_UNLIKELY(!success)) { @@ -707,7 +707,7 @@ log_online_read_init(void) log_bmp_sys->out.file = os_file_create_simple_no_error_handling (innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &success); + OS_FILE_READ_WRITE, &success, FALSE); if (!success) { @@ -1491,7 +1491,7 @@ log_online_open_bitmap_file_read_only( bitmap_file->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, FALSE); if (UNIV_UNLIKELY(!success)) { /* Here and below assume that bitmap file names do not diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc index d0b833f2bba..1772def9f9b 100644 --- a/storage/xtradb/log/log0recv.cc +++ b/storage/xtradb/log/log0recv.cc @@ -2,6 +2,7 @@ Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2131,7 +2132,7 @@ recv_apply_log_recs_for_backup(void) error = fil_io(OS_FILE_READ, true, recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); if (error == DB_SUCCESS && !buf_zip_decompress(block, TRUE)) { exit(1); @@ -2141,7 +2142,7 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } if (error != DB_SUCCESS) { @@ -2170,13 +2171,13 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); } else { error = fil_io(OS_FILE_WRITE, true, recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); @@ -3144,7 +3145,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, LOG_FILE_HDR_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { @@ -3175,7 +3176,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, OS_FILE_LOG_BLOCK_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); } log_hdr_log_block_size @@ -3775,7 +3776,7 @@ try_open_again: file_handle = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, - OS_FILE_LOG, OS_FILE_AIO, &ret); + OS_FILE_LOG, OS_FILE_AIO, &ret, FALSE); if (ret == FALSE) { ask_again: @@ -3827,7 +3828,7 @@ ask_again: /* Read the archive file header */ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0, 0, - LOG_FILE_HDR_SIZE, buf, NULL); + LOG_FILE_HDR_SIZE, buf, NULL, 0); /* Check if the archive file header is consistent */ @@ -3901,7 +3902,7 @@ ask_again: fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, read_offset / UNIV_PAGE_SIZE, - read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0); ret = recv_scan_log_recs( (buf_pool_get_n_pages() diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 38eb5241da1..43adf78c63c 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. 
Those modifications are @@ -42,10 +43,16 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "btr0types.h" #include "trx0trx.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#include "linux/falloc.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -196,6 +203,28 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated for + page compressed page and + freed after the write + has been completed */ + + ibool page_compression; + ulint page_compression_level; + + ulint* write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + #ifdef LINUX_NATIVE_AIO struct iocb control; /* Linux control block for aio */ int n_bytes; /* bytes written/read. */ @@ -301,6 +330,58 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len); /*!< in: length of area */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. 
+@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name, /*!< in: name of the file */ + os_file_t file); /*!< in: handle to the file */ + #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP /**********************************************************************//** @@ -537,6 +618,16 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED) { + fprintf(stderr, + "InnoDB: Operation canceled (%d):%s\n", + err, strerror(err)); + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -633,6 +724,8 @@ os_file_get_last_error_low( return(OS_FILE_AIO_RESOURCES_RESERVED); } break; + case ECANCELED: + return(OS_FILE_OPERATION_NOT_SUPPORTED); case EINTR: if (srv_use_native_aio) { return(OS_FILE_AIO_INTERRUPTED); @@ -672,9 +765,11 @@ os_file_handle_error_cond_exit( const char* operation, /*!< in: operation */ ibool should_exit, /*!< in: call exit(3) if unknown error and this parameter is TRUE */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log iff it is an unknown non-fatal error */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { ulint err; @@ -706,6 +801,9 @@ os_file_handle_error_cond_exit( os_has_said_disk_full = TRUE; + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + fflush(stderr); return(FALSE); @@ -737,6 +835,9 @@ os_file_handle_error_cond_exit( is better to ignore on_error_silent and print an error message to the log. */ + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + if (should_exit || !on_error_silent) { ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " "error " ULINTPF ".%s", name ? name : "(unknown)", @@ -760,10 +861,12 @@ ibool os_file_handle_error( /*=================*/ const char* name, /*!< in: name of a file or NULL */ - const char* operation) /*!< in: operation */ + const char* operation, /*!< in: operation */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* exit in case of unknown error */ - return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line)); } /****************************************************************//** @@ -775,12 +878,14 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. 
*/ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -923,7 +1028,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -934,7 +1039,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -956,7 +1061,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -968,7 +1073,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -1040,7 +1145,7 @@ next_file: return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1126,7 +1231,7 @@ next_file: goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1177,7 +1282,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1190,7 +1295,7 @@ os_file_create_directory( if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1300,7 +1405,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN ? - "open" : "create"); + "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; @@ -1368,7 +1473,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? "open" : "create"); + ? "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; retry = false; @@ -1410,9 +1515,12 @@ os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! 
in: atomic writes table option + value */ { os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; *success = FALSE; #ifdef __WIN__ @@ -1473,6 +1581,15 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + CloseHandle(file); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ int create_flag; @@ -1533,6 +1650,15 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + *success = FALSE; + close(file); + file = -1; + } + #endif /* __WIN__ */ return(file); @@ -1602,7 +1728,7 @@ os_file_set_atomic_writes( if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { - os_file_handle_error_no_exit(name, "ioctl", FALSE); + os_file_handle_error_no_exit(name, "ioctl(DFS_IOCTL_ATOMIC_WRITE_SET)", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1636,12 +1762,15 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; ibool retry; ibool on_error_no_exit; ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; #ifdef __WIN__ DBUG_EXECUTE_IF( @@ -1784,9 +1913,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1795,8 +1924,10 @@ os_file_create_func( } while (retry); - if (srv_use_atomic_writes && type == OS_DATA_FILE && - !os_file_set_atomic_writes(name, file)) { + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { CloseHandle(file); *success = FALSE; file = INVALID_HANDLE_VALUE; @@ -1876,9 +2007,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1932,14 +2063,16 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ - if (srv_use_atomic_writes && type == OS_DATA_FILE + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - *success = FALSE; close(file); file = -1; } + #endif /* __WIN__ */ return(file); @@ -1998,7 +2131,7 @@ loop: ret = unlink(name); if (ret != 0 && errno != ENOENT) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2062,7 +2195,7 @@ loop: ret = unlink(name); if 
(ret != 0) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2106,7 +2239,7 @@ os_file_rename_func( return(TRUE); } - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); #else @@ -2115,7 +2248,7 @@ os_file_rename_func( ret = rename(oldpath, newpath); if (ret != 0) { - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2146,7 +2279,7 @@ os_file_close_func( return(TRUE); } - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); #else @@ -2155,7 +2288,7 @@ os_file_close_func( ret = close(file); if (ret == -1) { - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); } @@ -2247,6 +2380,12 @@ os_file_set_size( current_size = 0; +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: File %s current_size %lu extended_size %lu\n", + name, os_file_get_size(file), size); +#endif + + #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { @@ -2257,7 +2396,7 @@ os_file_set_size( INT64PF ", desired size " INT64PF "\n", name, current_size, size); os_file_handle_error_no_exit (name, "posix_fallocate", - FALSE); + FALSE, __FILE__, __LINE__); return(FALSE); } return(TRUE); @@ -2446,7 +2585,7 @@ os_file_flush_func( return(TRUE); } - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2500,7 +2639,7 @@ os_file_flush_func( ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2855,6 +2994,9 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len); + } return(TRUE); } #else /* __WIN__ */ @@ -2868,6 +3010,10 @@ try_again: if ((ulint) ret == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n); + } + return(TRUE); } @@ -2875,7 +3021,7 @@ try_again: "Tried to read "ULINTPF" bytes at offset " UINT64PF". 
" "Was only able to read %ld.", n, offset, (lint) ret); #endif /* __WIN__ */ - retry = os_file_handle_error(NULL, "read"); + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); if (retry) { goto try_again; @@ -2968,10 +3114,14 @@ try_again: if ((ulint) ret == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n); + } + return(TRUE); } #endif /* __WIN__ */ - retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); if (retry) { goto try_again; @@ -3183,7 +3333,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3211,7 +3361,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3260,7 +3410,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3313,7 +3463,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3866,7 +4016,7 @@ os_aio_array_create( array->slots = static_cast( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); #if defined(LINUX_NATIVE_AIO) array->aio_ctx = NULL; @@ -3941,6 +4091,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { + ulint i; + os_mutex_free(array->mutex); os_event_free(array->not_full); os_event_free(array->is_empty); @@ -3952,6 +4104,14 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4296,7 +4456,16 @@ os_aio_array_reserve_slot( to write */ os_offset_t offset, /*!< in: file offset */ ulint len, /*!< in: length of the block to read or write */ - ulint space_id) + ulint space_id, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4388,6 +4557,55 @@ found: slot->io_already_done = FALSE; slot->space_id = space_id; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + + /* If the space is page compressed and this is write operation + and if either only index pages compression is disabled or + page is index page and only index pages compression is enabled then + we compress the page */ + if (message1 && + type == OS_FILE_WRITE && + page_compression && + (srv_page_compress_index_pages == false || + (srv_page_compress_index_pages == true && fil_page_is_index_page(slot->buf)))) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + + ut_ad(slot->page_buf); + + /* Write buffer full of zeros, this is needed for trim, + can't really avoid this now. */ + memset(slot->page_buf, 0, len); + + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back */ + os_mutex_enter(array->mutex); + + } + #ifdef WIN_ASYNC_IO control = &slot->control; control->Offset = (DWORD) offset & 0xFFFFFFFF; @@ -4663,7 +4881,16 @@ os_aio_func( aio operation); ignored if mode is OS_AIO_SYNC */ ulint space_id, - trx_t* trx) + trx_t* trx, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4686,7 +4913,7 @@ os_aio_func( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); - if (mode == OS_AIO_SYNC) + if (mode == OS_AIO_SYNC) { ibool ret; /* This is actually an ordinary synchronous read or write: @@ -4753,7 +4980,8 @@ try_again: trx->io_read += n; } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n, space_id); + name, buf, offset, n, space_id, + page_compression, page_compression_level, write_size); if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4811,7 +5039,7 @@ err_exit: os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? 
"aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4911,7 +5139,7 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { retry = TRUE; } else { @@ -4939,11 +5167,17 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret_val = os_file_write(slot->name, slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + if (slot->message1 && page_compression && slot->page_buf) { + ret_val = os_file_write(slot->name, slot->file, slot->page_buf, + slot->control.Offset, slot->control.OffsetHigh, slot->len); + } else { + + ret_val = os_file_write(slot->name, slot->file, slot->buf, + slot->control.Offset, slot->control.OffsetHigh, slot->len); + } break; case OS_FILE_READ: - ret_val = os_file_read(slot->file, slot->buf, + ret_val = os_file_read(slot->file, slot->buf, slot->control.Offset, slot->control.OffsetHigh, slot->len); break; default: @@ -4969,6 +5203,28 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->message1 && page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len); + } + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot); return(ret_val); @@ -5058,6 +5314,33 @@ retry: /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. */ + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len); + } + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -5203,6 +5486,13 @@ found: } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5210,7 +5500,7 @@ found: windows and linux native AIO. We should probably look into this to transparently re-submit the IO. 
*/ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5884,3 +6174,162 @@ os_aio_all_slots_free(void) #endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + +#ifdef _WIN32 +#include +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len) /*!< in: length of area */ +{ + + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. + if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + *slot->write_size, trim_len, len); +#endif + + if (*slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + *slot->write_size = len; + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lx len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", ret, (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + slot->write_size = NULL; + +#endif /* HAVE_FALLOCATE ... 
*/ + +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(file,FSCTL_FILE_LEVEL_TRIM,&flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lx len: %du payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + slot->write_size = len; + } + } +#endif + +#define SECT_SIZE 512 + srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8))); + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_compression_page = static_cast(cbuf2); + slot->page_buf = static_cast(cbuf); +} diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc index d98315ae9a2..0b5556ab61a 100644 --- a/storage/xtradb/srv/srv0mon.cc +++ b/storage/xtradb/srv/srv0mon.cc @@ -290,6 +290,12 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN}, + {"buffer_index_pages_written", "buffer", + "Number of index pages written (innodb_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read (innodb_pages_read)", static_cast( @@ -879,6 +885,41 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + {"compress_saved", "compression", + "Number of bytes saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED}, + + {"compress_trim_sect512", "compression", + "Number of sect-512 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512}, + + {"compress_trim_sect4096", "compression", + "Number of sect-4K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096}, + + {"compress_pages_page_compressed", "compression", + "Number of pages compressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED}, + + {"compress_page_compressed_trim_op", "compression", + "Number of TRIM operation performed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP}, + + {"compress_page_compressed_trim_op_saved", "compression", + "Number of TRIM operation saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED}, + + 
{"compress_pages_page_decompressed", "compression", + "Number of pages decompressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1532,6 +1573,11 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; + /* innodb_index_pages_written, the number of page written */ + case MONITOR_OVLD_INDEX_PAGES_WRITTEN: + value = srv_stats.index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); @@ -1773,6 +1819,28 @@ srv_mon_process_existing_counter( value = btr_cur_n_non_sea; break; + case MONITOR_OVLD_PAGE_COMPRESS_SAVED: + value = srv_stats.page_compression_saved; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512: + value = srv_stats.page_compression_trim_sect512; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096: + value = srv_stats.page_compression_trim_sect4096; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSED: + value = srv_stats.pages_page_compressed; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP: + value = srv_stats.page_compressed_trim_op; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED: + value = srv_stats.page_compressed_trim_op_saved; + break; + case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: + value = srv_stats.pages_page_decompressed; + break; + default: ut_error; } diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index 953bbba11f7..92acf847ca1 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -3,6 +3,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -160,6 +161,26 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +/* If this flag is TRUE, then we will use page compression +to the pages */ +UNIV_INTERN my_bool srv_compress_pages = FALSE; +/* If this flag is TRUE, then we will use page compression +only for index pages */ +UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; +UNIV_INTERN long srv_trim_pct = 100; +/* Default compression level if page compression is used and no compression +level is set for the table*/ +UNIV_INTERN long srv_compress_zlib_level = 6; +/* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) +to the pages */ +UNIV_INTERN my_bool srv_use_trim = TRUE; +/* If this flag is TRUE, then we will use posix fallocate for file extentsion */ +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +/* If this flag is TRUE, then we disable doublewrite buffer */ +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +UNIV_INTERN my_bool srv_use_lz4 = FALSE; + #ifdef __WIN__ /* Windows native condition variables. 
We use runtime loading / function pointers, because they are not available on Windows Server 2003 and @@ -454,10 +475,6 @@ UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20; UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE; UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE; -UNIV_INTERN ibool srv_use_atomic_writes = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -UNIV_INTERN ibool srv_use_posix_fallocate = FALSE; -#endif /** doublewrite buffer is 1MB is size i.e.: it can hold 128 16K pages. The following parameter is the size of the buffer that is used for @@ -493,6 +510,15 @@ static ulint srv_n_rows_read_old = 0; UNIV_INTERN ulint srv_truncated_status_writes = 0; UNIV_INTERN ulint srv_available_undo_logs = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; +UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; +UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; + /* Ensure status variables are on separate cache lines */ #define CACHE_LINE_SIZE 64 @@ -1835,6 +1861,15 @@ srv_export_innodb_status(void) export_vars.innodb_descriptors_memory = os_atomic_increment_ulint(&srv_descriptors_memory, 0); + export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved; + export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; + export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; + export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; + export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; + export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; + export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; + #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); trx_id_t done_trx_no = purge_sys->done.trx_no; diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 3ddfd9ab3a4..faad8c3c133 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -3,6 +3,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -64,6 +65,8 @@ Created 2/16/1996 Heikki Tuuri #include "ibuf0ibuf.h" #include "srv0start.h" #include "srv0srv.h" +#include "buf0flu.h" + #ifndef UNIV_HOTBACKUP # include "trx0rseg.h" # include "os0proc.h" @@ -128,8 +131,14 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 - + SRV_MAX_N_PURGE_THREADS]; +/* + static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 +/ + SRV_MAX_N_PURGE_THREADS]; +*/ +/** pgcomp_thread are 16 total */ +#define START_PGCOMP_CNT (SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS) +#define PGCOMP_MAX_WORKER 16 +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS + PGCOMP_MAX_WORKER]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -537,7 +546,7 @@ create_log_file( *file = os_file_create( innodb_file_log_key, name, OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name); @@ -754,7 +763,7 @@ open_log_file( *file = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); return(DB_ERROR); @@ -845,7 +854,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode) { @@ -888,7 +897,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -921,17 +930,17 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else if (i == 0) { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -1122,7 +1131,7 @@ srv_undo_tablespace_create( innodb_file_data_key, name, srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode && ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -1209,7 +1218,8 @@ srv_undo_tablespace_open( | OS_FILE_ON_ERROR_SILENT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + FALSE); /* If the file open was successful then load the tablespace. 
*/ @@ -1503,6 +1513,694 @@ init_log_online(void) } } +/* JAN: TODO: */ +/**********************************************************************************/ +extern int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time); +extern ibool buf_flush_start(buf_pool_t* buf_pool, buf_flush_t flush_type); +extern void buf_flush_end(buf_pool_t* buf_pool, buf_flush_t flush_type); +extern void buf_flush_common(buf_flush_t flush_type, ulint page_count); +extern ulint buf_flush_batch(buf_pool_t* buf_pool, buf_flush_t flush_type, ulint min_n, lsn_t lsn_limit, bool limited_lru_scan, flush_counters_t*); + +typedef enum wrk_status { + WRK_ITEM_SET=0, + WRK_ITEM_START=1, + WRK_ITEM_DONE=2, + WRK_ITEM_SUCCESS=2, + WRK_ITEM_FAILED=3, + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +typedef enum wthr_status { + WTHR_NOT_INIT=0, + WTHR_INITIALIZED=1, + WTHR_SIG_WAITING=2, + WTHR_RUNNING=3, + WTHR_NO_WORK=4, + WTHR_KILL_IT=5, + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +typedef struct wrk_itm +{ + /****************************/ + /* Need to group into struct*/ + buf_pool_t* buf_pool; //buffer-pool instance + int flush_type; //flush-type for buffer-pool flush operation + int min; //minimum number of pages requested to be flushed + unsigned long long lsn_limit; //lsn limit for the buffer-pool flush operation + /****************************/ + + unsigned long result; //flush pages count + unsigned long t_usec; //time-taken in usec + long id_usr; //thread-id currently working + wrk_status_t wi_status; //flag + struct wrk_itm *next; +} wrk_t; + +typedef enum op_q_status { + Q_NOT_INIT=0, + Q_EMPTY=1, + Q_INITIALIZED=2, + Q_PROCESS=3, + Q_DONE=4, + Q_ERROR=5, + Q_STATUS_UNDEFINED +} q_status_t; + +typedef struct op_queue +{ + pthread_mutex_t mtx; + pthread_cond_t cv; + q_status_t flag; + wrk_t *head; + wrk_t *tail; +} opq_t; + +opq_t wq, cq; + +typedef struct thread_sync +{ + int wthread_id; + pthread_t wthread; + opq_t *wq; + opq_t *cq; + wthr_status_t wt_status; + unsigned long stat_universal_num_processed; + unsigned long stat_cycle_num_processed; +} thread_sync_t; + +/* Global XXX:DD needs to be cleaned */ +int exit_flag; +ulint check_wrk_done_count; +static ulint done_cnt_flag; +static int pgc_n_threads = 8; + +thread_sync_t pc_sync[PGCOMP_MAX_WORKER]; +static wrk_t work_items[PGCOMP_MAX_WORKER]; +static int pgcomp_wrk_initialized = -1; + +int set_check_done_flag_count(int cnt) +{ + return(check_wrk_done_count = cnt); +} + +int set_pgcomp_wrk_init_done(void) +{ + pgcomp_wrk_initialized = 1; + return 0; +} + +int is_pgcomp_wrk_init_done(void) +{ + return(pgcomp_wrk_initialized == 1); +} + +ulint set_done_cnt_flag(ulint val) +{ + /* + * Assumption: The thread calling into set_done_cnt_flag + * needs to have "cq.mtx" acquired, else not safe. 
+ */ + done_cnt_flag = val; + return done_cnt_flag; +} + + +ulint cv_done_inc_flag_sig(thread_sync_t * ppc) +{ + pthread_mutex_lock(&ppc->cq->mtx); + ppc->stat_universal_num_processed++; + ppc->stat_cycle_num_processed++; + done_cnt_flag++; + if(!(done_cnt_flag <= check_wrk_done_count)) { + fprintf(stderr, "ERROR: done_cnt:%lu check_wrk_done_count:%lu\n", + done_cnt_flag, check_wrk_done_count); + } + assert(done_cnt_flag <= check_wrk_done_count); + pthread_mutex_unlock(&ppc->cq->mtx); + if(done_cnt_flag == check_wrk_done_count) { + ppc->wq->flag = Q_DONE; + pthread_mutex_lock(&ppc->cq->mtx); + ppc->cq->flag = Q_DONE; + pthread_cond_signal(&ppc->cq->cv); + pthread_mutex_unlock(&ppc->cq->mtx); + } + return(done_cnt_flag); +} + +int q_remove_wrk(opq_t *q, wrk_t **wi) +{ + int ret = 0; + + if(!wi || !q) { + return -1; + } + + pthread_mutex_lock(&q->mtx); + assert(!((q->tail == NULL) && (q->head != NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* get the first in the list*/ + *wi = q->head; + if(q->head) { + ret = 0; + q->head = q->head->next; + (*wi)->next = NULL; + if(!q->head) { + q->tail = NULL; + } + } else { + q->tail = NULL; + ret = 1; /* indicating remove from queue failed */ + } + pthread_mutex_unlock(&q->mtx); + return (ret); +} + +int is_busy_wrk_itm(wrk_t *wi) +{ + if(!wi) { + return -1; + } + return(!(wi->id_usr == -1)); +} + +int setup_wrk_itm(int items) +{ + int i; + for(i=0; imtx, NULL); + pthread_cond_init(&q->cv, NULL); + q->flag = Q_INITIALIZED; + q->head = q->tail = NULL; + + return 0; +} + +#if 0 +int drain_cq(opq_t *cq, int items) +{ + int i=0; + + if(!cq) { + return -1; + } + pthread_mutex_lock(&cq->mtx); + for(i=0; ihead = cq->tail = NULL; + pthread_mutex_unlock(&cq->mtx); + return 0; +} +#endif + +int q_insert_wrk_list(opq_t *q, wrk_t *w_list) +{ + if((!q) || (!w_list)) { + fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list); + return -1; + } + + pthread_mutex_lock(&q->mtx); + + assert(!((q->tail == NULL) && (q->head != NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* list is empty */ + if(!q->tail) { + q->head = q->tail = w_list; + } else { + /* added the first of the node to list */ + assert(q->head != NULL); + q->tail->next = w_list; + } + + /* move tail to the last node */ + while(q->tail->next) { + q->tail = q->tail->next; + } + pthread_mutex_unlock(&q->mtx); + + return 0; +} + +int flush_pool_instance(wrk_t *wi) +{ + struct timeval p_start_time, p_end_time, d_time; + flush_counters_t n; + + if(!wi) { + fprintf(stderr, "work item invalid wi:%p\n", wi); + return -1; + } + + wi->t_usec = 0; + if (!buf_flush_start(wi->buf_pool, (buf_flush_t)wi->flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ + fprintf(stderr, "flush_start Failed, flush_type:%d\n", + (buf_flush_t)wi->flush_type); + return -1; + } + +#ifdef UNIV_DEBUG + /* Record time taken for the OP in usec */ + gettimeofday(&p_start_time, 0x0); +#endif + + if((buf_flush_t)wi->flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. 
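+		 * The buffer pool mutex is held only while the LRU list
+		 * length is read; the flush batch below runs after it has
+		 * been released.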
+ */ + buf_pool_mutex_enter(wi->buf_pool); + wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU); + buf_pool_mutex_exit(wi->buf_pool); + wi->min = ut_min(srv_LRU_scan_depth,wi->min); + } + + buf_flush_batch(wi->buf_pool, + (buf_flush_t)wi->flush_type, + wi->min, wi->lsn_limit, false, &n); + + wi->result = n.flushed; + + buf_flush_end(wi->buf_pool, (buf_flush_t)wi->flush_type); + buf_flush_common((buf_flush_t)wi->flush_type, wi->result); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); +#endif + + return 0; +} + +int service_page_comp_io(thread_sync_t * ppc) +{ + wrk_t *wi = NULL; + int ret=0; + + pthread_mutex_lock(&ppc->wq->mtx); + do{ + ppc->wt_status = WTHR_SIG_WAITING; + ret = pthread_cond_wait(&ppc->wq->cv, &ppc->wq->mtx); + ppc->wt_status = WTHR_RUNNING; + if(ret == ETIMEDOUT) { + fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%lu] ret:%d\n", + done_cnt_flag, ret); + } else if(ret == EINVAL || ret == EPERM) { + fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%lu] ret:%d\n", + done_cnt_flag, ret); + } + if(ppc->wq->flag == Q_PROCESS) { + break; + } else { + pthread_mutex_unlock(&ppc->wq->mtx); + return -1; + } + } while (ppc->wq->flag == Q_PROCESS && ret == 0); + + pthread_mutex_unlock(&ppc->wq->mtx); + + while (ppc->cq->flag == Q_PROCESS) { + wi = NULL; + /* Get the work item */ + if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) { + ppc->wt_status = WTHR_NO_WORK; + return -1; + } + + assert(ret==0); + assert(wi != NULL); + assert(0 == is_busy_wrk_itm(wi)); + assert(wi->id_usr == -1); + + wi->id_usr = ppc->wthread; + wi->wi_status = WRK_ITEM_START; + + /* Process work item */ + if(0 != (ret = flush_pool_instance(wi))) { + fprintf(stderr, "FLUSH op failed ret:%d\n", ret); + wi->wi_status = WRK_ITEM_FAILED; + } + + ret = q_insert_wrk_list(ppc->cq, wi); + + assert(0==ret); + assert(check_wrk_done_count >= done_cnt_flag); + wi->wi_status = WRK_ITEM_SUCCESS; + if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) { + break; + } + } + return(0); +} + +/******************************************************************//** +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(page_comp_io_thread)( +/*==========================================*/ + void * arg) +{ + thread_sync_t *ppc_io = ((thread_sync_t *)arg); + + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + service_page_comp_io(ppc_io); + ppc_io->stat_cycle_num_processed = 0; + } + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +int print_queue_wrk_itm(opq_t *q) +{ +#if UNIV_DEBUG + wrk_t *wi = NULL; + + if(!q) { + fprintf(stderr, "queue NULL\n"); + return -1; + } + + if(!q->head || !q->tail) { + assert(!(((q->tail==NULL) && (q->head!=NULL)) && ((q->tail != NULL) && (q->head == NULL)))); + fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail); + return 0; + } + + pthread_mutex_lock(&q->mtx); + for(wi = q->head; (wi != NULL) ; wi = wi->next) { + //fprintf(stderr, "- [%p] %p %lu %luus [%ld] >%p\n", + // wi, wi->buf_pool, wi->result, wi->t_usec, wi->id_usr, wi->next); + fprintf(stderr, "- [%p] [%s] >%p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->next); + } + pthread_mutex_unlock(&q->mtx); +#endif + return(0); +} + +int print_wrk_list(wrk_t *wi_list) +{ + wrk_t *wi = wi_list; + int i=0; + + if(!wi_list) { + fprintf(stderr, "list NULL\n"); + } + + while(wi) { + fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->result, 
wi->t_usec, wi->next); + wi = wi->next; + i++; + } + fprintf(stderr, "list len: %d\n", i); + return 0; +} + +int pgcomp_handler(wrk_t *w_list) +{ + int ret=0; + opq_t *wrk_q=NULL, *comp_q=NULL; + + wrk_q=&wq; + comp_q=&cq; + + pthread_mutex_lock(&wrk_q->mtx); + /* setup work queue here.. */ + wrk_q->flag = Q_EMPTY; + pthread_mutex_unlock(&wrk_q->mtx); + + ret = q_insert_wrk_list(wrk_q, w_list); + if(ret != 0) { + fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n", + __FUNCTION__, &wq, w_list); + return -1; + } + +retry_submit: + pthread_mutex_lock(&wrk_q->mtx); + /* setup work queue here.. */ + wrk_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&wrk_q->mtx); + + + pthread_mutex_lock(&comp_q->mtx); + if(0 != set_done_cnt_flag(0)) { + fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__); + pthread_mutex_unlock(&comp_q->mtx); + return -1; + } + comp_q->flag = Q_PROCESS; + pthread_mutex_unlock(&comp_q->mtx); + + /* if threads are waiting request them to start */ + pthread_mutex_lock(&wrk_q->mtx); + wrk_q->flag = Q_PROCESS; + pthread_cond_broadcast(&wrk_q->cv); + pthread_mutex_unlock(&wrk_q->mtx); + + /* Wait on all worker-threads to complete */ + pthread_mutex_lock(&comp_q->mtx); + if (comp_q->flag != Q_DONE) { + do { + pthread_cond_wait(&comp_q->cv, &comp_q->mtx); + if(comp_q->flag != Q_DONE) { + fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + } + continue; + } else if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&comp_q->mtx); + goto retry_submit; + + assert(!done_cnt_flag); + continue; + } + assert(done_cnt_flag == srv_buf_pool_instances); + + if ((comp_q->flag == Q_DONE) && + (done_cnt_flag == srv_buf_pool_instances)) { + break; + } + } while((comp_q->flag == Q_INITIALIZED) && + (done_cnt_flag != srv_buf_pool_instances)); + } else { + fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + if (!done_cnt_flag) { + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&comp_q->mtx); + goto retry_submit; + assert(!done_cnt_flag); + } + assert(done_cnt_flag == srv_buf_pool_instances); + } + + pthread_mutex_unlock(&comp_q->mtx); + pthread_mutex_lock(&wrk_q->mtx); + wrk_q->flag = Q_DONE; + pthread_mutex_unlock(&wrk_q->mtx); + + return 0; +} + +/******************************************************************//** +@return a dummy parameter*/ +int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq) +{ + int i=0; + + if(is_pgcomp_wrk_init_done()) { + fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n"); + return -1; + } + + if(!wq || !cq) { + fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq); + return -1; + } + + /* work-item setup */ + setup_wrk_itm(wrk_cnt); + + /* wq & cq setup */ + init_queue(wq); + init_queue(cq); + + /* Mark each of the thread sync entires */ + for(i=0; i < PGCOMP_MAX_WORKER; i++) { + pc_sync[i].wthread_id = i; + } + + /* Create threads 
for page-compression-flush */ + for(i=0; i < num_threads; i++) { + pc_sync[i].wthread_id = i; + pc_sync[i].wq = wq; + pc_sync[i].cq = cq; + os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), + thread_ids + START_PGCOMP_CNT + i); + //pc_sync[i].wthread = thread_ids[START_PGCOMP_CNT + i]; + pc_sync[i].wthread = (START_PGCOMP_CNT + i); + pc_sync[i].wt_status = WTHR_INITIALIZED; + } + + set_check_done_flag_count(wrk_cnt); + set_pgcomp_wrk_init_done(); + + return 0; +} + + +int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads) +{ + long stat_tot=0; + unsigned int i=0; + for(i=0; i< num_threads;i++) { + stat_tot+=wthr[i].stat_universal_num_processed; + fprintf(stderr, "[%d] stat [%lu]\n", wthr[i].wthread_id, + wthr[i].stat_universal_num_processed); + } + fprintf(stderr, "Stat-Total:%lu\n", stat_tot); + return (0); +} + +int reset_wrk_itm(int items) +{ + int i; + + pthread_mutex_lock(&wq.mtx); + wq.head = wq.tail = NULL; + pthread_mutex_unlock(&wq.mtx); + + pthread_mutex_lock(&cq.mtx); + for(i=0;i Date: Tue, 4 Feb 2014 14:52:02 +0200 Subject: [PATCH 10/56] Fixed issue on atomic writes on startup, removed incorrect assert. Fixed issue on file space extend when posix_fallocate is used. Merged second iteration of multi-threaded flush code. --- .../r/innodb_monitor_disable_basic.result | 8 + storage/innobase/buf/buf0flu.cc | 122 +--- storage/innobase/fil/fil0fil.cc | 1 - storage/innobase/include/dict0dict.ic | 4 + storage/innobase/include/srv0srv.h | 4 + storage/innobase/srv/srv0start.cc | 670 +++++------------ storage/xtradb/buf/buf0flu.cc | 120 +--- storage/xtradb/fil/fil0fil.cc | 51 +- storage/xtradb/include/dict0dict.ic | 1 - storage/xtradb/include/srv0srv.h | 4 + storage/xtradb/srv/srv0start.cc | 675 +++++------------- 11 files changed, 515 insertions(+), 1145 deletions(-) diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result index ce57dbb2fdc..78d294e5f09 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result @@ -37,6 +37,7 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +161,13 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled index_splits disabled index_merges disabled adaptive_hash_searches disabled diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index d159ddbe23f..ff1fab6eae7 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -46,6 +46,7 @@ Created 11/11/1995 Heikki Tuuri #include "ibuf0ibuf.h" #include "log0log.h" #include "os0file.h" +#include "os0sync.h" #include "trx0sys.h" #include "srv0mon.h" #include "mysql/plugin.h" @@ -1934,11 +1935,16 @@ buf_flush_LRU( /* JAN: TODO: */ /*******************************************************************//**/ extern int is_pgcomp_wrk_init_done(void); -extern int pgcomp_flush_work_items(int buf_pool_inst, int 
*pages_flushed, - int flush_type, int min_n, unsigned long long lsn_limit); +extern int pgcomp_flush_work_items( + int buf_pool_inst, + int *pages_flushed, + enum buf_flush flush_type, + int min_n, + lsn_t lsn_limit); #define MT_COMP_WATER_MARK 50 +#ifdef UNIV_DEBUG #include int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time) { @@ -1959,8 +1965,15 @@ int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_t return 0; } +#endif + +static os_fast_mutex_t pgcomp_mtx; + +void pgcomp_init(void) +{ + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &pgcomp_mtx); +} -static pthread_mutex_t pgcomp_mtx = PTHREAD_MUTEX_INITIALIZER; /*******************************************************************//** Multi-threaded version of buf_flush_list */ @@ -1983,7 +1996,10 @@ pgcomp_buf_flush_list( { ulint i; bool success = true; +#ifdef UNIV_DEBUG struct timeval p_start_time, p_end_time, d_time; +#endif + int cnt_flush[MTFLUSH_MAX_WORKER]; if (n_processed) { *n_processed = 0; @@ -2001,96 +2017,34 @@ pgcomp_buf_flush_list( #ifdef UNIV_DEBUG gettimeofday(&p_start_time, 0x0); #endif - if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)) { - int cnt_flush[32]; + os_fast_mutex_lock(&pgcomp_mtx); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&pgcomp_mtx); - //stack_trace(); - pthread_mutex_lock(&pgcomp_mtx); - //gettimeofday(&p_start_time, 0x0); - //fprintf(stderr, "Calling into wrk-pgcomp [min:%lu]", min_n); - pgcomp_flush_work_items(srv_buf_pool_instances, - cnt_flush, BUF_FLUSH_LIST, - min_n, lsn_limit); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (n_processed) { - *n_processed += cnt_flush[i]; - } - if (cnt_flush[i]) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - cnt_flush[i]); - - } - } - - pthread_mutex_unlock(&pgcomp_mtx); - -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); -#endif - return(success); - } - /* Flush to lsn_limit in all buffer pool instances */ for (i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool; - ulint page_count = 0; - - buf_pool = buf_pool_from_array(i); - - if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { - /* We have two choices here. If lsn_limit was - specified then skipping an instance of buffer - pool means we cannot guarantee that all pages - up to lsn_limit has been flushed. We can - return right now with failure or we can try - to flush remaining buffer pools up to the - lsn_limit. We attempt to flush other buffer - pools based on the assumption that it will - help in the retry which will follow the - failure. 
*/ - success = false; - - continue; - } - - page_count = buf_flush_batch( - buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit); - - buf_flush_end(buf_pool, BUF_FLUSH_LIST); - - buf_flush_common(BUF_FLUSH_LIST, page_count); - if (n_processed) { - *n_processed += page_count; + *n_processed += cnt_flush[i]; } - - if (page_count) { + if (cnt_flush[i]) { MONITOR_INC_VALUE_CUMULATIVE( MONITOR_FLUSH_BATCH_TOTAL_PAGE, MONITOR_FLUSH_BATCH_COUNT, MONITOR_FLUSH_BATCH_PAGES, - page_count); + cnt_flush[i]); } } - -#if UNIV_DEBUG +#ifdef UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); - - fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu %llu usec]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); #endif return(success); } -#endif + /* JAN: TODO: END: */ /*******************************************************************//** @@ -2292,18 +2246,21 @@ ulint pgcomp_buf_flush_LRU_tail(void) /*====================*/ { +#ifdef UNIV_DEBUG struct timeval p_start_time, p_end_time, d_time; +#endif ulint total_flushed=0, i=0; int cnt_flush[32]; -#if UNIV_DEBUG +#ifdef UNIV_DEBUG gettimeofday(&p_start_time, 0x0); #endif - assert(is_pgcomp_wrk_init_done()); + ut_ad(is_pgcomp_wrk_init_done()); - pthread_mutex_lock(&pgcomp_mtx); + os_fast_mutex_lock(&pgcomp_mtx); pgcomp_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&pgcomp_mtx); for (i = 0; i < srv_buf_pool_instances; i++) { if (cnt_flush[i]) { @@ -2317,8 +2274,6 @@ pgcomp_buf_flush_LRU_tail(void) } } - pthread_mutex_unlock(&pgcomp_mtx); - #if UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); @@ -2894,6 +2849,7 @@ buf_flush_validate( } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ #ifdef UNIV_DEBUG diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 3803d0a93aa..2430df2b386 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1280,7 +1280,6 @@ fil_space_create( DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); ut_a(fil_system); - ut_a(fsp_flags_is_valid(flags)); /* Look for a matching tablespace and if found free it. 
*/ do { diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index f9d548681a8..ed891a00fd4 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -859,6 +859,10 @@ dict_tf_set( if (awrites != ATOMIC_WRITES_DEFAULT) { *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); + } + + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); } diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index a11c213d534..008a77ddedf 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -257,6 +257,10 @@ extern my_bool srv_use_atomic_writes; /* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ extern my_bool srv_use_lz4; +/* Number of flush threads */ +#define MTFLUSH_MAX_WORKER 64 +extern ulint srv_mtflush_threads; + #ifdef __WIN__ extern ibool srv_use_native_conditions; #endif /* __WIN__ */ diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 0517f4b1468..18d6cd109e7 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -3,7 +3,7 @@ Copyright (c) 1996, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -127,10 +127,9 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -/** pgcomp_thread are 16 total */ -#define START_PGCOMP_CNT (SRV_MAX_N_IO_THREADS + 6 + 32) -#define PGCOMP_MAX_WORKER 16 -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + PGCOMP_MAX_WORKER]; +/** 6 is the ? */ +#define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + 32) +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + MTFLUSH_MAX_WORKER]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. 
*/ @@ -1442,89 +1441,79 @@ extern ibool buf_flush_start(buf_pool_t* buf_pool, enum buf_flush flush_type); extern void buf_flush_end(buf_pool_t* buf_pool, enum buf_flush flush_type); extern void buf_flush_common(enum buf_flush flush_type, ulint page_count); extern ulint buf_flush_batch(buf_pool_t* buf_pool, enum buf_flush flush_type, ulint min_n, lsn_t lsn_limit); +extern void pgcomp_init(void); typedef enum wrk_status { - WRK_ITEM_SET=0, - WRK_ITEM_START=1, - WRK_ITEM_DONE=2, - WRK_ITEM_SUCCESS=2, - WRK_ITEM_FAILED=3, - WRK_ITEM_STATUS_UNDEFINED + WRK_ITEM_SET=0, // wrk-item is set + WRK_ITEM_START=1, // processing of wrk-item has started + WRK_ITEM_DONE=2, // processing is done usually set to SUCCESS/FAILED + WRK_ITEM_SUCCESS=2, // Success processing the wrk-item + WRK_ITEM_FAILED=3, // status of failed + WRK_ITEM_EXIT=4, + WRK_ITEM_STATUS_UNDEFINED } wrk_status_t; +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, // Exit queue-wait + MT_WRK_WRITE=1, // Flush operation + MT_WRK_READ=2, // Decompress operation + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; + typedef enum wthr_status { - WTHR_NOT_INIT=0, - WTHR_INITIALIZED=1, - WTHR_SIG_WAITING=2, - WTHR_RUNNING=3, - WTHR_NO_WORK=4, - WTHR_KILL_IT=5, - WTHR_STATUS_UNDEFINED + WTHR_NOT_INIT=0, + WTHR_INITIALIZED=1, + WTHR_SIG_WAITING=2, + WTHR_RUNNING=3, + WTHR_NO_WORK=4, + WTHR_KILL_IT=5, + WTHR_STATUS_UNDEFINED } wthr_status_t; +typedef struct wr_tsk { + buf_pool_t *buf_pool; // buffer-pool instance + enum buf_flush flush_type; // flush-type for buffer-pool flush operation + ulint min; //minimum number of pages requested to be flushed + lsn_t lsn_limit;//lsn limit for the buffer-pool flush operation +} wr_tsk_t; + + +typedef struct rd_tsk { + void *page_pool; //list of pages to decompress; +} rd_tsk_t; + typedef struct wrk_itm { - /****************************/ - /* Need to group into struct*/ - buf_pool_t* buf_pool; //buffer-pool instance - int flush_type; //flush-type for buffer-pool flush operation - int min; //minimum number of pages requested to be flushed - unsigned long long lsn_limit; //lsn limit for the buffer-pool flush operation - /****************************/ - - unsigned long result; //flush pages count - unsigned long t_usec; //time-taken in usec - long id_usr; //thread-id currently working - wrk_status_t wi_status; //flag - struct wrk_itm *next; + mt_wrk_tsk_t tsk; + /* based on task-type one of the entries wr_tsk/rd_tsk will be used */ + wr_tsk_t wr; //flush page list + rd_tsk_t rd; //decompress page list + unsigned long result; //flush pages count + unsigned long t_usec; //time-taken in usec + long id_usr; //thread-id currently working + wrk_status_t wi_status; //flag + struct wrk_itm *next; } wrk_t; -typedef enum op_q_status { - Q_NOT_INIT=0, - Q_EMPTY=1, - Q_INITIALIZED=2, - Q_PROCESS=3, - Q_DONE=4, - Q_ERROR=5, - Q_STATUS_UNDEFINED -} q_status_t; - -typedef struct op_queue -{ - pthread_mutex_t mtx; - pthread_cond_t cv; - q_status_t flag; - wrk_t *head; - wrk_t *tail; -} opq_t; - -opq_t wq, cq; - typedef struct thread_sync { - int wthread_id; - pthread_t wthread; - opq_t *wq; - opq_t *cq; - wthr_status_t wt_status; + int wthread_id; + os_thread_t wthread; + ib_wqueue_t *wq; // work Queue + ib_wqueue_t *wr_cq;// Write Completion Queue + ib_wqueue_t *rd_cq; // Read Completion Queue + wthr_status_t wt_status; // Worker Thread status unsigned long stat_universal_num_processed; unsigned long stat_cycle_num_processed; } thread_sync_t; /* Global XXX:DD needs to be cleaned */ -int exit_flag; -ulint check_wrk_done_count; -static ulint 
done_cnt_flag; -static int pgc_n_threads = 8; - -thread_sync_t pc_sync[PGCOMP_MAX_WORKER]; -static wrk_t work_items[PGCOMP_MAX_WORKER]; +ib_wqueue_t *wq=NULL, *wr_cq=NULL, *rd_cq=NULL; +mem_heap_t *heap_allocated=NULL; +thread_sync_t pc_sync[MTFLUSH_MAX_WORKER]; +static wrk_t work_items[MTFLUSH_MAX_WORKER]; static int pgcomp_wrk_initialized = -1; - -int set_check_done_flag_count(int cnt) -{ - return(check_wrk_done_count = cnt); -} +ulint srv_mtflush_threads = 0; int set_pgcomp_wrk_init_done(void) { @@ -1537,83 +1526,14 @@ int is_pgcomp_wrk_init_done(void) return(pgcomp_wrk_initialized == 1); } -ulint set_done_cnt_flag(ulint val) -{ - /* - * Assumption: The thread calling into set_done_cnt_flag - * needs to have "cq.mtx" acquired, else not safe. - */ - done_cnt_flag = val; - return done_cnt_flag; -} - - -ulint cv_done_inc_flag_sig(thread_sync_t * ppc) -{ - pthread_mutex_lock(&ppc->cq->mtx); - ppc->stat_universal_num_processed++; - ppc->stat_cycle_num_processed++; - done_cnt_flag++; - if(!(done_cnt_flag <= check_wrk_done_count)) { - fprintf(stderr, "ERROR: done_cnt:%lu check_wrk_done_count:%lu\n", - done_cnt_flag, check_wrk_done_count); - } - assert(done_cnt_flag <= check_wrk_done_count); - pthread_mutex_unlock(&ppc->cq->mtx); - if(done_cnt_flag == check_wrk_done_count) { - ppc->wq->flag = Q_DONE; - pthread_mutex_lock(&ppc->cq->mtx); - ppc->cq->flag = Q_DONE; - pthread_cond_signal(&ppc->cq->cv); - pthread_mutex_unlock(&ppc->cq->mtx); - } - return(done_cnt_flag); -} - -int q_remove_wrk(opq_t *q, wrk_t **wi) -{ - int ret = 0; - - if(!wi || !q) { - return -1; - } - - pthread_mutex_lock(&q->mtx); - assert(!((q->tail == NULL) && (q->head != NULL))); - assert(!((q->tail != NULL) && (q->head == NULL))); - - /* get the first in the list*/ - *wi = q->head; - if(q->head) { - ret = 0; - q->head = q->head->next; - (*wi)->next = NULL; - if(!q->head) { - q->tail = NULL; - } - } else { - q->tail = NULL; - ret = 1; /* indicating remove from queue failed */ - } - pthread_mutex_unlock(&q->mtx); - return (ret); -} - -int is_busy_wrk_itm(wrk_t *wi) -{ - if(!wi) { - return -1; - } - return(!(wi->id_usr == -1)); -} - int setup_wrk_itm(int items) { int i; for(i=0; imtx, NULL); - pthread_cond_init(&q->cv, NULL); - q->flag = Q_INITIALIZED; - q->head = q->tail = NULL; - - return 0; -} - -#if 0 -int drain_cq(opq_t *cq, int items) -{ - int i=0; - - if(!cq) { - return -1; - } - pthread_mutex_lock(&cq->mtx); - for(i=0; ihead = cq->tail = NULL; - pthread_mutex_unlock(&cq->mtx); - return 0; -} -#endif - -int q_insert_wrk_list(opq_t *q, wrk_t *w_list) -{ - if((!q) || (!w_list)) { - fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list); - return -1; - } - - pthread_mutex_lock(&q->mtx); - - assert(!((q->tail == NULL) && (q->head != NULL))); - assert(!((q->tail != NULL) && (q->head == NULL))); - - /* list is empty */ - if(!q->tail) { - q->head = q->tail = w_list; - } else { - /* added the first of the node to list */ - assert(q->head != NULL); - q->tail->next = w_list; - } - - /* move tail to the last node */ - while(q->tail->next) { - q->tail = q->tail->next; - } - pthread_mutex_unlock(&q->mtx); - - return 0; -} - int flush_pool_instance(wrk_t *wi) { struct timeval p_start_time, p_end_time, d_time; - if(!wi) { + if (!wi) { fprintf(stderr, "work item invalid wi:%p\n", wi); return -1; } - wi->t_usec = 0; - if (!buf_flush_start(wi->buf_pool, (buf_flush)wi->flush_type)) { + if (!wi->wr.buf_pool) { + fprintf(stderr, "work-item wi->buf_pool:%p [likely thread exit]\n", + wi->wr.buf_pool); + return -1; + } + + wi->t_usec = 0; + if 
(!buf_flush_start(wi->wr.buf_pool, wi->wr.flush_type)) { /* We have two choices here. If lsn_limit was specified then skipping an instance of buffer pool means we cannot guarantee that all pages @@ -1709,39 +1571,34 @@ int flush_pool_instance(wrk_t *wi) help in the retry which will follow the failure. */ fprintf(stderr, "flush_start Failed, flush_type:%d\n", - (buf_flush)wi->flush_type); + wi->wr.flush_type); return -1; } -#ifdef UNIV_DEBUG /* Record time taken for the OP in usec */ gettimeofday(&p_start_time, 0x0); -#endif - if((buf_flush)wi->flush_type == BUF_FLUSH_LRU) { - /* srv_LRU_scan_depth can be arbitrarily large value. - * We cap it with current LRU size. - */ - buf_pool_mutex_enter(wi->buf_pool); - wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU); - buf_pool_mutex_exit(wi->buf_pool); - wi->min = ut_min(srv_LRU_scan_depth,wi->min); - } + if (wi->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. + */ + buf_pool_mutex_enter(wi->wr.buf_pool); + wi->wr.min = UT_LIST_GET_LEN(wi->wr.buf_pool->LRU); + buf_pool_mutex_exit(wi->wr.buf_pool); + wi->wr.min = ut_min(srv_LRU_scan_depth,wi->wr.min); + } - wi->result = buf_flush_batch(wi->buf_pool, - (buf_flush)wi->flush_type, - wi->min, wi->lsn_limit); + wi->result = buf_flush_batch(wi->wr.buf_pool, + wi->wr.flush_type, + wi->wr.min, wi->wr.lsn_limit); - buf_flush_end(wi->buf_pool, (buf_flush)wi->flush_type); - buf_flush_common((buf_flush)wi->flush_type, wi->result); + buf_flush_end(wi->wr.buf_pool, wi->wr.flush_type); + buf_flush_common(wi->wr.flush_type, wi->result); -#ifdef UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); -#endif - return 0; } @@ -1750,68 +1607,75 @@ int service_page_comp_io(thread_sync_t * ppc) wrk_t *wi = NULL; int ret=0; - pthread_mutex_lock(&ppc->wq->mtx); - do{ - ppc->wt_status = WTHR_SIG_WAITING; - ret = pthread_cond_wait(&ppc->wq->cv, &ppc->wq->mtx); + ppc->wt_status = WTHR_SIG_WAITING; + wi = (wrk_t *)ib_wqueue_wait(ppc->wq); + + if (wi) { ppc->wt_status = WTHR_RUNNING; - if(ret == ETIMEDOUT) { - fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%lu] ret:%d\n", - done_cnt_flag, ret); - } else if(ret == EINVAL || ret == EPERM) { - fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%lu] ret:%d\n", - done_cnt_flag, ret); - } - if(ppc->wq->flag == Q_PROCESS) { - break; - } else { - pthread_mutex_unlock(&ppc->wq->mtx); - return -1; - } - } while (ppc->wq->flag == Q_PROCESS && ret == 0); + } else { + fprintf(stderr, "%s:%d work-item is NULL\n", __FILE__, __LINE__); + ppc->wt_status = WTHR_NO_WORK; + return (0); + } - pthread_mutex_unlock(&ppc->wq->mtx); + assert(wi != NULL); + wi->id_usr = ppc->wthread; - while (ppc->cq->flag == Q_PROCESS) { - wi = NULL; - /* Get the work item */ - if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) { - ppc->wt_status = WTHR_NO_WORK; - return -1; - } + switch(wi->tsk) { + case MT_WRK_NONE: + assert(wi->wi_status == WRK_ITEM_EXIT); + wi->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); + break; - assert(ret==0); - assert(wi != NULL); - assert(0 == is_busy_wrk_itm(wi)); - assert(wi->id_usr == -1); - - wi->id_usr = ppc->wthread; + case MT_WRK_WRITE: wi->wi_status = WRK_ITEM_START; - /* Process work item */ - if(0 != (ret = flush_pool_instance(wi))) { + if (0 != (ret = flush_pool_instance(wi))) { fprintf(stderr, "FLUSH op failed ret:%d\n", ret); wi->wi_status = WRK_ITEM_FAILED; } - - ret = 
q_insert_wrk_list(ppc->cq, wi); - - assert(0==ret); - assert(check_wrk_done_count >= done_cnt_flag); wi->wi_status = WRK_ITEM_SUCCESS; - if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) { - break; - } + ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); + break; + + case MT_WRK_READ: + /* Need to also handle the read case */ + assert(0); + /* completed task get added to rd_cq */ + /* wi->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(ppc->rd_cq, wi, heap_allocated);*/ + break; + + default: + /* None other than Write/Read handling planned */ + assert(0); } + + ppc->wt_status = WTHR_NO_WORK; return(0); } +void page_comp_io_thread_exit() +{ + ulint i; + + fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", srv_buf_pool_instances); + for (i=0; ihead || !q->tail) { - assert(!(((q->tail==NULL) && (q->head!=NULL)) && ((q->tail != NULL) && (q->head == NULL)))); - fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail); - return 0; - } - - pthread_mutex_lock(&q->mtx); - for(wi = q->head; (wi != NULL) ; wi = wi->next) { - //fprintf(stderr, "- [%p] %p %lu %luus [%ld] >%p\n", - // wi, wi->buf_pool, wi->result, wi->t_usec, wi->id_usr, wi->next); - fprintf(stderr, "- [%p] [%s] >%p\n", - wi, (wi->id_usr == -1)?"free":"Busy", wi->next); - } - pthread_mutex_unlock(&q->mtx); -#endif - return(0); -} - int print_wrk_list(wrk_t *wi_list) { wrk_t *wi = wi_list; @@ -1871,111 +1707,9 @@ int print_wrk_list(wrk_t *wi_list) return 0; } -int pgcomp_handler(wrk_t *w_list) -{ - int ret=0; - opq_t *wrk_q=NULL, *comp_q=NULL; - - wrk_q=&wq; - comp_q=&cq; - - pthread_mutex_lock(&wrk_q->mtx); - /* setup work queue here.. */ - wrk_q->flag = Q_EMPTY; - pthread_mutex_unlock(&wrk_q->mtx); - - ret = q_insert_wrk_list(wrk_q, w_list); - if(ret != 0) { - fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n", - __FUNCTION__, &wq, w_list); - return -1; - } - -retry_submit: - pthread_mutex_lock(&wrk_q->mtx); - /* setup work queue here.. 
*/ - wrk_q->flag = Q_INITIALIZED; - pthread_mutex_unlock(&wrk_q->mtx); - - - pthread_mutex_lock(&comp_q->mtx); - if(0 != set_done_cnt_flag(0)) { - fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__); - pthread_mutex_unlock(&comp_q->mtx); - return -1; - } - comp_q->flag = Q_PROCESS; - pthread_mutex_unlock(&comp_q->mtx); - - /* if threads are waiting request them to start */ - pthread_mutex_lock(&wrk_q->mtx); - wrk_q->flag = Q_PROCESS; - pthread_cond_broadcast(&wrk_q->cv); - pthread_mutex_unlock(&wrk_q->mtx); - - /* Wait on all worker-threads to complete */ - pthread_mutex_lock(&comp_q->mtx); - if (comp_q->flag != Q_DONE) { - do { - pthread_cond_wait(&comp_q->cv, &comp_q->mtx); - if(comp_q->flag != Q_DONE) { - fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - if (done_cnt_flag != srv_buf_pool_instances) { - fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - } - continue; - } else if (done_cnt_flag != srv_buf_pool_instances) { - fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - comp_q->flag = Q_INITIALIZED; - pthread_mutex_unlock(&comp_q->mtx); - goto retry_submit; - - assert(!done_cnt_flag); - continue; - } - assert(done_cnt_flag == srv_buf_pool_instances); - - if ((comp_q->flag == Q_DONE) && - (done_cnt_flag == srv_buf_pool_instances)) { - break; - } - } while((comp_q->flag == Q_INITIALIZED) && - (done_cnt_flag != srv_buf_pool_instances)); - } else { - fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - if (!done_cnt_flag) { - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - comp_q->flag = Q_INITIALIZED; - pthread_mutex_unlock(&comp_q->mtx); - goto retry_submit; - assert(!done_cnt_flag); - } - assert(done_cnt_flag == srv_buf_pool_instances); - } - - pthread_mutex_unlock(&comp_q->mtx); - pthread_mutex_lock(&wrk_q->mtx); - wrk_q->flag = Q_DONE; - pthread_mutex_unlock(&wrk_q->mtx); - - return 0; -} - /******************************************************************//** @return a dummy parameter*/ -int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq) +int pgcomp_handler_init(int num_threads, int wrk_cnt, ib_wqueue_t *wq, ib_wqueue_t *wr_cq, ib_wqueue_t *rd_cq) { int i=0; @@ -1984,106 +1718,89 @@ int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq) return -1; } - if(!wq || !cq) { - fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq); + if(!wq || !wr_cq || !rd_cq) { + fprintf(stderr, "%s() FAILED wq:%p write-cq:%p read-cq:%p\n", + __FUNCTION__, wq, wr_cq, rd_cq); return -1; } /* work-item setup */ setup_wrk_itm(wrk_cnt); - /* wq & cq setup */ - init_queue(wq); - init_queue(cq); - /* Mark each of the thread sync entires */ - for(i=0; i < PGCOMP_MAX_WORKER; i++) { - pc_sync[i].wthread_id = i; + for(i=0; i < MTFLUSH_MAX_WORKER; i++) { + pc_sync[i].wthread_id = i; } /* Create threads for page-compression-flush */ for(i=0; i < num_threads; i++) { pc_sync[i].wthread_id = i; pc_sync[i].wq = wq; - pc_sync[i].cq = cq; + pc_sync[i].wr_cq = wr_cq; + pc_sync[i].rd_cq = rd_cq; + os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), - thread_ids + START_PGCOMP_CNT + i); - //pc_sync[i].wthread = 
thread_ids[START_PGCOMP_CNT + i]; - pc_sync[i].wthread = (START_PGCOMP_CNT + i); + thread_ids + START_OLD_THREAD_CNT + i); + pc_sync[i].wthread = (START_OLD_THREAD_CNT + i); pc_sync[i].wt_status = WTHR_INITIALIZED; } - - set_check_done_flag_count(wrk_cnt); set_pgcomp_wrk_init_done(); - + fprintf(stderr, "%s() Worker-Threads created..\n", __FUNCTION__); return 0; } - int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads) { - long stat_tot=0; - unsigned int i=0; - for(i=0; i< num_threads;i++) { + ulong stat_tot=0; + ulint i=0; + for(i=0; i int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time) { @@ -2038,8 +2044,15 @@ int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_t return 0; } +#endif + +static os_fast_mutex_t pgcomp_mtx; + +void pgcomp_init(void) +{ + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &pgcomp_mtx); +} -static pthread_mutex_t pgcomp_mtx = PTHREAD_MUTEX_INITIALIZER; /*******************************************************************//** Multi-threaded version of buf_flush_list */ @@ -2062,8 +2075,10 @@ pgcomp_buf_flush_list( { ulint i; bool success = true; +#ifdef UNIV_DEBUG struct timeval p_start_time, p_end_time, d_time; - flush_counters_t n; +#endif + int cnt_flush[MTFLUSH_MAX_WORKER]; if (n_processed) { *n_processed = 0; @@ -2081,91 +2096,30 @@ pgcomp_buf_flush_list( #ifdef UNIV_DEBUG gettimeofday(&p_start_time, 0x0); #endif - if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)) { - int cnt_flush[32]; + os_fast_mutex_lock(&pgcomp_mtx); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&pgcomp_mtx); - //stack_trace(); - pthread_mutex_lock(&pgcomp_mtx); - //gettimeofday(&p_start_time, 0x0); - //fprintf(stderr, "Calling into wrk-pgcomp [min:%lu]", min_n); - pgcomp_flush_work_items(srv_buf_pool_instances, - cnt_flush, BUF_FLUSH_LIST, - min_n, lsn_limit); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (n_processed) { - *n_processed += cnt_flush[i]; - } - if (cnt_flush[i]) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - cnt_flush[i]); - - } - } - - pthread_mutex_unlock(&pgcomp_mtx); - -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); -#endif - return(success); - } - /* Flush to lsn_limit in all buffer pool instances */ for (i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool; - - buf_pool = buf_pool_from_array(i); - - if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { - /* We have two choices here. If lsn_limit was - specified then skipping an instance of buffer - pool means we cannot guarantee that all pages - up to lsn_limit has been flushed. We can - return right now with failure or we can try - to flush remaining buffer pools up to the - lsn_limit. We attempt to flush other buffer - pools based on the assumption that it will - help in the retry which will follow the - failure. 
*/ - success = false; - - continue; - } - - buf_flush_batch( - buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit, false, &n); - - buf_flush_end(buf_pool, BUF_FLUSH_LIST); - - buf_flush_common(BUF_FLUSH_LIST, n.flushed); - if (n_processed) { - *n_processed += n.flushed; + *n_processed += cnt_flush[i]; } - - if (n.flushed) { + if (cnt_flush[i]) { MONITOR_INC_VALUE_CUMULATIVE( MONITOR_FLUSH_BATCH_TOTAL_PAGE, MONITOR_FLUSH_BATCH_COUNT, MONITOR_FLUSH_BATCH_PAGES, - n.flushed); + cnt_flush[i]); } } - #ifdef UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); - - fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu %llu usec]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); #endif return(success); } @@ -2416,18 +2370,21 @@ ulint pgcomp_buf_flush_LRU_tail(void) /*====================*/ { +#ifdef UNIV_DEBUG struct timeval p_start_time, p_end_time, d_time; +#endif ulint total_flushed=0, i=0; int cnt_flush[32]; #ifdef UNIV_DEBUG gettimeofday(&p_start_time, 0x0); #endif - assert(is_pgcomp_wrk_init_done()); + ut_ad(is_pgcomp_wrk_init_done()); - pthread_mutex_lock(&pgcomp_mtx); + os_fast_mutex_lock(&pgcomp_mtx); pgcomp_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&pgcomp_mtx); for (i = 0; i < srv_buf_pool_instances; i++) { if (cnt_flush[i]) { @@ -2441,9 +2398,7 @@ pgcomp_buf_flush_LRU_tail(void) } } - pthread_mutex_unlock(&pgcomp_mtx); - -#ifdef UNIV_DEBUG +#if UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); @@ -2454,9 +2409,8 @@ pgcomp_buf_flush_LRU_tail(void) return(total_flushed); } + /* JAN: TODO: END: */ - - /*********************************************************************//** Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index f3e952299ff..e170004cea1 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -1323,7 +1323,6 @@ fil_space_create( DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); ut_a(fil_system); - ut_a(fsp_flags_is_valid(flags)); /* Look for a matching tablespace and if found free it. */ do { @@ -4989,21 +4988,42 @@ retry: #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { + ulint n_pages = size_after_extend; + + success = os_file_set_size(node->name, node->handle, n_pages * page_size); + + /* Temporal solution: In directFS using atomic writes + we must use posix_fallocate to extend the file because + pwrite past end of file fails but when compression is + used the file pages must be physically initialized with + zeroes, thus after file extend with posix_fallocate + we still write empty pages to file. 
*/ + if (success && + srv_use_atomic_writes && + srv_compress_pages) { + goto extend_file; + } - success = os_file_set_size(node->name, node->handle, - (size_after_extend - - file_start_page_no) * page_size); mutex_enter(&fil_system->mutex); + if (success) { - node->size += (size_after_extend - start_page_no); - space->size += (size_after_extend - start_page_no); + node->size += n_pages; + space->size += n_pages; os_has_said_disk_full = FALSE; } - node->being_extended = FALSE; + + /* If posix_fallocate was used to extent the file space + we need to complete the io. Because no actual writes were + dispatched read operation is enough here. Without this + there will be assertion at shutdown indicating that + all IO is not completed. */ + fil_node_complete_io(node, fil_system, OS_FILE_READ); goto complete_io; } #endif +extend_file: + /* Extend at most 64 pages at a time */ buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; buf2 = static_cast(mem_alloc(buf_size + page_size)); @@ -5057,24 +5077,11 @@ retry: space->size += pages_added; node->size += pages_added; - node->being_extended = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -complete_io: - /* If posix_fallocate was used to extent the file space - we need to complete the io. Because no actual writes were - dispatched read operation is enough here. Without this - there will be assertion at shutdown indicating that - all IO is not completed. */ - if (srv_use_posix_fallocate) { - fil_node_complete_io(node, fil_system, OS_FILE_READ); - } else { - fil_node_complete_io(node, fil_system, OS_FILE_WRITE); - } -#else fil_node_complete_io(node, fil_system, OS_FILE_WRITE); -#endif +complete_io: + node->being_extended = FALSE; *actual_size = space->size; #ifndef UNIV_HOTBACKUP diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index 502b1d028d8..1ce4fe6a2f1 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -873,7 +873,6 @@ dict_tf_set( (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); } - } /********************************************************************//** diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index cc2221fc3c6..c9a92f608d8 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -277,6 +277,10 @@ extern my_bool srv_use_atomic_writes; /* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ extern my_bool srv_use_lz4; +/* Number of flush threads */ +#define MTFLUSH_MAX_WORKER 64 +extern ulint srv_mtflush_threads; + /** Server undo tablespaces directory, can be absolute path. */ extern char* srv_undo_dir; diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index faad8c3c133..7b2aebf6b83 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -3,7 +3,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -131,14 +131,9 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -/* - static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 -/ + SRV_MAX_N_PURGE_THREADS]; -*/ -/** pgcomp_thread are 16 total */ -#define START_PGCOMP_CNT (SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS) -#define PGCOMP_MAX_WORKER 16 -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS + PGCOMP_MAX_WORKER]; +/** 6 is the ? */ +#define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS) +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS + MTFLUSH_MAX_WORKER]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -1519,90 +1514,81 @@ extern int timediff(struct timeval *g_time, struct timeval *s_time, struct timev extern ibool buf_flush_start(buf_pool_t* buf_pool, buf_flush_t flush_type); extern void buf_flush_end(buf_pool_t* buf_pool, buf_flush_t flush_type); extern void buf_flush_common(buf_flush_t flush_type, ulint page_count); -extern ulint buf_flush_batch(buf_pool_t* buf_pool, buf_flush_t flush_type, ulint min_n, lsn_t lsn_limit, bool limited_lru_scan, flush_counters_t*); +extern ulint buf_flush_batch(buf_pool_t* buf_pool, buf_flush_t flush_type, ulint min_n, lsn_t lsn_limit, bool limited_lru_scan, +flush_counters_t* n); +extern void pgcomp_init(void); typedef enum wrk_status { - WRK_ITEM_SET=0, - WRK_ITEM_START=1, - WRK_ITEM_DONE=2, - WRK_ITEM_SUCCESS=2, - WRK_ITEM_FAILED=3, - WRK_ITEM_STATUS_UNDEFINED + WRK_ITEM_SET=0, // wrk-item is set + WRK_ITEM_START=1, // processing of wrk-item has started + WRK_ITEM_DONE=2, // processing is done usually set to SUCCESS/FAILED + WRK_ITEM_SUCCESS=2, // Success processing the wrk-item + WRK_ITEM_FAILED=3, // status of failed + WRK_ITEM_EXIT=4, + WRK_ITEM_STATUS_UNDEFINED } wrk_status_t; +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, // Exit queue-wait + MT_WRK_WRITE=1, // Flush operation + MT_WRK_READ=2, // Decompress operation + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; + typedef enum wthr_status { - WTHR_NOT_INIT=0, - WTHR_INITIALIZED=1, - WTHR_SIG_WAITING=2, - WTHR_RUNNING=3, - WTHR_NO_WORK=4, - WTHR_KILL_IT=5, - WTHR_STATUS_UNDEFINED + WTHR_NOT_INIT=0, + WTHR_INITIALIZED=1, + WTHR_SIG_WAITING=2, + WTHR_RUNNING=3, + WTHR_NO_WORK=4, + WTHR_KILL_IT=5, + WTHR_STATUS_UNDEFINED } wthr_status_t; +typedef struct wr_tsk { + buf_pool_t *buf_pool; // buffer-pool instance + buf_flush_t flush_type; // flush-type for buffer-pool flush operation + ulint min; //minimum number of pages requested to be flushed + lsn_t lsn_limit;//lsn limit for the buffer-pool flush operation +} wr_tsk_t; + + +typedef struct rd_tsk { + void *page_pool; //list of pages to decompress; +} rd_tsk_t; + typedef struct wrk_itm { - /****************************/ - /* Need to group into struct*/ - buf_pool_t* buf_pool; //buffer-pool instance - int flush_type; //flush-type for buffer-pool flush operation - int min; //minimum number of pages requested to be flushed - unsigned long long lsn_limit; //lsn limit for the buffer-pool flush operation - /****************************/ - - unsigned long result; //flush pages count - unsigned long t_usec; //time-taken in usec - long id_usr; //thread-id currently working - 
wrk_status_t wi_status; //flag - struct wrk_itm *next; + mt_wrk_tsk_t tsk; + /* based on task-type one of the entries wr_tsk/rd_tsk will be used */ + wr_tsk_t wr; //flush page list + rd_tsk_t rd; //decompress page list + unsigned long result; //flush pages count + unsigned long t_usec; //time-taken in usec + long id_usr; //thread-id currently working + wrk_status_t wi_status; //flag + struct wrk_itm *next; } wrk_t; -typedef enum op_q_status { - Q_NOT_INIT=0, - Q_EMPTY=1, - Q_INITIALIZED=2, - Q_PROCESS=3, - Q_DONE=4, - Q_ERROR=5, - Q_STATUS_UNDEFINED -} q_status_t; - -typedef struct op_queue -{ - pthread_mutex_t mtx; - pthread_cond_t cv; - q_status_t flag; - wrk_t *head; - wrk_t *tail; -} opq_t; - -opq_t wq, cq; - typedef struct thread_sync { - int wthread_id; - pthread_t wthread; - opq_t *wq; - opq_t *cq; - wthr_status_t wt_status; + int wthread_id; + os_thread_t wthread; + ib_wqueue_t *wq; // work Queue + ib_wqueue_t *wr_cq;// Write Completion Queue + ib_wqueue_t *rd_cq; // Read Completion Queue + wthr_status_t wt_status; // Worker Thread status unsigned long stat_universal_num_processed; unsigned long stat_cycle_num_processed; } thread_sync_t; /* Global XXX:DD needs to be cleaned */ -int exit_flag; -ulint check_wrk_done_count; -static ulint done_cnt_flag; -static int pgc_n_threads = 8; - -thread_sync_t pc_sync[PGCOMP_MAX_WORKER]; -static wrk_t work_items[PGCOMP_MAX_WORKER]; +ib_wqueue_t *wq=NULL, *wr_cq=NULL, *rd_cq=NULL; +mem_heap_t *heap_allocated=NULL; +thread_sync_t pc_sync[MTFLUSH_MAX_WORKER]; +static wrk_t work_items[MTFLUSH_MAX_WORKER]; static int pgcomp_wrk_initialized = -1; - -int set_check_done_flag_count(int cnt) -{ - return(check_wrk_done_count = cnt); -} +ulint srv_mtflush_threads = 0; int set_pgcomp_wrk_init_done(void) { @@ -1615,83 +1601,14 @@ int is_pgcomp_wrk_init_done(void) return(pgcomp_wrk_initialized == 1); } -ulint set_done_cnt_flag(ulint val) -{ - /* - * Assumption: The thread calling into set_done_cnt_flag - * needs to have "cq.mtx" acquired, else not safe. 
- */ - done_cnt_flag = val; - return done_cnt_flag; -} - - -ulint cv_done_inc_flag_sig(thread_sync_t * ppc) -{ - pthread_mutex_lock(&ppc->cq->mtx); - ppc->stat_universal_num_processed++; - ppc->stat_cycle_num_processed++; - done_cnt_flag++; - if(!(done_cnt_flag <= check_wrk_done_count)) { - fprintf(stderr, "ERROR: done_cnt:%lu check_wrk_done_count:%lu\n", - done_cnt_flag, check_wrk_done_count); - } - assert(done_cnt_flag <= check_wrk_done_count); - pthread_mutex_unlock(&ppc->cq->mtx); - if(done_cnt_flag == check_wrk_done_count) { - ppc->wq->flag = Q_DONE; - pthread_mutex_lock(&ppc->cq->mtx); - ppc->cq->flag = Q_DONE; - pthread_cond_signal(&ppc->cq->cv); - pthread_mutex_unlock(&ppc->cq->mtx); - } - return(done_cnt_flag); -} - -int q_remove_wrk(opq_t *q, wrk_t **wi) -{ - int ret = 0; - - if(!wi || !q) { - return -1; - } - - pthread_mutex_lock(&q->mtx); - assert(!((q->tail == NULL) && (q->head != NULL))); - assert(!((q->tail != NULL) && (q->head == NULL))); - - /* get the first in the list*/ - *wi = q->head; - if(q->head) { - ret = 0; - q->head = q->head->next; - (*wi)->next = NULL; - if(!q->head) { - q->tail = NULL; - } - } else { - q->tail = NULL; - ret = 1; /* indicating remove from queue failed */ - } - pthread_mutex_unlock(&q->mtx); - return (ret); -} - -int is_busy_wrk_itm(wrk_t *wi) -{ - if(!wi) { - return -1; - } - return(!(wi->id_usr == -1)); -} - int setup_wrk_itm(int items) { int i; for(i=0; imtx, NULL); - pthread_cond_init(&q->cv, NULL); - q->flag = Q_INITIALIZED; - q->head = q->tail = NULL; - - return 0; -} - -#if 0 -int drain_cq(opq_t *cq, int items) -{ - int i=0; - - if(!cq) { - return -1; - } - pthread_mutex_lock(&cq->mtx); - for(i=0; ihead = cq->tail = NULL; - pthread_mutex_unlock(&cq->mtx); - return 0; -} -#endif - -int q_insert_wrk_list(opq_t *q, wrk_t *w_list) -{ - if((!q) || (!w_list)) { - fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list); - return -1; - } - - pthread_mutex_lock(&q->mtx); - - assert(!((q->tail == NULL) && (q->head != NULL))); - assert(!((q->tail != NULL) && (q->head == NULL))); - - /* list is empty */ - if(!q->tail) { - q->head = q->tail = w_list; - } else { - /* added the first of the node to list */ - assert(q->head != NULL); - q->tail->next = w_list; - } - - /* move tail to the last node */ - while(q->tail->next) { - q->tail = q->tail->next; - } - pthread_mutex_unlock(&q->mtx); - - return 0; -} - int flush_pool_instance(wrk_t *wi) { - struct timeval p_start_time, p_end_time, d_time; flush_counters_t n; +#ifdef UNIV_DEBUG + struct timeval p_start_time, p_end_time, d_time; +#endif - if(!wi) { + if (!wi) { fprintf(stderr, "work item invalid wi:%p\n", wi); return -1; } - wi->t_usec = 0; - if (!buf_flush_start(wi->buf_pool, (buf_flush_t)wi->flush_type)) { + if (!wi->wr.buf_pool) { + fprintf(stderr, "work-item wi->buf_pool:%p [likely thread exit]\n", + wi->wr.buf_pool); + return -1; + } + + wi->t_usec = 0; + if (!buf_flush_start(wi->wr.buf_pool, wi->wr.flush_type)) { /* We have two choices here. If lsn_limit was specified then skipping an instance of buffer pool means we cannot guarantee that all pages @@ -1788,7 +1649,7 @@ int flush_pool_instance(wrk_t *wi) help in the retry which will follow the failure. */ fprintf(stderr, "flush_start Failed, flush_type:%d\n", - (buf_flush_t)wi->flush_type); + wi->wr.flush_type); return -1; } @@ -1797,32 +1658,28 @@ int flush_pool_instance(wrk_t *wi) gettimeofday(&p_start_time, 0x0); #endif - if((buf_flush_t)wi->flush_type == BUF_FLUSH_LRU) { - /* srv_LRU_scan_depth can be arbitrarily large value. 
- * We cap it with current LRU size. - */ - buf_pool_mutex_enter(wi->buf_pool); - wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU); - buf_pool_mutex_exit(wi->buf_pool); - wi->min = ut_min(srv_LRU_scan_depth,wi->min); - } + if (wi->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. + */ + buf_pool_mutex_enter(wi->wr.buf_pool); + wi->wr.min = UT_LIST_GET_LEN(wi->wr.buf_pool->LRU); + buf_pool_mutex_exit(wi->wr.buf_pool); + wi->wr.min = ut_min(srv_LRU_scan_depth,wi->wr.min); + } - buf_flush_batch(wi->buf_pool, - (buf_flush_t)wi->flush_type, - wi->min, wi->lsn_limit, false, &n); + wi->result = buf_flush_batch(wi->wr.buf_pool, + wi->wr.flush_type, + wi->wr.min, wi->wr.lsn_limit, + false, &n); - wi->result = n.flushed; + buf_flush_end(wi->wr.buf_pool, wi->wr.flush_type); + buf_flush_common(wi->wr.flush_type, wi->result); - buf_flush_end(wi->buf_pool, (buf_flush_t)wi->flush_type); - buf_flush_common((buf_flush_t)wi->flush_type, wi->result); - -#ifdef UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); -#endif - return 0; } @@ -1831,68 +1688,75 @@ int service_page_comp_io(thread_sync_t * ppc) wrk_t *wi = NULL; int ret=0; - pthread_mutex_lock(&ppc->wq->mtx); - do{ - ppc->wt_status = WTHR_SIG_WAITING; - ret = pthread_cond_wait(&ppc->wq->cv, &ppc->wq->mtx); + ppc->wt_status = WTHR_SIG_WAITING; + wi = (wrk_t *)ib_wqueue_wait(ppc->wq); + + if (wi) { ppc->wt_status = WTHR_RUNNING; - if(ret == ETIMEDOUT) { - fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%lu] ret:%d\n", - done_cnt_flag, ret); - } else if(ret == EINVAL || ret == EPERM) { - fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%lu] ret:%d\n", - done_cnt_flag, ret); - } - if(ppc->wq->flag == Q_PROCESS) { - break; - } else { - pthread_mutex_unlock(&ppc->wq->mtx); - return -1; - } - } while (ppc->wq->flag == Q_PROCESS && ret == 0); + } else { + fprintf(stderr, "%s:%d work-item is NULL\n", __FILE__, __LINE__); + ppc->wt_status = WTHR_NO_WORK; + return (0); + } - pthread_mutex_unlock(&ppc->wq->mtx); + assert(wi != NULL); + wi->id_usr = ppc->wthread; - while (ppc->cq->flag == Q_PROCESS) { - wi = NULL; - /* Get the work item */ - if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) { - ppc->wt_status = WTHR_NO_WORK; - return -1; - } + switch(wi->tsk) { + case MT_WRK_NONE: + assert(wi->wi_status == WRK_ITEM_EXIT); + wi->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); + break; - assert(ret==0); - assert(wi != NULL); - assert(0 == is_busy_wrk_itm(wi)); - assert(wi->id_usr == -1); - - wi->id_usr = ppc->wthread; + case MT_WRK_WRITE: wi->wi_status = WRK_ITEM_START; - /* Process work item */ - if(0 != (ret = flush_pool_instance(wi))) { + if (0 != (ret = flush_pool_instance(wi))) { fprintf(stderr, "FLUSH op failed ret:%d\n", ret); wi->wi_status = WRK_ITEM_FAILED; } - - ret = q_insert_wrk_list(ppc->cq, wi); - - assert(0==ret); - assert(check_wrk_done_count >= done_cnt_flag); wi->wi_status = WRK_ITEM_SUCCESS; - if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) { - break; - } + ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); + break; + + case MT_WRK_READ: + /* Need to also handle the read case */ + assert(0); + /* completed task get added to rd_cq */ + /* wi->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(ppc->rd_cq, wi, heap_allocated);*/ + break; + + default: + /* None other than Write/Read handling planned */ + assert(0); } + + ppc->wt_status = WTHR_NO_WORK; 
return(0); } +void page_comp_io_thread_exit() +{ + ulint i; + + fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", srv_buf_pool_instances); + for (i=0; ihead || !q->tail) { - assert(!(((q->tail==NULL) && (q->head!=NULL)) && ((q->tail != NULL) && (q->head == NULL)))); - fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail); - return 0; - } - - pthread_mutex_lock(&q->mtx); - for(wi = q->head; (wi != NULL) ; wi = wi->next) { - //fprintf(stderr, "- [%p] %p %lu %luus [%ld] >%p\n", - // wi, wi->buf_pool, wi->result, wi->t_usec, wi->id_usr, wi->next); - fprintf(stderr, "- [%p] [%s] >%p\n", - wi, (wi->id_usr == -1)?"free":"Busy", wi->next); - } - pthread_mutex_unlock(&q->mtx); -#endif - return(0); -} - int print_wrk_list(wrk_t *wi_list) { wrk_t *wi = wi_list; @@ -1952,111 +1788,9 @@ int print_wrk_list(wrk_t *wi_list) return 0; } -int pgcomp_handler(wrk_t *w_list) -{ - int ret=0; - opq_t *wrk_q=NULL, *comp_q=NULL; - - wrk_q=&wq; - comp_q=&cq; - - pthread_mutex_lock(&wrk_q->mtx); - /* setup work queue here.. */ - wrk_q->flag = Q_EMPTY; - pthread_mutex_unlock(&wrk_q->mtx); - - ret = q_insert_wrk_list(wrk_q, w_list); - if(ret != 0) { - fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n", - __FUNCTION__, &wq, w_list); - return -1; - } - -retry_submit: - pthread_mutex_lock(&wrk_q->mtx); - /* setup work queue here.. */ - wrk_q->flag = Q_INITIALIZED; - pthread_mutex_unlock(&wrk_q->mtx); - - - pthread_mutex_lock(&comp_q->mtx); - if(0 != set_done_cnt_flag(0)) { - fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__); - pthread_mutex_unlock(&comp_q->mtx); - return -1; - } - comp_q->flag = Q_PROCESS; - pthread_mutex_unlock(&comp_q->mtx); - - /* if threads are waiting request them to start */ - pthread_mutex_lock(&wrk_q->mtx); - wrk_q->flag = Q_PROCESS; - pthread_cond_broadcast(&wrk_q->cv); - pthread_mutex_unlock(&wrk_q->mtx); - - /* Wait on all worker-threads to complete */ - pthread_mutex_lock(&comp_q->mtx); - if (comp_q->flag != Q_DONE) { - do { - pthread_cond_wait(&comp_q->cv, &comp_q->mtx); - if(comp_q->flag != Q_DONE) { - fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - if (done_cnt_flag != srv_buf_pool_instances) { - fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - } - continue; - } else if (done_cnt_flag != srv_buf_pool_instances) { - fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - comp_q->flag = Q_INITIALIZED; - pthread_mutex_unlock(&comp_q->mtx); - goto retry_submit; - - assert(!done_cnt_flag); - continue; - } - assert(done_cnt_flag == srv_buf_pool_instances); - - if ((comp_q->flag == Q_DONE) && - (done_cnt_flag == srv_buf_pool_instances)) { - break; - } - } while((comp_q->flag == Q_INITIALIZED) && - (done_cnt_flag != srv_buf_pool_instances)); - } else { - fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%lu\n", - comp_q->flag, done_cnt_flag); - if (!done_cnt_flag) { - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - comp_q->flag = Q_INITIALIZED; - pthread_mutex_unlock(&comp_q->mtx); - goto retry_submit; - assert(!done_cnt_flag); - } - assert(done_cnt_flag == srv_buf_pool_instances); - } - - pthread_mutex_unlock(&comp_q->mtx); - pthread_mutex_lock(&wrk_q->mtx); - wrk_q->flag = 
Q_DONE; - pthread_mutex_unlock(&wrk_q->mtx); - - return 0; -} - /******************************************************************//** @return a dummy parameter*/ -int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq) +int pgcomp_handler_init(int num_threads, int wrk_cnt, ib_wqueue_t *wq, ib_wqueue_t *wr_cq, ib_wqueue_t *rd_cq) { int i=0; @@ -2065,106 +1799,89 @@ int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq) return -1; } - if(!wq || !cq) { - fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq); + if(!wq || !wr_cq || !rd_cq) { + fprintf(stderr, "%s() FAILED wq:%p write-cq:%p read-cq:%p\n", + __FUNCTION__, wq, wr_cq, rd_cq); return -1; } /* work-item setup */ setup_wrk_itm(wrk_cnt); - /* wq & cq setup */ - init_queue(wq); - init_queue(cq); - /* Mark each of the thread sync entires */ - for(i=0; i < PGCOMP_MAX_WORKER; i++) { - pc_sync[i].wthread_id = i; + for(i=0; i < MTFLUSH_MAX_WORKER; i++) { + pc_sync[i].wthread_id = i; } /* Create threads for page-compression-flush */ for(i=0; i < num_threads; i++) { pc_sync[i].wthread_id = i; pc_sync[i].wq = wq; - pc_sync[i].cq = cq; + pc_sync[i].wr_cq = wr_cq; + pc_sync[i].rd_cq = rd_cq; + os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), - thread_ids + START_PGCOMP_CNT + i); - //pc_sync[i].wthread = thread_ids[START_PGCOMP_CNT + i]; - pc_sync[i].wthread = (START_PGCOMP_CNT + i); + thread_ids + START_OLD_THREAD_CNT + i); + pc_sync[i].wthread = (START_OLD_THREAD_CNT + i); pc_sync[i].wt_status = WTHR_INITIALIZED; } - - set_check_done_flag_count(wrk_cnt); set_pgcomp_wrk_init_done(); - + fprintf(stderr, "%s() Worker-Threads created..\n", __FUNCTION__); return 0; } - int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads) { - long stat_tot=0; - unsigned int i=0; - for(i=0; i< num_threads;i++) { + ulong stat_tot=0; + ulint i=0; + for(i=0; i Date: Tue, 4 Feb 2014 20:08:59 +0200 Subject: [PATCH 11/56] Fixed compiler errors. --- storage/innobase/srv/srv0start.cc | 10 ++++++++-- storage/tokudb/ft-index/ft/ft-ops.cc | 2 +- storage/xtradb/srv/srv0start.cc | 7 ++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 18d6cd109e7..dd327769d68 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1436,7 +1436,9 @@ srv_start_wait_for_purge_to_start() /* JAN: TODO: */ /**********************************************************************************/ +#ifdef UNIV_DEBUG extern int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time); +#endif extern ibool buf_flush_start(buf_pool_t* buf_pool, enum buf_flush flush_type); extern void buf_flush_end(buf_pool_t* buf_pool, enum buf_flush flush_type); extern void buf_flush_common(enum buf_flush flush_type, ulint page_count); @@ -1545,8 +1547,9 @@ int setup_wrk_itm(int items) int flush_pool_instance(wrk_t *wi) { +#ifdef UNIV_DEBUG struct timeval p_start_time, p_end_time, d_time; - +#endif if (!wi) { fprintf(stderr, "work item invalid wi:%p\n", wi); return -1; @@ -1575,8 +1578,10 @@ int flush_pool_instance(wrk_t *wi) return -1; } +#ifdef UNIV_DEBUG /* Record time taken for the OP in usec */ gettimeofday(&p_start_time, 0x0); +#endif if (wi->wr.flush_type == BUF_FLUSH_LRU) { /* srv_LRU_scan_depth can be arbitrarily large value. 
@@ -1595,10 +1600,11 @@ int flush_pool_instance(wrk_t *wi) buf_flush_end(wi->wr.buf_pool, wi->wr.flush_type); buf_flush_common(wi->wr.flush_type, wi->result); +#ifdef UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); - wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); +#endif return 0; } diff --git a/storage/tokudb/ft-index/ft/ft-ops.cc b/storage/tokudb/ft-index/ft/ft-ops.cc index 27ee6ec8000..4437f23b950 100644 --- a/storage/tokudb/ft-index/ft/ft-ops.cc +++ b/storage/tokudb/ft-index/ft/ft-ops.cc @@ -2330,7 +2330,7 @@ basement_node_gc_all_les(BASEMENTNODE bn, while (index < (num_leafentries_before = bn->data_buffer.omt_size())) { void* keyp = NULL; uint32_t keylen = 0; - LEAFENTRY leaf_entry; + LEAFENTRY leaf_entry = 0; bn->data_buffer.fetch_klpair(index, &leaf_entry, &keylen, &keyp); assert_zero(r); ft_basement_node_gc_once( diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 7b2aebf6b83..4f3570249d7 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -1510,7 +1510,10 @@ init_log_online(void) /* JAN: TODO: */ /**********************************************************************************/ +#ifdef UNIV_DEBUG extern int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time); +#endif + extern ibool buf_flush_start(buf_pool_t* buf_pool, buf_flush_t flush_type); extern void buf_flush_end(buf_pool_t* buf_pool, buf_flush_t flush_type); extern void buf_flush_common(buf_flush_t flush_type, ulint page_count); @@ -1676,10 +1679,12 @@ int flush_pool_instance(wrk_t *wi) buf_flush_end(wi->wr.buf_pool, wi->wr.flush_type); buf_flush_common(wi->wr.flush_type, wi->result); +#ifdef UNIV_DEBUG gettimeofday(&p_end_time, 0x0); timediff(&p_end_time, &p_start_time, &d_time); - wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); +#endif + return 0; } From 921d87d47c779240ea30aec01fbfcab888e98261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 5 Feb 2014 15:32:29 +0200 Subject: [PATCH 12/56] Fixed issue on xtradb shutdown merge error. Multi-threaded flush threads where not shut down properly. 
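The ordering matters because the flush workers block inside ib_wqueue_wait() on the shared work queue: shutdown has to post one exit work item per worker (tsk = MT_WRK_NONE, wi_status = WRK_ITEM_EXIT), each worker acknowledges it on the write completion queue (the MT_WRK_NONE branch of service_page_comp_io()), and only after the workers are gone may the queues and the pgcomp mutex be freed. The sketch below is a simplified standalone model of that handshake, not the InnoDB code itself: std::thread and std::condition_variable stand in for os_thread_create()/ib_wqueue_t/os_fast_mutex, and the WorkItem/Queue names are illustrative only.

// Simplified model of the multi-threaded-flush shutdown handshake:
// one "exit" work item per worker is queued, each worker acknowledges it
// on the completion queue, and only then is anything torn down.
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

struct WorkItem {
    bool exit_marker;    // models tsk == MT_WRK_NONE / wi_status == WRK_ITEM_EXIT
    int  pool_instance;  // models the buffer pool instance to flush
};

struct Queue {                               // models ib_wqueue_t
    std::mutex mtx;
    std::condition_variable cv;
    std::queue<WorkItem*> items;

    void add(WorkItem* wi) {                 // models ib_wqueue_add()
        std::lock_guard<std::mutex> lk(mtx);
        items.push(wi);
        cv.notify_one();
    }
    WorkItem* wait() {                       // models ib_wqueue_wait()
        std::unique_lock<std::mutex> lk(mtx);
        cv.wait(lk, [this] { return !items.empty(); });
        WorkItem* wi = items.front();
        items.pop();
        return wi;
    }
};

int main() {
    const int n_workers = 4;
    Queue work_queue, completion_queue;
    std::vector<std::thread> workers;
    std::vector<WorkItem> exit_items(n_workers, WorkItem{true, -1});

    for (int i = 0; i < n_workers; i++) {
        workers.emplace_back([&] {
            for (;;) {
                WorkItem* wi = work_queue.wait();
                const bool is_exit = wi->exit_marker;
                // A real worker would flush wi->pool_instance here (MT_WRK_WRITE).
                completion_queue.add(wi);    // acknowledge on the completion queue
                if (is_exit) {
                    return;                  // leave the service loop for good
                }
            }
        });
    }

    // Shutdown: post one exit item per worker ...
    for (int i = 0; i < n_workers; i++) {
        work_queue.add(&exit_items[i]);
    }
    // ... collect every acknowledgement, and only then release resources.
    for (int i = 0; i < n_workers; i++) {
        (void) completion_queue.wait();
    }
    for (auto& t : workers) {
        t.join();
    }
    std::puts("all flush workers exited cleanly");
    return 0;
}

This is the rule innobase_shutdown_for_mysql() follows after this patch: page_comp_io_thread_exit() first, pgcomp_deinit() only once the worker threads are no longer waiting on the queue.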
--- storage/innobase/buf/buf0flu.cc | 4 ++++ storage/innobase/srv/srv0start.cc | 4 ++++ storage/xtradb/buf/buf0flu.cc | 24 ++++++++++++++++++++++-- storage/xtradb/srv/srv0start.cc | 20 +++++++++++++++++--- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index ff1fab6eae7..421d105b00f 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1973,6 +1973,10 @@ void pgcomp_init(void) { os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &pgcomp_mtx); } +void pgcomp_deinit(void) +{ + os_fast_mutex_free(&pgcomp_mtx); +} /*******************************************************************//** Multi-threaded version of buf_flush_list diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index dd327769d68..318f6b0500c 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1444,6 +1444,7 @@ extern void buf_flush_end(buf_pool_t* buf_pool, enum buf_flush flush_type); extern void buf_flush_common(enum buf_flush flush_type, ulint page_count); extern ulint buf_flush_batch(buf_pool_t* buf_pool, enum buf_flush flush_type, ulint min_n, lsn_t lsn_limit); extern void pgcomp_init(void); +extern void pgcomp_deinit(void); typedef enum wrk_status { WRK_ITEM_SET=0, // wrk-item is set @@ -3277,6 +3278,9 @@ innobase_shutdown_for_mysql(void) fprintf(stderr, "%s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); #endif + /* h. Remove the mutex */ + pgcomp_deinit(); + os_mutex_enter(os_sync_mutex); if (os_thread_count == 0) { diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index e85d1215422..b70dc23d7e0 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1931,6 +1931,21 @@ buf_flush_wait_batch_end( } /* JAN: TODO: */ + +void buf_pool_enter_LRU_mutex( + buf_pool_t* buf_pool) +{ + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + mutex_enter(&buf_pool->LRU_list_mutex); +} + +void buf_pool_exit_LRU_mutex( + buf_pool_t* buf_pool) +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + mutex_exit(&buf_pool->LRU_list_mutex); +} + /*******************************************************************//** This utility flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free @@ -2053,6 +2068,11 @@ void pgcomp_init(void) os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &pgcomp_mtx); } +void pgcomp_deinit(void) +{ + os_fast_mutex_free(&pgcomp_mtx); +} + /*******************************************************************//** Multi-threaded version of buf_flush_list */ @@ -2096,11 +2116,11 @@ pgcomp_buf_flush_list( #ifdef UNIV_DEBUG gettimeofday(&p_start_time, 0x0); #endif - os_fast_mutex_lock(&pgcomp_mtx); + // os_fast_mutex_lock(&pgcomp_mtx); pgcomp_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LIST, min_n, lsn_limit); - os_fast_mutex_unlock(&pgcomp_mtx); + // os_fast_mutex_unlock(&pgcomp_mtx); for (i = 0; i < srv_buf_pool_instances; i++) { if (n_processed) { diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 4f3570249d7..37324118fc7 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -1520,6 +1520,9 @@ extern void buf_flush_common(buf_flush_t flush_type, ulint page_count); extern ulint buf_flush_batch(buf_pool_t* buf_pool, buf_flush_t flush_type, ulint min_n, lsn_t lsn_limit, bool limited_lru_scan, flush_counters_t* n); extern void pgcomp_init(void); 
+extern void pgcomp_deinit(void); +extern void buf_pool_enter_LRU_mutex(buf_pool_t*); +extern void buf_pool_exit_LRU_mutex(buf_pool_t*); typedef enum wrk_status { WRK_ITEM_SET=0, // wrk-item is set @@ -1554,7 +1557,6 @@ typedef struct wr_tsk { ulint min; //minimum number of pages requested to be flushed lsn_t lsn_limit;//lsn limit for the buffer-pool flush operation } wr_tsk_t; - typedef struct rd_tsk { void *page_pool; //list of pages to decompress; @@ -1665,9 +1667,9 @@ int flush_pool_instance(wrk_t *wi) /* srv_LRU_scan_depth can be arbitrarily large value. * We cap it with current LRU size. */ - buf_pool_mutex_enter(wi->wr.buf_pool); + buf_pool_enter_LRU_mutex(wi->wr.buf_pool); wi->wr.min = UT_LIST_GET_LEN(wi->wr.buf_pool->LRU); - buf_pool_mutex_exit(wi->wr.buf_pool); + buf_pool_exit_LRU_mutex(wi->wr.buf_pool); wi->wr.min = ut_min(srv_LRU_scan_depth,wi->wr.min); } @@ -3407,8 +3409,20 @@ innobase_shutdown_for_mysql(void) logs_empty_and_mark_files_at_shutdown() and should have already quit or is quitting right now. */ + /* g. Exit the multi threaded flush threads */ + + page_comp_io_thread_exit(); + +#ifdef UNIV_DEBUG + fprintf(stderr, "%s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); +#endif + + /* h. Remove the mutex */ + pgcomp_deinit(); + os_mutex_enter(os_sync_mutex); + if (os_thread_count == 0) { /* All the threads have exited or are just exiting; NOTE that the threads may not have completed their From 7f3950a2aedd55b299735645882b48917a380be3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 6 Feb 2014 17:25:26 +0200 Subject: [PATCH 13/56] Moved mt-flush code to buf0mtflu.[cc|h] and cleaned it up. This is for InnoDB. --- storage/innobase/CMakeLists.txt | 3 +- storage/innobase/buf/buf0flu.cc | 235 +--- storage/innobase/buf/buf0mtflu.cc | 1493 ++++++++++---------------- storage/innobase/include/buf0flu.h | 49 + storage/innobase/include/buf0mtflu.h | 95 ++ storage/innobase/include/srv0srv.h | 2 +- storage/innobase/include/srv0start.h | 3 +- storage/innobase/srv/srv0srv.cc | 4 +- storage/innobase/srv/srv0start.cc | 425 +------- storage/xtradb/buf/buf0flu.cc | 3 + 10 files changed, 727 insertions(+), 1585 deletions(-) create mode 100644 storage/innobase/include/buf0mtflu.h diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 0b1043bc421..64c22f9f7df 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -278,8 +278,7 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc -# TODO: JAN uncomment -# buf/buf0mtflu.cc + buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 421d105b00f..d131f2efb44 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -32,6 +32,7 @@ Created 11/11/1995 Heikki Tuuri #endif #include "buf0buf.h" +#include "buf0mtflu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" @@ -1680,7 +1681,6 @@ pages: to avoid deadlocks, this function must be written so that it cannot end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! 
@return number of blocks for which the write request was queued */ -//static ulint buf_flush_batch( /*============*/ @@ -1737,7 +1737,6 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -//static void buf_flush_common( /*=============*/ @@ -1762,7 +1761,6 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -//static ibool buf_flush_start( /*============*/ @@ -1791,7 +1789,6 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -//static void buf_flush_end( /*==========*/ @@ -1846,50 +1843,6 @@ buf_flush_wait_batch_end( } } -/* JAN: TODO: */ -/*******************************************************************//** -This utility flushes dirty blocks from the end of the LRU list and also -puts replaceable clean pages from the end of the LRU list to the free -list. -NOTE: The calling thread is not allowed to own any latches on pages! -@return true if a batch was queued successfully. false if another batch -of same type was already running. */ -static -bool -pgcomp_buf_flush_LRU( -/*==========*/ - buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - ulint* n_processed) /*!< out: the number of pages - which were processed is passed - back to caller. Ignored if NULL */ -{ - ulint page_count; - - if (n_processed) { - *n_processed = 0; - } - - if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { - return(false); - } - - page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0); - - buf_flush_end(buf_pool, BUF_FLUSH_LRU); - - buf_flush_common(BUF_FLUSH_LRU, page_count); - - if (n_processed) { - *n_processed = page_count; - } - - return(true); -} -/* JAN: TODO: END: */ - /*******************************************************************//** This utility flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free @@ -1932,125 +1885,6 @@ buf_flush_LRU( return(true); } -/* JAN: TODO: */ -/*******************************************************************//**/ -extern int is_pgcomp_wrk_init_done(void); -extern int pgcomp_flush_work_items( - int buf_pool_inst, - int *pages_flushed, - enum buf_flush flush_type, - int min_n, - lsn_t lsn_limit); - -#define MT_COMP_WATER_MARK 50 - -#ifdef UNIV_DEBUG -#include -int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time) -{ - if (g_time->tv_usec < s_time->tv_usec) - { - int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1; - s_time->tv_usec -= 1000000 * nsec; - s_time->tv_sec += nsec; - } - if (g_time->tv_usec - s_time->tv_usec > 1000000) - { - int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000; - s_time->tv_usec += 1000000 * nsec; - s_time->tv_sec -= nsec; - } - d_time->tv_sec = g_time->tv_sec - s_time->tv_sec; - d_time->tv_usec = g_time->tv_usec - s_time->tv_usec; - - return 0; -} -#endif - -static os_fast_mutex_t pgcomp_mtx; - -void pgcomp_init(void) -{ - os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &pgcomp_mtx); -} -void pgcomp_deinit(void) -{ - os_fast_mutex_free(&pgcomp_mtx); -} - -/*******************************************************************//** -Multi-threaded version of buf_flush_list -*/ -UNIV_INTERN 
-bool -pgcomp_buf_flush_list( -/*==================*/ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all - blocks whose oldest_modification is - smaller than this should be flushed - (if their number does not exceed - min_n), otherwise ignored */ - ulint* n_processed) /*!< out: the number of pages - which were processed is passed - back to caller. Ignored if NULL */ - -{ - ulint i; - bool success = true; -#ifdef UNIV_DEBUG - struct timeval p_start_time, p_end_time, d_time; -#endif - int cnt_flush[MTFLUSH_MAX_WORKER]; - - if (n_processed) { - *n_processed = 0; - } - - if (min_n != ULINT_MAX) { - /* Ensure that flushing is spread evenly amongst the - buffer pool instances. When min_n is ULINT_MAX - we need to flush everything up to the lsn limit - so no limit here. */ - min_n = (min_n + srv_buf_pool_instances - 1) - / srv_buf_pool_instances; - } - -#ifdef UNIV_DEBUG - gettimeofday(&p_start_time, 0x0); -#endif - os_fast_mutex_lock(&pgcomp_mtx); - pgcomp_flush_work_items(srv_buf_pool_instances, - cnt_flush, BUF_FLUSH_LIST, - min_n, lsn_limit); - os_fast_mutex_unlock(&pgcomp_mtx); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (n_processed) { - *n_processed += cnt_flush[i]; - } - if (cnt_flush[i]) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - cnt_flush[i]); - } - } -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu %llu usec]\n", - __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); -#endif - return(success); -} - -/* JAN: TODO: END: */ - /*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. @@ -2078,11 +1912,9 @@ buf_flush_list( ulint i; bool success = true; - /* JAN: TODO: */ - if (is_pgcomp_wrk_init_done()) { - return(pgcomp_buf_flush_list(min_n, lsn_limit, n_processed)); + if (buf_mtflu_init_done()) { + return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed)); } - /* JAN: TODO: END: */ if (n_processed) { *n_processed = 0; @@ -2237,60 +2069,6 @@ buf_flush_single_page_from_LRU( return(freed); } -/* JAN: TODO: */ -/*********************************************************************//** -pgcomp_Clears up tail of the LRU lists: -* Put replaceable pages at the tail of LRU to the free list -* Flush dirty pages at the tail of LRU to the disk -The depth to which we scan each buffer pool is controlled by dynamic -config parameter innodb_LRU_scan_depth. 
-@return total pages flushed */ -UNIV_INTERN -ulint -pgcomp_buf_flush_LRU_tail(void) -/*====================*/ -{ -#ifdef UNIV_DEBUG - struct timeval p_start_time, p_end_time, d_time; -#endif - ulint total_flushed=0, i=0; - int cnt_flush[32]; - -#ifdef UNIV_DEBUG - gettimeofday(&p_start_time, 0x0); -#endif - ut_ad(is_pgcomp_wrk_init_done()); - - os_fast_mutex_lock(&pgcomp_mtx); - pgcomp_flush_work_items(srv_buf_pool_instances, - cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); - os_fast_mutex_unlock(&pgcomp_mtx); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (cnt_flush[i]) { - total_flushed += cnt_flush[i]; - - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_PAGES, - cnt_flush[i]); - } - } - -#if UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - - fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); -#endif - - return(total_flushed); -} -/* JAN: TODO: END: */ - /*********************************************************************//** Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list @@ -2304,12 +2082,11 @@ buf_flush_LRU_tail(void) /*====================*/ { ulint total_flushed = 0; - /* JAN: TODO: */ - if(is_pgcomp_wrk_init_done()) + + if(buf_mtflu_init_done()) { - return(pgcomp_buf_flush_LRU_tail()); + return(buf_mtflu_flush_LRU_tail()); } - /* JAN: TODO: END */ for (ulint i = 0; i < srv_buf_pool_instances; i++) { diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index 7abe0547877..901f766c472 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -1,7 +1,7 @@ /***************************************************************************** -Copyright (C) 2013 Fusion-io. All Rights Reserved. -Copyright (C) 2013 SkySQL Ab. All Rights Reserved. +Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,124 +23,536 @@ Multi-threaded flush method implementation Created 06/11/2013 Dhananjoy Das DDas@fusionio.com Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com +Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com +Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com ***********************************************************************/ -#include - -#ifdef UNIV_PFS_MUTEX -/* Key to register fil_system_mutex with performance schema */ -UNIV_INTERN mysql_pfs_key_t mtflush_mutex_key; -#endif /* UNIV_PFS_MUTEX */ - -/* Mutex to protect critical sections during multi-threaded flush */ -ib_mutex_t mt_flush_mutex; +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0mtflu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "ut0byte.h" +#include "ut0lst.h" +#include "page0page.h" +#include "fil0fil.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" +#include "os0sync.h" +#include "trx0sys.h" +#include "srv0mon.h" +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" #define MT_COMP_WATER_MARK 50 /* Work item status */ -typedef enum { - WORK_ITEM_SET=0, /* Work item information set */ - WORK_ITEM_START=1, /* Work item assigned to thread and - execution started */ - WORK_ITEM_DONE=2, /* Work item execution done */ -} mtflu_witem_status_t; +typedef enum wrk_status { + WRK_ITEM_SET=0, /*!< Work item is set */ + WRK_ITEM_START=1, /*!< Processing of work item has started */ + WRK_ITEM_DONE=2, /*!< Processing is done usually set to + SUCCESS/FAILED */ + WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ + WRK_ITEM_FAILED=3, /*!< Work item process failed */ + WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +/* Work item task type */ +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, /*!< Exit queue-wait */ + MT_WRK_WRITE=1, /*!< Flush operation */ + MT_WRK_READ=2, /*!< Read operation */ + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; /* Work thread status */ -typedef enum { - WORK_THREAD_NOT_INIT=0, /* Work thread not initialized */ - WORK_THREAD_INITIALIZED=1, /* Work thread initialized */ - WORK_THREAD_SIG_WAITING=2, /* Work thred signaled */ - WORK_THREAD_RUNNING=3, /* Work thread running */ - WORK_THREAD_NO_WORK=4, /* Work thread has no work to do */ -} mtflu_wthr_status_t; +typedef enum wthr_status { + WTHR_NOT_INIT=0, /*!< Work thread not initialized */ + WTHR_INITIALIZED=1, /*!< Work thread initialized */ + WTHR_SIG_WAITING=2, /*!< Work thread wating signal */ + WTHR_RUNNING=3, /*!< Work thread running */ + WTHR_NO_WORK=4, /*!< Work thread has no work */ + WTHR_KILL_IT=5, /*!< Work thread should exit */ + WTHR_STATUS_UNDEFINED +} wthr_status_t; -/* Structure containing multi-treaded flush thread information */ -typedef struct { - os_thread_t wthread_id; /* Thread id */ - opq_t *wq; /* Write queue ? */ - opq_t *cq; /* Commit queue ?*/ - ib_mutex_t thread_mutex; /* Mutex proecting below - structures */ - mtflu_wthr_status_t thread_status; /* Thread status */ - ib_uint64_t total_num_processed; /* Total number of - pages processed */ - ib_uint64_t cycle_num_processed; /* Numper of pages - processed on last - cycle */ - ulint check_wrk_done_count; /* Number of pages - to process in this - work item ? 
*/ - ulint done_cnt_flag; /* Number of pages - processed in this - work item ?*/ -} mtflu_thread_t; +/* Write work task */ +typedef struct wr_tsk { + buf_pool_t *buf_pool; /*!< buffer-pool instance */ + enum buf_flush flush_type; /*!< flush-type for buffer-pool + flush operation */ + ulint min; /*!< minimum number of pages + requested to be flushed */ + lsn_t lsn_limit; /*!< lsn limit for the buffer-pool + flush operation */ +} wr_tsk_t; -struct work_item_t { - /****************************/ - /* Need to group into struct*/ - buf_pool_t* buf_pool; //buffer-pool instance - int flush_type; //flush-type for buffer-pool flush operation - ulint min; //minimum number of pages requested to be flushed - lsn_t lsn_limit; //lsn limit for the buffer-pool flush operation - /****************************/ +/* Read work task */ +typedef struct rd_tsk { + buf_pool_t *page_pool; /*!< list of pages to decompress; */ +} rd_tsk_t; - unsigned long result; //flush pages count - unsigned long t_usec; //time-taken in usec - os_thread_t id_usr; /* thread-id - currently working , why ? */ - mtflu_witem_status_t wi_status; /* work item status */ - - UT_LIST_NODE_T(work_node_t) next; -}; - -/* Multi-threaded flush system structure */ -typedef struct { - int pgc_n_threads = 8;// ??? why what this is - - mtflu_thread_t pc_sync[PGCOMP_MAX_WORKER]; - wrk_t work_items[PGCOMP_MAX_WORKER]; - int pgcomp_wrk_initialized = -1; /* ???? */ - opq_t wq; /* write queue ? */ - opq_t cq; /* commit queue ? */ -} mtflu_system_t; - -typedef enum op_q_status { - Q_NOT_INIT=0, - Q_EMPTY=1, - Q_INITIALIZED=2, - Q_PROCESS=3, - Q_DONE=4, - Q_ERROR=5, - Q_STATUS_UNDEFINED -} q_status_t; - -// NOTE: jan: could we use ut/ut0wqueue.(h|cc) -// NOTE: jan: here ????, it would handle waiting, signaling -// and contains simple interface - -typedef struct op_queue +/* Work item */ +typedef struct wrk_itm { - ib_mutex_t mtx; /* Mutex protecting below variables - */ - os_cond_t cv; /* ? is waiting here ? */ - q_status_t flag; /* Operation queue status */ - UT_LIST_BASE_NODE_T(work_item_t) work_list; -} opq_t; + mt_wrk_tsk_t tsk; /*!< Task type. Based on task-type + one of the entries wr_tsk/rd_tsk + will be used */ + wr_tsk_t wr; /*!< Flush page list */ + rd_tsk_t rd; /*!< Decompress page list */ + ulint n_flushed; /*!< Flushed pages count */ + os_thread_t id_usr; /*!< Thread-id currently working */ + wrk_status_t wi_status; /*!< Work item status */ + struct wrk_itm *next; /*!< Next work item */ +} wrk_t; + +/* Thread syncronization data */ +typedef struct thread_sync +{ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + ib_wqueue_t *wq; /*!< Work Queue */ + ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ + ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ + wthr_status_t wt_status; /*!< Worker thread status */ + ulint stat_universal_num_processed; + /*!< Total number of pages + processed by this thread */ + ulint stat_cycle_num_processed; + /*!< Number of pages processed + on this cycle */ + mem_heap_t* wheap; /*!< Work heap where memory + is allocated */ + wrk_t* work_item; /*!< Work items to be processed */ +} thread_sync_t; + +/* QUESTION: Is this array used from several threads concurrently ? */ +// static wrk_t work_items[MTFLUSH_MAX_WORKER]; + +/* TODO: REALLY NEEDED ? */ +static int mtflush_work_initialized = -1; +static os_fast_mutex_t mtflush_mtx; +static thread_sync_t* mtflush_ctx=NULL; + +/******************************************************************//** +Initialize work items. 
*/ +static +void +mtflu_setup_work_items( +/*===================*/ + wrk_t* work_items, /*!< inout: Work items */ + ulint n_items) /*!< in: Number of work items */ +{ + ulint i; + for(i=0; iwr.buf_pool != NULL); + + if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ +#ifdef UNIV_DEBUG + /* QUESTION: is this a really failure ? */ + fprintf(stderr, "flush_start Failed, flush_type:%d\n", + work_item->wr.flush_type); +#endif + return 0; + } -/*******************************************************************//** -Initialize multi-threaded flush. + if (work_item->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. + */ + buf_pool_mutex_enter(work_item->wr.buf_pool); + work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU); + buf_pool_mutex_exit(work_item->wr.buf_pool); + work_item->wr.min = ut_min(srv_LRU_scan_depth,work_item->wr.min); + } + + work_item->n_flushed = buf_flush_batch(work_item->wr.buf_pool, + work_item->wr.flush_type, + work_item->wr.min, + work_item->wr.lsn_limit); + + buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); + buf_flush_common(work_item->wr.flush_type, work_item->n_flushed); + + return 0; +} + +#ifdef UNIV_DEBUG +/******************************************************************//** +Output work item list status, +*/ +static +void +mtflu_print_work_list( +/*==================*/ + wrk_t* wi_list) /*!< in: Work item list */ +{ + wrk_t* wi = wi_list; + ulint i=0; + + if(!wi_list) { + fprintf(stderr, "list NULL\n"); + } + + while(wi) { + fprintf(stderr, "-\t[%p]\t[%s]\t[%lu] > %p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->n_flushed, wi->next); + wi = wi->next; + i++; + } + fprintf(stderr, "list len: %d\n", i); +} +#endif /* UNIV_DEBUG */ + +/******************************************************************//** +Worker function to wait for work items and processing them and +sending reply back. +*/ +static +void +mtflush_service_io( +/*===============*/ + thread_sync_t* mtflush_io) /*!< inout: multi-threaded flush + syncronization data */ +{ + wrk_t *work_item = NULL; + ulint n_flushed=0; + ib_time_t max_wait_usecs = 5000000; + + mtflush_io->wt_status = WTHR_SIG_WAITING; + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, max_wait_usecs); + +#ifdef UNIV_DEBUG + mtflu_print_work_list(mtflush_io->work_item); +#endif + + if (work_item) { + mtflush_io->wt_status = WTHR_RUNNING; + } else { + /* Because of timeout this thread did not get any work */ + mtflush_io->wt_status = WTHR_NO_WORK; + return; + } + + work_item->id_usr = mtflush_io->wthread; + + switch(work_item->tsk) { + case MT_WRK_NONE: + ut_a(work_item->wi_status == WRK_ITEM_EXIT); + work_item->wi_status = WRK_ITEM_SUCCESS; + /* QUESTION: Why completed work items are inserted to + completion queue ? */ + ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + break; + + case MT_WRK_WRITE: + work_item->wi_status = WRK_ITEM_START; + /* Process work item */ + /* QUESTION: Is this a really a error ? 
*/ + if (0 != (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { + fprintf(stderr, "FLUSH op failed ret:%lu\n", n_flushed); + work_item->wi_status = WRK_ITEM_FAILED; + } + work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + break; + + case MT_WRK_READ: + /* Need to also handle the read case */ + /* TODO: ? */ + ut_a(0); + /* completed task get added to rd_cq */ + /* work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->rd_cq, work_item, mtflush_io->wheap);*/ + break; + + default: + /* None other than Write/Read handling planned */ + ut_a(0); + } + + mtflush_io->wt_status = WTHR_NO_WORK; +} + +/******************************************************************//** +Thead used to flush dirty pages when multi-threaded flush is +used. +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(mtflush_io_thread)( +/*==============================*/ + void * arg) +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)arg); + + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + mtflush_service_io(mtflush_io); + mtflush_io->stat_cycle_num_processed = 0; + } + + /* This should make sure that all current work items are + processed before threads exit. */ + while (!ib_wqueue_is_empty(mtflush_io->wq)) { + mtflush_service_io(mtflush_io); + } + + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. */ void -buf_mtflu_init(void) -/*================*/ +buf_mtflu_io_thread_exit(void) +/*==========================*/ { - mutex_create(mtflush_mutex_key, - &mt_flush_mutex, SYNC_ANY_LATCH); + ulint i; + thread_sync_t* mtflush_io = mtflush_ctx; + + ut_a(mtflush_io != NULL); + + fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", + srv_buf_pool_instances); + + /* Send one exit work item/thread */ + for (i=0; i < srv_buf_pool_instances; i++) { + mtflush_io->work_item[i].wr.buf_pool = NULL; + mtflush_io->work_item[i].rd.page_pool = NULL; + mtflush_io->work_item[i].tsk = MT_WRK_NONE; + mtflush_io->work_item[i].wi_status = WRK_ITEM_EXIT; + + ib_wqueue_add(mtflush_io->wq, + (void *)&(mtflush_io->work_item[i]), + mtflush_io->wheap); + } + + /* Wait until all work items on a work queue are processed */ + while(!ib_wqueue_is_empty(mtflush_io->wq)) { + /* Wait about 1/2 sec */ + os_thread_sleep(50000); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Collect all work done items */ + for (i=0; i < srv_buf_pool_instances;) { + wrk_t* work_item; + + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); + + if (work_item) { + i++; + } + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); + ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); + + /* Free all queues */ + ib_wqueue_free(mtflush_io->wq); + ib_wqueue_free(mtflush_io->wr_cq); + ib_wqueue_free(mtflush_io->rd_cq); + + /* Free heap */ + mem_heap_free(mtflush_io->wheap); + + os_fast_mutex_free(&mtflush_mtx); +} + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. 
*/ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt) /*!< in: Number of work items */ +{ + ulint i; + mem_heap_t* mtflush_heap; + ib_wqueue_t* mtflush_work_queue; + ib_wqueue_t* mtflush_write_comp_queue; + ib_wqueue_t* mtflush_read_comp_queue; + wrk_t* work_items; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + + /* Create heap, work queue, write completion queue, read + completion queue for multi-threaded flush, and init + handler. */ + mtflush_heap = mem_heap_create(0); + ut_a(mtflush_heap != NULL); + mtflush_work_queue = ib_wqueue_create(); + ut_a(mtflush_work_queue != NULL); + mtflush_write_comp_queue = ib_wqueue_create(); + ut_a(mtflush_write_comp_queue != NULL); + mtflush_read_comp_queue = ib_wqueue_create(); + ut_a(mtflush_read_comp_queue != NULL); + + mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, + MTFLUSH_MAX_WORKER * sizeof(thread_sync_t)); + ut_a(mtflush_ctx != NULL); + work_items = (wrk_t*)mem_heap_alloc(mtflush_heap, + MTFLUSH_MAX_WORKER * sizeof(wrk_t)); + ut_a(work_items != NULL); + + /* Initialize work items */ + mtflu_setup_work_items(work_items, MTFLUSH_MAX_WORKER); + + /* Create threads for page-compression-flush */ + for(i=0; i < n_threads; i++) { + os_thread_id_t new_thread_id; + mtflush_ctx[i].wq = mtflush_work_queue; + mtflush_ctx[i].wr_cq = mtflush_write_comp_queue; + mtflush_ctx[i].rd_cq = mtflush_read_comp_queue; + mtflush_ctx[i].wheap = mtflush_heap; + mtflush_ctx[i].wt_status = WTHR_INITIALIZED; + mtflush_ctx[i].work_item = work_items; + + mtflush_ctx[i].wthread = os_thread_create( + mtflush_io_thread, + ((void *)(mtflush_ctx + i)), + &new_thread_id); + + mtflush_ctx[i].wthread_id = new_thread_id; + } + + buf_mtflu_work_init(); + + return((void *)mtflush_ctx); +} + +/******************************************************************//** +Flush buffer pool instances. +@return number of pages flushed. 
*/ +ulint +buf_mtflu_flush_work_items( +/*=======================*/ + ulint buf_pool_inst, /*!< in: Number of buffer pool instances */ + ulint *per_pool_pages_flushed, /*!< out: Number of pages + flushed/instance */ + enum buf_flush flush_type, /*!< in: Type of flush */ + ulint min_n, /*!< in: Wished minimum number of + blocks to be flushed */ + lsn_t lsn_limit) /*!< in: All blocks whose + oldest_modification is smaller than + this should be flushed (if their + number does not exceed min_n) */ +{ + ulint n_flushed=0, i; + wrk_t *done_wi; + + for(i=0;iwork_item[i].tsk = MT_WRK_WRITE; + mtflush_ctx->work_item[i].rd.page_pool = NULL; + mtflush_ctx->work_item[i].wr.buf_pool = buf_pool_from_array(i); + mtflush_ctx->work_item[i].wr.flush_type = flush_type; + mtflush_ctx->work_item[i].wr.min = min_n; + mtflush_ctx->work_item[i].wr.lsn_limit = lsn_limit; + mtflush_ctx->work_item[i].id_usr = -1; + mtflush_ctx->work_item[i].wi_status = WRK_ITEM_SET; + + ib_wqueue_add(mtflush_ctx->wq, + (void *)(&(mtflush_ctx->work_item[i])), + mtflush_ctx->wheap); + } + + /* wait on the completion to arrive */ + for(i=0; i< buf_pool_inst;) { + done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, 50000); + + if (done_wi != NULL) { + if(done_wi->n_flushed == 0) { + per_pool_pages_flushed[i] = 0; + } else { + per_pool_pages_flushed[i] = done_wi->n_flushed; + } + + if(done_wi->id_usr == -1 && + done_wi->wi_status == WRK_ITEM_SET ) { + fprintf(stderr, + "**Set/Unused work_item[%d] flush_type=%lu\n", + i, + done_wi->wr.flush_type); + ut_a(0); + } + + n_flushed+= done_wi->n_flushed; + /* Reset for next round*/ + mtflush_ctx->work_item[i].id_usr = -1; + + i++; + } + } + + return(n_flushed); } /*******************************************************************//** -This utility flushes dirty blocks from the end of the LRU list and also +Flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free list. NOTE: The calling thread is not allowed to own any latches on pages! @@ -180,44 +592,9 @@ buf_mtflu_flush_LRU( return(true); } -#ifdef UNIV_DEBUG /*******************************************************************//** -Utility function to calculate time difference between start time -and end time. -@return Time difference. +Multi-threaded version of buf_flush_list */ -UNIV_INTERN -void -mtflu_timediff( -/*===========*/ - struct timeval *g_time, /*!< in/out: Start time*/ - struct timeval *s_time, /*!< in/out: End time */ - struct timeval *d_time) /*!< out: Time difference */ -{ - if (g_time->tv_usec < s_time->tv_usec) - { - int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1; - s_time->tv_usec -= 1000000 * nsec; - s_time->tv_sec += nsec; - } - if (g_time->tv_usec - s_time->tv_usec > 1000000) - { - int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000; - s_time->tv_usec += 1000000 * nsec; - s_time->tv_sec -= nsec; - } - d_time->tv_sec = g_time->tv_sec - s_time->tv_sec; - d_time->tv_usec = g_time->tv_usec - s_time->tv_usec; -} -#endif - -/*******************************************************************//** -This utility flushes dirty blocks from the end of the flush list of -all buffer pool instances. This is multi-threaded version of buf_flush_list. -NOTE: The calling thread is not allowed to own any latches on pages! -@return true if a batch was queued successfully for each buffer pool -instance. 
false if another batch of same type was already running in -at least one of the buffer pool instance */ bool buf_mtflu_flush_list( /*=================*/ @@ -236,7 +613,7 @@ buf_mtflu_flush_list( { ulint i; bool success = true; - struct timeval p_start_time, p_end_time, d_time; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; if (n_processed) { *n_processed = 0; @@ -251,853 +628,91 @@ buf_mtflu_flush_list( / srv_buf_pool_instances; } -#ifdef UNIV_DEBUG - gettimeofday(&p_start_time, 0x0); -#endif - if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)) { - int cnt_flush[32]; + /* QUESTION: What is procted by below mutex ? */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&mtflush_mtx); - mutex_enter(&mt_flush_mutex); - -#ifdef UNIV_DEBUG - fprintf(stderr, "Calling into wrk-pgcomp [min:%lu]", min_n); -#endif - pgcomp_flush_work_items(srv_buf_pool_instances, - cnt_flush, BUF_FLUSH_LIST, - min_n, lsn_limit); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (n_processed) { - *n_processed += cnt_flush[i]; - } - if (cnt_flush[i]) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - cnt_flush[i]); - - } - } - - mutex_exit(&pgcomp_mtx); - -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); -#endif - return(success); - } - - /* Flush to lsn_limit in all buffer pool instances */ for (i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool; - ulint page_count = 0; - - buf_pool = buf_pool_from_array(i); - - if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { - /* We have two choices here. If lsn_limit was - specified then skipping an instance of buffer - pool means we cannot guarantee that all pages - up to lsn_limit has been flushed. We can - return right now with failure or we can try - to flush remaining buffer pools up to the - lsn_limit. We attempt to flush other buffer - pools based on the assumption that it will - help in the retry which will follow the - failure. 
*/ - success = false; - - continue; - } - - page_count = buf_flush_batch( - buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit); - - buf_flush_end(buf_pool, BUF_FLUSH_LIST); - - buf_flush_common(BUF_FLUSH_LIST, page_count); - if (n_processed) { - *n_processed += page_count; + *n_processed += cnt_flush[i]; } - - if (page_count) { + if (cnt_flush[i]) { MONITOR_INC_VALUE_CUMULATIVE( MONITOR_FLUSH_BATCH_TOTAL_PAGE, MONITOR_FLUSH_BATCH_COUNT, MONITOR_FLUSH_BATCH_PAGES, - page_count); + cnt_flush[i]); } } - #ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - - fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); #endif return(success); } /*********************************************************************//** -Clear up tail of the LRU lists: +Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list * Flush dirty pages at the tail of LRU to the disk The depth to which we scan each buffer pool is controlled by dynamic config parameter innodb_LRU_scan_depth. @return total pages flushed */ +UNIV_INTERN ulint buf_mtflu_flush_LRU_tail(void) /*==========================*/ { - ulint total_flushed=0, i=0; - int cnt_flush[32]; + ulint total_flushed=0, i; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; -#ifdef UNIV_DEBUG - struct timeval p_start_time, p_end_time, d_time; - gettimeofday(&p_start_time, 0x0); -#endif - assert(is_pgcomp_wrk_init_done()); + ut_a(buf_mtflu_init_done()); - mutex_enter(&pgcomp_mtx); - pgcomp_flush_work_items(srv_buf_pool_instances, + /* QUESTION: What is protected by below mutex ? */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&mtflush_mtx); for (i = 0; i < srv_buf_pool_instances; i++) { if (cnt_flush[i]) { total_flushed += cnt_flush[i]; MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_TOTAL_PAGE, MONITOR_LRU_BATCH_COUNT, MONITOR_LRU_BATCH_PAGES, cnt_flush[i]); } } - mutex_exit(&pgcomp_mtx); - #if UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - - fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); #endif return(total_flushed); } -/*******************************************************************//** -Set work done count to given count. -@return 1 if still work to do, 0 if no work left */ -int -set_check_done_flag_count(int cnt) -/*================*/ +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. 
*/ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!cq->mtx); - ppc->stat_universal_num_processed++; - ppc->stat_cycle_num_processed++; - done_cnt_flag++; - if(!(done_cnt_flag <= check_wrk_done_count)) { - fprintf(stderr, "ERROR: done_cnt:%d check_wrk_done_count:%d\n", - done_cnt_flag, check_wrk_done_count); - } - assert(done_cnt_flag <= check_wrk_done_count); - mutex_exit(&ppc->cq->mtx); - if(done_cnt_flag == check_wrk_done_count) { - // why below does not need mutex protection ? - ppc->wq->flag = Q_DONE; - mutex_enter(&ppc->cq->mtx); - ppc->cq->flag = Q_DONE; - os_cond_signal(&ppc->cq->cv); - mutex_exit(&ppc->cq->mtx); - } - return(done_cnt_flag); -} - -/*******************************************************************//** -Remove work item from queue, in my opinion not needed after we use -UT_LIST -@return number of pages flushed */ -int -q_remove_wrk(opq_t *q, wrk_t **wi) -/*================*/ -{ - int ret = 0; - - if(!wi || !q) { - return -1; - } - - mutex_enter(&q->mtx); - assert(!((q->tail == NULL) && (q->head != NULL))); - assert(!((q->tail != NULL) && (q->head == NULL))); - - /* get the first in the list*/ - *wi = q->head; - if(q->head) { - ret = 0; - q->head = q->head->next; - (*wi)->next = NULL; - if(!q->head) { - q->tail = NULL; - } - } else { - q->tail = NULL; - ret = 1; /* indicating remove from queue failed */ - } - mutex_exit(&q->mtx); - return (ret); -} - -/*******************************************************************//** -Return true if work item has being assigned to a thread or false -if work item is not assigned. -@return true if work is assigned, false if not */ -bool -is_busy_wrk_itm(wrk_t *wi) -/*================*/ -{ - if(!wi) { - return -1; - } - return(!(wi->id_usr == -1)); -} - -/*******************************************************************//** -Initialize work items. -@return why ? */ -int -setup_wrk_itm(int items) -/*================*/ -{ - int i; - for(i=0; imtx = os_mutex_create(); - os_cond_init(&q->cv); - q->flag = Q_INITIALIZED; - q->head = q->tail = NULL; - - return 0; -} - -/// NEEDED ? -#if 0 -int drain_cq(opq_t *cq, int items) -{ - int i=0; - - if(!cq) { - return -1; - } - mutex_enter(&cq->mtx); - for(i=0; ihead = cq->tail = NULL; - mutex_unlock(&cq->mtx); - return 0; -} -#endif - -/*******************************************************************//** -Insert work item list to queue, not needed with UT_LIST -@return why ? */ -int -q_insert_wrk_list(opq_t *q, wrk_t *w_list) -/*================*/ -{ - if((!q) || (!w_list)) { - fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list); - return -1; - } - - mutex_enter(&q->mtx); - - assert(!((q->tail == NULL) && (q->head != NULL))); - assert(!((q->tail != NULL) && (q->head == NULL))); - - /* list is empty */ - if(!q->tail) { - q->head = q->tail = w_list; - } else { - /* added the first of the node to list */ - assert(q->head != NULL); - q->tail->next = w_list; - } - - /* move tail to the last node */ - while(q->tail->next) { - q->tail = q->tail->next; - } - mutex_exit(&q->mtx); - - return 0; -} - -/*******************************************************************//** -Flush ? -@return why ? */ -int -flush_pool_instance(wrk_t *wi) -/*================*/ -{ - struct timeval p_start_time, p_end_time, d_time; - - if(!wi) { - fprintf(stderr, "work item invalid wi:%p\n", wi); - return -1; - } - - wi->t_usec = 0; - if (!buf_flush_start(wi->buf_pool, (buf_flush_t)wi->flush_type)) { - /* We have two choices here. 
If lsn_limit was - specified then skipping an instance of buffer - pool means we cannot guarantee that all pages - up to lsn_limit has been flushed. We can - return right now with failure or we can try - to flush remaining buffer pools up to the - lsn_limit. We attempt to flush other buffer - pools based on the assumption that it will - help in the retry which will follow the - failure. */ - fprintf(stderr, "flush_start Failed, flush_type:%d\n", - (buf_flush_t)wi->flush_type); - return -1; - } - -#ifdef UNIV_DEBUG - /* Record time taken for the OP in usec */ - gettimeofday(&p_start_time, 0x0); -#endif - - if((buf_flush_t)wi->flush_type == BUF_FLUSH_LRU) { - /* srv_LRU_scan_depth can be arbitrarily large value. - * We cap it with current LRU size. - */ - buf_pool_mutex_enter(wi->buf_pool); - wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU); - buf_pool_mutex_exit(wi->buf_pool); - wi->min = ut_min(srv_LRU_scan_depth,wi->min); - } - - wi->result = buf_flush_batch(wi->buf_pool, - (buf_flush_t)wi->flush_type, - wi->min, wi->lsn_limit); - - buf_flush_end(wi->buf_pool, (buf_flush_t)wi->flush_type); - buf_flush_common((buf_flush_t)wi->flush_type, wi->result); - -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - - wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); -#endif - return 0; -} - -/*******************************************************************//** -? -@return why ? */ -int -service_page_comp_io(thread_sync_t * ppc) -/*================*/ -{ - wrk_t *wi = NULL; - int ret=0; - struct timespec ts; - - mutex_enter(&ppc->wq->mtx); - do{ - ppc->wt_status = WTHR_SIG_WAITING; - ret = os_cond_wait(&ppc->wq->cv, &ppc->wq->mtx); - ppc->wt_status = WTHR_RUNNING; - if(ret == ETIMEDOUT) { - fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%d] ret:%d\n", - done_cnt_flag, ret); - } else if(ret == EINVAL || ret == EPERM) { - fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%d] ret:%d\n", - done_cnt_flag, ret); - } - if(ppc->wq->flag == Q_PROCESS) { - break; - } else { - mutex_exit(&ppc->wq->mtx); - return -1; - } - } while (ppc->wq->flag == Q_PROCESS && ret == 0); - - mutex_exit(&ppc->wq->mtx); - - while (ppc->cq->flag == Q_PROCESS) { - wi = NULL; - /* Get the work item */ - if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) { - ppc->wt_status = WTHR_NO_WORK; - return -1; - } - - assert(ret==0); - assert(wi != NULL); - assert(0 == is_busy_wrk_itm(wi)); - assert(wi->id_usr == -1); - - wi->id_usr = ppc->wthread; - wi->wi_status = WRK_ITEM_START; - - /* Process work item */ - if(0 != (ret = flush_pool_instance(wi))) { - fprintf(stderr, "FLUSH op failed ret:%d\n", ret); - wi->wi_status = WRK_ITEM_FAILED; - } - ret = q_insert_wrk_list(ppc->cq, wi); - - assert(0==ret); - assert(check_wrk_done_count >= done_cnt_flag); - wi->wi_status = WRK_ITEM_SUCCESS; - if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) { - break; - } - } - return(0); -} - -/******************************************************************//** -Thread main function for multi-threaded flush -@return a dummy parameter*/ -extern "C" UNIV_INTERN -os_thread_ret_t -DECLARE_THREAD(page_comp_io_thread)( -/*==========================================*/ - void * arg) -{ - thread_sync_t *ppc_io = ((thread_sync_t *)arg); - - while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { - service_page_comp_io(ppc_io); - ppc_io->stat_cycle_num_processed = 0; - } - os_thread_exit(NULL); - OS_THREAD_DUMMY_RETURN; -} - -/*******************************************************************//** -Print queue 
work item -@return why ? */ -int -print_queue_wrk_itm(opq_t *q) -/*================*/ -{ -#if UNIV_DEBUG - wrk_t *wi = NULL; - - if(!q) { - fprintf(stderr, "queue NULL\n"); - return -1; - } - - if(!q->head || !q->tail) { - assert(!(((q->tail==NULL) && (q->head!=NULL)) && ((q->tail != NULL) && (q->head == NULL)))); - fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail); - return 0; - } - - mutex_enter(&q->mtx); - for(wi = q->head; (wi != NULL) ; wi = wi->next) { - //fprintf(stderr, "- [%p] %p %lu %luus [%ld] >%p\n", - // wi, wi->buf_pool, wi->result, wi->t_usec, wi->id_usr, wi->next); - fprintf(stderr, "- [%p] [%s] >%p\n", - wi, (wi->id_usr == -1)?"free":"Busy", wi->next); - } - mutex_exit(&q->mtx); -#endif - return(0); -} - -/*******************************************************************//** -Print work list -@return why ? */ -int -print_wrk_list(wrk_t *wi_list) -/*================*/ -{ - wrk_t *wi = wi_list; - int i=0; - - if(!wi_list) { - fprintf(stderr, "list NULL\n"); - } - - while(wi) { - fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n", - wi, (wi->id_usr == -1)?"free":"Busy", wi->result, wi->t_usec, wi->next); - wi = wi->next; - i++; - } - fprintf(stderr, "list len: %d\n", i); - return 0; -} - -/*******************************************************************//** -? -@return why ? */ -int -pgcomp_handler(wrk_t *w_list) -/*================*/ -{ - struct timespec ts; - int ret=0, t_flag=0; - opq_t *wrk_q=NULL, *comp_q=NULL; - wrk_t *tw_list=NULL; - - wrk_q=&wq; - comp_q=&cq; - - mutex_enter(&wrk_q->mtx); - /* setup work queue here.. */ - wrk_q->flag = Q_EMPTY; - mutex_exit(&wrk_q->mtx); - - ret = q_insert_wrk_list(wrk_q, w_list); - if(ret != 0) { - fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n", - __FUNCTION__, &wq, w_list); - return -1; - } - -retry_submit: - mutex_enter(&wrk_q->mtx); - /* setup work queue here.. 
*/ - wrk_q->flag = Q_INITIALIZED; - mutex_exit(&wrk_q->mtx); - - - mutex_enter(&comp_q->mtx); - if(0 != set_done_cnt_flag(0)) { - fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__); - mutex_exit(&comp_q->mtx); - return -1; - } - comp_q->flag = Q_PROCESS; - mutex_enter(&comp_q->mtx); - - /* if threads are waiting request them to start */ - mutex_enter(&wrk_q->mtx); - wrk_q->flag = Q_PROCESS; - os_cond_broadcast(&wrk_q->cv); - mutex_exit(&wrk_q->mtx); - - /* Wait on all worker-threads to complete */ - mutex_enter(&comp_q->mtx); - if (comp_q->flag != Q_DONE) { - do { - os_cond_wait(&comp_q->cv, &comp_q->mtx); - if(comp_q->flag != Q_DONE) { - fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%d\n", - comp_q->flag, done_cnt_flag); - if (done_cnt_flag != srv_buf_pool_instances) { - fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%d\n", - comp_q->flag, done_cnt_flag); - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - } - continue; - } else if (done_cnt_flag != srv_buf_pool_instances) { - fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%d\n", - comp_q->flag, done_cnt_flag); - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - comp_q->flag = Q_INITIALIZED; - mutex_exit(&comp_q->mtx); - goto retry_submit; - - ut_ad(!done_cnt_flag); - continue; - } - ut_ad(done_cnt_flag == srv_buf_pool_instances); - - if ((comp_q->flag == Q_DONE) && - (done_cnt_flag == srv_buf_pool_instances)) { - break; - } - } while((comp_q->flag == Q_INITIALIZED) && - (done_cnt_flag != srv_buf_pool_instances)); - } else { - fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%d\n", - comp_q->flag, done_cnt_flag); - if (!done_cnt_flag) { - fprintf(stderr, "============\n"); - print_wrk_list(w_list); - fprintf(stderr, "============\n"); - comp_q->flag = Q_INITIALIZED; - mutex_enter(&comp_q->mtx); - goto retry_submit; - ut_ad(!done_cnt_flag); - } - ut_ad(done_cnt_flag == srv_buf_pool_instances); - } - - mutex_exit(&comp_q->mtx); - mutex_enter(&wrk_q->mtx); - wrk_q->flag = Q_DONE; - mutex_exit(&wrk_q->mtx); - - return 0; -} - -/******************************************************************//** -@return a dummy parameter*/ -int -pgcomp_handler_init( - int num_threads, - int wrk_cnt, - opq_t *wq, - opq_t *cq) -/*================*/ -{ - int i=0; - - if(is_pgcomp_wrk_init_done()) { - fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n"); - return -1; - } - - if(!wq || !cq) { - fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq); - return -1; - } - - /* work-item setup */ - setup_wrk_itm(wrk_cnt); - - /* wq & cq setup */ - init_queue(wq); - init_queue(cq); - - /* Mark each of the thread sync entires */ - for(i=0; i < PGCOMP_MAX_WORKER; i++) { - pc_sync[i].wthread_id = i; - } - - /* Create threads for page-compression-flush */ - for(i=0; i < num_threads; i++) { - pc_sync[i].wthread_id = i; - pc_sync[i].wq = wq; - pc_sync[i].cq = cq; - os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), - thread_ids + START_PGCOMP_CNT + i); - //pc_sync[i].wthread = thread_ids[START_PGCOMP_CNT + i]; - pc_sync[i].wthread = (START_PGCOMP_CNT + i); - pc_sync[i].wt_status = WTHR_INITIALIZED; - } - - set_check_done_flag_count(wrk_cnt); - set_pgcomp_wrk_init_done(); - - return 0; -} - - -/*******************************************************************//** -Print work thread status information -@return why ? 
*/ -int -wrk_thread_stat( - thread_sync_t *wthr, - unsigned int num_threads) -/*================*/ -{ - long stat_tot=0; - int i=0; - for(i=0; iwr.buf_pool) { - fprintf(stderr, "work-item wi->buf_pool:%p [likely thread exit]\n", - wi->wr.buf_pool); - return -1; - } - - wi->t_usec = 0; - if (!buf_flush_start(wi->wr.buf_pool, wi->wr.flush_type)) { - /* We have two choices here. If lsn_limit was - specified then skipping an instance of buffer - pool means we cannot guarantee that all pages - up to lsn_limit has been flushed. We can - return right now with failure or we can try - to flush remaining buffer pools up to the - lsn_limit. We attempt to flush other buffer - pools based on the assumption that it will - help in the retry which will follow the - failure. */ - fprintf(stderr, "flush_start Failed, flush_type:%d\n", - wi->wr.flush_type); - return -1; - } - -#ifdef UNIV_DEBUG - /* Record time taken for the OP in usec */ - gettimeofday(&p_start_time, 0x0); -#endif - - if (wi->wr.flush_type == BUF_FLUSH_LRU) { - /* srv_LRU_scan_depth can be arbitrarily large value. - * We cap it with current LRU size. - */ - buf_pool_mutex_enter(wi->wr.buf_pool); - wi->wr.min = UT_LIST_GET_LEN(wi->wr.buf_pool->LRU); - buf_pool_mutex_exit(wi->wr.buf_pool); - wi->wr.min = ut_min(srv_LRU_scan_depth,wi->wr.min); - } - - wi->result = buf_flush_batch(wi->wr.buf_pool, - wi->wr.flush_type, - wi->wr.min, wi->wr.lsn_limit); - - buf_flush_end(wi->wr.buf_pool, wi->wr.flush_type); - buf_flush_common(wi->wr.flush_type, wi->result); - -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); -#endif - return 0; -} - -int service_page_comp_io(thread_sync_t * ppc) -{ - wrk_t *wi = NULL; - int ret=0; - - ppc->wt_status = WTHR_SIG_WAITING; - wi = (wrk_t *)ib_wqueue_wait(ppc->wq); - - if (wi) { - ppc->wt_status = WTHR_RUNNING; - } else { - fprintf(stderr, "%s:%d work-item is NULL\n", __FILE__, __LINE__); - ppc->wt_status = WTHR_NO_WORK; - return (0); - } - - assert(wi != NULL); - wi->id_usr = ppc->wthread; - - switch(wi->tsk) { - case MT_WRK_NONE: - assert(wi->wi_status == WRK_ITEM_EXIT); - wi->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); - break; - - case MT_WRK_WRITE: - wi->wi_status = WRK_ITEM_START; - /* Process work item */ - if (0 != (ret = flush_pool_instance(wi))) { - fprintf(stderr, "FLUSH op failed ret:%d\n", ret); - wi->wi_status = WRK_ITEM_FAILED; - } - wi->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); - break; - - case MT_WRK_READ: - /* Need to also handle the read case */ - assert(0); - /* completed task get added to rd_cq */ - /* wi->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(ppc->rd_cq, wi, heap_allocated);*/ - break; - - default: - /* None other than Write/Read handling planned */ - assert(0); - } - - ppc->wt_status = WTHR_NO_WORK; - return(0); -} - -void page_comp_io_thread_exit() -{ - ulint i; - - fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", srv_buf_pool_instances); - for (i=0; istat_cycle_num_processed = 0; - } - os_thread_exit(NULL); - OS_THREAD_DUMMY_RETURN; -} - -int print_wrk_list(wrk_t *wi_list) -{ - wrk_t *wi = wi_list; - int i=0; - - if(!wi_list) { - fprintf(stderr, "list NULL\n"); - } - - while(wi) { - fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n", - wi, (wi->id_usr == -1)?"free":"Busy", wi->result, wi->t_usec, wi->next); - wi = wi->next; - i++; - } - fprintf(stderr, "list len: %d\n", i); - 
return 0; -} - -/******************************************************************//** -@return a dummy parameter*/ -int pgcomp_handler_init(int num_threads, int wrk_cnt, ib_wqueue_t *wq, ib_wqueue_t *wr_cq, ib_wqueue_t *rd_cq) -{ - int i=0; - - if(is_pgcomp_wrk_init_done()) { - fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n"); - return -1; - } - - if(!wq || !wr_cq || !rd_cq) { - fprintf(stderr, "%s() FAILED wq:%p write-cq:%p read-cq:%p\n", - __FUNCTION__, wq, wr_cq, rd_cq); - return -1; - } - - /* work-item setup */ - setup_wrk_itm(wrk_cnt); - - /* Mark each of the thread sync entires */ - for(i=0; i < MTFLUSH_MAX_WORKER; i++) { - pc_sync[i].wthread_id = i; - } - - /* Create threads for page-compression-flush */ - for(i=0; i < num_threads; i++) { - pc_sync[i].wthread_id = i; - pc_sync[i].wq = wq; - pc_sync[i].wr_cq = wr_cq; - pc_sync[i].rd_cq = rd_cq; - - os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), - thread_ids + START_OLD_THREAD_CNT + i); - pc_sync[i].wthread = (START_OLD_THREAD_CNT + i); - pc_sync[i].wt_status = WTHR_INITIALIZED; - } - set_pgcomp_wrk_init_done(); - fprintf(stderr, "%s() Worker-Threads created..\n", __FUNCTION__); - return 0; -} - -int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads) -{ - ulong stat_tot=0; - ulint i=0; - for(i=0; in_flush[flush_type], buf_pool->init_flush[flush_type]); + mutex_exit(&buf_pool->flush_state_mutex); return(FALSE); From 18353c6a4d8241ea45aeabb4a606953531c3c9dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 6 Feb 2014 17:49:55 +0200 Subject: [PATCH 14/56] Fixed issue on file space extension. File space should be extended from current offset to desired size if posix_fallocate is used. --- storage/innobase/fil/fil0fil.cc | 32 ++++++++++++++------------------ storage/xtradb/fil/fil0fil.cc | 29 ++++++++++++----------------- 2 files changed, 26 insertions(+), 35 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 2430df2b386..bb9a8699e3f 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -48,6 +48,7 @@ Created 10/25/1995 Heikki Tuuri #include "page0zip.h" #include "trx0sys.h" #include "row0mysql.h" +#include "os0file.h" #ifndef UNIV_HOTBACKUP # include "buf0lru.h" # include "ibuf0ibuf.h" @@ -4860,28 +4861,25 @@ retry: #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { - ulint n_pages = size_after_extend; + os_offset_t start_offset = start_page_no * page_size; + os_offset_t end_offset = (size_after_extend - start_page_no) * page_size; - success = os_file_set_size(node->name, node->handle, - n_pages * page_size); - - /* Temporal solution: In directFS using atomic writes - we must use posix_fallocate to extend the file because - pwrite past end of file fails but when compression is - used the file pages must be physically initialized with - zeroes, thus after file extend with posix_fallocate - we still write empty pages to file. */ - if (success && - srv_use_atomic_writes && - srv_compress_pages) { - goto extend_file; + if (posix_fallocate(node->handle, start_offset, end_offset) == -1) { + ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file " + "space for file \'%s\' failed. 
Current size " + INT64PF ", desired size " INT64PF "\n", + node->name, start_offset, end_offset); + success = FALSE; + } else { + success = TRUE; } mutex_enter(&fil_system->mutex); if (success) { - node->size += n_pages; - space->size += n_pages; + node->size += (size_after_extend - start_page_no); + space->size += (size_after_extend - start_page_no); + os_has_said_disk_full = FALSE; } @@ -4895,8 +4893,6 @@ retry: } #endif -extend_file: - /* Extend at most 64 pages at a time */ buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; buf2 = static_cast(mem_alloc(buf_size + page_size)); diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index e170004cea1..0dae3a28690 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -4988,27 +4988,24 @@ retry: #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { - ulint n_pages = size_after_extend; + os_offset_t start_offset = start_page_no * page_size; + os_offset_t end_offset = (size_after_extend - start_page_no) * page_size; - success = os_file_set_size(node->name, node->handle, n_pages * page_size); - - /* Temporal solution: In directFS using atomic writes - we must use posix_fallocate to extend the file because - pwrite past end of file fails but when compression is - used the file pages must be physically initialized with - zeroes, thus after file extend with posix_fallocate - we still write empty pages to file. */ - if (success && - srv_use_atomic_writes && - srv_compress_pages) { - goto extend_file; + if (posix_fallocate(node->handle, start_offset, end_offset) == -1) { + ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file " + "space for file \'%s\' failed. Current size " + INT64PF ", desired size " INT64PF "\n", + node->name, start_offset, end_offset); + success = FALSE; + } else { + success = TRUE; } mutex_enter(&fil_system->mutex); if (success) { - node->size += n_pages; - space->size += n_pages; + node->size += (size_after_extend - start_page_no); + space->size += (size_after_extend - start_page_no); os_has_said_disk_full = FALSE; } @@ -5022,8 +5019,6 @@ retry: } #endif -extend_file: - /* Extend at most 64 pages at a time */ buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; buf2 = static_cast(mem_alloc(buf_size + page_size)); From a5cf3a800e20e86a4469dff659e68cc1b21263e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 7 Feb 2014 15:31:31 +0200 Subject: [PATCH 15/56] Merged latest mt-flush code to xtradb. Cleaned up thread statistic output code. 
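The mt-flush code moved here follows a simple work-queue pattern: the coordinating thread posts one work item per buffer pool instance on a work queue, worker threads pop items, flush the corresponding instance and report the flushed page count on a write completion queue, and the coordinator sums the per-instance counts. The sketch below is illustration only: it uses plain pthreads as a stand-in for InnoDB's ib_wqueue_t/os_thread primitives, and all names in it (queue_t, wrk_item, worker, the page counts) are invented for the demo, not the actual implementation.

    /* Illustration-only sketch of the multi-threaded flush pattern
       (plain pthreads stand-in, not InnoDB's ib_wqueue/os_thread API). */
    #include <pthread.h>
    #include <stdio.h>

    #define N_INSTANCES 4               /* pretend buffer pool instances */
    #define N_THREADS   2               /* flush worker threads          */
    #define QCAP        (N_INSTANCES + N_THREADS)

    typedef struct { int instance; int n_flushed; } wrk_item;

    typedef struct {                    /* tiny blocking FIFO            */
        wrk_item       *items[QCAP];
        int             head, tail;
        pthread_mutex_t mtx;
        pthread_cond_t  cond;
    } queue_t;

    static queue_t work_q, done_q;      /* cf. wq and wr_cq              */

    static void q_init(queue_t *q) {
        q->head = q->tail = 0;
        pthread_mutex_init(&q->mtx, NULL);
        pthread_cond_init(&q->cond, NULL);
    }

    static void q_push(queue_t *q, wrk_item *wi) {
        pthread_mutex_lock(&q->mtx);
        q->items[q->tail++] = wi;       /* bounded by QCAP in this demo  */
        pthread_cond_signal(&q->cond);
        pthread_mutex_unlock(&q->mtx);
    }

    static wrk_item *q_pop(queue_t *q) {
        wrk_item *wi;
        pthread_mutex_lock(&q->mtx);
        while (q->head == q->tail)
            pthread_cond_wait(&q->cond, &q->mtx);
        wi = q->items[q->head++];
        pthread_mutex_unlock(&q->mtx);
        return wi;
    }

    /* Worker loop, cf. mtflush_io_thread()/mtflush_service_io(). */
    static void *worker(void *arg) {
        (void) arg;
        for (;;) {
            wrk_item *wi = q_pop(&work_q);
            if (wi->instance < 0)       /* exit item, cf. MT_WRK_NONE    */
                break;
            wi->n_flushed = 10 * (wi->instance + 1);   /* pretend flush  */
            q_push(&done_q, wi);        /* report on completion queue    */
        }
        return NULL;
    }

    int main(void) {
        pthread_t thr[N_THREADS];
        wrk_item  wi[N_INSTANCES], exit_wi[N_THREADS];
        int       i, total = 0;

        q_init(&work_q);
        q_init(&done_q);
        for (i = 0; i < N_THREADS; i++)
            pthread_create(&thr[i], NULL, worker, NULL);

        /* cf. buf_mtflu_flush_work_items(): one item per instance       */
        for (i = 0; i < N_INSTANCES; i++) {
            wi[i].instance = i;
            q_push(&work_q, &wi[i]);
        }
        /* wait for all completions and sum per-instance flush counts    */
        for (i = 0; i < N_INSTANCES; i++)
            total += q_pop(&done_q)->n_flushed;
        printf("flushed %d pages\n", total);

        /* one exit item per worker, then join,
           cf. buf_mtflu_io_thread_exit()                                */
        for (i = 0; i < N_THREADS; i++) {
            exit_wi[i].instance = -1;
            q_push(&work_q, &exit_wi[i]);
        }
        for (i = 0; i < N_THREADS; i++)
            pthread_join(thr[i], NULL);
        return 0;
    }

In the actual patch the work item additionally carries the buf_pool instance, flush type and lsn_limit, and success or failure is reported through wi_status on the completion queue rather than a return code.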
--- storage/innobase/buf/buf0mtflu.cc | 116 ++--- storage/xtradb/CMakeLists.txt | 3 +- storage/xtradb/buf/buf0flu.cc | 228 +--------- storage/xtradb/buf/buf0mtflu.cc | 694 +++++++++++++++++++++++++++++ storage/xtradb/include/buf0flu.h | 57 +++ storage/xtradb/include/buf0mtflu.h | 95 ++++ storage/xtradb/include/srv0srv.h | 2 +- storage/xtradb/srv/srv0srv.cc | 2 + storage/xtradb/srv/srv0start.cc | 431 +----------------- 9 files changed, 910 insertions(+), 718 deletions(-) create mode 100644 storage/xtradb/buf/buf0mtflu.cc create mode 100644 storage/xtradb/include/buf0mtflu.h diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index 901f766c472..a81ccee5650 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -116,18 +116,13 @@ typedef struct wrk_itm /* Thread syncronization data */ typedef struct thread_sync { + ulint n_threads; /*!< Number of threads */ os_thread_id_t wthread_id; /*!< Identifier */ os_thread_t wthread; /*!< Thread id */ ib_wqueue_t *wq; /*!< Work Queue */ ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ wthr_status_t wt_status; /*!< Worker thread status */ - ulint stat_universal_num_processed; - /*!< Total number of pages - processed by this thread */ - ulint stat_cycle_num_processed; - /*!< Number of pages processed - on this cycle */ mem_heap_t* wheap; /*!< Work heap where memory is allocated */ wrk_t* work_item; /*!< Work items to be processed */ @@ -231,6 +226,7 @@ buf_mtflu_flush_pool_instance( work_item->wr.min, work_item->wr.lsn_limit); + buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); buf_flush_common(work_item->wr.flush_type, work_item->n_flushed); @@ -239,28 +235,29 @@ buf_mtflu_flush_pool_instance( #ifdef UNIV_DEBUG /******************************************************************//** -Output work item list status, +Print flush statistics of work items. 
*/ static void -mtflu_print_work_list( -/*==================*/ - wrk_t* wi_list) /*!< in: Work item list */ +mtflu_print_thread_stat( +/*====================*/ + wrk_t* work_item) /*!< in: Work items */ { - wrk_t* wi = wi_list; + ulint stat_tot=0; ulint i=0; - if(!wi_list) { - fprintf(stderr, "list NULL\n"); - } + for(i=0; i< MTFLUSH_MAX_WORKER; i++) { + stat_tot+=work_item[i].n_flushed; - while(wi) { - fprintf(stderr, "-\t[%p]\t[%s]\t[%lu] > %p\n", - wi, (wi->id_usr == -1)?"free":"Busy", wi->n_flushed, wi->next); - wi = wi->next; - i++; - } - fprintf(stderr, "list len: %d\n", i); + fprintf(stderr, "MTFLUSH: Thread[%lu] stat [%lu]\n", + work_item[i].id_usr, + work_item[i].n_flushed); + + if (work_item[i].next == NULL) { + break; /* No more filled work items */ + } + } + fprintf(stderr, "MTFLUSH: Stat-Total:%lu\n", stat_tot); } #endif /* UNIV_DEBUG */ @@ -282,10 +279,6 @@ mtflush_service_io( mtflush_io->wt_status = WTHR_SIG_WAITING; work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, max_wait_usecs); -#ifdef UNIV_DEBUG - mtflu_print_work_list(mtflush_io->work_item); -#endif - if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; } else { @@ -345,10 +338,28 @@ DECLARE_THREAD(mtflush_io_thread)( void * arg) { thread_sync_t *mtflush_io = ((thread_sync_t *)arg); +#ifdef UNIV_DEBUG + ib_uint64_t stat_universal_num_processed = 0; + ib_uint64_t stat_cycle_num_processed = 0; + wrk_t* work_item = mtflush_io[0].work_item; + ulint i; +#endif while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { mtflush_service_io(mtflush_io); - mtflush_io->stat_cycle_num_processed = 0; + +#ifdef UNIV_DEBUG + for(i=0; i < MTFLUSH_MAX_WORKER; i++) { + stat_cycle_num_processed+= work_item[i].n_flushed; + } + + stat_universal_num_processed+=stat_cycle_num_processed; + stat_cycle_num_processed = 0; + fprintf(stderr, "MTFLUSH_IO_THREAD: total %lu cycle %lu\n", + stat_universal_num_processed, + stat_cycle_num_processed); + mtflu_print_thread_stat(work_item); +#endif } /* This should make sure that all current work items are @@ -458,13 +469,16 @@ buf_mtflu_handler_init( work_items = (wrk_t*)mem_heap_alloc(mtflush_heap, MTFLUSH_MAX_WORKER * sizeof(wrk_t)); ut_a(work_items != NULL); + memset(work_items, 0, sizeof(wrk_t) * MTFLUSH_MAX_WORKER); + memset(mtflush_ctx, 0, sizeof(thread_sync_t) * MTFLUSH_MAX_WORKER); /* Initialize work items */ - mtflu_setup_work_items(work_items, MTFLUSH_MAX_WORKER); + mtflu_setup_work_items(work_items, n_threads); /* Create threads for page-compression-flush */ for(i=0; i < n_threads; i++) { os_thread_id_t new_thread_id; + mtflush_ctx[i].n_threads = n_threads; mtflush_ctx[i].wq = mtflush_work_queue; mtflush_ctx[i].wr_cq = mtflush_write_comp_queue; mtflush_ctx[i].rd_cq = mtflush_read_comp_queue; @@ -531,19 +545,16 @@ buf_mtflu_flush_work_items( per_pool_pages_flushed[i] = done_wi->n_flushed; } - if(done_wi->id_usr == -1 && + if((int)done_wi->id_usr == -1 && done_wi->wi_status == WRK_ITEM_SET ) { fprintf(stderr, - "**Set/Unused work_item[%d] flush_type=%lu\n", + "**Set/Unused work_item[%lu] flush_type=%lu\n", i, done_wi->wr.flush_type); ut_a(0); } n_flushed+= done_wi->n_flushed; - /* Reset for next round*/ - mtflush_ctx->work_item[i].id_usr = -1; - i++; } } @@ -551,47 +562,6 @@ buf_mtflu_flush_work_items( return(n_flushed); } -/*******************************************************************//** -Flushes dirty blocks from the end of the LRU list and also -puts replaceable clean pages from the end of the LRU list to the free -list. 
-NOTE: The calling thread is not allowed to own any latches on pages! -@return true if a batch was queued successfully. false if another batch -of same type was already running. */ -bool -buf_mtflu_flush_LRU( -/*================*/ - buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - ulint* n_processed) /*!< out: the number of pages - which were processed is passed - back to caller. Ignored if NULL */ -{ - ulint page_count; - - if (n_processed) { - *n_processed = 0; - } - - if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { - return(false); - } - - page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0); - - buf_flush_end(buf_pool, BUF_FLUSH_LRU); - - buf_flush_common(BUF_FLUSH_LRU, page_count); - - if (n_processed) { - *n_processed = page_count; - } - - return(true); -} - /*******************************************************************//** Multi-threaded version of buf_flush_list */ diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 5050ca34da9..14fbb14bdd7 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -284,8 +284,7 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc -# TODO: JAN uncomment -# buf/buf0mtflu.cc + buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index 8ed11fd674a..a080ef0ee48 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -32,6 +32,7 @@ Created 11/11/1995 Heikki Tuuri #endif #include "buf0buf.h" +#include "buf0mtflu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" @@ -1949,47 +1950,6 @@ void buf_pool_exit_LRU_mutex( mutex_exit(&buf_pool->LRU_list_mutex); } -/*******************************************************************//** -This utility flushes dirty blocks from the end of the LRU list and also -puts replaceable clean pages from the end of the LRU list to the free -list. -NOTE: The calling thread is not allowed to own any latches on pages! -@return true if a batch was queued successfully. false if another batch -of same type was already running. */ -static -bool -pgcomp_buf_flush_LRU( -/*==========*/ - buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - ulint* n_processed) /*!< out: the number of pages - which were processed is passed - back to caller. 
Ignored if NULL */ -{ - flush_counters_t n; - - if (n_processed) { - *n_processed = 0; - } - - if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { - return(false); - } - - buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, false, &n); - - buf_flush_end(buf_pool, BUF_FLUSH_LRU); - - buf_flush_common(BUF_FLUSH_LRU, n.flushed); - - if (n_processed) { - *n_processed = n.flushed; - } - - return(true); -} /* JAN: TODO: END: */ /*******************************************************************//** @@ -2029,126 +1989,6 @@ buf_flush_LRU( return(true); } -/* JAN: TODO: */ -/*******************************************************************//**/ -extern int is_pgcomp_wrk_init_done(void); -extern int pgcomp_flush_work_items( - int buf_pool_inst, - int *pages_flushed, - buf_flush_t flush_type, - int min_n, - lsn_t lsn_limit); - -#define MT_COMP_WATER_MARK 50 - -#ifdef UNIV_DEBUG -#include -int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time) -{ - if (g_time->tv_usec < s_time->tv_usec) - { - int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1; - s_time->tv_usec -= 1000000 * nsec; - s_time->tv_sec += nsec; - } - if (g_time->tv_usec - s_time->tv_usec > 1000000) - { - int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000; - s_time->tv_usec += 1000000 * nsec; - s_time->tv_sec -= nsec; - } - d_time->tv_sec = g_time->tv_sec - s_time->tv_sec; - d_time->tv_usec = g_time->tv_usec - s_time->tv_usec; - - return 0; -} -#endif - -static os_fast_mutex_t pgcomp_mtx; - -void pgcomp_init(void) -{ - os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &pgcomp_mtx); -} - -void pgcomp_deinit(void) -{ - os_fast_mutex_free(&pgcomp_mtx); -} - -/*******************************************************************//** -Multi-threaded version of buf_flush_list -*/ -UNIV_INTERN -bool -pgcomp_buf_flush_list( -/*==================*/ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all - blocks whose oldest_modification is - smaller than this should be flushed - (if their number does not exceed - min_n), otherwise ignored */ - ulint* n_processed) /*!< out: the number of pages - which were processed is passed - back to caller. Ignored if NULL */ - -{ - ulint i; - bool success = true; -#ifdef UNIV_DEBUG - struct timeval p_start_time, p_end_time, d_time; -#endif - int cnt_flush[MTFLUSH_MAX_WORKER]; - - if (n_processed) { - *n_processed = 0; - } - - if (min_n != ULINT_MAX) { - /* Ensure that flushing is spread evenly amongst the - buffer pool instances. When min_n is ULINT_MAX - we need to flush everything up to the lsn limit - so no limit here. 
*/ - min_n = (min_n + srv_buf_pool_instances - 1) - / srv_buf_pool_instances; - } - -#ifdef UNIV_DEBUG - gettimeofday(&p_start_time, 0x0); -#endif - // os_fast_mutex_lock(&pgcomp_mtx); - pgcomp_flush_work_items(srv_buf_pool_instances, - cnt_flush, BUF_FLUSH_LIST, - min_n, lsn_limit); - // os_fast_mutex_unlock(&pgcomp_mtx); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (n_processed) { - *n_processed += cnt_flush[i]; - } - if (cnt_flush[i]) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - cnt_flush[i]); - } - } -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu %llu usec]\n", - __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); -#endif - return(success); -} - -/* JAN: TODO: END: */ - /*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. @@ -2181,11 +2021,9 @@ buf_flush_list( bool timeout = false; ulint flush_start_time = 0; - /* JAN: TODO: */ - if (is_pgcomp_wrk_init_done()) { - return(pgcomp_buf_flush_list(min_n, lsn_limit, n_processed)); + if (buf_mtflu_init_done()) { + return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed)); } - /* JAN: TODO: END: */ for (i = 0; i < srv_buf_pool_instances; i++) { requested_pages[i] = 0; @@ -2380,60 +2218,6 @@ buf_flush_single_page_from_LRU( return(freed); } -/* JAN: TODO: */ -/*********************************************************************//** -pgcomp_Clears up tail of the LRU lists: -* Put replaceable pages at the tail of LRU to the free list -* Flush dirty pages at the tail of LRU to the disk -The depth to which we scan each buffer pool is controlled by dynamic -config parameter innodb_LRU_scan_depth. 
-@return total pages flushed */ -UNIV_INTERN -ulint -pgcomp_buf_flush_LRU_tail(void) -/*====================*/ -{ -#ifdef UNIV_DEBUG - struct timeval p_start_time, p_end_time, d_time; -#endif - ulint total_flushed=0, i=0; - int cnt_flush[32]; - -#ifdef UNIV_DEBUG - gettimeofday(&p_start_time, 0x0); -#endif - ut_ad(is_pgcomp_wrk_init_done()); - - os_fast_mutex_lock(&pgcomp_mtx); - pgcomp_flush_work_items(srv_buf_pool_instances, - cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); - os_fast_mutex_unlock(&pgcomp_mtx); - - for (i = 0; i < srv_buf_pool_instances; i++) { - if (cnt_flush[i]) { - total_flushed += cnt_flush[i]; - - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_PAGES, - cnt_flush[i]); - } - } - -#if UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - - fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( - srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed, - (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); -#endif - - return(total_flushed); -} - -/* JAN: TODO: END: */ /*********************************************************************//** Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list @@ -2458,12 +2242,10 @@ buf_flush_LRU_tail(void) ulint free_list_lwm = srv_LRU_scan_depth / 100 * srv_cleaner_free_list_lwm; - /* JAN: TODO: */ - if(is_pgcomp_wrk_init_done()) + if(buf_mtflu_init_done()) { - return(pgcomp_buf_flush_LRU_tail()); + return(buf_mtflu_flush_LRU_tail()); } - /* JAN: TODO: END */ for (ulint i = 0; i < srv_buf_pool_instances; i++) { diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc new file mode 100644 index 00000000000..14ece48519f --- /dev/null +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -0,0 +1,694 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file buf/buf0mtflu.cc +Multi-threaded flush method implementation + +Created 06/11/2013 Dhananjoy Das DDas@fusionio.com +Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com +Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com +Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0mtflu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "ut0byte.h" +#include "ut0lst.h" +#include "page0page.h" +#include "fil0fil.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" +#include "os0sync.h" +#include "trx0sys.h" +#include "srv0mon.h" +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" + +#define MT_COMP_WATER_MARK 50 + +/* Work item status */ +typedef enum wrk_status { + WRK_ITEM_SET=0, /*!< Work item is set */ + WRK_ITEM_START=1, /*!< Processing of work item has started */ + WRK_ITEM_DONE=2, /*!< Processing is done usually set to + SUCCESS/FAILED */ + WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ + WRK_ITEM_FAILED=3, /*!< Work item process failed */ + WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +/* Work item task type */ +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, /*!< Exit queue-wait */ + MT_WRK_WRITE=1, /*!< Flush operation */ + MT_WRK_READ=2, /*!< Read operation */ + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; + +/* Work thread status */ +typedef enum wthr_status { + WTHR_NOT_INIT=0, /*!< Work thread not initialized */ + WTHR_INITIALIZED=1, /*!< Work thread initialized */ + WTHR_SIG_WAITING=2, /*!< Work thread wating signal */ + WTHR_RUNNING=3, /*!< Work thread running */ + WTHR_NO_WORK=4, /*!< Work thread has no work */ + WTHR_KILL_IT=5, /*!< Work thread should exit */ + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +/* Write work task */ +typedef struct wr_tsk { + buf_pool_t *buf_pool; /*!< buffer-pool instance */ + buf_flush_t flush_type; /*!< flush-type for buffer-pool + flush operation */ + ulint min; /*!< minimum number of pages + requested to be flushed */ + lsn_t lsn_limit; /*!< lsn limit for the buffer-pool + flush operation */ +} wr_tsk_t; + +/* Read work task */ +typedef struct rd_tsk { + buf_pool_t *page_pool; /*!< list of pages to decompress; */ +} rd_tsk_t; + +/* Work item */ +typedef struct wrk_itm +{ + mt_wrk_tsk_t tsk; /*!< Task type. 
Based on task-type + one of the entries wr_tsk/rd_tsk + will be used */ + wr_tsk_t wr; /*!< Flush page list */ + rd_tsk_t rd; /*!< Decompress page list */ + ulint n_flushed; /*!< Flushed pages count */ + os_thread_t id_usr; /*!< Thread-id currently working */ + wrk_status_t wi_status; /*!< Work item status */ + struct wrk_itm *next; /*!< Next work item */ +} wrk_t; + +/* Thread syncronization data */ +typedef struct thread_sync +{ + ulint n_threads; /*!< Number of threads */ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + ib_wqueue_t *wq; /*!< Work Queue */ + ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ + ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ + wthr_status_t wt_status; /*!< Worker thread status */ + mem_heap_t* wheap; /*!< Work heap where memory + is allocated */ + wrk_t* work_item; /*!< Work items to be processed */ +} thread_sync_t; + +/* QUESTION: Is this array used from several threads concurrently ? */ +// static wrk_t work_items[MTFLUSH_MAX_WORKER]; + +/* TODO: REALLY NEEDED ? */ +static int mtflush_work_initialized = -1; +static os_fast_mutex_t mtflush_mtx; +static thread_sync_t* mtflush_ctx=NULL; + +/******************************************************************//** +Initialize work items. */ +static +void +mtflu_setup_work_items( +/*===================*/ + wrk_t* work_items, /*!< inout: Work items */ + ulint n_items) /*!< in: Number of work items */ +{ + ulint i; + for(i=0; iwr.buf_pool != NULL); + + if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ +#ifdef UNIV_DEBUG + /* QUESTION: is this a really failure ? */ + fprintf(stderr, "flush_start Failed, flush_type:%d\n", + work_item->wr.flush_type); +#endif + return 0; + } + + + if (work_item->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. 
+ */ + buf_pool_mutex_enter(work_item->wr.buf_pool); + work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU); + buf_pool_mutex_exit(work_item->wr.buf_pool); + work_item->wr.min = ut_min(srv_LRU_scan_depth,work_item->wr.min); + } + + buf_flush_batch(work_item->wr.buf_pool, + work_item->wr.flush_type, + work_item->wr.min, + work_item->wr.lsn_limit, + false, + &n); + + work_item->n_flushed = n.flushed; + + buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); + buf_flush_common(work_item->wr.flush_type, work_item->n_flushed); + + return 0; +} + +#ifdef UNIV_DEBUG +/******************************************************************//** +Print flush statistics of work items +*/ +static +void +mtflu_print_thread_stat( +/*====================*/ + wrk_t* work_item) /*!< in: Work items */ +{ + ulint stat_tot=0; + ulint i=0; + + for(i=0; i< MTFLUSH_MAX_WORKER; i++) { + stat_tot+=work_item[i].n_flushed; + + fprintf(stderr, "MTFLUSH: Thread[%lu] stat [%lu]\n", + work_item[i].id_usr, + work_item[i].n_flushed); + + if (work_item[i].next == NULL) { + break; /* No more filled work items */ + } + } + + fprintf(stderr, "MTFLUSH: Stat-Total:%lu\n", stat_tot); +} +#endif /* UNIV_DEBUG */ + +/******************************************************************//** +Worker function to wait for work items and processing them and +sending reply back. +*/ +static +void +mtflush_service_io( +/*===============*/ + thread_sync_t* mtflush_io) /*!< inout: multi-threaded flush + syncronization data */ +{ + wrk_t *work_item = NULL; + ulint n_flushed=0; + ib_time_t max_wait_usecs = 5000000; + + mtflush_io->wt_status = WTHR_SIG_WAITING; + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, max_wait_usecs); + + if (work_item) { + mtflush_io->wt_status = WTHR_RUNNING; + } else { + /* Because of timeout this thread did not get any work */ + mtflush_io->wt_status = WTHR_NO_WORK; + return; + } + + work_item->id_usr = mtflush_io->wthread; + + switch(work_item->tsk) { + case MT_WRK_NONE: + ut_a(work_item->wi_status == WRK_ITEM_EXIT); + work_item->wi_status = WRK_ITEM_SUCCESS; + /* QUESTION: Why completed work items are inserted to + completion queue ? */ + ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + break; + + case MT_WRK_WRITE: + work_item->wi_status = WRK_ITEM_START; + /* Process work item */ + /* QUESTION: Is this a really a error ? */ + if (0 != (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { + fprintf(stderr, "FLUSH op failed ret:%lu\n", n_flushed); + work_item->wi_status = WRK_ITEM_FAILED; + } + work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + break; + + case MT_WRK_READ: + /* Need to also handle the read case */ + /* TODO: ? */ + ut_a(0); + /* completed task get added to rd_cq */ + /* work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->rd_cq, work_item, mtflush_io->wheap);*/ + break; + + default: + /* None other than Write/Read handling planned */ + ut_a(0); + } + + mtflush_io->wt_status = WTHR_NO_WORK; +} + +/******************************************************************//** +Thead used to flush dirty pages when multi-threaded flush is +used. 
+@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(mtflush_io_thread)( +/*==============================*/ + void * arg) +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)arg); +#ifdef UNIV_DEBUG + ib_uint64_t stat_universal_num_processed = 0; + ib_uint64_t stat_cycle_num_processed = 0; + wrk_t* work_item = mtflush_io[0].work_item; + ulint i; +#endif + + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + mtflush_service_io(mtflush_io); + +#ifdef UNIV_DEBUG + for(i=0; i < MTFLUSH_MAX_WORKER; i++) { + stat_cycle_num_processed+= work_item[i].n_flushed; + } + + stat_universal_num_processed+=stat_cycle_num_processed; + stat_cycle_num_processed = 0; + fprintf(stderr, "MTFLUSH_IO_THREAD: total %lu cycle %lu\n", + stat_universal_num_processed, + stat_cycle_num_processed); + mtflu_print_thread_stat(work_item); +#endif + } + + /* This should make sure that all current work items are + processed before threads exit. */ + while (!ib_wqueue_is_empty(mtflush_io->wq)) { + mtflush_service_io(mtflush_io); + } + + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void) +/*==========================*/ +{ + ulint i; + thread_sync_t* mtflush_io = mtflush_ctx; + + ut_a(mtflush_io != NULL); + + fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", + srv_buf_pool_instances); + + /* Send one exit work item/thread */ + for (i=0; i < srv_buf_pool_instances; i++) { + mtflush_io->work_item[i].wr.buf_pool = NULL; + mtflush_io->work_item[i].rd.page_pool = NULL; + mtflush_io->work_item[i].tsk = MT_WRK_NONE; + mtflush_io->work_item[i].wi_status = WRK_ITEM_EXIT; + + ib_wqueue_add(mtflush_io->wq, + (void *)&(mtflush_io->work_item[i]), + mtflush_io->wheap); + } + + /* Wait until all work items on a work queue are processed */ + while(!ib_wqueue_is_empty(mtflush_io->wq)) { + /* Wait about 1/2 sec */ + os_thread_sleep(50000); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Collect all work done items */ + for (i=0; i < srv_buf_pool_instances;) { + wrk_t* work_item; + + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); + + if (work_item) { + i++; + } + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); + ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); + + /* Free all queues */ + ib_wqueue_free(mtflush_io->wq); + ib_wqueue_free(mtflush_io->wr_cq); + ib_wqueue_free(mtflush_io->rd_cq); + + /* Free heap */ + mem_heap_free(mtflush_io->wheap); + + os_fast_mutex_free(&mtflush_mtx); +} + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt) /*!< in: Number of work items */ +{ + ulint i; + mem_heap_t* mtflush_heap; + ib_wqueue_t* mtflush_work_queue; + ib_wqueue_t* mtflush_write_comp_queue; + ib_wqueue_t* mtflush_read_comp_queue; + wrk_t* work_items; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + + /* Create heap, work queue, write completion queue, read + completion queue for multi-threaded flush, and init + handler. 
*/ + mtflush_heap = mem_heap_create(0); + ut_a(mtflush_heap != NULL); + mtflush_work_queue = ib_wqueue_create(); + ut_a(mtflush_work_queue != NULL); + mtflush_write_comp_queue = ib_wqueue_create(); + ut_a(mtflush_write_comp_queue != NULL); + mtflush_read_comp_queue = ib_wqueue_create(); + ut_a(mtflush_read_comp_queue != NULL); + + mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, + MTFLUSH_MAX_WORKER * sizeof(thread_sync_t)); + ut_a(mtflush_ctx != NULL); + work_items = (wrk_t*)mem_heap_alloc(mtflush_heap, + MTFLUSH_MAX_WORKER * sizeof(wrk_t)); + ut_a(work_items != NULL); + memset(work_items, 0, sizeof(wrk_t) * MTFLUSH_MAX_WORKER); + memset(mtflush_ctx, 0, sizeof(thread_sync_t) * MTFLUSH_MAX_WORKER); + + /* Initialize work items */ + mtflu_setup_work_items(work_items, n_threads); + + /* Create threads for page-compression-flush */ + for(i=0; i < n_threads; i++) { + os_thread_id_t new_thread_id; + mtflush_ctx[i].n_threads = n_threads; + mtflush_ctx[i].wq = mtflush_work_queue; + mtflush_ctx[i].wr_cq = mtflush_write_comp_queue; + mtflush_ctx[i].rd_cq = mtflush_read_comp_queue; + mtflush_ctx[i].wheap = mtflush_heap; + mtflush_ctx[i].wt_status = WTHR_INITIALIZED; + mtflush_ctx[i].work_item = work_items; + + mtflush_ctx[i].wthread = os_thread_create( + mtflush_io_thread, + ((void *)(mtflush_ctx + i)), + &new_thread_id); + + mtflush_ctx[i].wthread_id = new_thread_id; + } + + buf_mtflu_work_init(); + + return((void *)mtflush_ctx); +} + +/******************************************************************//** +Flush buffer pool instances. +@return number of pages flushed. */ +ulint +buf_mtflu_flush_work_items( +/*=======================*/ + ulint buf_pool_inst, /*!< in: Number of buffer pool instances */ + ulint *per_pool_pages_flushed, /*!< out: Number of pages + flushed/instance */ + buf_flush_t flush_type, /*!< in: Type of flush */ + ulint min_n, /*!< in: Wished minimum number of + blocks to be flushed */ + lsn_t lsn_limit) /*!< in: All blocks whose + oldest_modification is smaller than + this should be flushed (if their + number does not exceed min_n) */ +{ + ulint n_flushed=0, i; + wrk_t *done_wi; + + for(i=0;iwork_item[i].tsk = MT_WRK_WRITE; + mtflush_ctx->work_item[i].rd.page_pool = NULL; + mtflush_ctx->work_item[i].wr.buf_pool = buf_pool_from_array(i); + mtflush_ctx->work_item[i].wr.flush_type = flush_type; + mtflush_ctx->work_item[i].wr.min = min_n; + mtflush_ctx->work_item[i].wr.lsn_limit = lsn_limit; + mtflush_ctx->work_item[i].id_usr = -1; + mtflush_ctx->work_item[i].wi_status = WRK_ITEM_SET; + + ib_wqueue_add(mtflush_ctx->wq, + (void *)(&(mtflush_ctx->work_item[i])), + mtflush_ctx->wheap); + } + + /* wait on the completion to arrive */ + for(i=0; i< buf_pool_inst;) { + done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, 50000); + + if (done_wi != NULL) { + if(done_wi->n_flushed == 0) { + per_pool_pages_flushed[i] = 0; + } else { + per_pool_pages_flushed[i] = done_wi->n_flushed; + } + + if((int)done_wi->id_usr == -1 && + done_wi->wi_status == WRK_ITEM_SET ) { + fprintf(stderr, + "**Set/Unused work_item[%lu] flush_type=%lu\n", + i, + done_wi->wr.flush_type); + ut_a(0); + } + + n_flushed+= done_wi->n_flushed; + i++; + } + } + + return(n_flushed); +} + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, 
/*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. */ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + + /* QUESTION: What is procted by below mutex ? */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt_flush[i]; + } + if (cnt_flush[i]) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt_flush[i]); + } + } +#ifdef UNIV_DEBUG + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); +#endif + return(success); +} + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void) +/*==========================*/ +{ + ulint total_flushed=0, i; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; + + ut_a(buf_mtflu_init_done()); + + /* QUESTION: What is protected by below mutex ? */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (cnt_flush[i]) { + total_flushed += cnt_flush[i]; + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + cnt_flush[i]); + } + } + +#if UNIV_DEBUG + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); +#endif + + return(total_flushed); +} + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. */ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!wr.buf_pool) { - fprintf(stderr, "work-item wi->buf_pool:%p [likely thread exit]\n", - wi->wr.buf_pool); - return -1; - } - - wi->t_usec = 0; - if (!buf_flush_start(wi->wr.buf_pool, wi->wr.flush_type)) { - /* We have two choices here. If lsn_limit was - specified then skipping an instance of buffer - pool means we cannot guarantee that all pages - up to lsn_limit has been flushed. We can - return right now with failure or we can try - to flush remaining buffer pools up to the - lsn_limit. We attempt to flush other buffer - pools based on the assumption that it will - help in the retry which will follow the - failure. 
*/ - fprintf(stderr, "flush_start Failed, flush_type:%d\n", - wi->wr.flush_type); - return -1; - } - -#ifdef UNIV_DEBUG - /* Record time taken for the OP in usec */ - gettimeofday(&p_start_time, 0x0); -#endif - - if (wi->wr.flush_type == BUF_FLUSH_LRU) { - /* srv_LRU_scan_depth can be arbitrarily large value. - * We cap it with current LRU size. - */ - buf_pool_enter_LRU_mutex(wi->wr.buf_pool); - wi->wr.min = UT_LIST_GET_LEN(wi->wr.buf_pool->LRU); - buf_pool_exit_LRU_mutex(wi->wr.buf_pool); - wi->wr.min = ut_min(srv_LRU_scan_depth,wi->wr.min); - } - - wi->result = buf_flush_batch(wi->wr.buf_pool, - wi->wr.flush_type, - wi->wr.min, wi->wr.lsn_limit, - false, &n); - - buf_flush_end(wi->wr.buf_pool, wi->wr.flush_type); - buf_flush_common(wi->wr.flush_type, wi->result); - -#ifdef UNIV_DEBUG - gettimeofday(&p_end_time, 0x0); - timediff(&p_end_time, &p_start_time, &d_time); - wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); -#endif - - return 0; -} - -int service_page_comp_io(thread_sync_t * ppc) -{ - wrk_t *wi = NULL; - int ret=0; - - ppc->wt_status = WTHR_SIG_WAITING; - wi = (wrk_t *)ib_wqueue_wait(ppc->wq); - - if (wi) { - ppc->wt_status = WTHR_RUNNING; - } else { - fprintf(stderr, "%s:%d work-item is NULL\n", __FILE__, __LINE__); - ppc->wt_status = WTHR_NO_WORK; - return (0); - } - - assert(wi != NULL); - wi->id_usr = ppc->wthread; - - switch(wi->tsk) { - case MT_WRK_NONE: - assert(wi->wi_status == WRK_ITEM_EXIT); - wi->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); - break; - - case MT_WRK_WRITE: - wi->wi_status = WRK_ITEM_START; - /* Process work item */ - if (0 != (ret = flush_pool_instance(wi))) { - fprintf(stderr, "FLUSH op failed ret:%d\n", ret); - wi->wi_status = WRK_ITEM_FAILED; - } - wi->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(ppc->wr_cq, wi, heap_allocated); - break; - - case MT_WRK_READ: - /* Need to also handle the read case */ - assert(0); - /* completed task get added to rd_cq */ - /* wi->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(ppc->rd_cq, wi, heap_allocated);*/ - break; - - default: - /* None other than Write/Read handling planned */ - assert(0); - } - - ppc->wt_status = WTHR_NO_WORK; - return(0); -} - -void page_comp_io_thread_exit() -{ - ulint i; - - fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", srv_buf_pool_instances); - for (i=0; istat_cycle_num_processed = 0; - } - os_thread_exit(NULL); - OS_THREAD_DUMMY_RETURN; -} - -int print_wrk_list(wrk_t *wi_list) -{ - wrk_t *wi = wi_list; - int i=0; - - if(!wi_list) { - fprintf(stderr, "list NULL\n"); - } - - while(wi) { - fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n", - wi, (wi->id_usr == -1)?"free":"Busy", wi->result, wi->t_usec, wi->next); - wi = wi->next; - i++; - } - fprintf(stderr, "list len: %d\n", i); - return 0; -} - -/******************************************************************//** -@return a dummy parameter*/ -int pgcomp_handler_init(int num_threads, int wrk_cnt, ib_wqueue_t *wq, ib_wqueue_t *wr_cq, ib_wqueue_t *rd_cq) -{ - int i=0; - - if(is_pgcomp_wrk_init_done()) { - fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n"); - return -1; - } - - if(!wq || !wr_cq || !rd_cq) { - fprintf(stderr, "%s() FAILED wq:%p write-cq:%p read-cq:%p\n", - __FUNCTION__, wq, wr_cq, rd_cq); - return -1; - } - - /* work-item setup */ - setup_wrk_itm(wrk_cnt); - - /* Mark each of the thread sync entires */ - for(i=0; i < MTFLUSH_MAX_WORKER; i++) { - pc_sync[i].wthread_id = i; - } - - /* Create threads for page-compression-flush */ - 
for(i=0; i < num_threads; i++) { - pc_sync[i].wthread_id = i; - pc_sync[i].wq = wq; - pc_sync[i].wr_cq = wr_cq; - pc_sync[i].rd_cq = rd_cq; - - os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), - thread_ids + START_OLD_THREAD_CNT + i); - pc_sync[i].wthread = (START_OLD_THREAD_CNT + i); - pc_sync[i].wt_status = WTHR_INITIALIZED; - } - set_pgcomp_wrk_init_done(); - fprintf(stderr, "%s() Worker-Threads created..\n", __FUNCTION__); - return 0; -} - -int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads) -{ - ulong stat_tot=0; - ulint i=0; - for(i=0; i Date: Tue, 11 Feb 2014 20:05:09 +0200 Subject: [PATCH 16/56] Removed unnecessary files and set lz4 under HAVE_LZ4 compiler option using cmake find_library. Fixed bunch of compiler warnings. --- cmake/lz4.cmake | 35 + storage/innobase/CMakeLists.txt | 4 +- storage/innobase/buf/buf0flu.cc | 27 +- storage/innobase/buf/buf0mtflu.cc | 2 +- storage/innobase/fil/fil0pagecompress.cc | 35 +- storage/innobase/fil/lz4.c | 822 ------------------ storage/innobase/fil/lz4.h | 205 ----- storage/innobase/handler/ha_innodb.cc | 3 +- storage/innobase/include/dict0pagecompress.ic | 12 +- storage/xtradb/CMakeLists.txt | 4 +- storage/xtradb/buf/buf0flu.cc | 4 +- storage/xtradb/buf/buf0mtflu.cc | 2 +- storage/xtradb/fil/fil0pagecompress.cc | 34 +- storage/xtradb/fil/lz4.c | 822 ------------------ storage/xtradb/fil/lz4.h | 205 ----- storage/xtradb/handler/ha_innodb.cc | 2 + storage/xtradb/include/dict0pagecompress.ic | 12 +- 17 files changed, 121 insertions(+), 2109 deletions(-) create mode 100644 cmake/lz4.cmake delete mode 100644 storage/innobase/fil/lz4.c delete mode 100644 storage/innobase/fil/lz4.h delete mode 100644 storage/xtradb/fil/lz4.c delete mode 100644 storage/xtradb/fil/lz4.h diff --git a/cmake/lz4.cmake b/cmake/lz4.cmake new file mode 100644 index 00000000000..56120e2cdd0 --- /dev/null +++ b/cmake/lz4.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2014, SkySQL Ab. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +MACRO (MYSQL_CHECK_LZ4) + +CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H) +CHECK_LIBRARY_EXISTS(liblz4.a LZ4_compress_limitedOutput "" HAVE_LZ4_LIB) + +IF(HAVE_LZ4_LIB AND HAVE_LZ4_H) + ADD_DEFINITIONS(-DHAVE_LZ4=1) + LINK_LIBRARIES(liblz4.a) +ENDIF() +ENDMACRO() + +MACRO (MYSQL_CHECK_SHARED_LZ4) + +CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H) +CHECK_LIBRARY_EXISTS(lz4 LZ4_compress_limitedOutput "" HAVE_LZ4_SHARED_LIB) + +IF (HAVE_LZ4_SHARED_LIB AND HAVE_LZ4_H) + ADD_DEFINITIONS(-DHAVE_LZ4=1) + LINK_LIBRARIES(lz4) +ENDIF() +ENDMACRO() diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 64c22f9f7df..136a7a2ae0b 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -18,6 +18,9 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) +INCLUDE(lz4) + +MYSQL_CHECK_LZ4() # OS tests IF(UNIX) @@ -293,7 +296,6 @@ SET(INNOBASE_SOURCES eval/eval0proc.cc fil/fil0fil.cc fil/fil0pagecompress.cc - fil/lz4.c fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index d131f2efb44..2174699bd19 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -2390,7 +2390,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( ulint next_loop_time = ut_time_ms() + 1000; ulint n_flushed = 0; ulint last_activity = srv_get_activity_count(); - ulint n_lru=0, n_pgc_flush=0, n_pgc_batch=0; + ulint n_lru=0; ut_ad(!srv_read_only_mode); @@ -2429,17 +2429,12 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( #endif /* Flush pages from flush_list if required */ - n_flushed += n_pgc_flush = page_cleaner_flush_pages_if_needed(); + n_flushed += page_cleaner_flush_pages_if_needed(); -#ifdef UNIV_DEBUG - if (n_pgc_flush) { - fprintf(stderr,"n_pgc_flush:%lu ",n_pgc_flush); - } -#endif } else { - n_pgc_batch = n_flushed = page_cleaner_do_flush_batch( - PCT_IO(100), - LSN_MAX); + n_flushed = page_cleaner_do_flush_batch( + PCT_IO(100), + LSN_MAX); if (n_flushed) { MONITOR_INC_VALUE_CUMULATIVE( @@ -2448,21 +2443,11 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( MONITOR_FLUSH_BACKGROUND_PAGES, n_flushed); } -#ifdef UNIV_DEBUG - if (n_pgc_batch) { - fprintf(stderr,"n_pgc_batch:%lu ",n_pgc_batch); - } -#endif } -#ifdef UNIV_DEBUG - if (n_lru || n_pgc_flush || n_pgc_batch) { - fprintf(stderr,"\n"); - n_lru = n_pgc_flush = n_pgc_batch = 0; - } -#endif } ut_ad(srv_shutdown_state > 0); + if (srv_fast_shutdown == 2) { /* In very fast shutdown we simulate a crash of buffer pool. 
We are not required to do any flushing */ diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index a81ccee5650..a42e6158250 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -548,7 +548,7 @@ buf_mtflu_flush_work_items( if((int)done_wi->id_usr == -1 && done_wi->wi_status == WRK_ITEM_SET ) { fprintf(stderr, - "**Set/Unused work_item[%lu] flush_type=%lu\n", + "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); ut_a(0); diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 10ac273955f..26e975bddf3 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -63,7 +63,9 @@ static ulint srv_data_read, srv_data_written; #include #endif #include "row0mysql.h" +#ifdef HAVE_LZ4 #include "lz4.h" +#endif /****************************************************************//** For page compressed pages compress the page before actual write @@ -108,10 +110,11 @@ fil_compress_page( fprintf(stderr, "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", space_id, fil_space_name(space), len); -#endif +#endif /* UNIV_DEBUG */ write_size = UNIV_PAGE_SIZE - header_len; +#ifdef HAVE_LZ4 if (srv_use_lz4) { err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); write_size = err; @@ -127,6 +130,7 @@ fil_compress_page( return (buf); } } else { +#endif /* HAVE_LZ4 */ err = compress2(out_buf+header_len, &write_size, buf, len, level); if (err != Z_OK) { @@ -139,7 +143,9 @@ fil_compress_page( *out_len = len; return (buf); } +#ifdef HAVE_LZ4 } +#endif /* HAVE_LZ4 */ /* Set up the page header */ memcpy(out_buf, buf, FIL_PAGE_DATA); @@ -148,11 +154,18 @@ fil_compress_page( /* Set up the correct page type */ mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); /* Set up the flush lsn to be compression algorithm */ + +#ifdef HAVE_LZ4 if (srv_use_lz4) { mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); } else { +#endif /* HAVE_LZ4 */ mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); + +#ifdef HAVE_LZ4 } +#endif /* HAVE_LZ4 */ + /* Set up the actual payload lenght */ mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); @@ -161,12 +174,18 @@ fil_compress_page( ut_ad(fil_page_is_compressed(out_buf)); ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + +#ifdef HAVE_LZ4 if (srv_use_lz4) { ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); } else { +#endif /* HAVE_LZ4 */ ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); + +#ifdef HAVE_LZ4 } -#endif +#endif /* HAVE_LZ4 */ +#endif /* UNIV_DEBUG */ write_size+=header_len; /* Actual write needs to be alligned on block size */ @@ -236,8 +255,8 @@ fil_decompress_page( if (page_buf == NULL) { #ifdef UNIV_DEBUG fprintf(stderr, - "InnoDB: Note: Compression buffer not given, allocating...\n"); -#endif + "InnoDB: Note: FIL: Compression buffer not given, allocating...\n"); +#endif /* UNIV_DEBUG */ in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); } else { in_buf = page_buf; @@ -261,7 +280,7 @@ fil_decompress_page( fprintf(stderr, "InnoDB: Note: Preparing for decompress for len %lu\n", actual_size); -#endif +#endif /* UNIV_DEBUG */ err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned 
long)actual_size); @@ -284,11 +303,12 @@ fil_decompress_page( fprintf(stderr, "InnoDB: Note: Decompression succeeded for len %lu \n", len); -#endif +#endif /* UNIV_DEBUG */ +#ifdef HAVE_LZ4 } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); - if (err != actual_size) { + if (err != (int)actual_size) { fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" "InnoDB: but decompression read only %d bytes.\n" @@ -298,6 +318,7 @@ fil_decompress_page( ut_error; } +#endif /* HAVE_LZ4 */ } else { fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" diff --git a/storage/innobase/fil/lz4.c b/storage/innobase/fil/lz4.c deleted file mode 100644 index 4e864de67d3..00000000000 --- a/storage/innobase/fil/lz4.c +++ /dev/null @@ -1,822 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2013, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 source repository : http://code.google.com/p/lz4/ - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c -*/ - -//************************************** -// Tuning parameters -//************************************** -// MEMORY_USAGE : -// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) -// Increasing memory usage improves compression ratio -// Reduced memory usage can improve speed, due to cache effect -// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache -#define MEMORY_USAGE 14 - -// HEAPMODE : -// Select how default compression functions will allocate memory for their hash table, -// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). -#define HEAPMODE 0 - - -//************************************** -// CPU Feature Detection -//************************************** -// 32 or 64 bits ? 
-#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ - || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \ - || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \ - || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode -# define LZ4_ARCH64 1 -#else -# define LZ4_ARCH64 0 -#endif - -// Little Endian or Big Endian ? -// Overwrite the #define below if you know your architecture endianess -#if defined (__GLIBC__) -# include -# if (__BYTE_ORDER == __BIG_ENDIAN) -# define LZ4_BIG_ENDIAN 1 -# endif -#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) -# define LZ4_BIG_ENDIAN 1 -#elif defined(__sparc) || defined(__sparc__) \ - || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ - || defined(__hpux) || defined(__hppa) \ - || defined(_MIPSEB) || defined(__s390__) -# define LZ4_BIG_ENDIAN 1 -#else -// Little Endian assumed. PDP Endian and other very rare endian format are unsupported. -#endif - -// Unaligned memory access is automatically enabled for "common" CPU, such as x86. -// For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property -// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance -#if defined(__ARM_FEATURE_UNALIGNED) -# define LZ4_FORCE_UNALIGNED_ACCESS 1 -#endif - -// Define this parameter if your target system or compiler does not support hardware bit count -#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count -# define LZ4_FORCE_SW_BITCOUNT -#endif - -// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : -// This option may provide a small boost to performance for some big endian cpu, although probably modest. -// You may set this option to 1 if data will remain within closed environment. 
-// This option is useless on Little_Endian CPU (such as x86) -//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 - - -//************************************** -// Compiler Options -//************************************** -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99 -/* "restrict" is a known keyword */ -#else -# define restrict // Disable restrict -#endif - -#ifdef _MSC_VER // Visual Studio -# define FORCE_INLINE static __forceinline -# include // For Visual 2005 -# if LZ4_ARCH64 // 64-bits -# pragma intrinsic(_BitScanForward64) // For Visual 2005 -# pragma intrinsic(_BitScanReverse64) // For Visual 2005 -# else // 32-bits -# pragma intrinsic(_BitScanForward) // For Visual 2005 -# pragma intrinsic(_BitScanReverse) // For Visual 2005 -# endif -# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant -#else -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -#endif - -#ifdef _MSC_VER -# define lz4_bswap16(x) _byteswap_ushort(x) -#else -# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) -#endif - -#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) -# define expect(expr,value) (__builtin_expect ((expr),(value)) ) -#else -# define expect(expr,value) (expr) -#endif - -#define likely(expr) expect((expr) != 0, 1) -#define unlikely(expr) expect((expr) != 0, 0) - - -//************************************** -// Memory routines -//************************************** -#include // malloc, calloc, free -#define ALLOCATOR(n,s) calloc(n,s) -#define FREEMEM free -#include // memset, memcpy -#define MEM_INIT memset - - -//************************************** -// Includes -//************************************** -#include "lz4.h" - - -//************************************** -// Basic Types -//************************************** -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; -#else - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; -#endif - -#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) -# define _PACKED __attribute__ ((packed)) -#else -# define _PACKED -#endif - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(1) -# else -# pragma pack(push, 1) -# endif -#endif - -typedef struct { U16 v; } _PACKED U16_S; -typedef struct { U32 v; } _PACKED U32_S; -typedef struct { U64 v; } _PACKED U64_S; -typedef struct {size_t v;} _PACKED size_t_S; - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(0) -# else -# pragma pack(pop) -# endif -#endif - -#define A16(x) (((U16_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A64(x) (((U64_S *)(x))->v) -#define AARCH(x) (((size_t_S *)(x))->v) - - -//************************************** -// Constants -//************************************** -#define LZ4_HASHLOG (MEMORY_USAGE-2) -#define HASHTABLESIZE (1 << MEMORY_USAGE) -#define HASHNBCELLS4 (1 << LZ4_HASHLOG) - -#define MINMATCH 4 - -#define COPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT 
(COPYLENGTH+MINMATCH) -const int LZ4_minLength = (MFLIMIT+1); - -#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1)) -#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data - -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) - -#define ML_BITS 4 -#define ML_MASK ((1U<=e; - - -//**************************** -// Private functions -//**************************** -#if LZ4_ARCH64 - -FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) -{ -# if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); -# else - int r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif -# else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif -# endif -} - -#else - -FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) -{ -# if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); -# else - int r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif -# else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif -# endif -} - -#endif - - -//**************************** -// Compression functions -//**************************** -FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType) -{ - if (tableType == byU16) - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); - else - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); -} - -FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } - -FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - switch (tableType) - { - case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } - case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } - } -} - -FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, 
const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); -} - -FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } - if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } - { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } // default, to ensure a return -} - -FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); -} - - -FORCE_INLINE int LZ4_compress_generic( - void* ctx, - const char* source, - char* dest, - int inputSize, - int maxOutputSize, - - limitedOutput_directive limitedOutput, - tableType_t tableType, - prefix64k_directive prefix) -{ - const BYTE* ip = (const BYTE*) source; - const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source; - const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source); - const BYTE* anchor = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - const BYTE* const mflimit = iend - MFLIMIT; - const BYTE* const matchlimit = iend - LASTLITERALS; - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + maxOutputSize; - - int length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - // Init conditions - if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative) - if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block - if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit - if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit) - if (inputSize> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if unlikely(forwardIp > mflimit) { goto _last_literals; } - - forwardH = LZ4_hashPosition(forwardIp, tableType); - ref = LZ4_getPositionOnHash(h, ctx, tableType, base); - LZ4_putPositionOnHash(ip, h, ctx, tableType, base); - - } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip))); - - // Catch up - while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } - - // Encode Literal length - length = (int)(ip - anchor); - token = op++; - if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit - if (length>=(int)RUN_MASK) - { - int len = length-RUN_MASK; - *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; - *op++ = (BYTE)len; - } - else *token = (BYTE)(length<>8) > oend)) return 0; // Check output limit - if (length>=(int)ML_MASK) - { - *token += ML_MASK; - length -= ML_MASK; - for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; } - if (length >= 255) { length-=255; *op++ = 255; } - *op++ = (BYTE)length; - } - else *token += (BYTE)(length); - - // Test end of chunk - if (ip > mflimit) { anchor = ip; break; } - - // Fill table - LZ4_putPosition(ip-2, ctx, tableType, base); - - // Test next position - ref = LZ4_getPosition(ip, ctx, tableType, base); - LZ4_putPosition(ip, ctx, tableType, base); - if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == 
A32(ip))) { token = op++; *token=0; goto _next_match; } - - // Prepare next loop - anchor = ip++; - forwardH = LZ4_hashPosition(ip, tableType); - } - -_last_literals: - // Encode Last Literals - { - int lastRun = (int)(iend - anchor); - if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else *op++ = (BYTE)(lastRun<hashTable, 0, sizeof(lz4ds->hashTable)); - lz4ds->bufferStart = base; - lz4ds->base = base; - lz4ds->nextBlock = base; -} - - -void* LZ4_create (const char* inputBuffer) -{ - void* lz4ds = ALLOCATOR(1, sizeof(LZ4_Data_Structure)); - LZ4_init ((LZ4_Data_Structure*)lz4ds, (const BYTE*)inputBuffer); - return lz4ds; -} - - -int LZ4_free (void* LZ4_Data) -{ - FREEMEM(LZ4_Data); - return (0); -} - - -char* LZ4_slideInputBuffer (void* LZ4_Data) -{ - LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data; - size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB); - - if ( (lz4ds->base - delta > lz4ds->base) // underflow control - || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit - { - size_t deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base; - int nH; - - for (nH=0; nH < HASHNBCELLS4; nH++) - { - if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0; - else lz4ds->hashTable[nH] -= (U32)deltaLimit; - } - memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); - lz4ds->base = lz4ds->bufferStart; - lz4ds->nextBlock = lz4ds->base + 64 KB; - } - else - { - memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); - lz4ds->nextBlock -= delta; - lz4ds->base -= delta; - } - - return (char*)(lz4ds->nextBlock); -} - - -//**************************** -// Decompression functions -//**************************** - -// This generic decompression function cover all use cases. -// It shall be instanciated several times, using different sets of directives -// Note that it is essential this generic function is really inlined, -// in order to remove useless branches during compilation optimisation. -FORCE_INLINE int LZ4_decompress_generic( - const char* source, - char* dest, - int inputSize, // - int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer. - - int endOnInput, // endOnOutputSize, endOnInputSize - int prefix64k, // noPrefix, withPrefix - int partialDecoding, // full, partial - int targetOutputSize // only used if partialDecoding==partial - ) -{ - // Local Variables - const BYTE* restrict ip = (const BYTE*) source; - const BYTE* ref; - const BYTE* const iend = ip + inputSize; - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + outputSize; - BYTE* cpy; - BYTE* oexit = op + targetOutputSize; - - const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64 - static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; - - - // Special cases - if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything - if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; // Empty output buffer - if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1); - - - // Main Loop - while (1) - { - unsigned token; - size_t length; - - // get runlength - token = *ip++; - if ((length=(token>>ML_BITS)) == RUN_MASK) - { - unsigned s=255; - while (((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) - || ((!endOnInput) && (cpy>oend-COPYLENGTH))) - { - if (partialDecoding) - { - if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer - if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer - } - else - { - if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there - if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed - } - memcpy(op, ip, length); - ip += length; - op += length; - break; // Necessarily EOF, due to parsing restrictions - } - LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; - - // get offset - LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; - if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer - - // get matchlength - if ((length=(token&ML_MASK)) == ML_MASK) - { - while ((!endOnInput) || (ipoend-COPYLENGTH-(STEPSIZE-4)) - { - if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals - LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH)); - while(op (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) -static inline int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } - -/* -LZ4_compressBound() : - Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) - primarily useful for memory allocation of output buffer. - inline function is recommended for the general case, - macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). - - isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE - return : maximum output size in a "worst case" scenario - or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) -*/ - - -int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); - -/* -LZ4_compress_limitedOutput() : - Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. - If it cannot achieve it, compression will stop, and result of the function will be zero. - This function never writes outside of provided output buffer. - - inputSize : Max supported value is LZ4_MAX_INPUT_VALUE - maxOutputSize : is the size of the destination buffer (which must be already allocated) - return : the number of bytes written in buffer 'dest' - or 0 if the compression fails -*/ - - -int LZ4_decompress_fast (const char* source, char* dest, int outputSize); - -/* -LZ4_decompress_fast() : - outputSize : is the original (uncompressed) size - return : the number of bytes read from the source buffer (in other words, the compressed size) - If the source stream is malformed, the function will stop decoding and return a negative result. - note : This function is a bit faster than LZ4_decompress_safe() - This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet. - Use this function preferably into a trusted environment (data to decode comes from a trusted source). 
- Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes. -*/ - -int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize); - -/* -LZ4_decompress_safe_partial() : - This function decompress a compressed block of size 'inputSize' at position 'source' - into output buffer 'dest' of size 'maxOutputSize'. - The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, - reducing decompression time. - return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) - Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. - Always control how many bytes were decoded. - If the source stream is detected malformed, the function will stop decoding and return a negative result. - This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets -*/ - - -//**************************** -// Stream Functions -//**************************** - -void* LZ4_create (const char* inputBuffer); -int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize); -int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize); -char* LZ4_slideInputBuffer (void* LZ4_Data); -int LZ4_free (void* LZ4_Data); - -/* -These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. -In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function : - -void* LZ4_create (const char* inputBuffer); -The result of the function is the (void*) pointer on the LZ4 Data Structure. -This pointer will be needed in all other functions. -If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. -The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. -The input buffer must be already allocated, and size at least 192KB. -'inputBuffer' will also be the 'const char* source' of the first block. - -All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. -To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(). -Their behavior are identical to LZ4_compress() or LZ4_compress_limitedOutput(), -but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one. -If next block does not begin immediately after the previous one, the compression will fail (return 0). - -When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : -char* LZ4_slideInputBuffer(void* LZ4_Data); -must be performed. It will typically copy the latest 64KB of input at the beginning of input buffer. -Note that, for this function to work properly, minimum size of an input buffer must be 192KB. -==> The memory position where the next input data block must start is provided as the result of the function. - -Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual. - -When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure. 
-*/ - - -int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize); -int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize); - -/* -*_withPrefix64k() : - These decoding functions work the same as their "normal name" versions, - but can use up to 64KB of data in front of 'char* dest'. - These functions are necessary to decode inter-dependant blocks. -*/ - - -//**************************** -// Obsolete Functions -//**************************** - -static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } -static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } - -/* -These functions are deprecated and should no longer be used. -They are provided here for compatibility with existing user programs. -*/ - - - -#if defined (__cplusplus) -} -#endif diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index d4ce4eb9c4f..c284028c51c 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16607,11 +16607,12 @@ static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, "Use trim.", NULL, NULL, TRUE); +#ifdef HAVE_LZ4 static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, PLUGIN_VAR_OPCMDARG , "Use LZ4 for page compression", NULL, NULL, FALSE); - +#endif /* HAVE_LZ4 */ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic index fb9581fc657..ea3c7546850 100644 --- a/storage/innobase/include/dict0pagecompress.ic +++ b/storage/innobase/include/dict0pagecompress.ic @@ -54,12 +54,12 @@ dict_tf_verify_flags( DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", return(ULINT_UNDEFINED);); - ut_ad(!table_unused); - ut_ad(!fsp_unused); - ut_ad(page_ssize == 0 || page_ssize != 0); /* silence compiler */ - ut_ad(compact == 0 || compact == 1); /* silence compiler */ - ut_ad(data_dir == 0 || data_dir == 1); /* silence compiler */ - ut_ad(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + ut_a(!table_unused); + ut_a(!fsp_unused); + ut_a(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_a(compact == 0 || compact == 1); /* silence compiler */ + ut_a(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_a(post_antelope == 0 || post_antelope == 1); /* silence compiler */ if (ssize != zip_ssize) { fprintf(stderr, diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 14fbb14bdd7..a13b19638af 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -18,6 +18,9 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) +INCLUDE(lz4) + +MYSQL_CHECK_SHARED_LZ4() # OS tests IF(UNIX) @@ -299,7 +302,6 @@ SET(INNOBASE_SOURCES eval/eval0proc.cc fil/fil0fil.cc fil/fil0pagecompress.cc - fil/lz4.c fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index a080ef0ee48..04fe25afa01 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1863,8 +1863,10 @@ buf_flush_start( /* There is already a flush batch of the same type running */ - fprintf(stderr, "Error: flush_type %d n_flush %lu init_flush\n", +#ifdef UNIV_DEBUG + fprintf(stderr, 
"Error: flush_type %d n_flush %lu init_flush %lu\n", flush_type, buf_pool->n_flush[flush_type], buf_pool->init_flush[flush_type]); +#endif mutex_exit(&buf_pool->flush_state_mutex); diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index 14ece48519f..31cf74e7f5a 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -554,7 +554,7 @@ buf_mtflu_flush_work_items( if((int)done_wi->id_usr == -1 && done_wi->wi_status == WRK_ITEM_SET ) { fprintf(stderr, - "**Set/Unused work_item[%lu] flush_type=%lu\n", + "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); ut_a(0); diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index 10ac273955f..8f835113b7f 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -63,7 +63,9 @@ static ulint srv_data_read, srv_data_written; #include #endif #include "row0mysql.h" +#ifdef HAVE_LZ4 #include "lz4.h" +#endif /****************************************************************//** For page compressed pages compress the page before actual write @@ -108,10 +110,11 @@ fil_compress_page( fprintf(stderr, "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", space_id, fil_space_name(space), len); -#endif +#endif /* UNIV_DEBUG */ write_size = UNIV_PAGE_SIZE - header_len; +#ifdef HAVE_LZ4 if (srv_use_lz4) { err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); write_size = err; @@ -127,6 +130,7 @@ fil_compress_page( return (buf); } } else { +#endif /* HAVE_LZ4 */ err = compress2(out_buf+header_len, &write_size, buf, len, level); if (err != Z_OK) { @@ -139,7 +143,9 @@ fil_compress_page( *out_len = len; return (buf); } +#ifdef HAVE_LZ4 } +#endif /* HAVE_LZ4 */ /* Set up the page header */ memcpy(out_buf, buf, FIL_PAGE_DATA); @@ -148,11 +154,15 @@ fil_compress_page( /* Set up the correct page type */ mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); /* Set up the flush lsn to be compression algorithm */ +#ifdef HAVE_LZ4 if (srv_use_lz4) { mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); } else { +#endif /* HAVE_LZ4 */ mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); +#ifdef HAVE_LZ4 } +#endif /* HAVE_LZ4 */ /* Set up the actual payload lenght */ mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); @@ -161,12 +171,17 @@ fil_compress_page( ut_ad(fil_page_is_compressed(out_buf)); ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + +#ifdef HAVE_LZ4 if (srv_use_lz4) { ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); } else { +#endif /* HAVE_LZ4 */ ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); +#ifdef HAVE_LZ4 } -#endif +#endif /* HAVE_LZ4 */ +#endif /* UNIV_DEBUG */ write_size+=header_len; /* Actual write needs to be alligned on block size */ @@ -178,7 +193,7 @@ fil_compress_page( fprintf(stderr, "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", space_id, fil_space_name(space), len, write_size); -#endif +#endif /* UNIV_DEBUG */ #define SECT_SIZE 512 @@ -236,8 +251,8 @@ fil_decompress_page( if (page_buf == NULL) { #ifdef UNIV_DEBUG fprintf(stderr, - "InnoDB: Note: Compression buffer not given, allocating...\n"); -#endif + "InnoDB: FIL: Note: Compression buffer not given, allocating...\n"); 
+#endif /* UNIV_DEBUG */ in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); } else { in_buf = page_buf; @@ -261,11 +276,10 @@ fil_decompress_page( fprintf(stderr, "InnoDB: Note: Preparing for decompress for len %lu\n", actual_size); -#endif +#endif /* UNIV_DEBUG */ err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); - /* If uncompress fails it means that page is corrupted */ if (err != Z_OK) { @@ -284,11 +298,12 @@ fil_decompress_page( fprintf(stderr, "InnoDB: Note: Decompression succeeded for len %lu \n", len); -#endif +#endif /* UNIV_DEBUG */ +#ifdef HAVE_LZ4 } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); - if (err != actual_size) { + if (err != (int)actual_size) { fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" "InnoDB: but decompression read only %d bytes.\n" @@ -298,6 +313,7 @@ fil_decompress_page( ut_error; } +#endif /* HAVE_LZ4 */ } else { fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" diff --git a/storage/xtradb/fil/lz4.c b/storage/xtradb/fil/lz4.c deleted file mode 100644 index 4e864de67d3..00000000000 --- a/storage/xtradb/fil/lz4.c +++ /dev/null @@ -1,822 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2013, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 source repository : http://code.google.com/p/lz4/ - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c -*/ - -//************************************** -// Tuning parameters -//************************************** -// MEMORY_USAGE : -// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) 
-// Increasing memory usage improves compression ratio -// Reduced memory usage can improve speed, due to cache effect -// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache -#define MEMORY_USAGE 14 - -// HEAPMODE : -// Select how default compression functions will allocate memory for their hash table, -// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). -#define HEAPMODE 0 - - -//************************************** -// CPU Feature Detection -//************************************** -// 32 or 64 bits ? -#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ - || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \ - || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \ - || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode -# define LZ4_ARCH64 1 -#else -# define LZ4_ARCH64 0 -#endif - -// Little Endian or Big Endian ? -// Overwrite the #define below if you know your architecture endianess -#if defined (__GLIBC__) -# include -# if (__BYTE_ORDER == __BIG_ENDIAN) -# define LZ4_BIG_ENDIAN 1 -# endif -#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) -# define LZ4_BIG_ENDIAN 1 -#elif defined(__sparc) || defined(__sparc__) \ - || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ - || defined(__hpux) || defined(__hppa) \ - || defined(_MIPSEB) || defined(__s390__) -# define LZ4_BIG_ENDIAN 1 -#else -// Little Endian assumed. PDP Endian and other very rare endian format are unsupported. -#endif - -// Unaligned memory access is automatically enabled for "common" CPU, such as x86. -// For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property -// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance -#if defined(__ARM_FEATURE_UNALIGNED) -# define LZ4_FORCE_UNALIGNED_ACCESS 1 -#endif - -// Define this parameter if your target system or compiler does not support hardware bit count -#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count -# define LZ4_FORCE_SW_BITCOUNT -#endif - -// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : -// This option may provide a small boost to performance for some big endian cpu, although probably modest. -// You may set this option to 1 if data will remain within closed environment. 
-// This option is useless on Little_Endian CPU (such as x86) -//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 - - -//************************************** -// Compiler Options -//************************************** -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99 -/* "restrict" is a known keyword */ -#else -# define restrict // Disable restrict -#endif - -#ifdef _MSC_VER // Visual Studio -# define FORCE_INLINE static __forceinline -# include // For Visual 2005 -# if LZ4_ARCH64 // 64-bits -# pragma intrinsic(_BitScanForward64) // For Visual 2005 -# pragma intrinsic(_BitScanReverse64) // For Visual 2005 -# else // 32-bits -# pragma intrinsic(_BitScanForward) // For Visual 2005 -# pragma intrinsic(_BitScanReverse) // For Visual 2005 -# endif -# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant -#else -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -#endif - -#ifdef _MSC_VER -# define lz4_bswap16(x) _byteswap_ushort(x) -#else -# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) -#endif - -#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) -# define expect(expr,value) (__builtin_expect ((expr),(value)) ) -#else -# define expect(expr,value) (expr) -#endif - -#define likely(expr) expect((expr) != 0, 1) -#define unlikely(expr) expect((expr) != 0, 0) - - -//************************************** -// Memory routines -//************************************** -#include // malloc, calloc, free -#define ALLOCATOR(n,s) calloc(n,s) -#define FREEMEM free -#include // memset, memcpy -#define MEM_INIT memset - - -//************************************** -// Includes -//************************************** -#include "lz4.h" - - -//************************************** -// Basic Types -//************************************** -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; -#else - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; -#endif - -#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) -# define _PACKED __attribute__ ((packed)) -#else -# define _PACKED -#endif - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(1) -# else -# pragma pack(push, 1) -# endif -#endif - -typedef struct { U16 v; } _PACKED U16_S; -typedef struct { U32 v; } _PACKED U32_S; -typedef struct { U64 v; } _PACKED U64_S; -typedef struct {size_t v;} _PACKED size_t_S; - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(0) -# else -# pragma pack(pop) -# endif -#endif - -#define A16(x) (((U16_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A64(x) (((U64_S *)(x))->v) -#define AARCH(x) (((size_t_S *)(x))->v) - - -//************************************** -// Constants -//************************************** -#define LZ4_HASHLOG (MEMORY_USAGE-2) -#define HASHTABLESIZE (1 << MEMORY_USAGE) -#define HASHNBCELLS4 (1 << LZ4_HASHLOG) - -#define MINMATCH 4 - -#define COPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT 
(COPYLENGTH+MINMATCH) -const int LZ4_minLength = (MFLIMIT+1); - -#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1)) -#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data - -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) - -#define ML_BITS 4 -#define ML_MASK ((1U<=e; - - -//**************************** -// Private functions -//**************************** -#if LZ4_ARCH64 - -FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) -{ -# if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); -# else - int r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif -# else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif -# endif -} - -#else - -FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) -{ -# if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); -# else - int r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif -# else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif -# endif -} - -#endif - - -//**************************** -// Compression functions -//**************************** -FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType) -{ - if (tableType == byU16) - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); - else - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); -} - -FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } - -FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - switch (tableType) - { - case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } - case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } - } -} - -FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, 
const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); -} - -FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } - if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } - { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } // default, to ensure a return -} - -FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); -} - - -FORCE_INLINE int LZ4_compress_generic( - void* ctx, - const char* source, - char* dest, - int inputSize, - int maxOutputSize, - - limitedOutput_directive limitedOutput, - tableType_t tableType, - prefix64k_directive prefix) -{ - const BYTE* ip = (const BYTE*) source; - const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source; - const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source); - const BYTE* anchor = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - const BYTE* const mflimit = iend - MFLIMIT; - const BYTE* const matchlimit = iend - LASTLITERALS; - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + maxOutputSize; - - int length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - // Init conditions - if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative) - if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block - if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit - if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit) - if (inputSize> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if unlikely(forwardIp > mflimit) { goto _last_literals; } - - forwardH = LZ4_hashPosition(forwardIp, tableType); - ref = LZ4_getPositionOnHash(h, ctx, tableType, base); - LZ4_putPositionOnHash(ip, h, ctx, tableType, base); - - } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip))); - - // Catch up - while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } - - // Encode Literal length - length = (int)(ip - anchor); - token = op++; - if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit - if (length>=(int)RUN_MASK) - { - int len = length-RUN_MASK; - *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; - *op++ = (BYTE)len; - } - else *token = (BYTE)(length<>8) > oend)) return 0; // Check output limit - if (length>=(int)ML_MASK) - { - *token += ML_MASK; - length -= ML_MASK; - for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; } - if (length >= 255) { length-=255; *op++ = 255; } - *op++ = (BYTE)length; - } - else *token += (BYTE)(length); - - // Test end of chunk - if (ip > mflimit) { anchor = ip; break; } - - // Fill table - LZ4_putPosition(ip-2, ctx, tableType, base); - - // Test next position - ref = LZ4_getPosition(ip, ctx, tableType, base); - LZ4_putPosition(ip, ctx, tableType, base); - if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == 
A32(ip))) { token = op++; *token=0; goto _next_match; } - - // Prepare next loop - anchor = ip++; - forwardH = LZ4_hashPosition(ip, tableType); - } - -_last_literals: - // Encode Last Literals - { - int lastRun = (int)(iend - anchor); - if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else *op++ = (BYTE)(lastRun<hashTable, 0, sizeof(lz4ds->hashTable)); - lz4ds->bufferStart = base; - lz4ds->base = base; - lz4ds->nextBlock = base; -} - - -void* LZ4_create (const char* inputBuffer) -{ - void* lz4ds = ALLOCATOR(1, sizeof(LZ4_Data_Structure)); - LZ4_init ((LZ4_Data_Structure*)lz4ds, (const BYTE*)inputBuffer); - return lz4ds; -} - - -int LZ4_free (void* LZ4_Data) -{ - FREEMEM(LZ4_Data); - return (0); -} - - -char* LZ4_slideInputBuffer (void* LZ4_Data) -{ - LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data; - size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB); - - if ( (lz4ds->base - delta > lz4ds->base) // underflow control - || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit - { - size_t deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base; - int nH; - - for (nH=0; nH < HASHNBCELLS4; nH++) - { - if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0; - else lz4ds->hashTable[nH] -= (U32)deltaLimit; - } - memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); - lz4ds->base = lz4ds->bufferStart; - lz4ds->nextBlock = lz4ds->base + 64 KB; - } - else - { - memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); - lz4ds->nextBlock -= delta; - lz4ds->base -= delta; - } - - return (char*)(lz4ds->nextBlock); -} - - -//**************************** -// Decompression functions -//**************************** - -// This generic decompression function cover all use cases. -// It shall be instanciated several times, using different sets of directives -// Note that it is essential this generic function is really inlined, -// in order to remove useless branches during compilation optimisation. -FORCE_INLINE int LZ4_decompress_generic( - const char* source, - char* dest, - int inputSize, // - int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer. - - int endOnInput, // endOnOutputSize, endOnInputSize - int prefix64k, // noPrefix, withPrefix - int partialDecoding, // full, partial - int targetOutputSize // only used if partialDecoding==partial - ) -{ - // Local Variables - const BYTE* restrict ip = (const BYTE*) source; - const BYTE* ref; - const BYTE* const iend = ip + inputSize; - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + outputSize; - BYTE* cpy; - BYTE* oexit = op + targetOutputSize; - - const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64 - static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; - - - // Special cases - if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything - if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; // Empty output buffer - if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1); - - - // Main Loop - while (1) - { - unsigned token; - size_t length; - - // get runlength - token = *ip++; - if ((length=(token>>ML_BITS)) == RUN_MASK) - { - unsigned s=255; - while (((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) - || ((!endOnInput) && (cpy>oend-COPYLENGTH))) - { - if (partialDecoding) - { - if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer - if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer - } - else - { - if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there - if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed - } - memcpy(op, ip, length); - ip += length; - op += length; - break; // Necessarily EOF, due to parsing restrictions - } - LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; - - // get offset - LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; - if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer - - // get matchlength - if ((length=(token&ML_MASK)) == ML_MASK) - { - while ((!endOnInput) || (ipoend-COPYLENGTH-(STEPSIZE-4)) - { - if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals - LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH)); - while(op (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) -static inline int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } - -/* -LZ4_compressBound() : - Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) - primarily useful for memory allocation of output buffer. - inline function is recommended for the general case, - macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). - - isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE - return : maximum output size in a "worst case" scenario - or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) -*/ - - -int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); - -/* -LZ4_compress_limitedOutput() : - Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. - If it cannot achieve it, compression will stop, and result of the function will be zero. - This function never writes outside of provided output buffer. - - inputSize : Max supported value is LZ4_MAX_INPUT_VALUE - maxOutputSize : is the size of the destination buffer (which must be already allocated) - return : the number of bytes written in buffer 'dest' - or 0 if the compression fails -*/ - - -int LZ4_decompress_fast (const char* source, char* dest, int outputSize); - -/* -LZ4_decompress_fast() : - outputSize : is the original (uncompressed) size - return : the number of bytes read from the source buffer (in other words, the compressed size) - If the source stream is malformed, the function will stop decoding and return a negative result. - note : This function is a bit faster than LZ4_decompress_safe() - This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet. - Use this function preferably into a trusted environment (data to decode comes from a trusted source). 
- Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes. -*/ - -int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize); - -/* -LZ4_decompress_safe_partial() : - This function decompress a compressed block of size 'inputSize' at position 'source' - into output buffer 'dest' of size 'maxOutputSize'. - The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, - reducing decompression time. - return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) - Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. - Always control how many bytes were decoded. - If the source stream is detected malformed, the function will stop decoding and return a negative result. - This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets -*/ - - -//**************************** -// Stream Functions -//**************************** - -void* LZ4_create (const char* inputBuffer); -int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize); -int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize); -char* LZ4_slideInputBuffer (void* LZ4_Data); -int LZ4_free (void* LZ4_Data); - -/* -These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. -In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function : - -void* LZ4_create (const char* inputBuffer); -The result of the function is the (void*) pointer on the LZ4 Data Structure. -This pointer will be needed in all other functions. -If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. -The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. -The input buffer must be already allocated, and size at least 192KB. -'inputBuffer' will also be the 'const char* source' of the first block. - -All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. -To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(). -Their behavior are identical to LZ4_compress() or LZ4_compress_limitedOutput(), -but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one. -If next block does not begin immediately after the previous one, the compression will fail (return 0). - -When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : -char* LZ4_slideInputBuffer(void* LZ4_Data); -must be performed. It will typically copy the latest 64KB of input at the beginning of input buffer. -Note that, for this function to work properly, minimum size of an input buffer must be 192KB. -==> The memory position where the next input data block must start is provided as the result of the function. - -Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual. - -When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure. 
-*/ - - -int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize); -int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize); - -/* -*_withPrefix64k() : - These decoding functions work the same as their "normal name" versions, - but can use up to 64KB of data in front of 'char* dest'. - These functions are necessary to decode inter-dependant blocks. -*/ - - -//**************************** -// Obsolete Functions -//**************************** - -static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } -static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } - -/* -These functions are deprecated and should no longer be used. -They are provided here for compatibility with existing user programs. -*/ - - - -#if defined (__cplusplus) -} -#endif diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index ead0b0fc902..2b23526da5d 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -17968,10 +17968,12 @@ static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, "Use trim.", NULL, NULL, TRUE); +#ifdef HAVE_LZ4 static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, PLUGIN_VAR_OPCMDARG , "Use LZ4 for page compression", NULL, NULL, FALSE); +#endif /* HAVE_LZ4 */ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), diff --git a/storage/xtradb/include/dict0pagecompress.ic b/storage/xtradb/include/dict0pagecompress.ic index fb9581fc657..ea3c7546850 100644 --- a/storage/xtradb/include/dict0pagecompress.ic +++ b/storage/xtradb/include/dict0pagecompress.ic @@ -54,12 +54,12 @@ dict_tf_verify_flags( DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", return(ULINT_UNDEFINED);); - ut_ad(!table_unused); - ut_ad(!fsp_unused); - ut_ad(page_ssize == 0 || page_ssize != 0); /* silence compiler */ - ut_ad(compact == 0 || compact == 1); /* silence compiler */ - ut_ad(data_dir == 0 || data_dir == 1); /* silence compiler */ - ut_ad(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + ut_a(!table_unused); + ut_a(!fsp_unused); + ut_a(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_a(compact == 0 || compact == 1); /* silence compiler */ + ut_a(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_a(post_antelope == 0 || post_antelope == 1); /* silence compiler */ if (ssize != zip_ssize) { fprintf(stderr, From 184e302ab471ebf47662221eba883cb47a3fa84c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 12 Feb 2014 07:09:06 +0200 Subject: [PATCH 17/56] Fix compiler error if lz4 is not found on the system. 
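
The compile failure happens because the earlier page-compression patch made the
definition of the use_lz4 system variable conditional on HAVE_LZ4, while the
MYSQL_SYSVAR(use_lz4) entry in innobase_system_variables[] was still compiled
unconditionally, so a build on a system without liblz4 references a symbol that
no longer exists. The hunks below guard the registration with the same #ifdef.
A minimal stand-alone illustration of the pattern (plain C++ with made-up names
standing in for the MYSQL_SYSVAR machinery, not the server's actual code):

    #include <cstdio>

    /* Defined only when the optional library is available, exactly like
       srv_use_lz4 behind HAVE_LZ4 in the real code. */
    #ifdef HAVE_LZ4
    static bool srv_use_lz4 = false;
    #endif

    /* Every reference must sit behind the same guard as the definition,
       otherwise builds without the library fail. */
    static bool* system_variables[] = {
    #ifdef HAVE_LZ4
        &srv_use_lz4,
    #endif
        nullptr        /* terminator, mirroring innobase_system_variables[] */
    };

    int main()
    {
        std::printf("%zu optional settings registered\n",
                    sizeof(system_variables) / sizeof(system_variables[0]) - 1);
        return 0;
    }

Either both the definition and every reference sit behind the guard, or neither
does; guarding only one side is what broke the build this commit repairs.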
--- storage/innobase/handler/ha_innodb.cc | 2 ++ storage/xtradb/handler/ha_innodb.cc | 2 ++ 2 files changed, 4 insertions(+) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index c284028c51c..812aa0cfe83 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16763,7 +16763,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(trim_pct), MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), +#ifdef HAVE_LZ4 MYSQL_SYSVAR(use_lz4), +#endif NULL }; diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 2b23526da5d..557872abdf0 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -18169,7 +18169,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(trim_pct), MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), +#ifdef HAVE_LZ4 MYSQL_SYSVAR(use_lz4), +#endif NULL }; From f6ad325883dafdcdf1645d198bfe1a59e5a2b44b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 12 Feb 2014 10:55:45 +0200 Subject: [PATCH 18/56] Code cleanup. Removed those questions that are now addressed. --- storage/innobase/buf/buf0mtflu.cc | 41 +++++++++++++++++-------------- storage/xtradb/buf/buf0mtflu.cc | 41 +++++++++++++++++-------------- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index a42e6158250..9cf5a66fc72 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -108,7 +108,7 @@ typedef struct wrk_itm wr_tsk_t wr; /*!< Flush page list */ rd_tsk_t rd; /*!< Decompress page list */ ulint n_flushed; /*!< Flushed pages count */ - os_thread_t id_usr; /*!< Thread-id currently working */ + os_thread_id_t id_usr; /*!< Thread-id currently working */ wrk_status_t wi_status; /*!< Work item status */ struct wrk_itm *next; /*!< Next work item */ } wrk_t; @@ -125,12 +125,12 @@ typedef struct thread_sync wthr_status_t wt_status; /*!< Worker thread status */ mem_heap_t* wheap; /*!< Work heap where memory is allocated */ - wrk_t* work_item; /*!< Work items to be processed */ + wrk_t* work_item; /*!< Array of work-items that are + individually accessed by multiple + threads. Items are accessed in a + thread safe manner.*/ } thread_sync_t; -/* QUESTION: Is this array used from several threads concurrently ? */ -// static wrk_t work_items[MTFLUSH_MAX_WORKER]; - /* TODO: REALLY NEEDED ? */ static int mtflush_work_initialized = -1; static os_fast_mutex_t mtflush_mtx; @@ -203,9 +203,7 @@ buf_mtflu_flush_pool_instance( help in the retry which will follow the failure. */ #ifdef UNIV_DEBUG - /* QUESTION: is this a really failure ? 
*/ - fprintf(stderr, "flush_start Failed, flush_type:%d\n", - work_item->wr.flush_type); + fprintf(stderr, "flush start failed.\n"); #endif return 0; } @@ -230,7 +228,7 @@ buf_mtflu_flush_pool_instance( buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); buf_flush_common(work_item->wr.flush_type, work_item->n_flushed); - return 0; + return work_item->n_flushed; } #ifdef UNIV_DEBUG @@ -287,23 +285,30 @@ mtflush_service_io( return; } - work_item->id_usr = mtflush_io->wthread; + work_item->id_usr = os_thread_get_curr_id(); + + /* This works as a producer/consumer model, where in tasks are + * inserted into the work-queue (wq) and completions are based + * on the type of operations performed and as a result the WRITE/ + * compression/flush operation completions get posted to wr_cq. + * And READ/decompress operations completions get posted to rd_cq. + * in future we may have others. + */ switch(work_item->tsk) { case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_SUCCESS; - /* QUESTION: Why completed work items are inserted to - completion queue ? */ ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); break; case MT_WRK_WRITE: work_item->wi_status = WRK_ITEM_START; /* Process work item */ - /* QUESTION: Is this a really a error ? */ - if (0 != (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { - fprintf(stderr, "FLUSH op failed ret:%lu\n", n_flushed); + if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { +#ifdef UNIV_DEBUG + fprintf(stderr, "No pages flushed\n"); +#endif work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; @@ -551,7 +556,7 @@ buf_mtflu_flush_work_items( "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); - ut_a(0); + ut_ad(0); } n_flushed+= done_wi->n_flushed; @@ -598,7 +603,7 @@ buf_mtflu_flush_list( / srv_buf_pool_instances; } - /* QUESTION: What is procted by below mutex ? */ + /* This lock is to safequard against re-entry if any. */ os_fast_mutex_lock(&mtflush_mtx); buf_mtflu_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LIST, @@ -641,7 +646,7 @@ buf_mtflu_flush_LRU_tail(void) ut_a(buf_mtflu_init_done()); - /* QUESTION: What is protected by below mutex ? */ + /* This lock is to safeguard against re-entry if any */ os_fast_mutex_lock(&mtflush_mtx); buf_mtflu_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index 31cf74e7f5a..f98d99228af 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -108,7 +108,7 @@ typedef struct wrk_itm wr_tsk_t wr; /*!< Flush page list */ rd_tsk_t rd; /*!< Decompress page list */ ulint n_flushed; /*!< Flushed pages count */ - os_thread_t id_usr; /*!< Thread-id currently working */ + os_thread_id_t id_usr; /*!< Thread-id currently working */ wrk_status_t wi_status; /*!< Work item status */ struct wrk_itm *next; /*!< Next work item */ } wrk_t; @@ -125,12 +125,12 @@ typedef struct thread_sync wthr_status_t wt_status; /*!< Worker thread status */ mem_heap_t* wheap; /*!< Work heap where memory is allocated */ - wrk_t* work_item; /*!< Work items to be processed */ + wrk_t* work_item; /*!< Array of work-items that are + individually accessed by multiple + threads. Items are accessed in a + thread safe manner.*/ } thread_sync_t; -/* QUESTION: Is this array used from several threads concurrently ? 
*/ -// static wrk_t work_items[MTFLUSH_MAX_WORKER]; - /* TODO: REALLY NEEDED ? */ static int mtflush_work_initialized = -1; static os_fast_mutex_t mtflush_mtx; @@ -205,9 +205,7 @@ buf_mtflu_flush_pool_instance( help in the retry which will follow the failure. */ #ifdef UNIV_DEBUG - /* QUESTION: is this a really failure ? */ - fprintf(stderr, "flush_start Failed, flush_type:%d\n", - work_item->wr.flush_type); + fprintf(stderr, "flush start failed.\n"); #endif return 0; } @@ -235,7 +233,7 @@ buf_mtflu_flush_pool_instance( buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); buf_flush_common(work_item->wr.flush_type, work_item->n_flushed); - return 0; + return work_item->n_flushed; } #ifdef UNIV_DEBUG @@ -293,23 +291,30 @@ mtflush_service_io( return; } - work_item->id_usr = mtflush_io->wthread; + work_item->id_usr = os_thread_get_curr_id(); + + /* This works as a producer/consumer model, where in tasks are + * inserted into the work-queue (wq) and completions are based + * on the type of operations performed and as a result the WRITE/ + * compression/flush operation completions get posted to wr_cq. + * And READ/decompress operations completions get posted to rd_cq. + * in future we may have others. + */ switch(work_item->tsk) { case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_SUCCESS; - /* QUESTION: Why completed work items are inserted to - completion queue ? */ ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); break; case MT_WRK_WRITE: work_item->wi_status = WRK_ITEM_START; /* Process work item */ - /* QUESTION: Is this a really a error ? */ - if (0 != (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { - fprintf(stderr, "FLUSH op failed ret:%lu\n", n_flushed); + if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { +#ifdef UNIV_DEBUG + fprintf(stderr, "No pages flushed\n"); +#endif work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; @@ -557,7 +562,7 @@ buf_mtflu_flush_work_items( "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); - ut_a(0); + ut_ad(0); } n_flushed+= done_wi->n_flushed; @@ -604,7 +609,7 @@ buf_mtflu_flush_list( / srv_buf_pool_instances; } - /* QUESTION: What is procted by below mutex ? */ + /* This lock is to safequard against re-entry if any. */ os_fast_mutex_lock(&mtflush_mtx); buf_mtflu_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LIST, @@ -647,7 +652,7 @@ buf_mtflu_flush_LRU_tail(void) ut_a(buf_mtflu_init_done()); - /* QUESTION: What is protected by below mutex ? */ + /* This lock is to safeguard against re-entry if any */ os_fast_mutex_lock(&mtflush_mtx); buf_mtflu_flush_work_items(srv_buf_pool_instances, cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); From 1fa19bf777cb435e6630694fae029802260b5f6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 12 Feb 2014 12:52:34 +0200 Subject: [PATCH 19/56] Fixed issue on atomic writes setup and atomic blobs setup on system tables. 
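Two problems in dict_tf_set() are addressed below: the PAGE_COMPRESSED branch overwrote the whole flag word instead of OR-ing its bits into it, and the atomic-writes branch set only ATOMIC_BLOBS without also recording the ATOMIC_WRITES field. The validation routines additionally gain fprintf() diagnostics so a rejected SYS_TABLES.TYPE is reported rather than silently returning ULINT_UNDEFINED. As a rough sketch of how the flag word is composed (bit positions come from the DICT_TF_POS_* constants used in the hunks; the variables are illustrative):

    ulint flags = DICT_TF_COMPACT;

    flags |= 1 << DICT_TF_POS_ATOMIC_BLOBS;
    flags |= 1 << DICT_TF_POS_PAGE_COMPRESSION;
    flags |= page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL;
    flags |= atomic_writes << DICT_TF_POS_ATOMIC_WRITES;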
--- storage/innobase/include/dict0dict.ic | 28 ++++++++++++++++++++++----- storage/xtradb/include/dict0dict.ic | 28 ++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index ed891a00fd4..7cc0404e0eb 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -681,12 +681,16 @@ dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } } /* Make sure there are no bits that we do not know about. */ if (unused) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + type, unused); return(ULINT_UNDEFINED); } @@ -701,6 +705,8 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + type, zip_ssize); return(ULINT_UNDEFINED); } @@ -710,11 +716,15 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } } @@ -731,6 +741,9 @@ dict_sys_tables_type_validate( low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs || !page_compression) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + "InnoDB: Error: atomic_blobs %lu\n", + type, page_compression, page_compression_level, atomic_blobs); return(ULINT_UNDEFINED); } } @@ -738,6 +751,9 @@ dict_sys_tables_type_validate( if (awrites == ATOMIC_WRITES_ON || (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { if (!atomic_blobs) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu atomic_blobs %lu\n", + type, atomic_writes, atomic_blobs); + return(ULINT_UNDEFINED); } } @@ -846,10 +862,9 @@ dict_tf_set( } if (page_compressed) { - *flags = DICT_TF_COMPACT - | (1 << DICT_TF_POS_ATOMIC_BLOBS) - | (1 << DICT_TF_POS_PAGE_COMPRESSION) - | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); ut_ad(zip_ssize == 0); ut_ad(dict_tf_get_page_compression(*flags) == TRUE); @@ -863,7 +878,8 @@ dict_tf_set( if (awrites == ATOMIC_WRITES_ON || (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { - *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES) + | (1 << DICT_TF_POS_ATOMIC_BLOBS); } if (use_data_dir) { @@ -996,6 +1012,8 @@ dict_tf_to_sys_tables_type( | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL | DICT_TF_MASK_ATOMIC_WRITES); + ut_a(dict_sys_tables_type_validate(type, 0)); + return(type); } diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index 1ce4fe6a2f1..3f6d56fab1f 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -685,12 +685,16 @@ 
dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } } /* Make sure there are no bits that we do not know about. */ if (unused) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + type, unused); return(ULINT_UNDEFINED); } @@ -705,6 +709,8 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + type, zip_ssize); return(ULINT_UNDEFINED); } @@ -714,11 +720,15 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } } @@ -735,6 +745,9 @@ dict_sys_tables_type_validate( low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs || !page_compression) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + "InnoDB: Error: atomic_blobs %lu\n", + type, page_compression, page_compression_level, atomic_blobs); return(ULINT_UNDEFINED); } } @@ -742,6 +755,9 @@ dict_sys_tables_type_validate( if (awrites == ATOMIC_WRITES_ON || (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { if (!atomic_blobs) { + fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu atomic_blobs %lu\n", + type, atomic_writes, atomic_blobs); + return(ULINT_UNDEFINED); } } @@ -854,10 +870,9 @@ dict_tf_set( } if (page_compressed) { - *flags = DICT_TF_COMPACT - | (1 << DICT_TF_POS_ATOMIC_BLOBS) - | (1 << DICT_TF_POS_PAGE_COMPRESSION) - | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); ut_ad(zip_ssize == 0); ut_ad(dict_tf_get_page_compression(*flags) == TRUE); @@ -871,7 +886,8 @@ dict_tf_set( if (awrites == ATOMIC_WRITES_ON || (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { - *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES) + | (1 << DICT_TF_POS_ATOMIC_BLOBS); } } @@ -1000,6 +1016,8 @@ dict_tf_to_sys_tables_type( | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL | DICT_TF_MASK_ATOMIC_WRITES); + ut_a(dict_sys_tables_type_validate(type, 0)); + return(type); } From da927da04def025f91f6d71172d6b525513a6cd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 12 Feb 2014 18:00:03 +0200 Subject: [PATCH 20/56] Fixed issue on atomic writes and system tables. Atomic writes can be used also on system tables but not per table. 
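Whether the system tables use atomic writes is governed by the global srv_use_atomic_writes setting, not by per-table flags, so the per-table checks below no longer demand ATOMIC_BLOBS whenever atomic writes are requested; they only verify that the stored atomic-writes field carries one of its defined values. A minimal sketch of that rule, equivalent in effect to the range checks added in these hunks:

    switch ((atomic_writes_t) atomic_writes) {
    case ATOMIC_WRITES_DEFAULT:
    case ATOMIC_WRITES_ON:
    case ATOMIC_WRITES_OFF:
            break;                   /* acceptable field value */
    default:
            return(ULINT_UNDEFINED); /* corrupted SYS_TABLES.TYPE */
    }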
--- storage/innobase/buf/buf0mtflu.cc | 14 ++++++--- storage/innobase/include/dict0dict.ic | 45 ++++++++++++++------------- storage/xtradb/buf/buf0mtflu.cc | 14 ++++++--- storage/xtradb/include/dict0dict.ic | 45 ++++++++++++++------------- 4 files changed, 66 insertions(+), 52 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index 9cf5a66fc72..a28b1885fe4 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -298,7 +298,7 @@ mtflush_service_io( switch(work_item->tsk) { case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); - work_item->wi_status = WRK_ITEM_SUCCESS; + work_item->wi_status = WRK_ITEM_EXIT; ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); break; @@ -419,11 +419,17 @@ buf_mtflu_io_thread_exit(void) work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); - if (work_item) { + /* If we receive reply to work item and it's status is exit, + thead has processed this message and existed */ + if (work_item && work_item->wi_status == WRK_ITEM_EXIT) { i++; } } + /* Wait about 1/2 sec to allow threads really exit */ + os_thread_sleep(50000); + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); @@ -432,10 +438,10 @@ buf_mtflu_io_thread_exit(void) ib_wqueue_free(mtflush_io->wr_cq); ib_wqueue_free(mtflush_io->rd_cq); + os_fast_mutex_free(&mtflush_mtx); + /* Free heap */ mem_heap_free(mtflush_io->wheap); - - os_fast_mutex_free(&mtflush_mtx); } /******************************************************************//** diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 7cc0404e0eb..73fc9ac56fd 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -645,6 +645,24 @@ dict_tf_is_valid( } } + if (atomic_writes) { + + if(atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); + } + } + /* CREATE TABLE ... DATA DIRECTORY is supported for any row format, so the DATA_DIR flag is compatible with all other table flags. */ @@ -670,7 +688,8 @@ dict_sys_tables_type_validate( ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); - atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + + ut_a(atomic_writes >= 0 && atomic_writes <= ATOMIC_WRITES_OFF); /* The low order bit of SYS_TABLES.TYPE is always set to 1. 
If the format is UNIV_FORMAT_B or higher, this field is the same @@ -748,16 +767,6 @@ dict_sys_tables_type_validate( } } - if (awrites == ATOMIC_WRITES_ON || - (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { - if (!atomic_blobs) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu atomic_blobs %lu\n", - type, atomic_writes, atomic_blobs); - - return(ULINT_UNDEFINED); - } - } - /* Return the validated SYS_TABLES.TYPE. */ return(type); } @@ -871,16 +880,8 @@ dict_tf_set( ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); } - if (awrites != ATOMIC_WRITES_DEFAULT) { - *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); - ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); - } - - if (awrites == ATOMIC_WRITES_ON || - (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { - *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES) - | (1 << DICT_TF_POS_ATOMIC_BLOBS); - } + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index f98d99228af..5b4d285be21 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -304,7 +304,7 @@ mtflush_service_io( switch(work_item->tsk) { case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); - work_item->wi_status = WRK_ITEM_SUCCESS; + work_item->wi_status = WRK_ITEM_EXIT; ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); break; @@ -425,11 +425,17 @@ buf_mtflu_io_thread_exit(void) work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); - if (work_item) { + /* If we receive reply to work item and it's status is exit, + thead has processed this message and existed */ + if (work_item && work_item->wi_status == WRK_ITEM_EXIT) { i++; } } + /* Wait about 1/2 sec to allow threads really exit */ + os_thread_sleep(50000); + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); @@ -438,10 +444,10 @@ buf_mtflu_io_thread_exit(void) ib_wqueue_free(mtflush_io->wr_cq); ib_wqueue_free(mtflush_io->rd_cq); + os_fast_mutex_free(&mtflush_mtx); + /* Free heap */ mem_heap_free(mtflush_io->wheap); - - os_fast_mutex_free(&mtflush_mtx); } /******************************************************************//** diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index 3f6d56fab1f..d0fbb0d33d2 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -649,6 +649,24 @@ dict_tf_is_valid( } } + if (atomic_writes) { + + if(atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); + } + } + /* CREATE TABLE ... DATA DIRECTORY is supported for any row format, so the DATA_DIR flag is compatible with all other table flags. */ @@ -674,7 +692,8 @@ dict_sys_tables_type_validate( ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); - atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + + ut_a(atomic_writes >= 0 && atomic_writes <= ATOMIC_WRITES_OFF); /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same @@ -752,16 +771,6 @@ dict_sys_tables_type_validate( } } - if (awrites == ATOMIC_WRITES_ON || - (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { - if (!atomic_blobs) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu atomic_blobs %lu\n", - type, atomic_writes, atomic_blobs); - - return(ULINT_UNDEFINED); - } - } - /* Return the validated SYS_TABLES.TYPE. */ return(type); } @@ -879,16 +888,8 @@ dict_tf_set( ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); } - if (awrites != ATOMIC_WRITES_DEFAULT) { - *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); - ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); - } - - if (awrites == ATOMIC_WRITES_ON || - (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { - *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES) - | (1 << DICT_TF_POS_ATOMIC_BLOBS); - } + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); } /********************************************************************//** From d17ecff410180adf96dcd7f261157d52e7f62af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 13 Feb 2014 09:13:56 +0200 Subject: [PATCH 21/56] Fixed issue on data dictionary corruption. Fixed issue on multi-threaded flush at shutdown. Removed unnecessary startup option innodb_compress_pages. Added a new startup option innodb_mtflush_threads, default 8. 
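The shutdown path of the multi-threaded flush is reworked below: a worker no longer watches srv_shutdown_state but runs until it receives an MT_WRK_NONE ("exit") work item, posts that item back on the completion queue and stops, while buf_mtflu_io_thread_exit() sends exactly one such item per flush thread and collects the acknowledgements before freeing the queues, the mutex and the heap. A simplified sketch of the sending side of that handshake (the rd/wr payload resets and error handling are omitted):

    /* one exit item per configured flush thread */
    for (long i = 0; i < srv_mtflush_threads; i++) {
            mtflush_io->work_item[i].tsk = MT_WRK_NONE;
            mtflush_io->work_item[i].wi_status = WRK_ITEM_EXIT;
            ib_wqueue_add(mtflush_io->wq,
                          &mtflush_io->work_item[i],
                          mtflush_io->wheap);
    }
    /* each worker echoes its item to wr_cq, sets WTHR_KILL_IT and returns */

The thread count is also decoupled from the number of buffer pool instances: it now comes from the new innodb_mtflush_threads option (default MTFLUSH_DEFAULT_WORKER = 8, upper limit MTFLUSH_MAX_WORKER = 64).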
--- storage/innobase/buf/buf0mtflu.cc | 37 +++++++++++++++------------ storage/innobase/handler/ha_innodb.cc | 24 ++++++++--------- storage/innobase/include/dict0dict.ic | 19 +++++++++----- storage/innobase/include/fsp0fsp.ic | 22 +++++++++++++--- storage/innobase/include/srv0srv.h | 4 +-- storage/innobase/os/os0file.cc | 3 ++- storage/innobase/srv/srv0srv.cc | 5 +--- storage/innobase/srv/srv0start.cc | 11 +++----- storage/xtradb/buf/buf0mtflu.cc | 37 +++++++++++++++------------ storage/xtradb/handler/ha_innodb.cc | 24 ++++++++--------- storage/xtradb/include/dict0dict.ic | 19 +++++++++----- storage/xtradb/include/fsp0fsp.ic | 22 +++++++++++++--- storage/xtradb/include/srv0srv.h | 4 +-- storage/xtradb/os/os0file.cc | 3 ++- storage/xtradb/srv/srv0srv.cc | 5 +--- storage/xtradb/srv/srv0start.cc | 12 ++++----- 16 files changed, 141 insertions(+), 110 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index a28b1885fe4..fb1d1ce54ae 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -272,7 +272,7 @@ mtflush_service_io( { wrk_t *work_item = NULL; ulint n_flushed=0; - ib_time_t max_wait_usecs = 5000000; + ib_time_t max_wait_usecs = 50000; mtflush_io->wt_status = WTHR_SIG_WAITING; work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, max_wait_usecs); @@ -300,7 +300,8 @@ mtflush_service_io( ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_EXIT; ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); - break; + mtflush_io->wt_status = WTHR_KILL_IT; + return; case MT_WRK_WRITE: work_item->wi_status = WRK_ITEM_START; @@ -346,11 +347,11 @@ DECLARE_THREAD(mtflush_io_thread)( #ifdef UNIV_DEBUG ib_uint64_t stat_universal_num_processed = 0; ib_uint64_t stat_cycle_num_processed = 0; - wrk_t* work_item = mtflush_io[0].work_item; + wrk_t* work_item = mtflush_io[0].work_item; ulint i; #endif - while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + while (TRUE) { mtflush_service_io(mtflush_io); #ifdef UNIV_DEBUG @@ -365,12 +366,9 @@ DECLARE_THREAD(mtflush_io_thread)( stat_cycle_num_processed); mtflu_print_thread_stat(work_item); #endif - } - - /* This should make sure that all current work items are - processed before threads exit. 
*/ - while (!ib_wqueue_is_empty(mtflush_io->wq)) { - mtflush_service_io(mtflush_io); + if (mtflush_io->wt_status == WTHR_KILL_IT) { + break; + } } os_thread_exit(NULL); @@ -385,16 +383,21 @@ void buf_mtflu_io_thread_exit(void) /*==========================*/ { - ulint i; + long i; thread_sync_t* mtflush_io = mtflush_ctx; ut_a(mtflush_io != NULL); - fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", + /* Confirm if the io-thread KILL is in progress, bailout */ + if (mtflush_io->wt_status == WTHR_KILL_IT) { + return; + } + + fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", srv_buf_pool_instances); /* Send one exit work item/thread */ - for (i=0; i < srv_buf_pool_instances; i++) { + for (i=0; i < srv_mtflush_threads; i++) { mtflush_io->work_item[i].wr.buf_pool = NULL; mtflush_io->work_item[i].rd.page_pool = NULL; mtflush_io->work_item[i].tsk = MT_WRK_NONE; @@ -407,14 +410,14 @@ buf_mtflu_io_thread_exit(void) /* Wait until all work items on a work queue are processed */ while(!ib_wqueue_is_empty(mtflush_io->wq)) { - /* Wait about 1/2 sec */ - os_thread_sleep(50000); + /* Wait */ + os_thread_sleep(500000); } ut_a(ib_wqueue_is_empty(mtflush_io->wq)); /* Collect all work done items */ - for (i=0; i < srv_buf_pool_instances;) { + for (i=0; i < srv_mtflush_threads;) { wrk_t* work_item; work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); @@ -558,11 +561,13 @@ buf_mtflu_flush_work_items( if((int)done_wi->id_usr == -1 && done_wi->wi_status == WRK_ITEM_SET ) { +#ifdef UNIV_DEBUG fprintf(stderr, "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); ut_ad(0); +#endif } n_flushed+= done_wi->n_flushed; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 812aa0cfe83..4999a202bd6 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -9769,14 +9769,6 @@ ha_innobase::check_table_options( /* Check page compression requirements */ if (options->page_compressed) { - if (!srv_compress_pages) { - push_warning( - thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, - "InnoDB: PAGE_COMPRESSED requires" - "innodb_compress_pages not enabled"); - return "PAGE_COMPRESSED"; - } if (row_format == ROW_TYPE_COMPRESSED) { push_warning( @@ -16587,11 +16579,6 @@ static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, NULL, NULL, FALSE); #endif /* UNIV_DEBUG */ -static MYSQL_SYSVAR_BOOL(compress_pages, srv_compress_pages, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Use page compression.", - NULL, NULL, FALSE); - static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, PLUGIN_VAR_OPCMDARG , "How many percent of compressed pages should be trimmed", @@ -16614,6 +16601,15 @@ static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, NULL, NULL, FALSE); #endif /* HAVE_LZ4 */ +static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, + PLUGIN_VAR_RQCMDARG, + "Number of multi-threaded flush threads", + NULL, NULL, + MTFLUSH_DEFAULT_WORKER, /* Default setting */ + 1, /* Minimum setting */ + MTFLUSH_MAX_WORKER, /* Max setting */ + 0); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16759,13 +16755,13 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(limit_optimistic_insert_debug), MYSQL_SYSVAR(trx_purge_view_update_only_debug), #endif /* UNIV_DEBUG */ - MYSQL_SYSVAR(compress_pages), MYSQL_SYSVAR(trim_pct), MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), #ifdef 
HAVE_LZ4 MYSQL_SYSVAR(use_lz4), #endif + MYSQL_SYSVAR(mtflush_threads), NULL }; diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 73fc9ac56fd..2be68e37dc8 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -700,7 +700,7 @@ dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } @@ -708,7 +708,7 @@ dict_sys_tables_type_validate( /* Make sure there are no bits that we do not know about. */ if (unused) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, unused %lu\n", type, unused); return(ULINT_UNDEFINED); } @@ -724,7 +724,7 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", type, zip_ssize); return(ULINT_UNDEFINED); } @@ -735,14 +735,14 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } @@ -760,13 +760,20 @@ dict_sys_tables_type_validate( low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs || !page_compression) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" "InnoDB: Error: atomic_blobs %lu\n", type, page_compression, page_compression_level, atomic_blobs); return(ULINT_UNDEFINED); } } + /* Validate that the atomic writes number is within allowed range. */ + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu\n", + type, atomic_writes); + return(ULINT_UNDEFINED); + } + /* Return the validated SYS_TABLES.TYPE. */ return(type); } diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index cb12d556ec4..fb253370b6e 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -67,13 +67,14 @@ fsp_flags_is_valid( ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); - atomic_writes_t awrites = (atomic_writes_t)atomic_writes; DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); /* fsp_flags is zero unless atomic_blobs is set. */ /* Make sure there are no bits that we do not know about. 
*/ if (unused != 0 || flags == 1) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted unused %lu\n", + flags, unused); return(false); } else if (post_antelope) { /* The Antelope row formats REDUNDANT and COMPACT did @@ -81,6 +82,8 @@ fsp_flags_is_valid( 4-byte field is zero for Antelope row formats. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_blobs %lu\n", + flags, atomic_blobs); return(false); } } @@ -92,10 +95,14 @@ fsp_flags_is_valid( externally stored parts. */ if (post_antelope || zip_ssize != 0) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu atomic_blobs %lu\n", + flags, zip_ssize, atomic_blobs); return(false); } } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu max %d\n", + flags, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(false); } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { @@ -103,9 +110,13 @@ fsp_flags_is_valid( be zero for an original 16k page size. Validate the page shift size is within allowed range. */ + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu\n", + flags, page_ssize, UNIV_PAGE_SSIZE_MAX); return(false); } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu:%d\n", + flags, page_ssize, UNIV_PAGE_SIZE, UNIV_PAGE_SIZE_ORIG); return(false); } @@ -113,13 +124,16 @@ fsp_flags_is_valid( to be set */ if (page_compression_level || page_compression) { if (!page_compression || !atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_compression %lu\n" + "InnoDB: Error: page_compression_level %lu atomic_blobs %lu\n", + flags, page_compression, page_compression_level, atomic_blobs); return(false); } } - if ((awrites == ATOMIC_WRITES_ON || - (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) - && !atomic_blobs) { + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_writes %lu\n", + flags, atomic_writes); return (false); } diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 0ffb966d9a3..725aaf9553d 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -236,9 +236,6 @@ use simulated aio we build below with threads. 
Currently we support native aio on windows and linux */ extern my_bool srv_use_native_aio; -/* Is page compression used */ -extern my_bool srv_compress_pages; - /* Is page compression used only for index pages */ extern my_bool srv_page_compress_index_pages; @@ -259,6 +256,7 @@ extern my_bool srv_use_lz4; /* Number of flush threads */ #define MTFLUSH_MAX_WORKER 64 +#define MTFLUSH_DEFAULT_WORKER 8 extern long srv_mtflush_threads; #ifdef __WIN__ diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 06c1a8c6ed4..683cd78b901 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -6153,6 +6153,7 @@ os_file_trim( ulint len) /*!< in: length of area */ { +#define SECT_SIZE 512 size_t trim_len = UNIV_PAGE_SIZE - len; os_offset_t off = slot->offset + len; @@ -6184,6 +6185,7 @@ os_file_trim( #ifdef __linux__ #if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + trim_len = (trim_len & ~(SECT_SIZE - 1)) + SECT_SIZE; int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); if (ret) { @@ -6252,7 +6254,6 @@ os_file_trim( } #endif -#define SECT_SIZE 512 srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE)); srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8))); srv_stats.page_compressed_trim_op.inc(); diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index fa1675f7a17..92cfda1c65e 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -146,9 +146,6 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; -/* If this flag is TRUE, then we will use page compression -to the pages */ -UNIV_INTERN my_bool srv_compress_pages = FALSE; /* If this flag is TRUE, then we will use page compression only for index pages */ UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; @@ -163,7 +160,7 @@ UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; /* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ UNIV_INTERN my_bool srv_use_lz4 = FALSE; /* Number of threads used for multi-threaded flush */ -UNIV_INTERN long srv_mtflush_threads = 0; +UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; #ifdef __WIN__ /* Windows native condition variables. 
We use runtime loading / function diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 879b2335720..a469dac8296 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -2593,11 +2593,7 @@ files_checked: if (!srv_read_only_mode) { - if (srv_buf_pool_instances <= MTFLUSH_MAX_WORKER) { - srv_mtflush_threads = srv_buf_pool_instances; - } - /* else we default to 8 worker-threads */ - + /* Start multi-threaded flush threads */ mtflush_ctx = buf_mtflu_handler_init(srv_mtflush_threads, srv_buf_pool_instances); @@ -2607,7 +2603,8 @@ files_checked: (thread_ids + 6 + 32)); #if UNIV_DEBUG - fprintf(stderr, "%s:%d buf-pool-instances:%lu\n", __FILE__, __LINE__, srv_buf_pool_instances); + fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", + __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); #endif os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); @@ -2879,7 +2876,7 @@ innobase_shutdown_for_mysql(void) buf_mtflu_io_thread_exit(); #ifdef UNIV_DEBUG - fprintf(stderr, "%s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); + fprintf(stderr, "InnoDB: Note: %s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); #endif os_mutex_enter(os_sync_mutex); diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index 5b4d285be21..beb46cc2813 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -278,7 +278,7 @@ mtflush_service_io( { wrk_t *work_item = NULL; ulint n_flushed=0; - ib_time_t max_wait_usecs = 5000000; + ib_time_t max_wait_usecs = 50000; mtflush_io->wt_status = WTHR_SIG_WAITING; work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, max_wait_usecs); @@ -306,7 +306,8 @@ mtflush_service_io( ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_EXIT; ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); - break; + mtflush_io->wt_status = WTHR_KILL_IT; + return; case MT_WRK_WRITE: work_item->wi_status = WRK_ITEM_START; @@ -352,11 +353,11 @@ DECLARE_THREAD(mtflush_io_thread)( #ifdef UNIV_DEBUG ib_uint64_t stat_universal_num_processed = 0; ib_uint64_t stat_cycle_num_processed = 0; - wrk_t* work_item = mtflush_io[0].work_item; + wrk_t* work_item = mtflush_io[0].work_item; ulint i; #endif - while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + while (TRUE) { mtflush_service_io(mtflush_io); #ifdef UNIV_DEBUG @@ -371,12 +372,9 @@ DECLARE_THREAD(mtflush_io_thread)( stat_cycle_num_processed); mtflu_print_thread_stat(work_item); #endif - } - - /* This should make sure that all current work items are - processed before threads exit. 
*/ - while (!ib_wqueue_is_empty(mtflush_io->wq)) { - mtflush_service_io(mtflush_io); + if (mtflush_io->wt_status == WTHR_KILL_IT) { + break; + } } os_thread_exit(NULL); @@ -391,16 +389,21 @@ void buf_mtflu_io_thread_exit(void) /*==========================*/ { - ulint i; + long i; thread_sync_t* mtflush_io = mtflush_ctx; ut_a(mtflush_io != NULL); - fprintf(stderr, "signal page_comp_io_threads to exit [%lu]\n", + /* Confirm if the io-thread KILL is in progress, bailout */ + if (mtflush_io->wt_status == WTHR_KILL_IT) { + return; + } + + fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", srv_buf_pool_instances); /* Send one exit work item/thread */ - for (i=0; i < srv_buf_pool_instances; i++) { + for (i=0; i < srv_mtflush_threads; i++) { mtflush_io->work_item[i].wr.buf_pool = NULL; mtflush_io->work_item[i].rd.page_pool = NULL; mtflush_io->work_item[i].tsk = MT_WRK_NONE; @@ -413,14 +416,14 @@ buf_mtflu_io_thread_exit(void) /* Wait until all work items on a work queue are processed */ while(!ib_wqueue_is_empty(mtflush_io->wq)) { - /* Wait about 1/2 sec */ - os_thread_sleep(50000); + /* Wait */ + os_thread_sleep(500000); } ut_a(ib_wqueue_is_empty(mtflush_io->wq)); /* Collect all work done items */ - for (i=0; i < srv_buf_pool_instances;) { + for (i=0; i < srv_mtflush_threads;) { wrk_t* work_item; work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); @@ -564,11 +567,13 @@ buf_mtflu_flush_work_items( if((int)done_wi->id_usr == -1 && done_wi->wi_status == WRK_ITEM_SET ) { +#ifdef UNIV_DEBUG fprintf(stderr, "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); ut_ad(0); +#endif } n_flushed+= done_wi->n_flushed; diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 557872abdf0..f26ad436190 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -10294,14 +10294,6 @@ ha_innobase::check_table_options( /* Check page compression requirements */ if (options->page_compressed) { - if (!srv_compress_pages) { - push_warning( - thd, Sql_condition::WARN_LEVEL_WARN, - HA_WRONG_CREATE_OPTION, - "InnoDB: PAGE_COMPRESSED requires" - "innodb_compress_pages not enabled"); - return "PAGE_COMPRESSED"; - } if (row_format == ROW_TYPE_COMPRESSED) { push_warning( @@ -17942,11 +17934,6 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace, "Print stacktrace on long semaphore wait (off by default supported only on linux)", NULL, NULL, FALSE); -static MYSQL_SYSVAR_BOOL(compress_pages, srv_compress_pages, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Use page compression.", - NULL, NULL, FALSE); - static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, PLUGIN_VAR_OPCMDARG , "How many percent of compressed pages should be trimmed", @@ -17975,6 +17962,15 @@ static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, NULL, NULL, FALSE); #endif /* HAVE_LZ4 */ +static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, + PLUGIN_VAR_RQCMDARG, + "Number of multi-threaded flush threads", + NULL, NULL, + MTFLUSH_DEFAULT_WORKER, /* Default setting */ + 1, /* Minimum setting */ + MTFLUSH_MAX_WORKER, /* Max setting */ + 0); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), MYSQL_SYSVAR(additional_mem_pool_size), @@ -18165,13 +18161,13 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(fake_changes), MYSQL_SYSVAR(locking_fake_changes), MYSQL_SYSVAR(use_stacktrace), - MYSQL_SYSVAR(compress_pages), MYSQL_SYSVAR(trim_pct), 
MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), #ifdef HAVE_LZ4 MYSQL_SYSVAR(use_lz4), #endif + MYSQL_SYSVAR(mtflush_threads), NULL }; diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index d0fbb0d33d2..d37db209beb 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -704,7 +704,7 @@ dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } @@ -712,7 +712,7 @@ dict_sys_tables_type_validate( /* Make sure there are no bits that we do not know about. */ if (unused) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, unused %lu\n", type, unused); return(ULINT_UNDEFINED); } @@ -728,7 +728,7 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", type, zip_ssize); return(ULINT_UNDEFINED); } @@ -739,14 +739,14 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } @@ -764,13 +764,20 @@ dict_sys_tables_type_validate( low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs || !page_compression) { - fprintf(stderr, "InnoDB Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" "InnoDB: Error: atomic_blobs %lu\n", type, page_compression, page_compression_level, atomic_blobs); return(ULINT_UNDEFINED); } } + /* Validate that the atomic writes number is within allowed range. */ + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu\n", + type, atomic_writes); + return(ULINT_UNDEFINED); + } + /* Return the validated SYS_TABLES.TYPE. */ return(type); } diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic index bc46967fab0..3563f5ef372 100644 --- a/storage/xtradb/include/fsp0fsp.ic +++ b/storage/xtradb/include/fsp0fsp.ic @@ -67,13 +67,14 @@ fsp_flags_is_valid( ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); - atomic_writes_t awrites = (atomic_writes_t)atomic_writes; DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); /* fsp_flags is zero unless atomic_blobs is set. */ /* Make sure there are no bits that we do not know about. 
*/ if (unused != 0 || flags == 1) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted unused %lu\n", + flags, unused); return(false); } else if (post_antelope) { /* The Antelope row formats REDUNDANT and COMPACT did @@ -81,6 +82,8 @@ fsp_flags_is_valid( 4-byte field is zero for Antelope row formats. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_blobs %lu\n", + flags, atomic_blobs); return(false); } } @@ -92,10 +95,14 @@ fsp_flags_is_valid( externally stored parts. */ if (post_antelope || zip_ssize != 0) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu atomic_blobs %lu\n", + flags, zip_ssize, atomic_blobs); return(false); } } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu max %d\n", + flags, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(false); } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { @@ -103,9 +110,13 @@ fsp_flags_is_valid( be zero for an original 16k page size. Validate the page shift size is within allowed range. */ + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu\n", + flags, page_ssize, UNIV_PAGE_SSIZE_MAX); return(false); } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu:%d\n", + flags, page_ssize, UNIV_PAGE_SIZE, UNIV_PAGE_SIZE_ORIG); return(false); } @@ -117,13 +128,16 @@ fsp_flags_is_valid( to be set */ if (page_compression_level || page_compression) { if (!page_compression || !atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_compression %lu\n" + "InnoDB: Error: page_compression_level %lu atomic_blobs %lu\n", + flags, page_compression, page_compression_level, atomic_blobs); return(false); } } - if ((awrites == ATOMIC_WRITES_ON || - (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) - && !atomic_blobs) { + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_writes %lu\n", + flags, atomic_writes); return (false); } diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index f7de92d2288..bfb59865841 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -256,9 +256,6 @@ extern ibool srv_use_native_conditions; #endif /* __WIN__ */ #endif /* !UNIV_HOTBACKUP */ -/* Is page compression used */ -extern my_bool srv_compress_pages; - /* Is page compression used only for index pages */ extern my_bool srv_page_compress_index_pages; @@ -279,6 +276,7 @@ extern my_bool srv_use_lz4; /* Number of flush threads */ #define MTFLUSH_MAX_WORKER 64 +#define MTFLUSH_DEFAULT_WORKER 8 extern long srv_mtflush_threads; /** Server undo tablespaces directory, can be absolute path. 
*/ diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 43adf78c63c..d9a5be2b049 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -6208,6 +6208,7 @@ os_file_trim( ulint len) /*!< in: length of area */ { +#define SECT_SIZE 512 size_t trim_len = UNIV_PAGE_SIZE - len; os_offset_t off = slot->offset + len; @@ -6239,6 +6240,7 @@ os_file_trim( #ifdef __linux__ #if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + trim_len = (trim_len & ~(SECT_SIZE - 1)) + SECT_SIZE; int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); if (ret) { @@ -6307,7 +6309,6 @@ os_file_trim( } #endif -#define SECT_SIZE 512 srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE)); srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8))); srv_stats.page_compressed_trim_op.inc(); diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index 7bd1ef52951..d6801b701ae 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -161,9 +161,6 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; -/* If this flag is TRUE, then we will use page compression -to the pages */ -UNIV_INTERN my_bool srv_compress_pages = FALSE; /* If this flag is TRUE, then we will use page compression only for index pages */ UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; @@ -181,7 +178,7 @@ UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; /* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ UNIV_INTERN my_bool srv_use_lz4 = FALSE; /* Number of threads used for multi-threaded flush */ -UNIV_INTERN long srv_mtflush_threads = 0; +UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 29afd0d0c98..bb539569e9a 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -2718,22 +2718,20 @@ files_checked: } if (!srv_read_only_mode) { - if (srv_buf_pool_instances <= MTFLUSH_MAX_WORKER) { - srv_mtflush_threads = srv_buf_pool_instances; - } + /* Start multi-threaded flush threads */ mtflush_ctx = buf_mtflu_handler_init(srv_mtflush_threads, srv_buf_pool_instances); /* Set up the thread ids */ buf_mtflu_set_thread_ids(srv_mtflush_threads, mtflush_ctx, - (thread_ids + 6 + 32)); + (thread_ids + 6 + SRV_MAX_N_PURGE_THREADS)); #if UNIV_DEBUG - fprintf(stderr, "%s:%d buf-pool-instances:%lu\n", __FILE__, __LINE__, srv_buf_pool_instances); + fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", + __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); #endif - /* JAN: TODO: END */ os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); } @@ -3011,7 +3009,7 @@ innobase_shutdown_for_mysql(void) buf_mtflu_io_thread_exit(); #ifdef UNIV_DEBUG - fprintf(stderr, "%s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); + fprintf(stderr, "InnoDB: Note: %s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); #endif os_mutex_enter(os_sync_mutex); From dfc295035609c669e699f1df07d60495d6b8dbb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 13 Feb 2014 12:23:55 +0200 Subject: [PATCH 22/56] Fixed small issue with dictionary. 
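The small issue is the ut_a(dict_sys_tables_type_validate(type, 0)) assertion that an earlier commit in this series put into dict_tf_to_sys_tables_type(): the validator's second argument is meant to be the matching SYS_TABLES.N_COLS value (it is what tells the Redundant and Compact formats apart), so passing a constant 0 does not describe the table being converted, and the assertion is simply removed. Where validation is wanted, a sketch of the caller-side check, assuming n_cols is the value read from SYS_TABLES:

    if (dict_sys_tables_type_validate(type, n_cols) == ULINT_UNDEFINED) {
            /* corrupted dictionary entry: refuse to use these flags */
    }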
--- storage/innobase/include/dict0dict.ic | 2 -- storage/xtradb/include/dict0dict.ic | 2 -- 2 files changed, 4 deletions(-) diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 2be68e37dc8..045d1185ebd 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -1020,8 +1020,6 @@ dict_tf_to_sys_tables_type( | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL | DICT_TF_MASK_ATOMIC_WRITES); - ut_a(dict_sys_tables_type_validate(type, 0)); - return(type); } diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index d37db209beb..ab9241c29a7 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -1024,8 +1024,6 @@ dict_tf_to_sys_tables_type( | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL | DICT_TF_MASK_ATOMIC_WRITES); - ut_a(dict_sys_tables_type_validate(type, 0)); - return(type); } From 9c614665ee78028b9cf2edfe043373b4f6f0ff3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 13 Feb 2014 12:35:37 +0200 Subject: [PATCH 23/56] Fixed compiler warnings. --- storage/innobase/include/dict0dict.ic | 2 +- storage/xtradb/include/dict0dict.ic | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 045d1185ebd..d1cfdb0b8f7 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -888,7 +888,7 @@ dict_tf_set( } *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); - ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); + ut_a(dict_tf_get_atomic_writes(*flags) == awrites); if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index ab9241c29a7..2dc449bac4d 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -896,7 +896,7 @@ dict_tf_set( } *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); - ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); + ut_a(dict_tf_get_atomic_writes(*flags) == awrites); } /********************************************************************//** From cae21c52f604ba804f07f858edae5a930978d820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 14 Feb 2014 15:02:26 +0200 Subject: [PATCH 24/56] Fix timing on queues, this could clearly lead to starvation. --- storage/innobase/buf/buf0mtflu.cc | 11 ++++++----- storage/xtradb/buf/buf0mtflu.cc | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index fb1d1ce54ae..ee53e52a9cf 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -51,6 +51,8 @@ Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com #include "fil0pagecompress.h" #define MT_COMP_WATER_MARK 50 +/** Time to wait for a message. 
*/ +#define MT_WAIT_IN_USECS 5000000 /* Work item status */ typedef enum wrk_status { @@ -272,10 +274,9 @@ mtflush_service_io( { wrk_t *work_item = NULL; ulint n_flushed=0; - ib_time_t max_wait_usecs = 50000; mtflush_io->wt_status = WTHR_SIG_WAITING; - work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, max_wait_usecs); + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; @@ -411,7 +412,7 @@ buf_mtflu_io_thread_exit(void) /* Wait until all work items on a work queue are processed */ while(!ib_wqueue_is_empty(mtflush_io->wq)) { /* Wait */ - os_thread_sleep(500000); + os_thread_sleep(MT_WAIT_IN_USECS * 2); } ut_a(ib_wqueue_is_empty(mtflush_io->wq)); @@ -420,7 +421,7 @@ buf_mtflu_io_thread_exit(void) for (i=0; i < srv_mtflush_threads;) { wrk_t* work_item; - work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); /* If we receive reply to work item and it's status is exit, thead has processed this message and existed */ @@ -550,7 +551,7 @@ buf_mtflu_flush_work_items( /* wait on the completion to arrive */ for(i=0; i< buf_pool_inst;) { - done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, 50000); + done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, MT_WAIT_IN_USECS); if (done_wi != NULL) { if(done_wi->n_flushed == 0) { diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index beb46cc2813..1f43e84a12f 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -51,6 +51,8 @@ Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com #include "fil0pagecompress.h" #define MT_COMP_WATER_MARK 50 +/** Time to wait for a message. */ +#define MT_WAIT_IN_USECS 5000000 /* Work item status */ typedef enum wrk_status { @@ -278,10 +280,9 @@ mtflush_service_io( { wrk_t *work_item = NULL; ulint n_flushed=0; - ib_time_t max_wait_usecs = 50000; mtflush_io->wt_status = WTHR_SIG_WAITING; - work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, max_wait_usecs); + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; @@ -417,7 +418,7 @@ buf_mtflu_io_thread_exit(void) /* Wait until all work items on a work queue are processed */ while(!ib_wqueue_is_empty(mtflush_io->wq)) { /* Wait */ - os_thread_sleep(500000); + os_thread_sleep(MT_WAIT_IN_USECS * 2); } ut_a(ib_wqueue_is_empty(mtflush_io->wq)); @@ -426,7 +427,7 @@ buf_mtflu_io_thread_exit(void) for (i=0; i < srv_mtflush_threads;) { wrk_t* work_item; - work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, 50000); + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); /* If we receive reply to work item and it's status is exit, thead has processed this message and existed */ @@ -556,7 +557,7 @@ buf_mtflu_flush_work_items( /* wait on the completion to arrive */ for(i=0; i< buf_pool_inst;) { - done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, 50000); + done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, MT_WAIT_IN_USECS); if (done_wi != NULL) { if(done_wi->n_flushed == 0) { From 25318038a92872492036e8eb5da9363f22d1b7c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Sat, 15 Feb 2014 09:51:06 +0200 Subject: [PATCH 25/56] Fixed hang seen on TPC-C measure phase. We should not use timedwait on threads waiting for a job. They should sleep and let other threads to their work. 
At shutdown, we know that we put "work" and that is handled as soon as possible. --- storage/innobase/buf/buf0mtflu.cc | 6 +++--- storage/xtradb/buf/buf0mtflu.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index ee53e52a9cf..3750dbaa13e 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -276,12 +276,12 @@ mtflush_service_io( ulint n_flushed=0; mtflush_io->wt_status = WTHR_SIG_WAITING; - work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); + work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; } else { - /* Because of timeout this thread did not get any work */ + /* Thread did not get any work */ mtflush_io->wt_status = WTHR_NO_WORK; return; } @@ -551,7 +551,7 @@ buf_mtflu_flush_work_items( /* wait on the completion to arrive */ for(i=0; i< buf_pool_inst;) { - done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, MT_WAIT_IN_USECS); + done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); if (done_wi != NULL) { if(done_wi->n_flushed == 0) { diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index 1f43e84a12f..55902cc7a58 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -282,12 +282,12 @@ mtflush_service_io( ulint n_flushed=0; mtflush_io->wt_status = WTHR_SIG_WAITING; - work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); + work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; } else { - /* Because of timeout this thread did not get any work */ + /* Thread did not get any work */ mtflush_io->wt_status = WTHR_NO_WORK; return; } @@ -557,7 +557,7 @@ buf_mtflu_flush_work_items( /* wait on the completion to arrive */ for(i=0; i< buf_pool_inst;) { - done_wi = (wrk_t *)ib_wqueue_timedwait(mtflush_ctx->wr_cq, MT_WAIT_IN_USECS); + done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); if (done_wi != NULL) { if(done_wi->n_flushed == 0) { From 24bc0314c2b8ba373d970f15d5fba52c02cd01d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 19 Feb 2014 20:25:55 +0200 Subject: [PATCH 26/56] Removed unnecessary memory initialization of page compressed buffer and added guard against unalligned trim size. --- storage/innobase/os/os0file.cc | 7 ++----- storage/xtradb/os/os0file.cc | 6 ++---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 683cd78b901..32f469ac240 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -4482,10 +4482,6 @@ found: ut_ad(slot->page_buf); - /* Write buffer full of zeros, this is needed for trim, - can't really avoid this now. 
*/ - memset(slot->page_buf, 0, len); - tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len); /* If compression succeeded, set up the length and buffer */ @@ -6155,6 +6151,8 @@ os_file_trim( #define SECT_SIZE 512 size_t trim_len = UNIV_PAGE_SIZE - len; + // len here should be alligned to sector size + ut_a(trim_len == ((trim_len + SECT_SIZE-1) & ~(SECT_SIZE-1))); os_offset_t off = slot->offset + len; // Nothing to do if trim length is zero or if actual write @@ -6185,7 +6183,6 @@ os_file_trim( #ifdef __linux__ #if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) - trim_len = (trim_len & ~(SECT_SIZE - 1)) + SECT_SIZE; int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); if (ret) { diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index d9a5be2b049..2dec28b71f3 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -4585,10 +4585,6 @@ found: ut_ad(slot->page_buf); - /* Write buffer full of zeros, this is needed for trim, - can't really avoid this now. */ - memset(slot->page_buf, 0, len); - tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len); /* If compression succeeded, set up the length and buffer */ @@ -6210,6 +6206,8 @@ os_file_trim( #define SECT_SIZE 512 size_t trim_len = UNIV_PAGE_SIZE - len; + // len here should be alligned to sector size + ut_a(trim_len == ((trim_len + SECT_SIZE-1) & ~(SECT_SIZE-1))); os_offset_t off = slot->offset + len; // Nothing to do if trim length is zero or if actual write From 3c7714301718cc1b18847829582b3e3b71be940e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 21 Feb 2014 10:20:18 +0200 Subject: [PATCH 27/56] Write size was not correctly alligned to SECT_SIZE. This lead to situation where trim corrupted the database. Fixed the issue and added temporal guards against unalligned write/trim. 
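The fix described here is the standard power-of-two round-up: the compressed payload length is rounded up to the next multiple of the 512-byte sector before the write is issued, so that both the write and the trim of the remaining page tail fall on sector boundaries. A minimal standalone sketch of that rounding and of the guard this patch adds follows; the helper name and the sample length are invented for illustration, only the SECT_SIZE constant and the mask expression come from the patch itself.

#include <cassert>
#include <cstdio>

/* Illustrative helper: round len up to the next multiple of sect_size.
   This works because sect_size (512 in the patch) is a power of two. */
static unsigned long align_up(unsigned long len, unsigned long sect_size)
{
    return (len + sect_size - 1) & ~(sect_size - 1);
}

int main()
{
    const unsigned long SECT_SIZE = 512;  /* same constant as in the patch */
    unsigned long write_size = 6421;      /* hypothetical compressed length */

    if (write_size % SECT_SIZE) {
        write_size = align_up(write_size, SECT_SIZE);
    }

    /* Mirrors the ut_a((write_size % SECT_SIZE) == 0) guard added here. */
    assert(write_size % SECT_SIZE == 0);
    printf("aligned write size: %lu\n", write_size);  /* prints 6656 */
    return 0;
}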
--- storage/innobase/fil/fil0pagecompress.cc | 9 ++++++--- storage/innobase/os/os0file.cc | 5 +++-- storage/xtradb/fil/fil0pagecompress.cc | 9 ++++++--- storage/xtradb/os/os0file.cc | 5 +++-- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 26e975bddf3..dfa216d0ae2 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -188,9 +188,13 @@ fil_compress_page( #endif /* UNIV_DEBUG */ write_size+=header_len; + +#define SECT_SIZE 512 + /* Actual write needs to be alligned on block size */ - if (write_size % OS_FILE_LOG_BLOCK_SIZE) { - write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + if (write_size % SECT_SIZE) { + write_size = (write_size + SECT_SIZE-1) & ~(SECT_SIZE-1); + ut_a((write_size % SECT_SIZE) == 0); } #ifdef UNIV_DEBUG @@ -199,7 +203,6 @@ fil_compress_page( space_id, fil_space_name(space), len, write_size); #endif -#define SECT_SIZE 512 srv_stats.page_compression_saved.add((len - write_size)); if ((len - write_size) > 0) { diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 32f469ac240..cdd8a68b4d4 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -6151,9 +6151,10 @@ os_file_trim( #define SECT_SIZE 512 size_t trim_len = UNIV_PAGE_SIZE - len; - // len here should be alligned to sector size - ut_a(trim_len == ((trim_len + SECT_SIZE-1) & ~(SECT_SIZE-1))); os_offset_t off = slot->offset + len; + // len here should be alligned to sector size + ut_a((trim_len % SECT_SIZE) == 0); + ut_a((len % SECT_SIZE) == 0); // Nothing to do if trim length is zero or if actual write // size is initialized and it is smaller than current write size. diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index 8f835113b7f..05dcf372112 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -184,9 +184,13 @@ fil_compress_page( #endif /* UNIV_DEBUG */ write_size+=header_len; + +#define SECT_SIZE 512 + /* Actual write needs to be alligned on block size */ - if (write_size % OS_FILE_LOG_BLOCK_SIZE) { - write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + if (write_size % SECT_SIZE) { + write_size = (write_size + SECT_SIZE-1) & ~(SECT_SIZE-1); + ut_a((write_size % SECT_SIZE) == 0); } #ifdef UNIV_DEBUG @@ -195,7 +199,6 @@ fil_compress_page( space_id, fil_space_name(space), len, write_size); #endif /* UNIV_DEBUG */ -#define SECT_SIZE 512 srv_stats.page_compression_saved.add((len - write_size)); if ((len - write_size) > 0) { diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 2dec28b71f3..72ceed1debc 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -6206,9 +6206,10 @@ os_file_trim( #define SECT_SIZE 512 size_t trim_len = UNIV_PAGE_SIZE - len; - // len here should be alligned to sector size - ut_a(trim_len == ((trim_len + SECT_SIZE-1) & ~(SECT_SIZE-1))); os_offset_t off = slot->offset + len; + // len here should be alligned to sector size + ut_a((trim_len % SECT_SIZE) == 0); + ut_a((len % SECT_SIZE) == 0); // Nothing to do if trim length is zero or if actual write // size is initialized and it is smaller than current write size. 
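Patches 26 and 27 both protect the trim path: os_file_trim() punches out the unused tail of the page on Linux with fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE), deallocating the trimmed range while leaving the file size unchanged. The standalone sketch below shows that call in isolation, assuming a Linux filesystem with hole-punching support (for example ext4 or XFS); the file name and the sizes are made up for illustration, and error handling is reduced to perror().

#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#ifdef __linux__
# include <linux/falloc.h>   /* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */
#endif

int main()
{
    const size_t page_size  = 16384;   /* illustrative full page size */
    const size_t write_size = 6656;    /* sector-aligned compressed length */
    const size_t trim_len   = page_size - write_size;

    char buf[16384];                   /* matches page_size above */
    memset(buf, 'x', sizeof(buf));

    int fd = open("trim_demo.ibd", O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd < 0) { perror("open"); return 1; }

    /* Write one full page, then punch a hole over its unused tail while
       keeping the file size unchanged, as os_file_trim() does. */
    if (pwrite(fd, buf, page_size, 0) != (ssize_t) page_size) {
        perror("pwrite");
        return 1;
    }

#if defined(__linux__) && defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  (off_t) write_size, (off_t) trim_len)) {
        perror("fallocate");           /* e.g. filesystem without hole punching */
    }
#endif

    struct stat st;
    fstat(fd, &st);
    printf("file size %lld, allocated blocks %lld\n",
           (long long) st.st_size, (long long) st.st_blocks);
    close(fd);
    return 0;
}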
From 38471a6d6aa6ed96cac056794a1c5ee22d861c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 21 Feb 2014 12:51:03 +0200 Subject: [PATCH 28/56] Remove incorrect trim_len calculation. We have already alligned actual page data write. --- storage/xtradb/os/os0file.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 72ceed1debc..945b4e788c5 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -6239,7 +6239,6 @@ os_file_trim( #ifdef __linux__ #if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) - trim_len = (trim_len & ~(SECT_SIZE - 1)) + SECT_SIZE; int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); if (ret) { From 24235e99d83170f1802875f020179cc5dcda3182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Tue, 25 Feb 2014 13:15:55 +0200 Subject: [PATCH 29/56] Fixed memory leak on queue nodes by using local memory heap on normal execution and global memory heap on shutdown. Added a funcition to get work items from queue without waiting and additional info when there is no work to do for a extended periods. --- storage/innobase/buf/buf0mtflu.cc | 156 +++++++---------------- storage/innobase/include/ut0wqueue.h | 9 ++ storage/innobase/ut/ut0wqueue.cc | 32 +++++ storage/xtradb/buf/buf0mtflu.cc | 177 ++++++++++----------------- storage/xtradb/include/ut0list.h | 9 ++ storage/xtradb/include/ut0list.ic | 20 +++ storage/xtradb/include/ut0wqueue.h | 17 +++ storage/xtradb/ut/ut0wqueue.cc | 49 ++++++++ 8 files changed, 248 insertions(+), 221 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index 3750dbaa13e..19dfc883ca0 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -113,6 +113,8 @@ typedef struct wrk_itm os_thread_id_t id_usr; /*!< Thread-id currently working */ wrk_status_t wi_status; /*!< Work item status */ struct wrk_itm *next; /*!< Next work item */ + mem_heap_t *wheap; /*!< Heap were to allocate memory + for queue nodes */ } wrk_t; /* Thread syncronization data */ @@ -127,39 +129,12 @@ typedef struct thread_sync wthr_status_t wt_status; /*!< Worker thread status */ mem_heap_t* wheap; /*!< Work heap where memory is allocated */ - wrk_t* work_item; /*!< Array of work-items that are - individually accessed by multiple - threads. Items are accessed in a - thread safe manner.*/ } thread_sync_t; -/* TODO: REALLY NEEDED ? */ static int mtflush_work_initialized = -1; static os_fast_mutex_t mtflush_mtx; static thread_sync_t* mtflush_ctx=NULL; -/******************************************************************//** -Initialize work items. */ -static -void -mtflu_setup_work_items( -/*===================*/ - wrk_t* work_items, /*!< inout: Work items */ - ulint n_items) /*!< in: Number of work items */ -{ - ulint i; - for(i=0; in_flushed; } -#ifdef UNIV_DEBUG -/******************************************************************//** -Print flush statistics of work items. 
-*/ -static -void -mtflu_print_thread_stat( -/*====================*/ - wrk_t* work_item) /*!< in: Work items */ -{ - ulint stat_tot=0; - ulint i=0; - - for(i=0; i< MTFLUSH_MAX_WORKER; i++) { - stat_tot+=work_item[i].n_flushed; - - fprintf(stderr, "MTFLUSH: Thread[%lu] stat [%lu]\n", - work_item[i].id_usr, - work_item[i].n_flushed); - - if (work_item[i].next == NULL) { - break; /* No more filled work items */ - } - } - fprintf(stderr, "MTFLUSH: Stat-Total:%lu\n", stat_tot); -} -#endif /* UNIV_DEBUG */ - /******************************************************************//** Worker function to wait for work items and processing them and sending reply back. @@ -276,7 +223,12 @@ mtflush_service_io( ulint n_flushed=0; mtflush_io->wt_status = WTHR_SIG_WAITING; - work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); + + work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); + + if (work_item == NULL) { + work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); + } if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; @@ -300,7 +252,7 @@ mtflush_service_io( case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_EXIT; - ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); mtflush_io->wt_status = WTHR_KILL_IT; return; @@ -314,16 +266,11 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); break; case MT_WRK_READ: - /* Need to also handle the read case */ - /* TODO: ? */ ut_a(0); - /* completed task get added to rd_cq */ - /* work_item->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(mtflush_io->rd_cq, work_item, mtflush_io->wheap);*/ break; default: @@ -348,25 +295,12 @@ DECLARE_THREAD(mtflush_io_thread)( #ifdef UNIV_DEBUG ib_uint64_t stat_universal_num_processed = 0; ib_uint64_t stat_cycle_num_processed = 0; - wrk_t* work_item = mtflush_io[0].work_item; ulint i; #endif while (TRUE) { mtflush_service_io(mtflush_io); -#ifdef UNIV_DEBUG - for(i=0; i < MTFLUSH_MAX_WORKER; i++) { - stat_cycle_num_processed+= work_item[i].n_flushed; - } - - stat_universal_num_processed+=stat_cycle_num_processed; - stat_cycle_num_processed = 0; - fprintf(stderr, "MTFLUSH_IO_THREAD: total %lu cycle %lu\n", - stat_universal_num_processed, - stat_cycle_num_processed); - mtflu_print_thread_stat(work_item); -#endif if (mtflush_io->wt_status == WTHR_KILL_IT) { break; } @@ -386,26 +320,31 @@ buf_mtflu_io_thread_exit(void) { long i; thread_sync_t* mtflush_io = mtflush_ctx; + wrk_t* work_item; ut_a(mtflush_io != NULL); + /* Allocate work items for shutdown message */ + work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + /* Confirm if the io-thread KILL is in progress, bailout */ if (mtflush_io->wt_status == WTHR_KILL_IT) { return; } fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", - srv_buf_pool_instances); + srv_mtflush_threads); /* Send one exit work item/thread */ for (i=0; i < srv_mtflush_threads; i++) { - mtflush_io->work_item[i].wr.buf_pool = NULL; - mtflush_io->work_item[i].rd.page_pool = NULL; - mtflush_io->work_item[i].tsk = MT_WRK_NONE; - mtflush_io->work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wr.buf_pool = NULL; + work_item[i].rd.page_pool = NULL; + work_item[i].tsk = MT_WRK_NONE; + work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wheap = mtflush_io->wheap; 
ib_wqueue_add(mtflush_io->wq, - (void *)&(mtflush_io->work_item[i]), + (void *)&(work_item[i]), mtflush_io->wheap); } @@ -431,7 +370,7 @@ buf_mtflu_io_thread_exit(void) } /* Wait about 1/2 sec to allow threads really exit */ - os_thread_sleep(50000); + os_thread_sleep(5000000); ut_a(ib_wqueue_is_empty(mtflush_io->wq)); ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); @@ -462,7 +401,6 @@ buf_mtflu_handler_init( ib_wqueue_t* mtflush_work_queue; ib_wqueue_t* mtflush_write_comp_queue; ib_wqueue_t* mtflush_read_comp_queue; - wrk_t* work_items; os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); @@ -481,14 +419,6 @@ buf_mtflu_handler_init( mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, MTFLUSH_MAX_WORKER * sizeof(thread_sync_t)); ut_a(mtflush_ctx != NULL); - work_items = (wrk_t*)mem_heap_alloc(mtflush_heap, - MTFLUSH_MAX_WORKER * sizeof(wrk_t)); - ut_a(work_items != NULL); - memset(work_items, 0, sizeof(wrk_t) * MTFLUSH_MAX_WORKER); - memset(mtflush_ctx, 0, sizeof(thread_sync_t) * MTFLUSH_MAX_WORKER); - - /* Initialize work items */ - mtflu_setup_work_items(work_items, n_threads); /* Create threads for page-compression-flush */ for(i=0; i < n_threads; i++) { @@ -499,7 +429,6 @@ buf_mtflu_handler_init( mtflush_ctx[i].rd_cq = mtflush_read_comp_queue; mtflush_ctx[i].wheap = mtflush_heap; mtflush_ctx[i].wt_status = WTHR_INITIALIZED; - mtflush_ctx[i].work_item = work_items; mtflush_ctx[i].wthread = os_thread_create( mtflush_io_thread, @@ -533,20 +462,28 @@ buf_mtflu_flush_work_items( { ulint n_flushed=0, i; wrk_t *done_wi; + mem_heap_t* work_heap; + wrk_t* work_item; + + /* Allocate heap where all work items used and queue + node items areallocated */ + work_heap = mem_heap_create(0); + work_item = (wrk_t*)mem_heap_alloc(work_heap, sizeof(wrk_t)*buf_pool_inst); for(i=0;iwork_item[i].tsk = MT_WRK_WRITE; - mtflush_ctx->work_item[i].rd.page_pool = NULL; - mtflush_ctx->work_item[i].wr.buf_pool = buf_pool_from_array(i); - mtflush_ctx->work_item[i].wr.flush_type = flush_type; - mtflush_ctx->work_item[i].wr.min = min_n; - mtflush_ctx->work_item[i].wr.lsn_limit = lsn_limit; - mtflush_ctx->work_item[i].id_usr = -1; - mtflush_ctx->work_item[i].wi_status = WRK_ITEM_SET; + work_item[i].tsk = MT_WRK_WRITE; + work_item[i].rd.page_pool = NULL; + work_item[i].wr.buf_pool = buf_pool_from_array(i); + work_item[i].wr.flush_type = flush_type; + work_item[i].wr.min = min_n; + work_item[i].wr.lsn_limit = lsn_limit; + work_item[i].id_usr = -1; + work_item[i].wi_status = WRK_ITEM_SET; + work_item[i].wheap = work_heap; ib_wqueue_add(mtflush_ctx->wq, - (void *)(&(mtflush_ctx->work_item[i])), - mtflush_ctx->wheap); + (void *)(&(work_item[i])), + work_heap); } /* wait on the completion to arrive */ @@ -554,21 +491,15 @@ buf_mtflu_flush_work_items( done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); if (done_wi != NULL) { - if(done_wi->n_flushed == 0) { - per_pool_pages_flushed[i] = 0; - } else { - per_pool_pages_flushed[i] = done_wi->n_flushed; - } + per_pool_pages_flushed[i] = done_wi->n_flushed; if((int)done_wi->id_usr == -1 && done_wi->wi_status == WRK_ITEM_SET ) { -#ifdef UNIV_DEBUG fprintf(stderr, "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); - ut_ad(0); -#endif + ut_a(0); } n_flushed+= done_wi->n_flushed; @@ -576,6 +507,9 @@ buf_mtflu_flush_work_items( } } + /* Release used work_items and queue nodes */ + mem_heap_free(work_heap); + return(n_flushed); } diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h index 33385ddf2d4..bbbbd3b146b 100644 
--- a/storage/innobase/include/ut0wqueue.h +++ b/storage/innobase/include/ut0wqueue.h @@ -95,6 +95,15 @@ ib_wqueue_timedwait( ib_wqueue_t* wq, /* in: work queue */ ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*data : NULL); } +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq) /*mutex); + + if(!ib_list_is_empty(wq->items)) { + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + } + } + + /* We must reset the event when the list + gets emptied. */ + if(ib_list_is_empty(wq->items)) { + os_event_reset(wq->event); + } + + mutex_exit(&wq->mutex); + + return (node ? node->data : NULL); +} + /******************************************************************** Check if queue is empty. */ diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index 55902cc7a58..35a15bd5a14 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -113,6 +113,8 @@ typedef struct wrk_itm os_thread_id_t id_usr; /*!< Thread-id currently working */ wrk_status_t wi_status; /*!< Work item status */ struct wrk_itm *next; /*!< Next work item */ + mem_heap_t *wheap; /*!< Heap were to allocate memory + for queue nodes */ } wrk_t; /* Thread syncronization data */ @@ -127,39 +129,12 @@ typedef struct thread_sync wthr_status_t wt_status; /*!< Worker thread status */ mem_heap_t* wheap; /*!< Work heap where memory is allocated */ - wrk_t* work_item; /*!< Array of work-items that are - individually accessed by multiple - threads. Items are accessed in a - thread safe manner.*/ } thread_sync_t; -/* TODO: REALLY NEEDED ? */ static int mtflush_work_initialized = -1; static os_fast_mutex_t mtflush_mtx; static thread_sync_t* mtflush_ctx=NULL; -/******************************************************************//** -Initialize work items. */ -static -void -mtflu_setup_work_items( -/*===================*/ - wrk_t* work_items, /*!< inout: Work items */ - ulint n_items) /*!< in: Number of work items */ -{ - ulint i; - for(i=0; in_flushed; } -#ifdef UNIV_DEBUG -/******************************************************************//** -Print flush statistics of work items -*/ -static -void -mtflu_print_thread_stat( -/*====================*/ - wrk_t* work_item) /*!< in: Work items */ -{ - ulint stat_tot=0; - ulint i=0; - - for(i=0; i< MTFLUSH_MAX_WORKER; i++) { - stat_tot+=work_item[i].n_flushed; - - fprintf(stderr, "MTFLUSH: Thread[%lu] stat [%lu]\n", - work_item[i].id_usr, - work_item[i].n_flushed); - - if (work_item[i].next == NULL) { - break; /* No more filled work items */ - } - } - - fprintf(stderr, "MTFLUSH: Stat-Total:%lu\n", stat_tot); -} -#endif /* UNIV_DEBUG */ - /******************************************************************//** Worker function to wait for work items and processing them and sending reply back. 
@@ -282,7 +228,12 @@ mtflush_service_io( ulint n_flushed=0; mtflush_io->wt_status = WTHR_SIG_WAITING; - work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); + + work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); + + if (work_item == NULL) { + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); + } if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; @@ -306,7 +257,7 @@ mtflush_service_io( case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_EXIT; - ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); mtflush_io->wt_status = WTHR_KILL_IT; return; @@ -320,16 +271,11 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(mtflush_io->wr_cq, work_item, mtflush_io->wheap); + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); break; case MT_WRK_READ: - /* Need to also handle the read case */ - /* TODO: ? */ ut_a(0); - /* completed task get added to rd_cq */ - /* work_item->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(mtflush_io->rd_cq, work_item, mtflush_io->wheap);*/ break; default: @@ -351,28 +297,36 @@ DECLARE_THREAD(mtflush_io_thread)( void * arg) { thread_sync_t *mtflush_io = ((thread_sync_t *)arg); + ulint n_timeout = 0; #ifdef UNIV_DEBUG ib_uint64_t stat_universal_num_processed = 0; ib_uint64_t stat_cycle_num_processed = 0; - wrk_t* work_item = mtflush_io[0].work_item; ulint i; #endif while (TRUE) { + fprintf(stderr, "InnoDB: Note. Thread %lu work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); + mtflush_service_io(mtflush_io); -#ifdef UNIV_DEBUG - for(i=0; i < MTFLUSH_MAX_WORKER; i++) { - stat_cycle_num_processed+= work_item[i].n_flushed; + if (mtflush_io->wt_status == WTHR_NO_WORK) { + n_timeout++; + + if (n_timeout > 10) { + fprintf(stderr, "InnoDB: Note: Thread %lu has not received " + " work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); + n_timeout = 0; + } + } else { + n_timeout = 0; } - stat_universal_num_processed+=stat_cycle_num_processed; - stat_cycle_num_processed = 0; - fprintf(stderr, "MTFLUSH_IO_THREAD: total %lu cycle %lu\n", - stat_universal_num_processed, - stat_cycle_num_processed); - mtflu_print_thread_stat(work_item); -#endif if (mtflush_io->wt_status == WTHR_KILL_IT) { break; } @@ -392,26 +346,31 @@ buf_mtflu_io_thread_exit(void) { long i; thread_sync_t* mtflush_io = mtflush_ctx; + wrk_t* work_item; ut_a(mtflush_io != NULL); + /* Allocate work items for shutdown message */ + work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + /* Confirm if the io-thread KILL is in progress, bailout */ if (mtflush_io->wt_status == WTHR_KILL_IT) { return; } fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", - srv_buf_pool_instances); + srv_mtflush_threads); /* Send one exit work item/thread */ for (i=0; i < srv_mtflush_threads; i++) { - mtflush_io->work_item[i].wr.buf_pool = NULL; - mtflush_io->work_item[i].rd.page_pool = NULL; - mtflush_io->work_item[i].tsk = MT_WRK_NONE; - mtflush_io->work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wr.buf_pool = NULL; + work_item[i].rd.page_pool = NULL; + work_item[i].tsk = MT_WRK_NONE; + work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wheap = mtflush_io->wheap; ib_wqueue_add(mtflush_io->wq, 
- (void *)&(mtflush_io->work_item[i]), + (void *)&(work_item[i]), mtflush_io->wheap); } @@ -437,7 +396,7 @@ buf_mtflu_io_thread_exit(void) } /* Wait about 1/2 sec to allow threads really exit */ - os_thread_sleep(50000); + os_thread_sleep(5000000); ut_a(ib_wqueue_is_empty(mtflush_io->wq)); ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); @@ -468,7 +427,6 @@ buf_mtflu_handler_init( ib_wqueue_t* mtflush_work_queue; ib_wqueue_t* mtflush_write_comp_queue; ib_wqueue_t* mtflush_read_comp_queue; - wrk_t* work_items; os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); @@ -487,14 +445,6 @@ buf_mtflu_handler_init( mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, MTFLUSH_MAX_WORKER * sizeof(thread_sync_t)); ut_a(mtflush_ctx != NULL); - work_items = (wrk_t*)mem_heap_alloc(mtflush_heap, - MTFLUSH_MAX_WORKER * sizeof(wrk_t)); - ut_a(work_items != NULL); - memset(work_items, 0, sizeof(wrk_t) * MTFLUSH_MAX_WORKER); - memset(mtflush_ctx, 0, sizeof(thread_sync_t) * MTFLUSH_MAX_WORKER); - - /* Initialize work items */ - mtflu_setup_work_items(work_items, n_threads); /* Create threads for page-compression-flush */ for(i=0; i < n_threads; i++) { @@ -505,7 +455,6 @@ buf_mtflu_handler_init( mtflush_ctx[i].rd_cq = mtflush_read_comp_queue; mtflush_ctx[i].wheap = mtflush_heap; mtflush_ctx[i].wt_status = WTHR_INITIALIZED; - mtflush_ctx[i].work_item = work_items; mtflush_ctx[i].wthread = os_thread_create( mtflush_io_thread, @@ -539,20 +488,28 @@ buf_mtflu_flush_work_items( { ulint n_flushed=0, i; wrk_t *done_wi; + mem_heap_t* work_heap; + wrk_t* work_item; + + /* Allocate heap where all work items used and queue + node items areallocated */ + work_heap = mem_heap_create(0); + work_item = (wrk_t*)mem_heap_alloc(work_heap, sizeof(wrk_t)*buf_pool_inst); for(i=0;iwork_item[i].tsk = MT_WRK_WRITE; - mtflush_ctx->work_item[i].rd.page_pool = NULL; - mtflush_ctx->work_item[i].wr.buf_pool = buf_pool_from_array(i); - mtflush_ctx->work_item[i].wr.flush_type = flush_type; - mtflush_ctx->work_item[i].wr.min = min_n; - mtflush_ctx->work_item[i].wr.lsn_limit = lsn_limit; - mtflush_ctx->work_item[i].id_usr = -1; - mtflush_ctx->work_item[i].wi_status = WRK_ITEM_SET; + work_item[i].tsk = MT_WRK_WRITE; + work_item[i].rd.page_pool = NULL; + work_item[i].wr.buf_pool = buf_pool_from_array(i); + work_item[i].wr.flush_type = flush_type; + work_item[i].wr.min = min_n; + work_item[i].wr.lsn_limit = lsn_limit; + work_item[i].id_usr = -1; + work_item[i].wi_status = WRK_ITEM_SET; + work_item[i].wheap = work_heap; ib_wqueue_add(mtflush_ctx->wq, - (void *)(&(mtflush_ctx->work_item[i])), - mtflush_ctx->wheap); + (void *)(&(work_item[i])), + work_heap); } /* wait on the completion to arrive */ @@ -560,21 +517,15 @@ buf_mtflu_flush_work_items( done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); if (done_wi != NULL) { - if(done_wi->n_flushed == 0) { - per_pool_pages_flushed[i] = 0; - } else { - per_pool_pages_flushed[i] = done_wi->n_flushed; - } + per_pool_pages_flushed[i] = done_wi->n_flushed; if((int)done_wi->id_usr == -1 && done_wi->wi_status == WRK_ITEM_SET ) { -#ifdef UNIV_DEBUG fprintf(stderr, "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); - ut_ad(0); -#endif + ut_a(0); } n_flushed+= done_wi->n_flushed; @@ -582,6 +533,12 @@ buf_mtflu_flush_work_items( } } + ut_a(ib_wqueue_is_empty(mtflush_ctx->wq)); + ut_a(ib_wqueue_is_empty(mtflush_ctx->wr_cq)); + + /* Release used work_items and queue nodes */ + mem_heap_free(work_heap); + return(n_flushed); } diff --git a/storage/xtradb/include/ut0list.h 
b/storage/xtradb/include/ut0list.h index 29fc8669ce4..b1035bad099 100644 --- a/storage/xtradb/include/ut0list.h +++ b/storage/xtradb/include/ut0list.h @@ -150,6 +150,15 @@ ib_list_is_empty( /* out: TRUE if empty else */ const ib_list_t* list); /* in: list */ +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list); /*first || list->last)); } + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h index 33385ddf2d4..6513f4982c0 100644 --- a/storage/xtradb/include/ut0wqueue.h +++ b/storage/xtradb/include/ut0wqueue.h @@ -95,6 +95,23 @@ ib_wqueue_timedwait( ib_wqueue_t* wq, /* in: work queue */ ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*data : NULL); } +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq) /*mutex); + + if(!ib_list_is_empty(wq->items)) { + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + } + } + + /* We must reset the event when the list + gets emptied. */ + if(ib_list_is_empty(wq->items)) { + os_event_reset(wq->event); + } + + mutex_exit(&wq->mutex); + + return (node ? node->data : NULL); +} + /******************************************************************** Check if queue is empty. */ @@ -173,3 +205,20 @@ ib_wqueue_is_empty( { return(ib_list_is_empty(wq->items)); } + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq) /*mutex); + len = ib_list_len(wq->items); + mutex_exit(&wq->mutex); + + return(len); +} From b620e7368f05af52f3fa1a759bc446140baf7b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 26 Feb 2014 19:00:24 +0200 Subject: [PATCH 30/56] Small fixes to work_item handling. 
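The work_item handling that this patch and the previous one adjust follows a simple producer/consumer protocol: the flush coordinator allocates work items from a per-call heap, marks them WRK_ITEM_SET and enqueues them, while each worker thread first polls the work queue without blocking (ib_wqueue_nowait()) and only falls back to a blocking wait when the queue is empty. The sketch below shows the same poll-then-block consumer pattern with the C++ standard library; it illustrates the idea only, all names in it are invented and it is not InnoDB code.

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>

/* Toy work queue: try_pop() plays the role of ib_wqueue_nowait(),
   wait_pop() the role of ib_wqueue_wait(). Names are illustrative. */
struct work_queue {
    std::mutex              mtx;
    std::condition_variable cv;
    std::queue<int>         items;

    void push(int wi) {
        std::lock_guard<std::mutex> g(mtx);
        items.push(wi);
        cv.notify_one();
    }
    bool try_pop(int& wi) {            /* non-blocking poll */
        std::lock_guard<std::mutex> g(mtx);
        if (items.empty()) return false;
        wi = items.front(); items.pop();
        return true;
    }
    int wait_pop() {                   /* blocking wait for work */
        std::unique_lock<std::mutex> l(mtx);
        cv.wait(l, [this]{ return !items.empty(); });
        int wi = items.front(); items.pop();
        return wi;
    }
};

int main()
{
    work_queue wq;

    std::thread worker([&wq] {
        for (;;) {
            int wi;
            /* Poll first, block only if the queue was empty. */
            if (!wq.try_pop(wi)) wi = wq.wait_pop();
            if (wi < 0) break;         /* exit message, like MT_WRK_NONE */
            printf("processed work item %d\n", wi);
        }
    });

    for (int i = 0; i < 4; i++) wq.push(i);
    wq.push(-1);                       /* ask the worker to exit */
    worker.join();
    return 0;
}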
--- storage/innobase/buf/buf0mtflu.cc | 19 ++++++++----------- storage/xtradb/buf/buf0mtflu.cc | 19 ++++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index 19dfc883ca0..62ed3f539e2 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -56,13 +56,14 @@ Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com /* Work item status */ typedef enum wrk_status { - WRK_ITEM_SET=0, /*!< Work item is set */ + WRK_ITEM_UNSET=0, /*!< Work item is not set */ WRK_ITEM_START=1, /*!< Processing of work item has started */ WRK_ITEM_DONE=2, /*!< Processing is done usually set to SUCCESS/FAILED */ WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ WRK_ITEM_FAILED=3, /*!< Work item process failed */ WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_SET=5, /*!< Work item is set */ WRK_ITEM_STATUS_UNDEFINED } wrk_status_t; @@ -179,9 +180,7 @@ buf_mtflu_flush_pool_instance( pools based on the assumption that it will help in the retry which will follow the failure. */ -#ifdef UNIV_DEBUG - fprintf(stderr, "flush start failed.\n"); -#endif + fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); return 0; } @@ -257,12 +256,10 @@ mtflush_service_io( return; case MT_WRK_WRITE: + ut_a(work_item->wi_status == WRK_ITEM_SET); work_item->wi_status = WRK_ITEM_START; /* Process work item */ if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { -#ifdef UNIV_DEBUG - fprintf(stderr, "No pages flushed\n"); -#endif work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; @@ -320,7 +317,7 @@ buf_mtflu_io_thread_exit(void) { long i; thread_sync_t* mtflush_io = mtflush_ctx; - wrk_t* work_item; + wrk_t* work_item = NULL; ut_a(mtflush_io != NULL); @@ -358,7 +355,7 @@ buf_mtflu_io_thread_exit(void) /* Collect all work done items */ for (i=0; i < srv_mtflush_threads;) { - wrk_t* work_item; + wrk_t* work_item = NULL; work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); @@ -461,9 +458,8 @@ buf_mtflu_flush_work_items( number does not exceed min_n) */ { ulint n_flushed=0, i; - wrk_t *done_wi; mem_heap_t* work_heap; - wrk_t* work_item; + wrk_t* work_item=NULL; /* Allocate heap where all work items used and queue node items areallocated */ @@ -488,6 +484,7 @@ buf_mtflu_flush_work_items( /* wait on the completion to arrive */ for(i=0; i< buf_pool_inst;) { + wrk_t *done_wi = NULL; done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); if (done_wi != NULL) { diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index 35a15bd5a14..eeb9bf36c86 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -56,13 +56,14 @@ Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com /* Work item status */ typedef enum wrk_status { - WRK_ITEM_SET=0, /*!< Work item is set */ + WRK_ITEM_UNSET=0, /*!< Work item is not set */ WRK_ITEM_START=1, /*!< Processing of work item has started */ WRK_ITEM_DONE=2, /*!< Processing is done usually set to SUCCESS/FAILED */ WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ WRK_ITEM_FAILED=3, /*!< Work item process failed */ WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_SET=5, /*!< Work item is set */ WRK_ITEM_STATUS_UNDEFINED } wrk_status_t; @@ -181,9 +182,7 @@ buf_mtflu_flush_pool_instance( pools based on the assumption that it will help in the retry which will follow the failure. 
*/ -#ifdef UNIV_DEBUG - fprintf(stderr, "flush start failed.\n"); -#endif + fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); return 0; } @@ -262,12 +261,10 @@ mtflush_service_io( return; case MT_WRK_WRITE: + ut_a(work_item->wi_status == WRK_ITEM_SET); work_item->wi_status = WRK_ITEM_START; /* Process work item */ if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { -#ifdef UNIV_DEBUG - fprintf(stderr, "No pages flushed\n"); -#endif work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; @@ -346,7 +343,7 @@ buf_mtflu_io_thread_exit(void) { long i; thread_sync_t* mtflush_io = mtflush_ctx; - wrk_t* work_item; + wrk_t* work_item = NULL; ut_a(mtflush_io != NULL); @@ -384,7 +381,7 @@ buf_mtflu_io_thread_exit(void) /* Collect all work done items */ for (i=0; i < srv_mtflush_threads;) { - wrk_t* work_item; + wrk_t* work_item = NULL; work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); @@ -487,9 +484,8 @@ buf_mtflu_flush_work_items( number does not exceed min_n) */ { ulint n_flushed=0, i; - wrk_t *done_wi; mem_heap_t* work_heap; - wrk_t* work_item; + wrk_t* work_item=NULL; /* Allocate heap where all work items used and queue node items areallocated */ @@ -514,6 +510,7 @@ buf_mtflu_flush_work_items( /* wait on the completion to arrive */ for(i=0; i< buf_pool_inst;) { + wrk_t *done_wi = NULL; done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); if (done_wi != NULL) { From c88a0d48c6624466d058282bf7e2e8279660564e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 28 Feb 2014 08:53:09 +0200 Subject: [PATCH 31/56] Temporal fix for flush thread hang. Added option to disable multi-threaded flush with innodb_use_mtflush = 0 option, by default multi-threaded flush is used. Updated innochecksum tool, still it does not support new checksums. --- extra/CMakeLists.txt | 16 +- extra/innochecksum.c | 325 --------------------- extra/innochecksum.cc | 396 ++++++++++++++++++++++++++ storage/innobase/buf/buf0mtflu.cc | 55 +++- storage/innobase/handler/ha_innodb.cc | 6 + storage/innobase/include/fil0fil.h | 4 +- storage/innobase/include/srv0srv.h | 5 + storage/innobase/include/ut0list.h | 9 + storage/innobase/include/ut0list.ic | 20 ++ storage/innobase/include/ut0wqueue.h | 8 + storage/innobase/srv/srv0srv.cc | 4 +- storage/innobase/srv/srv0start.cc | 29 +- storage/innobase/ut/ut0wqueue.cc | 17 ++ storage/xtradb/buf/buf0mtflu.cc | 34 ++- storage/xtradb/handler/ha_innodb.cc | 6 + storage/xtradb/include/fil0fil.h | 4 +- storage/xtradb/include/srv0srv.h | 5 + storage/xtradb/include/ut0list.h | 2 +- storage/xtradb/include/ut0list.ic | 2 +- storage/xtradb/include/ut0wqueue.h | 2 +- storage/xtradb/srv/srv0srv.cc | 4 +- storage/xtradb/srv/srv0start.cc | 31 +- 22 files changed, 614 insertions(+), 370 deletions(-) delete mode 100644 extra/innochecksum.c create mode 100644 extra/innochecksum.cc diff --git a/extra/CMakeLists.txt b/extra/CMakeLists.txt index f8f71b00743..cf3a35cb1dd 100644 --- a/extra/CMakeLists.txt +++ b/extra/CMakeLists.txt @@ -72,10 +72,24 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") ENDIF() ENDIF() +IF(WITH_INNOBASE_STORAGE_ENGINE) + # Add path to the InnoDB headers + INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include) + # We use the InnoDB code directly in case the code changes. 
+ ADD_DEFINITIONS("-DUNIV_INNOCHECKSUM") + SET(INNOBASE_SOURCES + ../storage/innobase/buf/buf0checksum.cc + ../storage/innobase/ut/ut0crc32.cc + ../storage/innobase/ut/ut0ut.cc + ) + MYSQL_ADD_EXECUTABLE(innochecksum innochecksum.cc ${INNOBASE_SOURCES}) + TARGET_LINK_LIBRARIES(innochecksum mysys mysys_ssl) +ENDIF() + MYSQL_ADD_EXECUTABLE(replace replace.c COMPONENT Server) TARGET_LINK_LIBRARIES(replace mysys) + IF(UNIX) - MYSQL_ADD_EXECUTABLE(innochecksum innochecksum.c) MYSQL_ADD_EXECUTABLE(resolve_stack_dump resolve_stack_dump.c) TARGET_LINK_LIBRARIES(resolve_stack_dump mysys) diff --git a/extra/innochecksum.c b/extra/innochecksum.c deleted file mode 100644 index ed4dfc48789..00000000000 --- a/extra/innochecksum.c +++ /dev/null @@ -1,325 +0,0 @@ -/* - Copyright (c) 2005, 2011, Oracle and/or its affiliates - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; version 2 of the License. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*/ - -/* - InnoDB offline file checksum utility. 85% of the code in this file - was taken wholesale fron the InnoDB codebase. - - The final 15% was originally written by Mark Smith of Danga - Interactive, Inc. - - Published with a permission. -*/ - -#include -#include -#include -#include -#include -#include -#include - -/* all of these ripped from InnoDB code from MySQL 4.0.22 */ -#define UT_HASH_RANDOM_MASK 1463735687 -#define UT_HASH_RANDOM_MASK2 1653893711 -#define FIL_PAGE_LSN 16 -#define FIL_PAGE_FILE_FLUSH_LSN 26 -#define FIL_PAGE_OFFSET 4 -#define FIL_PAGE_DATA 38 -#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 -#define FIL_PAGE_SPACE_OR_CHKSUM 0 -#define UNIV_PAGE_SIZE (2 * 8192) - -/* command line argument to do page checks (that's it) */ -/* another argument to specify page ranges... seek to right spot and go from there */ - -typedef unsigned long int ulint; - -/* innodb function in name; modified slightly to not have the ASM version (lots of #ifs that didn't apply) */ -ulint mach_read_from_4(uchar *b) -{ - return( ((ulint)(b[0]) << 24) - + ((ulint)(b[1]) << 16) - + ((ulint)(b[2]) << 8) - + (ulint)(b[3]) - ); -} - -ulint -ut_fold_ulint_pair( -/*===============*/ - /* out: folded value */ - ulint n1, /* in: ulint */ - ulint n2) /* in: ulint */ -{ - return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1) - ^ UT_HASH_RANDOM_MASK) + n2); -} - -ulint -ut_fold_binary( -/*===========*/ - /* out: folded value */ - uchar* str, /* in: string of bytes */ - ulint len) /* in: length */ -{ - ulint i; - ulint fold= 0; - - for (i= 0; i < len; i++) - { - fold= ut_fold_ulint_pair(fold, (ulint)(*str)); - - str++; - } - - return(fold); -} - -ulint -buf_calc_page_new_checksum( -/*=======================*/ - /* out: checksum */ - uchar* page) /* in: buffer page */ -{ - ulint checksum; - - /* Since the fields FIL_PAGE_FILE_FLUSH_LSN and ..._ARCH_LOG_NO - are written outside the buffer pool to the first pages of data - files, we have to skip them in the page checksum calculation. 
- We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the - checksum is stored, and also the last 8 bytes of page because - there we store the old formula checksum. */ - - checksum= ut_fold_binary(page + FIL_PAGE_OFFSET, - FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET) - + ut_fold_binary(page + FIL_PAGE_DATA, - UNIV_PAGE_SIZE - FIL_PAGE_DATA - - FIL_PAGE_END_LSN_OLD_CHKSUM); - checksum= checksum & 0xFFFFFFFF; - - return(checksum); -} - -ulint -buf_calc_page_old_checksum( -/*=======================*/ - /* out: checksum */ - uchar* page) /* in: buffer page */ -{ - ulint checksum; - - checksum= ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); - - checksum= checksum & 0xFFFFFFFF; - - return(checksum); -} - - -int main(int argc, char **argv) -{ - FILE *f; /* our input file */ - uchar *p; /* storage of pages read */ - int bytes; /* bytes read count */ - ulint ct; /* current page number (0 based) */ - int now; /* current time */ - int lastt; /* last time */ - ulint oldcsum, oldcsumfield, csum, csumfield, logseq, logseqfield; /* ulints for checksum storage */ - struct stat st; /* for stat, if you couldn't guess */ - unsigned long long int size; /* size of file (has to be 64 bits) */ - ulint pages; /* number of pages in file */ - ulint start_page= 0, end_page= 0, use_end_page= 0; /* for starting and ending at certain pages */ - off_t offset= 0; - int just_count= 0; /* if true, just print page count */ - int verbose= 0; - int debug= 0; - int c; - int fd; - - /* remove arguments */ - while ((c= getopt(argc, argv, "cvds:e:p:")) != -1) - { - switch (c) - { - case 'v': - verbose= 1; - break; - case 'c': - just_count= 1; - break; - case 's': - start_page= atoi(optarg); - break; - case 'e': - end_page= atoi(optarg); - use_end_page= 1; - break; - case 'p': - start_page= atoi(optarg); - end_page= atoi(optarg); - use_end_page= 1; - break; - case 'd': - debug= 1; - break; - case ':': - fprintf(stderr, "option -%c requires an argument\n", optopt); - return 1; - break; - case '?': - fprintf(stderr, "unrecognized option: -%c\n", optopt); - return 1; - break; - } - } - - /* debug implies verbose... */ - if (debug) verbose= 1; - - /* make sure we have the right arguments */ - if (optind >= argc) - { - printf("InnoDB offline file checksum utility.\n"); - printf("usage: %s [-c] [-s ] [-e ] [-p ] [-v] [-d] \n", argv[0]); - printf("\t-c\tprint the count of pages in the file\n"); - printf("\t-s n\tstart on this page number (0 based)\n"); - printf("\t-e n\tend at this page number (0 based)\n"); - printf("\t-p n\tcheck only this page (0 based)\n"); - printf("\t-v\tverbose (prints progress every 5 seconds)\n"); - printf("\t-d\tdebug mode (prints checksums for each page)\n"); - return 1; - } - - /* stat the file to get size and page count */ - if (stat(argv[optind], &st)) - { - perror("error statting file"); - return 1; - } - size= st.st_size; - pages= size / UNIV_PAGE_SIZE; - if (just_count) - { - printf("%lu\n", pages); - return 0; - } - else if (verbose) - { - printf("file %s = %llu bytes (%lu pages)...\n", argv[optind], size, pages); - printf("checking pages in range %lu to %lu\n", start_page, use_end_page ? 
end_page : (pages - 1)); - } - - /* open the file for reading */ - f= fopen(argv[optind], "r"); - if (!f) - { - perror("error opening file"); - return 1; - } - - /* seek to the necessary position */ - if (start_page) - { - fd= fileno(f); - if (!fd) - { - perror("unable to obtain file descriptor number"); - return 1; - } - - offset= (off_t)start_page * (off_t)UNIV_PAGE_SIZE; - - if (lseek(fd, offset, SEEK_SET) != offset) - { - perror("unable to seek to necessary offset"); - return 1; - } - } - - /* allocate buffer for reading (so we don't realloc every time) */ - p= (uchar *)malloc(UNIV_PAGE_SIZE); - - /* main checksumming loop */ - ct= start_page; - lastt= 0; - while (!feof(f)) - { - bytes= fread(p, 1, UNIV_PAGE_SIZE, f); - if (!bytes && feof(f)) return 0; - if (bytes != UNIV_PAGE_SIZE) - { - fprintf(stderr, "bytes read (%d) doesn't match universal page size (%d)\n", bytes, UNIV_PAGE_SIZE); - return 1; - } - - /* check the "stored log sequence numbers" */ - logseq= mach_read_from_4(p + FIL_PAGE_LSN + 4); - logseqfield= mach_read_from_4(p + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4); - if (debug) - printf("page %lu: log sequence number: first = %lu; second = %lu\n", ct, logseq, logseqfield); - if (logseq != logseqfield) - { - fprintf(stderr, "page %lu invalid (fails log sequence number check)\n", ct); - return 1; - } - - /* check old method of checksumming */ - oldcsum= buf_calc_page_old_checksum(p); - oldcsumfield= mach_read_from_4(p + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); - if (debug) - printf("page %lu: old style: calculated = %lu; recorded = %lu\n", ct, oldcsum, oldcsumfield); - if (oldcsumfield != mach_read_from_4(p + FIL_PAGE_LSN) && oldcsumfield != oldcsum) - { - fprintf(stderr, "page %lu invalid (fails old style checksum)\n", ct); - return 1; - } - - /* now check the new method */ - csum= buf_calc_page_new_checksum(p); - csumfield= mach_read_from_4(p + FIL_PAGE_SPACE_OR_CHKSUM); - if (debug) - printf("page %lu: new style: calculated = %lu; recorded = %lu\n", ct, csum, csumfield); - if (csumfield != 0 && csum != csumfield) - { - fprintf(stderr, "page %lu invalid (fails new style checksum)\n", ct); - return 1; - } - - /* end if this was the last page we were supposed to check */ - if (use_end_page && (ct >= end_page)) - return 0; - - /* do counter increase and progress printing */ - ct++; - if (verbose) - { - if (ct % 64 == 0) - { - now= time(0); - if (!lastt) lastt= now; - if (now - lastt >= 1) - { - printf("page %lu okay: %.3f%% done\n", (ct - 1), (float) ct / pages * 100); - lastt= now; - } - } - } - } - return 0; -} - diff --git a/extra/innochecksum.cc b/extra/innochecksum.cc new file mode 100644 index 00000000000..c89196b1eee --- /dev/null +++ b/extra/innochecksum.cc @@ -0,0 +1,396 @@ +/* + Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +/* + InnoDB offline file checksum utility. 
85% of the code in this utility + is included from the InnoDB codebase. + + The final 15% was originally written by Mark Smith of Danga + Interactive, Inc. + + Published with a permission. +*/ + +#include +#include +#include +#include +#include +#include +#ifndef __WIN__ +# include +#endif +#include +#include +#include /* ORACLE_WELCOME_COPYRIGHT_NOTICE */ + +/* Only parts of these files are included from the InnoDB codebase. +The parts not included are excluded by #ifndef UNIV_INNOCHECKSUM. */ + +#include "univ.i" /* include all of this */ + +#include "buf0checksum.h" /* buf_calc_page_*() */ +#include "fil0fil.h" /* FIL_* */ +#include "fsp0fsp.h" /* fsp_flags_get_page_size() & + fsp_flags_get_zip_size() */ +#include "mach0data.h" /* mach_read_from_4() */ +#include "ut0crc32.h" /* ut_crc32_init() */ + +#ifdef UNIV_NONINL +# include "fsp0fsp.ic" +# include "mach0data.ic" +# include "ut0rnd.ic" +#endif + +/* Global variables */ +static my_bool verbose; +static my_bool debug; +static my_bool just_count; +static ulong start_page; +static ulong end_page; +static ulong do_page; +static my_bool use_end_page; +static my_bool do_one_page; +ulong srv_page_size; /* replaces declaration in srv0srv.c */ +static ulong physical_page_size; /* Page size in bytes on disk. */ +static ulong logical_page_size; /* Page size when uncompressed. */ + +/* Get the page size of the filespace from the filespace header. */ +static +my_bool +get_page_size( +/*==========*/ + FILE* f, /*!< in: file pointer, must be open + and set to start of file */ + byte* buf, /*!< in: buffer used to read the page */ + ulong* logical_page_size, /*!< out: Logical/Uncompressed page size */ + ulong* physical_page_size) /*!< out: Physical/Commpressed page size */ +{ + ulong flags; + + int bytes= fread(buf, 1, UNIV_PAGE_SIZE_MIN, f); + + if (ferror(f)) + { + perror("Error reading file header"); + return FALSE; + } + + if (bytes != UNIV_PAGE_SIZE_MIN) + { + fprintf(stderr, "Error; Was not able to read the minimum page size "); + fprintf(stderr, "of %d bytes. Bytes read was %d\n", UNIV_PAGE_SIZE_MIN, bytes); + return FALSE; + } + + rewind(f); + + flags = mach_read_from_4(buf + FIL_PAGE_DATA + FSP_SPACE_FLAGS); + + /* srv_page_size is used by InnoDB code as UNIV_PAGE_SIZE */ + srv_page_size = *logical_page_size = fsp_flags_get_page_size(flags); + + /* fsp_flags_get_zip_size() will return zero if not compressed. */ + *physical_page_size = fsp_flags_get_zip_size(flags); + if (*physical_page_size == 0) + *physical_page_size= *logical_page_size; + + return TRUE; +} + + +/* command line argument to do page checks (that's it) */ +/* another argument to specify page ranges... 
seek to right spot and go from there */ + +static struct my_option innochecksum_options[] = +{ + {"help", '?', "Displays this help and exits.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"info", 'I', "Synonym for --help.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Displays version information and exits.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Verbose (prints progress every 5 seconds).", + &verbose, &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"debug", 'd', "Debug mode (prints checksums for each page, implies verbose).", + &debug, &debug, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"count", 'c', "Print the count of pages in the file.", + &just_count, &just_count, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"start_page", 's', "Start on this page number (0 based).", + &start_page, &start_page, 0, GET_ULONG, REQUIRED_ARG, + 0, 0, (longlong) 2L*1024L*1024L*1024L, 0, 1, 0}, + {"end_page", 'e', "End at this page number (0 based).", + &end_page, &end_page, 0, GET_ULONG, REQUIRED_ARG, + 0, 0, (longlong) 2L*1024L*1024L*1024L, 0, 1, 0}, + {"page", 'p', "Check only this page (0 based).", + &do_page, &do_page, 0, GET_ULONG, REQUIRED_ARG, + 0, 0, (longlong) 2L*1024L*1024L*1024L, 0, 1, 0}, + + {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +static void print_version(void) +{ + printf("%s Ver %s, for %s (%s)\n", + my_progname, INNODB_VERSION_STR, + SYSTEM_TYPE, MACHINE_TYPE); +} + +static void usage(void) +{ + print_version(); + puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2000")); + printf("InnoDB offline file checksum utility.\n"); + printf("Usage: %s [-c] [-s ] [-e ] [-p ] [-v] [-d] \n", my_progname); + my_print_help(innochecksum_options); + my_print_variables(innochecksum_options); +} + +extern "C" my_bool +innochecksum_get_one_option( +/*========================*/ + int optid, + const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch (optid) { + case 'd': + verbose=1; /* debug implies verbose... */ + break; + case 'e': + use_end_page= 1; + break; + case 'p': + end_page= start_page= do_page; + use_end_page= 1; + do_one_page= 1; + break; + case 'V': + print_version(); + exit(0); + break; + case 'I': + case '?': + usage(); + exit(0); + break; + } + return 0; +} + +static int get_options( +/*===================*/ + int *argc, + char ***argv) +{ + int ho_error; + + if ((ho_error=handle_options(argc, argv, innochecksum_options, innochecksum_get_one_option))) + exit(ho_error); + + /* The next arg must be the filename */ + if (!*argc) + { + usage(); + return 1; + } + return 0; +} /* get_options */ + + +int main(int argc, char **argv) +{ + FILE* f; /* our input file */ + char* filename; /* our input filename. 
*/ + unsigned char buf[UNIV_PAGE_SIZE_MAX]; /* Buffer to store pages read */ + ulong bytes; /* bytes read count */ + ulint ct; /* current page number (0 based) */ + time_t now; /* current time */ + time_t lastt; /* last time */ + ulint oldcsum, oldcsumfield, csum, csumfield, crc32, logseq, logseqfield; + /* ulints for checksum storage */ + struct stat st; /* for stat, if you couldn't guess */ + unsigned long long int size; /* size of file (has to be 64 bits) */ + ulint pages; /* number of pages in file */ + off_t offset= 0; + int fd; + + printf("InnoDB offline file checksum utility.\n"); + + ut_crc32_init(); + + MY_INIT(argv[0]); + + if (get_options(&argc,&argv)) + exit(1); + + if (verbose) + my_print_variables(innochecksum_options); + + /* The file name is not optional */ + filename = *argv; + if (*filename == '\0') + { + fprintf(stderr, "Error; File name missing\n"); + return 1; + } + + /* stat the file to get size and page count */ + if (stat(filename, &st)) + { + fprintf(stderr, "Error; %s cannot be found\n", filename); + return 1; + } + size= st.st_size; + + /* Open the file for reading */ + f= fopen(filename, "rb"); + if (f == NULL) + { + fprintf(stderr, "Error; %s cannot be opened", filename); + perror(" "); + return 1; + } + + if (!get_page_size(f, buf, &logical_page_size, &physical_page_size)) + { + return 1; + } + + /* This tool currently does not support Compressed tables */ + if (logical_page_size != physical_page_size) + { + fprintf(stderr, "Error; This file contains compressed pages\n"); + return 1; + } + + pages= (ulint) (size / physical_page_size); + + if (just_count) + { + if (verbose) + printf("Number of pages: "); + printf("%lu\n", pages); + return 0; + } + else if (verbose) + { + printf("file %s = %llu bytes (%lu pages)...\n", filename, size, pages); + if (do_one_page) + printf("InnoChecksum; checking page %lu\n", do_page); + else + printf("InnoChecksum; checking pages in range %lu to %lu\n", start_page, use_end_page ? 
end_page : (pages - 1)); + } + + /* seek to the necessary position */ + if (start_page) + { + fd= fileno(f); + if (!fd) + { + perror("Error; Unable to obtain file descriptor number"); + return 1; + } + + offset= (off_t)start_page * (off_t)physical_page_size; + + if (lseek(fd, offset, SEEK_SET) != offset) + { + perror("Error; Unable to seek to necessary offset"); + return 1; + } + } + + /* main checksumming loop */ + ct= start_page; + lastt= 0; + while (!feof(f)) + { + bytes= fread(buf, 1, physical_page_size, f); + if (!bytes && feof(f)) + return 0; + + if (ferror(f)) + { + fprintf(stderr, "Error reading %lu bytes", physical_page_size); + perror(" "); + return 1; + } + if (bytes != physical_page_size) + { + fprintf(stderr, "Error; bytes read (%lu) doesn't match page size (%lu)\n", bytes, physical_page_size); + return 1; + } + + /* check the "stored log sequence numbers" */ + logseq= mach_read_from_4(buf + FIL_PAGE_LSN + 4); + logseqfield= mach_read_from_4(buf + logical_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM + 4); + if (debug) + printf("page %lu: log sequence number: first = %lu; second = %lu\n", ct, logseq, logseqfield); + if (logseq != logseqfield) + { + fprintf(stderr, "Fail; page %lu invalid (fails log sequence number check)\n", ct); + return 1; + } + + /* check old method of checksumming */ + oldcsum= buf_calc_page_old_checksum(buf); + oldcsumfield= mach_read_from_4(buf + logical_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM); + if (debug) + printf("page %lu: old style: calculated = %lu; recorded = %lu\n", ct, oldcsum, oldcsumfield); + if (oldcsumfield != mach_read_from_4(buf + FIL_PAGE_LSN) && oldcsumfield != oldcsum) + { + fprintf(stderr, "Fail; page %lu invalid (fails old style checksum)\n", ct); + return 1; + } + + /* now check the new method */ + csum= buf_calc_page_new_checksum(buf); + crc32= buf_calc_page_crc32(buf); + csumfield= mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM); + if (debug) + printf("page %lu: new style: calculated = %lu; crc32 = %lu; recorded = %lu\n", + ct, csum, crc32, csumfield); + if (csumfield != 0 && crc32 != csumfield && csum != csumfield) + { + fprintf(stderr, "Fail; page %lu invalid (fails innodb and crc32 checksum)\n", ct); + return 1; + } + + /* end if this was the last page we were supposed to check */ + if (use_end_page && (ct >= end_page)) + return 0; + + /* do counter increase and progress printing */ + ct++; + if (verbose) + { + if (ct % 64 == 0) + { + now= time(0); + if (!lastt) lastt= now; + if (now - lastt >= 1) + { + printf("page %lu okay: %.3f%% done\n", (ct - 1), (float) ct / pages * 100); + lastt= now; + } + } + } + } + return 0; +} + diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index 62ed3f539e2..5b591024922 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -134,6 +134,7 @@ typedef struct thread_sync static int mtflush_work_initialized = -1; static os_fast_mutex_t mtflush_mtx; +static os_fast_mutex_t mtflush_mtx_wait; static thread_sync_t* mtflush_ctx=NULL; /******************************************************************//** @@ -180,7 +181,9 @@ buf_mtflu_flush_pool_instance( pools based on the assumption that it will help in the retry which will follow the failure. */ +#ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); +#endif return 0; } @@ -223,12 +226,16 @@ mtflush_service_io( mtflush_io->wt_status = WTHR_SIG_WAITING; + /* TODO: Temporal fix for the hang bug. This needs a real fix. 
*/ + os_fast_mutex_lock(&mtflush_mtx_wait); work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); if (work_item == NULL) { work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); } + os_fast_mutex_unlock(&mtflush_mtx_wait); + if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; } else { @@ -237,6 +244,10 @@ mtflush_service_io( return; } + if (work_item->wi_status != WRK_ITEM_EXIT) { + work_item->wi_status = WRK_ITEM_SET; + } + work_item->id_usr = os_thread_get_curr_id(); /* This works as a producer/consumer model, where in tasks are @@ -253,7 +264,7 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_EXIT; ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); mtflush_io->wt_status = WTHR_KILL_IT; - return; + break; case MT_WRK_WRITE: ut_a(work_item->wi_status == WRK_ITEM_SET); @@ -273,9 +284,9 @@ mtflush_service_io( default: /* None other than Write/Read handling planned */ ut_a(0); + break; } - mtflush_io->wt_status = WTHR_NO_WORK; } /******************************************************************//** @@ -289,6 +300,7 @@ DECLARE_THREAD(mtflush_io_thread)( void * arg) { thread_sync_t *mtflush_io = ((thread_sync_t *)arg); + ulint n_timeout = 0; #ifdef UNIV_DEBUG ib_uint64_t stat_universal_num_processed = 0; ib_uint64_t stat_cycle_num_processed = 0; @@ -296,8 +308,32 @@ DECLARE_THREAD(mtflush_io_thread)( #endif while (TRUE) { +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note. Thread %lu work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); +#endif /* UNIV_DEBUG */ + mtflush_service_io(mtflush_io); +#ifdef UNIV_DEBUG + if (mtflush_io->wt_status == WTHR_NO_WORK) { + n_timeout++; + + if (n_timeout > 10) { + fprintf(stderr, "InnoDB: Note: Thread %lu has not received " + " work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); + n_timeout = 0; + } + } else { + n_timeout = 0; + } +#endif /* UNIV_DEBUG */ + if (mtflush_io->wt_status == WTHR_KILL_IT) { break; } @@ -379,6 +415,7 @@ buf_mtflu_io_thread_exit(void) ib_wqueue_free(mtflush_io->rd_cq); os_fast_mutex_free(&mtflush_mtx); + os_fast_mutex_free(&mtflush_mtx_wait); /* Free heap */ mem_heap_free(mtflush_io->wheap); @@ -400,6 +437,7 @@ buf_mtflu_handler_init( ib_wqueue_t* mtflush_read_comp_queue; os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx_wait); /* Create heap, work queue, write completion queue, read completion queue for multi-threaded flush, and init @@ -465,16 +503,15 @@ buf_mtflu_flush_work_items( node items areallocated */ work_heap = mem_heap_create(0); work_item = (wrk_t*)mem_heap_alloc(work_heap, sizeof(wrk_t)*buf_pool_inst); + memset(work_item, 0, sizeof(wrk_t)*buf_pool_inst); for(i=0;iwq, @@ -490,14 +527,18 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; - if((int)done_wi->id_usr == -1 && - done_wi->wi_status == WRK_ITEM_SET ) { +#ifdef UNIV_DEBUG + /* TODO: Temporal fix for hang. This is really a bug. 
*/ + if((int)done_wi->id_usr == 0 && + (done_wi->wi_status == WRK_ITEM_SET || + done_wi->wi_status == WRK_ITEM_UNSET)) { fprintf(stderr, "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); ut_a(0); } +#endif n_flushed+= done_wi->n_flushed; i++; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 4999a202bd6..6b44cb96677 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16610,6 +16610,11 @@ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, MTFLUSH_MAX_WORKER, /* Max setting */ 0); +static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, + PLUGIN_VAR_OPCMDARG , + "Use multi-threaded flush. Default TRUE.", + NULL, NULL, TRUE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16762,6 +16767,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(use_lz4), #endif MYSQL_SYSVAR(mtflush_threads), + MYSQL_SYSVAR(use_mtflush), NULL }; diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 918a92fa811..37bc9ba5c86 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1008,6 +1008,8 @@ Release fil_system mutex */ void fil_system_exit(void); /*==================*/ + +#ifndef UNIV_INNOCHECKSUM /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ fil_space_t* @@ -1020,5 +1022,5 @@ char* fil_space_name( /*===========*/ fil_space_t* space); /*!< in: space */ - +#endif #endif /* fil0fil_h */ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 725aaf9553d..b4bb9c09ef6 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -257,8 +257,13 @@ extern my_bool srv_use_lz4; /* Number of flush threads */ #define MTFLUSH_MAX_WORKER 64 #define MTFLUSH_DEFAULT_WORKER 8 + +/* Number of threads used for multi-threaded flush */ extern long srv_mtflush_threads; +/* If this flag is TRUE, then we will use multi threaded flush. */ +extern my_bool srv_use_mtflush; + #ifdef __WIN__ extern ibool srv_use_native_conditions; #endif /* __WIN__ */ diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h index 29fc8669ce4..796a272db59 100644 --- a/storage/innobase/include/ut0list.h +++ b/storage/innobase/include/ut0list.h @@ -150,6 +150,15 @@ ib_list_is_empty( /* out: TRUE if empty else */ const ib_list_t* list); /* in: list */ +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list); /*first || list->last)); } + +/******************************************************************** +Get number of items on list. 
+@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h index bbbbd3b146b..9906e299808 100644 --- a/storage/innobase/include/ut0wqueue.h +++ b/storage/innobase/include/ut0wqueue.h @@ -103,6 +103,14 @@ ib_wqueue_nowait( /*=============*/ ib_wqueue_t* wq); /*items)); } + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq) /*mutex); + len = ib_list_len(wq->items); + mutex_exit(&wq->mutex); + + return(len); +} diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index eeb9bf36c86..f7da4c1c7a9 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -134,6 +134,7 @@ typedef struct thread_sync static int mtflush_work_initialized = -1; static os_fast_mutex_t mtflush_mtx; +static os_fast_mutex_t mtflush_mtx_wait; static thread_sync_t* mtflush_ctx=NULL; /******************************************************************//** @@ -182,7 +183,9 @@ buf_mtflu_flush_pool_instance( pools based on the assumption that it will help in the retry which will follow the failure. */ +#ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); +#endif return 0; } @@ -228,12 +231,16 @@ mtflush_service_io( mtflush_io->wt_status = WTHR_SIG_WAITING; + /* TODO: Temporal fix for the hang bug. This needs a real fix. */ + os_fast_mutex_lock(&mtflush_mtx_wait); work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); if (work_item == NULL) { work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); } + os_fast_mutex_unlock(&mtflush_mtx_wait); + if (work_item) { mtflush_io->wt_status = WTHR_RUNNING; } else { @@ -242,6 +249,10 @@ mtflush_service_io( return; } + if (work_item->wi_status != WRK_ITEM_EXIT) { + work_item->wi_status = WRK_ITEM_SET; + } + work_item->id_usr = os_thread_get_curr_id(); /* This works as a producer/consumer model, where in tasks are @@ -258,7 +269,7 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_EXIT; ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); mtflush_io->wt_status = WTHR_KILL_IT; - return; + break; case MT_WRK_WRITE: ut_a(work_item->wi_status == WRK_ITEM_SET); @@ -278,9 +289,9 @@ mtflush_service_io( default: /* None other than Write/Read handling planned */ ut_a(0); + break; } - mtflush_io->wt_status = WTHR_NO_WORK; } /******************************************************************//** @@ -302,13 +313,16 @@ DECLARE_THREAD(mtflush_io_thread)( #endif while (TRUE) { +#ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: Note. 
Thread %lu work queue len %lu return queue len %lu\n", os_thread_get_curr_id(), ib_wqueue_len(mtflush_io->wq), ib_wqueue_len(mtflush_io->wr_cq)); +#endif /* UNIV_DEBUG */ mtflush_service_io(mtflush_io); +#ifdef UNIV_DEBUG if (mtflush_io->wt_status == WTHR_NO_WORK) { n_timeout++; @@ -323,6 +337,7 @@ DECLARE_THREAD(mtflush_io_thread)( } else { n_timeout = 0; } +#endif /* UNIV_DEBUG */ if (mtflush_io->wt_status == WTHR_KILL_IT) { break; @@ -405,6 +420,7 @@ buf_mtflu_io_thread_exit(void) ib_wqueue_free(mtflush_io->rd_cq); os_fast_mutex_free(&mtflush_mtx); + os_fast_mutex_free(&mtflush_mtx_wait); /* Free heap */ mem_heap_free(mtflush_io->wheap); @@ -426,6 +442,7 @@ buf_mtflu_handler_init( ib_wqueue_t* mtflush_read_comp_queue; os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx_wait); /* Create heap, work queue, write completion queue, read completion queue for multi-threaded flush, and init @@ -491,16 +508,15 @@ buf_mtflu_flush_work_items( node items areallocated */ work_heap = mem_heap_create(0); work_item = (wrk_t*)mem_heap_alloc(work_heap, sizeof(wrk_t)*buf_pool_inst); + memset(work_item, 0, sizeof(wrk_t)*buf_pool_inst); for(i=0;iwq, @@ -516,14 +532,18 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; - if((int)done_wi->id_usr == -1 && - done_wi->wi_status == WRK_ITEM_SET ) { +#ifdef UNIV_DEBUG + /* TODO: Temporal fix for hang. This is really a bug. */ + if((int)done_wi->id_usr == 0 && + (done_wi->wi_status == WRK_ITEM_SET || + done_wi->wi_status == WRK_ITEM_UNSET)) { fprintf(stderr, "**Set/Unused work_item[%lu] flush_type=%d\n", i, done_wi->wr.flush_type); ut_a(0); } +#endif n_flushed+= done_wi->n_flushed; i++; diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index f26ad436190..f35ec84fd12 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -17971,6 +17971,11 @@ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, MTFLUSH_MAX_WORKER, /* Max setting */ 0); +static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, + PLUGIN_VAR_OPCMDARG , + "Use multi-threaded flush. Default TRUE.", + NULL, NULL, TRUE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), MYSQL_SYSVAR(additional_mem_pool_size), @@ -18168,6 +18173,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(use_lz4), #endif MYSQL_SYSVAR(mtflush_threads), + MYSQL_SYSVAR(use_mtflush), NULL }; diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h index 6b69a899690..e42063f6335 100644 --- a/storage/xtradb/include/fil0fil.h +++ b/storage/xtradb/include/fil0fil.h @@ -1042,6 +1042,8 @@ Release fil_system mutex */ void fil_system_exit(void); /*==================*/ + +#ifndef UNIV_INNOCHECKSUM /*******************************************************************//** Returns the table space by a given id, NULL if not found. 
*/ fil_space_t* @@ -1054,5 +1056,5 @@ char* fil_space_name( /*===========*/ fil_space_t* space); /*!< in: space */ - +#endif #endif /* fil0fil_h */ diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index bfb59865841..879989770e6 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -277,8 +277,13 @@ extern my_bool srv_use_lz4; /* Number of flush threads */ #define MTFLUSH_MAX_WORKER 64 #define MTFLUSH_DEFAULT_WORKER 8 + +/* Number of threads used for multi-threaded flush */ extern long srv_mtflush_threads; +/* If this flag is TRUE, then we will use multi threaded flush. */ +extern my_bool srv_use_mtflush; + /** Server undo tablespaces directory, can be absolute path. */ extern char* srv_undo_dir; diff --git a/storage/xtradb/include/ut0list.h b/storage/xtradb/include/ut0list.h index b1035bad099..796a272db59 100644 --- a/storage/xtradb/include/ut0list.h +++ b/storage/xtradb/include/ut0list.h @@ -151,7 +151,7 @@ ib_list_is_empty( const ib_list_t* list); /* in: list */ /******************************************************************** -Get number of items on list. +Get number of items on list. @return number of items on list */ UNIV_INLINE ulint diff --git a/storage/xtradb/include/ut0list.ic b/storage/xtradb/include/ut0list.ic index eaf2577b16c..7a7f53adb2f 100644 --- a/storage/xtradb/include/ut0list.ic +++ b/storage/xtradb/include/ut0list.ic @@ -60,7 +60,7 @@ ib_list_is_empty( } /******************************************************************** -Get number of items on list. +Get number of items on list. @return number of items on list */ UNIV_INLINE ulint diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h index 6513f4982c0..e6b9891aed1 100644 --- a/storage/xtradb/include/ut0wqueue.h +++ b/storage/xtradb/include/ut0wqueue.h @@ -105,7 +105,7 @@ ib_wqueue_nowait( /******************************************************************** -Get number of items on queue. +Get number of items on queue. @return number of items on queue */ ulint ib_wqueue_len( diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index d6801b701ae..f7469e29911 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -176,9 +176,11 @@ UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; /* If this flag is TRUE, then we disable doublewrite buffer */ UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; /* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ -UNIV_INTERN my_bool srv_use_lz4 = FALSE; +UNIV_INTERN my_bool srv_use_lz4 = FALSE; /* Number of threads used for multi-threaded flush */ UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; +/* If this flag is TRUE, then we will use multi threaded flush. */ +UNIV_INTERN my_bool srv_use_mtflush = TRUE; #ifdef __WIN__ /* Windows native condition variables. 
We use runtime loading / function diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index bb539569e9a..4d97632f818 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -2719,19 +2719,23 @@ files_checked: if (!srv_read_only_mode) { - /* Start multi-threaded flush threads */ - mtflush_ctx = buf_mtflu_handler_init(srv_mtflush_threads, - srv_buf_pool_instances); - - /* Set up the thread ids */ - buf_mtflu_set_thread_ids(srv_mtflush_threads, - mtflush_ctx, - (thread_ids + 6 + SRV_MAX_N_PURGE_THREADS)); + if (srv_use_mtflush) { + /* Start multi-threaded flush threads */ + mtflush_ctx = buf_mtflu_handler_init( + srv_mtflush_threads, + srv_buf_pool_instances); + /* Set up the thread ids */ + buf_mtflu_set_thread_ids( + srv_mtflush_threads, + mtflush_ctx, + (thread_ids + 6 + SRV_MAX_N_PURGE_THREADS)); #if UNIV_DEBUG - fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", - __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); + fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", + __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); #endif + } + os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); } @@ -3004,9 +3008,12 @@ innobase_shutdown_for_mysql(void) logs_empty_and_mark_files_at_shutdown() and should have already quit or is quitting right now. */ - /* g. Exit the multi threaded flush threads */ - buf_mtflu_io_thread_exit(); + if (srv_use_mtflush) { + /* g. Exit the multi threaded flush threads */ + + buf_mtflu_io_thread_exit(); + } #ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: Note: %s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); From e667c0f926deaa7ce7f093cd4679431599c55b1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 28 Feb 2014 09:05:36 +0200 Subject: [PATCH 32/56] Fix compiler error. --- storage/innobase/srv/srv0start.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 9ad5c960e5f..76587822bb1 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -2603,7 +2603,7 @@ files_checked: buf_mtflu_set_thread_ids( srv_mtflush_threads, mtflush_ctx, - (thread_ids + 6 + SRV_MAX_N_PURGE_THREADS)); + (thread_ids + 6 + 32)); #if UNIV_DEBUG fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", From b67892cf59872867514709784c54526434784ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 12:34:33 +0200 Subject: [PATCH 33/56] Turn all new features off by default. --- storage/innobase/handler/ha_innodb.cc | 8 ++++---- storage/innobase/srv/srv0srv.cc | 4 ++-- storage/xtradb/handler/ha_innodb.cc | 8 ++++---- storage/xtradb/srv/srv0srv.cc | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 6b44cb96677..2ec17049434 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16591,8 +16591,8 @@ static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, - "Use trim.", - NULL, NULL, TRUE); + "Use trim. 
Default FALSE.", + NULL, NULL, FALSE); #ifdef HAVE_LZ4 static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, @@ -16612,8 +16612,8 @@ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, PLUGIN_VAR_OPCMDARG , - "Use multi-threaded flush. Default TRUE.", - NULL, NULL, TRUE); + "Use multi-threaded flush. Default FALSE.", + NULL, NULL, FALSE); static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 7d3e7bf8108..6a0abdbf148 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -152,7 +152,7 @@ UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; UNIV_INTERN long srv_trim_pct = 100; /* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) to the pages */ -UNIV_INTERN my_bool srv_use_trim = TRUE; +UNIV_INTERN my_bool srv_use_trim = FALSE; /* If this flag is TRUE, then we will use posix fallocate for file extentsion */ UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; /* If this flag is TRUE, then we disable doublewrite buffer */ @@ -162,7 +162,7 @@ UNIV_INTERN my_bool srv_use_lz4 = FALSE; /* Number of threads used for multi-threaded flush */ UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; /* If this flag is TRUE, then we will use multi threaded flush. */ -UNIV_INTERN my_bool srv_use_mtflush = TRUE; +UNIV_INTERN my_bool srv_use_mtflush = FALSE; #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index f35ec84fd12..160ca6b8181 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -17952,8 +17952,8 @@ static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, - "Use trim.", - NULL, NULL, TRUE); + "Use trim. Default FALSE.", + NULL, NULL, FALSE); #ifdef HAVE_LZ4 static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, @@ -17973,8 +17973,8 @@ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, PLUGIN_VAR_OPCMDARG , - "Use multi-threaded flush. Default TRUE.", - NULL, NULL, TRUE); + "Use multi-threaded flush. Default FALSE.", + NULL, NULL, FALSE); static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index f7469e29911..f1ee459efd7 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -170,7 +170,7 @@ level is set for the table*/ UNIV_INTERN long srv_compress_zlib_level = 6; /* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) to the pages */ -UNIV_INTERN my_bool srv_use_trim = TRUE; +UNIV_INTERN my_bool srv_use_trim = FALSE; /* If this flag is TRUE, then we will use posix fallocate for file extentsion */ UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; /* If this flag is TRUE, then we disable doublewrite buffer */ @@ -180,7 +180,7 @@ UNIV_INTERN my_bool srv_use_lz4 = FALSE; /* Number of threads used for multi-threaded flush */ UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; /* If this flag is TRUE, then we will use multi threaded flush. */ -UNIV_INTERN my_bool srv_use_mtflush = TRUE; +UNIV_INTERN my_bool srv_use_mtflush = FALSE; #ifdef __WIN__ /* Windows native condition variables. 
We use runtime loading / function From be50724d89d141360472326f4fad006ba6e377b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 14:45:45 +0200 Subject: [PATCH 34/56] Fix compiler error on windows. --- storage/innobase/include/os0file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 89cc7597375..8baa207855c 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -354,7 +354,7 @@ to original un-instrumented file I/O APIs */ # define os_file_close(file) os_file_close_func(file) -# define os_aio(type, mode, name, file, buf, offset, n, message1, message2, write_size) \ +# define os_aio(type, mode, name, file, buf, offset, n, message1, message2, write_size, page_compression, page_compression_level) \ os_aio_func(type, mode, name, file, buf, offset, n, \ message1, message2, write_size, page_compression, page_compression_level) From ec45160e3b8cb5fb4dc1118fc7c539f5f256d85c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 15:02:39 +0200 Subject: [PATCH 35/56] Fix windows compiler erros. --- storage/innobase/include/os0file.h | 11 ++++++++++- storage/xtradb/include/os0file.h | 12 +++++++++++- storage/xtradb/os/os0file.cc | 11 ++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 8baa207855c..18a3f6a5ccd 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1204,7 +1204,16 @@ os_aio_windows_handle( parameters are valid and can be used to restart the operation, for example */ void** message2, - ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ #endif /**********************************************************************//** diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index e5abd4e2961..e4df03a1c8a 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -1243,7 +1243,17 @@ os_aio_windows_handle( restart the operation, for example */ void** message2, ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ - ulint* space_id); + ulint* space_id, + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ + #endif /**********************************************************************//** diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index cd1efc21061..158485ed7a2 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -5090,7 +5090,16 @@ os_aio_windows_handle( restart the operation, for example */ void** message2, ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ - ulint* space_id) + ulint* space_id, + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { ulint orig_seg = segment; os_aio_slot_t* slot; From 6cde211d8ddefb98945904967cb028d6e3844bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 15:15:00 +0200 Subject: [PATCH 36/56] Fix typo. --- storage/innobase/include/os0file.h | 2 +- storage/xtradb/include/os0file.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 18a3f6a5ccd..6e32a64ca48 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1212,7 +1212,7 @@ os_aio_windows_handle( actual page size does not decrease. */ ibool page_compression, /*!< in: is page compression used on this file space */ - ulint page_compression_level) /*!< page compression + ulint page_compression_level); /*!< page compression level to be used */ #endif diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index e4df03a1c8a..1cb19e57516 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -1251,7 +1251,7 @@ os_aio_windows_handle( actual page size does not decrease. */ ibool page_compression, /*!< in: is page compression used on this file space */ - ulint page_compression_level) /*!< page compression + ulint page_compression_level); /*!< page compression level to be used */ #endif From b8e0bc3a67557290aaee67e9b6f59b782eebd59e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 15:26:58 +0200 Subject: [PATCH 37/56] Additional windows fixes. --- storage/innobase/include/os0file.h | 11 +---------- storage/innobase/os/os0file.cc | 18 ++++-------------- storage/xtradb/include/os0file.h | 11 +---------- storage/xtradb/os/os0file.cc | 15 +++------------ 4 files changed, 9 insertions(+), 46 deletions(-) diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 6e32a64ca48..8baa207855c 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1204,16 +1204,7 @@ os_aio_windows_handle( parameters are valid and can be used to restart the operation, for example */ void** message2, - ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ - ulint* write_size,/*!< in/out: Actual write size initialized - after fist successfull trim - operation for this page and if - initialized we do not trim again if - actual page size does not decrease. 
*/ - ibool page_compression, /*!< in: is page compression used - on this file space */ - ulint page_compression_level); /*!< page compression - level to be used */ + ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */ #endif /**********************************************************************//** diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 35e1cd47e37..2ca7f2009c6 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -4949,8 +4949,7 @@ try_again: retval = os_aio_windows_handle( ULINT_UNDEFINED, slot->pos, &dummy_mess1, &dummy_mess2, - &dummy_type, - write_size, page_compression, page_compression_level); + &dummy_type); return(retval); } @@ -5007,16 +5006,7 @@ os_aio_windows_handle( parameters are valid and can be used to restart the operation, for example */ void** message2, - ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ - ulint* write_size,/*!< in/out: Actual write size initialized - after fist successfull trim - operation for this page and if - initialized we do not trim again if - actual page size does not decrease. */ - ibool page_compression, /*!< in: is page compression used - on this file space */ - ulint page_compression_level) /*!< page compression - level to be used */ + ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */ { ulint orig_seg = segment; os_aio_array_t* array; @@ -5123,7 +5113,7 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: if (slot->message1 && - page_compression && + slot->page_compression && slot->page_buf) { ret = WriteFile(slot->file, slot->page_buf, (DWORD) slot->len, &len, @@ -5164,7 +5154,7 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } - if (slot->message1 && page_compression) { + if (slot->message1 && slot->page_compression) { // We allocate memory for page compressed buffer if and only // if it is not yet allocated. if (slot->page_buf == NULL) { diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index 1cb19e57516..d355bfdf081 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -1243,16 +1243,7 @@ os_aio_windows_handle( restart the operation, for example */ void** message2, ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ - ulint* space_id, - ulint* write_size,/*!< in/out: Actual write size initialized - after fist successfull trim - operation for this page and if - initialized we do not trim again if - actual page size does not decrease. */ - ibool page_compression, /*!< in: is page compression used - on this file space */ - ulint page_compression_level); /*!< page compression - level to be used */ + ulint* space_id); #endif diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 158485ed7a2..e4530f0f338 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -5090,16 +5090,7 @@ os_aio_windows_handle( restart the operation, for example */ void** message2, ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ - ulint* space_id, - ulint* write_size,/*!< in/out: Actual write size initialized - after fist successfull trim - operation for this page and if - initialized we do not trim again if - actual page size does not decrease. 
*/ - ibool page_compression, /*!< in: is page compression used - on this file space */ - ulint page_compression_level) /*!< page compression - level to be used */ + ulint* space_id) { ulint orig_seg = segment; os_aio_slot_t* slot; @@ -5186,7 +5177,7 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - if (slot->message1 && page_compression && slot->page_buf) { + if (slot->message1 && slot->page_compression && slot->page_buf) { ret_val = os_file_write(slot->name, slot->file, slot->page_buf, slot->control.Offset, slot->control.OffsetHigh, slot->len); } else { @@ -5222,7 +5213,7 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } - if (slot->message1 && page_compression) { + if (slot->message1 && slot->page_compression) { // We allocate memory for page compressed buffer if and only // if it is not yet allocated. if (slot->page_buf == NULL) { From e656a8a92791944420c3793f6686357f584788bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 15:43:38 +0200 Subject: [PATCH 38/56] Fix windows os_file_write. --- storage/xtradb/os/os0file.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index e4530f0f338..c56a625a84c 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -5179,11 +5179,11 @@ os_aio_windows_handle( case OS_FILE_WRITE: if (slot->message1 && slot->page_compression && slot->page_buf) { ret_val = os_file_write(slot->name, slot->file, slot->page_buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + slot->offset, slot->len); } else { ret_val = os_file_write(slot->name, slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + slot->offset, slot->len); } break; case OS_FILE_READ: From 81318f04c8dd935d994d5ade3aed95f0059d5a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 15:51:54 +0200 Subject: [PATCH 39/56] Yet more windows fixes. --- storage/innobase/os/os0file.cc | 4 ++-- storage/xtradb/os/os0file.cc | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 2ca7f2009c6..f0ca05b7faa 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -6246,12 +6246,12 @@ os_file_trim( FALSE, __FILE__, __LINE__); if (slot->write_size) { - slot->write_size = 0; + *slot->write_size = 0; } return (FALSE); } else { if (slot->write_size) { - slot->write_size = len; + *slot->write_size = len; } } #endif diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index c56a625a84c..933690dfefa 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -5188,7 +5188,7 @@ os_aio_windows_handle( break; case OS_FILE_READ: ret_val = os_file_read(slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + slot->offset, slot->len); break; default: ut_error; @@ -6311,12 +6311,12 @@ os_file_trim( FALSE, __FILE__, __LINE__); if (slot->write_size) { - slot->write_size = 0; + *slot->write_size = 0; } return (FALSE); } else { if (slot->write_size) { - slot->write_size = len; + *slot->write_size = len; } } #endif From fd38dca5d580eafcdd6c521be686601d5efa4c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Mar 2014 18:14:29 +0200 Subject: [PATCH 40/56] Fixed a hang. The core issues is with the heap-thrashing by the individual queue's. 
Tried to minimize memory allocation from heap whenever it is unnecessary. --- storage/innobase/buf/buf0mtflu.cc | 167 +++++++++++++++-------------- storage/xtradb/buf/buf0mtflu.cc | 169 ++++++++++++++++-------------- 2 files changed, 184 insertions(+), 152 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index d249c1af15d..ea10d09e934 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -116,26 +116,40 @@ typedef struct wrk_itm struct wrk_itm *next; /*!< Next work item */ mem_heap_t *wheap; /*!< Heap were to allocate memory for queue nodes */ + mem_heap_t *rheap; } wrk_t; +typedef struct thread_data +{ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + wthr_status_t wt_status; /*!< Worker thread status */ +} thread_data_t; + /* Thread syncronization data */ typedef struct thread_sync { + /* Global variables used by all threads */ + os_fast_mutex_t thread_global_mtx; /*!< Mutex used protecting below + variables */ ulint n_threads; /*!< Number of threads */ - os_thread_id_t wthread_id; /*!< Identifier */ - os_thread_t wthread; /*!< Thread id */ ib_wqueue_t *wq; /*!< Work Queue */ ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ - wthr_status_t wt_status; /*!< Worker thread status */ mem_heap_t* wheap; /*!< Work heap where memory is allocated */ + mem_heap_t* rheap; /*!< Work heap where memory + is allocated */ + wthr_status_t gwt_status; /*!< Global thread status */ + + /* Variables used by only one thread at a time */ + thread_data_t* thread_data; /*!< Thread specific data */ + } thread_sync_t; static int mtflush_work_initialized = -1; -static os_fast_mutex_t mtflush_mtx; -static os_fast_mutex_t mtflush_mtx_wait; static thread_sync_t* mtflush_ctx=NULL; +static os_fast_mutex_t mtflush_mtx; /******************************************************************//** Set multi-threaded flush work initialized. */ @@ -218,29 +232,29 @@ static void mtflush_service_io( /*===============*/ - thread_sync_t* mtflush_io) /*!< inout: multi-threaded flush + thread_sync_t* mtflush_io, /*!< inout: multi-threaded flush syncronization data */ + thread_data_t* thread_data) /* Thread status data */ { wrk_t *work_item = NULL; ulint n_flushed=0; - mtflush_io->wt_status = WTHR_SIG_WAITING; + ut_a(mtflush_io != NULL); + ut_a(thread_data != NULL); + + thread_data->wt_status = WTHR_SIG_WAITING; - /* TODO: Temporal fix for the hang bug. This needs a real fix. 
*/ - os_fast_mutex_lock(&mtflush_mtx_wait); work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); if (work_item == NULL) { work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); } - os_fast_mutex_unlock(&mtflush_mtx_wait); - if (work_item) { - mtflush_io->wt_status = WTHR_RUNNING; + thread_data->wt_status = WTHR_RUNNING; } else { /* Thread did not get any work */ - mtflush_io->wt_status = WTHR_NO_WORK; + thread_data->wt_status = WTHR_NO_WORK; return; } @@ -262,8 +276,8 @@ mtflush_service_io( case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_EXIT; - ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); - mtflush_io->wt_status = WTHR_KILL_IT; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + thread_data->wt_status = WTHR_KILL_IT; break; case MT_WRK_WRITE: @@ -274,7 +288,7 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); break; case MT_WRK_READ: @@ -286,7 +300,6 @@ mtflush_service_io( ut_a(0); break; } - } /******************************************************************//** @@ -300,14 +313,23 @@ DECLARE_THREAD(mtflush_io_thread)( void * arg) { thread_sync_t *mtflush_io = ((thread_sync_t *)arg); - ulint n_timeout = 0; -#ifdef UNIV_DEBUG - ib_uint64_t stat_universal_num_processed = 0; - ib_uint64_t stat_cycle_num_processed = 0; + thread_data_t *this_thread_data = NULL; ulint i; -#endif + + /* Find correct slot for this thread */ + os_fast_mutex_lock(&(mtflush_io->thread_global_mtx)); + for(i=0; i < mtflush_io->n_threads; i ++) { + if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) { + break; + } + } + + ut_a(i <= mtflush_io->n_threads); + this_thread_data = &mtflush_io->thread_data[i]; + os_fast_mutex_unlock(&(mtflush_io->thread_global_mtx)); while (TRUE) { + #ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: Note. 
Thread %lu work queue len %lu return queue len %lu\n", os_thread_get_curr_id(), @@ -315,26 +337,10 @@ DECLARE_THREAD(mtflush_io_thread)( ib_wqueue_len(mtflush_io->wr_cq)); #endif /* UNIV_DEBUG */ - mtflush_service_io(mtflush_io); + mtflush_service_io(mtflush_io, this_thread_data); -#ifdef UNIV_DEBUG - if (mtflush_io->wt_status == WTHR_NO_WORK) { - n_timeout++; - if (n_timeout > 10) { - fprintf(stderr, "InnoDB: Note: Thread %lu has not received " - " work queue len %lu return queue len %lu\n", - os_thread_get_curr_id(), - ib_wqueue_len(mtflush_io->wq), - ib_wqueue_len(mtflush_io->wr_cq)); - n_timeout = 0; - } - } else { - n_timeout = 0; - } -#endif /* UNIV_DEBUG */ - - if (mtflush_io->wt_status == WTHR_KILL_IT) { + if (this_thread_data->wt_status == WTHR_KILL_IT) { break; } } @@ -359,22 +365,24 @@ buf_mtflu_io_thread_exit(void) /* Allocate work items for shutdown message */ work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + memset(work_item, 0, sizeof(wrk_t)*srv_mtflush_threads); /* Confirm if the io-thread KILL is in progress, bailout */ - if (mtflush_io->wt_status == WTHR_KILL_IT) { + if (mtflush_io->gwt_status == WTHR_KILL_IT) { return; } + mtflush_io->gwt_status = WTHR_KILL_IT; + fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", srv_mtflush_threads); /* Send one exit work item/thread */ for (i=0; i < srv_mtflush_threads; i++) { - work_item[i].wr.buf_pool = NULL; - work_item[i].rd.page_pool = NULL; work_item[i].tsk = MT_WRK_NONE; work_item[i].wi_status = WRK_ITEM_EXIT; work_item[i].wheap = mtflush_io->wheap; + work_item[i].rheap = mtflush_io->rheap; ib_wqueue_add(mtflush_io->wq, (void *)&(work_item[i]), @@ -384,7 +392,7 @@ buf_mtflu_io_thread_exit(void) /* Wait until all work items on a work queue are processed */ while(!ib_wqueue_is_empty(mtflush_io->wq)) { /* Wait */ - os_thread_sleep(MT_WAIT_IN_USECS * 2); + os_thread_sleep(MT_WAIT_IN_USECS); } ut_a(ib_wqueue_is_empty(mtflush_io->wq)); @@ -403,7 +411,7 @@ buf_mtflu_io_thread_exit(void) } /* Wait about 1/2 sec to allow threads really exit */ - os_thread_sleep(5000000); + os_thread_sleep(MT_WAIT_IN_USECS); ut_a(ib_wqueue_is_empty(mtflush_io->wq)); ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); @@ -415,10 +423,11 @@ buf_mtflu_io_thread_exit(void) ib_wqueue_free(mtflush_io->rd_cq); os_fast_mutex_free(&mtflush_mtx); - os_fast_mutex_free(&mtflush_mtx_wait); + os_fast_mutex_free(&mtflush_io->thread_global_mtx); /* Free heap */ mem_heap_free(mtflush_io->wheap); + mem_heap_free(mtflush_io->rheap); } /******************************************************************//** @@ -432,45 +441,50 @@ buf_mtflu_handler_init( { ulint i; mem_heap_t* mtflush_heap; - ib_wqueue_t* mtflush_work_queue; - ib_wqueue_t* mtflush_write_comp_queue; - ib_wqueue_t* mtflush_read_comp_queue; - - os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); - os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx_wait); + mem_heap_t* mtflush_heap2; /* Create heap, work queue, write completion queue, read completion queue for multi-threaded flush, and init handler. 
*/ mtflush_heap = mem_heap_create(0); ut_a(mtflush_heap != NULL); - mtflush_work_queue = ib_wqueue_create(); - ut_a(mtflush_work_queue != NULL); - mtflush_write_comp_queue = ib_wqueue_create(); - ut_a(mtflush_write_comp_queue != NULL); - mtflush_read_comp_queue = ib_wqueue_create(); - ut_a(mtflush_read_comp_queue != NULL); + mtflush_heap2 = mem_heap_create(0); + ut_a(mtflush_heap2 != NULL); mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, - MTFLUSH_MAX_WORKER * sizeof(thread_sync_t)); + sizeof(thread_sync_t)); + memset(mtflush_ctx, 0, sizeof(thread_sync_t)); ut_a(mtflush_ctx != NULL); + mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc( + mtflush_heap, sizeof(thread_data_t) * n_threads); + ut_a(mtflush_ctx->thread_data); + memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads); + + mtflush_ctx->n_threads = n_threads; + mtflush_ctx->wq = ib_wqueue_create(); + ut_a(mtflush_ctx->wq); + mtflush_ctx->wr_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->wr_cq); + mtflush_ctx->rd_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->rd_cq); + mtflush_ctx->wheap = mtflush_heap; + mtflush_ctx->rheap = mtflush_heap2; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_ctx->thread_global_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); /* Create threads for page-compression-flush */ for(i=0; i < n_threads; i++) { os_thread_id_t new_thread_id; - mtflush_ctx[i].n_threads = n_threads; - mtflush_ctx[i].wq = mtflush_work_queue; - mtflush_ctx[i].wr_cq = mtflush_write_comp_queue; - mtflush_ctx[i].rd_cq = mtflush_read_comp_queue; - mtflush_ctx[i].wheap = mtflush_heap; - mtflush_ctx[i].wt_status = WTHR_INITIALIZED; - mtflush_ctx[i].wthread = os_thread_create( + mtflush_ctx->thread_data[i].wt_status = WTHR_INITIALIZED; + + mtflush_ctx->thread_data[i].wthread = os_thread_create( mtflush_io_thread, - ((void *)(mtflush_ctx + i)), + ((void *) mtflush_ctx), &new_thread_id); - mtflush_ctx[i].wthread_id = new_thread_id; + mtflush_ctx->thread_data[i].wthread_id = new_thread_id; } buf_mtflu_work_init(); @@ -497,13 +511,15 @@ buf_mtflu_flush_work_items( { ulint n_flushed=0, i; mem_heap_t* work_heap; - wrk_t* work_item=NULL; + mem_heap_t* reply_heap; + wrk_t work_item[MTFLUSH_MAX_WORKER]; /* Allocate heap where all work items used and queue node items areallocated */ work_heap = mem_heap_create(0); - work_item = (wrk_t*)mem_heap_alloc(work_heap, sizeof(wrk_t)*buf_pool_inst); - memset(work_item, 0, sizeof(wrk_t)*buf_pool_inst); + reply_heap = mem_heap_create(0); + memset(work_item, 0, sizeof(wrk_t)*MTFLUSH_MAX_WORKER); + for(i=0;iwq, - (void *)(&(work_item[i])), + (void *)(work_item + i), work_heap); } @@ -527,8 +544,6 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; -#ifdef UNIV_DEBUG - /* TODO: Temporal fix for hang. This is really a bug. 
*/ if((int)done_wi->id_usr == 0 && (done_wi->wi_status == WRK_ITEM_SET || done_wi->wi_status == WRK_ITEM_UNSET)) { @@ -538,7 +553,6 @@ buf_mtflu_flush_work_items( done_wi->wr.flush_type); ut_a(0); } -#endif n_flushed+= done_wi->n_flushed; i++; @@ -547,6 +561,7 @@ buf_mtflu_flush_work_items( /* Release used work_items and queue nodes */ mem_heap_free(work_heap); + mem_heap_free(reply_heap); return(n_flushed); } @@ -672,6 +687,6 @@ buf_mtflu_set_thread_ids( ut_a(thread_ids != NULL); for(i = 0; i < n_threads; i++) { - thread_ids[i] = mtflush_io[i].wthread_id; + thread_ids[i] = mtflush_io->thread_data[i].wthread_id; } } diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index f7da4c1c7a9..d1ec9979f51 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -116,26 +116,40 @@ typedef struct wrk_itm struct wrk_itm *next; /*!< Next work item */ mem_heap_t *wheap; /*!< Heap were to allocate memory for queue nodes */ + mem_heap_t *rheap; } wrk_t; +typedef struct thread_data +{ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + wthr_status_t wt_status; /*!< Worker thread status */ +} thread_data_t; + /* Thread syncronization data */ typedef struct thread_sync { + /* Global variables used by all threads */ + os_fast_mutex_t thread_global_mtx; /*!< Mutex used protecting below + variables */ ulint n_threads; /*!< Number of threads */ - os_thread_id_t wthread_id; /*!< Identifier */ - os_thread_t wthread; /*!< Thread id */ ib_wqueue_t *wq; /*!< Work Queue */ ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ - wthr_status_t wt_status; /*!< Worker thread status */ mem_heap_t* wheap; /*!< Work heap where memory is allocated */ + mem_heap_t* rheap; /*!< Work heap where memory + is allocated */ + wthr_status_t gwt_status; /*!< Global thread status */ + + /* Variables used by only one thread at a time */ + thread_data_t* thread_data; /*!< Thread specific data */ + } thread_sync_t; static int mtflush_work_initialized = -1; -static os_fast_mutex_t mtflush_mtx; -static os_fast_mutex_t mtflush_mtx_wait; static thread_sync_t* mtflush_ctx=NULL; +static os_fast_mutex_t mtflush_mtx; /******************************************************************//** Set multi-threaded flush work initialized. */ @@ -172,6 +186,8 @@ buf_mtflu_flush_pool_instance( ut_a(work_item != NULL); ut_a(work_item->wr.buf_pool != NULL); + memset(&n, 0, sizeof(flush_counters_t)); + if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { /* We have two choices here. If lsn_limit was specified then skipping an instance of buffer @@ -223,29 +239,29 @@ static void mtflush_service_io( /*===============*/ - thread_sync_t* mtflush_io) /*!< inout: multi-threaded flush + thread_sync_t* mtflush_io, /*!< inout: multi-threaded flush syncronization data */ + thread_data_t* thread_data) /* Thread status data */ { wrk_t *work_item = NULL; ulint n_flushed=0; - mtflush_io->wt_status = WTHR_SIG_WAITING; + ut_a(mtflush_io != NULL); + ut_a(thread_data != NULL); + + thread_data->wt_status = WTHR_SIG_WAITING; - /* TODO: Temporal fix for the hang bug. This needs a real fix. 
*/ - os_fast_mutex_lock(&mtflush_mtx_wait); work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); if (work_item == NULL) { work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); } - os_fast_mutex_unlock(&mtflush_mtx_wait); - if (work_item) { - mtflush_io->wt_status = WTHR_RUNNING; + thread_data->wt_status = WTHR_RUNNING; } else { /* Thread did not get any work */ - mtflush_io->wt_status = WTHR_NO_WORK; + thread_data->wt_status = WTHR_NO_WORK; return; } @@ -267,8 +283,8 @@ mtflush_service_io( case MT_WRK_NONE: ut_a(work_item->wi_status == WRK_ITEM_EXIT); work_item->wi_status = WRK_ITEM_EXIT; - ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); - mtflush_io->wt_status = WTHR_KILL_IT; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + thread_data->wt_status = WTHR_KILL_IT; break; case MT_WRK_WRITE: @@ -279,7 +295,7 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_FAILED; } work_item->wi_status = WRK_ITEM_SUCCESS; - ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->wheap); + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); break; case MT_WRK_READ: @@ -291,7 +307,6 @@ mtflush_service_io( ut_a(0); break; } - } /******************************************************************//** @@ -305,14 +320,23 @@ DECLARE_THREAD(mtflush_io_thread)( void * arg) { thread_sync_t *mtflush_io = ((thread_sync_t *)arg); - ulint n_timeout = 0; -#ifdef UNIV_DEBUG - ib_uint64_t stat_universal_num_processed = 0; - ib_uint64_t stat_cycle_num_processed = 0; + thread_data_t *this_thread_data = NULL; ulint i; -#endif + + /* Find correct slot for this thread */ + os_fast_mutex_lock(&(mtflush_io->thread_global_mtx)); + for(i=0; i < mtflush_io->n_threads; i ++) { + if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) { + break; + } + } + + ut_a(i <= mtflush_io->n_threads); + this_thread_data = &mtflush_io->thread_data[i]; + os_fast_mutex_unlock(&(mtflush_io->thread_global_mtx)); while (TRUE) { + #ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: Note. 
Thread %lu work queue len %lu return queue len %lu\n", os_thread_get_curr_id(), @@ -320,26 +344,10 @@ DECLARE_THREAD(mtflush_io_thread)( ib_wqueue_len(mtflush_io->wr_cq)); #endif /* UNIV_DEBUG */ - mtflush_service_io(mtflush_io); + mtflush_service_io(mtflush_io, this_thread_data); -#ifdef UNIV_DEBUG - if (mtflush_io->wt_status == WTHR_NO_WORK) { - n_timeout++; - if (n_timeout > 10) { - fprintf(stderr, "InnoDB: Note: Thread %lu has not received " - " work queue len %lu return queue len %lu\n", - os_thread_get_curr_id(), - ib_wqueue_len(mtflush_io->wq), - ib_wqueue_len(mtflush_io->wr_cq)); - n_timeout = 0; - } - } else { - n_timeout = 0; - } -#endif /* UNIV_DEBUG */ - - if (mtflush_io->wt_status == WTHR_KILL_IT) { + if (this_thread_data->wt_status == WTHR_KILL_IT) { break; } } @@ -364,22 +372,24 @@ buf_mtflu_io_thread_exit(void) /* Allocate work items for shutdown message */ work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + memset(work_item, 0, sizeof(wrk_t)*srv_mtflush_threads); /* Confirm if the io-thread KILL is in progress, bailout */ - if (mtflush_io->wt_status == WTHR_KILL_IT) { + if (mtflush_io->gwt_status == WTHR_KILL_IT) { return; } + mtflush_io->gwt_status = WTHR_KILL_IT; + fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", srv_mtflush_threads); /* Send one exit work item/thread */ for (i=0; i < srv_mtflush_threads; i++) { - work_item[i].wr.buf_pool = NULL; - work_item[i].rd.page_pool = NULL; work_item[i].tsk = MT_WRK_NONE; work_item[i].wi_status = WRK_ITEM_EXIT; work_item[i].wheap = mtflush_io->wheap; + work_item[i].rheap = mtflush_io->rheap; ib_wqueue_add(mtflush_io->wq, (void *)&(work_item[i]), @@ -389,7 +399,7 @@ buf_mtflu_io_thread_exit(void) /* Wait until all work items on a work queue are processed */ while(!ib_wqueue_is_empty(mtflush_io->wq)) { /* Wait */ - os_thread_sleep(MT_WAIT_IN_USECS * 2); + os_thread_sleep(MT_WAIT_IN_USECS); } ut_a(ib_wqueue_is_empty(mtflush_io->wq)); @@ -408,7 +418,7 @@ buf_mtflu_io_thread_exit(void) } /* Wait about 1/2 sec to allow threads really exit */ - os_thread_sleep(5000000); + os_thread_sleep(MT_WAIT_IN_USECS); ut_a(ib_wqueue_is_empty(mtflush_io->wq)); ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); @@ -420,10 +430,11 @@ buf_mtflu_io_thread_exit(void) ib_wqueue_free(mtflush_io->rd_cq); os_fast_mutex_free(&mtflush_mtx); - os_fast_mutex_free(&mtflush_mtx_wait); + os_fast_mutex_free(&mtflush_io->thread_global_mtx); /* Free heap */ mem_heap_free(mtflush_io->wheap); + mem_heap_free(mtflush_io->rheap); } /******************************************************************//** @@ -437,45 +448,50 @@ buf_mtflu_handler_init( { ulint i; mem_heap_t* mtflush_heap; - ib_wqueue_t* mtflush_work_queue; - ib_wqueue_t* mtflush_write_comp_queue; - ib_wqueue_t* mtflush_read_comp_queue; - - os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); - os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx_wait); + mem_heap_t* mtflush_heap2; /* Create heap, work queue, write completion queue, read completion queue for multi-threaded flush, and init handler. 
*/ mtflush_heap = mem_heap_create(0); ut_a(mtflush_heap != NULL); - mtflush_work_queue = ib_wqueue_create(); - ut_a(mtflush_work_queue != NULL); - mtflush_write_comp_queue = ib_wqueue_create(); - ut_a(mtflush_write_comp_queue != NULL); - mtflush_read_comp_queue = ib_wqueue_create(); - ut_a(mtflush_read_comp_queue != NULL); + mtflush_heap2 = mem_heap_create(0); + ut_a(mtflush_heap2 != NULL); mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, - MTFLUSH_MAX_WORKER * sizeof(thread_sync_t)); + sizeof(thread_sync_t)); + memset(mtflush_ctx, 0, sizeof(thread_sync_t)); ut_a(mtflush_ctx != NULL); + mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc( + mtflush_heap, sizeof(thread_data_t) * n_threads); + ut_a(mtflush_ctx->thread_data); + memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads); + + mtflush_ctx->n_threads = n_threads; + mtflush_ctx->wq = ib_wqueue_create(); + ut_a(mtflush_ctx->wq); + mtflush_ctx->wr_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->wr_cq); + mtflush_ctx->rd_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->rd_cq); + mtflush_ctx->wheap = mtflush_heap; + mtflush_ctx->rheap = mtflush_heap2; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_ctx->thread_global_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); /* Create threads for page-compression-flush */ for(i=0; i < n_threads; i++) { os_thread_id_t new_thread_id; - mtflush_ctx[i].n_threads = n_threads; - mtflush_ctx[i].wq = mtflush_work_queue; - mtflush_ctx[i].wr_cq = mtflush_write_comp_queue; - mtflush_ctx[i].rd_cq = mtflush_read_comp_queue; - mtflush_ctx[i].wheap = mtflush_heap; - mtflush_ctx[i].wt_status = WTHR_INITIALIZED; - mtflush_ctx[i].wthread = os_thread_create( + mtflush_ctx->thread_data[i].wt_status = WTHR_INITIALIZED; + + mtflush_ctx->thread_data[i].wthread = os_thread_create( mtflush_io_thread, - ((void *)(mtflush_ctx + i)), + ((void *) mtflush_ctx), &new_thread_id); - mtflush_ctx[i].wthread_id = new_thread_id; + mtflush_ctx->thread_data[i].wthread_id = new_thread_id; } buf_mtflu_work_init(); @@ -502,13 +518,15 @@ buf_mtflu_flush_work_items( { ulint n_flushed=0, i; mem_heap_t* work_heap; - wrk_t* work_item=NULL; + mem_heap_t* reply_heap; + wrk_t work_item[MTFLUSH_MAX_WORKER]; /* Allocate heap where all work items used and queue node items areallocated */ work_heap = mem_heap_create(0); - work_item = (wrk_t*)mem_heap_alloc(work_heap, sizeof(wrk_t)*buf_pool_inst); - memset(work_item, 0, sizeof(wrk_t)*buf_pool_inst); + reply_heap = mem_heap_create(0); + memset(work_item, 0, sizeof(wrk_t)*MTFLUSH_MAX_WORKER); + for(i=0;iwq, - (void *)(&(work_item[i])), + (void *)(work_item + i), work_heap); } @@ -532,8 +551,6 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; -#ifdef UNIV_DEBUG - /* TODO: Temporal fix for hang. This is really a bug. 
*/ if((int)done_wi->id_usr == 0 && (done_wi->wi_status == WRK_ITEM_SET || done_wi->wi_status == WRK_ITEM_UNSET)) { @@ -543,7 +560,6 @@ buf_mtflu_flush_work_items( done_wi->wr.flush_type); ut_a(0); } -#endif n_flushed+= done_wi->n_flushed; i++; @@ -555,6 +571,7 @@ buf_mtflu_flush_work_items( /* Release used work_items and queue nodes */ mem_heap_free(work_heap); + mem_heap_free(reply_heap); return(n_flushed); } @@ -680,6 +697,6 @@ buf_mtflu_set_thread_ids( ut_a(thread_ids != NULL); for(i = 0; i < n_threads; i++) { - thread_ids[i] = mtflush_io[i].wthread_id; + thread_ids[i] = mtflush_io->thread_data[i].wthread_id; } } From 7322270a0514883b62f4148e6acc039a5e1b7fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Tue, 4 Mar 2014 17:14:08 +0200 Subject: [PATCH 41/56] Set actual compressed page size also on read code path to buffer pool so that we can later use it to avoid unnecessary trim operations. --- storage/innobase/buf/buf0rea.cc | 2 +- storage/innobase/fil/fil0pagecompress.cc | 10 +++++++++- storage/innobase/include/fil0pagecompress.h | 4 +++- storage/innobase/os/os0file.cc | 12 +++++++----- storage/xtradb/buf/buf0rea.cc | 2 +- storage/xtradb/fil/fil0pagecompress.cc | 10 +++++++++- storage/xtradb/include/fil0pagecompress.h | 4 +++- storage/xtradb/os/os0file.cc | 14 ++++++++------ 8 files changed, 41 insertions(+), 17 deletions(-) diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index e2578b7f6b7..ec76c9923fe 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -185,7 +185,7 @@ buf_read_page_low( *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, zip_size, offset, 0, zip_size, - bpage->zip.data, bpage, 0); + bpage->zip.data, bpage, &bpage->write_size); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index dfa216d0ae2..8ecb5317088 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -226,7 +226,9 @@ fil_decompress_page( byte* page_buf, /*!< in: preallocated buffer or NULL */ byte* buf, /*!< out: buffer from which to read; in aio this must be appropriately aligned */ - ulint len) /*!< in: length of output buffer.*/ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. */ { int err = 0; ulint actual_size = 0; @@ -277,6 +279,12 @@ fil_decompress_page( ut_error; } + /* Store actual payload size of the compressed data. This pointer + points to buffer pool. */ + if (write_size) { + *write_size = actual_size; + } + if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { #ifdef UNIV_DEBUG diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h index 342b105401c..c362c0ddcd2 100644 --- a/storage/innobase/include/fil0pagecompress.h +++ b/storage/innobase/include/fil0pagecompress.h @@ -97,7 +97,9 @@ fil_decompress_page( byte* page_buf, /*!< in: preallocated buffer or NULL */ byte* buf, /*!< out: buffer from which to read; in aio this must be appropriately aligned */ - ulint len); /*!< in: length of output buffer.*/ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size); /*!< in/out: Actual payload size of + the compressed data. 
*/ /****************************************************************//** Get space id from fil node diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index f0ca05b7faa..376aa244bc9 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2821,7 +2821,7 @@ try_again: if (ret && len == n) { if (fil_page_is_compressed((byte *)buf)) { - fil_decompress_page(NULL, (byte *)buf, len); + fil_decompress_page(NULL, (byte *)buf, len, NULL); } return(TRUE); } @@ -2836,7 +2836,7 @@ try_again: if ((ulint) ret == n) { if (fil_page_is_compressed((byte *)buf)) { - fil_decompress_page(NULL, (byte *)buf, n); + fil_decompress_page(NULL, (byte *)buf, n, NULL); } return(TRUE); @@ -5164,7 +5164,7 @@ os_aio_windows_handle( if (slot->type == OS_FILE_READ) { if (fil_page_is_compressed(slot->buf)) { - fil_decompress_page(slot->page_buf, slot->buf, slot->len); + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); } } else { if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { @@ -5278,7 +5278,7 @@ retry: if (slot->type == OS_FILE_READ) { if (fil_page_is_compressed(slot->buf)) { - fil_decompress_page(slot->page_buf, slot->buf, slot->len); + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); } } else { if (slot->page_compress_success && @@ -6219,7 +6219,9 @@ os_file_trim( " InnoDB: [Warning] fallocate not supported on this installation." " InnoDB: Disabling fallocate for now."); os_fallocate_failed = TRUE; - slot->write_size = NULL; + if (slot->write_size) { + *slot->write_size = 0; + } #endif /* HAVE_FALLOCATE ... */ diff --git a/storage/xtradb/buf/buf0rea.cc b/storage/xtradb/buf/buf0rea.cc index 3dec3df6f2b..7a79958c136 100644 --- a/storage/xtradb/buf/buf0rea.cc +++ b/storage/xtradb/buf/buf0rea.cc @@ -237,7 +237,7 @@ not_to_recover: *err = _fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage, 0, trx); + ((buf_block_t*) bpage)->frame, bpage, &bpage->write_size, trx); } if (sync) { diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index 05dcf372112..eac889cf7c6 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -222,7 +222,9 @@ fil_decompress_page( byte* page_buf, /*!< in: preallocated buffer or NULL */ byte* buf, /*!< out: buffer from which to read; in aio this must be appropriately aligned */ - ulint len) /*!< in: length of output buffer.*/ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. */ { int err = 0; ulint actual_size = 0; @@ -273,6 +275,12 @@ fil_decompress_page( ut_error; } + /* Store actual payload size of the compressed data. This pointer + points to buffer pool. 
*/ + if (write_size) { + *write_size = actual_size; + } + if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { #ifdef UNIV_DEBUG diff --git a/storage/xtradb/include/fil0pagecompress.h b/storage/xtradb/include/fil0pagecompress.h index 342b105401c..c362c0ddcd2 100644 --- a/storage/xtradb/include/fil0pagecompress.h +++ b/storage/xtradb/include/fil0pagecompress.h @@ -97,7 +97,9 @@ fil_decompress_page( byte* page_buf, /*!< in: preallocated buffer or NULL */ byte* buf, /*!< out: buffer from which to read; in aio this must be appropriately aligned */ - ulint len); /*!< in: length of output buffer.*/ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size); /*!< in/out: Actual payload size of + the compressed data. */ /****************************************************************//** Get space id from fil node diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 933690dfefa..1b094bfa1f3 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -3009,7 +3009,7 @@ try_again: if (ret && len == n) { if (fil_page_is_compressed((byte *)buf)) { - fil_decompress_page(NULL, (byte *)buf, len); + fil_decompress_page(NULL, (byte *)buf, len, NULL); } return(TRUE); } @@ -3025,7 +3025,7 @@ try_again: if ((ulint) ret == n) { if (fil_page_is_compressed((byte *)buf)) { - fil_decompress_page(NULL, (byte *)buf, n); + fil_decompress_page(NULL, (byte *)buf, n, NULL); } return(TRUE); @@ -3129,7 +3129,7 @@ try_again: if ((ulint) ret == n) { if (fil_page_is_compressed((byte *)buf)) { - fil_decompress_page(NULL, (byte *)buf, n); + fil_decompress_page(NULL, (byte *)buf, n, NULL); } return(TRUE); @@ -5223,7 +5223,7 @@ os_aio_windows_handle( if (slot->type == OS_FILE_READ) { if (fil_page_is_compressed(slot->buf)) { - fil_decompress_page(slot->page_buf, slot->buf, slot->len); + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); } } else { if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { @@ -5337,7 +5337,7 @@ retry: if (slot->type == OS_FILE_READ) { if (fil_page_is_compressed(slot->buf)) { - fil_decompress_page(slot->page_buf, slot->buf, slot->len); + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); } } else { if (slot->page_compress_success && @@ -6284,7 +6284,9 @@ os_file_trim( " InnoDB: [Warning] fallocate not supported on this installation." " InnoDB: Disabling fallocate for now."); os_fallocate_failed = TRUE; - slot->write_size = NULL; + if (slot->write_size) { + *slot->write_size = 0; + } #endif /* HAVE_FALLOCATE ... */ From 3a4b8879e5250eeac3e0a6c770fdf235111c8171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Tue, 4 Mar 2014 20:12:32 +0200 Subject: [PATCH 42/56] Set index page page compression on by default and remove innodb_trim_pct as it is not used/implemented. 
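
Reviewer note (not part of the original commit message): at this point in the series srv_page_compress_index_pages still carries its original meaning of "compress only index pages"; this patch merely flips its default to TRUE and comments out innodb_trim_pct. Below is a minimal sketch of the write-path predicate that the flag feeds, with the variable and helper names taken from the os0file.cc hunks elsewhere in this series; the standalone function itself is illustrative and does not exist in the tree.

    /* Sketch only: when an asynchronous write gets page-compressed while
    srv_page_compress_index_pages still means "compress index pages only".
    This restates the condition used in the os0file.cc write path. */
    static bool
    page_should_be_compressed(bool page_compression, byte* page)
    {
            /* Compress when the tablespace asks for page compression and
            either the index-only restriction is off, or the page really
            is an index page. */
            return(page_compression
                   && (!srv_page_compress_index_pages
                       || fil_page_is_index_page(page)));
    }
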
--- storage/innobase/handler/ha_innodb.cc | 8 +++++--- storage/innobase/srv/srv0srv.cc | 2 +- storage/xtradb/handler/ha_innodb.cc | 8 +++++--- storage/xtradb/srv/srv0srv.cc | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 98e27f4e02c..a65937d9490 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16786,15 +16786,17 @@ static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, NULL, NULL, FALSE); #endif /* UNIV_DEBUG */ +/* static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, PLUGIN_VAR_OPCMDARG , "How many percent of compressed pages should be trimmed", NULL, NULL, 100, 0, 100, 0); +*/ static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, PLUGIN_VAR_OPCMDARG, - "Use page compression for only index pages.", - NULL, NULL, FALSE); + "Use page compression for only index pages. Default TRUE.", + NULL, NULL, TRUE); static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, @@ -16974,7 +16976,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(limit_optimistic_insert_debug), MYSQL_SYSVAR(trx_purge_view_update_only_debug), #endif /* UNIV_DEBUG */ - MYSQL_SYSVAR(trim_pct), + // MYSQL_SYSVAR(trim_pct), MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), #ifdef HAVE_LZ4 diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index dcef4a03b76..11e6ffd31d3 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -148,7 +148,7 @@ UNIV_INTERN my_bool srv_use_native_aio = TRUE; /* If this flag is TRUE, then we will use page compression only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; +UNIV_INTERN my_bool srv_page_compress_index_pages = TRUE; UNIV_INTERN long srv_trim_pct = 100; /* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) to the pages */ diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 1f92db64ddc..046fdfa45a9 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -17934,10 +17934,12 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace, "Print stacktrace on long semaphore wait (off by default supported only on linux)", NULL, NULL, FALSE); +/* static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, PLUGIN_VAR_OPCMDARG , "How many percent of compressed pages should be trimmed", NULL, NULL, 100, 0, 100, 0); +*/ static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, PLUGIN_VAR_RQCMDARG, @@ -17947,8 +17949,8 @@ static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, PLUGIN_VAR_OPCMDARG, - "Use page compression for only index pages.", - NULL, NULL, FALSE); + "Use page compression for only index pages. 
Default TRUE.", + NULL, NULL, TRUE); static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, @@ -18166,7 +18168,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(fake_changes), MYSQL_SYSVAR(locking_fake_changes), MYSQL_SYSVAR(use_stacktrace), - MYSQL_SYSVAR(trim_pct), + // MYSQL_SYSVAR(trim_pct), MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), #ifdef HAVE_LZ4 diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index f1ee459efd7..5706c354ada 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -163,7 +163,7 @@ UNIV_INTERN my_bool srv_use_native_aio = TRUE; /* If this flag is TRUE, then we will use page compression only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; +UNIV_INTERN my_bool srv_page_compress_index_pages = TRUE; UNIV_INTERN long srv_trim_pct = 100; /* Default compression level if page compression is used and no compression level is set for the table*/ From c556b9d8176107ba892ac218dd72e35d53e0c4f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 7 Mar 2014 08:20:43 +0200 Subject: [PATCH 43/56] Changed so that innodb_compress_index pages means that if true also index pages are compressed if false index pages are not compressed. Fixed small output error when page_compression_level was incorrectly given. --- storage/innobase/handler/ha_innodb.cc | 6 +++--- storage/innobase/os/os0file.cc | 8 +++----- storage/innobase/srv/srv0srv.cc | 2 +- storage/xtradb/handler/ha_innodb.cc | 6 +++--- storage/xtradb/os/os0file.cc | 8 +++----- storage/xtradb/srv/srv0srv.cc | 2 +- 6 files changed, 14 insertions(+), 18 deletions(-) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index a65937d9490..1273a25a5f5 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -9907,7 +9907,7 @@ ha_innobase::check_table_options( HA_WRONG_CREATE_OPTION, "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", - create_info->key_block_size); + options->page_compression_level); return "PAGE_COMPRESSION_LEVEL"; } } @@ -16795,8 +16795,8 @@ static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, PLUGIN_VAR_OPCMDARG, - "Use page compression for only index pages. Default TRUE.", - NULL, NULL, TRUE); + "Use page compression also for index pages. 
Default FALSE.", + NULL, NULL, FALSE); static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 376aa244bc9..0093dd8e266 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -4464,14 +4464,12 @@ found: slot->page_compression = page_compression; /* If the space is page compressed and this is write operation - and if either only index pages compression is disabled or - page is index page and only index pages compression is enabled then - we compress the page */ + and either index compression is enabled or page is not a index + page then we compress the page */ if (message1 && type == OS_FILE_WRITE && page_compression && - (srv_page_compress_index_pages == false || - (srv_page_compress_index_pages == true && fil_page_is_index_page(slot->buf)))) { + (srv_page_compress_index_pages == true || !fil_page_is_index_page(slot->buf))) { ulint real_len = len; byte* tmp = NULL; diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 11e6ffd31d3..dcef4a03b76 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -148,7 +148,7 @@ UNIV_INTERN my_bool srv_use_native_aio = TRUE; /* If this flag is TRUE, then we will use page compression only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = TRUE; +UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; UNIV_INTERN long srv_trim_pct = 100; /* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) to the pages */ diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 046fdfa45a9..fc92cc828f7 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -10420,7 +10420,7 @@ ha_innobase::check_table_options( HA_WRONG_CREATE_OPTION, "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", - create_info->key_block_size); + options->page_compression_level); return "PAGE_COMPRESSION_LEVEL"; } } @@ -17949,8 +17949,8 @@ static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, PLUGIN_VAR_OPCMDARG, - "Use page compression for only index pages. Default TRUE.", - NULL, NULL, TRUE); + "Use page compression also for index pages. 
Default FALSE.", + NULL, NULL, FALSE); static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 38be419e2ad..525310025da 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -4580,14 +4580,12 @@ found: slot->page_compression = page_compression; /* If the space is page compressed and this is write operation - and if either only index pages compression is disabled or - page is index page and only index pages compression is enabled then - we compress the page */ + and either index compression is enabled or page is not a index + page then we compress the page */ if (message1 && type == OS_FILE_WRITE && page_compression && - (srv_page_compress_index_pages == false || - (srv_page_compress_index_pages == true && fil_page_is_index_page(slot->buf)))) { + (srv_page_compress_index_pages == true || !fil_page_is_index_page(slot->buf))) { ulint real_len = len; byte* tmp = NULL; diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index 5706c354ada..f1ee459efd7 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -163,7 +163,7 @@ UNIV_INTERN my_bool srv_use_native_aio = TRUE; /* If this flag is TRUE, then we will use page compression only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = TRUE; +UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; UNIV_INTERN long srv_trim_pct = 100; /* Default compression level if page compression is used and no compression level is set for the table*/ From 3ea72a2ba9deb9e3da7efe57a74ce9b34b346dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 12 Mar 2014 14:47:38 +0200 Subject: [PATCH 44/56] Removed options innodb_compress_index_pages and innodb_trim_pct. Both are unnecessary. There is a lot more index pages than there is normal pages. Earlier all pages were compressed and this provided best performance and compression ratio. Added status variable to show how many non index pages are written. 
--- storage/innobase/fil/fil0fil.cc | 2 ++ storage/innobase/handler/ha_innodb.cc | 16 ++-------------- storage/innobase/include/srv0mon.h | 1 + storage/innobase/include/srv0srv.h | 10 ++++------ storage/innobase/os/os0file.cc | 8 ++------ storage/innobase/srv/srv0mon.cc | 13 ++++++++++++- storage/innobase/srv/srv0srv.cc | 6 ++---- storage/xtradb/fil/fil0fil.cc | 2 ++ storage/xtradb/handler/ha_innodb.cc | 16 ++-------------- storage/xtradb/include/srv0mon.h | 1 + storage/xtradb/include/srv0srv.h | 10 ++++------ storage/xtradb/os/os0file.cc | 8 ++------ storage/xtradb/srv/srv0mon.cc | 13 ++++++++++++- storage/xtradb/srv/srv0srv.cc | 6 ++---- 14 files changed, 50 insertions(+), 62 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 3678442417a..cee9c7e0534 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -5487,6 +5487,8 @@ fil_io( srv_stats.data_written.add(len); if (fil_page_is_index_page((byte *)buf)) { srv_stats.index_pages_written.inc(); + } else { + srv_stats.non_index_pages_written.inc(); } } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 1273a25a5f5..b790ae76121 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -711,6 +711,8 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, {"num_index_pages_written", (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_non_index_pages_written", + (char*) &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG}, {"num_pages_page_compressed", (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, {"num_page_compressed_trim_op", @@ -16786,18 +16788,6 @@ static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, NULL, NULL, FALSE); #endif /* UNIV_DEBUG */ -/* -static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, - PLUGIN_VAR_OPCMDARG , - "How many percent of compressed pages should be trimmed", - NULL, NULL, 100, 0, 100, 0); -*/ - -static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, - PLUGIN_VAR_OPCMDARG, - "Use page compression also for index pages. Default FALSE.", - NULL, NULL, FALSE); - static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, "Use trim. 
Default FALSE.", @@ -16976,8 +16966,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(limit_optimistic_insert_debug), MYSQL_SYSVAR(trx_purge_view_update_only_debug), #endif /* UNIV_DEBUG */ - // MYSQL_SYSVAR(trim_pct), - MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), #ifdef HAVE_LZ4 MYSQL_SYSVAR(use_lz4), diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index d7c2d6ce531..4d0379a2643 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -165,6 +165,7 @@ enum monitor_id_t { MONITOR_OVLD_PAGE_CREATED, MONITOR_OVLD_PAGES_WRITTEN, MONITOR_OVLD_INDEX_PAGES_WRITTEN, + MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN, MONITOR_OVLD_PAGES_READ, MONITOR_OVLD_BYTE_READ, MONITOR_OVLD_BYTE_WRITTEN, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index b4bb9c09ef6..ac264a7d597 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -111,6 +111,8 @@ struct srv_stats_t { ulint_ctr_64_t page_compression_trim_sect4096; /* Number of index pages written */ ulint_ctr_64_t index_pages_written; + /* Number of non index pages written */ + ulint_ctr_64_t non_index_pages_written; /* Number of pages compressed with page compression */ ulint_ctr_64_t pages_page_compressed; /* Number of TRIM operations induced by page compression */ @@ -236,12 +238,6 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ extern my_bool srv_use_native_aio; -/* Is page compression used only for index pages */ -extern my_bool srv_page_compress_index_pages; - -/* Frequency of trim operations */ -extern long srv_trim_pct; - /* Use trim operation */ extern my_bool srv_use_trim; @@ -901,6 +897,8 @@ struct export_var_t{ by page compression */ ib_int64_t innodb_index_pages_written; /*!< Number of index pages written */ + ib_int64_t innodb_non_index_pages_written; /*!< Number of non index pages + written */ ib_int64_t innodb_pages_page_compressed;/*!< Number of pages compressed by page compression */ ib_int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 0093dd8e266..09340cca68d 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -4464,12 +4464,8 @@ found: slot->page_compression = page_compression; /* If the space is page compressed and this is write operation - and either index compression is enabled or page is not a index - page then we compress the page */ - if (message1 && - type == OS_FILE_WRITE && - page_compression && - (srv_page_compress_index_pages == true || !fil_page_is_index_page(slot->buf))) { + then we compress the page */ + if (message1 && type == OS_FILE_WRITE && page_compression ) { ulint real_len = len; byte* tmp = NULL; diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index 8ba0b977c98..32171182cf9 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -296,6 +296,12 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + {"buffer_non_index_pages_written", "buffer", + "Number of non index pages written (innodb_non_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read 
(innodb_pages_read)", static_cast( @@ -1593,11 +1599,16 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; - /* innodb_index_pages_written, the number of page written */ + /* innodb_index_pages_written, the number of index pages written */ case MONITOR_OVLD_INDEX_PAGES_WRITTEN: value = srv_stats.index_pages_written; break; + /* innodb_non_index_pages_written, the number of non index pages written */ + case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN: + value = srv_stats.non_index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index dcef4a03b76..fe3af72e150 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -146,10 +146,6 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; -/* If this flag is TRUE, then we will use page compression -only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; -UNIV_INTERN long srv_trim_pct = 100; /* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) to the pages */ UNIV_INTERN my_bool srv_use_trim = FALSE; @@ -393,6 +389,7 @@ UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_non_index_pages_written = 0; UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; @@ -1485,6 +1482,7 @@ srv_export_innodb_status(void) export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written; export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index b38b80d9ef2..8e788e71983 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -5488,6 +5488,8 @@ _fil_io( srv_stats.data_written.add(len); if (fil_page_is_index_page((byte *)buf)) { srv_stats.index_pages_written.inc(); + } else { + srv_stats.non_index_pages_written.inc(); } } diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index fc92cc828f7..4436dc3d0e1 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -906,6 +906,8 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, {"num_index_pages_written", (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_non_index_pages_written", + (char*) &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG}, {"num_pages_page_compressed", (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, {"num_page_compressed_trim_op", @@ -17934,24 +17936,12 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, 
srv_use_stacktrace, "Print stacktrace on long semaphore wait (off by default supported only on linux)", NULL, NULL, FALSE); -/* -static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, - PLUGIN_VAR_OPCMDARG , - "How many percent of compressed pages should be trimmed", - NULL, NULL, 100, 0, 100, 0); -*/ - static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, PLUGIN_VAR_RQCMDARG, "Compression level used for zlib compression. 0 is no compression" ", 1 is fastest, 9 is best compression and default is 6.", NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); -static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, - PLUGIN_VAR_OPCMDARG, - "Use page compression also for index pages. Default FALSE.", - NULL, NULL, FALSE); - static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, PLUGIN_VAR_OPCMDARG, "Use trim. Default FALSE.", @@ -18168,8 +18158,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(fake_changes), MYSQL_SYSVAR(locking_fake_changes), MYSQL_SYSVAR(use_stacktrace), - // MYSQL_SYSVAR(trim_pct), - MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), #ifdef HAVE_LZ4 MYSQL_SYSVAR(use_lz4), diff --git a/storage/xtradb/include/srv0mon.h b/storage/xtradb/include/srv0mon.h index 5e5de2c2e0f..10e1fa6188a 100644 --- a/storage/xtradb/include/srv0mon.h +++ b/storage/xtradb/include/srv0mon.h @@ -165,6 +165,7 @@ enum monitor_id_t { MONITOR_OVLD_PAGE_CREATED, MONITOR_OVLD_PAGES_WRITTEN, MONITOR_OVLD_INDEX_PAGES_WRITTEN, + MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN, MONITOR_OVLD_PAGES_READ, MONITOR_OVLD_BYTE_READ, MONITOR_OVLD_BYTE_WRITTEN, diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index 879989770e6..a532f90ec24 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -111,6 +111,8 @@ struct srv_stats_t { ulint_ctr_64_t page_compression_trim_sect4096; /* Number of index pages written */ ulint_ctr_64_t index_pages_written; + /* Number of non index pages written */ + ulint_ctr_64_t non_index_pages_written; /* Number of pages compressed with page compression */ ulint_ctr_64_t pages_page_compressed; /* Number of TRIM operations induced by page compression */ @@ -256,12 +258,6 @@ extern ibool srv_use_native_conditions; #endif /* __WIN__ */ #endif /* !UNIV_HOTBACKUP */ -/* Is page compression used only for index pages */ -extern my_bool srv_page_compress_index_pages; - -/* Frequency of trim operations */ -extern long srv_trim_pct; - /* Use trim operation */ extern my_bool srv_use_trim; @@ -1110,6 +1106,8 @@ struct export_var_t{ by page compression */ ib_int64_t innodb_index_pages_written; /*!< Number of index pages written */ + ib_int64_t innodb_non_index_pages_written; /*!< Number of non index pages + written */ ib_int64_t innodb_pages_page_compressed;/*!< Number of pages compressed by page compression */ ib_int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 525310025da..fc2f5d78c9a 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -4580,12 +4580,8 @@ found: slot->page_compression = page_compression; /* If the space is page compressed and this is write operation - and either index compression is enabled or page is not a index - page then we compress the page */ - if (message1 && - type == OS_FILE_WRITE && - page_compression && - (srv_page_compress_index_pages == true || !fil_page_is_index_page(slot->buf))) { + then we compress the page */ + if (message1 && type == 
OS_FILE_WRITE && page_compression ) { ulint real_len = len; byte* tmp = NULL; diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc index 8ba0b977c98..32171182cf9 100644 --- a/storage/xtradb/srv/srv0mon.cc +++ b/storage/xtradb/srv/srv0mon.cc @@ -296,6 +296,12 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + {"buffer_non_index_pages_written", "buffer", + "Number of non index pages written (innodb_non_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read (innodb_pages_read)", static_cast( @@ -1593,11 +1599,16 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; - /* innodb_index_pages_written, the number of page written */ + /* innodb_index_pages_written, the number of index pages written */ case MONITOR_OVLD_INDEX_PAGES_WRITTEN: value = srv_stats.index_pages_written; break; + /* innodb_non_index_pages_written, the number of non index pages written */ + case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN: + value = srv_stats.non_index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index f1ee459efd7..386dbfddf0b 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -161,10 +161,6 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; -/* If this flag is TRUE, then we will use page compression -only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; -UNIV_INTERN long srv_trim_pct = 100; /* Default compression level if page compression is used and no compression level is set for the table*/ UNIV_INTERN long srv_compress_zlib_level = 6; @@ -515,6 +511,7 @@ UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_non_index_pages_written = 0; UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; @@ -1866,6 +1863,7 @@ srv_export_innodb_status(void) export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written; export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; From 6a756b3a44cbe849a3a5a41b0e134e820d567c6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 21 Mar 2014 15:46:36 +0200 Subject: [PATCH 45/56] Code cleanup: Removed some unnecessary outputs from standard builds (available on special builds UNIV_PAGECOMPRESS_DEBUG and UNIV_MTFLUSH_DEBUG). 
Added a new status variable compress_pages_page_compression_error to count possible compression errors. --- storage/innobase/buf/buf0flu.cc | 2 +- storage/innobase/buf/buf0mtflu.cc | 14 ++++++++------ storage/innobase/fil/fil0pagecompress.cc | 24 +++++++++++++----------- storage/innobase/include/srv0mon.h | 1 + storage/innobase/include/srv0srv.h | 4 ++++ storage/innobase/os/os0file.cc | 2 +- storage/innobase/srv/srv0mon.cc | 8 ++++++++ storage/xtradb/buf/buf0flu.cc | 14 +++++++------- storage/xtradb/buf/buf0mtflu.cc | 14 ++++++++------ storage/xtradb/fil/fil0pagecompress.cc | 24 +++++++++++++----------- storage/xtradb/include/srv0mon.h | 1 + storage/xtradb/include/srv0srv.h | 4 ++++ storage/xtradb/os/os0file.cc | 2 +- storage/xtradb/srv/srv0mon.cc | 8 ++++++++ 14 files changed, 78 insertions(+), 44 deletions(-) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 07bff922e76..280f8cc39a9 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -2461,7 +2461,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( /* Flush pages from end of LRU if required */ n_lru = n_flushed = buf_flush_LRU_tail(); -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG if (n_lru) { fprintf(stderr,"n_lru:%lu ",n_lru); } diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index ea10d09e934..a5937caaf57 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -195,7 +195,7 @@ buf_mtflu_flush_pool_instance( pools based on the assumption that it will help in the retry which will follow the failure. */ -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); #endif return 0; @@ -330,12 +330,12 @@ DECLARE_THREAD(mtflush_io_thread)( while (TRUE) { -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG fprintf(stderr, "InnoDB: Note. 
Thread %lu work queue len %lu return queue len %lu\n", os_thread_get_curr_id(), ib_wqueue_len(mtflush_io->wq), ib_wqueue_len(mtflush_io->wr_cq)); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_MTFLUSH_DEBUG */ mtflush_service_io(mtflush_io, this_thread_data); @@ -374,7 +374,7 @@ buf_mtflu_io_thread_exit(void) mtflush_io->gwt_status = WTHR_KILL_IT; - fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", + fprintf(stderr, "InnoDB: [Note]: Signal mtflush_io_threads to exit [%lu]\n", srv_mtflush_threads); /* Send one exit work item/thread */ @@ -544,6 +544,7 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; +#if UNIV_DEBUG if((int)done_wi->id_usr == 0 && (done_wi->wi_status == WRK_ITEM_SET || done_wi->wi_status == WRK_ITEM_UNSET)) { @@ -553,6 +554,7 @@ buf_mtflu_flush_work_items( done_wi->wr.flush_type); ut_a(0); } +#endif n_flushed+= done_wi->n_flushed; i++; @@ -621,7 +623,7 @@ buf_mtflu_flush_list( cnt_flush[i]); } } -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); #endif @@ -663,7 +665,7 @@ buf_mtflu_flush_LRU_tail(void) } } -#if UNIV_DEBUG +#if UNIV_MTFLUSH_DEBUG fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); #endif diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 8ecb5317088..dfd52d36b8e 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -106,11 +106,11 @@ fil_compress_page( level = page_zip_level; } -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", space_id, fil_space_name(space), len); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ write_size = UNIV_PAGE_SIZE - header_len; @@ -126,6 +126,7 @@ fil_compress_page( "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); *out_len = len; return (buf); } @@ -140,6 +141,7 @@ fil_compress_page( "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); *out_len = len; return (buf); } @@ -197,11 +199,11 @@ fil_compress_page( ut_a((write_size % SECT_SIZE) == 0); } -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", space_id, fil_space_name(space), len, write_size); -#endif +#endif /* UNIV_PAGECOMPRESS_DEBUG */ srv_stats.page_compression_saved.add((len - write_size)); @@ -209,7 +211,7 @@ fil_compress_page( srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); } - //srv_stats.page_compressed_trim_op.inc(); + srv_stats.pages_page_compressed.inc(); *out_len = write_size; @@ -258,10 +260,10 @@ fil_decompress_page( // If no buffer was given, we need to allocate temporal buffer if (page_buf == NULL) { -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: FIL: Compression buffer not given, allocating...\n"); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ in_buf = 
static_cast(ut_malloc(UNIV_PAGE_SIZE)); } else { in_buf = page_buf; @@ -287,11 +289,11 @@ fil_decompress_page( if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Preparing for decompress for len %lu\n", actual_size); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); @@ -310,11 +312,11 @@ fil_decompress_page( ut_error; } -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Decompression succeeded for len %lu \n", len); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ #ifdef HAVE_LZ4 } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 4d0379a2643..2b02428bfb6 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -315,6 +315,7 @@ enum monitor_id_t { MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, /* Index related counters */ MONITOR_MODULE_INDEX, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index ac264a7d597..1d01c7821d0 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -121,6 +121,8 @@ struct srv_stats_t { ulint_ctr_64_t page_compressed_trim_op_saved; /* Number of pages decompressed with page compression */ ulint_ctr_64_t pages_page_decompressed; + /* Number of page compression errors */ + ulint_ctr_64_t pages_page_compression_error; /** Number of data read in total (in bytes) */ ulint_ctr_1_t data_read; @@ -908,6 +910,8 @@ struct export_var_t{ ib_int64_t innodb_pages_page_decompressed;/*!< Number of pages decompressed by page compression */ + ib_int64_t innodb_pages_page_compression_error;/*!< Number of page + compression errors */ }; /** Thread slot in the thread table. 
*/ diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 09340cca68d..8068e05573c 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -6166,7 +6166,7 @@ os_file_trim( *slot->write_size > 0 && len >= *slot->write_size)) { -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", *slot->write_size, trim_len, len); #endif diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index 32171182cf9..f276efdc021 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -926,6 +926,11 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + {"compress_pages_page_compression_error", "compression", + "Number of page compression errors", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1871,6 +1876,9 @@ srv_mon_process_existing_counter( case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: value = srv_stats.pages_page_decompressed; break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR: + value = srv_stats.pages_page_compression_error; + break; default: ut_error; diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index 04fe25afa01..7b502ae3eea 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -721,7 +721,7 @@ buf_flush_write_complete( buf_pool->n_flush[flush_type]--; -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG fprintf(stderr, "n pending flush %lu\n", buf_pool->n_flush[flush_type]); #endif @@ -1863,7 +1863,7 @@ buf_flush_start( /* There is already a flush batch of the same type running */ -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "Error: flush_type %d n_flush %lu init_flush %lu\n", flush_type, buf_pool->n_flush[flush_type], buf_pool->init_flush[flush_type]); #endif @@ -2732,7 +2732,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( /* Flush pages from end of LRU if required */ n_lru = n_flushed = buf_flush_LRU_tail(); -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG if (n_lru) { fprintf(stderr,"n_lru:%lu ",n_lru); } @@ -2743,7 +2743,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( /* Flush pages from flush_list if required */ n_flushed += n_pgc_flush = page_cleaner_flush_pages_if_needed(); -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG if (n_pgc_flush) { fprintf(stderr,"n_pgc_flush:%lu ",n_pgc_flush); } @@ -2760,16 +2760,16 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( MONITOR_FLUSH_BACKGROUND_PAGES, n_flushed); } -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG if (n_pgc_batch) { fprintf(stderr,"n_pgc_batch:%lu ",n_pgc_batch); } #endif } -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG if (n_lru || n_pgc_flush || n_pgc_batch) { - fprintf(stderr,"\n"); + fprintf1(stderr,"\n"); n_lru = n_pgc_flush = n_pgc_batch = 0; } #endif diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index d1ec9979f51..5df4a96d42e 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -199,7 +199,7 @@ buf_mtflu_flush_pool_instance( pools based on the assumption that it will help in the retry which will follow the failure. 
*/ -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); #endif return 0; @@ -337,12 +337,12 @@ DECLARE_THREAD(mtflush_io_thread)( while (TRUE) { -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG fprintf(stderr, "InnoDB: Note. Thread %lu work queue len %lu return queue len %lu\n", os_thread_get_curr_id(), ib_wqueue_len(mtflush_io->wq), ib_wqueue_len(mtflush_io->wr_cq)); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_MTFLUSH_DEBUG */ mtflush_service_io(mtflush_io, this_thread_data); @@ -381,7 +381,7 @@ buf_mtflu_io_thread_exit(void) mtflush_io->gwt_status = WTHR_KILL_IT; - fprintf(stderr, "signal mtflush_io_threads to exit [%lu]\n", + fprintf(stderr, "InnoDB: [Note]: Signal mtflush_io_threads to exit [%lu]\n", srv_mtflush_threads); /* Send one exit work item/thread */ @@ -551,6 +551,7 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; +#if UNIV_DEBUG if((int)done_wi->id_usr == 0 && (done_wi->wi_status == WRK_ITEM_SET || done_wi->wi_status == WRK_ITEM_UNSET)) { @@ -560,6 +561,7 @@ buf_mtflu_flush_work_items( done_wi->wr.flush_type); ut_a(0); } +#endif n_flushed+= done_wi->n_flushed; i++; @@ -631,7 +633,7 @@ buf_mtflu_flush_list( cnt_flush[i]); } } -#ifdef UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); #endif @@ -673,7 +675,7 @@ buf_mtflu_flush_LRU_tail(void) } } -#if UNIV_DEBUG +#if UNIV_MTFLUSH_DEBUG fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); #endif diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index eac889cf7c6..2acdf85b100 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -106,11 +106,11 @@ fil_compress_page( level = page_zip_level; } -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", space_id, fil_space_name(space), len); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ write_size = UNIV_PAGE_SIZE - header_len; @@ -126,6 +126,7 @@ fil_compress_page( "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); *out_len = len; return (buf); } @@ -140,6 +141,7 @@ fil_compress_page( "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); *out_len = len; return (buf); } @@ -193,11 +195,11 @@ fil_compress_page( ut_a((write_size % SECT_SIZE) == 0); } -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", space_id, fil_space_name(space), len, write_size); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ srv_stats.page_compression_saved.add((len - write_size)); @@ -205,7 +207,7 @@ fil_compress_page( srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); } - //srv_stats.page_compressed_trim_op.inc(); + srv_stats.pages_page_compressed.inc(); *out_len = write_size; @@ -254,10 +256,10 @@ 
fil_decompress_page( // If no buffer was given, we need to allocate temporal buffer if (page_buf == NULL) { -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: FIL: Note: Compression buffer not given, allocating...\n"); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); } else { in_buf = page_buf; @@ -283,11 +285,11 @@ fil_decompress_page( if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Preparing for decompress for len %lu\n", actual_size); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); @@ -305,11 +307,11 @@ fil_decompress_page( ut_error; } -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "InnoDB: Note: Decompression succeeded for len %lu \n", len); -#endif /* UNIV_DEBUG */ +#endif /* UNIV_PAGECOMPRESS_DEBUG */ #ifdef HAVE_LZ4 } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); diff --git a/storage/xtradb/include/srv0mon.h b/storage/xtradb/include/srv0mon.h index 10e1fa6188a..8e6975ed68f 100644 --- a/storage/xtradb/include/srv0mon.h +++ b/storage/xtradb/include/srv0mon.h @@ -316,6 +316,7 @@ enum monitor_id_t { MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, /* Index related counters */ MONITOR_MODULE_INDEX, diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index a532f90ec24..be16dfddc72 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -121,6 +121,8 @@ struct srv_stats_t { ulint_ctr_64_t page_compressed_trim_op_saved; /* Number of pages decompressed with page compression */ ulint_ctr_64_t pages_page_decompressed; + /* Number of page compression errors */ + ulint_ctr_64_t pages_page_compression_error; /** Number of data read in total (in bytes) */ ulint_ctr_1_t data_read; @@ -1117,6 +1119,8 @@ struct export_var_t{ ib_int64_t innodb_pages_page_decompressed;/*!< Number of pages decompressed by page compression */ + ib_int64_t innodb_pages_page_compression_error;/*!< Number of page + compression errors */ }; /** Thread slot in the thread table. 
*/ diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index fc2f5d78c9a..646f8a87cbc 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -6234,7 +6234,7 @@ os_file_trim( *slot->write_size > 0 && len >= *slot->write_size)) { -#ifdef UNIV_DEBUG +#ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", *slot->write_size, trim_len, len); #endif diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc index 32171182cf9..f276efdc021 100644 --- a/storage/xtradb/srv/srv0mon.cc +++ b/storage/xtradb/srv/srv0mon.cc @@ -926,6 +926,11 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + {"compress_pages_page_compression_error", "compression", + "Number of page compression errors", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1871,6 +1876,9 @@ srv_mon_process_existing_counter( case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: value = srv_stats.pages_page_decompressed; break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR: + value = srv_stats.pages_page_compression_error; + break; default: ut_error; From a81f8fd5804e84b0679fe11a079d2ced641ee1f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Sat, 22 Mar 2014 11:30:03 +0200 Subject: [PATCH 46/56] Fix test cases to contain new status variables introduced. --- .../sys_vars/r/innodb_monitor_disable_basic.result | 2 ++ .../sys_vars/r/innodb_monitor_enable_basic.result | 10 ++++++++++ .../sys_vars/r/innodb_monitor_reset_all_basic.result | 10 ++++++++++ .../suite/sys_vars/r/innodb_monitor_reset_basic.result | 10 ++++++++++ 4 files changed, 32 insertions(+) diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result index 1f3d38a0420..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result @@ -38,6 +38,7 @@ buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -168,6 +169,7 @@ compress_pages_page_compressed disabled compress_page_compressed_trim_op disabled compress_page_compressed_trim_op_saved disabled compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result index 6f1c4c21d17..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result @@ -37,6 +37,8 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +162,14 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled 
compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result index 6f1c4c21d17..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result @@ -37,6 +37,8 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +162,14 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result index 6f1c4c21d17..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result @@ -37,6 +37,8 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +162,14 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled From f761835b5c13158fd958a5239b346daa09b06cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Tue, 25 Mar 2014 21:31:27 +0200 Subject: [PATCH 47/56] Fix candidate for XtraDB and row compressed tables. 
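
Reviewer note (not part of the original commit message): the one-line change below corrects which counter buf_do_LRU_batch() accumulates for pages actually written out by the LRU scan; the fuller explanation and the separate unzip_LRU eviction counter follow in the next patch. A minimal sketch of the intended accounting, assuming the flush_counter_t fields (flushed, evicted) that appear in the diffs of this series; the helper function is illustrative only.

    /* Sketch only: unzip_LRU work is an eviction, pages written out by
    the LRU list scan are flushes.  Before this patch the written count
    was added to the eviction counter, so buf_flush_LRU_tail()
    undercounted the flushing actually done. */
    static void
    lru_batch_account(flush_counter_t* n, ulint n_unzip_evicted, ulint n_written)
    {
            n->evicted += n_unzip_evicted;
            n->flushed += n_written;        /* was: n->evicted += n_written */
    }
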
--- storage/xtradb/buf/buf0flu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index 7b502ae3eea..53ac9bb9cc7 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1674,7 +1674,7 @@ buf_do_LRU_batch( n->flushed = 0; } - n->evicted += count; + n->flushed += count; } /*******************************************************************//** From 502733803979e2109b6dcdcb3d8c5a0ddd6d2363 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 27 Mar 2014 09:35:24 +0200 Subject: [PATCH 48/56] Fix bug https://code.launchpad.net/~laurynas-biveinis/percona-server/bug1295268 (Inadequate background LRU flushing for write workloads with InnoDB compression). If InnoDB compression is used and the workload has writes, the following situation is possible. The LRU flusher issues an LRU flush request for an instance. buf_do_LRU_batch decides to perform unzip_LRU eviction and this eviction might fully satisfy the request. Then buf_flush_LRU_tail checks the number of flushed pages in the last iteration, finds it to be zero, and wrongly decides not to flush that instance anymore. Fixed by maintaining unzip_LRU eviction counter in struct flush_counter_t variables, and checking it in buf_flush_LRU_tail when deciding whether to stop flushing the current instance. Added test cases for new configuration files to get mysql-test-run suite sys_vars to pass. Fix some small errors. --- .../r/innodb_mtflush_threads_basic.result | 21 +++++++++++ .../sys_vars/r/innodb_use_lz4_basic.result | 3 ++ .../r/innodb_use_mtflush_basic.result | 21 +++++++++++ .../sys_vars/r/innodb_use_trim_basic.result | 33 +++++++++++++++++ .../t/innodb_mtflush_threads_basic.test | 21 +++++++++++ .../sys_vars/t/innodb_use_lz4_basic.test | 5 +++ .../sys_vars/t/innodb_use_mtflush_basic.test | 22 ++++++++++++ .../sys_vars/t/innodb_use_trim_basic.test | 36 +++++++++++++++++++ storage/innobase/handler/ha_innodb.cc | 4 +-- storage/xtradb/buf/buf0flu.cc | 24 ++++++++----- storage/xtradb/handler/ha_innodb.cc | 4 +-- storage/xtradb/include/buf0flu.h | 2 ++ 12 files changed, 184 insertions(+), 12 deletions(-) create mode 100644 mysql-test/suite/sys_vars/r/innodb_mtflush_threads_basic.result create mode 100644 mysql-test/suite/sys_vars/r/innodb_use_lz4_basic.result create mode 100644 mysql-test/suite/sys_vars/r/innodb_use_mtflush_basic.result create mode 100644 mysql-test/suite/sys_vars/r/innodb_use_trim_basic.result create mode 100644 mysql-test/suite/sys_vars/t/innodb_mtflush_threads_basic.test create mode 100644 mysql-test/suite/sys_vars/t/innodb_use_lz4_basic.test create mode 100644 mysql-test/suite/sys_vars/t/innodb_use_mtflush_basic.test create mode 100644 mysql-test/suite/sys_vars/t/innodb_use_trim_basic.test diff --git a/mysql-test/suite/sys_vars/r/innodb_mtflush_threads_basic.result b/mysql-test/suite/sys_vars/r/innodb_mtflush_threads_basic.result new file mode 100644 index 00000000000..75a1cc5262e --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_mtflush_threads_basic.result @@ -0,0 +1,21 @@ +select @@global.innodb_mtflush_threads; +@@global.innodb_mtflush_threads +8 +select @@session.innodb_mtflush_threads; +ERROR HY000: Variable 'innodb_mtflush_threads' is a GLOBAL variable +show global variables like 'innodb_mtflush_threads'; +Variable_name Value +innodb_mtflush_threads 8 +show session variables like 'innodb_mtflush_threads'; +Variable_name Value +innodb_mtflush_threads 8 +select * from 
information_schema.global_variables where variable_name='innodb_mtflush_threads'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_MTFLUSH_THREADS 8 +select * from information_schema.session_variables where variable_name='innodb_mtflush_threads'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_MTFLUSH_THREADS 8 +set global innodb_mtflush_threads=1; +ERROR HY000: Variable 'innodb_mtflush_threads' is a read only variable +set session innodb_mtflush_threads=1; +ERROR HY000: Variable 'innodb_mtflush_threads' is a read only variable diff --git a/mysql-test/suite/sys_vars/r/innodb_use_lz4_basic.result b/mysql-test/suite/sys_vars/r/innodb_use_lz4_basic.result new file mode 100644 index 00000000000..4c3cfa524af --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_use_lz4_basic.result @@ -0,0 +1,3 @@ +select @@global.innodb_use_fallocate; +@@global.innodb_use_fallocate +0 diff --git a/mysql-test/suite/sys_vars/r/innodb_use_mtflush_basic.result b/mysql-test/suite/sys_vars/r/innodb_use_mtflush_basic.result new file mode 100644 index 00000000000..f77abba7ac9 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_use_mtflush_basic.result @@ -0,0 +1,21 @@ +select @@global.innodb_use_mtflush; +@@global.innodb_use_mtflush +0 +select @@session.innodb_use_mtflush; +ERROR HY000: Variable 'innodb_use_mtflush' is a GLOBAL variable +show global variables like 'innodb_use_mtflush'; +Variable_name Value +innodb_use_mtflush OFF +show session variables like 'innodb_use_mtflush'; +Variable_name Value +innodb_use_mtflush OFF +select * from information_schema.global_variables where variable_name='innodb_use_mtflush'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_USE_MTFLUSH OFF +select * from information_schema.session_variables where variable_name='innodb_use_mtflush'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_USE_MTFLUSH OFF +set global innodb_use_mtflush=1; +ERROR HY000: Variable 'innodb_use_mtflush' is a read only variable +set session innodb_use_mtflush=1; +ERROR HY000: Variable 'innodb_use_mtflush' is a read only variable diff --git a/mysql-test/suite/sys_vars/r/innodb_use_trim_basic.result b/mysql-test/suite/sys_vars/r/innodb_use_trim_basic.result new file mode 100644 index 00000000000..63292f5d3c8 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_use_trim_basic.result @@ -0,0 +1,33 @@ +SET @start_use_trim = @@global.innodb_use_trim; +SELECT @start_use_trim; +@start_use_trim +0 +SELECT COUNT(@@GLOBAL.innodb_use_trim); +COUNT(@@GLOBAL.innodb_use_trim) +1 +1 Expected +SET @@GLOBAL.innodb_use_trim=1; +SELECT COUNT(@@GLOBAL.innodb_use_trim); +COUNT(@@GLOBAL.innodb_use_trim) +1 +1 Expected +SELECT IF(@@GLOBAL.innodb_use_trim, 'ON', 'OFF') = VARIABLE_VALUE +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +IF(@@GLOBAL.innodb_use_trim, 'ON', 'OFF') = VARIABLE_VALUE +1 +1 Expected +SELECT COUNT(@@GLOBAL.innodb_use_trim); +COUNT(@@GLOBAL.innodb_use_trim) +1 +1 Expected +SELECT COUNT(VARIABLE_VALUE) +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +COUNT(VARIABLE_VALUE) +1 +1 Expected +SET @@global.innodb_use_trim = @start_use_trim; +SELECT @@global.innodb_use_trim; +@@global.innodb_use_trim +0 diff --git a/mysql-test/suite/sys_vars/t/innodb_mtflush_threads_basic.test b/mysql-test/suite/sys_vars/t/innodb_mtflush_threads_basic.test new file mode 100644 index 00000000000..c8412f969eb --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_mtflush_threads_basic.test @@ -0,0 +1,21 @@ +--source include/have_innodb.inc +# bool readonly + +# +# show values; +# +select 
@@global.innodb_mtflush_threads; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +select @@session.innodb_mtflush_threads; +show global variables like 'innodb_mtflush_threads'; +show session variables like 'innodb_mtflush_threads'; +select * from information_schema.global_variables where variable_name='innodb_mtflush_threads'; +select * from information_schema.session_variables where variable_name='innodb_mtflush_threads'; + +# +# show that it's read-only +# +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set global innodb_mtflush_threads=1; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set session innodb_mtflush_threads=1; diff --git a/mysql-test/suite/sys_vars/t/innodb_use_lz4_basic.test b/mysql-test/suite/sys_vars/t/innodb_use_lz4_basic.test new file mode 100644 index 00000000000..aefa276dcee --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_use_lz4_basic.test @@ -0,0 +1,5 @@ +--source include/have_innodb.inc +# bool readonly +# not on all compilations +select @@global.innodb_use_fallocate; + diff --git a/mysql-test/suite/sys_vars/t/innodb_use_mtflush_basic.test b/mysql-test/suite/sys_vars/t/innodb_use_mtflush_basic.test new file mode 100644 index 00000000000..a9c40b9e522 --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_use_mtflush_basic.test @@ -0,0 +1,22 @@ +--source include/have_innodb.inc +# bool readonly + +# +# show values; +# +select @@global.innodb_use_mtflush; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +select @@session.innodb_use_mtflush; +show global variables like 'innodb_use_mtflush'; +show session variables like 'innodb_use_mtflush'; +select * from information_schema.global_variables where variable_name='innodb_use_mtflush'; +select * from information_schema.session_variables where variable_name='innodb_use_mtflush'; + +# +# show that it's read-only +# +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set global innodb_use_mtflush=1; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set session innodb_use_mtflush=1; + diff --git a/mysql-test/suite/sys_vars/t/innodb_use_trim_basic.test b/mysql-test/suite/sys_vars/t/innodb_use_trim_basic.test new file mode 100644 index 00000000000..c1b0f142179 --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_use_trim_basic.test @@ -0,0 +1,36 @@ +--source include/have_innodb.inc + +SET @start_use_trim = @@global.innodb_use_trim; +SELECT @start_use_trim; + +SELECT COUNT(@@GLOBAL.innodb_use_trim); +--echo 1 Expected + +#################################################################### +# Check if Value can set # +#################################################################### + +SET @@GLOBAL.innodb_use_trim=1; + +SELECT COUNT(@@GLOBAL.innodb_use_trim); +--echo 1 Expected + +################################################################# +# Check if the value in GLOBAL Table matches value in variable # +################################################################# + +SELECT IF(@@GLOBAL.innodb_use_trim, 'ON', 'OFF') = VARIABLE_VALUE +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +--echo 1 Expected + +SELECT COUNT(@@GLOBAL.innodb_use_trim); +--echo 1 Expected + +SELECT COUNT(VARIABLE_VALUE) +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +--echo 1 Expected + +SET @@global.innodb_use_trim = @start_use_trim; +SELECT @@global.innodb_use_trim; \ No newline at end of file diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index b790ae76121..16e33c8901f 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16801,7 
+16801,7 @@ static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, #endif /* HAVE_LZ4 */ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, - PLUGIN_VAR_RQCMDARG, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Number of multi-threaded flush threads", NULL, NULL, MTFLUSH_DEFAULT_WORKER, /* Default setting */ @@ -16810,7 +16810,7 @@ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, 0); static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, - PLUGIN_VAR_OPCMDARG , + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Use multi-threaded flush. Default FALSE.", NULL, NULL, FALSE); diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index 53ac9bb9cc7..f4ba0f10761 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1549,6 +1549,7 @@ buf_flush_LRU_list_batch( n->flushed = 0; n->evicted = 0; + n->unzip_LRU_evicted = 0; ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); @@ -1660,21 +1661,22 @@ buf_do_LRU_batch( flush_counters_t* n) /*!< out: flushed/evicted page counts */ { - ulint count = 0; - if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { - count += buf_free_from_unzip_LRU_list_batch(buf_pool, max); + n->unzip_LRU_evicted + += buf_free_from_unzip_LRU_list_batch(buf_pool, max); + } else { + n->unzip_LRU_evicted = 0; } - if (max > count) { - buf_flush_LRU_list_batch(buf_pool, max - count, limited_scan, - n); + if (max > n->unzip_LRU_evicted) { + buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, + limited_scan, n); } else { n->evicted = 0; n->flushed = 0; } - n->flushed += count; + n->evicted += n->unzip_LRU_evicted; } /*******************************************************************//** @@ -2306,9 +2308,15 @@ buf_flush_LRU_tail(void) requested_pages[i] += lru_chunk_size; + /* If we failed to flush or evict this + instance, do not bother anymore. But take into + account that we might have zero flushed pages + because the flushing request was fully + satisfied by unzip_LRU evictions. */ if (requested_pages[i] >= scan_depth[i] || !(srv_cleaner_eviction_factor - ? n.evicted : n.flushed)) { + ? n.evicted + : (n.flushed + n.unzip_LRU_evicted))) { active_instance[i] = false; remaining_instances--; diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 4436dc3d0e1..09416a990d7 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -17955,7 +17955,7 @@ static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, #endif /* HAVE_LZ4 */ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, - PLUGIN_VAR_RQCMDARG, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Number of multi-threaded flush threads", NULL, NULL, MTFLUSH_DEFAULT_WORKER, /* Default setting */ @@ -17964,7 +17964,7 @@ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, 0); static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, - PLUGIN_VAR_OPCMDARG , + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Use multi-threaded flush. 
Default FALSE.", NULL, NULL, FALSE); diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h index 528ec7b3f64..4cb1446036b 100644 --- a/storage/xtradb/include/buf0flu.h +++ b/storage/xtradb/include/buf0flu.h @@ -40,6 +40,8 @@ extern ibool buf_page_cleaner_is_active; struct flush_counters_t { ulint flushed; /*!< number of dirty pages flushed */ ulint evicted; /*!< number of clean pages evicted */ + ulint unzip_LRU_evicted;/*!< number of uncompressed page images + evicted */ }; From 0b92fe9c188109c980444114f36bc56c119b84e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 27 Mar 2014 12:21:16 +0200 Subject: [PATCH 49/56] Fixed windows compiler errors. --- storage/innobase/fil/fil0pagecompress.cc | 2 +- storage/xtradb/fil/fil0pagecompress.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index dfd52d36b8e..75da02a22a4 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -132,7 +132,7 @@ fil_compress_page( } } else { #endif /* HAVE_LZ4 */ - err = compress2(out_buf+header_len, &write_size, buf, len, level); + err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, level); if (err != Z_OK) { /* If error we leave the actual page as it was */ diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index 2acdf85b100..96c019e3723 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -132,7 +132,7 @@ fil_compress_page( } } else { #endif /* HAVE_LZ4 */ - err = compress2(out_buf+header_len, &write_size, buf, len, level); + err = compress2(out_buf+header_len, (ulong *)&write_size, buf, len, level); if (err != Z_OK) { /* If error we leave the actual page as it was */ From 3b61030dc19cdd63e376db1db91f771051b1ac3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 28 Mar 2014 08:42:53 +0200 Subject: [PATCH 50/56] Fix error on innodb_mtflush_threads parameter. 
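PLUGIN_VAR_NOCMDARG made the option take no argument, so this numeric, read-only variable could no longer be given a value on the command line or in an option file; PLUGIN_VAR_RQCMDARG restores that while keeping the variable read only. Illustrative option-file snippet (variable names as added by this series, used together with innodb_use_mtflush):

  [mysqld]
  innodb_mtflush_threads = 8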
--- storage/innobase/handler/ha_innodb.cc | 2 +- storage/xtradb/handler/ha_innodb.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 16e33c8901f..2d3ac405cbe 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16801,7 +16801,7 @@ static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, #endif /* HAVE_LZ4 */ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of multi-threaded flush threads", NULL, NULL, MTFLUSH_DEFAULT_WORKER, /* Default setting */ diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 09416a990d7..83fd8b28394 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -17955,7 +17955,7 @@ static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, #endif /* HAVE_LZ4 */ static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of multi-threaded flush threads", NULL, NULL, MTFLUSH_DEFAULT_WORKER, /* Default setting */ From 88765c3b4d7357ed5a063abb46cabf72c26e7b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Sat, 29 Mar 2014 16:51:28 +0200 Subject: [PATCH 51/56] Disable failing test cases that fail because of upstream. --- mysql-test/disabled.def | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mysql-test/disabled.def b/mysql-test/disabled.def index e5fa24786e1..d2e839fa39a 100644 --- a/mysql-test/disabled.def +++ b/mysql-test/disabled.def @@ -20,3 +20,5 @@ mysql_embedded : Bug#12561297 2011-05-14 Anitha Dependent on PB2 chang ssl_crl_clients_valid : broken upstream ssl_crl : broken upstream ssl_crl_clrpath : broken upstream +innodb-wl5522-debug-zip : broken upstream +innodb_bug12902967 : broken upstream \ No newline at end of file From 13c73c31c320877bb3a7b7035631ccdd6eee4c2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Tue, 15 Apr 2014 14:28:25 +0300 Subject: [PATCH 52/56] Added support for LZO compression method. 
Removed: innodb_use_lz4 configuration parameter Added: innodb_compression_algorithm configuration parameter 0 = no compression, 1 = ZLIB, 2 = LZ4, 3 = LZO Fixed issue with incorrect trim calculations --- cmake/lz4.cmake | 24 ++-- cmake/lzo.cmake | 35 ++++++ storage/innobase/CMakeLists.txt | 4 +- storage/innobase/fil/fil0pagecompress.cc | 118 ++++++++++++-------- storage/innobase/handler/ha_innodb.cc | 25 +++-- storage/innobase/include/fil0pagecompress.h | 12 +- storage/innobase/include/srv0srv.h | 4 +- storage/innobase/os/os0file.cc | 22 +++- storage/innobase/srv/srv0srv.cc | 3 +- storage/xtradb/CMakeLists.txt | 4 +- storage/xtradb/fil/fil0pagecompress.cc | 117 +++++++++++-------- storage/xtradb/handler/ha_innodb.cc | 25 +++-- storage/xtradb/include/fil0pagecompress.h | 12 +- storage/xtradb/include/srv0srv.h | 4 +- storage/xtradb/os/os0file.cc | 22 +++- storage/xtradb/srv/srv0srv.cc | 3 +- 16 files changed, 297 insertions(+), 137 deletions(-) create mode 100644 cmake/lzo.cmake diff --git a/cmake/lz4.cmake b/cmake/lz4.cmake index 56120e2cdd0..bb2300891eb 100644 --- a/cmake/lz4.cmake +++ b/cmake/lz4.cmake @@ -14,22 +14,22 @@ MACRO (MYSQL_CHECK_LZ4) -CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H) -CHECK_LIBRARY_EXISTS(liblz4.a LZ4_compress_limitedOutput "" HAVE_LZ4_LIB) - -IF(HAVE_LZ4_LIB AND HAVE_LZ4_H) - ADD_DEFINITIONS(-DHAVE_LZ4=1) - LINK_LIBRARIES(liblz4.a) -ENDIF() -ENDMACRO() - -MACRO (MYSQL_CHECK_SHARED_LZ4) - CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H) CHECK_LIBRARY_EXISTS(lz4 LZ4_compress_limitedOutput "" HAVE_LZ4_SHARED_LIB) IF (HAVE_LZ4_SHARED_LIB AND HAVE_LZ4_H) ADD_DEFINITIONS(-DHAVE_LZ4=1) - LINK_LIBRARIES(lz4) + LINK_LIBRARIES(lz4) ENDIF() ENDMACRO() + +MACRO (MYSQL_CHECK_LZ4_STATIC) + + CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H) + CHECK_LIBRARY_EXISTS(liblz4.a LZ4_compress_limitedOutput "" HAVE_LZ4_LIB) + + IF(HAVE_LZ4_LIB AND HAVE_LZ4_H) + ADD_DEFINITIONS(-DHAVE_LZ4=1) + LINK_LIBRARIES(liblz4.a) + ENDIF() +ENDMACRO() \ No newline at end of file diff --git a/cmake/lzo.cmake b/cmake/lzo.cmake new file mode 100644 index 00000000000..596dfdcde8b --- /dev/null +++ b/cmake/lzo.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2014, SkySQL Ab. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +MACRO (MYSQL_CHECK_LZO_STATIC) + +CHECK_INCLUDE_FILES(lzo/lzo1x.h HAVE_LZO_H) +CHECK_LIBRARY_EXISTS(liblzo2.a lzo1x_1_compress "" HAVE_LZO_LIB) + +IF(HAVE_LZO_LIB AND HAVE_LZO_H) + ADD_DEFINITIONS(-DHAVE_LZO=1) + LINK_LIBRARIES(liblzo2.a) +ENDIF() +ENDMACRO() + +MACRO (MYSQL_CHECK_LZO) + +CHECK_INCLUDE_FILES(lzo/lzo1x.h HAVE_LZO_H) +CHECK_LIBRARY_EXISTS(lzo2 lzo1x_1_compress "" HAVE_LZO_LIB) + +IF(HAVE_LZO_LIB AND HAVE_LZO_H) + ADD_DEFINITIONS(-DHAVE_LZO=1) + LINK_LIBRARIES(lzo2) +ENDIF() +ENDMACRO() diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index fa948c449c2..ca64c730051 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -19,8 +19,10 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) INCLUDE(lz4) +INCLUDE(lzo) -MYSQL_CHECK_SHARED_LZ4() +MYSQL_CHECK_LZ4() +MYSQL_CHECK_LZO() # OS tests IF(UNIX) diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 75da02a22a4..e06a789e37b 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -66,6 +66,10 @@ static ulint srv_data_read, srv_data_written; #ifdef HAVE_LZ4 #include "lz4.h" #endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /****************************************************************//** For page compressed pages compress the page before actual write @@ -81,7 +85,9 @@ fil_compress_page( byte* out_buf, /*!< out: compressed buffer */ ulint len, /*!< in: length of input buffer.*/ ulint compression_level, /* in: compression level */ - ulint* out_len) /*!< out: actual length of compressed page */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem) /*!< in: temporal memory used by LZO */ { int err = Z_OK; int level = 0; @@ -114,9 +120,11 @@ fil_compress_page( write_size = UNIV_PAGE_SIZE - header_len; + switch(innodb_compression_algorithm) { #ifdef HAVE_LZ4 - if (srv_use_lz4) { - err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); + case PAGE_LZ4_ALGORITHM: + err = LZ4_compress_limitedOutput((const char *)buf, + (char *)out_buf+header_len, len, write_size); write_size = err; if (err == 0) { @@ -130,8 +138,25 @@ fil_compress_page( *out_len = len; return (buf); } - } else { + break; #endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_1_15_compress( + buf, len, out_buf+header_len, &write_size, lzo_mem); + + if (err != LZO_E_OK || write_size > len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + break; +#endif /* HAVE_LZO */ + case PAGE_ZLIB_ALGORITHM: err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, level); if (err != Z_OK) { @@ -145,9 +170,12 @@ fil_compress_page( *out_len = len; return (buf); } -#ifdef HAVE_LZ4 + break; + + default: + ut_error; + break; } -#endif /* HAVE_LZ4 */ /* Set up the page header */ memcpy(out_buf, buf, FIL_PAGE_DATA); @@ -156,18 +184,7 @@ fil_compress_page( /* Set up the correct page type */ mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); /* Set up the 
flush lsn to be compression algorithm */ - -#ifdef HAVE_LZ4 - if (srv_use_lz4) { - mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); - } else { -#endif /* HAVE_LZ4 */ - mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); - -#ifdef HAVE_LZ4 - } -#endif /* HAVE_LZ4 */ - + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, innodb_compression_algorithm); /* Set up the actual payload lenght */ mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); @@ -176,17 +193,7 @@ fil_compress_page( ut_ad(fil_page_is_compressed(out_buf)); ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); - -#ifdef HAVE_LZ4 - if (srv_use_lz4) { - ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); - } else { -#endif /* HAVE_LZ4 */ - ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); - -#ifdef HAVE_LZ4 - } -#endif /* HAVE_LZ4 */ + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == innodb_compression_algorithm); #endif /* UNIV_DEBUG */ write_size+=header_len; @@ -207,11 +214,6 @@ fil_compress_page( srv_stats.page_compression_saved.add((len - write_size)); - if ((len - write_size) > 0) { - srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); - srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); - } - srv_stats.pages_page_compressed.inc(); *out_len = write_size; @@ -236,6 +238,7 @@ fil_decompress_page( ulint actual_size = 0; ulint compression_alg = 0; byte *in_buf; + ulint olen=0; ut_ad(buf); ut_ad(len); @@ -287,16 +290,16 @@ fil_decompress_page( *write_size = actual_size; } - if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { - #ifdef UNIV_PAGECOMPRESS_DEBUG - fprintf(stderr, - "InnoDB: Note: Preparing for decompress for len %lu\n", - actual_size); + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); #endif /* UNIV_PAGECOMPRESS_DEBUG */ - err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + switch(compression_alg) { + case PAGE_ZLIB_ALGORITHM: + err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); /* If uncompress fails it means that page is corrupted */ if (err != Z_OK) { @@ -311,14 +314,10 @@ fil_decompress_page( ut_error; } + break; -#ifdef UNIV_PAGECOMPRESS_DEBUG - fprintf(stderr, - "InnoDB: Note: Decompression succeeded for len %lu \n", - len); -#endif /* UNIV_PAGECOMPRESS_DEBUG */ #ifdef HAVE_LZ4 - } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { + case PAGE_LZ4_ALGORITHM: err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); if (err != (int)actual_size) { @@ -331,8 +330,26 @@ fil_decompress_page( ut_error; } + break; #endif /* HAVE_LZ4 */ - } else { +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_decompress((const unsigned char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + actual_size,(unsigned char *)in_buf, &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif + default: fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" "InnoDB: but compression 
algorithm %s\n" @@ -341,8 +358,15 @@ fflush(stderr); ut_error; + break; } +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + srv_stats.pages_page_decompressed.inc(); /* Copy the uncompressed page to the buffer pool, not diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 2d3ac405cbe..c82c15193ee 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -101,6 +101,7 @@ this program; if not, write to the Free Software Foundation, Inc., #endif /* UNIV_DEBUG */ #include "fts0priv.h" #include "page0zip.h" +#include "fil0pagecompress.h" #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X)) @@ -16793,12 +16794,20 @@ static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, "Use trim. Default FALSE.", NULL, NULL, FALSE); -#ifdef HAVE_LZ4 -static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, - PLUGIN_VAR_OPCMDARG , - "Use LZ4 for page compression", - NULL, NULL, FALSE); -#endif /* HAVE_LZ4 */ +static MYSQL_SYSVAR_LONG(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. 1 for zlib, 2 for lz4, 3 for lzo", + NULL, NULL, + PAGE_ZLIB_ALGORITHM, + 0, +#if defined(HAVE_LZO) && defined(HAVE_LZ4) + PAGE_ALGORITHM_LAST, +#elif defined(HAVE_LZ4) && !defined(HAVE_LZO) + PAGE_LZ4_ALGORITHM, +#else + PAGE_ZLIB_ALGORITHM, +#endif + 0); static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -16967,9 +16976,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(trx_purge_view_update_only_debug), #endif /* UNIV_DEBUG */ MYSQL_SYSVAR(use_trim), -#ifdef HAVE_LZ4 - MYSQL_SYSVAR(use_lz4), -#endif + MYSQL_SYSVAR(compression_algorithm), MYSQL_SYSVAR(mtflush_threads), MYSQL_SYSVAR(use_mtflush), NULL diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h index c362c0ddcd2..0cc5aeb4678 100644 --- a/storage/innobase/include/fil0pagecompress.h +++ b/storage/innobase/include/fil0pagecompress.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013 SkySQL Ab. All Rights Reserved. +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -22,6 +22,12 @@ this program; if not, write to the Free Software Foundation, Inc., #include "fsp0fsp.h" #include "fsp0pagecompress.h" +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_ALGORITHM_LAST PAGE_LZO_ALGORITHM + /******************************************************************//** @file include/fil0pagecompress.h Helper functions for extracting/storing page compression and @@ -85,7 +91,9 @@ fil_compress_page( byte* out_buf, /*!< out: compressed buffer */ ulint len, /*!< in: length of input buffer.*/ ulint compression_level, /*!< in: compression level */ - ulint* out_len); /*!< out: actual length of compressed page */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem); /*!< in: temporal memory used by LZO */ /****************************************************************//** For page compressed pages decompress the page after actual read diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 1d01c7821d0..cfa94242200 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -249,8 +249,8 @@ extern my_bool srv_use_posix_fallocate; /* Use atomic writes i.e disable doublewrite buffer */ extern my_bool srv_use_atomic_writes; -/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ -extern my_bool srv_use_lz4; +/* Compression algorithm*/ +extern long innodb_compression_algorithm; /* Number of flush threads */ #define MTFLUSH_MAX_WORKER 64 diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 8068e05573c..ce1b42e670e 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -74,6 +74,10 @@ Created 10/21/1995 Heikki Tuuri # endif #endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -221,6 +225,12 @@ struct os_aio_slot_t{ int n_bytes; /* bytes written/read. 
*/ int ret; /* AIO return code */ #endif /* WIN_ASYNC_IO */ +#ifdef HAVE_LZO + byte lzo_mem[LZO1X_1_15_MEM_COMPRESS]; +#else + byte lzo_mem; /* Temporal memory used by LZO */ +#endif + }; /** The asynchronous i/o array structure */ @@ -4480,7 +4490,15 @@ found: ut_ad(slot->page_buf); - tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len); + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + &real_len, + slot->lzo_mem + ); /* If compression succeeded, set up the length and buffer */ if (tmp != buf) { diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index fe3af72e150..a9cc7beb6b0 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -71,6 +71,7 @@ Created 10/8/1995 Heikki Tuuri #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /* The following is the maximum allowed duration of a lock wait. */ UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600; @@ -154,7 +155,7 @@ UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; /* If this flag is TRUE, then we disable doublewrite buffer */ UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; /* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ -UNIV_INTERN my_bool srv_use_lz4 = FALSE; +UNIV_INTERN long innodb_compression_algorithm = PAGE_ZLIB_ALGORITHM; /* Number of threads used for multi-threaded flush */ UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; /* If this flag is TRUE, then we will use multi threaded flush. */ diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 10118cce0c1..7e6e5a048e2 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -19,8 +19,10 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) INCLUDE(lz4) +INCLUDE(lzo) -MYSQL_CHECK_LZ4() +MYSQL_CHECK_LZ4_STATIC() +MYSQL_CHECK_LZO_STATIC() # OS tests IF(UNIX) diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index 96c019e3723..e06a789e37b 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -66,6 +66,10 @@ static ulint srv_data_read, srv_data_written; #ifdef HAVE_LZ4 #include "lz4.h" #endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /****************************************************************//** For page compressed pages compress the page before actual write @@ -81,7 +85,9 @@ fil_compress_page( byte* out_buf, /*!< out: compressed buffer */ ulint len, /*!< in: length of input buffer.*/ ulint compression_level, /* in: compression level */ - ulint* out_len) /*!< out: actual length of compressed page */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem) /*!< in: temporal memory used by LZO */ { int err = Z_OK; int level = 0; @@ -114,9 +120,11 @@ fil_compress_page( write_size = UNIV_PAGE_SIZE - header_len; + switch(innodb_compression_algorithm) { #ifdef HAVE_LZ4 - if (srv_use_lz4) { - err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); + case PAGE_LZ4_ALGORITHM: + err = LZ4_compress_limitedOutput((const char *)buf, + (char *)out_buf+header_len, len, write_size); write_size = err; if (err == 0) { @@ -130,9 +138,26 @@ fil_compress_page( *out_len = len; return (buf); } - } else { + break; #endif 
/* HAVE_LZ4 */ - err = compress2(out_buf+header_len, (ulong *)&write_size, buf, len, level); +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_1_15_compress( + buf, len, out_buf+header_len, &write_size, lzo_mem); + + if (err != LZO_E_OK || write_size > len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + break; +#endif /* HAVE_LZO */ + case PAGE_ZLIB_ALGORITHM: + err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, level); if (err != Z_OK) { /* If error we leave the actual page as it was */ @@ -145,9 +170,12 @@ fil_compress_page( *out_len = len; return (buf); } -#ifdef HAVE_LZ4 + break; + + default: + ut_error; + break; } -#endif /* HAVE_LZ4 */ /* Set up the page header */ memcpy(out_buf, buf, FIL_PAGE_DATA); @@ -156,15 +184,7 @@ fil_compress_page( /* Set up the correct page type */ mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); /* Set up the flush lsn to be compression algorithm */ -#ifdef HAVE_LZ4 - if (srv_use_lz4) { - mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); - } else { -#endif /* HAVE_LZ4 */ - mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); -#ifdef HAVE_LZ4 - } -#endif /* HAVE_LZ4 */ + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, innodb_compression_algorithm); /* Set up the actual payload lenght */ mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); @@ -173,16 +193,7 @@ fil_compress_page( ut_ad(fil_page_is_compressed(out_buf)); ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); - -#ifdef HAVE_LZ4 - if (srv_use_lz4) { - ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); - } else { -#endif /* HAVE_LZ4 */ - ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); -#ifdef HAVE_LZ4 - } -#endif /* HAVE_LZ4 */ + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == innodb_compression_algorithm); #endif /* UNIV_DEBUG */ write_size+=header_len; @@ -203,11 +214,6 @@ fil_compress_page( srv_stats.page_compression_saved.add((len - write_size)); - if ((len - write_size) > 0) { - srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); - srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); - } - srv_stats.pages_page_compressed.inc(); *out_len = write_size; @@ -232,6 +238,7 @@ fil_decompress_page( ulint actual_size = 0; ulint compression_alg = 0; byte *in_buf; + ulint olen=0; ut_ad(buf); ut_ad(len); @@ -258,7 +265,7 @@ fil_decompress_page( if (page_buf == NULL) { #ifdef UNIV_PAGECOMPRESS_DEBUG fprintf(stderr, - "InnoDB: FIL: Note: Compression buffer not given, allocating...\n"); + "InnoDB: Note: FIL: Compression buffer not given, allocating...\n"); #endif /* UNIV_PAGECOMPRESS_DEBUG */ in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); } else { @@ -283,14 +290,15 @@ fil_decompress_page( *write_size = actual_size; } - if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { - #ifdef UNIV_PAGECOMPRESS_DEBUG - fprintf(stderr, - "InnoDB: Note: Preparing for decompress for len %lu\n", - actual_size); + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); #endif /* UNIV_PAGECOMPRESS_DEBUG */ + + switch(compression_alg) { + case PAGE_ZLIB_ALGORITHM: 
err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); /* If uncompress fails it means that page is corrupted */ @@ -306,14 +314,10 @@ fil_decompress_page( ut_error; } + break; -#ifdef UNIV_PAGECOMPRESS_DEBUG - fprintf(stderr, - "InnoDB: Note: Decompression succeeded for len %lu \n", - len); -#endif /* UNIV_PAGECOMPRESS_DEBUG */ #ifdef HAVE_LZ4 - } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { + case PAGE_LZ4_ALGORITHM: err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); if (err != (int)actual_size) { @@ -326,8 +330,26 @@ fil_decompress_page( ut_error; } + break; #endif /* HAVE_LZ4 */ - } else { +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_decompress((const unsigned char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + actual_size,(unsigned char *)in_buf, &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif + default: fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" "InnoDB: but compression algorithm %s\n" @@ -336,8 +358,15 @@ fil_decompress_page( fflush(stderr); ut_error; + break; } +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + srv_stats.pages_page_decompressed.inc(); /* Copy the uncompressed page to the buffer pool, not diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 83fd8b28394..25b96be43b7 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -103,6 +103,7 @@ this program; if not, write to the Free Software Foundation, Inc., #endif /* UNIV_DEBUG */ #include "fts0priv.h" #include "page0zip.h" +#include "fil0pagecompress.h" #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X)) @@ -17947,12 +17948,20 @@ static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, "Use trim. Default FALSE.", NULL, NULL, FALSE); -#ifdef HAVE_LZ4 -static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, - PLUGIN_VAR_OPCMDARG , - "Use LZ4 for page compression", - NULL, NULL, FALSE); -#endif /* HAVE_LZ4 */ +static MYSQL_SYSVAR_LONG(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. 
1 for zlib, 2 for lz4, 3 for lzo", + NULL, NULL, + PAGE_ZLIB_ALGORITHM, + 0, +#if defined(HAVE_LZO) && defined(HAVE_LZ4) + PAGE_ALGORITHM_LAST, +#elif defined(HAVE_LZ4) && !defined(HAVE_LZO) + PAGE_LZ4_ALGORITHM, +#else + PAGE_ZLIB_ALGORITHM, +#endif + 0); static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -18159,9 +18168,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(locking_fake_changes), MYSQL_SYSVAR(use_stacktrace), MYSQL_SYSVAR(use_trim), -#ifdef HAVE_LZ4 - MYSQL_SYSVAR(use_lz4), -#endif + MYSQL_SYSVAR(compression_algorithm), MYSQL_SYSVAR(mtflush_threads), MYSQL_SYSVAR(use_mtflush), NULL diff --git a/storage/xtradb/include/fil0pagecompress.h b/storage/xtradb/include/fil0pagecompress.h index c362c0ddcd2..0cc5aeb4678 100644 --- a/storage/xtradb/include/fil0pagecompress.h +++ b/storage/xtradb/include/fil0pagecompress.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013 SkySQL Ab. All Rights Reserved. +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -22,6 +22,12 @@ this program; if not, write to the Free Software Foundation, Inc., #include "fsp0fsp.h" #include "fsp0pagecompress.h" +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_ALGORITHM_LAST PAGE_LZO_ALGORITHM + /******************************************************************//** @file include/fil0pagecompress.h Helper functions for extracting/storing page compression and @@ -85,7 +91,9 @@ fil_compress_page( byte* out_buf, /*!< out: compressed buffer */ ulint len, /*!< in: length of input buffer.*/ ulint compression_level, /*!< in: compression level */ - ulint* out_len); /*!< out: actual length of compressed page */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem); /*!< in: temporal memory used by LZO */ /****************************************************************//** For page compressed pages decompress the page after actual read diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index be16dfddc72..ea8afd450dd 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -269,8 +269,8 @@ extern my_bool srv_use_posix_fallocate; /* Use atomic writes i.e disable doublewrite buffer */ extern my_bool srv_use_atomic_writes; -/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ -extern my_bool srv_use_lz4; +/* Compression algorithm*/ +extern long innodb_compression_algorithm; /* Number of flush threads */ #define MTFLUSH_MAX_WORKER 64 diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 646f8a87cbc..a3307fa0ba2 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc..
Those modifications are @@ -80,6 +80,10 @@ Created 10/21/1995 Heikki Tuuri # endif #endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -230,6 +234,12 @@ struct os_aio_slot_t{ int n_bytes; /* bytes written/read. */ int ret; /* AIO return code */ #endif /* WIN_ASYNC_IO */ +#ifdef HAVE_LZO + byte lzo_mem[LZO1X_1_15_MEM_COMPRESS]; +#else + byte lzo_mem; /* Temporal memory used by LZO */ +#endif + }; /** The asynchronous i/o array structure */ @@ -4596,7 +4606,15 @@ found: ut_ad(slot->page_buf); - tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len); + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + &real_len, + slot->lzo_mem + ); /* If compression succeeded, set up the length and buffer */ if (tmp != buf) { diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index 386dbfddf0b..e70a2bd0dab 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -73,6 +73,7 @@ Created 10/8/1995 Heikki Tuuri #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */ ibool innobase_thd_is_idle(const void* thd); @@ -172,7 +173,7 @@ UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; /* If this flag is TRUE, then we disable doublewrite buffer */ UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; /* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ -UNIV_INTERN my_bool srv_use_lz4 = FALSE; +UNIV_INTERN long innodb_compression_algorithm = PAGE_ZLIB_ALGORITHM; /* Number of threads used for multi-threaded flush */ UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; /* If this flag is TRUE, then we will use multi threaded flush. */ From 2f46e5b9fc51f0c427634f935b7d922047023628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 16 Apr 2014 16:55:36 +0300 Subject: [PATCH 53/56] MDEV-6070: FusionIO: Failure to create a table with ATOMIC_WRITES option leaves the database in inconsistent state. Analysis: The problem was that the atomic writes variable had an incorrect type in some places, leading to e.g. the OFF option not being recognized. Furthermore, some error checking code was missing from both the InnoDB and XtraDB engines. Finally, when a table is created we have already created the .ibd file, and if we can't set atomic writes it stays there. Fix: Change the atomic writes variable type to ulint as it should be. Fix: Add proper error code checking of OS errors in both InnoDB and XtraDB. Fix: Remove the .ibd file when atomic writes can't be enabled for a new table.
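A minimal illustration of the scenario this fixes, assuming the ATOMIC_WRITES table option takes the form shown here:

  CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB ATOMIC_WRITES=ON;

On a device where the atomic-write ioctl is not supported the CREATE fails, but previously the newly created t1.ibd was left behind; with this change the file is removed and the OS error is reported.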
--- storage/innobase/fil/fil0fil.cc | 6 +-- storage/innobase/os/os0file.cc | 83 ++++++++++++++++++++++++++------- storage/xtradb/fil/fil0fil.cc | 15 ++++-- storage/xtradb/os/os0file.cc | 81 ++++++++++++++++++++++++++------ 4 files changed, 146 insertions(+), 39 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index cee9c7e0534..888b2c659b9 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -745,7 +745,7 @@ fil_node_open_file( ulint space_id; ulint flags=0; ulint page_size; - ibool atomic_writes=FALSE; + ulint atomic_writes=0; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -3425,7 +3425,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); - bool atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3720,7 +3720,7 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; - ibool atomic_writes = FALSE; + ulint atomic_writes = 0; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index ce1b42e670e..3020e7b1a53 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -399,9 +399,8 @@ os_file_set_atomic_writes( if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { - fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " - "file %s on non-supported platform! Please restart with " - "innodb_use_atomic_writes disabled.\n", name); + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); os_file_handle_error_no_exit(name, "ioctl", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -409,8 +408,7 @@ os_file_set_atomic_writes( return(TRUE); #else fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " - "non-supported platform! 
Please restart with " - "innodb_use_atomic_writes disabled.\n"); + "file %s on non-supported platform!\n", name); return(FALSE); #endif } @@ -561,6 +559,19 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -620,11 +631,14 @@ os_file_get_last_error_low( fprintf(stderr, "InnoDB: The error means mysqld does not have" " the access rights to\n" - "InnoDECANCELEDB: the directory.\n"); - } else if (err == ECANCELED) { - fprintf(stderr, - "InnoDB: Operation canceled (%d):%s\n", - err, strerror(err)); + "InnoDB: the directory.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } if(srv_use_atomic_writes) { fprintf(stderr, @@ -663,6 +677,7 @@ os_file_get_last_error_low( case EISDIR: return(OS_FILE_PATH_ERROR); case ECANCELED: + case ENOTTY: return(OS_FILE_OPERATION_NOT_SUPPORTED); case EAGAIN: if (srv_use_native_aio) { @@ -1521,13 +1536,21 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != INVALID_HANDLE_VALUE && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - CloseHandle(file); + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); *success = FALSE; file = INVALID_HANDLE_VALUE; + } } *success = (file != INVALID_HANDLE_VALUE); @@ -1590,13 +1613,21 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != -1 && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - *success = FALSE; - close(file); - file = -1; + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } } @@ -1836,13 +1867,21 @@ os_file_create_func( } while (retry); + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != INVALID_HANDLE_VALUE && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - CloseHandle(file); + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); *success = FALSE; file = INVALID_HANDLE_VALUE; + } } #else /* __WIN__ */ int create_flag; @@ -1972,13 +2011,21 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != -1 && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - *success = FALSE; - close(file); - file = -1; + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } } #endif /* __WIN__ */ diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index 8e788e71983..b30a85a8597 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -746,7 +746,7 @@ fil_node_open_file( ulint space_id; ulint flags=0; ulint page_size; - ibool atomic_writes=FALSE; + ulint atomic_writes=0; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -3288,6 +3288,8 @@ fil_create_link_file( } else if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; + } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; } else { err = DB_ERROR; } @@ -3448,7 +3450,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); - bool atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3509,6 +3511,11 @@ fil_create_new_single_table_tablespace( goto error_exit_3; } + if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + goto error_exit_3; + } + if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; goto error_exit_3; @@ -3735,7 +3742,7 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; - ibool atomic_writes = FALSE; + ulint atomic_writes = 0; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -3746,6 +3753,8 @@ fil_open_single_table_tablespace( return(DB_CORRUPTION); } + atomic_writes = fsp_flags_get_atomic_writes(flags); + /* If the tablespace was relocated, we do not compare the DATA_DIR flag */ ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index a3307fa0ba2..f7677140c9a 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -628,10 +628,13 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); - } else if (err == ECANCELED) { - fprintf(stderr, - 
"InnoDB: Operation canceled (%d):%s\n", - err, strerror(err)); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } if(srv_use_atomic_writes) { fprintf(stderr, @@ -698,6 +701,20 @@ os_file_get_last_error_low( "InnoDB: The error means mysqld does not have" " the access rights to\n" "InnoDB: the directory.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { if (strerror(err) != NULL) { fprintf(stderr, @@ -735,6 +752,7 @@ os_file_get_last_error_low( } break; case ECANCELED: + case ENOTTY: return(OS_FILE_OPERATION_NOT_SUPPORTED); case EINTR: if (srv_use_native_aio) { @@ -1591,13 +1609,21 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != INVALID_HANDLE_VALUE && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - CloseHandle(file); + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); *success = FALSE; file = INVALID_HANDLE_VALUE; + } } *success = (file != INVALID_HANDLE_VALUE); @@ -1660,13 +1686,21 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != -1 && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - *success = FALSE; - close(file); - file = -1; + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } } #endif /* __WIN__ */ @@ -1752,15 +1786,16 @@ os_file_set_atomic_writes( if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); os_file_handle_error_no_exit(name, "ioctl(DFS_IOCTL_ATOMIC_WRITE_SET)", FALSE, __FILE__, __LINE__); return(FALSE); } return(TRUE); #else - ib_logf(IB_LOG_LEVEL_ERROR, - "trying to enable atomic writes on non-supported platform! " - "Please restart with innodb_use_atomic_writes disabled.\n"); + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); return(FALSE); #endif } @@ -1951,13 +1986,21 @@ os_file_create_func( } while (retry); + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != INVALID_HANDLE_VALUE && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - CloseHandle(file); + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); *success = FALSE; file = INVALID_HANDLE_VALUE; + } } #else /* __WIN__ */ @@ -2090,13 +2133,21 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ if (file != -1 && (awrites == ATOMIC_WRITES_ON || (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - *success = FALSE; - close(file); - file = -1; + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } } From 2d340f9a677bb8dc24e9e1601c613a6c10f5c3c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 23 Apr 2014 19:23:11 +0300 Subject: [PATCH 54/56] Fixed bug on free buffer space calculation when LZO is used. Fixed bug on function call when InnoDB plugin is used. --- storage/innobase/fil/fil0fil.cc | 2 +- storage/innobase/fil/fil0pagecompress.cc | 4 +++- storage/innobase/include/os0file.h | 4 +++- storage/innobase/os/os0file.cc | 2 +- storage/xtradb/fil/fil0pagecompress.cc | 4 +++- storage/xtradb/os/os0file.cc | 1 + 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 8c48adf7c66..9658b9ddcb0 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -4997,7 +4997,7 @@ retry: "space for file \'%s\' failed. 
Current size " INT64PF ", desired size " INT64PF "\n", node->name, start_offset, len+start_offset); - os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE); + os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE, __FILE__, __LINE__); success = FALSE; } else { success = TRUE; diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index e06a789e37b..b2d201e6a59 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -70,6 +70,8 @@ static ulint srv_data_read, srv_data_written; #include "lzo/lzo1x.h" #endif +/* Used for debugging */ +//#define UNIV_PAGECOMPRESS_DEBUG 1 /****************************************************************//** For page compressed pages compress the page before actual write @@ -145,7 +147,7 @@ fil_compress_page( err = lzo1x_1_15_compress( buf, len, out_buf+header_len, &write_size, lzo_mem); - if (err != LZO_E_OK || write_size > len) { + if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { fprintf(stderr, "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu", space_id, fil_space_name(space), len, err, write_size); diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 530fc536f01..2f22aa73508 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1318,8 +1318,10 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent);/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ #ifndef UNIV_NONINL #include "os0file.ic" diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index c33829b7fe1..cd7b4161cb2 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -370,7 +370,6 @@ os_slot_alloc_page_buf( /****************************************************************//** Does error handling when a file operation fails. 
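
The write_size bound changed in the two fil_compress_page() hunks above exists because LZO1X output can be larger than its input: the library's documented worst case is in_len + in_len/16 + 64 + 3 bytes. The output buffer therefore has to be sized for that worst case, and "compression succeeded" has to mean "the result plus the page-compression header still fits in one page", not "the result is no longer than the input". A sketch of both rules follows; buf, len, header_len and lzo_mem follow the surrounding code, out_cap and out_buf are illustrative names.

    /* Documented LZO1X worst case: output can grow to
    in_len + in_len/16 + 64 + 3 bytes. */
    ulint       out_cap = UNIV_PAGE_SIZE + UNIV_PAGE_SIZE / 16 + 64 + 3;
    byte*       out_buf = static_cast<byte*>(ut_malloc(header_len + out_cap));
    lzo_uint    write_size = 0;

    int err = lzo1x_1_15_compress(buf, len, out_buf + header_len,
                                  &write_size, lzo_mem);

    if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE - header_len) {
            /* Not compressible enough to be worth it: write the page
            out uncompressed instead. */
            memcpy(out_buf, buf, len);
            write_size = len;
    }
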
@return TRUE if we should retry the operation */ -static ibool os_file_handle_error_no_exit( /*=========================*/ @@ -6337,6 +6336,7 @@ os_slot_alloc_page_buf( byte* cbuf2; byte* cbuf; + /* We allocate extra to avoid memory overwrite on compression */ cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); slot->page_compression_page = static_cast(cbuf2); diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index e06a789e37b..b2d201e6a59 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -70,6 +70,8 @@ static ulint srv_data_read, srv_data_written; #include "lzo/lzo1x.h" #endif +/* Used for debugging */ +//#define UNIV_PAGECOMPRESS_DEBUG 1 /****************************************************************//** For page compressed pages compress the page before actual write @@ -145,7 +147,7 @@ fil_compress_page( err = lzo1x_1_15_compress( buf, len, out_buf+header_len, &write_size, lzo_mem); - if (err != LZO_E_OK || write_size > len) { + if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { fprintf(stderr, "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu", space_id, fil_space_name(space), len, err, write_size); diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 36136614814..657a3a8d050 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -6406,6 +6406,7 @@ os_slot_alloc_page_buf( byte* cbuf2; byte* cbuf; + /* We allocate extra to avoid memory overwrite on compression */ cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); slot->page_compression_page = static_cast(cbuf2); From d6afa8004ec48e4c25d5dfed804d0556cdec587f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 28 Apr 2014 07:52:41 +0300 Subject: [PATCH 55/56] Fixed small error on compression failure error text. 
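
The os_slot_alloc_page_buf() hunks above supply the slack that makes the previous fix safe: each AIO slot reserves two pages, keeps the raw pointer so the allocation can be freed later, and hands only the page-aligned pointer to the compression and I/O code. (In the original source the casts read static_cast<byte*>; the template arguments were lost in this rendering of the patch.) A sketch of the pattern; slot->page_buf is an assumed name for wherever the aligned pointer is stored.

    /* Reserve two pages so that alignment plus a worst-case compression
    result cannot run past the end of the buffer. */
    byte*   raw  = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
    byte*   page = static_cast<byte*>(ut_align(raw, UNIV_PAGE_SIZE));

    slot->page_compression_page = raw;  /* original allocation, kept
                                        for ut_free() */
    slot->page_buf = page;              /* aligned buffer used for
                                        compression and async I/O */
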
--- storage/innobase/fil/fil0pagecompress.cc | 2 +- storage/xtradb/fil/fil0pagecompress.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index b2d201e6a59..ce7063bc688 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -149,7 +149,7 @@ fil_compress_page( if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { fprintf(stderr, - "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu", + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", space_id, fil_space_name(space), len, err, write_size); srv_stats.pages_page_compression_error.inc(); *out_len = len; diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index b2d201e6a59..ce7063bc688 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -149,7 +149,7 @@ fil_compress_page( if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { fprintf(stderr, - "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu", + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", space_id, fil_space_name(space), len, err, write_size); srv_stats.pages_page_compression_error.inc(); *out_len = len; From 972a14b59a0ec12b01c9a7f5c8867294fd4f40db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 16 May 2014 15:30:13 +0300 Subject: [PATCH 56/56] Code cleanup after review. --- storage/innobase/buf/buf0mtflu.cc | 11 +++++++---- storage/xtradb/buf/buf0mtflu.cc | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc index a5937caaf57..5a1769e3b70 100644 --- a/storage/innobase/buf/buf0mtflu.cc +++ b/storage/innobase/buf/buf0mtflu.cc @@ -113,7 +113,6 @@ typedef struct wrk_itm ulint n_flushed; /*!< Flushed pages count */ os_thread_id_t id_usr; /*!< Thread-id currently working */ wrk_status_t wi_status; /*!< Work item status */ - struct wrk_itm *next; /*!< Next work item */ mem_heap_t *wheap; /*!< Heap were to allocate memory for queue nodes */ mem_heap_t *rheap; @@ -262,6 +261,9 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_SET; } +#ifdef UNIV_MTFLUSH_DEBUG + ut_a(work_item->id_usr == 0); +#endif work_item->id_usr = os_thread_get_curr_id(); /* This works as a producer/consumer model, where in tasks are @@ -365,7 +367,6 @@ buf_mtflu_io_thread_exit(void) /* Allocate work items for shutdown message */ work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); - memset(work_item, 0, sizeof(wrk_t)*srv_mtflush_threads); /* Confirm if the io-thread KILL is in progress, bailout */ if (mtflush_io->gwt_status == WTHR_KILL_IT) { @@ -383,6 +384,7 @@ buf_mtflu_io_thread_exit(void) work_item[i].wi_status = WRK_ITEM_EXIT; work_item[i].wheap = mtflush_io->wheap; work_item[i].rheap = mtflush_io->rheap; + work_item[i].id_usr = 0; ib_wqueue_add(mtflush_io->wq, (void *)&(work_item[i]), @@ -518,7 +520,6 @@ buf_mtflu_flush_work_items( node items areallocated */ work_heap = mem_heap_create(0); reply_heap = mem_heap_create(0); - memset(work_item, 0, sizeof(wrk_t)*MTFLUSH_MAX_WORKER); for(i=0;iwq, (void *)(work_item + i), @@ -544,7 +547,7 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; 
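
The buf0mtflu.cc cleanup above drops the unused wrk_itm::next link and the blanket memset() of the work-item array; instead id_usr is reset explicitly when an item is queued, and the new UNIV_MTFLUSH_DEBUG assertion in mtflush_service_io() catches items that reach a worker without that reset. A sketch of the hand-off those pieces describe, using only names visible in the hunks; ib_wqueue_wait() is assumed to be the blocking pop that pairs with the ib_wqueue_add() calls shown above.

    /* Producer: publish one work item per buffer pool instance. */
    work_item[i].wi_status = WRK_ITEM_SET;
    work_item[i].id_usr    = 0;         /* no worker owns it yet */
    ib_wqueue_add(mtflush_io->wq, (void*) &work_item[i], work_heap);

    /* Worker: claim the item before doing the flush. */
    wrk_t*  wi = static_cast<wrk_t*>(ib_wqueue_wait(mtflush_io->wq));
    #ifdef UNIV_MTFLUSH_DEBUG
    ut_a(wi->id_usr == 0);              /* must arrive unclaimed */
    #endif
    wi->id_usr = os_thread_get_curr_id();
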
-#if UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG if((int)done_wi->id_usr == 0 && (done_wi->wi_status == WRK_ITEM_SET || done_wi->wi_status == WRK_ITEM_UNSET)) { diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc index 5df4a96d42e..b14b83aa5d0 100644 --- a/storage/xtradb/buf/buf0mtflu.cc +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -113,7 +113,6 @@ typedef struct wrk_itm ulint n_flushed; /*!< Flushed pages count */ os_thread_id_t id_usr; /*!< Thread-id currently working */ wrk_status_t wi_status; /*!< Work item status */ - struct wrk_itm *next; /*!< Next work item */ mem_heap_t *wheap; /*!< Heap were to allocate memory for queue nodes */ mem_heap_t *rheap; @@ -269,6 +268,9 @@ mtflush_service_io( work_item->wi_status = WRK_ITEM_SET; } +#ifdef UNIV_MTFLUSH_DEBUG + ut_a(work_item->id_usr == 0); +#endif work_item->id_usr = os_thread_get_curr_id(); /* This works as a producer/consumer model, where in tasks are @@ -372,7 +374,6 @@ buf_mtflu_io_thread_exit(void) /* Allocate work items for shutdown message */ work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); - memset(work_item, 0, sizeof(wrk_t)*srv_mtflush_threads); /* Confirm if the io-thread KILL is in progress, bailout */ if (mtflush_io->gwt_status == WTHR_KILL_IT) { @@ -390,6 +391,7 @@ buf_mtflu_io_thread_exit(void) work_item[i].wi_status = WRK_ITEM_EXIT; work_item[i].wheap = mtflush_io->wheap; work_item[i].rheap = mtflush_io->rheap; + work_item[i].id_usr = 0; ib_wqueue_add(mtflush_io->wq, (void *)&(work_item[i]), @@ -525,7 +527,6 @@ buf_mtflu_flush_work_items( node items areallocated */ work_heap = mem_heap_create(0); reply_heap = mem_heap_create(0); - memset(work_item, 0, sizeof(wrk_t)*MTFLUSH_MAX_WORKER); for(i=0;iwq, (void *)(work_item + i), @@ -551,7 +554,7 @@ buf_mtflu_flush_work_items( if (done_wi != NULL) { per_pool_pages_flushed[i] = done_wi->n_flushed; -#if UNIV_DEBUG +#ifdef UNIV_MTFLUSH_DEBUG if((int)done_wi->id_usr == 0 && (done_wi->wi_status == WRK_ITEM_SET || done_wi->wi_status == WRK_ITEM_UNSET)) {
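
The last pair of hunks replaces "#if UNIV_DEBUG" with "#ifdef UNIV_MTFLUSH_DEBUG", so the check on items coming back from the reply queue is tied to the dedicated multi-threaded-flush debug flag, tested by definition like the other UNIV_MTFLUSH_DEBUG blocks this patch adds, rather than running in every debug build. A sketch of the guarded check; the condition follows the hunk, the diagnostic message is hypothetical.

    #ifdef UNIV_MTFLUSH_DEBUG
            if ((int) done_wi->id_usr == 0
                && (done_wi->wi_status == WRK_ITEM_SET
                    || done_wi->wi_status == WRK_ITEM_UNSET)) {
                    /* The reply came back although no worker thread
                    ever claimed the item. */
                    fprintf(stderr, "InnoDB: Warning: work item %lu was"
                            " never processed\n", (ulong) i);
            }
    #endif /* UNIV_MTFLUSH_DEBUG */
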