diff --git a/cmake/lz4.cmake b/cmake/lz4.cmake new file mode 100644 index 00000000000..bb2300891eb --- /dev/null +++ b/cmake/lz4.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2014, SkySQL Ab. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +MACRO (MYSQL_CHECK_LZ4) + +CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H) +CHECK_LIBRARY_EXISTS(lz4 LZ4_compress_limitedOutput "" HAVE_LZ4_SHARED_LIB) + +IF (HAVE_LZ4_SHARED_LIB AND HAVE_LZ4_H) + ADD_DEFINITIONS(-DHAVE_LZ4=1) + LINK_LIBRARIES(lz4) +ENDIF() +ENDMACRO() + +MACRO (MYSQL_CHECK_LZ4_STATIC) + + CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H) + CHECK_LIBRARY_EXISTS(liblz4.a LZ4_compress_limitedOutput "" HAVE_LZ4_LIB) + + IF(HAVE_LZ4_LIB AND HAVE_LZ4_H) + ADD_DEFINITIONS(-DHAVE_LZ4=1) + LINK_LIBRARIES(liblz4.a) + ENDIF() +ENDMACRO() \ No newline at end of file diff --git a/cmake/lzo.cmake b/cmake/lzo.cmake new file mode 100644 index 00000000000..596dfdcde8b --- /dev/null +++ b/cmake/lzo.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2014, SkySQL Ab. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +MACRO (MYSQL_CHECK_LZO_STATIC) + +CHECK_INCLUDE_FILES(lzo/lzo1x.h HAVE_LZO_H) +CHECK_LIBRARY_EXISTS(liblzo2.a lzo1x_1_compress "" HAVE_LZO_LIB) + +IF(HAVE_LZO_LIB AND HAVE_LZO_H) + ADD_DEFINITIONS(-DHAVE_LZO=1) + LINK_LIBRARIES(liblzo2.a) +ENDIF() +ENDMACRO() + +MACRO (MYSQL_CHECK_LZO) + +CHECK_INCLUDE_FILES(lzo/lzo1x.h HAVE_LZO_H) +CHECK_LIBRARY_EXISTS(lzo2 lzo1x_1_compress "" HAVE_LZO_LIB) + +IF(HAVE_LZO_LIB AND HAVE_LZO_H) + ADD_DEFINITIONS(-DHAVE_LZO=1) + LINK_LIBRARIES(lzo2) +ENDIF() +ENDMACRO() diff --git a/extra/CMakeLists.txt b/extra/CMakeLists.txt index f8f71b00743..cf3a35cb1dd 100644 --- a/extra/CMakeLists.txt +++ b/extra/CMakeLists.txt @@ -72,10 +72,24 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS") ENDIF() ENDIF() +IF(WITH_INNOBASE_STORAGE_ENGINE) + # Add path to the InnoDB headers + INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include) + # We use the InnoDB code directly in case the code changes. + ADD_DEFINITIONS("-DUNIV_INNOCHECKSUM") + SET(INNOBASE_SOURCES + ../storage/innobase/buf/buf0checksum.cc + ../storage/innobase/ut/ut0crc32.cc + ../storage/innobase/ut/ut0ut.cc + ) + MYSQL_ADD_EXECUTABLE(innochecksum innochecksum.cc ${INNOBASE_SOURCES}) + TARGET_LINK_LIBRARIES(innochecksum mysys mysys_ssl) +ENDIF() + MYSQL_ADD_EXECUTABLE(replace replace.c COMPONENT Server) TARGET_LINK_LIBRARIES(replace mysys) + IF(UNIX) - MYSQL_ADD_EXECUTABLE(innochecksum innochecksum.c) MYSQL_ADD_EXECUTABLE(resolve_stack_dump resolve_stack_dump.c) TARGET_LINK_LIBRARIES(resolve_stack_dump mysys) diff --git a/extra/innochecksum.c b/extra/innochecksum.c deleted file mode 100644 index ed4dfc48789..00000000000 --- a/extra/innochecksum.c +++ /dev/null @@ -1,325 +0,0 @@ -/* - Copyright (c) 2005, 2011, Oracle and/or its affiliates - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; version 2 of the License. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*/ - -/* - InnoDB offline file checksum utility. 85% of the code in this file - was taken wholesale fron the InnoDB codebase. - - The final 15% was originally written by Mark Smith of Danga - Interactive, Inc. - - Published with a permission. -*/ - -#include -#include -#include -#include -#include -#include -#include - -/* all of these ripped from InnoDB code from MySQL 4.0.22 */ -#define UT_HASH_RANDOM_MASK 1463735687 -#define UT_HASH_RANDOM_MASK2 1653893711 -#define FIL_PAGE_LSN 16 -#define FIL_PAGE_FILE_FLUSH_LSN 26 -#define FIL_PAGE_OFFSET 4 -#define FIL_PAGE_DATA 38 -#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 -#define FIL_PAGE_SPACE_OR_CHKSUM 0 -#define UNIV_PAGE_SIZE (2 * 8192) - -/* command line argument to do page checks (that's it) */ -/* another argument to specify page ranges... seek to right spot and go from there */ - -typedef unsigned long int ulint; - -/* innodb function in name; modified slightly to not have the ASM version (lots of #ifs that didn't apply) */ -ulint mach_read_from_4(uchar *b) -{ - return( ((ulint)(b[0]) << 24) - + ((ulint)(b[1]) << 16) - + ((ulint)(b[2]) << 8) - + (ulint)(b[3]) - ); -} - -ulint -ut_fold_ulint_pair( -/*===============*/ - /* out: folded value */ - ulint n1, /* in: ulint */ - ulint n2) /* in: ulint */ -{ - return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1) - ^ UT_HASH_RANDOM_MASK) + n2); -} - -ulint -ut_fold_binary( -/*===========*/ - /* out: folded value */ - uchar* str, /* in: string of bytes */ - ulint len) /* in: length */ -{ - ulint i; - ulint fold= 0; - - for (i= 0; i < len; i++) - { - fold= ut_fold_ulint_pair(fold, (ulint)(*str)); - - str++; - } - - return(fold); -} - -ulint -buf_calc_page_new_checksum( -/*=======================*/ - /* out: checksum */ - uchar* page) /* in: buffer page */ -{ - ulint checksum; - - /* Since the fields FIL_PAGE_FILE_FLUSH_LSN and ..._ARCH_LOG_NO - are written outside the buffer pool to the first pages of data - files, we have to skip them in the page checksum calculation. - We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the - checksum is stored, and also the last 8 bytes of page because - there we store the old formula checksum. */ - - checksum= ut_fold_binary(page + FIL_PAGE_OFFSET, - FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET) - + ut_fold_binary(page + FIL_PAGE_DATA, - UNIV_PAGE_SIZE - FIL_PAGE_DATA - - FIL_PAGE_END_LSN_OLD_CHKSUM); - checksum= checksum & 0xFFFFFFFF; - - return(checksum); -} - -ulint -buf_calc_page_old_checksum( -/*=======================*/ - /* out: checksum */ - uchar* page) /* in: buffer page */ -{ - ulint checksum; - - checksum= ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); - - checksum= checksum & 0xFFFFFFFF; - - return(checksum); -} - - -int main(int argc, char **argv) -{ - FILE *f; /* our input file */ - uchar *p; /* storage of pages read */ - int bytes; /* bytes read count */ - ulint ct; /* current page number (0 based) */ - int now; /* current time */ - int lastt; /* last time */ - ulint oldcsum, oldcsumfield, csum, csumfield, logseq, logseqfield; /* ulints for checksum storage */ - struct stat st; /* for stat, if you couldn't guess */ - unsigned long long int size; /* size of file (has to be 64 bits) */ - ulint pages; /* number of pages in file */ - ulint start_page= 0, end_page= 0, use_end_page= 0; /* for starting and ending at certain pages */ - off_t offset= 0; - int just_count= 0; /* if true, just print page count */ - int verbose= 0; - int debug= 0; - int c; - int fd; - - /* remove arguments */ - while ((c= getopt(argc, argv, "cvds:e:p:")) != -1) - { - switch (c) - { - case 'v': - verbose= 1; - break; - case 'c': - just_count= 1; - break; - case 's': - start_page= atoi(optarg); - break; - case 'e': - end_page= atoi(optarg); - use_end_page= 1; - break; - case 'p': - start_page= atoi(optarg); - end_page= atoi(optarg); - use_end_page= 1; - break; - case 'd': - debug= 1; - break; - case ':': - fprintf(stderr, "option -%c requires an argument\n", optopt); - return 1; - break; - case '?': - fprintf(stderr, "unrecognized option: -%c\n", optopt); - return 1; - break; - } - } - - /* debug implies verbose... */ - if (debug) verbose= 1; - - /* make sure we have the right arguments */ - if (optind >= argc) - { - printf("InnoDB offline file checksum utility.\n"); - printf("usage: %s [-c] [-s ] [-e ] [-p ] [-v] [-d] \n", argv[0]); - printf("\t-c\tprint the count of pages in the file\n"); - printf("\t-s n\tstart on this page number (0 based)\n"); - printf("\t-e n\tend at this page number (0 based)\n"); - printf("\t-p n\tcheck only this page (0 based)\n"); - printf("\t-v\tverbose (prints progress every 5 seconds)\n"); - printf("\t-d\tdebug mode (prints checksums for each page)\n"); - return 1; - } - - /* stat the file to get size and page count */ - if (stat(argv[optind], &st)) - { - perror("error statting file"); - return 1; - } - size= st.st_size; - pages= size / UNIV_PAGE_SIZE; - if (just_count) - { - printf("%lu\n", pages); - return 0; - } - else if (verbose) - { - printf("file %s = %llu bytes (%lu pages)...\n", argv[optind], size, pages); - printf("checking pages in range %lu to %lu\n", start_page, use_end_page ? end_page : (pages - 1)); - } - - /* open the file for reading */ - f= fopen(argv[optind], "r"); - if (!f) - { - perror("error opening file"); - return 1; - } - - /* seek to the necessary position */ - if (start_page) - { - fd= fileno(f); - if (!fd) - { - perror("unable to obtain file descriptor number"); - return 1; - } - - offset= (off_t)start_page * (off_t)UNIV_PAGE_SIZE; - - if (lseek(fd, offset, SEEK_SET) != offset) - { - perror("unable to seek to necessary offset"); - return 1; - } - } - - /* allocate buffer for reading (so we don't realloc every time) */ - p= (uchar *)malloc(UNIV_PAGE_SIZE); - - /* main checksumming loop */ - ct= start_page; - lastt= 0; - while (!feof(f)) - { - bytes= fread(p, 1, UNIV_PAGE_SIZE, f); - if (!bytes && feof(f)) return 0; - if (bytes != UNIV_PAGE_SIZE) - { - fprintf(stderr, "bytes read (%d) doesn't match universal page size (%d)\n", bytes, UNIV_PAGE_SIZE); - return 1; - } - - /* check the "stored log sequence numbers" */ - logseq= mach_read_from_4(p + FIL_PAGE_LSN + 4); - logseqfield= mach_read_from_4(p + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4); - if (debug) - printf("page %lu: log sequence number: first = %lu; second = %lu\n", ct, logseq, logseqfield); - if (logseq != logseqfield) - { - fprintf(stderr, "page %lu invalid (fails log sequence number check)\n", ct); - return 1; - } - - /* check old method of checksumming */ - oldcsum= buf_calc_page_old_checksum(p); - oldcsumfield= mach_read_from_4(p + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); - if (debug) - printf("page %lu: old style: calculated = %lu; recorded = %lu\n", ct, oldcsum, oldcsumfield); - if (oldcsumfield != mach_read_from_4(p + FIL_PAGE_LSN) && oldcsumfield != oldcsum) - { - fprintf(stderr, "page %lu invalid (fails old style checksum)\n", ct); - return 1; - } - - /* now check the new method */ - csum= buf_calc_page_new_checksum(p); - csumfield= mach_read_from_4(p + FIL_PAGE_SPACE_OR_CHKSUM); - if (debug) - printf("page %lu: new style: calculated = %lu; recorded = %lu\n", ct, csum, csumfield); - if (csumfield != 0 && csum != csumfield) - { - fprintf(stderr, "page %lu invalid (fails new style checksum)\n", ct); - return 1; - } - - /* end if this was the last page we were supposed to check */ - if (use_end_page && (ct >= end_page)) - return 0; - - /* do counter increase and progress printing */ - ct++; - if (verbose) - { - if (ct % 64 == 0) - { - now= time(0); - if (!lastt) lastt= now; - if (now - lastt >= 1) - { - printf("page %lu okay: %.3f%% done\n", (ct - 1), (float) ct / pages * 100); - lastt= now; - } - } - } - } - return 0; -} - diff --git a/extra/innochecksum.cc b/extra/innochecksum.cc new file mode 100644 index 00000000000..c89196b1eee --- /dev/null +++ b/extra/innochecksum.cc @@ -0,0 +1,396 @@ +/* + Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +/* + InnoDB offline file checksum utility. 85% of the code in this utility + is included from the InnoDB codebase. + + The final 15% was originally written by Mark Smith of Danga + Interactive, Inc. + + Published with a permission. +*/ + +#include +#include +#include +#include +#include +#include +#ifndef __WIN__ +# include +#endif +#include +#include +#include /* ORACLE_WELCOME_COPYRIGHT_NOTICE */ + +/* Only parts of these files are included from the InnoDB codebase. +The parts not included are excluded by #ifndef UNIV_INNOCHECKSUM. */ + +#include "univ.i" /* include all of this */ + +#include "buf0checksum.h" /* buf_calc_page_*() */ +#include "fil0fil.h" /* FIL_* */ +#include "fsp0fsp.h" /* fsp_flags_get_page_size() & + fsp_flags_get_zip_size() */ +#include "mach0data.h" /* mach_read_from_4() */ +#include "ut0crc32.h" /* ut_crc32_init() */ + +#ifdef UNIV_NONINL +# include "fsp0fsp.ic" +# include "mach0data.ic" +# include "ut0rnd.ic" +#endif + +/* Global variables */ +static my_bool verbose; +static my_bool debug; +static my_bool just_count; +static ulong start_page; +static ulong end_page; +static ulong do_page; +static my_bool use_end_page; +static my_bool do_one_page; +ulong srv_page_size; /* replaces declaration in srv0srv.c */ +static ulong physical_page_size; /* Page size in bytes on disk. */ +static ulong logical_page_size; /* Page size when uncompressed. */ + +/* Get the page size of the filespace from the filespace header. */ +static +my_bool +get_page_size( +/*==========*/ + FILE* f, /*!< in: file pointer, must be open + and set to start of file */ + byte* buf, /*!< in: buffer used to read the page */ + ulong* logical_page_size, /*!< out: Logical/Uncompressed page size */ + ulong* physical_page_size) /*!< out: Physical/Commpressed page size */ +{ + ulong flags; + + int bytes= fread(buf, 1, UNIV_PAGE_SIZE_MIN, f); + + if (ferror(f)) + { + perror("Error reading file header"); + return FALSE; + } + + if (bytes != UNIV_PAGE_SIZE_MIN) + { + fprintf(stderr, "Error; Was not able to read the minimum page size "); + fprintf(stderr, "of %d bytes. Bytes read was %d\n", UNIV_PAGE_SIZE_MIN, bytes); + return FALSE; + } + + rewind(f); + + flags = mach_read_from_4(buf + FIL_PAGE_DATA + FSP_SPACE_FLAGS); + + /* srv_page_size is used by InnoDB code as UNIV_PAGE_SIZE */ + srv_page_size = *logical_page_size = fsp_flags_get_page_size(flags); + + /* fsp_flags_get_zip_size() will return zero if not compressed. */ + *physical_page_size = fsp_flags_get_zip_size(flags); + if (*physical_page_size == 0) + *physical_page_size= *logical_page_size; + + return TRUE; +} + + +/* command line argument to do page checks (that's it) */ +/* another argument to specify page ranges... seek to right spot and go from there */ + +static struct my_option innochecksum_options[] = +{ + {"help", '?', "Displays this help and exits.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"info", 'I', "Synonym for --help.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Displays version information and exits.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Verbose (prints progress every 5 seconds).", + &verbose, &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"debug", 'd', "Debug mode (prints checksums for each page, implies verbose).", + &debug, &debug, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"count", 'c', "Print the count of pages in the file.", + &just_count, &just_count, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"start_page", 's', "Start on this page number (0 based).", + &start_page, &start_page, 0, GET_ULONG, REQUIRED_ARG, + 0, 0, (longlong) 2L*1024L*1024L*1024L, 0, 1, 0}, + {"end_page", 'e', "End at this page number (0 based).", + &end_page, &end_page, 0, GET_ULONG, REQUIRED_ARG, + 0, 0, (longlong) 2L*1024L*1024L*1024L, 0, 1, 0}, + {"page", 'p', "Check only this page (0 based).", + &do_page, &do_page, 0, GET_ULONG, REQUIRED_ARG, + 0, 0, (longlong) 2L*1024L*1024L*1024L, 0, 1, 0}, + + {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +static void print_version(void) +{ + printf("%s Ver %s, for %s (%s)\n", + my_progname, INNODB_VERSION_STR, + SYSTEM_TYPE, MACHINE_TYPE); +} + +static void usage(void) +{ + print_version(); + puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2000")); + printf("InnoDB offline file checksum utility.\n"); + printf("Usage: %s [-c] [-s ] [-e ] [-p ] [-v] [-d] \n", my_progname); + my_print_help(innochecksum_options); + my_print_variables(innochecksum_options); +} + +extern "C" my_bool +innochecksum_get_one_option( +/*========================*/ + int optid, + const struct my_option *opt __attribute__((unused)), + char *argument __attribute__((unused))) +{ + switch (optid) { + case 'd': + verbose=1; /* debug implies verbose... */ + break; + case 'e': + use_end_page= 1; + break; + case 'p': + end_page= start_page= do_page; + use_end_page= 1; + do_one_page= 1; + break; + case 'V': + print_version(); + exit(0); + break; + case 'I': + case '?': + usage(); + exit(0); + break; + } + return 0; +} + +static int get_options( +/*===================*/ + int *argc, + char ***argv) +{ + int ho_error; + + if ((ho_error=handle_options(argc, argv, innochecksum_options, innochecksum_get_one_option))) + exit(ho_error); + + /* The next arg must be the filename */ + if (!*argc) + { + usage(); + return 1; + } + return 0; +} /* get_options */ + + +int main(int argc, char **argv) +{ + FILE* f; /* our input file */ + char* filename; /* our input filename. */ + unsigned char buf[UNIV_PAGE_SIZE_MAX]; /* Buffer to store pages read */ + ulong bytes; /* bytes read count */ + ulint ct; /* current page number (0 based) */ + time_t now; /* current time */ + time_t lastt; /* last time */ + ulint oldcsum, oldcsumfield, csum, csumfield, crc32, logseq, logseqfield; + /* ulints for checksum storage */ + struct stat st; /* for stat, if you couldn't guess */ + unsigned long long int size; /* size of file (has to be 64 bits) */ + ulint pages; /* number of pages in file */ + off_t offset= 0; + int fd; + + printf("InnoDB offline file checksum utility.\n"); + + ut_crc32_init(); + + MY_INIT(argv[0]); + + if (get_options(&argc,&argv)) + exit(1); + + if (verbose) + my_print_variables(innochecksum_options); + + /* The file name is not optional */ + filename = *argv; + if (*filename == '\0') + { + fprintf(stderr, "Error; File name missing\n"); + return 1; + } + + /* stat the file to get size and page count */ + if (stat(filename, &st)) + { + fprintf(stderr, "Error; %s cannot be found\n", filename); + return 1; + } + size= st.st_size; + + /* Open the file for reading */ + f= fopen(filename, "rb"); + if (f == NULL) + { + fprintf(stderr, "Error; %s cannot be opened", filename); + perror(" "); + return 1; + } + + if (!get_page_size(f, buf, &logical_page_size, &physical_page_size)) + { + return 1; + } + + /* This tool currently does not support Compressed tables */ + if (logical_page_size != physical_page_size) + { + fprintf(stderr, "Error; This file contains compressed pages\n"); + return 1; + } + + pages= (ulint) (size / physical_page_size); + + if (just_count) + { + if (verbose) + printf("Number of pages: "); + printf("%lu\n", pages); + return 0; + } + else if (verbose) + { + printf("file %s = %llu bytes (%lu pages)...\n", filename, size, pages); + if (do_one_page) + printf("InnoChecksum; checking page %lu\n", do_page); + else + printf("InnoChecksum; checking pages in range %lu to %lu\n", start_page, use_end_page ? end_page : (pages - 1)); + } + + /* seek to the necessary position */ + if (start_page) + { + fd= fileno(f); + if (!fd) + { + perror("Error; Unable to obtain file descriptor number"); + return 1; + } + + offset= (off_t)start_page * (off_t)physical_page_size; + + if (lseek(fd, offset, SEEK_SET) != offset) + { + perror("Error; Unable to seek to necessary offset"); + return 1; + } + } + + /* main checksumming loop */ + ct= start_page; + lastt= 0; + while (!feof(f)) + { + bytes= fread(buf, 1, physical_page_size, f); + if (!bytes && feof(f)) + return 0; + + if (ferror(f)) + { + fprintf(stderr, "Error reading %lu bytes", physical_page_size); + perror(" "); + return 1; + } + if (bytes != physical_page_size) + { + fprintf(stderr, "Error; bytes read (%lu) doesn't match page size (%lu)\n", bytes, physical_page_size); + return 1; + } + + /* check the "stored log sequence numbers" */ + logseq= mach_read_from_4(buf + FIL_PAGE_LSN + 4); + logseqfield= mach_read_from_4(buf + logical_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM + 4); + if (debug) + printf("page %lu: log sequence number: first = %lu; second = %lu\n", ct, logseq, logseqfield); + if (logseq != logseqfield) + { + fprintf(stderr, "Fail; page %lu invalid (fails log sequence number check)\n", ct); + return 1; + } + + /* check old method of checksumming */ + oldcsum= buf_calc_page_old_checksum(buf); + oldcsumfield= mach_read_from_4(buf + logical_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM); + if (debug) + printf("page %lu: old style: calculated = %lu; recorded = %lu\n", ct, oldcsum, oldcsumfield); + if (oldcsumfield != mach_read_from_4(buf + FIL_PAGE_LSN) && oldcsumfield != oldcsum) + { + fprintf(stderr, "Fail; page %lu invalid (fails old style checksum)\n", ct); + return 1; + } + + /* now check the new method */ + csum= buf_calc_page_new_checksum(buf); + crc32= buf_calc_page_crc32(buf); + csumfield= mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM); + if (debug) + printf("page %lu: new style: calculated = %lu; crc32 = %lu; recorded = %lu\n", + ct, csum, crc32, csumfield); + if (csumfield != 0 && crc32 != csumfield && csum != csumfield) + { + fprintf(stderr, "Fail; page %lu invalid (fails innodb and crc32 checksum)\n", ct); + return 1; + } + + /* end if this was the last page we were supposed to check */ + if (use_end_page && (ct >= end_page)) + return 0; + + /* do counter increase and progress printing */ + ct++; + if (verbose) + { + if (ct % 64 == 0) + { + now= time(0); + if (!lastt) lastt= now; + if (now - lastt >= 1) + { + printf("page %lu okay: %.3f%% done\n", (ct - 1), (float) ct / pages * 100); + lastt= now; + } + } + } + } + return 0; +} + diff --git a/mysql-test/disabled.def b/mysql-test/disabled.def index e5fa24786e1..d2e839fa39a 100644 --- a/mysql-test/disabled.def +++ b/mysql-test/disabled.def @@ -20,3 +20,5 @@ mysql_embedded : Bug#12561297 2011-05-14 Anitha Dependent on PB2 chang ssl_crl_clients_valid : broken upstream ssl_crl : broken upstream ssl_crl_clrpath : broken upstream +innodb-wl5522-debug-zip : broken upstream +innodb_bug12902967 : broken upstream \ No newline at end of file diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result index 6f1c4c21d17..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_disable_basic.result @@ -37,6 +37,8 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +162,14 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result index 6f1c4c21d17..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_enable_basic.result @@ -37,6 +37,8 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +162,14 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result index 6f1c4c21d17..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_all_basic.result @@ -37,6 +37,8 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +162,14 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled diff --git a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result index 6f1c4c21d17..aee118aced2 100644 --- a/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_monitor_reset_basic.result @@ -37,6 +37,8 @@ buffer_pool_bytes_dirty disabled buffer_pool_pages_free disabled buffer_pages_created disabled buffer_pages_written disabled +buffer_index_pages_written disabled +buffer_non_index_pages_written disabled buffer_pages_read disabled buffer_data_reads disabled buffer_data_written disabled @@ -160,6 +162,14 @@ compress_pages_compressed disabled compress_pages_decompressed disabled compression_pad_increments disabled compression_pad_decrements disabled +compress_saved disabled +compress_trim_sect512 disabled +compress_trim_sect4096 disabled +compress_pages_page_compressed disabled +compress_page_compressed_trim_op disabled +compress_page_compressed_trim_op_saved disabled +compress_pages_page_decompressed disabled +compress_pages_page_compression_error disabled index_page_splits disabled index_page_merge_attempts disabled index_page_merge_successful disabled diff --git a/mysql-test/suite/sys_vars/r/innodb_mtflush_threads_basic.result b/mysql-test/suite/sys_vars/r/innodb_mtflush_threads_basic.result new file mode 100644 index 00000000000..75a1cc5262e --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_mtflush_threads_basic.result @@ -0,0 +1,21 @@ +select @@global.innodb_mtflush_threads; +@@global.innodb_mtflush_threads +8 +select @@session.innodb_mtflush_threads; +ERROR HY000: Variable 'innodb_mtflush_threads' is a GLOBAL variable +show global variables like 'innodb_mtflush_threads'; +Variable_name Value +innodb_mtflush_threads 8 +show session variables like 'innodb_mtflush_threads'; +Variable_name Value +innodb_mtflush_threads 8 +select * from information_schema.global_variables where variable_name='innodb_mtflush_threads'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_MTFLUSH_THREADS 8 +select * from information_schema.session_variables where variable_name='innodb_mtflush_threads'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_MTFLUSH_THREADS 8 +set global innodb_mtflush_threads=1; +ERROR HY000: Variable 'innodb_mtflush_threads' is a read only variable +set session innodb_mtflush_threads=1; +ERROR HY000: Variable 'innodb_mtflush_threads' is a read only variable diff --git a/mysql-test/suite/sys_vars/r/innodb_use_lz4_basic.result b/mysql-test/suite/sys_vars/r/innodb_use_lz4_basic.result new file mode 100644 index 00000000000..4c3cfa524af --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_use_lz4_basic.result @@ -0,0 +1,3 @@ +select @@global.innodb_use_fallocate; +@@global.innodb_use_fallocate +0 diff --git a/mysql-test/suite/sys_vars/r/innodb_use_mtflush_basic.result b/mysql-test/suite/sys_vars/r/innodb_use_mtflush_basic.result new file mode 100644 index 00000000000..f77abba7ac9 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_use_mtflush_basic.result @@ -0,0 +1,21 @@ +select @@global.innodb_use_mtflush; +@@global.innodb_use_mtflush +0 +select @@session.innodb_use_mtflush; +ERROR HY000: Variable 'innodb_use_mtflush' is a GLOBAL variable +show global variables like 'innodb_use_mtflush'; +Variable_name Value +innodb_use_mtflush OFF +show session variables like 'innodb_use_mtflush'; +Variable_name Value +innodb_use_mtflush OFF +select * from information_schema.global_variables where variable_name='innodb_use_mtflush'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_USE_MTFLUSH OFF +select * from information_schema.session_variables where variable_name='innodb_use_mtflush'; +VARIABLE_NAME VARIABLE_VALUE +INNODB_USE_MTFLUSH OFF +set global innodb_use_mtflush=1; +ERROR HY000: Variable 'innodb_use_mtflush' is a read only variable +set session innodb_use_mtflush=1; +ERROR HY000: Variable 'innodb_use_mtflush' is a read only variable diff --git a/mysql-test/suite/sys_vars/r/innodb_use_trim_basic.result b/mysql-test/suite/sys_vars/r/innodb_use_trim_basic.result new file mode 100644 index 00000000000..63292f5d3c8 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_use_trim_basic.result @@ -0,0 +1,33 @@ +SET @start_use_trim = @@global.innodb_use_trim; +SELECT @start_use_trim; +@start_use_trim +0 +SELECT COUNT(@@GLOBAL.innodb_use_trim); +COUNT(@@GLOBAL.innodb_use_trim) +1 +1 Expected +SET @@GLOBAL.innodb_use_trim=1; +SELECT COUNT(@@GLOBAL.innodb_use_trim); +COUNT(@@GLOBAL.innodb_use_trim) +1 +1 Expected +SELECT IF(@@GLOBAL.innodb_use_trim, 'ON', 'OFF') = VARIABLE_VALUE +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +IF(@@GLOBAL.innodb_use_trim, 'ON', 'OFF') = VARIABLE_VALUE +1 +1 Expected +SELECT COUNT(@@GLOBAL.innodb_use_trim); +COUNT(@@GLOBAL.innodb_use_trim) +1 +1 Expected +SELECT COUNT(VARIABLE_VALUE) +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +COUNT(VARIABLE_VALUE) +1 +1 Expected +SET @@global.innodb_use_trim = @start_use_trim; +SELECT @@global.innodb_use_trim; +@@global.innodb_use_trim +0 diff --git a/mysql-test/suite/sys_vars/t/innodb_mtflush_threads_basic.test b/mysql-test/suite/sys_vars/t/innodb_mtflush_threads_basic.test new file mode 100644 index 00000000000..c8412f969eb --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_mtflush_threads_basic.test @@ -0,0 +1,21 @@ +--source include/have_innodb.inc +# bool readonly + +# +# show values; +# +select @@global.innodb_mtflush_threads; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +select @@session.innodb_mtflush_threads; +show global variables like 'innodb_mtflush_threads'; +show session variables like 'innodb_mtflush_threads'; +select * from information_schema.global_variables where variable_name='innodb_mtflush_threads'; +select * from information_schema.session_variables where variable_name='innodb_mtflush_threads'; + +# +# show that it's read-only +# +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set global innodb_mtflush_threads=1; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set session innodb_mtflush_threads=1; diff --git a/mysql-test/suite/sys_vars/t/innodb_use_lz4_basic.test b/mysql-test/suite/sys_vars/t/innodb_use_lz4_basic.test new file mode 100644 index 00000000000..aefa276dcee --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_use_lz4_basic.test @@ -0,0 +1,5 @@ +--source include/have_innodb.inc +# bool readonly +# not on all compilations +select @@global.innodb_use_fallocate; + diff --git a/mysql-test/suite/sys_vars/t/innodb_use_mtflush_basic.test b/mysql-test/suite/sys_vars/t/innodb_use_mtflush_basic.test new file mode 100644 index 00000000000..a9c40b9e522 --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_use_mtflush_basic.test @@ -0,0 +1,22 @@ +--source include/have_innodb.inc +# bool readonly + +# +# show values; +# +select @@global.innodb_use_mtflush; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +select @@session.innodb_use_mtflush; +show global variables like 'innodb_use_mtflush'; +show session variables like 'innodb_use_mtflush'; +select * from information_schema.global_variables where variable_name='innodb_use_mtflush'; +select * from information_schema.session_variables where variable_name='innodb_use_mtflush'; + +# +# show that it's read-only +# +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set global innodb_use_mtflush=1; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +set session innodb_use_mtflush=1; + diff --git a/mysql-test/suite/sys_vars/t/innodb_use_trim_basic.test b/mysql-test/suite/sys_vars/t/innodb_use_trim_basic.test new file mode 100644 index 00000000000..c1b0f142179 --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_use_trim_basic.test @@ -0,0 +1,36 @@ +--source include/have_innodb.inc + +SET @start_use_trim = @@global.innodb_use_trim; +SELECT @start_use_trim; + +SELECT COUNT(@@GLOBAL.innodb_use_trim); +--echo 1 Expected + +#################################################################### +# Check if Value can set # +#################################################################### + +SET @@GLOBAL.innodb_use_trim=1; + +SELECT COUNT(@@GLOBAL.innodb_use_trim); +--echo 1 Expected + +################################################################# +# Check if the value in GLOBAL Table matches value in variable # +################################################################# + +SELECT IF(@@GLOBAL.innodb_use_trim, 'ON', 'OFF') = VARIABLE_VALUE +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +--echo 1 Expected + +SELECT COUNT(@@GLOBAL.innodb_use_trim); +--echo 1 Expected + +SELECT COUNT(VARIABLE_VALUE) +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME='innodb_use_trim'; +--echo 1 Expected + +SET @@global.innodb_use_trim = @start_use_trim; +SELECT @@global.innodb_use_trim; \ No newline at end of file diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 50fdb36495d..ced05daee3e 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -18,6 +18,11 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) +INCLUDE(lz4) +INCLUDE(lzo) + +MYSQL_CHECK_LZ4() +MYSQL_CHECK_LZO() # OS tests IF(UNIX) @@ -284,6 +289,7 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc + buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -297,6 +303,7 @@ SET(INNOBASE_SOURCES eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc + fil/fil0pagecompress.cc fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 0136c9c43a3..104c2f00ef6 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -2031,7 +2031,7 @@ btr_parse_page_reorganize( buf_block_t* block, /*!< in: page to be reorganized, or NULL */ mtr_t* mtr) /*!< in: mtr or NULL */ { - ulint level; + ulint level = page_zip_level; ut_ad(ptr && end_ptr); diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index e39d82a6c8b..2e61e76d629 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1860,9 +1860,14 @@ btr_cur_update_alloc_zip_func( false=update-in-place */ mtr_t* mtr) /*!< in/out: mini-transaction */ { + + /* Have a local copy of the variables as these can change + dynamically. */ + ulint compression_level = page_zip_level; const page_t* page = page_cur_get_page(cursor); ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(page_zip); ut_ad(!dict_index_is_ibuf(index)); ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 697b3f203b3..232e57617d3 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -3350,6 +3351,7 @@ buf_page_init_low( bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; + bpage->write_size = 0; HASH_INVALIDATE(bpage, hash); #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG bpage->file_page_was_freed = FALSE; diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index e1018c89e9b..fd4d8539b7a 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -381,7 +382,8 @@ buf_dblwr_init_or_load_pages( buffer */ fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, - UNIV_PAGE_SIZE, read_buf, NULL); + UNIV_PAGE_SIZE, read_buf, NULL, 0); + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) @@ -417,11 +419,12 @@ buf_dblwr_init_or_load_pages( fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - buf, NULL); + buf, NULL, 0); + fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - NULL); + NULL, 0); /* Check if any of these pages is half-written in data files, in the intended position */ @@ -448,7 +451,7 @@ buf_dblwr_init_or_load_pages( } fil_io(OS_FILE_WRITE, true, space_id, 0, source_page_no, 0, - UNIV_PAGE_SIZE, page, NULL); + UNIV_PAGE_SIZE, page, NULL, 0); } else if (load_corrupt_pages) { @@ -510,7 +513,7 @@ buf_dblwr_process() fil_io(OS_FILE_READ, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); + read_buf, NULL, 0); /* Check if the page is corrupt */ @@ -562,7 +565,7 @@ buf_dblwr_process() fil_io(OS_FILE_WRITE, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); ib_logf(IB_LOG_LEVEL_INFO, "Recovered the page from" @@ -778,7 +781,7 @@ buf_dblwr_write_block_to_datafile( buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), (void*) bpage->zip.data, - (void*) bpage); + (void*) bpage, 0); return; } @@ -790,8 +793,7 @@ buf_dblwr_write_block_to_datafile( fil_io(flags, sync, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block); - + (void*) block->frame, (void*) block, (ulint *)&bpage->write_size); } /********************************************************************//** @@ -885,7 +887,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. */ @@ -901,7 +903,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); flush: /* increment the doublewrite flushed pages counter */ @@ -1130,14 +1132,14 @@ retry: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf - + UNIV_PAGE_SIZE * i), NULL); + + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. Write it directly to the doublewrite buffer */ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, - NULL); + NULL, 0); } /* Now flush the doublewrite buffer data to disk */ diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 0fa5c744e51..f75b3f04580 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +32,7 @@ Created 11/11/1995 Heikki Tuuri #endif #include "buf0buf.h" +#include "buf0mtflu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" @@ -44,10 +47,12 @@ Created 11/11/1995 Heikki Tuuri #include "ibuf0ibuf.h" #include "log0log.h" #include "os0file.h" +#include "os0sync.h" #include "trx0sys.h" #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /** Number of pages flushed through non flush_list flushes. */ static ulint buf_lru_flush_page_count = 0; @@ -723,8 +728,10 @@ buf_flush_write_complete( flush_type = buf_page_get_flush_type(bpage); buf_pool->n_flush[flush_type]--; +#ifdef UNIV_DEBUG /* fprintf(stderr, "n pending flush %lu\n", buf_pool->n_flush[flush_type]); */ +#endif if (buf_pool->n_flush[flush_type] == 0 && buf_pool->init_flush[flush_type] == FALSE) { @@ -880,6 +887,8 @@ buf_flush_write_block_low( { ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; + ulint space_id = buf_page_get_space(bpage); + atomic_writes_t awrites = fil_space_get_atomic_writes(space_id); #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -956,12 +965,28 @@ buf_flush_write_block_low( sync, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - frame, bpage); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage, sync); + frame, bpage, &bpage->write_size); } else { - ut_ad(!sync); - buf_dblwr_add_to_batch(bpage); + + /* InnoDB uses doublewrite buffer and doublewrite buffer + is initialized. User can define do we use atomic writes + on a file space (table) or not. If atomic writes are + not used we should use doublewrite buffer and if + atomic writes should be used, no doublewrite buffer + is used. */ + + if (awrites == ATOMIC_WRITES_ON) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage, &bpage->write_size); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage, sync); + } else { + ut_ad(!sync); + buf_dblwr_add_to_batch(bpage); + } } /* When doing single page flushing the IO is done synchronously @@ -1221,7 +1246,9 @@ buf_flush_try_neighbors( } } +#ifdef UNIV_DEBUG /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */ +#endif if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); @@ -1664,7 +1691,6 @@ pages: to avoid deadlocks, this function must be written so that it cannot end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! @return number of blocks for which the write request was queued */ -static ulint buf_flush_batch( /*============*/ @@ -1721,7 +1747,6 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -static void buf_flush_common( /*=============*/ @@ -1746,7 +1771,6 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -static ibool buf_flush_start( /*============*/ @@ -1775,7 +1799,6 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -static void buf_flush_end( /*==========*/ @@ -1899,6 +1922,10 @@ buf_flush_list( ulint i; bool success = true; + if (buf_mtflu_init_done()) { + return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed)); + } + if (n_processed) { *n_processed = 0; } @@ -2069,6 +2096,11 @@ buf_flush_LRU_tail(void) { ulint total_flushed = 0; + if(buf_mtflu_init_done()) + { + return(buf_mtflu_flush_LRU_tail()); + } + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool = buf_pool_from_array(i); @@ -2374,6 +2406,8 @@ page_cleaner_sleep_if_needed( } } + + /******************************************************************//** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one instance of this thread. @@ -2400,7 +2434,6 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n", os_thread_pf(os_thread_get_curr_id())); #endif /* UNIV_DEBUG_THREAD_CREATION */ - buf_page_cleaner_is_active = TRUE; while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { @@ -2424,10 +2457,11 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( /* Flush pages from flush_list if required */ n_flushed += page_cleaner_flush_pages_if_needed(); + } else { n_flushed = page_cleaner_do_flush_batch( - PCT_IO(100), - LSN_MAX); + PCT_IO(100), + LSN_MAX); if (n_flushed) { MONITOR_INC_VALUE_CUMULATIVE( @@ -2440,6 +2474,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( } ut_ad(srv_shutdown_state > 0); + if (srv_fast_shutdown == 2) { /* In very fast shutdown we simulate a crash of buffer pool. We are not required to do any flushing */ @@ -2605,9 +2640,11 @@ buf_flush_validate( return(ret); } + #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + #ifdef UNIV_DEBUG /******************************************************************//** Check if there are any dirty pages that belong to a space id in the flush diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc new file mode 100644 index 00000000000..5a1769e3b70 --- /dev/null +++ b/storage/innobase/buf/buf0mtflu.cc @@ -0,0 +1,697 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file buf/buf0mtflu.cc +Multi-threaded flush method implementation + +Created 06/11/2013 Dhananjoy Das DDas@fusionio.com +Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com +Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com +Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0mtflu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "ut0byte.h" +#include "ut0lst.h" +#include "page0page.h" +#include "fil0fil.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" +#include "os0sync.h" +#include "trx0sys.h" +#include "srv0mon.h" +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" + +#define MT_COMP_WATER_MARK 50 +/** Time to wait for a message. */ +#define MT_WAIT_IN_USECS 5000000 + +/* Work item status */ +typedef enum wrk_status { + WRK_ITEM_UNSET=0, /*!< Work item is not set */ + WRK_ITEM_START=1, /*!< Processing of work item has started */ + WRK_ITEM_DONE=2, /*!< Processing is done usually set to + SUCCESS/FAILED */ + WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ + WRK_ITEM_FAILED=3, /*!< Work item process failed */ + WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_SET=5, /*!< Work item is set */ + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +/* Work item task type */ +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, /*!< Exit queue-wait */ + MT_WRK_WRITE=1, /*!< Flush operation */ + MT_WRK_READ=2, /*!< Read operation */ + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; + +/* Work thread status */ +typedef enum wthr_status { + WTHR_NOT_INIT=0, /*!< Work thread not initialized */ + WTHR_INITIALIZED=1, /*!< Work thread initialized */ + WTHR_SIG_WAITING=2, /*!< Work thread wating signal */ + WTHR_RUNNING=3, /*!< Work thread running */ + WTHR_NO_WORK=4, /*!< Work thread has no work */ + WTHR_KILL_IT=5, /*!< Work thread should exit */ + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +/* Write work task */ +typedef struct wr_tsk { + buf_pool_t *buf_pool; /*!< buffer-pool instance */ + buf_flush_t flush_type; /*!< flush-type for buffer-pool + flush operation */ + ulint min; /*!< minimum number of pages + requested to be flushed */ + lsn_t lsn_limit; /*!< lsn limit for the buffer-pool + flush operation */ +} wr_tsk_t; + +/* Read work task */ +typedef struct rd_tsk { + buf_pool_t *page_pool; /*!< list of pages to decompress; */ +} rd_tsk_t; + +/* Work item */ +typedef struct wrk_itm +{ + mt_wrk_tsk_t tsk; /*!< Task type. Based on task-type + one of the entries wr_tsk/rd_tsk + will be used */ + wr_tsk_t wr; /*!< Flush page list */ + rd_tsk_t rd; /*!< Decompress page list */ + ulint n_flushed; /*!< Flushed pages count */ + os_thread_id_t id_usr; /*!< Thread-id currently working */ + wrk_status_t wi_status; /*!< Work item status */ + mem_heap_t *wheap; /*!< Heap were to allocate memory + for queue nodes */ + mem_heap_t *rheap; +} wrk_t; + +typedef struct thread_data +{ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + wthr_status_t wt_status; /*!< Worker thread status */ +} thread_data_t; + +/* Thread syncronization data */ +typedef struct thread_sync +{ + /* Global variables used by all threads */ + os_fast_mutex_t thread_global_mtx; /*!< Mutex used protecting below + variables */ + ulint n_threads; /*!< Number of threads */ + ib_wqueue_t *wq; /*!< Work Queue */ + ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ + ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ + mem_heap_t* wheap; /*!< Work heap where memory + is allocated */ + mem_heap_t* rheap; /*!< Work heap where memory + is allocated */ + wthr_status_t gwt_status; /*!< Global thread status */ + + /* Variables used by only one thread at a time */ + thread_data_t* thread_data; /*!< Thread specific data */ + +} thread_sync_t; + +static int mtflush_work_initialized = -1; +static thread_sync_t* mtflush_ctx=NULL; +static os_fast_mutex_t mtflush_mtx; + +/******************************************************************//** +Set multi-threaded flush work initialized. */ +static inline +void +buf_mtflu_work_init(void) +/*=====================*/ +{ + mtflush_work_initialized = 1; +} + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized */ +bool +buf_mtflu_init_done(void) +/*=====================*/ +{ + return(mtflush_work_initialized == 1); +} + +/******************************************************************//** +Fush buffer pool instance. +@return number of flushed pages, or 0 if error happened +*/ +static +ulint +buf_mtflu_flush_pool_instance( +/*==========================*/ + wrk_t *work_item) /*!< inout: work item to be flushed */ +{ + ut_a(work_item != NULL); + ut_a(work_item->wr.buf_pool != NULL); + + if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); +#endif + return 0; + } + + + if (work_item->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. + */ + buf_pool_mutex_enter(work_item->wr.buf_pool); + work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU); + buf_pool_mutex_exit(work_item->wr.buf_pool); + work_item->wr.min = ut_min(srv_LRU_scan_depth,work_item->wr.min); + } + + work_item->n_flushed = buf_flush_batch(work_item->wr.buf_pool, + work_item->wr.flush_type, + work_item->wr.min, + work_item->wr.lsn_limit); + + + buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); + buf_flush_common(work_item->wr.flush_type, work_item->n_flushed); + + return work_item->n_flushed; +} + +/******************************************************************//** +Worker function to wait for work items and processing them and +sending reply back. +*/ +static +void +mtflush_service_io( +/*===============*/ + thread_sync_t* mtflush_io, /*!< inout: multi-threaded flush + syncronization data */ + thread_data_t* thread_data) /* Thread status data */ +{ + wrk_t *work_item = NULL; + ulint n_flushed=0; + + ut_a(mtflush_io != NULL); + ut_a(thread_data != NULL); + + thread_data->wt_status = WTHR_SIG_WAITING; + + work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); + + if (work_item == NULL) { + work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); + } + + if (work_item) { + thread_data->wt_status = WTHR_RUNNING; + } else { + /* Thread did not get any work */ + thread_data->wt_status = WTHR_NO_WORK; + return; + } + + if (work_item->wi_status != WRK_ITEM_EXIT) { + work_item->wi_status = WRK_ITEM_SET; + } + +#ifdef UNIV_MTFLUSH_DEBUG + ut_a(work_item->id_usr == 0); +#endif + work_item->id_usr = os_thread_get_curr_id(); + + /* This works as a producer/consumer model, where in tasks are + * inserted into the work-queue (wq) and completions are based + * on the type of operations performed and as a result the WRITE/ + * compression/flush operation completions get posted to wr_cq. + * And READ/decompress operations completions get posted to rd_cq. + * in future we may have others. + */ + + switch(work_item->tsk) { + case MT_WRK_NONE: + ut_a(work_item->wi_status == WRK_ITEM_EXIT); + work_item->wi_status = WRK_ITEM_EXIT; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + thread_data->wt_status = WTHR_KILL_IT; + break; + + case MT_WRK_WRITE: + ut_a(work_item->wi_status == WRK_ITEM_SET); + work_item->wi_status = WRK_ITEM_START; + /* Process work item */ + if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { + work_item->wi_status = WRK_ITEM_FAILED; + } + work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + break; + + case MT_WRK_READ: + ut_a(0); + break; + + default: + /* None other than Write/Read handling planned */ + ut_a(0); + break; + } +} + +/******************************************************************//** +Thead used to flush dirty pages when multi-threaded flush is +used. +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(mtflush_io_thread)( +/*==============================*/ + void * arg) +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)arg); + thread_data_t *this_thread_data = NULL; + ulint i; + + /* Find correct slot for this thread */ + os_fast_mutex_lock(&(mtflush_io->thread_global_mtx)); + for(i=0; i < mtflush_io->n_threads; i ++) { + if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) { + break; + } + } + + ut_a(i <= mtflush_io->n_threads); + this_thread_data = &mtflush_io->thread_data[i]; + os_fast_mutex_unlock(&(mtflush_io->thread_global_mtx)); + + while (TRUE) { + +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note. Thread %lu work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); +#endif /* UNIV_MTFLUSH_DEBUG */ + + mtflush_service_io(mtflush_io, this_thread_data); + + + if (this_thread_data->wt_status == WTHR_KILL_IT) { + break; + } + } + + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void) +/*==========================*/ +{ + long i; + thread_sync_t* mtflush_io = mtflush_ctx; + wrk_t* work_item = NULL; + + ut_a(mtflush_io != NULL); + + /* Allocate work items for shutdown message */ + work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + + /* Confirm if the io-thread KILL is in progress, bailout */ + if (mtflush_io->gwt_status == WTHR_KILL_IT) { + return; + } + + mtflush_io->gwt_status = WTHR_KILL_IT; + + fprintf(stderr, "InnoDB: [Note]: Signal mtflush_io_threads to exit [%lu]\n", + srv_mtflush_threads); + + /* Send one exit work item/thread */ + for (i=0; i < srv_mtflush_threads; i++) { + work_item[i].tsk = MT_WRK_NONE; + work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wheap = mtflush_io->wheap; + work_item[i].rheap = mtflush_io->rheap; + work_item[i].id_usr = 0; + + ib_wqueue_add(mtflush_io->wq, + (void *)&(work_item[i]), + mtflush_io->wheap); + } + + /* Wait until all work items on a work queue are processed */ + while(!ib_wqueue_is_empty(mtflush_io->wq)) { + /* Wait */ + os_thread_sleep(MT_WAIT_IN_USECS); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Collect all work done items */ + for (i=0; i < srv_mtflush_threads;) { + wrk_t* work_item = NULL; + + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); + + /* If we receive reply to work item and it's status is exit, + thead has processed this message and existed */ + if (work_item && work_item->wi_status == WRK_ITEM_EXIT) { + i++; + } + } + + /* Wait about 1/2 sec to allow threads really exit */ + os_thread_sleep(MT_WAIT_IN_USECS); + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); + ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); + + /* Free all queues */ + ib_wqueue_free(mtflush_io->wq); + ib_wqueue_free(mtflush_io->wr_cq); + ib_wqueue_free(mtflush_io->rd_cq); + + os_fast_mutex_free(&mtflush_mtx); + os_fast_mutex_free(&mtflush_io->thread_global_mtx); + + /* Free heap */ + mem_heap_free(mtflush_io->wheap); + mem_heap_free(mtflush_io->rheap); +} + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt) /*!< in: Number of work items */ +{ + ulint i; + mem_heap_t* mtflush_heap; + mem_heap_t* mtflush_heap2; + + /* Create heap, work queue, write completion queue, read + completion queue for multi-threaded flush, and init + handler. */ + mtflush_heap = mem_heap_create(0); + ut_a(mtflush_heap != NULL); + mtflush_heap2 = mem_heap_create(0); + ut_a(mtflush_heap2 != NULL); + + mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, + sizeof(thread_sync_t)); + memset(mtflush_ctx, 0, sizeof(thread_sync_t)); + ut_a(mtflush_ctx != NULL); + mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc( + mtflush_heap, sizeof(thread_data_t) * n_threads); + ut_a(mtflush_ctx->thread_data); + memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads); + + mtflush_ctx->n_threads = n_threads; + mtflush_ctx->wq = ib_wqueue_create(); + ut_a(mtflush_ctx->wq); + mtflush_ctx->wr_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->wr_cq); + mtflush_ctx->rd_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->rd_cq); + mtflush_ctx->wheap = mtflush_heap; + mtflush_ctx->rheap = mtflush_heap2; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_ctx->thread_global_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + + /* Create threads for page-compression-flush */ + for(i=0; i < n_threads; i++) { + os_thread_id_t new_thread_id; + + mtflush_ctx->thread_data[i].wt_status = WTHR_INITIALIZED; + + mtflush_ctx->thread_data[i].wthread = os_thread_create( + mtflush_io_thread, + ((void *) mtflush_ctx), + &new_thread_id); + + mtflush_ctx->thread_data[i].wthread_id = new_thread_id; + } + + buf_mtflu_work_init(); + + return((void *)mtflush_ctx); +} + +/******************************************************************//** +Flush buffer pool instances. +@return number of pages flushed. */ +ulint +buf_mtflu_flush_work_items( +/*=======================*/ + ulint buf_pool_inst, /*!< in: Number of buffer pool instances */ + ulint *per_pool_pages_flushed, /*!< out: Number of pages + flushed/instance */ + buf_flush_t flush_type, /*!< in: Type of flush */ + ulint min_n, /*!< in: Wished minimum number of + blocks to be flushed */ + lsn_t lsn_limit) /*!< in: All blocks whose + oldest_modification is smaller than + this should be flushed (if their + number does not exceed min_n) */ +{ + ulint n_flushed=0, i; + mem_heap_t* work_heap; + mem_heap_t* reply_heap; + wrk_t work_item[MTFLUSH_MAX_WORKER]; + + /* Allocate heap where all work items used and queue + node items areallocated */ + work_heap = mem_heap_create(0); + reply_heap = mem_heap_create(0); + + + for(i=0;iwq, + (void *)(work_item + i), + work_heap); + } + + /* wait on the completion to arrive */ + for(i=0; i< buf_pool_inst;) { + wrk_t *done_wi = NULL; + done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); + + if (done_wi != NULL) { + per_pool_pages_flushed[i] = done_wi->n_flushed; + +#ifdef UNIV_MTFLUSH_DEBUG + if((int)done_wi->id_usr == 0 && + (done_wi->wi_status == WRK_ITEM_SET || + done_wi->wi_status == WRK_ITEM_UNSET)) { + fprintf(stderr, + "**Set/Unused work_item[%lu] flush_type=%d\n", + i, + done_wi->wr.flush_type); + ut_a(0); + } +#endif + + n_flushed+= done_wi->n_flushed; + i++; + } + } + + /* Release used work_items and queue nodes */ + mem_heap_free(work_heap); + mem_heap_free(reply_heap); + + return(n_flushed); +} + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. */ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + + /* This lock is to safequard against re-entry if any. */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt_flush[i]; + } + if (cnt_flush[i]) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt_flush[i]); + } + } +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); +#endif + return(success); +} + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void) +/*==========================*/ +{ + ulint total_flushed=0, i; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; + + ut_a(buf_mtflu_init_done()); + + /* This lock is to safeguard against re-entry if any */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (cnt_flush[i]) { + total_flushed += cnt_flush[i]; + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + cnt_flush[i]); + } + } + +#if UNIV_MTFLUSH_DEBUG + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); +#endif + + return(total_flushed); +} + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. */ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!thread_data[i].wthread_id; + } +} diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 7c8369c0c09..ec76c9923fe 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -184,14 +185,14 @@ buf_read_page_low( *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, zip_size, offset, 0, zip_size, - bpage->zip.data, bpage); + bpage->zip.data, bpage, &bpage->write_size); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage); + ((buf_block_t*) bpage)->frame, bpage, 0); } if (sync) { diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index bbae608efdb..f161a2e78b2 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 53a22a8246b..e71c2778687 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013 SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,6 +25,8 @@ Created 10/25/1995 Heikki Tuuri *******************************************************/ #include "fil0fil.h" +#include "fil0pagecompress.h" +#include "fsp0pagecompress.h" #include #include @@ -45,6 +48,7 @@ Created 10/25/1995 Heikki Tuuri #include "page0zip.h" #include "trx0sys.h" #include "row0mysql.h" +#include "os0file.h" #ifndef UNIV_HOTBACKUP # include "buf0lru.h" # include "ibuf0ibuf.h" @@ -54,6 +58,14 @@ Created 10/25/1995 Heikki Tuuri # include "srv0srv.h" static ulint srv_data_read, srv_data_written; #endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE @@ -431,11 +443,16 @@ fil_read( block size multiple */ void* buf, /*!< in/out: buffer where to store data read; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /********************************************************************//** @@ -460,18 +477,22 @@ fil_write( be a block size multiple */ void* buf, /*!< in: buffer from which to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ut_ad(!srv_read_only_mode); return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ -UNIV_INLINE fil_space_t* fil_space_get_by_id( /*================*/ @@ -489,6 +510,19 @@ fil_space_get_by_id( return(space); } +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + /*******************************************************************//** Returns the table space by a given name, NULL if not found. */ UNIV_INLINE @@ -709,8 +743,9 @@ fil_node_open_file( byte* buf2; byte* page; ulint space_id; - ulint flags; + ulint flags=0; ulint page_size; + ulint atomic_writes=0; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -727,7 +762,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_ONLY, &success, 0); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -779,6 +814,8 @@ fil_node_open_file( space_id = fsp_header_get_space_id(page); flags = fsp_header_get_flags(page); page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); + ut_free(buf2); @@ -829,6 +866,17 @@ fil_node_open_file( ut_error; } + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { /* Truncate the size to whole extent size. */ size_bytes = ut_2pow_round(size_bytes, @@ -852,6 +900,8 @@ add_size: space->size += node->size; } + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + /* printf("Opening file %s\n", node->name); */ /* Open the file for reading and writing, in Windows normally in the @@ -862,18 +912,18 @@ add_size: node->handle = os_file_create(innodb_file_log_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, - &ret); + &ret, atomic_writes); } else if (node->is_raw_disk) { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN_RAW, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } else { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } ut_a(ret); @@ -1244,7 +1294,6 @@ fil_space_create( DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); ut_a(fil_system); - ut_a(fsp_flags_is_valid(flags)); /* Look for a matching tablespace and if found free it. */ do { @@ -1898,12 +1947,12 @@ fil_write_lsn_and_arch_no_to_file( buf = static_cast(ut_align(buf1, UNIV_PAGE_SIZE)); err = fil_read(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); err = fil_write(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); } mem_free(buf1); @@ -3197,7 +3246,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); if (!success) { /* The following call will print an error message */ @@ -3213,10 +3262,10 @@ fil_create_link_file( ut_print_filename(stderr, filepath); fputs(" already exists.\n", stderr); err = DB_TABLESPACE_EXISTS; - } else if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; - + } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; } else { err = DB_ERROR; } @@ -3306,8 +3355,9 @@ fil_open_linked_file( /*===============*/ const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ - os_file_t* remote_file) /*!< out: remote file handle */ - + os_file_t* remote_file, /*!< out: remote file handle */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ { ibool success; @@ -3321,7 +3371,7 @@ fil_open_linked_file( *remote_file = os_file_create_simple_no_error_handling( innodb_file_data_key, *remote_filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, atomic_writes); if (!success) { char* link_filepath = fil_make_isl_name(tablename); @@ -3376,6 +3426,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3408,7 +3459,8 @@ fil_create_new_single_table_tablespace( OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + atomic_writes); if (ret == FALSE) { /* The following call will print an error message */ @@ -3435,6 +3487,11 @@ fil_create_new_single_table_tablespace( goto error_exit_3; } + if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + goto error_exit_3; + } + if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; goto error_exit_3; @@ -3473,6 +3530,7 @@ fil_create_new_single_table_tablespace( flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); if (!(fsp_flags_is_compressed(flags))) { buf_flush_init_for_writing(page, NULL, 0); @@ -3663,6 +3721,7 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; + ulint atomic_writes = 0; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -3673,6 +3732,8 @@ fil_open_single_table_tablespace( return(DB_CORRUPTION); } + atomic_writes = fsp_flags_get_atomic_writes(flags); + /* If the tablespace was relocated, we do not compare the DATA_DIR flag */ ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; @@ -3697,7 +3758,7 @@ fil_open_single_table_tablespace( } link_file_found = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, atomic_writes); remote.success = link_file_found; if (remote.success) { /* possibility of multiple files. */ @@ -3725,7 +3786,7 @@ fil_open_single_table_tablespace( if (dict.filepath) { dict.file = os_file_create_simple_no_error_handling( innodb_file_data_key, dict.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &dict.success); + OS_FILE_READ_ONLY, &dict.success, atomic_writes); if (dict.success) { /* possibility of multiple files. */ validate = true; @@ -3737,7 +3798,7 @@ fil_open_single_table_tablespace( ut_a(def.filepath); def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, atomic_writes); if (def.success) { tablespaces_found++; } @@ -4331,7 +4392,7 @@ fil_load_single_table_tablespace( /* Check for a link file which locates a remote tablespace. */ remote.success = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, FALSE); /* Read the first page of the remote tablespace */ if (remote.success) { @@ -4346,7 +4407,7 @@ fil_load_single_table_tablespace( /* Try to open the tablespace in the datadir. */ def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &def.success); + OS_FILE_READ_WRITE, &def.success, FALSE); /* Read the first page of the remote tablespace */ if (def.success) { @@ -5074,6 +5135,7 @@ retry: } page_size = fsp_flags_get_zip_size(space->flags); + if (!page_size) { page_size = UNIV_PAGE_SIZE; } @@ -5122,16 +5184,18 @@ retry: "space for file \'%s\' failed. Current size " INT64PF ", desired size " INT64PF "\n", node->name, start_offset, len+start_offset); - os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE); + os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE, __FILE__, __LINE__); success = FALSE; } else { success = TRUE; } mutex_enter(&fil_system->mutex); + if (success) { - node->size += n_pages; - space->size += n_pages; + node->size += (size_after_extend - start_page_no); + space->size += (size_after_extend - start_page_no); + os_has_said_disk_full = FALSE; } @@ -5167,7 +5231,7 @@ retry: success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL); + NULL, NULL, 0, FALSE, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5261,7 +5325,7 @@ fil_extend_tablespaces_to_stored_len(void) single-threaded operation */ error = fil_read(TRUE, space->id, fsp_flags_get_zip_size(space->flags), - 0, 0, UNIV_PAGE_SIZE, buf, NULL); + 0, 0, UNIV_PAGE_SIZE, buf, NULL, 0); ut_a(error == DB_SUCCESS); size_in_header = fsp_get_size_low(buf); @@ -5541,8 +5605,13 @@ fil_io( void* buf, /*!< in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ulint mode; fil_space_t* space; @@ -5552,6 +5621,8 @@ fil_io( ulint wake_later; os_offset_t offset; ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5605,6 +5676,11 @@ fil_io( } else if (type == OS_FILE_WRITE) { ut_ad(!srv_read_only_mode); srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } else { + srv_stats.non_index_pages_written.inc(); + } } /* Reserve the fil_system mutex and make sure that we can open at @@ -5730,6 +5806,9 @@ fil_io( ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + page_compressed = fsp_flags_is_page_compressed(space->flags); + page_compression_level = fsp_flags_get_page_compression_level(space->flags); + #ifdef UNIV_HOTBACKUP /* In ibbackup do normal i/o, not aio */ if (type == OS_FILE_READ) { @@ -5742,7 +5821,8 @@ fil_io( #else /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message); + offset, len, node, message, write_size, + page_compressed, page_compression_level); #endif /* UNIV_HOTBACKUP */ ut_a(ret); @@ -6366,7 +6446,7 @@ fil_tablespace_iterate( file = os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, - OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", { @@ -6584,3 +6664,33 @@ fil_mtr_rename_log( 0, 0, new_name, old_name, mtr); } } + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) +/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc new file mode 100644 index 00000000000..d96f254e5cc --- /dev/null +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -0,0 +1,384 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" +#ifdef HAVE_LZ4 +#include "lz4.h" +#endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + +/* Used for debugging */ +//#define UNIV_PAGECOMPRESS_DEBUG 1 + +/****************************************************************//** +For page compressed pages compress the page before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /* in: compression level */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem) /*!< in: temporal memory used by LZO */ +{ + int err = Z_OK; + int level = 0; + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; + ulint write_size=0; + + ut_ad(buf); + ut_ad(out_buf); + ut_ad(len); + ut_ad(out_len); + + level = compression_level; + ut_ad(fil_space_is_page_compressed(space_id)); + + fil_system_enter(); + fil_space_t* space = fil_space_get_by_id(space_id); + fil_system_exit(); + + /* If no compression level was provided to this table, use system + default level */ + if (level == 0) { + level = page_zip_level; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", + space_id, fil_space_name(space), len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + write_size = UNIV_PAGE_SIZE - header_len; + + switch(innodb_compression_algorithm) { +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_compress_limitedOutput((const char *)buf, + (char *)out_buf+header_len, len, write_size); + write_size = err; + + if (err == 0) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; +#endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_1_15_compress( + buf, len, out_buf+header_len, &write_size, lzo_mem); + + if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + break; +#endif /* HAVE_LZO */ + case PAGE_ZLIB_ALGORITHM: + err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; + + default: + ut_error; + break; + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, innodb_compression_algorithm); + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == innodb_compression_algorithm); +#endif /* UNIV_DEBUG */ + + write_size+=header_len; + +#define SECT_SIZE 512 + + /* Actual write needs to be alligned on block size */ + if (write_size % SECT_SIZE) { + write_size = (write_size + SECT_SIZE-1) & ~(SECT_SIZE-1); + ut_a((write_size % SECT_SIZE) == 0); + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + srv_stats.page_compression_saved.add((len - write_size)); + srv_stats.pages_page_compressed.inc(); + *out_len = write_size; + + return(out_buf); + +} + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. */ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. */ +{ + int err = 0; + ulint actual_size = 0; + ulint compression_alg = 0; + byte *in_buf; + ulint olen=0; + + ut_ad(buf); + ut_ad(len); + + /* Before actual decompress, make sure that page type is correct */ + + if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC || + mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: CRC %lu type %lu.\n" + "InnoDB: len %lu\n", + mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_2(buf+FIL_PAGE_TYPE), len); + + fflush(stderr); + ut_error; + } + + /* Get compression algorithm */ + compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); + + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: FIL: Compression buffer not given, allocating...\n"); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } + + /* Store actual payload size of the compressed data. This pointer + points to buffer pool. */ + if (write_size) { + *write_size = actual_size; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + switch(compression_alg) { + case PAGE_ZLIB_ALGORITHM: + err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + + fflush(stderr); + + ut_error; + } + break; + +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); + + if (err != (int)actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_decompress((const unsigned char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + actual_size,(unsigned char *)in_buf, &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif + default: + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(compression_alg)); + + fflush(stderr); + ut_error; + break; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } +} + + diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 3ca5cd4af22..52d3d1d0d39 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4,6 +4,7 @@ Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -100,6 +101,7 @@ this program; if not, write to the Free Software Foundation, Inc., #endif /* UNIV_DEBUG */ #include "fts0priv.h" #include "page0zip.h" +#include "fil0pagecompress.h" #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X)) @@ -500,6 +502,28 @@ ib_cb_t innodb_api_cb[] = { (ib_cb_t) ib_cursor_stmt_begin }; +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... ) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option user can enable page compression feature for the + table */ + HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0), + /* With this option user can set zip compression level for page + compression for this table*/ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), + /* With this option user can enable atomic writes feature for this table */ + HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), + HA_TOPTION_END +}; + + /*************************************************************//** Check whether valid argument given to innodb_ft_*_stopword_table. This function is registered as a callback with MySQL. @@ -680,6 +704,26 @@ static SHOW_VAR innodb_status_variables[]= { {"purge_view_trx_id_age", (char*) &export_vars.innodb_purge_view_trx_id_age, SHOW_LONG}, #endif /* UNIV_DEBUG */ + /* Status variables for page compression */ + {"page_compression_saved", + (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"page_compression_trim_sect512", + (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG}, + {"page_compression_trim_sect4096", + (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, + {"num_index_pages_written", + (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_non_index_pages_written", + (char*) &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG}, + {"num_pages_page_compressed", + (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_page_compressed_trim_op_saved", + (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} }; @@ -2888,6 +2932,8 @@ innobase_init( if (srv_file_per_table) innobase_hton->tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); #ifndef DBUG_OFF @@ -9598,11 +9644,16 @@ innobase_table_flags( enum row_type row_format; rec_format_t innodb_row_format = REC_FORMAT_COMPACT; bool use_data_dir; + ha_table_option_struct *options= form->s->option_struct; /* Cache the value of innodb_file_format, in case it is modified by another thread while the table is being created. */ const ulint file_format_allowed = srv_file_format; + /* Cache the value of innobase_compression_level, in case it is + modified by another thread while the table is being created. */ + const ulint default_compression_level = page_zip_level; + *flags = 0; *flags2 = 0; @@ -9656,6 +9707,8 @@ index_bad: } } + row_format = form->s->row_type; + if (create_info->key_block_size) { /* The requested compressed page size (key_block_size) is given in kilobytes. If it is a valid number, store @@ -9665,7 +9718,7 @@ index_bad: ulint kbsize; /* Key Block Size */ for (zssize = kbsize = 1; zssize <= ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX); + PAGE_ZIP_SSIZE_MAX); zssize++, kbsize <<= 1) { if (kbsize == create_info->key_block_size) { zip_ssize = zssize; @@ -9693,8 +9746,8 @@ index_bad: } if (!zip_allowed - || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX)) { + || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, + PAGE_ZIP_SSIZE_MAX)) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, @@ -9703,8 +9756,6 @@ index_bad: } } - row_format = form->s->row_type; - if (zip_ssize && zip_allowed) { /* if ROW_FORMAT is set to default, automatically change it to COMPRESSED.*/ @@ -9741,7 +9792,6 @@ index_bad: case ROW_TYPE_REDUNDANT: innodb_row_format = REC_FORMAT_REDUNDANT; break; - case ROW_TYPE_COMPRESSED: case ROW_TYPE_DYNAMIC: if (!use_tablespace) { @@ -9759,10 +9809,18 @@ index_bad: " innodb_file_format > Antelope.", get_row_format_name(row_format)); } else { - innodb_row_format = (row_format == ROW_TYPE_DYNAMIC - ? REC_FORMAT_DYNAMIC - : REC_FORMAT_COMPRESSED); - break; + switch(row_format) { + case ROW_TYPE_COMPRESSED: + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + default: + /* Not possible, avoid compiler warning */ + break; + } + break; /* Correct row_format */ } zip_allowed = FALSE; /* fall through to set row_format = COMPACT */ @@ -9789,7 +9847,15 @@ index_bad: && ((create_info->data_file_name != NULL) && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); - dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + /* Set up table dictionary flags */ + dict_tf_set(flags, + innodb_row_format, + zip_ssize, + use_data_dir, + options->page_compressed, + (ulint)options->page_compression_level == ULINT_UNDEFINED ? + default_compression_level : options->page_compression_level, + options->atomic_writes); if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; @@ -9807,6 +9873,105 @@ index_bad: DBUG_RETURN(true); } + +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. +@return NULL if valid, string if not */ +UNIV_INTERN +const char* +ha_innobase::check_table_options( + THD *thd, /*!< in: thread handle */ + TABLE* table, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: more information of the + created table, contains also the + create statement string */ + const bool use_tablespace, /*!< in: use file par table */ + const ulint file_format) +{ + enum row_type row_format = table->s->row_type;; + ha_table_option_struct *options= table->s->option_struct; + atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; + + /* Check page compression requirements */ + if (options->page_compressed) { + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_format > Antelope."); + return "PAGE_COMPRESSED"; + } + + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if ((ulint)options->page_compression_level != ULINT_UNDEFINED) { + if (options->page_compressed == false) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if (options->page_compression_level < 0 || options->page_compression_level > 9) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", + options->page_compression_level); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + /* Check atomic writes requirements */ + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ATOMIC_WRITES requires" + " innodb_file_per_table."); + return "ATOMIC_WRITES"; + } + } + + return 0; +} + /*****************************************************************//** Creates a new table to an InnoDB database. @return error number */ @@ -9838,6 +10003,7 @@ ha_innobase::create( while creating the table. So we read the current value here and make all further decisions based on this. */ bool use_tablespace = srv_file_per_table; + const ulint file_format = srv_file_format; /* Zip Shift Size - log2 - 9 of compressed page size, zero for uncompressed */ @@ -9861,6 +10027,12 @@ ha_innobase::create( /* Create the table definition in InnoDB */ + /* Validate table options not handled by the SQL-parser */ + if(check_table_options(thd, form, create_info, use_tablespace, + file_format)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + /* Validate create options if innodb_strict_mode is set. */ if (create_options_are_invalid( thd, form, create_info, use_tablespace)) { @@ -14084,6 +14256,12 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); if (table_changes != IS_EQUAL_YES) { @@ -14110,6 +14288,13 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } + /* Changes on engine specific table options requests a rebuild of the table. */ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level || + param_new->atomic_writes != param_old->atomic_writes) { + return(COMPATIBLE_DATA_NO); + } + return(COMPATIBLE_DATA_YES); } @@ -16206,7 +16391,7 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" + "Compression level used for zlib compression. 0 is no compression" ", 1 is fastest, 9 is best compression and default is 6.", NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); @@ -16716,6 +16901,40 @@ static MYSQL_SYSVAR_BOOL(force_primary_key, "Do not allow to create table without primary key (off by default)", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, + PLUGIN_VAR_OPCMDARG, + "Use trim. Default FALSE.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_LONG(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. 1 for zlib, 2 for lz3, 3 for lzo", + NULL, NULL, + PAGE_ZLIB_ALGORITHM, + 0, +#if defined(HAVE_LZO) && defined(HAVE_LZ4) + PAGE_ALGORITHM_LAST, +#elif defined(HAVE_LZ4) && !defined(HAVE_LZO) + PAGE_ALGORITHM_LZ4, +#else + PAGE_ALGORITHM_ZLIB, +#endif + 0); + +static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of multi-threaded flush threads", + NULL, NULL, + MTFLUSH_DEFAULT_WORKER, /* Default setting */ + 1, /* Minimum setting */ + MTFLUSH_MAX_WORKER, /* Max setting */ + 0); + +static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use multi-threaded flush. Default FALSE.", + NULL, NULL, FALSE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16872,6 +17091,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(trx_purge_view_update_only_debug), #endif /* UNIV_DEBUG */ MYSQL_SYSVAR(force_primary_key), + MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(compression_algorithm), + MYSQL_SYSVAR(mtflush_threads), + MYSQL_SYSVAR(use_mtflush), NULL }; @@ -17214,6 +17437,9 @@ ib_senderrf( case IB_LOG_LEVEL_FATAL: l = 0; break; + default: + l = 0; + break; } my_printv_error(code, format, MYF(l), args); diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 97f26f93225..912be30c0ec 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -56,6 +57,22 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; +/** Engine specific table options are definined using this struct */ +struct ha_table_option_struct +{ + bool page_compressed; /*!< Table is using page compression + if this option is true. */ + int page_compression_level; /*!< Table page compression level + or UNIV_UNSPECIFIED. */ + uint atomic_writes; /*!< Use atomic writes for this + table if this options is ON or + in DEFAULT if + srv_use_atomic_writes=1. + Atomic writes are not used if + value OFF.*/ +}; + + /** The class defining a handle to an Innodb table */ class ha_innobase: public handler { @@ -182,6 +199,8 @@ class ha_innobase: public handler char* norm_name, char* temp_path, char* remote_path); + const char* check_table_options(THD *thd, TABLE* table, + HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 191050bdce2..83095a939ae 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -259,6 +260,22 @@ ha_innobase::check_if_supported_inplace_alter( update_thd(); trx_search_latch_release_if_reserved(prebuilt->trx); + /* Change on engine specific table options require rebuild of the + table */ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION) { + ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct; + ha_table_option_struct *old_options= table->s->option_struct; + + if (new_options->page_compressed != old_options->page_compressed || + new_options->page_compression_level != old_options->page_compression_level || + new_options->atomic_writes != old_options->atomic_writes) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE | INNOBASE_ALTER_NOREBUILD @@ -3382,6 +3399,17 @@ ha_innobase::prepare_inplace_alter_table( if (ha_alter_info->handler_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { + /* Check engine specific table options */ + if (const char* invalid_tbopt = check_table_options( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0, + srv_file_format)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + if (const char* invalid_opt = create_options_are_invalid( user_thd, altered_table, ha_alter_info->create_info, diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 7e590c05209..e49bc837b75 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1489,6 +1490,11 @@ struct buf_page_t{ state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ + + ulint write_size; /* Write size is set when this + page is first time written and then + if written again we check is TRIM + operation needed. */ #ifndef UNIV_HOTBACKUP buf_page_t* hash; /*!< node used in chaining to buf_pool->page_hash or diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index f116720574b..fc16320be10 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2014, SkySQL Ab. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -279,6 +280,55 @@ buf_flush_get_dirty_pages_count( #endif /* !UNIV_HOTBACKUP */ +/******************************************************************//** +Start a buffer flush batch for LRU or flush list */ +ibool +buf_flush_start( +/*============*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +End a buffer flush batch for LRU or flush list */ +void +buf_flush_end( +/*==========*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +Gather the aggregated stats for both flush list and LRU list flushing */ +void +buf_flush_common( +/*=============*/ + buf_flush_t flush_type, /*!< in: type of flush */ + ulint page_count); /*!< in: number of pages flushed */ + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list or flush_list. +NOTE 1: in the case of an LRU flush the calling thread may own latches to +pages: to avoid deadlocks, this function must be written so that it cannot +end up waiting for these latches! NOTE 2: in the case of a flush list flush, +the calling thread is not allowed to own any latches on pages! +@return number of blocks for which the write request was queued */ +ulint +buf_flush_batch( +/*============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST; if BUF_FLUSH_LIST, + then the caller must not own any + latches on pages */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit); /*!< in: in the case of BUF_FLUSH_LIST + all blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + + #ifndef UNIV_NONINL #include "buf0flu.ic" #endif diff --git a/storage/innobase/include/buf0mtflu.h b/storage/innobase/include/buf0mtflu.h new file mode 100644 index 00000000000..0475335bbf5 --- /dev/null +++ b/storage/innobase/include/buf0mtflu.h @@ -0,0 +1,95 @@ +/***************************************************************************** + +Copyright (C) 2014 SkySQL Ab. All Rights Reserved. +Copyright (C) 2014 Fusion-io. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/buf0mtflu.h +Multi-threadef flush method interface function prototypes + +Created 06/02/2014 Jan Lindström jan.lindstrom@skysql.com + Dhananjoy Das DDas@fusionio.com +***********************************************************************/ + +#ifndef buf0mtflu_h +#define buf0mtflu_h + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void); +/*===========================*/ + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt); /*!< in: Number of work items */ + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized, false if not */ +bool +buf_mtflu_init_done(void); +/*======================*/ + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void); +/*===========================*/ + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed); /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. */ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*! PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + + ); + return(false); + } + } + + if (page_compression || page_compression_level) { + /* Page compression format must have compact and + atomic_blobs and page_compression_level requires + page_compression */ + if (!compact + || !page_compression + || !atomic_blobs) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); + } + } + + if (atomic_writes) { + + if(atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } @@ -594,6 +685,11 @@ dict_sys_tables_type_validate( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); + + ut_a(atomic_writes >= 0 && atomic_writes <= ATOMIC_WRITES_OFF); /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same @@ -604,12 +700,16 @@ dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } } /* Make sure there are no bits that we do not know about. */ if (unused) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + type, unused); return(ULINT_UNDEFINED); } @@ -624,6 +724,8 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + type, zip_ssize); return(ULINT_UNDEFINED); } @@ -633,11 +735,15 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } } @@ -647,6 +753,27 @@ dict_sys_tables_type_validate( format, so the DATA_DIR flag is compatible with any other table flags. However, it is not used with TEMPORARY tables.*/ + if (page_compression || page_compression_level) { + /* page compressed row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + + if (!atomic_blobs || !page_compression) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + "InnoDB: Error: atomic_blobs %lu\n", + type, page_compression, page_compression_level, atomic_blobs); + return(ULINT_UNDEFINED); + } + } + + /* Validate that the atomic writes number is within allowed range. */ + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu\n", + type, atomic_writes); + return(ULINT_UNDEFINED); + } + /* Return the validated SYS_TABLES.TYPE. */ return(type); } @@ -719,8 +846,16 @@ dict_tf_set( ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ + bool use_data_dir, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic writes setup */ { + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + switch (format) { case REC_FORMAT_REDUNDANT: *flags = 0; @@ -742,6 +877,19 @@ dict_tf_set( break; } + if (page_compressed) { + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } + + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_a(dict_tf_get_atomic_writes(*flags) == awrites); + if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); } @@ -765,6 +913,9 @@ dict_tf_to_fsp_flags( ulint table_flags) /*!< in: dict_table_t::flags */ { ulint fsp_flags; + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); @@ -783,7 +934,20 @@ dict_tf_to_fsp_flags( fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) ? FSP_FLAGS_MASK_DATA_DIR : 0; + /* In addition, tablespace flags also contain if the page + compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression); + + /* In addition, tablespace flags also contain page compression level + if page compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + + /* In addition, tablespace flags also contain flag if atomic writes + is used for this table */ + fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); + ut_a(fsp_flags_is_valid(fsp_flags)); + ut_a(dict_tf_verify_flags(table_flags, fsp_flags)); return(fsp_flags); } @@ -811,10 +975,15 @@ dict_sys_tables_type_to_tf( /* Adjust bit zero. */ flags = redundant ? 0 : 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ flags |= type & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES + ); return(flags); } @@ -842,10 +1011,14 @@ dict_tf_to_sys_tables_type( /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ type = 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ type |= flags & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES); return(type); } diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index eb259020106..b026210b214 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -120,11 +121,26 @@ This flag prevents older engines from attempting to open the table and allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_WIDTH_DATA_DIR 1 +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ + DICT_TF_WIDTH_ATOMIC_BLOBS \ - + DICT_TF_WIDTH_DATA_DIR) + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -140,9 +156,19 @@ allows InnoDB to update_create_info() accordingly. */ /** Zero relative shift position of the DATA_DIR field */ #define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) + /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ - + DICT_TF_WIDTH_DATA_DIR) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -160,6 +186,18 @@ allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_MASK_DATA_DIR \ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define DICT_TF_MASK_ATOMIC_WRITES \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ + << DICT_TF_POS_ATOMIC_WRITES) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -177,6 +215,19 @@ allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_HAS_DATA_DIR(flags) \ ((flags & DICT_TF_MASK_DATA_DIR) \ >> DICT_TF_POS_DATA_DIR) +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define DICT_TF_GET_ATOMIC_WRITES(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ + >> DICT_TF_POS_ATOMIC_WRITES) + /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h new file mode 100644 index 00000000000..19a2a6c52f3 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ + __attribute__((const)); + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return true if atomic writes are used, false if not used */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table); /*!< in: table */ + + +#ifndef UNIV_NONINL +#include "dict0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic new file mode 100644 index 00000000000..ea3c7546850 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.ic @@ -0,0 +1,191 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ +{ + ulint table_unused = DICT_TF_GET_UNUSED(table_flags); + ulint compact = DICT_TF_GET_COMPACT(table_flags); + ulint ssize = DICT_TF_GET_ZIP_SSIZE(table_flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(table_flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); + ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); + ulint fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags); + ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags); + ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags); + ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags); + + DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", + return(ULINT_UNDEFINED);); + + ut_a(!table_unused); + ut_a(!fsp_unused); + ut_a(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_a(compact == 0 || compact == 1); /* silence compiler */ + ut_a(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_a(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + + if (ssize != zip_ssize) { + fprintf(stderr, + "InnoDB: Error: table flags has zip_ssize %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has zip_ssize %ld\n", + ssize, zip_ssize); + return (FALSE); + } + if (atomic_blobs != fsp_atomic_blobs) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic_blobs %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_blobs %ld\n", + atomic_blobs, fsp_atomic_blobs); + + return (FALSE); + } + if (page_compression != fsp_page_compression) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file ahas page_compression %ld\n", + page_compression, fsp_page_compression); + + return (FALSE); + } + if (page_compression_level != fsp_page_compression_level) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression_level %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression_level %ld\n", + page_compression_level, fsp_page_compression_level); + + return (FALSE); + } + + if (atomic_writes != fsp_atomic_writes) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic writes %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_writes %ld\n", + atomic_writes, fsp_atomic_writes); + + return (FALSE); + } + + return(TRUE); +} + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level >= 0 && page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(dict_tf_get_page_compression(table->flags)); + + return(dict_tf_get_page_compression_level(table->flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*=========================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_table_is_page_compressed( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_page_compression(table->flags)); +} + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ +{ + return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags)); +} + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags)); +} diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index 1299445a8ee..5f471d1a45f 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -75,6 +76,13 @@ enum ib_quiesce_t { QUIESCE_COMPLETE /*!< All done */ }; +/** Enum values for atomic_writes table option */ +typedef enum { + ATOMIC_WRITES_DEFAULT = 0, + ATOMIC_WRITES_ON = 1, + ATOMIC_WRITES_OFF = 2 +} atomic_writes_t; + /** Prefix for tmp tables, adopted from sql/table.h */ #define tmp_file_prefix "#sql" #define tmp_file_prefix_length 4 diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 6e906fa05b0..5d56bd0b4b8 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -130,6 +131,10 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID #define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* Following are used when page compression is used */ +#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on + compressed pages. */ /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -142,6 +147,7 @@ extern fil_addr_t fil_addr_null; #ifndef UNIV_INNOCHECKSUM /** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ @@ -204,6 +210,7 @@ ulint fil_space_get_type( /*===============*/ ulint id); /*!< in: space id */ + #endif /* !UNIV_HOTBACKUP */ /*******************************************************************//** Appends a new file to the chain of files of a space. File must be closed. @@ -747,8 +754,13 @@ fil_io( void* buf, /*!< in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ __attribute__((nonnull(8))); /**********************************************************************//** Waits for an aio operation to complete. This function is used to write the @@ -988,4 +1000,30 @@ fil_mtr_rename_log( __attribute__((nonnull)); #endif /* !UNIV_INNOCHECKSUM */ + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void); +/*==================*/ +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void); +/*==================*/ + +#ifndef UNIV_INNOCHECKSUM +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. */ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space); /*!< in: space */ +#endif #endif /* fil0fil_h */ diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h new file mode 100644 index 00000000000..74f6e45f4fb --- /dev/null +++ b/storage/innobase/include/fil0pagecompress.h @@ -0,0 +1,122 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" +#include "fsp0pagecompress.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/*******************************************************************//** +Returns the page compression level flag of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level if page compressed, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. +@return atomic write table option value */ +atomic_writes_t +fil_space_get_atomic_writes( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf); /*!< in: page */ + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg); /*!> FSP_FLAGS_POS_UNUSED) +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \ + >> FSP_FLAGS_POS_ATOMIC_WRITES) + /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) +/** Set a PAGE_COMPRESSION into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \ + (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION)) + +/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ + (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) +/** Set a ATOMIC_WRITES into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ + (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES)) + /* @} */ /* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 0d81e817cc9..fb253370b6e 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,12 +64,17 @@ fsp_flags_is_valid( ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); + ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); /* fsp_flags is zero unless atomic_blobs is set. */ /* Make sure there are no bits that we do not know about. */ if (unused != 0 || flags == 1) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted unused %lu\n", + flags, unused); return(false); } else if (post_antelope) { /* The Antelope row formats REDUNDANT and COMPACT did @@ -76,6 +82,8 @@ fsp_flags_is_valid( 4-byte field is zero for Antelope row formats. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_blobs %lu\n", + flags, atomic_blobs); return(false); } } @@ -87,10 +95,14 @@ fsp_flags_is_valid( externally stored parts. */ if (post_antelope || zip_ssize != 0) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu atomic_blobs %lu\n", + flags, zip_ssize, atomic_blobs); return(false); } } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu max %d\n", + flags, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(false); } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { @@ -98,12 +110,33 @@ fsp_flags_is_valid( be zero for an original 16k page size. Validate the page shift size is within allowed range. */ + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu\n", + flags, page_ssize, UNIV_PAGE_SSIZE_MAX); return(false); } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu:%d\n", + flags, page_ssize, UNIV_PAGE_SIZE, UNIV_PAGE_SIZE_ORIG); return(false); } + /* Page compression level requires page compression and atomic blobs + to be set */ + if (page_compression_level || page_compression) { + if (!page_compression || !atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_compression %lu\n" + "InnoDB: Error: page_compression_level %lu atomic_blobs %lu\n", + flags, page_compression, page_compression_level, atomic_blobs); + return(false); + } + } + + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_writes %lu\n", + flags, atomic_writes); + return (false); + } + #if UNIV_FORMAT_MAX != UNIV_FORMAT_B # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." #endif @@ -312,3 +345,4 @@ xdes_calc_descriptor_page( } #endif /* !UNIV_INNOCHECKSUM */ + diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h new file mode 100644 index 00000000000..f2cd38481f6 --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.h @@ -0,0 +1,79 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef fsp0pagecompress_h +#define fsp0pagecompress_h + +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_ALGORITHM_LAST PAGE_LZO_ALGORITHM + +/**********************************************************************//** +Reads the page compression level from the first page of a tablespace. +@return page compression level, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_compression_level( +/*=============================*/ + const page_t* page); /*!< in: first page of a tablespace */ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Extract the page compression level from tablespace flags. +A tablespace has only one physical page compression level +whether that page is compressed or not. +@return page compression level of the file-per-table tablespace, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic new file mode 100644 index 00000000000..65ad90cdfc4 --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.ic @@ -0,0 +1,181 @@ +/***************************************************************************** + +Copyright (C) 2013,2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.ic +Implementation for helper functions for extracting/storing page +compression and atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not page compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Determine the tablespace is page compression level from dict_table_t::flags. +@return page compression level or 0 if not compressed*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); +} + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags)); +} + +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +UNIV_INLINE +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX); +} + +/*******************************************************************//** +Find out wheather the page is page compressed +@return true if page is page compressed, false if not */ +UNIV_INLINE +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); +} + +/*******************************************************************//** +Returns the page compression level of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level, ULINT_UNDEFINED if space not found */ +UNIV_INLINE +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_page_compression_level(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Extract the page compression from space. +@return true if space is page compressed, false if space is not found +or space is not page compressed. */ +UNIV_INLINE +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_is_page_compressed(flags)); + } + + return(flags); +} + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +UNIV_INLINE +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg) /*!first || list->last)); } + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h index 33385ddf2d4..9906e299808 100644 --- a/storage/innobase/include/ut0wqueue.h +++ b/storage/innobase/include/ut0wqueue.h @@ -95,6 +95,23 @@ ib_wqueue_timedwait( ib_wqueue_t* wq, /* in: work queue */ ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*space_id, 0, (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + group, 0); srv_stats.os_log_pending_writes.dec(); @@ -1966,7 +1967,7 @@ log_group_checkpoint( write_offset / UNIV_PAGE_SIZE, write_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, - buf, ((byte*) group + 1)); + buf, ((byte*) group + 1), 0); ut_ad(((ulint) group & 0x1UL) == 0); } @@ -2046,7 +2047,7 @@ log_group_read_checkpoint_info( fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0, field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0); } /******************************************************//** @@ -2340,7 +2341,7 @@ loop: fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, (ulint) (source_offset / UNIV_PAGE_SIZE), (ulint) (source_offset % UNIV_PAGE_SIZE), - len, buf, NULL); + len, buf, NULL, 0); start_lsn += len; buf += len; @@ -2405,7 +2406,7 @@ log_group_archive_file_header_write( dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, 2 * OS_FILE_LOG_BLOCK_SIZE, - buf, &log_archive_io); + buf, &log_archive_io, 0); } /******************************************************//** @@ -2441,7 +2442,7 @@ log_group_archive_completed_header_write( dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf + LOG_FILE_ARCH_COMPLETED, - &log_archive_io); + &log_archive_io, 0); } /******************************************************//** @@ -2569,7 +2570,7 @@ loop: (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, - &log_archive_io); + &log_archive_io, 0); start_lsn += len; next_offset += len; diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 08f50f70fd4..8545f231850 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2,6 +2,7 @@ Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2078,7 +2079,7 @@ recv_apply_log_recs_for_backup(void) error = fil_io(OS_FILE_READ, true, recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); if (error == DB_SUCCESS && !buf_zip_decompress(block, TRUE)) { exit(1); @@ -2088,7 +2089,7 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } if (error != DB_SUCCESS) { @@ -2117,13 +2118,13 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); } else { error = fil_io(OS_FILE_WRITE, true, recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); @@ -3088,7 +3089,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, LOG_FILE_HDR_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { @@ -3119,7 +3120,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, OS_FILE_LOG_BLOCK_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); } #ifdef UNIV_LOG_ARCHIVE @@ -3753,7 +3754,7 @@ ask_again: /* Read the archive file header */ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0, - LOG_FILE_HDR_SIZE, buf, NULL); + LOG_FILE_HDR_SIZE, buf, NULL, 0); /* Check if the archive file header is consistent */ @@ -3826,7 +3827,7 @@ ask_again: fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, read_offset / UNIV_PAGE_SIZE, - read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0); ret = recv_scan_log_recs( (buf_pool_get_n_pages() diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index d3f1980aeed..f87b4ce6d7a 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -42,8 +43,14 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#include "linux/falloc.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -60,6 +67,17 @@ Created 10/21/1995 Heikki Tuuri #include #endif +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -175,6 +193,28 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated for + page compressed page and + freed after the write + has been completed */ + + ibool page_compression; + ulint page_compression_level; + + ulint* write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + #ifdef WIN_ASYNC_IO HANDLE handle; /*!< handle object we need in the OVERLAPPED struct */ @@ -185,6 +225,12 @@ struct os_aio_slot_t{ int n_bytes; /* bytes written/read. */ int ret; /* AIO return code */ #endif /* WIN_ASYNC_IO */ +#ifdef HAVE_LZO + byte lzo_mem[LZO1X_1_15_MEM_COMPRESS]; +#else + byte lzo_mem; /* Temporal memory used by LZO */ +#endif + }; /** The asynchronous i/o array structure */ @@ -294,6 +340,79 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len); /*!< in: length of area */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. +@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name /*!< in: name of the file */ + __attribute__((unused)), + os_file_t file /*!< in: handle to the file */ + __attribute__((unused))) +{ +#ifdef DFS_IOCTL_ATOMIC_WRITE_SET + int atomic_option = 1; + + if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { + + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + os_file_handle_error_no_exit(name, "ioctl", FALSE, __FILE__, __LINE__); + return(FALSE); + } + + return(TRUE); +#else + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + return(FALSE); +#endif +} + + #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP /**********************************************************************//** @@ -439,6 +558,19 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -501,6 +633,19 @@ os_file_get_last_error_low( "InnoDB: The error means mysqld does not have" " the access rights to\n" "InnoDB: the directory.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { if (strerror(err) != NULL) { fprintf(stderr, @@ -532,6 +677,9 @@ os_file_get_last_error_low( case ENOTDIR: case EISDIR: return(OS_FILE_PATH_ERROR); + case ECANCELED: + case ENOTTY: + return(OS_FILE_OPERATION_NOT_SUPPORTED); case EAGAIN: if (srv_use_native_aio) { return(OS_FILE_AIO_RESOURCES_RESERVED); @@ -578,9 +726,11 @@ os_file_handle_error_cond_exit( const char* operation, /*!< in: operation */ ibool should_exit, /*!< in: call exit(3) if unknown error and this parameter is TRUE */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log iff it is an unknown non-fatal error */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { ulint err; @@ -610,6 +760,9 @@ os_file_handle_error_cond_exit( " InnoDB: Disk is full. Try to clean the disk" " to free space.\n"); + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + os_has_said_disk_full = TRUE; fflush(stderr); @@ -651,6 +804,9 @@ os_file_handle_error_cond_exit( ? " Cannot continue operation" : ""); } + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + if (should_exit) { exit(1); } @@ -667,10 +823,12 @@ ibool os_file_handle_error( /*=================*/ const char* name, /*!< in: name of a file or NULL */ - const char* operation) /*!< in: operation */ + const char* operation, /*!< in: operation */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* exit in case of unknown error */ - return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line)); } /****************************************************************//** @@ -681,12 +839,14 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -826,7 +986,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -837,7 +997,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -859,7 +1019,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -871,7 +1031,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -943,7 +1103,7 @@ next_file: return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1029,7 +1189,7 @@ next_file: goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1080,7 +1240,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1093,7 +1253,7 @@ os_file_create_directory( if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1203,7 +1363,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN ? - "open" : "create"); + "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; @@ -1271,7 +1431,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? "open" : "create"); + ? "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; retry = false; @@ -1313,9 +1473,12 @@ os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; *success = FALSE; #ifdef __WIN__ @@ -1376,6 +1539,23 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ int create_flag; @@ -1436,6 +1616,24 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } + + #endif /* __WIN__ */ return(file); @@ -1520,12 +1718,15 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; ibool retry; ibool on_error_no_exit; ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; #ifdef __WIN__ DBUG_EXECUTE_IF( @@ -1658,9 +1859,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1669,6 +1870,22 @@ os_file_create_func( } while (retry); + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } #else /* __WIN__ */ int create_flag; const char* mode_str = NULL; @@ -1743,9 +1960,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1797,6 +2014,22 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } #endif /* __WIN__ */ return(file); @@ -1855,7 +2088,7 @@ loop: ret = unlink(name); if (ret != 0 && errno != ENOENT) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -1919,7 +2152,7 @@ loop: ret = unlink(name); if (ret != 0) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -1963,7 +2196,7 @@ os_file_rename_func( return(TRUE); } - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); #else @@ -1972,7 +2205,7 @@ os_file_rename_func( ret = rename(oldpath, newpath); if (ret != 0) { - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2003,7 +2236,7 @@ os_file_close_func( return(TRUE); } - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); #else @@ -2012,7 +2245,7 @@ os_file_close_func( ret = close(file); if (ret == -1) { - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); } @@ -2106,6 +2339,11 @@ os_file_set_size( current_size = 0; +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: File %s current_size %lu extended_size %lu\n", + name, os_file_get_size(file), size); +#endif + #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { @@ -2114,15 +2352,15 @@ os_file_set_size( fprintf(stderr, "InnoDB: Error: preallocating file " "space for file \'%s\' failed. Current size " "%lu, desired size %lu\n", - name, (long unsigned) current_size, (long unsigned) size); - os_file_handle_error_no_exit(name, "posix_fallocate", FALSE); + name, current_size, size); + os_file_handle_error_no_exit(name, "posix_fallocate", FALSE, __FILE__, __LINE__); + return(FALSE); } return(TRUE); } #endif - /* Write up to 1 megabyte at a time. */ buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) * UNIV_PAGE_SIZE; @@ -2281,7 +2519,7 @@ os_file_flush_func( return(TRUE); } - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2335,7 +2573,7 @@ os_file_flush_func( ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2642,6 +2880,9 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len, NULL); + } return(TRUE); } #else /* __WIN__ */ @@ -2654,6 +2895,9 @@ try_again: ret = os_file_pread(file, buf, n, offset); if ((ulint) ret == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } return(TRUE); } @@ -2665,7 +2909,7 @@ try_again: #ifdef __WIN__ error_handling: #endif - retry = os_file_handle_error(NULL, "read"); + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); if (retry) { goto try_again; @@ -2788,7 +3032,7 @@ try_again: #ifdef __WIN__ error_handling: #endif - retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); if (retry) { goto try_again; @@ -3037,7 +3281,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3065,7 +3309,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3114,7 +3358,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3167,7 +3411,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3733,7 +3977,8 @@ os_aio_array_create( array->slots = static_cast( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); + #ifdef __WIN__ array->handles = static_cast(ut_malloc(n * sizeof(HANDLE))); #endif /* __WIN__ */ @@ -3821,8 +4066,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { -#ifdef WIN_ASYNC_IO ulint i; +#ifdef WIN_ASYNC_IO for (i = 0; i < array->n_slots; i++) { os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); @@ -3844,6 +4089,14 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4177,7 +4430,16 @@ os_aio_array_reserve_slot( void* buf, /*!< in: buffer where to read or from which to write */ os_offset_t offset, /*!< in: file offset */ - ulint len) /*!< in: length of the block to read or write */ + ulint len, /*!< in: length of the block to read or write */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4267,6 +4529,54 @@ found: slot->buf = static_cast(buf); slot->offset = offset; slot->io_already_done = FALSE; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + + /* If the space is page compressed and this is write operation + then we compress the page */ + if (message1 && type == OS_FILE_WRITE && page_compression ) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + + ut_ad(slot->page_buf); + + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + &real_len, + slot->lzo_mem + ); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back, not sure if this is really needed + below */ + os_mutex_enter(array->mutex); + + } + #ifdef WIN_ASYNC_IO control = &slot->control; @@ -4541,10 +4851,19 @@ os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - void* message2)/*!< in: message for the aio handler + void* message2,/*!< in: message for the aio handler (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4642,7 +4961,8 @@ try_again: } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n); + name, buf, offset, n, write_size, page_compression, page_compression_level); + if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4722,7 +5042,7 @@ err_exit: os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? "aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4835,7 +5155,7 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { retry = TRUE; } else { @@ -4865,9 +5185,17 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret = WriteFile(slot->file, slot->buf, + if (slot->message1 && + slot->page_compression && + slot->page_buf) { + ret = WriteFile(slot->file, slot->page_buf, (DWORD) slot->len, &len, &(slot->control)); + } else { + ret = WriteFile(slot->file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + } break; case OS_FILE_READ: @@ -4899,6 +5227,28 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + os_aio_array_free_slot(array, slot); return(ret_val); @@ -4988,6 +5338,33 @@ retry: /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. */ + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -5131,6 +5508,13 @@ found: } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5138,7 +5522,7 @@ found: windows and linux native AIO. We should probably look into this to transparently re-submit the IO. */ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5809,4 +6193,168 @@ os_aio_all_slots_free(void) } #endif /* UNIV_DEBUG */ +#ifdef _WIN32 +#include +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len) /*!< in: length of area */ +{ + +#define SECT_SIZE 512 + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + // len here should be alligned to sector size + ut_a((trim_len % SECT_SIZE) == 0); + ut_a((len % SECT_SIZE) == 0); + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. + if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + *slot->write_size, trim_len, len); +#endif + + if (*slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + *slot->write_size = len; + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lx len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", ret, (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + if (slot->write_size) { + *slot->write_size = 0; + } + +#endif /* HAVE_FALLOCATE ... */ + +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(file,FSCTL_FILE_LEVEL_TRIM,&flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lx len: %du payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#endif + + srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8))); + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} #endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + /* We allocate extra to avoid memory overwrite on compression */ + cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_compression_page = static_cast(cbuf2); + slot->page_buf = static_cast(cbuf); +} diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index ea346566e57..f276efdc021 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -290,6 +290,18 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN}, + {"buffer_index_pages_written", "buffer", + "Number of index pages written (innodb_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + + {"buffer_non_index_pages_written", "buffer", + "Number of non index pages written (innodb_non_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read (innodb_pages_read)", static_cast( @@ -879,6 +891,46 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + {"compress_saved", "compression", + "Number of bytes saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED}, + + {"compress_trim_sect512", "compression", + "Number of sect-512 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512}, + + {"compress_trim_sect4096", "compression", + "Number of sect-4K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096}, + + {"compress_pages_page_compressed", "compression", + "Number of pages compressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED}, + + {"compress_page_compressed_trim_op", "compression", + "Number of TRIM operation performed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP}, + + {"compress_page_compressed_trim_op_saved", "compression", + "Number of TRIM operation saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED}, + + {"compress_pages_page_decompressed", "compression", + "Number of pages decompressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + + {"compress_pages_page_compression_error", "compression", + "Number of page compression errors", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1552,6 +1604,16 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; + /* innodb_index_pages_written, the number of index pages written */ + case MONITOR_OVLD_INDEX_PAGES_WRITTEN: + value = srv_stats.index_pages_written; + break; + + /* innodb_non_index_pages_written, the number of non index pages written */ + case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN: + value = srv_stats.non_index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); @@ -1793,6 +1855,31 @@ srv_mon_process_existing_counter( value = btr_cur_n_non_sea; break; + case MONITOR_OVLD_PAGE_COMPRESS_SAVED: + value = srv_stats.page_compression_saved; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512: + value = srv_stats.page_compression_trim_sect512; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096: + value = srv_stats.page_compression_trim_sect4096; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSED: + value = srv_stats.pages_page_compressed; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP: + value = srv_stats.page_compressed_trim_op; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED: + value = srv_stats.page_compressed_trim_op_saved; + break; + case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: + value = srv_stats.pages_page_decompressed; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR: + value = srv_stats.pages_page_compression_error; + break; + default: ut_error; } diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index fc903852cba..054e5f8e8e7 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -3,6 +3,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -70,6 +71,7 @@ Created 10/8/1995 Heikki Tuuri #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /* The following is the maximum allowed duration of a lock wait. */ UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600; @@ -145,6 +147,20 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +/* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) +to the pages */ +UNIV_INTERN my_bool srv_use_trim = FALSE; +/* If this flag is TRUE, then we will use posix fallocate for file extentsion */ +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +/* If this flag is TRUE, then we disable doublewrite buffer */ +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +UNIV_INTERN long innodb_compression_algorithm = PAGE_ZLIB_ALGORITHM; +/* Number of threads used for multi-threaded flush */ +UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; +/* If this flag is TRUE, then we will use multi threaded flush. */ +UNIV_INTERN my_bool srv_use_mtflush = FALSE; + #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function pointers, because they are not available on Windows Server 2003 and @@ -347,11 +363,6 @@ batch flushing i.e.: LRU flushing and flush_list flushing. The rest of the pages are used for single page flushing. */ UNIV_INTERN ulong srv_doublewrite_batch_size = 120; -UNIV_INTERN ibool srv_use_atomic_writes = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -UNIV_INTERN ibool srv_use_posix_fallocate = TRUE; -#endif - UNIV_INTERN ulong srv_replication_delay = 0; /*-------------------------------------------*/ @@ -375,6 +386,17 @@ static ulint srv_n_rows_read_old = 0; UNIV_INTERN ulint srv_truncated_status_writes = 0; UNIV_INTERN ulint srv_available_undo_logs = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; +UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_non_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; +UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; + + /* Set the following to 0 if you want InnoDB to write messages on stderr on startup/shutdown. */ UNIV_INTERN ibool srv_print_verbose_log = TRUE; @@ -1460,6 +1482,15 @@ srv_export_innodb_status(void) srv_truncated_status_writes; export_vars.innodb_available_undo_logs = srv_available_undo_logs; + export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved; + export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; + export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; + export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written; + export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; + export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; + export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; + export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 387f793a763..598d01d3584 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -3,6 +3,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -72,6 +73,7 @@ Created 2/16/1996 Heikki Tuuri # include "sync0sync.h" # include "buf0flu.h" # include "buf0rea.h" +# include "buf0mtflu.h" # include "dict0boot.h" # include "dict0load.h" # include "dict0stats_bg.h" @@ -129,7 +131,11 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32]; +/** 6 is the ? */ +#define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + 32) +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + MTFLUSH_MAX_WORKER]; +/* Thread contex data for multi-threaded flush */ +void *mtflush_ctx=NULL; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -527,7 +533,7 @@ create_log_file( *file = os_file_create( innodb_file_log_key, name, OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name); @@ -734,7 +740,7 @@ open_log_file( *file = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); return(DB_ERROR); @@ -825,7 +831,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode) { @@ -868,7 +874,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -901,17 +907,17 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else if (i == 0) { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -1105,7 +1111,7 @@ srv_undo_tablespace_create( innodb_file_data_key, name, srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode && ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -1192,7 +1198,8 @@ srv_undo_tablespace_open( | OS_FILE_ON_ERROR_SILENT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + FALSE); /* If the file open was successful then load the tablespace. */ @@ -2663,6 +2670,25 @@ files_checked: } if (!srv_read_only_mode) { + + if (srv_use_mtflush) { + /* Start multi-threaded flush threads */ + mtflush_ctx = buf_mtflu_handler_init( + srv_mtflush_threads, + srv_buf_pool_instances); + + /* Set up the thread ids */ + buf_mtflu_set_thread_ids( + srv_mtflush_threads, + mtflush_ctx, + (thread_ids + 6 + 32)); + +#if UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", + __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); +#endif + } + os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); } @@ -2927,6 +2953,17 @@ innobase_shutdown_for_mysql(void) logs_empty_and_mark_files_at_shutdown() and should have already quit or is quitting right now. */ + + if (srv_use_mtflush) { + /* g. Exit the multi threaded flush threads */ + + buf_mtflu_io_thread_exit(); + } + +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: %s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); +#endif + os_mutex_enter(os_sync_mutex); if (os_thread_count == 0) { diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc index d1ba36b3b00..1607e535a94 100644 --- a/storage/innobase/ut/ut0wqueue.cc +++ b/storage/innobase/ut/ut0wqueue.cc @@ -161,6 +161,38 @@ ib_wqueue_timedwait( return(node ? node->data : NULL); } +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq) /*mutex); + + if(!ib_list_is_empty(wq->items)) { + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + } + } + + /* We must reset the event when the list + gets emptied. */ + if(ib_list_is_empty(wq->items)) { + os_event_reset(wq->event); + } + + mutex_exit(&wq->mutex); + + return (node ? node->data : NULL); +} + /******************************************************************** Check if queue is empty. */ @@ -173,3 +205,20 @@ ib_wqueue_is_empty( { return(ib_list_is_empty(wq->items)); } + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq) /*mutex); + len = ib_list_len(wq->items); + mutex_exit(&wq->mutex); + + return(len); +} diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index b3751b269c3..c3acfb85fb3 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -18,6 +18,11 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) +INCLUDE(lz4) +INCLUDE(lzo) + +MYSQL_CHECK_LZ4_STATIC() +MYSQL_CHECK_LZO_STATIC() # OS tests IF(UNIX) @@ -290,6 +295,7 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc + buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -303,6 +309,7 @@ SET(INNOBASE_SOURCES eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc + fil/fil0pagecompress.cc fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc index 69dcc4ce9cb..2c018131e19 100644 --- a/storage/xtradb/buf/buf0buf.cc +++ b/storage/xtradb/buf/buf0buf.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -3481,6 +3482,7 @@ buf_page_init_low( bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; + bpage->write_size = 0; HASH_INVALIDATE(bpage, hash); bpage->is_corrupt = FALSE; #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG @@ -5636,3 +5638,24 @@ buf_page_init_for_backup_restore( } } #endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Aquire LRU list mutex */ +void +buf_pool_mutex_enter( +/*=================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + mutex_enter(&buf_pool->LRU_list_mutex); +} +/*********************************************************************//** +Exit LRU list mutex */ +void +buf_pool_mutex_exit( +/*================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + mutex_exit(&buf_pool->LRU_list_mutex); +} diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc index c1bc0ee4c6e..1de36eb68f4 100644 --- a/storage/xtradb/buf/buf0dblwr.cc +++ b/storage/xtradb/buf/buf0dblwr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -381,7 +382,7 @@ buf_dblwr_init_or_load_pages( buffer */ fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, - UNIV_PAGE_SIZE, read_buf, NULL); + UNIV_PAGE_SIZE, read_buf, NULL, 0); doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) @@ -417,11 +418,11 @@ buf_dblwr_init_or_load_pages( fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - buf, NULL); + buf, NULL, 0); fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - NULL); + NULL, 0); /* Check if any of these pages is half-written in data files, in the intended position */ @@ -448,7 +449,7 @@ buf_dblwr_init_or_load_pages( } fil_io(OS_FILE_WRITE, true, space_id, 0, source_page_no, 0, - UNIV_PAGE_SIZE, page, NULL); + UNIV_PAGE_SIZE, page, NULL, 0); } else if (load_corrupt_pages) { @@ -510,7 +511,7 @@ buf_dblwr_process() fil_io(OS_FILE_READ, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); + read_buf, NULL, 0); /* Check if the page is corrupt */ @@ -562,7 +563,7 @@ buf_dblwr_process() fil_io(OS_FILE_WRITE, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); ib_logf(IB_LOG_LEVEL_INFO, "Recovered the page from" @@ -778,7 +779,7 @@ buf_dblwr_write_block_to_datafile( buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), (void*) bpage->zip.data, - (void*) bpage); + (void*) bpage, 0); return; } @@ -790,8 +791,8 @@ buf_dblwr_write_block_to_datafile( fil_io(flags, sync, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block); - + (void*) block->frame, (void*) block, + (ulint *)&bpage->write_size); } /********************************************************************//** @@ -885,7 +886,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. */ @@ -901,7 +902,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); flush: /* increment the doublewrite flushed pages counter */ @@ -1131,14 +1132,14 @@ retry: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf - + UNIV_PAGE_SIZE * i), NULL); + + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. Write it directly to the doublewrite buffer */ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, - NULL); + NULL, 0); } /* Now flush the doublewrite buffer data to disk */ diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index 9e92cf321a7..3529ce9dd3b 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +32,7 @@ Created 11/11/1995 Heikki Tuuri #endif #include "buf0buf.h" +#include "buf0mtflu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" @@ -44,10 +47,12 @@ Created 11/11/1995 Heikki Tuuri #include "ibuf0ibuf.h" #include "log0log.h" #include "os0file.h" +#include "os0sync.h" #include "trx0sys.h" #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /** Number of pages flushed through non flush_list flushes. */ // static ulint buf_lru_flush_page_count = 0; @@ -75,11 +80,6 @@ in thrashing. */ /* @} */ -/** Handled page counters for a single flush */ -struct flush_counters_t { - ulint flushed; /*!< number of dirty pages flushed */ - ulint evicted; /*!< number of clean pages evicted */ -}; /******************************************************************//** Increases flush_list size in bytes with zip_size for compressed page, @@ -720,8 +720,10 @@ buf_flush_write_complete( buf_pool->n_flush[flush_type]--; - /* fprintf(stderr, "n pending flush %lu\n", - buf_pool->n_flush[flush_type]); */ +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "n pending flush %lu\n", + buf_pool->n_flush[flush_type]); +#endif if (buf_pool->n_flush[flush_type] == 0 && buf_pool->init_flush[flush_type] == FALSE) { @@ -879,6 +881,8 @@ buf_flush_write_block_low( { ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; + ulint space_id = buf_page_get_space(bpage); + atomic_writes_t awrites = fil_space_get_atomic_writes(space_id); #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -954,12 +958,26 @@ buf_flush_write_block_low( sync, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - frame, bpage); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage, sync); + frame, bpage, &bpage->write_size); } else { - ut_ad(!sync); - buf_dblwr_add_to_batch(bpage); + /* InnoDB uses doublewrite buffer and doublewrite buffer + is initialized. User can define do we use atomic writes + on a file space (table) or not. If atomic writes are + not used we should use doublewrite buffer and if + atomic writes should be used, no doublewrite buffer + is used. */ + + if (awrites == ATOMIC_WRITES_ON) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage, &bpage->write_size); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage, sync); + } else { + buf_dblwr_add_to_batch(bpage); + } } /* When doing single page flushing the IO is done synchronously @@ -1506,6 +1524,7 @@ buf_flush_LRU_list_batch( n->flushed = 0; n->evicted = 0; + n->unzip_LRU_evicted = 0; ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); @@ -1641,21 +1660,22 @@ buf_do_LRU_batch( flush_counters_t* n) /*!< out: flushed/evicted page counts */ { - ulint count = 0; - if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { - count += buf_free_from_unzip_LRU_list_batch(buf_pool, max); + n->unzip_LRU_evicted + += buf_free_from_unzip_LRU_list_batch(buf_pool, max); + } else { + n->unzip_LRU_evicted = 0; } - if (max > count) { - buf_flush_LRU_list_batch(buf_pool, max - count, limited_scan, - n); + if (max > n->unzip_LRU_evicted) { + buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, + limited_scan, n); } else { n->evicted = 0; n->flushed = 0; } - n->flushed += count; + n->flushed += n->unzip_LRU_evicted; } /*******************************************************************//** @@ -1746,7 +1766,6 @@ end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! @return number of blocks for which the write request was queued */ __attribute__((nonnull)) -static void buf_flush_batch( /*============*/ @@ -1805,7 +1824,6 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -static void buf_flush_common( /*=============*/ @@ -1832,7 +1850,6 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -static ibool buf_flush_start( /*============*/ @@ -1847,6 +1864,11 @@ buf_flush_start( /* There is already a flush batch of the same type running */ +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, "Error: flush_type %d n_flush %lu init_flush %lu\n", + flush_type, buf_pool->n_flush[flush_type], buf_pool->init_flush[flush_type]); +#endif + mutex_exit(&buf_pool->flush_state_mutex); return(FALSE); @@ -1861,7 +1883,6 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -static void buf_flush_end( /*==========*/ @@ -1916,6 +1937,24 @@ buf_flush_wait_batch_end( } } +/* JAN: TODO: */ + +void buf_pool_enter_LRU_mutex( + buf_pool_t* buf_pool) +{ + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + mutex_enter(&buf_pool->LRU_list_mutex); +} + +void buf_pool_exit_LRU_mutex( + buf_pool_t* buf_pool) +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + mutex_exit(&buf_pool->LRU_list_mutex); +} + +/* JAN: TODO: END: */ + /*******************************************************************//** This utility flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free @@ -1985,6 +2024,10 @@ buf_flush_list( bool timeout = false; ulint flush_start_time = 0; + if (buf_mtflu_init_done()) { + return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed)); + } + for (i = 0; i < srv_buf_pool_instances; i++) { requested_pages[i] = 0; active_instance[i] = true; @@ -2212,6 +2255,11 @@ buf_flush_LRU_tail(void) ulint free_list_lwm = srv_LRU_scan_depth / 100 * srv_cleaner_free_list_lwm; + if(buf_mtflu_init_done()) + { + return(buf_mtflu_flush_LRU_tail()); + } + for (ulint i = 0; i < srv_buf_pool_instances; i++) { const buf_pool_t* buf_pool = buf_pool_from_array(i); @@ -2269,9 +2317,15 @@ buf_flush_LRU_tail(void) requested_pages[i] += lru_chunk_size; + /* If we failed to flush or evict this + instance, do not bother anymore. But take into + account that we might have zero flushed pages + because the flushing request was fully + satisfied by unzip_LRU evictions. */ if (requested_pages[i] >= scan_depth[i] || !(srv_cleaner_eviction_factor - ? n.evicted : n.flushed)) { + ? n.evicted + : (n.flushed + n.unzip_LRU_evicted))) { active_instance[i] = false; remaining_instances--; diff --git a/storage/xtradb/buf/buf0mtflu.cc b/storage/xtradb/buf/buf0mtflu.cc new file mode 100644 index 00000000000..b14b83aa5d0 --- /dev/null +++ b/storage/xtradb/buf/buf0mtflu.cc @@ -0,0 +1,707 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file buf/buf0mtflu.cc +Multi-threaded flush method implementation + +Created 06/11/2013 Dhananjoy Das DDas@fusionio.com +Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com +Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com +Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0mtflu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "ut0byte.h" +#include "ut0lst.h" +#include "page0page.h" +#include "fil0fil.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" +#include "os0sync.h" +#include "trx0sys.h" +#include "srv0mon.h" +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" + +#define MT_COMP_WATER_MARK 50 +/** Time to wait for a message. */ +#define MT_WAIT_IN_USECS 5000000 + +/* Work item status */ +typedef enum wrk_status { + WRK_ITEM_UNSET=0, /*!< Work item is not set */ + WRK_ITEM_START=1, /*!< Processing of work item has started */ + WRK_ITEM_DONE=2, /*!< Processing is done usually set to + SUCCESS/FAILED */ + WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ + WRK_ITEM_FAILED=3, /*!< Work item process failed */ + WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_SET=5, /*!< Work item is set */ + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +/* Work item task type */ +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, /*!< Exit queue-wait */ + MT_WRK_WRITE=1, /*!< Flush operation */ + MT_WRK_READ=2, /*!< Read operation */ + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; + +/* Work thread status */ +typedef enum wthr_status { + WTHR_NOT_INIT=0, /*!< Work thread not initialized */ + WTHR_INITIALIZED=1, /*!< Work thread initialized */ + WTHR_SIG_WAITING=2, /*!< Work thread wating signal */ + WTHR_RUNNING=3, /*!< Work thread running */ + WTHR_NO_WORK=4, /*!< Work thread has no work */ + WTHR_KILL_IT=5, /*!< Work thread should exit */ + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +/* Write work task */ +typedef struct wr_tsk { + buf_pool_t *buf_pool; /*!< buffer-pool instance */ + buf_flush_t flush_type; /*!< flush-type for buffer-pool + flush operation */ + ulint min; /*!< minimum number of pages + requested to be flushed */ + lsn_t lsn_limit; /*!< lsn limit for the buffer-pool + flush operation */ +} wr_tsk_t; + +/* Read work task */ +typedef struct rd_tsk { + buf_pool_t *page_pool; /*!< list of pages to decompress; */ +} rd_tsk_t; + +/* Work item */ +typedef struct wrk_itm +{ + mt_wrk_tsk_t tsk; /*!< Task type. Based on task-type + one of the entries wr_tsk/rd_tsk + will be used */ + wr_tsk_t wr; /*!< Flush page list */ + rd_tsk_t rd; /*!< Decompress page list */ + ulint n_flushed; /*!< Flushed pages count */ + os_thread_id_t id_usr; /*!< Thread-id currently working */ + wrk_status_t wi_status; /*!< Work item status */ + mem_heap_t *wheap; /*!< Heap were to allocate memory + for queue nodes */ + mem_heap_t *rheap; +} wrk_t; + +typedef struct thread_data +{ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + wthr_status_t wt_status; /*!< Worker thread status */ +} thread_data_t; + +/* Thread syncronization data */ +typedef struct thread_sync +{ + /* Global variables used by all threads */ + os_fast_mutex_t thread_global_mtx; /*!< Mutex used protecting below + variables */ + ulint n_threads; /*!< Number of threads */ + ib_wqueue_t *wq; /*!< Work Queue */ + ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ + ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ + mem_heap_t* wheap; /*!< Work heap where memory + is allocated */ + mem_heap_t* rheap; /*!< Work heap where memory + is allocated */ + wthr_status_t gwt_status; /*!< Global thread status */ + + /* Variables used by only one thread at a time */ + thread_data_t* thread_data; /*!< Thread specific data */ + +} thread_sync_t; + +static int mtflush_work_initialized = -1; +static thread_sync_t* mtflush_ctx=NULL; +static os_fast_mutex_t mtflush_mtx; + +/******************************************************************//** +Set multi-threaded flush work initialized. */ +static inline +void +buf_mtflu_work_init(void) +/*=====================*/ +{ + mtflush_work_initialized = 1; +} + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized */ +bool +buf_mtflu_init_done(void) +/*=====================*/ +{ + return(mtflush_work_initialized == 1); +} + +/******************************************************************//** +Fush buffer pool instance. +@return number of flushed pages, or 0 if error happened +*/ +static +ulint +buf_mtflu_flush_pool_instance( +/*==========================*/ + wrk_t *work_item) /*!< inout: work item to be flushed */ +{ + flush_counters_t n; + + ut_a(work_item != NULL); + ut_a(work_item->wr.buf_pool != NULL); + + memset(&n, 0, sizeof(flush_counters_t)); + + if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); +#endif + return 0; + } + + + if (work_item->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. + */ + buf_pool_mutex_enter(work_item->wr.buf_pool); + work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU); + buf_pool_mutex_exit(work_item->wr.buf_pool); + work_item->wr.min = ut_min(srv_LRU_scan_depth,work_item->wr.min); + } + + buf_flush_batch(work_item->wr.buf_pool, + work_item->wr.flush_type, + work_item->wr.min, + work_item->wr.lsn_limit, + false, + &n); + + work_item->n_flushed = n.flushed; + + buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); + buf_flush_common(work_item->wr.flush_type, work_item->n_flushed); + + return work_item->n_flushed; +} + +/******************************************************************//** +Worker function to wait for work items and processing them and +sending reply back. +*/ +static +void +mtflush_service_io( +/*===============*/ + thread_sync_t* mtflush_io, /*!< inout: multi-threaded flush + syncronization data */ + thread_data_t* thread_data) /* Thread status data */ +{ + wrk_t *work_item = NULL; + ulint n_flushed=0; + + ut_a(mtflush_io != NULL); + ut_a(thread_data != NULL); + + thread_data->wt_status = WTHR_SIG_WAITING; + + work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); + + if (work_item == NULL) { + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wq, MT_WAIT_IN_USECS); + } + + if (work_item) { + thread_data->wt_status = WTHR_RUNNING; + } else { + /* Thread did not get any work */ + thread_data->wt_status = WTHR_NO_WORK; + return; + } + + if (work_item->wi_status != WRK_ITEM_EXIT) { + work_item->wi_status = WRK_ITEM_SET; + } + +#ifdef UNIV_MTFLUSH_DEBUG + ut_a(work_item->id_usr == 0); +#endif + work_item->id_usr = os_thread_get_curr_id(); + + /* This works as a producer/consumer model, where in tasks are + * inserted into the work-queue (wq) and completions are based + * on the type of operations performed and as a result the WRITE/ + * compression/flush operation completions get posted to wr_cq. + * And READ/decompress operations completions get posted to rd_cq. + * in future we may have others. + */ + + switch(work_item->tsk) { + case MT_WRK_NONE: + ut_a(work_item->wi_status == WRK_ITEM_EXIT); + work_item->wi_status = WRK_ITEM_EXIT; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + thread_data->wt_status = WTHR_KILL_IT; + break; + + case MT_WRK_WRITE: + ut_a(work_item->wi_status == WRK_ITEM_SET); + work_item->wi_status = WRK_ITEM_START; + /* Process work item */ + if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { + work_item->wi_status = WRK_ITEM_FAILED; + } + work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + break; + + case MT_WRK_READ: + ut_a(0); + break; + + default: + /* None other than Write/Read handling planned */ + ut_a(0); + break; + } +} + +/******************************************************************//** +Thead used to flush dirty pages when multi-threaded flush is +used. +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(mtflush_io_thread)( +/*==============================*/ + void * arg) +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)arg); + thread_data_t *this_thread_data = NULL; + ulint i; + + /* Find correct slot for this thread */ + os_fast_mutex_lock(&(mtflush_io->thread_global_mtx)); + for(i=0; i < mtflush_io->n_threads; i ++) { + if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) { + break; + } + } + + ut_a(i <= mtflush_io->n_threads); + this_thread_data = &mtflush_io->thread_data[i]; + os_fast_mutex_unlock(&(mtflush_io->thread_global_mtx)); + + while (TRUE) { + +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note. Thread %lu work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); +#endif /* UNIV_MTFLUSH_DEBUG */ + + mtflush_service_io(mtflush_io, this_thread_data); + + + if (this_thread_data->wt_status == WTHR_KILL_IT) { + break; + } + } + + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void) +/*==========================*/ +{ + long i; + thread_sync_t* mtflush_io = mtflush_ctx; + wrk_t* work_item = NULL; + + ut_a(mtflush_io != NULL); + + /* Allocate work items for shutdown message */ + work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + + /* Confirm if the io-thread KILL is in progress, bailout */ + if (mtflush_io->gwt_status == WTHR_KILL_IT) { + return; + } + + mtflush_io->gwt_status = WTHR_KILL_IT; + + fprintf(stderr, "InnoDB: [Note]: Signal mtflush_io_threads to exit [%lu]\n", + srv_mtflush_threads); + + /* Send one exit work item/thread */ + for (i=0; i < srv_mtflush_threads; i++) { + work_item[i].tsk = MT_WRK_NONE; + work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wheap = mtflush_io->wheap; + work_item[i].rheap = mtflush_io->rheap; + work_item[i].id_usr = 0; + + ib_wqueue_add(mtflush_io->wq, + (void *)&(work_item[i]), + mtflush_io->wheap); + } + + /* Wait until all work items on a work queue are processed */ + while(!ib_wqueue_is_empty(mtflush_io->wq)) { + /* Wait */ + os_thread_sleep(MT_WAIT_IN_USECS); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Collect all work done items */ + for (i=0; i < srv_mtflush_threads;) { + wrk_t* work_item = NULL; + + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); + + /* If we receive reply to work item and it's status is exit, + thead has processed this message and existed */ + if (work_item && work_item->wi_status == WRK_ITEM_EXIT) { + i++; + } + } + + /* Wait about 1/2 sec to allow threads really exit */ + os_thread_sleep(MT_WAIT_IN_USECS); + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); + ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); + + /* Free all queues */ + ib_wqueue_free(mtflush_io->wq); + ib_wqueue_free(mtflush_io->wr_cq); + ib_wqueue_free(mtflush_io->rd_cq); + + os_fast_mutex_free(&mtflush_mtx); + os_fast_mutex_free(&mtflush_io->thread_global_mtx); + + /* Free heap */ + mem_heap_free(mtflush_io->wheap); + mem_heap_free(mtflush_io->rheap); +} + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt) /*!< in: Number of work items */ +{ + ulint i; + mem_heap_t* mtflush_heap; + mem_heap_t* mtflush_heap2; + + /* Create heap, work queue, write completion queue, read + completion queue for multi-threaded flush, and init + handler. */ + mtflush_heap = mem_heap_create(0); + ut_a(mtflush_heap != NULL); + mtflush_heap2 = mem_heap_create(0); + ut_a(mtflush_heap2 != NULL); + + mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, + sizeof(thread_sync_t)); + memset(mtflush_ctx, 0, sizeof(thread_sync_t)); + ut_a(mtflush_ctx != NULL); + mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc( + mtflush_heap, sizeof(thread_data_t) * n_threads); + ut_a(mtflush_ctx->thread_data); + memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads); + + mtflush_ctx->n_threads = n_threads; + mtflush_ctx->wq = ib_wqueue_create(); + ut_a(mtflush_ctx->wq); + mtflush_ctx->wr_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->wr_cq); + mtflush_ctx->rd_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->rd_cq); + mtflush_ctx->wheap = mtflush_heap; + mtflush_ctx->rheap = mtflush_heap2; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_ctx->thread_global_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + + /* Create threads for page-compression-flush */ + for(i=0; i < n_threads; i++) { + os_thread_id_t new_thread_id; + + mtflush_ctx->thread_data[i].wt_status = WTHR_INITIALIZED; + + mtflush_ctx->thread_data[i].wthread = os_thread_create( + mtflush_io_thread, + ((void *) mtflush_ctx), + &new_thread_id); + + mtflush_ctx->thread_data[i].wthread_id = new_thread_id; + } + + buf_mtflu_work_init(); + + return((void *)mtflush_ctx); +} + +/******************************************************************//** +Flush buffer pool instances. +@return number of pages flushed. */ +ulint +buf_mtflu_flush_work_items( +/*=======================*/ + ulint buf_pool_inst, /*!< in: Number of buffer pool instances */ + ulint *per_pool_pages_flushed, /*!< out: Number of pages + flushed/instance */ + buf_flush_t flush_type, /*!< in: Type of flush */ + ulint min_n, /*!< in: Wished minimum number of + blocks to be flushed */ + lsn_t lsn_limit) /*!< in: All blocks whose + oldest_modification is smaller than + this should be flushed (if their + number does not exceed min_n) */ +{ + ulint n_flushed=0, i; + mem_heap_t* work_heap; + mem_heap_t* reply_heap; + wrk_t work_item[MTFLUSH_MAX_WORKER]; + + /* Allocate heap where all work items used and queue + node items areallocated */ + work_heap = mem_heap_create(0); + reply_heap = mem_heap_create(0); + + + for(i=0;iwq, + (void *)(work_item + i), + work_heap); + } + + /* wait on the completion to arrive */ + for(i=0; i< buf_pool_inst;) { + wrk_t *done_wi = NULL; + done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); + + if (done_wi != NULL) { + per_pool_pages_flushed[i] = done_wi->n_flushed; + +#ifdef UNIV_MTFLUSH_DEBUG + if((int)done_wi->id_usr == 0 && + (done_wi->wi_status == WRK_ITEM_SET || + done_wi->wi_status == WRK_ITEM_UNSET)) { + fprintf(stderr, + "**Set/Unused work_item[%lu] flush_type=%d\n", + i, + done_wi->wr.flush_type); + ut_a(0); + } +#endif + + n_flushed+= done_wi->n_flushed; + i++; + } + } + + ut_a(ib_wqueue_is_empty(mtflush_ctx->wq)); + ut_a(ib_wqueue_is_empty(mtflush_ctx->wr_cq)); + + /* Release used work_items and queue nodes */ + mem_heap_free(work_heap); + mem_heap_free(reply_heap); + + return(n_flushed); +} + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. */ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + + /* This lock is to safequard against re-entry if any. */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt_flush[i]; + } + if (cnt_flush[i]) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt_flush[i]); + } + } +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); +#endif + return(success); +} + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void) +/*==========================*/ +{ + ulint total_flushed=0, i; + ulint cnt_flush[MTFLUSH_MAX_WORKER]; + + ut_a(buf_mtflu_init_done()); + + /* This lock is to safeguard against re-entry if any */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (cnt_flush[i]) { + total_flushed += cnt_flush[i]; + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + cnt_flush[i]); + } + } + +#if UNIV_MTFLUSH_DEBUG + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); +#endif + + return(total_flushed); +} + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. */ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!thread_data[i].wthread_id; + } +} diff --git a/storage/xtradb/buf/buf0rea.cc b/storage/xtradb/buf/buf0rea.cc index 6e348bbf004..7a79958c136 100644 --- a/storage/xtradb/buf/buf0rea.cc +++ b/storage/xtradb/buf/buf0rea.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -229,14 +230,14 @@ not_to_recover: *err = _fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, zip_size, offset, 0, zip_size, - bpage->zip.data, bpage, trx); + bpage->zip.data, bpage, 0, trx); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = _fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage, trx); + ((buf_block_t*) bpage)->frame, bpage, &bpage->write_size, trx); } if (sync) { diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc index 5cc013b7d6b..c58da198185 100644 --- a/storage/xtradb/dict/dict0dict.cc +++ b/storage/xtradb/dict/dict0dict.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index d56a1db59d4..2f1cc7ffa81 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013 SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -54,6 +55,15 @@ Created 10/25/1995 Heikki Tuuri # include "srv0srv.h" static ulint srv_data_read, srv_data_written; #endif /* !UNIV_HOTBACKUP */ +#include "fil0pagecompress.h" +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE @@ -434,11 +444,16 @@ fil_read( block size multiple */ void* buf, /*!< in/out: buffer where to store data read; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync - aio used, else ignored */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /********************************************************************//** @@ -463,18 +478,22 @@ fil_write( be a block size multiple */ void* buf, /*!< in: buffer from which to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync - aio used, else ignored */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ut_ad(!srv_read_only_mode); return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ -UNIV_INLINE fil_space_t* fil_space_get_by_id( /*================*/ @@ -492,6 +511,19 @@ fil_space_get_by_id( return(space); } +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + /*******************************************************************//** Returns the table space by a given name, NULL if not found. */ UNIV_INLINE @@ -712,8 +744,9 @@ fil_node_open_file( byte* buf2; byte* page; ulint space_id; - ulint flags; + ulint flags=0; ulint page_size; + ulint atomic_writes=0; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -730,7 +763,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_ONLY, &success, 0); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -782,6 +815,7 @@ fil_node_open_file( space_id = fsp_header_get_space_id(page); flags = fsp_header_get_flags(page); page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); ut_free(buf2); @@ -832,6 +866,17 @@ fil_node_open_file( ut_error; } + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { /* Truncate the size to whole extent size. */ size_bytes = ut_2pow_round(size_bytes, @@ -855,6 +900,8 @@ add_size: space->size += node->size; } + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + /* printf("Opening file %s\n", node->name); */ /* Open the file for reading and writing, in Windows normally in the @@ -865,18 +912,18 @@ add_size: node->handle = os_file_create(innodb_file_log_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, - &ret); + &ret, atomic_writes); } else if (node->is_raw_disk) { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN_RAW, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } else { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } ut_a(ret); @@ -1280,7 +1327,6 @@ fil_space_create( DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); ut_a(fil_system); - ut_a(fsp_flags_is_valid(flags)); /* Look for a matching tablespace and if found free it. */ do { @@ -1936,12 +1982,12 @@ fil_write_lsn_and_arch_no_to_file( buf = static_cast(ut_align(buf1, UNIV_PAGE_SIZE)); err = fil_read(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); err = fil_write(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); } mem_free(buf1); @@ -3223,7 +3269,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); if (!success) { /* The following call will print an error message */ @@ -3243,6 +3289,8 @@ fil_create_link_file( } else if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; + } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; } else { err = DB_ERROR; } @@ -3332,8 +3380,9 @@ fil_open_linked_file( /*===============*/ const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ - os_file_t* remote_file) /*!< out: remote file handle */ - + os_file_t* remote_file, /*!< out: remote file handle */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ { ibool success; @@ -3347,7 +3396,7 @@ fil_open_linked_file( *remote_file = os_file_create_simple_no_error_handling( innodb_file_data_key, *remote_filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, atomic_writes); if (!success) { char* link_filepath = fil_make_isl_name(tablename); @@ -3402,6 +3451,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3434,7 +3484,8 @@ fil_create_new_single_table_tablespace( OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + atomic_writes); if (ret == FALSE) { /* The following call will print an error message */ @@ -3461,6 +3512,11 @@ fil_create_new_single_table_tablespace( goto error_exit_3; } + if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + goto error_exit_3; + } + if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; goto error_exit_3; @@ -3499,6 +3555,7 @@ fil_create_new_single_table_tablespace( flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); if (!(fsp_flags_is_compressed(flags))) { buf_flush_init_for_writing(page, NULL, 0); @@ -3686,6 +3743,7 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; + ulint atomic_writes = 0; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -3696,6 +3754,8 @@ fil_open_single_table_tablespace( return(DB_CORRUPTION); } + atomic_writes = fsp_flags_get_atomic_writes(flags); + /* If the tablespace was relocated, we do not compare the DATA_DIR flag */ ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; @@ -3720,7 +3780,7 @@ fil_open_single_table_tablespace( } link_file_found = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, atomic_writes); remote.success = link_file_found; if (remote.success) { /* possibility of multiple files. */ @@ -3748,7 +3808,7 @@ fil_open_single_table_tablespace( if (dict.filepath) { dict.file = os_file_create_simple_no_error_handling( innodb_file_data_key, dict.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &dict.success); + OS_FILE_READ_ONLY, &dict.success, atomic_writes); if (dict.success) { /* possibility of multiple files. */ validate = true; @@ -3760,7 +3820,7 @@ fil_open_single_table_tablespace( ut_a(def.filepath); def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, atomic_writes); if (def.success) { tablespaces_found++; } @@ -4343,7 +4403,7 @@ fil_load_single_table_tablespace( /* Check for a link file which locates a remote tablespace. */ remote.success = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, FALSE); /* Read the first page of the remote tablespace */ if (remote.success) { @@ -4358,7 +4418,7 @@ fil_load_single_table_tablespace( /* Try to open the tablespace in the datadir. */ def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &def.success); + OS_FILE_READ_WRITE, &def.success, FALSE); /* Read the first page of the remote tablespace */ if (def.success) { @@ -5134,13 +5194,14 @@ retry: "space for file \'%s\' failed. Current size " INT64PF ", desired size " INT64PF "\n", node->name, start_offset, len+start_offset); - os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE); + os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE, __FILE__, __LINE__); success = FALSE; } else { success = TRUE; } mutex_enter(&fil_system->mutex); + if (success) { node->size += n_pages; space->size += n_pages; @@ -5179,7 +5240,7 @@ retry: success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL, space_id, NULL); + NULL, NULL, space_id, NULL, 0, 0, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5554,7 +5615,12 @@ _fil_io( or from where to write; in aio this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync - aio used, else ignored */ + aio used, else ignored */ + ulint* write_size, /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ trx_t* trx) { ulint mode; @@ -5565,6 +5631,8 @@ _fil_io( ulint wake_later; os_offset_t offset; ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5618,6 +5686,11 @@ _fil_io( } else if (type == OS_FILE_WRITE) { ut_ad(!srv_read_only_mode); srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } else { + srv_stats.non_index_pages_written.inc(); + } } /* Reserve the fil_system mutex and make sure that we can open at @@ -5627,6 +5700,8 @@ _fil_io( space = fil_space_get_by_id(space_id); + page_compressed = fsp_flags_is_page_compressed(space->flags); + page_compression_level = fsp_flags_get_page_compression_level(space->flags); /* If we are deleting a tablespace we don't allow any read operations on that. However, we do allow write operations. */ if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) { @@ -5772,7 +5847,8 @@ _fil_io( /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message, space_id, trx); + offset, len, node, message, space_id, trx, + page_compressed, page_compression_level, write_size); #else /* In ibbackup do normal i/o, not aio */ @@ -6407,7 +6483,7 @@ fil_tablespace_iterate( file = os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, - OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", { @@ -6694,3 +6770,33 @@ fil_space_set_corrupt( mutex_exit(&fil_system->mutex); } + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) +/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc new file mode 100644 index 00000000000..ce7063bc688 --- /dev/null +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -0,0 +1,384 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" +#ifdef HAVE_LZ4 +#include "lz4.h" +#endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + +/* Used for debugging */ +//#define UNIV_PAGECOMPRESS_DEBUG 1 + +/****************************************************************//** +For page compressed pages compress the page before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /* in: compression level */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem) /*!< in: temporal memory used by LZO */ +{ + int err = Z_OK; + int level = 0; + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; + ulint write_size=0; + + ut_ad(buf); + ut_ad(out_buf); + ut_ad(len); + ut_ad(out_len); + + level = compression_level; + ut_ad(fil_space_is_page_compressed(space_id)); + + fil_system_enter(); + fil_space_t* space = fil_space_get_by_id(space_id); + fil_system_exit(); + + /* If no compression level was provided to this table, use system + default level */ + if (level == 0) { + level = page_zip_level; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", + space_id, fil_space_name(space), len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + write_size = UNIV_PAGE_SIZE - header_len; + + switch(innodb_compression_algorithm) { +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_compress_limitedOutput((const char *)buf, + (char *)out_buf+header_len, len, write_size); + write_size = err; + + if (err == 0) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; +#endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_1_15_compress( + buf, len, out_buf+header_len, &write_size, lzo_mem); + + if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + break; +#endif /* HAVE_LZO */ + case PAGE_ZLIB_ALGORITHM: + err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; + + default: + ut_error; + break; + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, innodb_compression_algorithm); + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == innodb_compression_algorithm); +#endif /* UNIV_DEBUG */ + + write_size+=header_len; + +#define SECT_SIZE 512 + + /* Actual write needs to be alligned on block size */ + if (write_size % SECT_SIZE) { + write_size = (write_size + SECT_SIZE-1) & ~(SECT_SIZE-1); + ut_a((write_size % SECT_SIZE) == 0); + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + srv_stats.page_compression_saved.add((len - write_size)); + srv_stats.pages_page_compressed.inc(); + *out_len = write_size; + + return(out_buf); + +} + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. */ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. */ +{ + int err = 0; + ulint actual_size = 0; + ulint compression_alg = 0; + byte *in_buf; + ulint olen=0; + + ut_ad(buf); + ut_ad(len); + + /* Before actual decompress, make sure that page type is correct */ + + if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC || + mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: CRC %lu type %lu.\n" + "InnoDB: len %lu\n", + mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_2(buf+FIL_PAGE_TYPE), len); + + fflush(stderr); + ut_error; + } + + /* Get compression algorithm */ + compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); + + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: FIL: Compression buffer not given, allocating...\n"); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } + + /* Store actual payload size of the compressed data. This pointer + points to buffer pool. */ + if (write_size) { + *write_size = actual_size; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + switch(compression_alg) { + case PAGE_ZLIB_ALGORITHM: + err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + + fflush(stderr); + + ut_error; + } + break; + +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); + + if (err != (int)actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_decompress((const unsigned char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + actual_size,(unsigned char *)in_buf, &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif + default: + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(compression_alg)); + + fflush(stderr); + ut_error; + break; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } +} + + diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 3cc6d0192fd..d86502a268a 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -4,6 +4,7 @@ Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -102,6 +103,7 @@ this program; if not, write to the Free Software Foundation, Inc., #endif /* UNIV_DEBUG */ #include "fts0priv.h" #include "page0zip.h" +#include "fil0pagecompress.h" #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X)) @@ -561,6 +563,27 @@ ib_cb_t innodb_api_cb[] = { (ib_cb_t) ib_cursor_stmt_begin }; +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... ) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option user can enable page compression feature for the + table */ + HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0), + /* With this option user can set zip compression level for page + compression for this table*/ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), + /* With this option user can enable atomic writes feature for this table */ + HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), + HA_TOPTION_END +}; + /*************************************************************//** Check whether valid argument given to innodb_ft_*_stopword_table. This function is registered as a callback with MySQL. @@ -876,6 +899,27 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_x_lock_spin_rounds, SHOW_LONGLONG}, {"x_lock_spin_waits", (char*) &export_vars.innodb_x_lock_spin_waits, SHOW_LONGLONG}, + + /* Status variables for page compression */ + {"page_compression_saved", + (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"page_compression_trim_sect512", + (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG}, + {"page_compression_trim_sect4096", + (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, + {"num_index_pages_written", + (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_non_index_pages_written", + (char*) &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG}, + {"num_pages_page_compressed", + (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_page_compressed_trim_op_saved", + (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} }; @@ -3240,6 +3284,8 @@ innobase_init( if (srv_file_per_table) innobase_hton->tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); #ifndef DBUG_OFF @@ -10123,11 +10169,16 @@ innobase_table_flags( enum row_type row_format; rec_format_t innodb_row_format = REC_FORMAT_COMPACT; bool use_data_dir; + ha_table_option_struct *options= form->s->option_struct; /* Cache the value of innodb_file_format, in case it is modified by another thread while the table is being created. */ const ulint file_format_allowed = srv_file_format; + /* Cache the value of innobase_compression_level, in case it is + modified by another thread while the table is being created. */ + const ulint default_compression_level = page_zip_level; + *flags = 0; *flags2 = 0; @@ -10176,6 +10227,8 @@ index_bad: } } + row_format = form->s->row_type; + if (create_info->key_block_size) { /* The requested compressed page size (key_block_size) is given in kilobytes. If it is a valid number, store @@ -10223,8 +10276,6 @@ index_bad: } } - row_format = form->s->row_type; - if (zip_ssize && zip_allowed) { /* if ROW_FORMAT is set to default, automatically change it to COMPRESSED.*/ @@ -10279,10 +10330,18 @@ index_bad: " innodb_file_format > Antelope.", get_row_format_name(row_format)); } else { - innodb_row_format = (row_format == ROW_TYPE_DYNAMIC - ? REC_FORMAT_DYNAMIC - : REC_FORMAT_COMPRESSED); - break; + switch(row_format) { + case ROW_TYPE_COMPRESSED: + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + default: + /* Not possible, avoid compiler warning */ + break; + } + break; /* Correct row_format */ } zip_allowed = FALSE; /* fall through to set row_format = COMPACT */ @@ -10310,7 +10369,15 @@ index_bad: && ((create_info->data_file_name != NULL) && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); - dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + /* Set up table dictionary flags */ + dict_tf_set(flags, + innodb_row_format, + zip_ssize, + use_data_dir, + options->page_compressed, + (ulint)options->page_compression_level == ULINT_UNDEFINED ? + default_compression_level : options->page_compression_level, + options->atomic_writes); if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; @@ -10328,6 +10395,104 @@ index_bad: DBUG_RETURN(true); } +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. +@return NULL if valid, string if not */ +UNIV_INTERN +const char* +ha_innobase::check_table_options( + THD *thd, /*!< in: thread handle */ + TABLE* table, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: more information of the + created table, contains also the + create statement string */ + const bool use_tablespace, /*!< in: use file par table */ + const ulint file_format) +{ + enum row_type row_format = table->s->row_type;; + ha_table_option_struct *options= table->s->option_struct; + atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; + + /* Check page compression requirements */ + if (options->page_compressed) { + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_format > Antelope."); + return "PAGE_COMPRESSED"; + } + + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if ((ulint)options->page_compression_level != ULINT_UNDEFINED) { + if (options->page_compressed == false) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if (options->page_compression_level < 0 || options->page_compression_level > 9) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", + options->page_compression_level); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + /* Check atomic writes requirements */ + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ATOMIC_WRITES requires" + " innodb_file_per_table."); + return "ATOMIC_WRITES"; + } + } + + return 0; +} + /*****************************************************************//** Creates a new table to an InnoDB database. @return error number */ @@ -10359,6 +10524,7 @@ ha_innobase::create( while creating the table. So we read the current value here and make all further decisions based on this. */ bool use_tablespace = srv_file_per_table; + const ulint file_format = srv_file_format; /* Zip Shift Size - log2 - 9 of compressed page size, zero for uncompressed */ @@ -10382,6 +10548,12 @@ ha_innobase::create( /* Create the table definition in InnoDB */ + /* Validate table options not handled by the SQL-parser */ + if(check_table_options(thd, form, create_info, use_tablespace, + file_format)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + /* Validate create options if innodb_strict_mode is set. */ if (create_options_are_invalid( thd, form, create_info, use_tablespace)) { @@ -14717,6 +14889,12 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); if (table_changes != IS_EQUAL_YES) { @@ -14743,6 +14921,13 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } + /* Changes on engine specific table options requests a rebuild of the table. */ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level || + param_new->atomic_writes != param_old->atomic_writes) { + return(COMPATIBLE_DATA_NO); + } + return(COMPATIBLE_DATA_YES); } @@ -17199,12 +17384,6 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, "innodb_thread_concurrency is reached (0 by default)", NULL, NULL, 0, 0, ~0UL, 0); -static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, - PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" - ", 1 is fastest, 9 is best compression and default is 6.", - NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); - static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages, PLUGIN_VAR_OPCMDARG, "Enables/disables the logging of entire compressed page images." @@ -17894,6 +18073,46 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace, "Print stacktrace on long semaphore wait (off by default supported only on linux)", NULL, NULL, FALSE); +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, + PLUGIN_VAR_RQCMDARG, + "Compression level used for zlib compression. 0 is no compression" + ", 1 is fastest, 9 is best compression and default is 6.", + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); + +static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, + PLUGIN_VAR_OPCMDARG, + "Use trim. Default FALSE.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_LONG(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. 1 for zlib, 2 for lz3, 3 for lzo", + NULL, NULL, + PAGE_ZLIB_ALGORITHM, + 0, +#if defined(HAVE_LZO) && defined(HAVE_LZ4) + PAGE_ALGORITHM_LAST, +#elif defined(HAVE_LZ4) && !defined(HAVE_LZO) + PAGE_ALGORITHM_LZ4, +#else + PAGE_ALGORITHM_ZLIB, +#endif + 0); + +static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of multi-threaded flush threads", + NULL, NULL, + MTFLUSH_DEFAULT_WORKER, /* Default setting */ + 1, /* Minimum setting */ + MTFLUSH_MAX_WORKER, /* Max setting */ + 0); + +static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use multi-threaded flush. Default FALSE.", + NULL, NULL, FALSE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), MYSQL_SYSVAR(additional_mem_pool_size), @@ -18088,6 +18307,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(locking_fake_changes), MYSQL_SYSVAR(use_stacktrace), MYSQL_SYSVAR(force_primary_key), + MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(compression_algorithm), + MYSQL_SYSVAR(mtflush_threads), + MYSQL_SYSVAR(use_mtflush), NULL }; @@ -18379,6 +18602,9 @@ ib_senderrf( case IB_LOG_LEVEL_FATAL: l = 0; break; + default: + l = 0; + break; } my_printv_error(code, format, MYF(l), args); diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index 773a9b6b04d..b4df711356c 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ b/storage/xtradb/handler/ha_innodb.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -57,6 +58,21 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; +/** Engine specific table options are definined using this struct */ +struct ha_table_option_struct +{ + bool page_compressed; /*!< Table is using page compression + if this option is true. */ + int page_compression_level; /*!< Table page compression level + or UNIV_UNSPECIFIED. */ + uint atomic_writes; /*!< Use atomic writes for this + table if this options is ON or + in DEFAULT if + srv_use_atomic_writes=1. + Atomic writes are not used if + value OFF.*/ +}; + /** The class defining a handle to an Innodb table */ class ha_innobase: public handler { @@ -184,6 +200,8 @@ class ha_innobase: public handler char* norm_name, char* temp_path, char* remote_path); + const char* check_table_options(THD *thd, TABLE* table, + HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc index 13c6752ce8f..4310d0744eb 100644 --- a/storage/xtradb/handler/handler0alter.cc +++ b/storage/xtradb/handler/handler0alter.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -259,6 +260,22 @@ ha_innobase::check_if_supported_inplace_alter( update_thd(); trx_search_latch_release_if_reserved(prebuilt->trx); + /* Change on engine specific table options require rebuild of the + table */ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION) { + ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct; + ha_table_option_struct *old_options= table->s->option_struct; + + if (new_options->page_compressed != old_options->page_compressed || + new_options->page_compression_level != old_options->page_compression_level || + new_options->atomic_writes != old_options->atomic_writes) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE | INNOBASE_ALTER_NOREBUILD @@ -3395,6 +3412,17 @@ ha_innobase::prepare_inplace_alter_table( if (ha_alter_info->handler_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { + /* Check engine specific table options */ + if (const char* invalid_tbopt = check_table_options( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0, + srv_file_format)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + if (const char* invalid_opt = create_options_are_invalid( user_thd, altered_table, ha_alter_info->create_info, diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h index 8e2c283476a..7d7c8de6907 100644 --- a/storage/xtradb/include/buf0buf.h +++ b/storage/xtradb/include/buf0buf.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1508,6 +1509,12 @@ struct buf_page_t{ state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ + + ulint write_size; /* Write size is set when this + page is first time written and then + if written again we check is TRIM + operation needed. */ + #ifndef UNIV_HOTBACKUP buf_page_t* hash; /*!< node used in chaining to buf_pool->page_hash or @@ -2149,6 +2156,20 @@ struct CheckUnzipLRUAndLRUList { }; #endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */ +/*********************************************************************//** +Aquire LRU list mutex */ +void +buf_pool_mutex_enter( +/*=================*/ + buf_pool_t* buf_pool); /*!< in: buffer pool */ +/*********************************************************************//** +Exit LRU list mutex */ +void +buf_pool_mutex_exit( +/*================*/ + buf_pool_t* buf_pool); /*!< in: buffer pool */ + + #ifndef UNIV_NONINL #include "buf0buf.ic" #endif diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h index 7699e4fda67..3a83373dcae 100644 --- a/storage/xtradb/include/buf0flu.h +++ b/storage/xtradb/include/buf0flu.h @@ -39,6 +39,14 @@ extern ibool buf_page_cleaner_is_active; /** Flag indicating if the lru_manager is in active state. */ extern bool buf_lru_manager_is_active; +/** Handled page counters for a single flush */ +struct flush_counters_t { + ulint flushed; /*!< number of dirty pages flushed */ + ulint evicted; /*!< number of clean pages evicted */ + ulint unzip_LRU_evicted;/*!< number of uncompressed page images + evicted */ +}; + /********************************************************************//** Remove a block from the flush list of modified blocks. */ UNIV_INTERN @@ -304,6 +312,63 @@ buf_flush_flush_list_in_progress(void) /*==================================*/ __attribute__((warn_unused_result)); +/******************************************************************//** +Start a buffer flush batch for LRU or flush list */ +ibool +buf_flush_start( +/*============*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ + +/******************************************************************//** +End a buffer flush batch for LRU or flush list */ +void +buf_flush_end( +/*==========*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list or flush_list. +NOTE 1: in the case of an LRU flush the calling thread may own latches to +pages: to avoid deadlocks, this function must be written so that it cannot +end up waiting for these latches! NOTE 2: in the case of a flush list flush, +the calling thread is not allowed to own any latches on pages! +@return number of blocks for which the write request was queued */ +__attribute__((nonnull)) +void +buf_flush_batch( +/*============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST; if BUF_FLUSH_LIST, + then the caller must not own any + latches on pages */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST + all blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + bool limited_lru_scan,/*!< in: for LRU flushes, if true, + allow to scan only up to + srv_LRU_scan_depth pages in total */ + flush_counters_t* n); /*!< out: flushed/evicted page + counts */ + + +/******************************************************************//** +Gather the aggregated stats for both flush list and LRU list flushing */ +void +buf_flush_common( +/*=============*/ + buf_flush_t flush_type, /*!< in: type of flush */ + ulint page_count); /*!< in: number of pages flushed */ + #ifndef UNIV_NONINL #include "buf0flu.ic" #endif diff --git a/storage/xtradb/include/buf0mtflu.h b/storage/xtradb/include/buf0mtflu.h new file mode 100644 index 00000000000..0475335bbf5 --- /dev/null +++ b/storage/xtradb/include/buf0mtflu.h @@ -0,0 +1,95 @@ +/***************************************************************************** + +Copyright (C) 2014 SkySQL Ab. All Rights Reserved. +Copyright (C) 2014 Fusion-io. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/buf0mtflu.h +Multi-threadef flush method interface function prototypes + +Created 06/02/2014 Jan Lindström jan.lindstrom@skysql.com + Dhananjoy Das DDas@fusionio.com +***********************************************************************/ + +#ifndef buf0mtflu_h +#define buf0mtflu_h + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void); +/*===========================*/ + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt); /*!< in: Number of work items */ + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized, false if not */ +bool +buf_mtflu_init_done(void); +/*======================*/ + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void); +/*===========================*/ + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed); /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. */ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*! PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + + ); + return(false); + } + } + + if (page_compression || page_compression_level) { + /* Page compression format must have compact and + atomic_blobs and page_compression_level requires + page_compression */ + if (!compact + || !page_compression + || !atomic_blobs) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + + return(false); + } + } + + if (atomic_writes) { + + if(atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } @@ -594,6 +689,11 @@ dict_sys_tables_type_validate( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); + + ut_a(atomic_writes >= 0 && atomic_writes <= ATOMIC_WRITES_OFF); /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same @@ -604,12 +704,16 @@ dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } } /* Make sure there are no bits that we do not know about. */ if (unused) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + type, unused); return(ULINT_UNDEFINED); } @@ -624,6 +728,8 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + type, zip_ssize); return(ULINT_UNDEFINED); } @@ -633,11 +739,15 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } } @@ -647,6 +757,27 @@ dict_sys_tables_type_validate( format, so the DATA_DIR flag is compatible with any other table flags. However, it is not used with TEMPORARY tables.*/ + if (page_compression || page_compression_level) { + /* page compressed row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + + if (!atomic_blobs || !page_compression) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + "InnoDB: Error: atomic_blobs %lu\n", + type, page_compression, page_compression_level, atomic_blobs); + return(ULINT_UNDEFINED); + } + } + + /* Validate that the atomic writes number is within allowed range. */ + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu\n", + type, atomic_writes); + return(ULINT_UNDEFINED); + } + /* Return the validated SYS_TABLES.TYPE. */ return(type); } @@ -719,8 +850,16 @@ dict_tf_set( ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ + bool use_data_dir, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic writes setup */ { + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + switch (format) { case REC_FORMAT_REDUNDANT: *flags = 0; @@ -745,6 +884,19 @@ dict_tf_set( if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); } + + if (page_compressed) { + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } + + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_a(dict_tf_get_atomic_writes(*flags) == awrites); } /********************************************************************//** @@ -765,6 +917,9 @@ dict_tf_to_fsp_flags( ulint table_flags) /*!< in: dict_table_t::flags */ { ulint fsp_flags; + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); @@ -783,7 +938,20 @@ dict_tf_to_fsp_flags( fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) ? FSP_FLAGS_MASK_DATA_DIR : 0; + /* In addition, tablespace flags also contain if the page + compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression); + + /* In addition, tablespace flags also contain page compression level + if page compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + + /* In addition, tablespace flags also contain flag if atomic writes + is used for this table */ + fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); + ut_a(fsp_flags_is_valid(fsp_flags)); + ut_a(dict_tf_verify_flags(table_flags, fsp_flags)); return(fsp_flags); } @@ -811,10 +979,15 @@ dict_sys_tables_type_to_tf( /* Adjust bit zero. */ flags = redundant ? 0 : 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ flags |= type & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES + ); return(flags); } @@ -842,10 +1015,14 @@ dict_tf_to_sys_tables_type( /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ type = 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ type |= flags & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES); return(type); } diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h index 527d50019a4..a347a75ea42 100644 --- a/storage/xtradb/include/dict0mem.h +++ b/storage/xtradb/include/dict0mem.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -125,11 +126,26 @@ This flag prevents older engines from attempting to open the table and allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_WIDTH_DATA_DIR 1 +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ + DICT_TF_WIDTH_ATOMIC_BLOBS \ - + DICT_TF_WIDTH_DATA_DIR) + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -145,9 +161,18 @@ allows InnoDB to update_create_info() accordingly. */ /** Zero relative shift position of the DATA_DIR field */ #define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ - + DICT_TF_WIDTH_DATA_DIR) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -165,6 +190,18 @@ allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_MASK_DATA_DIR \ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define DICT_TF_MASK_ATOMIC_WRITES \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ + << DICT_TF_POS_ATOMIC_WRITES) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -185,6 +222,19 @@ allows InnoDB to update_create_info() accordingly. */ /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) + +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define DICT_TF_GET_ATOMIC_WRITES(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ + >> DICT_TF_POS_ATOMIC_WRITES) /* @} */ #ifndef UNIV_INNOCHECKSUM diff --git a/storage/xtradb/include/dict0pagecompress.h b/storage/xtradb/include/dict0pagecompress.h new file mode 100644 index 00000000000..19a2a6c52f3 --- /dev/null +++ b/storage/xtradb/include/dict0pagecompress.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ + __attribute__((const)); + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return true if atomic writes are used, false if not used */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table); /*!< in: table */ + + +#ifndef UNIV_NONINL +#include "dict0pagecompress.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0pagecompress.ic b/storage/xtradb/include/dict0pagecompress.ic new file mode 100644 index 00000000000..ea3c7546850 --- /dev/null +++ b/storage/xtradb/include/dict0pagecompress.ic @@ -0,0 +1,191 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ +{ + ulint table_unused = DICT_TF_GET_UNUSED(table_flags); + ulint compact = DICT_TF_GET_COMPACT(table_flags); + ulint ssize = DICT_TF_GET_ZIP_SSIZE(table_flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(table_flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); + ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); + ulint fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags); + ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags); + ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags); + ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags); + + DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", + return(ULINT_UNDEFINED);); + + ut_a(!table_unused); + ut_a(!fsp_unused); + ut_a(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_a(compact == 0 || compact == 1); /* silence compiler */ + ut_a(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_a(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + + if (ssize != zip_ssize) { + fprintf(stderr, + "InnoDB: Error: table flags has zip_ssize %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has zip_ssize %ld\n", + ssize, zip_ssize); + return (FALSE); + } + if (atomic_blobs != fsp_atomic_blobs) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic_blobs %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_blobs %ld\n", + atomic_blobs, fsp_atomic_blobs); + + return (FALSE); + } + if (page_compression != fsp_page_compression) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file ahas page_compression %ld\n", + page_compression, fsp_page_compression); + + return (FALSE); + } + if (page_compression_level != fsp_page_compression_level) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression_level %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression_level %ld\n", + page_compression_level, fsp_page_compression_level); + + return (FALSE); + } + + if (atomic_writes != fsp_atomic_writes) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic writes %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_writes %ld\n", + atomic_writes, fsp_atomic_writes); + + return (FALSE); + } + + return(TRUE); +} + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level >= 0 && page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(dict_tf_get_page_compression(table->flags)); + + return(dict_tf_get_page_compression_level(table->flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*=========================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_table_is_page_compressed( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_page_compression(table->flags)); +} + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ +{ + return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags)); +} + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags)); +} diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h index 1299445a8ee..d9cc5cd607f 100644 --- a/storage/xtradb/include/dict0types.h +++ b/storage/xtradb/include/dict0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -82,6 +83,14 @@ enum ib_quiesce_t { #define TEMP_TABLE_PREFIX "#sql" #define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX + +/** Enum values for atomic_writes table option */ +typedef enum { + ATOMIC_WRITES_DEFAULT = 0, + ATOMIC_WRITES_ON = 1, + ATOMIC_WRITES_OFF = 2 +} atomic_writes_t; + #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG /** Flag to control insert buffer debugging. */ extern uint ibuf_debug; diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h index 074906d8959..c732a8bfb62 100644 --- a/storage/xtradb/include/fil0fil.h +++ b/storage/xtradb/include/fil0fil.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -131,6 +132,13 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID #define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* Following are used when page compression is used */ +#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on + compressed pages. */ +#define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compressin algorithm ZLIB. */ +#define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compressin algorithm LZ4. */ + /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -141,6 +149,7 @@ extern fil_addr_t fil_addr_null; /* @} */ /** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< Page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ @@ -723,8 +732,8 @@ fil_space_get_n_reserved_extents( Reads or writes data. This operation is asynchronous (aio). @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ -#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message) \ - _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL) +#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size) \ + _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size, NULL) UNIV_INTERN dberr_t @@ -754,7 +763,12 @@ _fil_io( or from where to write; in aio this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync - aio used, else ignored */ + aio used, else ignored */ + ulint* write_size, /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ trx_t* trx) __attribute__((nonnull(8))); /**********************************************************************//** @@ -1020,4 +1034,43 @@ fil_space_set_corrupt( /*==================*/ ulint space_id); +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void); +/*==================*/ +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void); +/*==================*/ + +#ifndef UNIV_INNOCHECKSUM +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. */ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space); /*!< in: space */ +#endif + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + #endif /* fil0fil_h */ diff --git a/storage/xtradb/include/fil0pagecompress.h b/storage/xtradb/include/fil0pagecompress.h new file mode 100644 index 00000000000..74f6e45f4fb --- /dev/null +++ b/storage/xtradb/include/fil0pagecompress.h @@ -0,0 +1,122 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" +#include "fsp0pagecompress.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/*******************************************************************//** +Returns the page compression level flag of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level if page compressed, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. +@return atomic write table option value */ +atomic_writes_t +fil_space_get_atomic_writes( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf); /*!< in: page */ + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg); /*!> FSP_FLAGS_POS_UNUSED) +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \ + >> FSP_FLAGS_POS_ATOMIC_WRITES) /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) +/** Set a PAGE_COMPRESSION into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \ + (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION)) + +/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ + (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) +/** Set a ATOMIC_WRITES into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ + (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES)) + /* @} */ /* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic index 0d81e817cc9..3563f5ef372 100644 --- a/storage/xtradb/include/fsp0fsp.ic +++ b/storage/xtradb/include/fsp0fsp.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,12 +64,17 @@ fsp_flags_is_valid( ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); + ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); /* fsp_flags is zero unless atomic_blobs is set. */ /* Make sure there are no bits that we do not know about. */ if (unused != 0 || flags == 1) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted unused %lu\n", + flags, unused); return(false); } else if (post_antelope) { /* The Antelope row formats REDUNDANT and COMPACT did @@ -76,6 +82,8 @@ fsp_flags_is_valid( 4-byte field is zero for Antelope row formats. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_blobs %lu\n", + flags, atomic_blobs); return(false); } } @@ -87,10 +95,14 @@ fsp_flags_is_valid( externally stored parts. */ if (post_antelope || zip_ssize != 0) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu atomic_blobs %lu\n", + flags, zip_ssize, atomic_blobs); return(false); } } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu max %d\n", + flags, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(false); } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { @@ -98,9 +110,13 @@ fsp_flags_is_valid( be zero for an original 16k page size. Validate the page shift size is within allowed range. */ + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu\n", + flags, page_ssize, UNIV_PAGE_SSIZE_MAX); return(false); } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu:%d\n", + flags, page_ssize, UNIV_PAGE_SIZE, UNIV_PAGE_SIZE_ORIG); return(false); } @@ -108,6 +124,23 @@ fsp_flags_is_valid( # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." #endif + /* Page compression level requires page compression and atomic blobs + to be set */ + if (page_compression_level || page_compression) { + if (!page_compression || !atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_compression %lu\n" + "InnoDB: Error: page_compression_level %lu atomic_blobs %lu\n", + flags, page_compression, page_compression_level, atomic_blobs); + return(false); + } + } + + if (atomic_writes < 0 || atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_writes %lu\n", + flags, atomic_writes); + return (false); + } + /* The DATA_DIR field can be used for any row type so there is nothing here to validate. */ diff --git a/storage/xtradb/include/fsp0pagecompress.h b/storage/xtradb/include/fsp0pagecompress.h new file mode 100644 index 00000000000..f2cd38481f6 --- /dev/null +++ b/storage/xtradb/include/fsp0pagecompress.h @@ -0,0 +1,79 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef fsp0pagecompress_h +#define fsp0pagecompress_h + +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_ALGORITHM_LAST PAGE_LZO_ALGORITHM + +/**********************************************************************//** +Reads the page compression level from the first page of a tablespace. +@return page compression level, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_compression_level( +/*=============================*/ + const page_t* page); /*!< in: first page of a tablespace */ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Extract the page compression level from tablespace flags. +A tablespace has only one physical page compression level +whether that page is compressed or not. +@return page compression level of the file-per-table tablespace, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0pagecompress.ic" +#endif + +#endif diff --git a/storage/xtradb/include/fsp0pagecompress.ic b/storage/xtradb/include/fsp0pagecompress.ic new file mode 100644 index 00000000000..a72c4a1df90 --- /dev/null +++ b/storage/xtradb/include/fsp0pagecompress.ic @@ -0,0 +1,181 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.ic +Implementation for helper functions for extracting/storing page +compression and atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not page compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Determine the tablespace is page compression level from dict_table_t::flags. +@return page compression level or 0 if not compressed*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); +} + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags)); +} + +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +UNIV_INLINE +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX); +} + +/*******************************************************************//** +Find out wheather the page is page compressed +@return true if page is page compressed, false if not */ +UNIV_INLINE +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); +} + +/*******************************************************************//** +Returns the page compression level of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level, ULINT_UNDEFINED if space not found */ +UNIV_INLINE +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_page_compression_level(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Extract the page compression from space. +@return true if space is page compressed, false if space is not found +or space is not page compressed. */ +UNIV_INLINE +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_is_page_compressed(flags)); + } + + return(flags); +} + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +UNIV_INLINE +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg) /*!first || list->last)); } + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h index 33385ddf2d4..e6b9891aed1 100644 --- a/storage/xtradb/include/ut0wqueue.h +++ b/storage/xtradb/include/ut0wqueue.h @@ -95,6 +95,23 @@ ib_wqueue_timedwait( ib_wqueue_t* wq, /* in: work queue */ ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*space_id, 0, (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + group, 0); srv_stats.os_log_pending_writes.dec(); @@ -2091,7 +2091,7 @@ log_group_checkpoint( write_offset / UNIV_PAGE_SIZE, write_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, - buf, ((byte*) group + 1)); + buf, ((byte*) group + 1), 0); ut_ad(((ulint) group & 0x1UL) == 0); } @@ -2171,7 +2171,7 @@ log_group_read_checkpoint_info( fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0, field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0); } /******************************************************//** @@ -2554,7 +2554,7 @@ loop: fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, (ulint) (source_offset / UNIV_PAGE_SIZE), (ulint) (source_offset % UNIV_PAGE_SIZE), - len, buf, (type == LOG_ARCHIVE) ? &log_archive_io : NULL); + len, buf, (type == LOG_ARCHIVE) ? &log_archive_io : NULL, 0); start_lsn += len; buf += len; @@ -2679,7 +2679,7 @@ log_group_archive_file_header_write( dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, 2 * OS_FILE_LOG_BLOCK_SIZE, - buf, &log_archive_io); + buf, &log_archive_io, 0); } /******************************************************//** @@ -2716,7 +2716,7 @@ log_group_archive_completed_header_write( dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf + LOG_FILE_ARCH_COMPLETED, - &log_archive_io); + &log_archive_io, 0); } /******************************************************//** @@ -2779,12 +2779,12 @@ loop: file_handle = os_file_create(innodb_file_log_key, name, open_mode, OS_FILE_AIO, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); if (!ret && (open_mode == OS_FILE_CREATE)) { file_handle = os_file_create( innodb_file_log_key, name, OS_FILE_OPEN, - OS_FILE_AIO, OS_DATA_FILE, &ret); + OS_FILE_AIO, OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -2853,7 +2853,7 @@ loop: (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, - &log_archive_io); + &log_archive_io, 0); start_lsn += len; next_offset += len; diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc index 127e09e0448..a4cded85f29 100644 --- a/storage/xtradb/log/log0online.cc +++ b/storage/xtradb/log/log0online.cc @@ -539,7 +539,7 @@ log_online_start_bitmap_file(void) log_bmp_sys->out.name, OS_FILE_CREATE, OS_FILE_READ_WRITE, - &success); + &success, FALSE); } if (UNIV_UNLIKELY(!success)) { @@ -699,7 +699,7 @@ log_online_read_init(void) log_bmp_sys->out.file = os_file_create_simple_no_error_handling (innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &success); + OS_FILE_READ_WRITE, &success, FALSE); if (!success) { @@ -1483,7 +1483,7 @@ log_online_open_bitmap_file_read_only( bitmap_file->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, FALSE); if (UNIV_UNLIKELY(!success)) { /* Here and below assume that bitmap file names do not diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc index bf239299268..f8d401e8118 100644 --- a/storage/xtradb/log/log0recv.cc +++ b/storage/xtradb/log/log0recv.cc @@ -2,6 +2,7 @@ Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2149,7 +2150,7 @@ recv_apply_log_recs_for_backup(void) error = fil_io(OS_FILE_READ, true, recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); if (error == DB_SUCCESS && !buf_zip_decompress(block, TRUE)) { exit(1); @@ -2159,7 +2160,7 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } if (error != DB_SUCCESS) { @@ -2188,13 +2189,13 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); } else { error = fil_io(OS_FILE_WRITE, true, recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); @@ -3164,7 +3165,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, LOG_FILE_HDR_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { @@ -3195,7 +3196,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, OS_FILE_LOG_BLOCK_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); } log_hdr_log_block_size @@ -3795,7 +3796,7 @@ try_open_again: file_handle = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, - OS_FILE_LOG, OS_FILE_AIO, &ret); + OS_FILE_LOG, OS_FILE_AIO, &ret, FALSE); if (ret == FALSE) { ask_again: @@ -3847,7 +3848,7 @@ ask_again: /* Read the archive file header */ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0, 0, - LOG_FILE_HDR_SIZE, buf, NULL); + LOG_FILE_HDR_SIZE, buf, NULL, 0); /* Check if the archive file header is consistent */ @@ -3921,7 +3922,7 @@ ask_again: fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, read_offset / UNIV_PAGE_SIZE, - read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0); ret = recv_scan_log_recs( (buf_pool_get_n_pages() diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 6ba19879847..e837e2e66ae 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -42,10 +43,16 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "btr0types.h" #include "trx0trx.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#include "linux/falloc.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -73,6 +80,10 @@ Created 10/21/1995 Heikki Tuuri # endif #endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -196,11 +207,39 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated for + page compressed page and + freed after the write + has been completed */ + + ibool page_compression; + ulint page_compression_level; + + ulint* write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + #ifdef LINUX_NATIVE_AIO struct iocb control; /* Linux control block for aio */ int n_bytes; /* bytes written/read. */ int ret; /* AIO return code */ #endif /* WIN_ASYNC_IO */ +#ifdef HAVE_LZO + byte lzo_mem[LZO1X_1_15_MEM_COMPRESS]; +#else + byte lzo_mem; /* Temporal memory used by LZO */ +#endif + }; /** The asynchronous i/o array structure */ @@ -301,6 +340,57 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len); /*!< in: length of area */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. +@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name, /*!< in: name of the file */ + os_file_t file); /*!< in: handle to the file */ + #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP /**********************************************************************//** @@ -537,6 +627,19 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -599,6 +702,20 @@ os_file_get_last_error_low( "InnoDB: The error means mysqld does not have" " the access rights to\n" "InnoDB: the directory.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { if (strerror(err) != NULL) { fprintf(stderr, @@ -635,6 +752,9 @@ os_file_get_last_error_low( return(OS_FILE_AIO_RESOURCES_RESERVED); } break; + case ECANCELED: + case ENOTTY: + return(OS_FILE_OPERATION_NOT_SUPPORTED); case EINTR: if (srv_use_native_aio) { return(OS_FILE_AIO_INTERRUPTED); @@ -668,7 +788,6 @@ Does error handling when a file operation fails. Conditionally exits (calling exit(3)) based on should_exit value and the error type, if should_exit is TRUE then on_error_silent is ignored. @return TRUE if we should retry the operation */ -static ibool os_file_handle_error_cond_exit( /*===========================*/ @@ -676,9 +795,11 @@ os_file_handle_error_cond_exit( const char* operation, /*!< in: operation */ ibool should_exit, /*!< in: call exit(3) if unknown error and this parameter is TRUE */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log iff it is an unknown non-fatal error */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { ulint err; @@ -710,6 +831,9 @@ os_file_handle_error_cond_exit( os_has_said_disk_full = TRUE; + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + fflush(stderr); return(FALSE); @@ -742,6 +866,9 @@ os_file_handle_error_cond_exit( is better to ignore on_error_silent and print an error message to the log. */ + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + if (should_exit || !on_error_silent) { ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " "error " ULINTPF ".%s", name ? name : "(unknown)", @@ -765,10 +892,12 @@ ibool os_file_handle_error( /*=================*/ const char* name, /*!< in: name of a file or NULL */ - const char* operation) /*!< in: operation */ + const char* operation, /*!< in: operation */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* exit in case of unknown error */ - return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line)); } /****************************************************************//** @@ -779,12 +908,14 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -927,7 +1058,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -938,7 +1069,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -960,7 +1091,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -972,7 +1103,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -1044,7 +1175,7 @@ next_file: return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1130,7 +1261,7 @@ next_file: goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1181,7 +1312,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1194,7 +1325,7 @@ os_file_create_directory( if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1304,7 +1435,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN ? - "open" : "create"); + "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; @@ -1372,7 +1503,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? "open" : "create"); + ? "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; retry = false; @@ -1414,9 +1545,12 @@ os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; *success = FALSE; #ifdef __WIN__ @@ -1477,6 +1611,23 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ int create_flag; @@ -1537,6 +1688,23 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } + #endif /* __WIN__ */ return(file); @@ -1620,15 +1788,16 @@ os_file_set_atomic_writes( if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { - os_file_handle_error_no_exit(name, "ioctl", FALSE); + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + os_file_handle_error_no_exit(name, "ioctl(DFS_IOCTL_ATOMIC_WRITE_SET)", FALSE, __FILE__, __LINE__); return(FALSE); } return(TRUE); #else - ib_logf(IB_LOG_LEVEL_ERROR, - "trying to enable atomic writes on non-supported platform! " - "Please restart with innodb_use_atomic_writes disabled.\n"); + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); return(FALSE); #endif } @@ -1654,12 +1823,15 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; ibool retry; ibool on_error_no_exit; ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; #ifdef __WIN__ DBUG_EXECUTE_IF( @@ -1802,9 +1974,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1816,11 +1988,21 @@ os_file_create_func( } while (retry); - if (srv_use_atomic_writes && type == OS_DATA_FILE && - !os_file_set_atomic_writes(name, file)) { - CloseHandle(file); + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); *success = FALSE; file = INVALID_HANDLE_VALUE; + } } #else /* __WIN__ */ @@ -1897,9 +2079,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1953,14 +2135,24 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ - if (srv_use_atomic_writes && type == OS_DATA_FILE - && file != -1 && !os_file_set_atomic_writes(name, file)) { - - *success = FALSE; - close(file); - file = -1; + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } } + #endif /* __WIN__ */ return(file); @@ -2019,7 +2211,7 @@ loop: ret = unlink(name); if (ret != 0 && errno != ENOENT) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2083,7 +2275,7 @@ loop: ret = unlink(name); if (ret != 0) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2127,7 +2319,7 @@ os_file_rename_func( return(TRUE); } - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); #else @@ -2136,7 +2328,7 @@ os_file_rename_func( ret = rename(oldpath, newpath); if (ret != 0) { - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2167,7 +2359,7 @@ os_file_close_func( return(TRUE); } - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); #else @@ -2176,7 +2368,7 @@ os_file_close_func( ret = close(file); if (ret == -1) { - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); } @@ -2268,6 +2460,12 @@ os_file_set_size( current_size = 0; +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: File %s current_size %lu extended_size %lu\n", + name, os_file_get_size(file), size); +#endif + + #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { @@ -2278,7 +2476,7 @@ os_file_set_size( INT64PF ", desired size " INT64PF "\n", name, current_size, size); os_file_handle_error_no_exit (name, "posix_fallocate", - FALSE); + FALSE, __FILE__, __LINE__); return(FALSE); } return(TRUE); @@ -2467,7 +2665,7 @@ os_file_flush_func( return(TRUE); } - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2521,7 +2719,7 @@ os_file_flush_func( ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2878,6 +3076,9 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len, NULL); + } return(TRUE); } #else /* __WIN__ */ @@ -2891,6 +3092,10 @@ try_again: if ((ulint) ret == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + return(TRUE); } @@ -2898,7 +3103,7 @@ try_again: "Tried to read "ULINTPF" bytes at offset " UINT64PF". " "Was only able to read %ld.", n, offset, (lint) ret); #endif /* __WIN__ */ - retry = os_file_handle_error(NULL, "read"); + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); if (retry) { goto try_again; @@ -2991,10 +3196,14 @@ try_again: if ((ulint) ret == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + return(TRUE); } #endif /* __WIN__ */ - retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); if (retry) { goto try_again; @@ -3206,7 +3415,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3234,7 +3443,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3283,7 +3492,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3336,7 +3545,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3900,7 +4109,7 @@ os_aio_array_create( array->slots = static_cast( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); #if defined(LINUX_NATIVE_AIO) array->aio_ctx = NULL; @@ -3975,6 +4184,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { + ulint i; + os_mutex_free(array->mutex); os_event_free(array->not_full); os_event_free(array->is_empty); @@ -3986,6 +4197,14 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4330,7 +4549,16 @@ os_aio_array_reserve_slot( to write */ os_offset_t offset, /*!< in: file offset */ ulint len, /*!< in: length of the block to read or write */ - ulint space_id) + ulint space_id, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4422,6 +4650,53 @@ found: slot->io_already_done = FALSE; slot->space_id = space_id; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + + /* If the space is page compressed and this is write operation + then we compress the page */ + if (message1 && type == OS_FILE_WRITE && page_compression ) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + + ut_ad(slot->page_buf); + + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + &real_len, + slot->lzo_mem + ); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back */ + os_mutex_enter(array->mutex); + + } + #ifdef WIN_ASYNC_IO control = &slot->control; control->Offset = (DWORD) offset & 0xFFFFFFFF; @@ -4697,7 +4972,16 @@ os_aio_func( aio operation); ignored if mode is OS_AIO_SYNC */ ulint space_id, - trx_t* trx) + trx_t* trx, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4720,7 +5004,7 @@ os_aio_func( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); - if (mode == OS_AIO_SYNC) + if (mode == OS_AIO_SYNC) { ibool ret; /* This is actually an ordinary synchronous read or write: @@ -4787,7 +5071,8 @@ try_again: trx->io_read += n; } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n, space_id); + name, buf, offset, n, space_id, + page_compression, page_compression_level, write_size); if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4845,7 +5130,7 @@ err_exit: os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? "aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4945,7 +5230,7 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { retry = TRUE; } else { @@ -4973,12 +5258,18 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret_val = os_file_write(slot->name, slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + if (slot->message1 && slot->page_compression && slot->page_buf) { + ret_val = os_file_write(slot->name, slot->file, slot->page_buf, + slot->offset, slot->len); + } else { + + ret_val = os_file_write(slot->name, slot->file, slot->buf, + slot->offset, slot->len); + } break; case OS_FILE_READ: - ret_val = os_file_read(slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + ret_val = os_file_read(slot->file, slot->buf, + slot->offset, slot->len); break; default: ut_error; @@ -5003,6 +5294,28 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot); return(ret_val); @@ -5092,6 +5405,33 @@ retry: /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. */ + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -5273,6 +5613,13 @@ found: } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5280,7 +5627,7 @@ found: windows and linux native AIO. We should probably look into this to transparently re-submit the IO. */ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5954,3 +6301,168 @@ os_aio_all_slots_free(void) #endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + +#ifdef _WIN32 +#include +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len) /*!< in: length of area */ +{ + +#define SECT_SIZE 512 + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + // len here should be alligned to sector size + ut_a((trim_len % SECT_SIZE) == 0); + ut_a((len % SECT_SIZE) == 0); + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. + if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + *slot->write_size, trim_len, len); +#endif + + if (*slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + *slot->write_size = len; + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lx len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", ret, (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + if (slot->write_size) { + *slot->write_size = 0; + } + +#endif /* HAVE_FALLOCATE ... */ + +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(file,FSCTL_FILE_LEVEL_TRIM,&flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lx len: %du payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#endif + + srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8))); + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + /* We allocate extra to avoid memory overwrite on compression */ + cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_compression_page = static_cast(cbuf2); + slot->page_buf = static_cast(cbuf); +} diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc index ea346566e57..f276efdc021 100644 --- a/storage/xtradb/srv/srv0mon.cc +++ b/storage/xtradb/srv/srv0mon.cc @@ -290,6 +290,18 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN}, + {"buffer_index_pages_written", "buffer", + "Number of index pages written (innodb_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + + {"buffer_non_index_pages_written", "buffer", + "Number of non index pages written (innodb_non_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read (innodb_pages_read)", static_cast( @@ -879,6 +891,46 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + {"compress_saved", "compression", + "Number of bytes saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED}, + + {"compress_trim_sect512", "compression", + "Number of sect-512 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512}, + + {"compress_trim_sect4096", "compression", + "Number of sect-4K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096}, + + {"compress_pages_page_compressed", "compression", + "Number of pages compressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED}, + + {"compress_page_compressed_trim_op", "compression", + "Number of TRIM operation performed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP}, + + {"compress_page_compressed_trim_op_saved", "compression", + "Number of TRIM operation saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED}, + + {"compress_pages_page_decompressed", "compression", + "Number of pages decompressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + + {"compress_pages_page_compression_error", "compression", + "Number of page compression errors", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1552,6 +1604,16 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; + /* innodb_index_pages_written, the number of index pages written */ + case MONITOR_OVLD_INDEX_PAGES_WRITTEN: + value = srv_stats.index_pages_written; + break; + + /* innodb_non_index_pages_written, the number of non index pages written */ + case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN: + value = srv_stats.non_index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); @@ -1793,6 +1855,31 @@ srv_mon_process_existing_counter( value = btr_cur_n_non_sea; break; + case MONITOR_OVLD_PAGE_COMPRESS_SAVED: + value = srv_stats.page_compression_saved; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512: + value = srv_stats.page_compression_trim_sect512; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096: + value = srv_stats.page_compression_trim_sect4096; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSED: + value = srv_stats.pages_page_compressed; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP: + value = srv_stats.page_compressed_trim_op; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED: + value = srv_stats.page_compressed_trim_op_saved; + break; + case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: + value = srv_stats.pages_page_decompressed; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR: + value = srv_stats.pages_page_compression_error; + break; + default: ut_error; } diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index f7f36330d4a..b472b98f1c8 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -3,6 +3,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -72,6 +73,7 @@ Created 10/8/1995 Heikki Tuuri #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */ ibool innobase_thd_is_idle(const void* thd); @@ -160,6 +162,23 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +/* Default compression level if page compression is used and no compression +level is set for the table*/ +UNIV_INTERN long srv_compress_zlib_level = 6; +/* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) +to the pages */ +UNIV_INTERN my_bool srv_use_trim = FALSE; +/* If this flag is TRUE, then we will use posix fallocate for file extentsion */ +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +/* If this flag is TRUE, then we disable doublewrite buffer */ +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +UNIV_INTERN long innodb_compression_algorithm = PAGE_ZLIB_ALGORITHM; +/* Number of threads used for multi-threaded flush */ +UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; +/* If this flag is TRUE, then we will use multi threaded flush. */ +UNIV_INTERN my_bool srv_use_mtflush = FALSE; + #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function pointers, because they are not available on Windows Server 2003 and @@ -457,10 +476,6 @@ UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20; UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE; UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE; -UNIV_INTERN ibool srv_use_atomic_writes = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -UNIV_INTERN ibool srv_use_posix_fallocate = FALSE; -#endif /** doublewrite buffer is 1MB is size i.e.: it can hold 128 16K pages. The following parameter is the size of the buffer that is used for @@ -496,6 +511,16 @@ static ulint srv_n_rows_read_old = 0; UNIV_INTERN ulint srv_truncated_status_writes = 0; UNIV_INTERN ulint srv_available_undo_logs = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; +UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_non_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; +UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; + /* Ensure status variables are on separate cache lines */ #define CACHE_LINE_SIZE 64 @@ -1841,6 +1866,16 @@ srv_export_innodb_status(void) export_vars.innodb_descriptors_memory = os_atomic_increment_ulint(&srv_descriptors_memory, 0); + export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved; + export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; + export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; + export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written; + export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; + export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; + export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; + export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; + #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); trx_id_t done_trx_no = purge_sys->done.trx_no; diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index c1c2f39aaa1..df575a8702c 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -3,6 +3,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -67,11 +68,14 @@ Created 2/16/1996 Heikki Tuuri #include "ibuf0ibuf.h" #include "srv0start.h" #include "srv0srv.h" +#include "buf0flu.h" + #ifndef UNIV_HOTBACKUP # include "trx0rseg.h" # include "os0proc.h" # include "sync0sync.h" # include "buf0flu.h" +# include "buf0mtflu.h" # include "buf0rea.h" # include "dict0boot.h" # include "dict0load.h" @@ -131,8 +135,11 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 - + SRV_MAX_N_PURGE_THREADS]; +/** 6 is the ? */ +#define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS) +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS + MTFLUSH_MAX_WORKER]; +/* Thread contex data for multi-threaded flush */ +void *mtflush_ctx=NULL; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. */ @@ -541,7 +548,7 @@ create_log_file( *file = os_file_create( innodb_file_log_key, name, OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name); @@ -758,7 +765,7 @@ open_log_file( *file = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); return(DB_ERROR); @@ -849,7 +856,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode) { @@ -892,7 +899,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -925,17 +932,17 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else if (i == 0) { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -1126,7 +1133,7 @@ srv_undo_tablespace_create( innodb_file_data_key, name, srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode && ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -1213,7 +1220,8 @@ srv_undo_tablespace_open( | OS_FILE_ON_ERROR_SILENT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + FALSE); /* If the file open was successful then load the tablespace. */ @@ -2738,6 +2746,25 @@ files_checked: } if (!srv_read_only_mode) { + + if (srv_use_mtflush) { + /* Start multi-threaded flush threads */ + mtflush_ctx = buf_mtflu_handler_init( + srv_mtflush_threads, + srv_buf_pool_instances); + + /* Set up the thread ids */ + buf_mtflu_set_thread_ids( + srv_mtflush_threads, + mtflush_ctx, + (thread_ids + 6 + SRV_MAX_N_PURGE_THREADS)); +#if UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", + __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); +#endif + } + + os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); } os_thread_create(buf_flush_lru_manager_thread, NULL, NULL); @@ -3010,8 +3037,19 @@ innobase_shutdown_for_mysql(void) logs_empty_and_mark_files_at_shutdown() and should have already quit or is quitting right now. */ + + if (srv_use_mtflush) { + /* g. Exit the multi threaded flush threads */ + + buf_mtflu_io_thread_exit(); + } + +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: %s:%d os_thread_count:%lu \n", __FUNCTION__, __LINE__, os_thread_count); +#endif os_mutex_enter(os_sync_mutex); + if (os_thread_count == 0) { /* All the threads have exited or are just exiting; NOTE that the threads may not have completed their diff --git a/storage/xtradb/ut/ut0wqueue.cc b/storage/xtradb/ut/ut0wqueue.cc index d1ba36b3b00..1607e535a94 100644 --- a/storage/xtradb/ut/ut0wqueue.cc +++ b/storage/xtradb/ut/ut0wqueue.cc @@ -161,6 +161,38 @@ ib_wqueue_timedwait( return(node ? node->data : NULL); } +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq) /*mutex); + + if(!ib_list_is_empty(wq->items)) { + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + } + } + + /* We must reset the event when the list + gets emptied. */ + if(ib_list_is_empty(wq->items)) { + os_event_reset(wq->event); + } + + mutex_exit(&wq->mutex); + + return (node ? node->data : NULL); +} + /******************************************************************** Check if queue is empty. */ @@ -173,3 +205,20 @@ ib_wqueue_is_empty( { return(ib_list_is_empty(wq->items)); } + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq) /*mutex); + len = ib_list_len(wq->items); + mutex_exit(&wq->mutex); + + return(len); +}