diff --git a/config.h.cmake b/config.h.cmake index fcc366939b2..6eaba1e0d9f 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -184,7 +184,6 @@ #cmakedefine HAVE_PERROR 1 #cmakedefine HAVE_POLL 1 #cmakedefine HAVE_POSIX_FALLOCATE 1 -#cmakedefine HAVE_LINUX_FALLOC_H 1 #cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1 #cmakedefine HAVE_PREAD 1 #cmakedefine HAVE_PAUSE_INSTRUCTION 1 diff --git a/configure.cmake b/configure.cmake index 947689d0f86..1404263e5a6 100644 --- a/configure.cmake +++ b/configure.cmake @@ -196,7 +196,6 @@ CHECK_INCLUDE_FILES (inttypes.h HAVE_INTTYPES_H) CHECK_INCLUDE_FILES (langinfo.h HAVE_LANGINFO_H) CHECK_INCLUDE_FILES (link.h HAVE_LINK_H) CHECK_INCLUDE_FILES (linux/unistd.h HAVE_LINUX_UNISTD_H) -CHECK_INCLUDE_FILES (linux/falloc.h HAVE_LINUX_FALLOC_H) CHECK_INCLUDE_FILES (limits.h HAVE_LIMITS_H) CHECK_INCLUDE_FILES (locale.h HAVE_LOCALE_H) CHECK_INCLUDE_FILES (malloc.h HAVE_MALLOC_H) diff --git a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result index a90cd22f57a..6b69d2d9ee4 100644 --- a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result +++ b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result @@ -385,7 +385,7 @@ SPACE NAME ENCRYPTION_SCHEME KEYSERVER_REQUESTS MIN_KEY_VERSION CURRENT_KEY_VERS Warnings: Warning 1012 InnoDB: SELECTing from INFORMATION_SCHEMA.innodb_tablespaces_encryption but the InnoDB storage engine is not installed select * from information_schema.innodb_tablespaces_scrubbing; -SPACE NAME COMPRESSED LAST_SCRUB_COMPLETED CURRENT_SCRUB_STARTED CURRENT_SCRUB_ACTIVE_THREADS CURRENT_SCRUB_PAGE_NUMBER CURRENT_SCRUB_MAX_PAGE_NUMBER ROTATING_OR_FLUSHING +SPACE NAME COMPRESSED LAST_SCRUB_COMPLETED CURRENT_SCRUB_STARTED CURRENT_SCRUB_ACTIVE_THREADS CURRENT_SCRUB_PAGE_NUMBER CURRENT_SCRUB_MAX_PAGE_NUMBER ON_SSD Warnings: Warning 1012 InnoDB: SELECTing from INFORMATION_SCHEMA.innodb_tablespaces_scrubbing but the InnoDB storage engine is not installed select * from information_schema.innodb_mutexes; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 37bfed420c0..ef1f1a9aea9 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2018, MariaDB Corporation. +Copyright (c) 2013, 2019, MariaDB Corporation. Copyright (c) 2013, 2014, Fusion-io This program is free software; you can redistribute it and/or modify it under @@ -1314,9 +1314,13 @@ buf_flush_try_neighbors( buf_pool_t* buf_pool = buf_pool_get(page_id); ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + fil_space_t* space = fil_space_acquire_for_io(page_id.space()); + if (!space) { + return 0; + } if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN - || srv_flush_neighbors == 0) { + || !srv_flush_neighbors || !space->is_rotational()) { /* If there is little space or neighbor flushing is not enabled then just flush the victim. */ low = page_id.page_no(); @@ -1371,9 +1375,8 @@ buf_flush_try_neighbors( } } - const ulint space_size = fil_space_get_size(page_id.space()); - if (high > space_size) { - high = space_size; + if (high > space->size) { + high = space->size; } DBUG_PRINT("ib_buf", ("flush %u:%u..%u", @@ -1450,6 +1453,8 @@ buf_flush_try_neighbors( buf_pool_mutex_exit(buf_pool); } + space->release_for_io(); + if (count > 1) { MONITOR_INC_VALUE_CUMULATIVE( MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 639c8dcb23f..1ac4b96d4f4 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -50,6 +50,11 @@ Created 10/25/1995 Heikki Tuuri #include "sync0sync.h" #include "buf0flu.h" #include "os0api.h" +#ifdef UNIV_LINUX +# include +# include +# include +#endif /** Tries to close a file in the LRU list. The caller must hold the fil_sys mutex. @@ -380,19 +385,6 @@ fil_space_get_latch( return(&(space->latch)); } -/** Note that the tablespace has been imported. -Initially, purpose=FIL_TYPE_IMPORT so that no redo log is -written while the space ID is being updated in each page. */ -void fil_space_t::set_imported() -{ - ut_ad(purpose == FIL_TYPE_IMPORT); - const fil_node_t* node = UT_LIST_GET_FIRST(chain); - atomic_write_supported = node->atomic_write - && srv_use_atomic_writes - && my_test_if_atomic_write(node->handle, physical_size()); - purpose = FIL_TYPE_TABLESPACE; -} - /**********************************************************************//** Checks if all the file nodes in a space are flushed. @return true if all are flushed */ @@ -505,108 +497,6 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, return node; } -/** Read the first page of a data file. -@param[in] first whether this is the very first read -@return whether the page was found valid */ -bool fil_node_t::read_page0(bool first) -{ - ut_ad(mutex_own(&fil_system.mutex)); - ut_a(space->purpose != FIL_TYPE_LOG); - const ulint psize = space->physical_size(); - - os_offset_t size_bytes = os_file_get_size(handle); - ut_a(size_bytes != (os_offset_t) -1); - const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize; - - if (size_bytes < min_size) { - ib::error() << "The size of the file " << name - << " is only " << size_bytes - << " bytes, should be at least " << min_size; - return false; - } - - byte* buf2 = static_cast(ut_malloc_nokey(2 * psize)); - - /* Align the memory for file i/o if we might have O_DIRECT set */ - byte* page = static_cast(ut_align(buf2, psize)); - IORequest request(IORequest::READ); - if (!os_file_read(request, handle, page, 0, psize)) { - ib::error() << "Unable to read first page of file " << name; - ut_free(buf2); - return false; - } - const ulint space_id = fsp_header_get_space_id(page); - ulint flags = fsp_header_get_flags(page); - const ulint size = fsp_header_get_field(page, FSP_SIZE); - const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT); - const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE - + page); - if (!fil_space_t::is_valid_flags(flags, space->id)) { - ulint cflags = fsp_flags_convert_from_101(flags); - if (cflags == ULINT_UNDEFINED) { -invalid: - ib::error() - << "Expected tablespace flags " - << ib::hex(space->flags) - << " but found " << ib::hex(flags) - << " in the file " << name; - ut_free(buf2); - return false; - } - - ulint cf = cflags & ~FSP_FLAGS_MEM_MASK; - ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK; - - if (!fil_space_t::is_flags_equal(cf, sf) - && !fil_space_t::is_flags_equal(sf, cf)) { - goto invalid; - } - - flags = cflags; - } - - ut_ad(!(flags & FSP_FLAGS_MEM_MASK)); - - /* Try to read crypt_data from page 0 if it is not yet read. */ - if (!space->crypt_data) { - space->crypt_data = fil_space_read_crypt_data( - fil_space_t::zip_size(flags), page); - } - ut_free(buf2); - - if (UNIV_UNLIKELY(space_id != space->id)) { - ib::error() << "Expected tablespace id " << space->id - << " but found " << space_id - << " in the file " << name; - return false; - } - - ut_ad(space->free_limit == 0 || space->free_limit == free_limit); - ut_ad(space->free_len == 0 || space->free_len == free_len); - space->size_in_header = size; - space->free_limit = free_limit; - space->free_len = free_len; - - if (first) { - /* Truncate the size to a multiple of extent size. */ - ulint mask = psize * FSP_EXTENT_SIZE - 1; - - if (size_bytes <= mask) { - /* .ibd files start smaller than an - extent size. Do not truncate valid data. */ - } else { - size_bytes &= ~os_offset_t(mask); - } - - space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags; - - this->size = ulint(size_bytes / psize); - space->size += this->size; - } - - return true; -} - /** Open a file node of a tablespace. @param[in,out] node File node @return false if the file can't be opened, otherwise true */ @@ -682,28 +572,6 @@ fail: OS_FILE_AIO, OS_DATA_FILE, read_only_mode, &success); } - if (space->purpose != FIL_TYPE_LOG) { - /* - For the temporary tablespace and during the - non-redo-logged adjustments in - IMPORT TABLESPACE, we do not care about - the atomicity of writes. - - Atomic writes is supported if the file can be used - with atomic_writes (not log file), O_DIRECT is - used (tested in ha_innodb.cc) and the file is - device and file system that supports atomic writes - for the given block size - */ - space->atomic_write_supported - = space->purpose == FIL_TYPE_TEMPORARY - || space->purpose == FIL_TYPE_IMPORT - || (node->atomic_write - && srv_use_atomic_writes - && my_test_if_atomic_write( - node->handle, space->physical_size())); - } - ut_a(success); ut_a(node->is_open()); @@ -967,12 +835,6 @@ fil_space_extend_must_retry( ulint last_page_no = space->size; const ulint file_start_page_no = last_page_no - node->size; - /* Determine correct file block size */ - if (node->block_size == 0) { - node->block_size = os_file_get_block_size( - node->handle, node->name); - } - const ulint page_size = space->physical_size(); /* fil_read_first_page() expects srv_page_size bytes. @@ -1435,8 +1297,8 @@ fil_space_create( to do */ if (purpose == FIL_TYPE_TABLESPACE && !srv_fil_crypt_rotate_key_age && fil_crypt_threads_event && - (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF || - srv_encrypt_tables)) { + (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF + || srv_encrypt_tables)) { /* Key rotation is not enabled, need to inform background encryption threads. */ UT_LIST_ADD_LAST(fil_system.rotation_list, space); @@ -1707,6 +1569,66 @@ void fil_system_t::create(ulint hash_size) spaces = hash_create(hash_size); fil_space_crypt_init(); +#ifdef UNIV_LINUX + ssd.clear(); + char fn[sizeof(dirent::d_name) + + sizeof "/sys/block/" "/queue/rotational"]; + const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block"; + memcpy(fn, "/sys/block/", sizeof "/sys/block"); + char* fnp = &fn[sizeof "/sys/block"]; + + std::set ssd_devices; + if (DIR* d = opendir("/sys/block")) { + while (struct dirent* e = readdir(d)) { + if (e->d_name[0] == '.') { + continue; + } + snprintf(fnp, sizeof_fnp, "%s/queue/rotational", + e->d_name); + int f = open(fn, O_RDONLY); + if (f == -1) { + continue; + } + char b[sizeof "4294967295:4294967295\n"]; + ssize_t l = read(f, b, sizeof b); + ::close(f); + if (l != 2 || memcmp("0\n", b, 2)) { + continue; + } + snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name); + f = open(fn, O_RDONLY); + if (f == -1) { + continue; + } + l = read(f, b, sizeof b); + ::close(f); + if (l <= 0 || b[l - 1] != '\n') { + continue; + } + b[l - 1] = '\0'; + char* end = b; + unsigned long dev_major = strtoul(b, &end, 10); + if (b == end || *end != ':' + || dev_major != unsigned(dev_major)) { + continue; + } + char* c = end + 1; + unsigned long dev_minor = strtoul(c, &end, 10); + if (c == end || *end + || dev_minor != unsigned(dev_minor)) { + continue; + } + ssd.push_back(makedev(unsigned(dev_major), + unsigned(dev_minor))); + } + closedir(d); + } + /* fil_system_t::is_ssd() assumes the following */ + ut_ad(makedev(0, 8) == 8); + ut_ad(makedev(0, 4) == 4); + ut_ad(makedev(0, 2) == 2); + ut_ad(makedev(0, 1) == 1); +#endif } void fil_system_t::close() @@ -2969,6 +2891,9 @@ fil_rename_tablespace( return(success); } +/* FIXME: remove this! */ +IF_WIN(, bool os_is_sparse_file_supported(os_file_t fh)); + /** Create a tablespace file. @param[in] space_id Tablespace ID @param[in] name Tablespace name in dbname/tablename format. @@ -3044,6 +2969,7 @@ fil_ibd_create( } const bool is_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(flags); + bool punch_hole = is_compressed; #ifdef _WIN32 if (is_compressed) { @@ -3061,9 +2987,8 @@ err_exit: return NULL; } - bool punch_hole = os_is_sparse_file_supported(file); - - ulint block_size = os_file_get_block_size(file, path); + /* FIXME: remove this */ + IF_WIN(, punch_hole = punch_hole && os_is_sparse_file_supported(file)); /* We have to write the space id to the file immediately and flush the file to disk. This is because in crash recovery we must be aware what @@ -3139,7 +3064,7 @@ err_exit: /* Create crypt data if the tablespace is either encrypted or user has requested it to remain unencrypted. */ if (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF || - srv_encrypt_tables) { + srv_encrypt_tables) { crypt_data = fil_space_create_crypt_data(mode, key_id); } @@ -3149,19 +3074,19 @@ err_exit: free(crypt_data); *err = DB_ERROR; } else { - fil_node_t* file = space->add(path, OS_FILE_CLOSED, size, + space->punch_hole = punch_hole; + /* FIXME: Keep the file open! */ + fil_node_t* node = space->add(path, OS_FILE_CLOSED, size, false, true); mtr_t mtr; mtr.start(); fil_op_write_log( - MLOG_FILE_CREATE2, space_id, 0, file->name, + MLOG_FILE_CREATE2, space_id, 0, node->name, NULL, space->flags & ~FSP_FLAGS_MEM_MASK, &mtr); - fil_name_write(space, 0, file, &mtr); + fil_name_write(space, 0, node, &mtr); mtr.commit(); - file->block_size = block_size; - space->punch_hole = punch_hole; - + node->find_metadata(file); *err = DB_SUCCESS; } @@ -4154,6 +4079,15 @@ fil_report_invalid_page_access( : ""); } +inline void IORequest::set_fil_node(fil_node_t* node) +{ + if (!node->space->punch_hole) { + clear_punch_hole(); + } + + m_fil_node = node; +} + /** Reads or writes data. This operation could be asynchronous (aio). @param[in,out] type IO context @@ -5215,29 +5149,6 @@ fil_space_found_by_id( return space; } -/** -Get should we punch hole to tablespace. -@param[in] node File node -@return true, if punch hole should be tried, false if not. */ -bool -fil_node_should_punch_hole( - const fil_node_t* node) -{ - return (node->space->punch_hole); -} - -/** -Set punch hole to tablespace to given value. -@param[in] node File node -@param[in] val value to be set. */ -void -fil_space_set_punch_hole( - fil_node_t* node, - bool val) -{ - node->space->punch_hole = val; -} - /** Checks that this tablespace in a list of unflushed tablespaces. @return true if in a list */ bool fil_space_t::is_in_unflushed_spaces() const { diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 52f1ace7e37..4016b11387e 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -8697,7 +8697,7 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] = #define TABLESPACES_SCRUBBING_COMPRESSED 2 {STRUCT_FLD(field_name, "COMPRESSED"), - STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_length, 1), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), @@ -8749,14 +8749,14 @@ static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] = STRUCT_FLD(old_name, ""), STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, -#define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9 - {STRUCT_FLD(field_name, "ROTATING_OR_FLUSHING"), - STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), +#define TABLESPACES_SCRUBBING_ON_SSD 8 + {STRUCT_FLD(field_name, "ON_SSD"), + STRUCT_FLD(field_length, 1), STRUCT_FLD(field_type, MYSQL_TYPE_LONG), STRUCT_FLD(value, 0), STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), - STRUCT_FLD(old_name, ""), - STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, END_OF_ST_FIELD_INFO }; @@ -8829,6 +8829,8 @@ i_s_dict_fill_tablespaces_scrubbing( } } + OK(fields[TABLESPACES_SCRUBBING_ON_SSD]->store(!space->is_rotational(), + true)); OK(schema_table_store_record(thd, table_to_fill)); DBUG_RETURN(0); diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index bde5a85ebf9..18449785e61 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -33,6 +33,9 @@ Created 10/25/1995 Heikki Tuuri #include "log0recv.h" #include "dict0types.h" +#ifdef UNIV_LINUX +# include +#endif // Forward declaration extern my_bool srv_use_doublewrite_buf; @@ -234,7 +237,10 @@ struct fil_space_t { /** Note that the tablespace has been imported. Initially, purpose=FIL_TYPE_IMPORT so that no redo log is written while the space ID is being updated in each page. */ - void set_imported(); + inline void set_imported(); + + /** @return whether the storage device is rotational (HDD, not SSD) */ + inline bool is_rotational() const; /** Open each file. Only invoked on fil_system.temp_space. @return whether all files were opened */ @@ -537,6 +543,8 @@ struct fil_node_t { pfs_os_file_t handle; /** whether the file actually is a raw device or disk partition */ bool is_raw_disk; + /** whether the file is on non-rotational media (SSD) */ + bool on_ssd; /** size of the file in database pages (0 if not known yet); the possible last incomplete megabyte may be ignored if space->id == 0 */ @@ -579,6 +587,14 @@ struct fil_node_t { @return whether the page was found valid */ bool read_page0(bool first); + /** Determine some file metadata when creating or reading the file. + @param file the file that is being created, or OS_FILE_CLOSED */ + void find_metadata(os_file_t file = OS_FILE_CLOSED +#ifdef UNIV_LINUX + , struct stat* statbuf = NULL +#endif + ); + /** Close the file handle. */ void close(); }; @@ -586,6 +602,24 @@ struct fil_node_t { /** Value of fil_node_t::magic_n */ #define FIL_NODE_MAGIC_N 89389 +inline void fil_space_t::set_imported() +{ + ut_ad(purpose == FIL_TYPE_IMPORT); + purpose = FIL_TYPE_TABLESPACE; + UT_LIST_GET_FIRST(chain)->find_metadata(); +} + +inline bool fil_space_t::is_rotational() const +{ + for (const fil_node_t* node = UT_LIST_GET_FIRST(chain); + node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { + if (!node->on_ssd) { + return true; + } + } + return false; +} + /** Common InnoDB file extentions */ enum ib_extention { NO_EXT = 0, @@ -853,6 +887,22 @@ struct fil_system_t { private: bool m_initialised; +#ifdef UNIV_LINUX + /** available block devices that reside on non-rotational storage */ + std::vector ssd; +public: + /** @return whether a file system device is on non-rotational storage */ + bool is_ssd(dev_t dev) const + { + /* Linux seems to allow up to 15 partitions per block device. + If the detected ssd carries "partition number 0" (it is the whole device), + compare the candidate file system number without the partition number. */ + for (const auto s : ssd) + if (dev == s || (dev & ~15U) == s) + return true; + return false; + } +#endif public: ib_mutex_t mutex; /*!< The mutex protecting the cache */ fil_space_t* sys_space; /*!< The innodb_system tablespace */ diff --git a/storage/innobase/include/os0api.h b/storage/innobase/include/os0api.h index 63f213b5457..20111cbf7f0 100644 --- a/storage/innobase/include/os0api.h +++ b/storage/innobase/include/os0api.h @@ -1,6 +1,6 @@ /*********************************************************************** -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -54,22 +54,4 @@ buf_page_get_trim_length( ulint write_length) MY_ATTRIBUTE((warn_unused_result)); -/** -Get should we punch hole to tablespace. -@param[in] space Tablespace -@return true, if punch hole should be tried, false if not. */ -bool -fil_node_should_punch_hole( - const fil_node_t* node) - MY_ATTRIBUTE((warn_unused_result)); - -/** -Set punch hole to tablespace to given value. -@param[in] space Tablespace -@param[in] val value to be set. */ -void -fil_space_set_punch_hole( - fil_node_t* node, - bool val); - #endif /* OS_API_H */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 2c11e447952..e7f076fb79a 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2019, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -360,17 +360,8 @@ public: /** Set the pointer to file node for IO @param[in] node File node */ - void set_fil_node(fil_node_t* node) - { - if (node && !fil_node_should_punch_hole(node)) { - clear_punch_hole(); - } + inline void set_fil_node(fil_node_t* node); - m_fil_node = node; - } - - /** Compare two requests - @reutrn true if the are equal */ bool operator==(const IORequest& rhs) const { return(m_type == rhs.m_type); @@ -414,17 +405,7 @@ public: : 0); } - bool should_punch_hole() const { - return (m_fil_node ? - fil_node_should_punch_hole(m_fil_node) - : false); - } - - void space_no_punch_hole() const { - if (m_fil_node) { - fil_space_set_punch_hole(m_fil_node, false); - } - } + inline bool should_punch_hole() const; /** Free storage space associated with a section of the file. @param[in] fh Open file handle @@ -1591,19 +1572,6 @@ os_file_change_size_win32( #endif /*_WIN32 */ -/** Check if the file system supports sparse files. - -Warning: On POSIX systems we try and punch a hole from offset 0 to -the system configured page size. This should only be called on an empty -file. - -@param[in] fh File handle for the file - if opened -@return true if the file system supports sparse files */ -bool -os_is_sparse_file_supported( - os_file_t fh) - MY_ATTRIBUTE((warn_unused_result)); - /** Free storage space associated with a section of the file. @param[in] fh Open file handle @param[in] off Starting offset (SEEK_SET) @@ -1643,16 +1611,6 @@ is_absolute_path( return(false); } -/***********************************************************************//** -Try to get number of bytes per sector from file system. -@return file block size */ -UNIV_INTERN -ulint -os_file_get_block_size( -/*===================*/ - os_file_t file, /*!< in: handle to a file */ - const char* name); /*!< in: file name */ - #include "os0file.ic" #endif /* os0file_h */ diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index bbb8a0a4b55..23e0a401c1e 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -38,14 +38,14 @@ Created 10/21/1995 Heikki Tuuri #include "sql_const.h" #ifdef UNIV_LINUX -#include -#include +# include +# include #endif #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" -#include "srv0srv.h" +#include "fsp0fsp.h" #ifdef HAVE_LINUX_UNISTD_H #include "unistd.h" #endif @@ -70,14 +70,6 @@ Created 10/21/1995 Heikki Tuuri # endif #endif -#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) -#include -#endif - -#if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H) -#include -#endif - #ifdef _WIN32 #include #endif @@ -821,108 +813,6 @@ os_win32_device_io_control( #endif -/***********************************************************************//** -Try to get number of bytes per sector from file system. -@return file block size */ -UNIV_INTERN -ulint -os_file_get_block_size( -/*===================*/ - os_file_t file, /*!< in: handle to a file */ - const char* name) /*!< in: file name */ -{ - ulint fblock_size = 512; - -#if defined(UNIV_LINUX) - struct stat local_stat; - int err; - - err = fstat((int)file, &local_stat); - - if (err != 0) { - os_file_handle_error_no_exit(name, "fstat()", FALSE); - } else { - fblock_size = local_stat.st_blksize; - } -#endif /* UNIV_LINUX */ -#ifdef _WIN32 - - fblock_size = 0; - BOOL result = false; - size_t len = 0; - // Open volume for this file, find out it "physical bytes per sector" - - HANDLE volume_handle = INVALID_HANDLE_VALUE; - char volume[MAX_PATH + 4]="\\\\.\\"; // Special prefix required for volume names. - if (!GetVolumePathName(name , volume + 4, MAX_PATH)) { - os_file_handle_error_no_exit(name, - "GetVolumePathName()", FALSE); - goto end; - } - - len = strlen(volume); - if (volume[len - 1] == '\\') { - // Trim trailing backslash from volume name. - volume[len - 1] = 0; - } - - volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - 0, OPEN_EXISTING, 0, 0); - - if (volume_handle == INVALID_HANDLE_VALUE) { - if (GetLastError() != ERROR_ACCESS_DENIED) { - os_file_handle_error_no_exit(volume, - "CreateFile()", FALSE); - } - goto end; - } - - DWORD tmp; - STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment; - - STORAGE_PROPERTY_QUERY storage_query; - memset(&storage_query, 0, sizeof(storage_query)); - storage_query.PropertyId = StorageAccessAlignmentProperty; - storage_query.QueryType = PropertyStandardQuery; - - result = os_win32_device_io_control(volume_handle, - IOCTL_STORAGE_QUERY_PROPERTY, - &storage_query, - sizeof(storage_query), - &disk_alignment, - sizeof(disk_alignment), - &tmp); - - if (!result) { - DWORD err = GetLastError(); - if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) { - os_file_handle_error_no_exit(volume, - "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)", FALSE); - } - goto end; - } - - fblock_size = disk_alignment.BytesPerPhysicalSector; - -end: - if (volume_handle != INVALID_HANDLE_VALUE) { - CloseHandle(volume_handle); - } -#endif /* _WIN32 */ - - /* Currently we support file block size up to 4Kb */ - if (fblock_size > 4096 || fblock_size < 512) { - if (fblock_size < 512) { - fblock_size = 512; - } else { - fblock_size = 4096; - } - } - - return fblock_size; -} - #ifdef WIN_ASYNC_IO /** This function is only used in Windows asynchronous i/o. Waits for an aio operation to complete. This function is used to wait the @@ -5255,6 +5145,34 @@ short_warning: #endif /* _WIN32 */ +/** Check if the file system supports sparse files. +@param fh file handle +@return true if the file system supports sparse files */ +IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh) +{ + /* In this debugging mode, we act as if punch hole is supported, + then we skip any calls to actually punch a hole. In this way, + Transparent Page Compression is still being tested. */ + DBUG_EXECUTE_IF("ignore_punch_hole", + return(true); + ); + +#ifdef _WIN32 + FILE_ATTRIBUTE_TAG_INFO info; + if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, + &info, (DWORD)sizeof(info))) { + if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { + return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; + } + } + return false; +#else + /* We don't know the FS block size, use the sector size. The FS + will do the magic. */ + return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size); +#endif /* _WIN32 */ +} + /** Extend a file. On Windows, extending a file allocates blocks for the file, @@ -5482,15 +5400,16 @@ os_file_punch_hole( os_offset_t off, os_offset_t len) { - dberr_t err; - #ifdef _WIN32 - err = os_file_punch_hole_win32(fh, off, len); + return os_file_punch_hole_win32(fh, off, len); #else - err = os_file_punch_hole_posix(fh, off, len); + return os_file_punch_hole_posix(fh, off, len); #endif /* _WIN32 */ +} - return (err); +inline bool IORequest::should_punch_hole() const +{ + return m_fil_node && m_fil_node->space->punch_hole; } /** Free storage space associated with a section of the file. @@ -5530,7 +5449,9 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) /* If punch hole is not supported, set space so that it is not used. */ if (err == DB_IO_NO_PUNCH_HOLE) { - space_no_punch_hole(); + if (m_fil_node) { + m_fil_node->space->punch_hole = false; + } err = DB_SUCCESS; } } @@ -5538,43 +5459,6 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) return (err); } -/** Check if the file system supports sparse files. - -Warning: On POSIX systems we try and punch a hole from offset 0 to -the system configured page size. This should only be called on an empty -file. -@param[in] fh File handle for the file - if opened -@return true if the file system supports sparse files */ -bool -os_is_sparse_file_supported(os_file_t fh) -{ - /* In this debugging mode, we act as if punch hole is supported, - then we skip any calls to actually punch a hole. In this way, - Transparent Page Compression is still being tested. */ - DBUG_EXECUTE_IF("ignore_punch_hole", - return(true); - ); - -#ifdef _WIN32 - FILE_ATTRIBUTE_TAG_INFO info; - if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, - &info, (DWORD)sizeof(info))) { - if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { - return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; - } - } - return false; -#else - dberr_t err; - - /* We don't know the FS block size, use the sector size. The FS - will do the magic. */ - err = os_file_punch_hole_posix(fh, 0, srv_page_size); - - return(err == DB_SUCCESS); -#endif /* _WIN32 */ -} - /** This function returns information about the specified file @param[in] path pathname of the file @param[out] stat_info information of a file in a directory @@ -7604,6 +7488,279 @@ os_file_set_umask(ulint umask) os_innodb_umask = umask; } +/** Determine some file metadata when creating or reading the file. +@param file the file that is being created, or OS_FILE_CLOSED */ +void fil_node_t::find_metadata(os_file_t file +#ifdef UNIV_LINUX + , struct stat* statbuf +#endif + ) +{ + if (file == OS_FILE_CLOSED) { + file = handle; + ut_ad(is_open()); + } + +#ifdef _WIN32 /* FIXME: make this unconditional */ + if (space->punch_hole) { + space->punch_hole = os_is_sparse_file_supported(file); + } +#endif + + /* + For the temporary tablespace and during the + non-redo-logged adjustments in + IMPORT TABLESPACE, we do not care about + the atomicity of writes. + + Atomic writes is supported if the file can be used + with atomic_writes (not log file), O_DIRECT is + used (tested in ha_innodb.cc) and the file is + device and file system that supports atomic writes + for the given block size. + */ + space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT; +#ifdef _WIN32 + block_size = 512; + on_ssd = false; + // Open volume for this file, find out it "physical bytes per sector" + char volume[MAX_PATH + 4]; + if (!GetVolumePathName(name, volume + 4, MAX_PATH)) { + os_file_handle_error_no_exit(name, + "GetVolumePathName()", FALSE); + return; + } + // Special prefix required for volume names. + memcpy(volume, "\\\\.\\", 4); + + size_t len = strlen(volume); + if (volume[len - 1] == '\\') { + // Trim trailing backslash from volume name. + volume[len - 1] = 0; + } + + HANDLE volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + 0, OPEN_EXISTING, 0, 0); + + if (volume_handle != INVALID_HANDLE_VALUE) { + DWORD tmp; + union { + STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment; + DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty; + } result; + STORAGE_PROPERTY_QUERY storage_query; + memset(&storage_query, 0, sizeof(storage_query)); + storage_query.PropertyId = StorageAccessAlignmentProperty; + storage_query.QueryType = PropertyStandardQuery; + + if (!os_win32_device_io_control(volume_handle, + IOCTL_STORAGE_QUERY_PROPERTY, + &storage_query, + sizeof storage_query, + &result.disk_alignment, + sizeof result.disk_alignment, + &tmp) + || tmp < sizeof result.disk_alignment) { + switch (GetLastError()) { + case ERROR_INVALID_FUNCTION: + case ERROR_NOT_SUPPORTED: + break; + default: + ioctl_fail: + os_file_handle_error_no_exit( + volume, + "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)", + FALSE); + } + goto end; + } + + block_size = result.disk_alignment.BytesPerPhysicalSector; + + storage_query.PropertyId = StorageDeviceSeekPenaltyProperty; + storage_query.QueryType = PropertyStandardQuery; + + if (!os_win32_device_io_control(volume_handle, + IOCTL_STORAGE_QUERY_PROPERTY, + &storage_query, + sizeof storage_query, + &result.seek_penalty, + sizeof result.seek_penalty, + &tmp) + || tmp < sizeof result.seek_penalty) { + switch (GetLastError()) { + case ERROR_INVALID_FUNCTION: + case ERROR_NOT_SUPPORTED: + case ERROR_GEN_FAILURE: + goto end; + default: + goto ioctl_fail; + } + } + + on_ssd = !result.seek_penalty.IncursSeekPenalty; +end: + if (volume_handle != INVALID_HANDLE_VALUE) { + CloseHandle(volume_handle); + } + } else { + if (GetLastError() != ERROR_ACCESS_DENIED) { + os_file_handle_error_no_exit(volume, + "CreateFile()", FALSE); + } + } + + /* Currently we support file block size up to 4KiB */ + if (block_size > 4096) { + block_size = 4096; + } else if (block_size < 512) { + block_size = 512; + } +#else + on_ssd = space->atomic_write_supported; +# ifdef UNIV_LINUX + if (!on_ssd) { + struct stat sbuf; + if (!statbuf && !fstat(file, &sbuf)) { + statbuf = &sbuf; + } + if (statbuf && fil_system.is_ssd(statbuf->st_dev)) { + on_ssd = true; + } + } +# endif +#endif + if (!space->atomic_write_supported) { + space->atomic_write_supported = atomic_write + && srv_use_atomic_writes +#ifdef _WIN32 + && my_test_if_atomic_write(file, + space->physical_size()) +#else + && srv_page_size == block_size +#endif + ; + } +} + +/** Read the first page of a data file. +@param[in] first whether this is the very first read +@return whether the page was found valid */ +bool fil_node_t::read_page0(bool first) +{ + ut_ad(mutex_own(&fil_system.mutex)); + ut_a(space->purpose != FIL_TYPE_LOG); + const ulint psize = space->physical_size(); +#ifndef _WIN32 + struct stat statbuf; + if (fstat(handle, &statbuf)) { + return false; + } + block_size = statbuf.st_blksize; + os_offset_t size_bytes = statbuf.st_size; +#else + os_offset_t size_bytes = os_file_get_size(handle); + ut_a(size_bytes != (os_offset_t) -1); +#endif + const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize; + + if (size_bytes < min_size) { + ib::error() << "The size of the file " << name + << " is only " << size_bytes + << " bytes, should be at least " << min_size; + return false; + } + + byte* buf2 = static_cast(ut_malloc_nokey(2 * psize)); + + /* Align the memory for file i/o if we might have O_DIRECT set */ + byte* page = static_cast(ut_align(buf2, psize)); + IORequest request(IORequest::READ); + if (!os_file_read(request, handle, page, 0, psize)) { + ib::error() << "Unable to read first page of file " << name; + ut_free(buf2); + return false; + } + const ulint space_id = fsp_header_get_space_id(page); + ulint flags = fsp_header_get_flags(page); + const ulint size = fsp_header_get_field(page, FSP_SIZE); + const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT); + const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + + page); + if (!fil_space_t::is_valid_flags(flags, space->id)) { + ulint cflags = fsp_flags_convert_from_101(flags); + if (cflags == ULINT_UNDEFINED) { +invalid: + ib::error() + << "Expected tablespace flags " + << ib::hex(space->flags) + << " but found " << ib::hex(flags) + << " in the file " << name; + ut_free(buf2); + return false; + } + + ulint cf = cflags & ~FSP_FLAGS_MEM_MASK; + ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK; + + if (!fil_space_t::is_flags_equal(cf, sf) + && !fil_space_t::is_flags_equal(sf, cf)) { + goto invalid; + } + + flags = cflags; + } + + ut_ad(!(flags & FSP_FLAGS_MEM_MASK)); + + /* Try to read crypt_data from page 0 if it is not yet read. */ + if (!space->crypt_data) { + space->crypt_data = fil_space_read_crypt_data( + fil_space_t::zip_size(flags), page); + } + ut_free(buf2); + + if (UNIV_UNLIKELY(space_id != space->id)) { + ib::error() << "Expected tablespace id " << space->id + << " but found " << space_id + << " in the file " << name; + return false; + } + + ut_ad(space->free_limit == 0 || space->free_limit == free_limit); + ut_ad(space->free_len == 0 || space->free_len == free_len); + space->size_in_header = size; + space->free_limit = free_limit; + space->free_len = free_len; + + if (first) { +#ifdef UNIV_LINUX + find_metadata(handle, &statbuf); +#else + find_metadata(); +#endif + + /* Truncate the size to a multiple of extent size. */ + ulint mask = psize * FSP_EXTENT_SIZE - 1; + + if (size_bytes <= mask) { + /* .ibd files start smaller than an + extent size. Do not truncate valid data. */ + } else { + size_bytes &= ~os_offset_t(mask); + } + + space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags; + + this->size = ulint(size_bytes / psize); + space->size += this->size; + } + + return true; +} + #else #include "univ.i" #endif /* !UNIV_INNOCHECKSUM */