From 7f7329f0924b72bda152aafbcf75ccbbfeeb444b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 20 Feb 2024 11:22:45 +0200 Subject: [PATCH] MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. 
Reviewed by: Vladislav Vaintroub --- cmake/os/AIX.cmake | 3 - cmake/os/SunOS.cmake | 4 - cmake/os/WindowsCache.cmake | 1 - config.h.cmake | 1 - configure.cmake | 1 - extra/mariabackup/common.h | 1 - extra/mariabackup/fil_cur.cc | 8 - extra/mariabackup/xtrabackup.cc | 3 +- mysys/mf_tempfile.c | 37 ++- storage/innobase/fil/fil0fil.cc | 4 +- storage/innobase/handler/ha_innodb.cc | 2 +- storage/innobase/include/fil0fil.h | 2 +- storage/innobase/include/os0file.h | 21 +- storage/innobase/include/row0merge.h | 11 +- storage/innobase/os/os0file.cc | 406 +++++++++++--------------- storage/innobase/row/row0merge.cc | 26 +- 16 files changed, 227 insertions(+), 304 deletions(-) diff --git a/cmake/os/AIX.cmake b/cmake/os/AIX.cmake index 7513c4f42c2..299b79198c6 100644 --- a/cmake/os/AIX.cmake +++ b/cmake/os/AIX.cmake @@ -34,8 +34,5 @@ ELSE() SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES -maix64 -pthread -mcmodel=large") ENDIF() -# fcntl(fd, F_SETFL, O_DIRECT) is not supported; O_DIRECT is an open(2) flag -SET(HAVE_FCNTL_DIRECT 0 CACHE INTERNAL "") - # make it WARN by default, not AUTO (that implies -Werror) SET(MYSQL_MAINTAINER_MODE "WARN" CACHE STRING "Enable MariaDB maintainer-specific warnings. One of: NO (warnings are disabled) WARN (warnings are enabled) ERR (warnings are errors) AUTO (warnings are errors in Debug only)") diff --git a/cmake/os/SunOS.cmake b/cmake/os/SunOS.cmake index 3d99d34789a..3a9d2dccb87 100644 --- a/cmake/os/SunOS.cmake +++ b/cmake/os/SunOS.cmake @@ -17,10 +17,6 @@ INCLUDE(CheckSymbolExists) INCLUDE(CheckCSourceRuns) INCLUDE(CheckCSourceCompiles) -# fcntl(fd, F_SETFL, O_DIRECT) is not supported, -# and directio(3C) would only work on UFS or NFS, not ZFS. 
-SET(HAVE_FCNTL_DIRECT 0 CACHE INTERNAL "") - # Enable 64 bit file offsets SET(_FILE_OFFSET_BITS 64) diff --git a/cmake/os/WindowsCache.cmake b/cmake/os/WindowsCache.cmake index ceb4262730f..c1048661aaa 100644 --- a/cmake/os/WindowsCache.cmake +++ b/cmake/os/WindowsCache.cmake @@ -44,7 +44,6 @@ SET(HAVE_EXECINFO_H CACHE INTERNAL "") SET(HAVE_FCHMOD CACHE INTERNAL "") SET(HAVE_FCNTL CACHE INTERNAL "") SET(HAVE_FCNTL_H 1 CACHE INTERNAL "") -SET(HAVE_FCNTL_DIRECT 0 CACHE INTERNAL "") SET(HAVE_FCNTL_NONBLOCK CACHE INTERNAL "") SET(HAVE_FDATASYNC CACHE INTERNAL "") SET(HAVE_DECL_FDATASYNC CACHE INTERNAL "") diff --git a/config.h.cmake b/config.h.cmake index 81ca8fe77c0..b1f69431374 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -30,7 +30,6 @@ #cmakedefine HAVE_DLFCN_H 1 #cmakedefine HAVE_EXECINFO_H 1 #cmakedefine HAVE_FCNTL_H 1 -#cmakedefine HAVE_FCNTL_DIRECT 1 #cmakedefine HAVE_FENV_H 1 #cmakedefine HAVE_FLOAT_H 1 #cmakedefine HAVE_FNMATCH_H 1 diff --git a/configure.cmake b/configure.cmake index 061837c1226..c2706b50502 100644 --- a/configure.cmake +++ b/configure.cmake @@ -706,7 +706,6 @@ CHECK_SYMBOL_EXISTS(O_NONBLOCK "unistd.h;fcntl.h" HAVE_FCNTL_NONBLOCK) IF(NOT HAVE_FCNTL_NONBLOCK) SET(NO_FCNTL_NONBLOCK 1) ENDIF() -CHECK_SYMBOL_EXISTS(O_DIRECT "fcntl.h" HAVE_FCNTL_DIRECT) # # Test for how the C compiler does inline, if at all diff --git a/extra/mariabackup/common.h b/extra/mariabackup/common.h index c983ced22b1..6fde514e8bb 100644 --- a/extra/mariabackup/common.h +++ b/extra/mariabackup/common.h @@ -23,7 +23,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA #include #include -#include #include #include diff --git a/extra/mariabackup/fil_cur.cc b/extra/mariabackup/fil_cur.cc index e9687e770c8..08936d122b2 100644 --- a/extra/mariabackup/fil_cur.cc +++ b/extra/mariabackup/fil_cur.cc @@ -199,14 +199,6 @@ xb_fil_cur_open( return(XB_FIL_CUR_SKIP); } -#ifdef HAVE_FCNTL_DIRECT - if (srv_file_flush_method == SRV_O_DIRECT - || 
srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) { - - os_file_set_nocache(cursor->file, node->name, "OPEN"); - } -#endif - posix_fadvise(cursor->file, 0, 0, POSIX_FADV_SEQUENTIAL); cursor->page_size = node->space->physical_size(); diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 527af617c6a..fa50e6286b4 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -54,7 +54,6 @@ Street, Fifth Floor, Boston, MA 02110-1335 USA #include #include -#include #include #ifdef __linux__ @@ -2428,7 +2427,7 @@ static bool innodb_init() os_file_delete_if_exists_func(ib_logfile0.c_str(), nullptr); os_file_t file= os_file_create_func(ib_logfile0.c_str(), OS_FILE_CREATE, OS_FILE_NORMAL, -#if defined _WIN32 || defined HAVE_FCNTL_DIRECT +#if defined _WIN32 || defined O_DIRECT OS_DATA_FILE_NO_O_DIRECT, #else OS_DATA_FILE, diff --git a/mysys/mf_tempfile.c b/mysys/mf_tempfile.c index 0f1c6d6b1bc..3393b610570 100644 --- a/mysys/mf_tempfile.c +++ b/mysys/mf_tempfile.c @@ -66,7 +66,11 @@ File create_temp_file(char *to, const char *dir, const char *prefix, DBUG_ENTER("create_temp_file"); DBUG_PRINT("enter", ("dir: %s, prefix: %s", dir ? 
dir : "(null)", prefix)); +#if !defined _WIN32 && defined O_DIRECT + DBUG_ASSERT((mode & (O_EXCL | O_TRUNC | O_CREAT | O_RDWR | O_DIRECT)) == 0); +#else DBUG_ASSERT((mode & (O_EXCL | O_TRUNC | O_CREAT | O_RDWR)) == 0); +#endif mode|= O_TRUNC | O_CREAT | O_RDWR; /* not O_EXCL, see Windows code below */ @@ -118,16 +122,41 @@ File create_temp_file(char *to, const char *dir, const char *prefix, if ((MyFlags & MY_TEMPORARY) && O_TMPFILE_works) { - /* explictly don't use O_EXCL here has it has a different - meaning with O_TMPFILE + /* + explicitly don't use O_EXCL here as it has a different + meaning with O_TMPFILE */ - if ((file= open(dir, (mode & ~O_CREAT) | O_TMPFILE | O_CLOEXEC, - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP)) >= 0) + const int flags= (mode & ~O_CREAT) | O_TMPFILE | O_CLOEXEC; + const mode_t open_mode= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +# ifdef O_DIRECT + static int O_TMPFILE_works_with_O_DIRECT= O_DIRECT; + const int try_O_DIRECT= mode & O_TMPFILE_works_with_O_DIRECT; + if (try_O_DIRECT) + file= open(dir, flags | O_DIRECT, open_mode); + else +# endif + file= open(dir, flags, open_mode); + + if (file >= 0) { +# ifdef O_DIRECT + O_TMPFILE_works: +# endif my_snprintf(to, FN_REFLEN, "%s/#sql/fd=%d", dir, file); file=my_register_filename(file, to, FILE_BY_O_TMPFILE, EE_CANTCREATEFILE, MyFlags); } +# ifdef O_DIRECT + else if (errno == EINVAL && try_O_DIRECT) + { + file= open(dir, flags, open_mode); + if (file >= 0) + { + O_TMPFILE_works_with_O_DIRECT= 0; + goto O_TMPFILE_works; + } + } +# endif else if (errno == EOPNOTSUPP || errno == EINVAL) { my_printf_error(EE_CANTCREATEFILE, "O_TMPFILE is not supported on %s " diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index bd0ace7c429..c5f8ffc2c8d 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -342,7 +342,7 @@ static bool fil_node_open_file_low(fil_node_t *node) ut_ad(node->space->is_closing()); mysql_mutex_assert_owner(&fil_system.mutex); 
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility"); -#if defined _WIN32 || defined HAVE_FCNTL_DIRECT +#if defined _WIN32 || defined O_DIRECT ulint type; switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) { case 1: @@ -1906,7 +1906,7 @@ fil_ibd_create( static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility"); -#if defined _WIN32 || defined HAVE_FCNTL_DIRECT +#if defined _WIN32 || defined O_DIRECT ulint type; switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { case 1: diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 32d2a85df39..3927bb737ad 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4007,7 +4007,7 @@ static int innodb_init_params() data_mysql_default_charset_coll = (ulint) default_charset_info->number; -#ifdef HAVE_FCNTL_DIRECT +#if !defined _WIN32 && defined O_DIRECT if (srv_use_atomic_writes && my_may_have_atomic_write) { /* Force O_DIRECT on Unixes (on Windows writes are always diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 16588e948b7..142cc19721e 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -63,7 +63,7 @@ enum srv_flush_t SRV_LITTLESYNC, /** do not flush after writing */ SRV_NOSYNC, - /** invoke os_file_set_nocache() on data files. This implies using + /** Open or create files with O_DIRECT. 
This implies using unbuffered I/O but still fdatasync(), because some filesystems might not flush meta-data on write completion */ SRV_O_DIRECT, diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index c8374515859..c34171420f8 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -113,11 +113,8 @@ enum os_file_create_t { doesn't exist, error) */ OS_FILE_CREATE, /*!< to create new file (if exists, error) */ - OS_FILE_OVERWRITE, /*!< to create a new file, if exists - the overwrite old file */ OS_FILE_OPEN_RAW, /*!< to open a raw device or disk partition */ - OS_FILE_CREATE_PATH, /*!< to create the directories */ OS_FILE_OPEN_RETRY, /*!< open with retry */ /** Flags that can be combined with the above values. Please ensure @@ -144,7 +141,7 @@ static const ulint OS_FILE_NORMAL = 62; /** Types for file create @{ */ static constexpr ulint OS_DATA_FILE = 100; static constexpr ulint OS_LOG_FILE = 101; -#if defined _WIN32 || defined HAVE_FCNTL_DIRECT +#if defined _WIN32 || defined O_DIRECT static constexpr ulint OS_DATA_FILE_NO_O_DIRECT = 103; #endif /* @} */ @@ -375,22 +372,6 @@ os_file_create_simple_no_error_handling_func( bool* success) MY_ATTRIBUTE((warn_unused_result)); -#ifndef HAVE_FCNTL_DIRECT -#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0) -#else -/** Tries to disable OS caching on an opened file descriptor. -@param[in] fd file descriptor to alter -@param[in] file_name file name, used in the diagnostic message -@param[in] name "open" or "create"; used in the diagnostic - message */ -void -os_file_set_nocache( -/*================*/ - int fd, /*!< in: file descriptor to alter */ - const char* file_name, - const char* operation_name); -#endif - #ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */ /** Obtain an exclusive lock on a file. 
@param fd file descriptor diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index 93ea650d0cf..1c2af128229 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -165,14 +165,11 @@ row_merge_drop_indexes( prepare_inplace_alter_table_dict(). */ void row_merge_drop_temp_indexes(); -/** Create temporary merge files in the given paramater path, and if -UNIV_PFS_IO defined, register the file descriptor with Performance Schema. -@param[in] path location for creating temporary merge files, or NULL +/** Create a temporary file at the specified path. +@param path location for creating temporary merge files, or nullptr @return File descriptor */ -pfs_os_file_t -row_merge_file_create_low( - const char* path) - MY_ATTRIBUTE((warn_unused_result)); +pfs_os_file_t row_merge_file_create_low(const char *path) + MY_ATTRIBUTE((warn_unused_result)); /*********************************************************************//** Destroy a merge file. And de-register the file from Performance Schema if UNIV_PFS_IO is defined. 
*/ diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 31bec346d4c..ae46df1f56e 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -974,57 +974,19 @@ os_file_create_simple_func( *success = false; - int create_flag; - const char* mode_str __attribute__((unused)); + int create_flag = O_RDONLY | O_CLOEXEC; ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); - if (create_mode == OS_FILE_OPEN) { - mode_str = "OPEN"; - - if (access_type == OS_FILE_READ_ONLY) { - - create_flag = O_RDONLY; - - } else if (read_only) { - - create_flag = O_RDONLY; - - } else { - create_flag = O_RDWR; + if (read_only) { + } else if (create_mode == OS_FILE_OPEN) { + if (access_type != OS_FILE_READ_ONLY) { + create_flag = O_RDWR | O_CLOEXEC; } - - } else if (read_only) { - - mode_str = "OPEN"; - create_flag = O_RDONLY; - } else if (create_mode == OS_FILE_CREATE) { - - mode_str = "CREATE"; - create_flag = O_RDWR | O_CREAT | O_EXCL; - - } else if (create_mode == OS_FILE_CREATE_PATH) { - - mode_str = "CREATE PATH"; - /* Create subdirs along the path if needed. */ - - *success = os_file_create_subdirs_if_needed(name); - - if (!*success) { - - ib::error() - << "Unable to create subdirectories '" - << name << "'"; - - return(OS_FILE_CLOSED); - } - - create_flag = O_RDWR | O_CREAT | O_EXCL; - create_mode = OS_FILE_CREATE; + create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC; } else { - ib::error() << "Unknown file create mode (" << create_mode @@ -1035,10 +997,33 @@ os_file_create_simple_func( bool retry; +#ifdef O_DIRECT + int direct_flag = 0; + /* This function is always called for data files, we should disable + OS caching (O_DIRECT) here as we do in os_file_create_func(), so + we open the same file in the same mode, see man page of open(2). 
*/ + switch (srv_file_flush_method) { + case SRV_O_DSYNC: + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: + direct_flag = O_DIRECT; + break; + } +#else + constexpr int direct_flag = 0; +#endif + do { - file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + file = open(name, create_flag | direct_flag, os_innodb_umask); if (file == -1) { +#ifdef O_DIRECT + if (direct_flag && errno == EINVAL) { + direct_flag = 0; + retry = true; + continue; + } +#endif *success = false; retry = os_file_handle_error( name, @@ -1051,24 +1036,6 @@ os_file_create_simple_func( } while (retry); -#ifdef HAVE_FCNTL_DIRECT - /* This function is always called for data files, we should disable - OS caching (O_DIRECT) here as we do in os_file_create_func(), so - we open the same file in the same mode, see man page of open(2). */ - if (!srv_read_only_mode && *success) { - switch (srv_file_flush_method) { - case SRV_O_DSYNC: - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: - os_file_set_nocache(file, name, mode_str); - break; - default: - break; - } - } -#endif - -#ifndef _WIN32 if (!read_only && *success && access_type == OS_FILE_READ_WRITE @@ -1079,7 +1046,6 @@ os_file_create_simple_func( close(file); file = -1; } -#endif /* !_WIN32 */ return(file); } @@ -1113,6 +1079,61 @@ os_file_create_directory( return(true); } +#ifdef O_DIRECT +# if defined __linux +/** Note that the log file uses buffered I/O. */ +static ATTRIBUTE_COLD void os_file_log_buffered() +{ + log_sys.log_maybe_unbuffered= false; + log_sys.log_buffered= true; + log_sys.set_block_size(512); +} +# endif + +/** @return whether the log file may work with unbuffered I/O. 
*/ +static ATTRIBUTE_COLD bool os_file_log_maybe_unbuffered(const struct stat &st) +{ + MSAN_STAT_WORKAROUND(&st); +# ifdef __linux__ + char b[20 + sizeof "/sys/dev/block/" ":" "/../queue/physical_block_size"]; + if (snprintf(b, sizeof b, "/sys/dev/block/%u:%u/queue/physical_block_size", + major(st.st_dev), minor(st.st_dev)) >= + static_cast(sizeof b)) + return false; + int f= open(b, O_RDONLY); + if (f == -1) + { + if (snprintf(b, sizeof b, "/sys/dev/block/%u:%u/../queue/" + "physical_block_size", + major(st.st_dev), minor(st.st_dev)) >= + static_cast(sizeof b)) + return false; + f= open(b, O_RDONLY); + } + unsigned long s= 0; + if (f != -1) + { + ssize_t l= read(f, b, sizeof b); + if (l > 0 && size_t(l) < sizeof b && b[l - 1] == '\n') + { + char *end= b; + s= strtoul(b, &end, 10); + if (b == end || *end != '\n') + s = 0; + } + close(f); + } + if (s > 4096 || s < 64 || !ut_is_2pow(s)) + return false; + log_sys.set_block_size(uint32_t(s)); +# else + constexpr unsigned long s= 4096; +# endif + + return !(st.st_size & (s - 1)); +} +#endif + /** NOTE! Use the corresponding macro os_file_create(), not directly this function! Opens an existing file or creates a new. @@ -1151,8 +1172,7 @@ os_file_create_func( return(OS_FILE_CLOSED); ); - int create_flag; - const char* mode_str __attribute__((unused)); + int create_flag; on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? true : false; @@ -1162,30 +1182,14 @@ os_file_create_func( create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT)); - if (create_mode == OS_FILE_OPEN - || create_mode == OS_FILE_OPEN_RAW - || create_mode == OS_FILE_OPEN_RETRY) { - - mode_str = "OPEN"; - - create_flag = read_only ? 
O_RDONLY : O_RDWR; - - } else if (read_only) { - - mode_str = "OPEN"; - - create_flag = O_RDONLY; - + if (read_only) { + create_flag = O_RDONLY | O_CLOEXEC; + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + create_flag = O_RDWR | O_CLOEXEC; } else if (create_mode == OS_FILE_CREATE) { - - mode_str = "CREATE"; - create_flag = O_RDWR | O_CREAT | O_EXCL; - - } else if (create_mode == OS_FILE_OVERWRITE) { - - mode_str = "OVERWRITE"; - create_flag = O_RDWR | O_CREAT | O_TRUNC; - + create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC; } else { ib::error() << "Unknown file create mode (" << create_mode << ")" @@ -1194,10 +1198,46 @@ os_file_create_func( return(OS_FILE_CLOSED); } -#ifdef HAVE_FCNTL_DIRECT +#ifdef O_DIRECT + struct stat st; ut_a(type == OS_LOG_FILE - || type == OS_DATA_FILE - || type == OS_DATA_FILE_NO_O_DIRECT); + || type == OS_DATA_FILE || type == OS_DATA_FILE_NO_O_DIRECT); + int direct_flag = 0; + + if (type == OS_DATA_FILE) { + switch (srv_file_flush_method) { + case SRV_O_DSYNC: + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: + direct_flag = O_DIRECT; + break; + default: + break; + } +# ifdef __linux__ + } else if (type != OS_LOG_FILE) { + } else if (log_sys.log_buffered) { + skip_o_direct: + os_file_log_buffered(); + } else if (create_mode != OS_FILE_CREATE && !log_sys.is_opened()) { + if (stat(name, &st)) { + if (errno == ENOENT) { + if (on_error_silent) goto not_found; + sql_print_error( + "InnoDB: File %s was not found", name); + goto not_found; + } + goto skip_o_direct; + } + + if (!os_file_log_maybe_unbuffered(st)) { + goto skip_o_direct; + } + + direct_flag = O_DIRECT; + log_sys.log_maybe_unbuffered= true; +# endif + } #else ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE); #endif @@ -1219,9 +1259,26 @@ os_file_create_func( bool retry; do { - file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + file = open(name, create_flag | direct_flag, os_innodb_umask); if 
(file == -1) { +#ifdef O_DIRECT + if (direct_flag && errno == EINVAL) { + direct_flag = 0; +# ifdef __linux__ + if (type == OS_LOG_FILE) { + os_file_log_buffered(); + } +# endif + if (create_mode == OS_FILE_CREATE) { + /* Linux may create the file + before rejecting the O_DIRECT. */ + unlink(name); + } + retry = true; + continue; + } +#endif const char* operation; operation = (create_mode == OS_FILE_CREATE @@ -1243,82 +1300,25 @@ os_file_create_func( } while (retry); if (!*success) { - return file; +#ifdef __linux__ +not_found: +#endif + return OS_FILE_CLOSED; } -#ifdef HAVE_FCNTL_DIRECT - if (type == OS_DATA_FILE) { - switch (srv_file_flush_method) { - case SRV_O_DSYNC: - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: -# ifdef __linux__ -use_o_direct: -# endif - os_file_set_nocache(file, name, mode_str); - break; - default: - break; - } - } -# ifdef __linux__ - else if (type == OS_LOG_FILE && !log_sys.is_opened()) { - struct stat st; - char b[20 + sizeof "/sys/dev/block/" ":" - "/../queue/physical_block_size"]; - int f; - if (fstat(file, &st)) { - goto skip_o_direct; - } - MSAN_STAT_WORKAROUND(&st); - if (snprintf(b, sizeof b, - "/sys/dev/block/%u:%u/queue/physical_block_size", - major(st.st_dev), minor(st.st_dev)) - >= static_cast(sizeof b)) { - goto skip_o_direct; - } - if ((f = open(b, O_RDONLY)) == -1) { - if (snprintf(b, sizeof b, - "/sys/dev/block/%u:%u/../queue/" - "physical_block_size", - major(st.st_dev), minor(st.st_dev)) - >= static_cast(sizeof b)) { - goto skip_o_direct; - } - f = open(b, O_RDONLY); - } - if (f != -1) { - ssize_t l = read(f, b, sizeof b); - unsigned long s = 0; - - if (l > 0 && static_cast(l) < sizeof b - && b[l - 1] == '\n') { - char* end = b; - s = strtoul(b, &end, 10); - if (b == end || *end != '\n') { - s = 0; - } - } - close(f); - if (s > 4096 || s < 64 || !ut_is_2pow(s)) { - goto skip_o_direct; - } - log_sys.log_maybe_unbuffered= true; - log_sys.set_block_size(uint32_t(s)); - if (!log_sys.log_buffered && !(st.st_size & (s - 
1))) { - goto use_o_direct; - } +#ifdef __linux__ + if (!read_only && create_mode == OS_FILE_CREATE + && type == OS_LOG_FILE) { + if (fstat(file, &st) || !os_file_log_maybe_unbuffered(st)) { + os_file_log_buffered(); } else { -skip_o_direct: - log_sys.log_maybe_unbuffered= false; - log_sys.log_buffered= true; - log_sys.set_block_size(512); + close(file); + return os_file_create_func(name, OS_FILE_OPEN, purpose, + type, false, success); } } -# endif #endif -#ifndef _WIN32 if (!read_only && create_mode != OS_FILE_OPEN_RAW && !my_disable_locking @@ -1346,7 +1346,6 @@ skip_o_direct: close(file); file = -1; } -#endif /* !_WIN32 */ return(file); } @@ -1373,38 +1372,24 @@ os_file_create_simple_no_error_handling_func( bool* success) { os_file_t file; - int create_flag; + int create_flag = O_RDONLY | O_CLOEXEC; ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); *success = false; - if (create_mode == OS_FILE_OPEN) { - - if (access_type == OS_FILE_READ_ONLY) { - - create_flag = O_RDONLY; - - } else if (read_only) { - - create_flag = O_RDONLY; - - } else { - + if (read_only) { + } else if (create_mode == OS_FILE_OPEN) { + if (access_type != OS_FILE_READ_ONLY) { ut_a(access_type == OS_FILE_READ_WRITE || access_type == OS_FILE_READ_ALLOW_DELETE); create_flag = O_RDWR; } + } else if (create_mode == OS_FILE_CREATE) { - } else if (read_only) { - - create_flag = O_RDONLY; - - } else if (create_mode == OS_FILE_CREATE) { - - create_flag = O_RDWR | O_CREAT | O_EXCL; + create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC; } else { @@ -1415,11 +1400,10 @@ os_file_create_simple_no_error_handling_func( return(OS_FILE_CLOSED); } - file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + file = open(name, create_flag, os_innodb_umask); *success = (file != -1); -#ifndef _WIN32 if (!read_only && *success && access_type == OS_FILE_READ_WRITE @@ -1431,7 +1415,6 @@ os_file_create_simple_no_error_handling_func( file = -1; } -#endif /* !_WIN32 */ 
return(file); } @@ -1953,23 +1936,6 @@ os_file_create_simple_func( create_flag = CREATE_NEW; - } else if (create_mode == OS_FILE_CREATE_PATH) { - - /* Create subdirs along the path if needed. */ - *success = os_file_create_subdirs_if_needed(name); - - if (!*success) { - - ib::error() - << "Unable to create subdirectories '" - << name << "'"; - - return(OS_FILE_CLOSED); - } - - create_flag = CREATE_NEW; - create_mode = OS_FILE_CREATE; - } else { ib::error() @@ -2158,10 +2124,6 @@ os_file_create_func( create_flag = CREATE_NEW; - } else if (create_mode == OS_FILE_OVERWRITE) { - - create_flag = CREATE_ALWAYS; - } else { ib::error() << "Unknown file create mode (" << create_mode << ") " @@ -3045,36 +3007,6 @@ os_file_handle_error_cond_exit( return(false); } -#ifdef HAVE_FCNTL_DIRECT -/** Tries to disable OS caching on an opened file descriptor. -@param[in] fd file descriptor to alter -@param[in] file_name file name, used in the diagnostic message -@param[in] name "open" or "create"; used in the diagnostic - message */ -void -os_file_set_nocache(int fd, const char *file_name, const char *operation_name) -{ - if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { - int errno_save = errno; - static bool warning_message_printed = false; - if (errno_save == EINVAL) { - if (!warning_message_printed) { - warning_message_printed = true; - ib::info() - << "Setting O_DIRECT on file " - << file_name << " failed"; - } - } else { - ib::warn() - << "Failed to set O_DIRECT on file " - << file_name << "; " << operation_name - << " : " << strerror(errno_save) - << ", continuing anyway."; - } - } -} -#endif /* HAVE_FCNTL_DIRECT */ - /** Check if the file system supports sparse files. 
@param fh file handle @return true if the file system supports sparse files */ diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index f87e724e40c..54d7abb4527 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -4349,9 +4349,7 @@ void row_merge_drop_temp_indexes() UNIV_PFS_IO defined, register the file descriptor with Performance Schema. @param[in] path location for creating temporary merge files, or NULL @return File descriptor */ -pfs_os_file_t -row_merge_file_create_low( - const char* path) +static pfs_os_file_t row_merge_file_create_mode(const char *path, int mode) { if (!path) { path = mysql_tmpdir; @@ -4392,6 +4390,13 @@ row_merge_file_create_low( return(fd); } +/** Create a temporary file at the specified path. +@param path location for creating temporary merge files, or nullptr +@return File descriptor */ +pfs_os_file_t row_merge_file_create_low(const char *path) +{ + return row_merge_file_create_mode(path, O_BINARY | O_SEQUENTIAL); +} /** Create a merge file in the given location. @param[out] merge_file merge file structure @@ -4402,17 +4407,16 @@ row_merge_file_create( merge_file_t* merge_file, const char* path) { - merge_file->fd = row_merge_file_create_low(path); merge_file->offset = 0; merge_file->n_rec = 0; -#ifdef HAVE_FCNTL_DIRECT - if (merge_file->fd != OS_FILE_CLOSED) { - if (srv_disable_sort_file_cache) { - os_file_set_nocache(merge_file->fd, - "row0merge.cc", "sort"); - } - } + merge_file->fd = + row_merge_file_create_mode(path, +#if !defined _WIN32 && defined O_DIRECT + srv_disable_sort_file_cache + ? O_DIRECT | O_BINARY | O_SEQUENTIAL + : #endif + O_BINARY | O_SEQUENTIAL); return(merge_file->fd); }