From cccc6cdeb32f010f1cf777a9e9a85344a4317ab8 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 6 Sep 2023 16:27:00 -0700 Subject: [PATCH] Add support for syncfs() in frontend support functions. This commit adds support for using syncfs() in fsync_pgdata() and fsync_dir_recurse() (which have been renamed to sync_pgdata() and sync_dir_recurse()). Like recovery_init_sync_method, sync_pgdata() calls syncfs() for the data directory, each tablespace, and pg_wal (if it is a symlink). For now, all of the frontend utilities that use these support functions are hard-coded to use fsync(), but a follow-up commit will allow specifying syncfs(). Co-authored-by: Justin Pryzby Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/20210930004340.GM831%40telsasoft.com --- src/bin/initdb/initdb.c | 7 +- src/bin/pg_basebackup/pg_basebackup.c | 5 +- src/bin/pg_checksums/pg_checksums.c | 3 +- src/bin/pg_dump/pg_backup.h | 4 +- src/bin/pg_dump/pg_backup_archiver.c | 14 +- src/bin/pg_dump/pg_backup_archiver.h | 1 + src/bin/pg_dump/pg_backup_directory.c | 2 +- src/bin/pg_dump/pg_dump.c | 3 +- src/bin/pg_rewind/file_ops.c | 8 +- src/bin/pg_rewind/pg_rewind.c | 1 + src/bin/pg_rewind/pg_rewind.h | 2 + src/common/file_utils.c | 191 +++++++++++++++++++++----- src/include/common/file_utils.h | 5 +- 13 files changed, 190 insertions(+), 56 deletions(-) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 905b979947f..51198e66655 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -165,6 +165,7 @@ static bool show_setting = false; static bool data_checksums = false; static char *xlog_dir = NULL; static int wal_segment_size_mb = (DEFAULT_XLOG_SEG_SIZE) / (1024 * 1024); +static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; /* internal vars */ @@ -3322,7 +3323,7 @@ main(int argc, char *argv[]) atexit(cleanup_directories_atexit); - /* If we only need to fsync, just do it and exit */ + /* If we only need to sync, just do it and exit */ if (sync_only) { setup_pgdata(); @@ -3333,7 +3334,7 @@ main(int argc, char *argv[]) fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - fsync_pgdata(pg_data, PG_VERSION_NUM); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); check_ok(); return 0; } @@ -3396,7 +3397,7 @@ main(int argc, char *argv[]) { fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - fsync_pgdata(pg_data, PG_VERSION_NUM); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); check_ok(); } else diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 1dc8efe0cb7..e9033af5c03 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -148,6 +148,7 @@ static bool verify_checksums = true; static bool manifest = true; static bool manifest_force_encode = false; static char *manifest_checksums = NULL; +static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; static bool success = false; static bool made_new_pgdata = false; @@ -2199,11 +2200,11 @@ BaseBackup(char *compression_algorithm, char *compression_detail, if (format == 't') { if (strcmp(basedir, "-") != 0) - (void) fsync_dir_recurse(basedir); + (void) sync_dir_recurse(basedir, sync_method); } else { - (void) fsync_pgdata(basedir, serverVersion); + (void) sync_pgdata(basedir, serverVersion, sync_method); } } diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 9011a19b4ef..836ee654059 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -44,6 +44,7 @@ static char *only_filenode = NULL; static bool do_sync = true; static bool verbose = false; static bool showprogress = false; +static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; typedef enum { @@ -623,7 +624,7 @@ main(int argc, char *argv[]) if (do_sync) { pg_log_info("syncing data directory"); - fsync_pgdata(DataDir, PG_VERSION_NUM); + sync_pgdata(DataDir, PG_VERSION_NUM, sync_method); } pg_log_info("updating control file"); diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index aba780ef4b1..3a57cdd97d4 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -24,6 +24,7 @@ #define PG_BACKUP_H #include "common/compression.h" +#include "common/file_utils.h" #include "fe_utils/simple_list.h" #include "libpq-fe.h" @@ -307,7 +308,8 @@ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); extern Archive *CreateArchive(const char *FileSpec, const ArchiveFormat fmt, const pg_compress_specification compression_spec, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupDumpWorker); + SetupWorkerPtrType setupDumpWorker, + DataDirSyncMethod sync_method); /* The --list option */ extern void PrintTOCSummary(Archive *AHX); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 39ebcfec326..4d83381d840 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -66,7 +66,8 @@ typedef struct _parallelReadyList static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt, const pg_compress_specification compression_spec, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupWorkerPtr); + SetupWorkerPtrType setupWorkerPtr, + DataDirSyncMethod sync_method); static void _getObjectDescription(PQExpBuffer buf, const TocEntry *te); static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, bool isData); static char *sanitize_line(const char *str, bool want_hyphen); @@ -238,11 +239,12 @@ Archive * CreateArchive(const char *FileSpec, const ArchiveFormat fmt, const pg_compress_specification compression_spec, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupDumpWorker) + SetupWorkerPtrType setupDumpWorker, + DataDirSyncMethod sync_method) { ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression_spec, - dosync, mode, setupDumpWorker); + dosync, mode, setupDumpWorker, sync_method); return (Archive *) AH; } @@ -257,7 +259,8 @@ OpenArchive(const char *FileSpec, const ArchiveFormat fmt) compression_spec.algorithm = PG_COMPRESSION_NONE; AH = _allocAH(FileSpec, fmt, compression_spec, true, - archModeRead, setupRestoreWorker); + archModeRead, setupRestoreWorker, + DATA_DIR_SYNC_METHOD_FSYNC); return (Archive *) AH; } @@ -2233,7 +2236,7 @@ static ArchiveHandle * _allocAH(const char *FileSpec, const ArchiveFormat fmt, const pg_compress_specification compression_spec, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupWorkerPtr) + SetupWorkerPtrType setupWorkerPtr, DataDirSyncMethod sync_method) { ArchiveHandle *AH; CompressFileHandle *CFH; @@ -2287,6 +2290,7 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, AH->mode = mode; AH->compression_spec = compression_spec; AH->dosync = dosync; + AH->sync_method = sync_method; memset(&(AH->sqlparse), 0, sizeof(AH->sqlparse)); diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index 18b38c17abc..b07673933d4 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -312,6 +312,7 @@ struct _archiveHandle pg_compress_specification compression_spec; /* Requested specification for * compression */ bool dosync; /* data requested to be synced on sight */ + DataDirSyncMethod sync_method; ArchiveMode mode; /* File mode - r or w */ void *formatData; /* Header data specific to file format */ diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c index 7f2ac7c7fd1..679c60420bd 100644 --- a/src/bin/pg_dump/pg_backup_directory.c +++ b/src/bin/pg_dump/pg_backup_directory.c @@ -613,7 +613,7 @@ _CloseArchive(ArchiveHandle *AH) * individually. Just recurse once through all the files generated. */ if (AH->dosync) - fsync_dir_recurse(ctx->directory); + sync_dir_recurse(ctx->directory, AH->sync_method); } AH->FH = NULL; } diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index cebd2400fd1..280662bc332 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -357,6 +357,7 @@ main(int argc, char **argv) char *compression_algorithm_str = "none"; char *error_detail = NULL; bool user_compression_defined = false; + DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; static DumpOptions dopt; @@ -777,7 +778,7 @@ main(int argc, char **argv) /* Open the output file */ fout = CreateArchive(filename, archiveFormat, compression_spec, - dosync, archiveMode, setupDumpWorker); + dosync, archiveMode, setupDumpWorker, sync_method); /* Make dump options accessible right away */ SetArchiveOptions(fout, &dopt, NULL); diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index 25996b4da47..dd226859329 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -286,9 +286,9 @@ remove_target_symlink(const char *path) * * We do this once, for the whole data directory, for performance reasons. At * the end of pg_rewind's run, the kernel is likely to already have flushed - * most dirty buffers to disk. Additionally fsync_pgdata uses a two-pass - * approach (only initiating writeback in the first pass), which often reduces - * the overall amount of IO noticeably. + * most dirty buffers to disk. Additionally sync_pgdata uses a two-pass + * approach when fsync is specified (only initiating writeback in the first + * pass), which often reduces the overall amount of IO noticeably. */ void sync_target_dir(void) @@ -296,7 +296,7 @@ sync_target_dir(void) if (!do_sync || dry_run) return; - fsync_pgdata(datadir_target, PG_VERSION_NUM); + sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method); } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 7f69f024412..bdfacf32632 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -74,6 +74,7 @@ bool showprogress = false; bool dry_run = false; bool do_sync = true; bool restore_wal = false; +DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; /* Target history */ TimeLineHistoryEntry *targetHistory; diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index ef8bdc1fbb8..05729adfef3 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -13,6 +13,7 @@ #include "access/timeline.h" #include "common/logging.h" +#include "common/file_utils.h" #include "datapagemap.h" #include "libpq-fe.h" #include "storage/block.h" @@ -24,6 +25,7 @@ extern bool showprogress; extern bool dry_run; extern bool do_sync; extern int WalSegSz; +extern DataDirSyncMethod sync_method; /* Target history */ extern TimeLineHistoryEntry *targetHistory; diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 74833c4acbb..abe5129412d 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -51,19 +51,52 @@ static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), bool process_symlinks); +#ifdef HAVE_SYNCFS + /* - * Issue fsync recursively on PGDATA and all its contents. + * do_syncfs -- Try to syncfs a file system * - * We fsync regular files and directories wherever they are, but we follow + * Reports errors trying to open the path. syncfs() errors are fatal. + */ +static void +do_syncfs(const char *path) +{ + int fd; + + fd = open(path, O_RDONLY, 0); + + if (fd < 0) + { + pg_log_error("could not open file \"%s\": %m", path); + return; + } + + if (syncfs(fd) < 0) + { + pg_log_error("could not synchronize file system for file \"%s\": %m", path); + (void) close(fd); + exit(EXIT_FAILURE); + } + + (void) close(fd); +} + +#endif /* HAVE_SYNCFS */ + +/* + * Synchronize PGDATA and all its contents. + * + * We sync regular files and directories wherever they are, but we follow * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc. * Other symlinks are presumed to point at files we're not responsible for - * fsyncing, and might not have privileges to write at all. + * syncing, and might not have privileges to write at all. * - * serverVersion indicates the version of the server to be fsync'd. + * serverVersion indicates the version of the server to be sync'd. */ void -fsync_pgdata(const char *pg_data, - int serverVersion) +sync_pgdata(const char *pg_data, + int serverVersion, + DataDirSyncMethod sync_method) { bool xlog_is_symlink; char pg_wal[MAXPGPATH]; @@ -89,49 +122,135 @@ fsync_pgdata(const char *pg_data, xlog_is_symlink = true; } - /* - * If possible, hint to the kernel that we're soon going to fsync the data - * directory and its contents. - */ + switch (sync_method) + { + case DATA_DIR_SYNC_METHOD_SYNCFS: + { +#ifndef HAVE_SYNCFS + pg_log_error("this build does not support sync method \"%s\"", + "syncfs"); + exit(EXIT_FAILURE); +#else + DIR *dir; + struct dirent *de; + + /* + * On Linux, we don't have to open every single file one by + * one. We can use syncfs() to sync whole filesystems. We + * only expect filesystem boundaries to exist where we + * tolerate symlinks, namely pg_wal and the tablespaces, so we + * call syncfs() for each of those directories. + */ + + /* Sync the top level pgdata directory. */ + do_syncfs(pg_data); + + /* If any tablespaces are configured, sync each of those. */ + dir = opendir(pg_tblspc); + if (dir == NULL) + pg_log_error("could not open directory \"%s\": %m", + pg_tblspc); + else + { + while (errno = 0, (de = readdir(dir)) != NULL) + { + char subpath[MAXPGPATH * 2]; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, sizeof(subpath), "%s/%s", + pg_tblspc, de->d_name); + do_syncfs(subpath); + } + + if (errno) + pg_log_error("could not read directory \"%s\": %m", + pg_tblspc); + + (void) closedir(dir); + } + + /* If pg_wal is a symlink, process that too. */ + if (xlog_is_symlink) + do_syncfs(pg_wal); +#endif /* HAVE_SYNCFS */ + } + break; + + case DATA_DIR_SYNC_METHOD_FSYNC: + { + /* + * If possible, hint to the kernel that we're soon going to + * fsync the data directory and its contents. + */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(pg_data, pre_sync_fname, false); - if (xlog_is_symlink) - walkdir(pg_wal, pre_sync_fname, false); - walkdir(pg_tblspc, pre_sync_fname, true); + walkdir(pg_data, pre_sync_fname, false); + if (xlog_is_symlink) + walkdir(pg_wal, pre_sync_fname, false); + walkdir(pg_tblspc, pre_sync_fname, true); #endif - /* - * Now we do the fsync()s in the same order. - * - * The main call ignores symlinks, so in addition to specially processing - * pg_wal if it's a symlink, pg_tblspc has to be visited separately with - * process_symlinks = true. Note that if there are any plain directories - * in pg_tblspc, they'll get fsync'd twice. That's not an expected case - * so we don't worry about optimizing it. - */ - walkdir(pg_data, fsync_fname, false); - if (xlog_is_symlink) - walkdir(pg_wal, fsync_fname, false); - walkdir(pg_tblspc, fsync_fname, true); + /* + * Now we do the fsync()s in the same order. + * + * The main call ignores symlinks, so in addition to specially + * processing pg_wal if it's a symlink, pg_tblspc has to be + * visited separately with process_symlinks = true. Note that + * if there are any plain directories in pg_tblspc, they'll + * get fsync'd twice. That's not an expected case so we don't + * worry about optimizing it. + */ + walkdir(pg_data, fsync_fname, false); + if (xlog_is_symlink) + walkdir(pg_wal, fsync_fname, false); + walkdir(pg_tblspc, fsync_fname, true); + } + break; + } } /* - * Issue fsync recursively on the given directory and all its contents. + * Synchronize the given directory and all its contents. * - * This is a convenient wrapper on top of walkdir(). + * This is a convenient wrapper on top of walkdir() and do_syncfs(). */ void -fsync_dir_recurse(const char *dir) +sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) { - /* - * If possible, hint to the kernel that we're soon going to fsync the data - * directory and its contents. - */ + switch (sync_method) + { + case DATA_DIR_SYNC_METHOD_SYNCFS: + { +#ifndef HAVE_SYNCFS + pg_log_error("this build does not support sync method \"%s\"", + "syncfs"); + exit(EXIT_FAILURE); +#else + /* + * On Linux, we don't have to open every single file one by + * one. We can use syncfs() to sync the whole filesystem. + */ + do_syncfs(dir); +#endif /* HAVE_SYNCFS */ + } + break; + + case DATA_DIR_SYNC_METHOD_FSYNC: + { + /* + * If possible, hint to the kernel that we're soon going to + * fsync the data directory and its contents. + */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(dir, pre_sync_fname, false); + walkdir(dir, pre_sync_fname, false); #endif - walkdir(dir, fsync_fname, false); + walkdir(dir, fsync_fname, false); + } + break; + } } /* diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 7da21f15e6e..49d523e611f 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -34,8 +34,9 @@ struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND extern int fsync_fname(const char *fname, bool isdir); -extern void fsync_pgdata(const char *pg_data, int serverVersion); -extern void fsync_dir_recurse(const char *dir); +extern void sync_pgdata(const char *pg_data, int serverVersion, + DataDirSyncMethod sync_method); +extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); #endif