|
|
|
@ -306,7 +306,10 @@ static void walkdir(const char *path,
|
|
|
|
|
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
/* walkdir() callback; forwards to fsync_fname_ext() with ignore_perm = true */
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);

/* Both of these return 0 on success and -1 on failure. */
static int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int	fsync_parent_path(const char *fname, int elevel);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
@ -413,54 +416,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
|
|
|
|
|
* indicate the OS just doesn't allow/require fsyncing directories.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
fsync_fname(char *fname, bool isdir)
|
|
|
|
|
fsync_fname(const char *fname, bool isdir)
|
|
|
|
|
{
|
|
|
|
|
int fd;
|
|
|
|
|
int returncode;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Some OSs require directories to be opened read-only whereas other
|
|
|
|
|
* systems don't allow us to fsync files opened read-only; so we need both
|
|
|
|
|
* cases here
|
|
|
|
|
*/
|
|
|
|
|
if (!isdir)
|
|
|
|
|
fd = OpenTransientFile(fname,
|
|
|
|
|
O_RDWR | PG_BINARY,
|
|
|
|
|
S_IRUSR | S_IWUSR);
|
|
|
|
|
else
|
|
|
|
|
fd = OpenTransientFile(fname,
|
|
|
|
|
O_RDONLY | PG_BINARY,
|
|
|
|
|
S_IRUSR | S_IWUSR);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Some OSs don't allow us to open directories at all (Windows returns
|
|
|
|
|
* EACCES)
|
|
|
|
|
*/
|
|
|
|
|
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
else if (fd < 0)
|
|
|
|
|
ereport(ERROR,
|
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
|
errmsg("could not open file \"%s\": %m", fname)));
|
|
|
|
|
|
|
|
|
|
returncode = pg_fsync(fd);
|
|
|
|
|
|
|
|
|
|
/* Some OSs don't allow us to fsync directories at all */
|
|
|
|
|
if (returncode != 0 && isdir && errno == EBADF)
|
|
|
|
|
{
|
|
|
|
|
CloseTransientFile(fd);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (returncode != 0)
|
|
|
|
|
ereport(ERROR,
|
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
|
errmsg("could not fsync file \"%s\": %m", fname)));
|
|
|
|
|
|
|
|
|
|
CloseTransientFile(fd);
|
|
|
|
|
fsync_fname_ext(fname, isdir, false, ERROR);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
|
|
|
|
|
*
|
|
|
|
|
* This routine ensures that, after returning, the effect of renaming file
|
|
|
|
|
* persists in case of a crash. A crash while this routine is running will
|
|
|
|
|
* leave you with either the pre-existing or the moved file in place of the
|
|
|
|
|
* new file; no mixed state or truncated files are possible.
|
|
|
|
|
*
|
|
|
|
|
* It does so by using fsync on the old filename and the possibly existing
|
|
|
|
|
* target filename before the rename, and the target file and directory after.
|
|
|
|
|
*
|
|
|
|
|
* Note that rename() cannot be used across arbitrary directories, as they
|
|
|
|
|
* might not be on the same filesystem. Therefore this routine does not
|
|
|
|
|
* support renaming across directories.
|
|
|
|
|
*
|
|
|
|
|
* Log errors with the caller specified severity.
|
|
|
|
|
*
|
|
|
|
|
* Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
|
|
|
|
|
* valid upon return.
|
|
|
|
|
*/
|
|
|
|
|
int
|
|
|
|
|
durable_rename(const char *oldfile, const char *newfile, int elevel)
|
|
|
|
|
{
|
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* First fsync the old and target path (if it exists), to ensure that they
|
|
|
|
|
* are properly persistent on disk. Syncing the target file is not
|
|
|
|
|
* strictly necessary, but it makes it easier to reason about crashes;
|
|
|
|
|
* because it's then guaranteed that either source or target file exists
|
|
|
|
|
* after a crash.
|
|
|
|
|
*/
|
|
|
|
|
if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
|
|
|
|
|
if (fd < 0)
|
|
|
|
|
{
|
|
|
|
|
if (errno != ENOENT)
|
|
|
|
|
{
|
|
|
|
|
ereport(elevel,
|
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
|
errmsg("could not open file \"%s\": %m", newfile)));
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
if (pg_fsync(fd) != 0)
|
|
|
|
|
{
|
|
|
|
|
int save_errno;
|
|
|
|
|
|
|
|
|
|
/* close file upon error, might not be in transaction context */
|
|
|
|
|
save_errno = errno;
|
|
|
|
|
CloseTransientFile(fd);
|
|
|
|
|
errno = save_errno;
|
|
|
|
|
|
|
|
|
|
ereport(elevel,
|
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
|
errmsg("could not fsync file \"%s\": %m", newfile)));
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
CloseTransientFile(fd);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Time to do the real deal... */
|
|
|
|
|
if (rename(oldfile, newfile) < 0)
|
|
|
|
|
{
|
|
|
|
|
ereport(elevel,
|
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
|
errmsg("could not rename file \"%s\" to \"%s\": %m",
|
|
|
|
|
oldfile, newfile)));
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* To guarantee renaming the file is persistent, fsync the file with its
|
|
|
|
|
* new name, and its containing directory.
|
|
|
|
|
*/
|
|
|
|
|
if (fsync_fname_ext(newfile, false, false, elevel) != 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
if (fsync_parent_path(newfile, elevel) != 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * durable_link_or_rename -- move a file into place durably, preferring not
 * to clobber an existing target.
 *
 * Works like durable_rename(), except that it tries (without guaranteeing
 * it) to avoid overwriting the target file.
 *
 * A crash at an unlucky moment can leave two links to the target file
 * behind.
 *
 * Every failure is logged at the severity given in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Flush the source first, so that a crash directly after the link/rename
	 * still leaves a file with valid contents in place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel))
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/* The result of removing the old name is not examined. */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * For the change to survive an OS crash, both the new entry and its
	 * parent directory have to be flushed, in that order.  The parent is
	 * only attempted once the file itself was synced successfully.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) ||
		fsync_parent_path(newfile, elevel))
		return -1;

	return 0;
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* InitFileAccess --- initialize this module during backend startup
|
|
|
|
@ -2581,10 +2688,10 @@ SyncDataDirectory(void)
|
|
|
|
|
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
|
|
|
|
|
* so we don't worry about optimizing it.
|
|
|
|
|
*/
|
|
|
|
|
walkdir(".", fsync_fname_ext, false, LOG);
|
|
|
|
|
walkdir(".", datadir_fsync_fname, false, LOG);
|
|
|
|
|
if (xlog_is_symlink)
|
|
|
|
|
walkdir("pg_xlog", fsync_fname_ext, false, LOG);
|
|
|
|
|
walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
|
|
|
|
|
walkdir("pg_xlog", datadir_fsync_fname, false, LOG);
|
|
|
|
|
walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
@ -2698,15 +2805,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
|
|
|
|
|
|
|
|
|
|
#endif /* PG_FLUSH_DATA_WORKS */
|
|
|
|
|
|
|
|
|
|
/*
 * datadir_fsync_fname -- fsync one file or directory, tolerating unreadable
 * entries
 *
 * Adapter with a (fname, isdir, elevel) signature that forwards to
 * fsync_fname_ext().  Whatever fsync_fname_ext() returns is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be ignored silently here; request
	 * that from fsync_fname_ext() by passing ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* fsync_fname_ext -- Try to fsync a file or directory
|
|
|
|
|
*
|
|
|
|
|
* Ignores errors trying to open unreadable files, or trying to fsync
|
|
|
|
|
* directories on systems where that isn't allowed/required, and logs other
|
|
|
|
|
* errors at a caller-specified level.
|
|
|
|
|
* If ignore_perm is true, ignore errors upon trying to open unreadable
|
|
|
|
|
* files. Logs other errors at a caller-specified level.
|
|
|
|
|
*
|
|
|
|
|
* Returns 0 if the operation succeeded, -1 otherwise.
|
|
|
|
|
*/
|
|
|
|
|
static void
|
|
|
|
|
fsync_fname_ext(const char *fname, bool isdir, int elevel)
|
|
|
|
|
static int
|
|
|
|
|
fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
|
|
|
|
|
{
|
|
|
|
|
int fd;
|
|
|
|
|
int flags;
|
|
|
|
@ -2724,20 +2842,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
|
|
|
|
|
else
|
|
|
|
|
flags |= O_RDONLY;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Open the file, silently ignoring errors about unreadable files (or
|
|
|
|
|
* unsupported operations, e.g. opening a directory under Windows), and
|
|
|
|
|
* logging others.
|
|
|
|
|
*/
|
|
|
|
|
fd = OpenTransientFile((char *) fname, flags, 0);
|
|
|
|
|
if (fd < 0)
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Some OSs don't allow us to open directories at all (Windows returns
|
|
|
|
|
* EACCES), just ignore the error in that case. If desired, also silently
|
|
|
|
|
* ignore errors about unreadable files. Log others.
|
|
|
|
|
*/
|
|
|
|
|
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
|
|
|
|
|
return 0;
|
|
|
|
|
else if (fd < 0 && ignore_perm && errno == EACCES)
|
|
|
|
|
return 0;
|
|
|
|
|
else if (fd < 0)
|
|
|
|
|
{
|
|
|
|
|
if (errno == EACCES || (isdir && errno == EISDIR))
|
|
|
|
|
return;
|
|
|
|
|
ereport(elevel,
|
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
|
errmsg("could not open file \"%s\": %m", fname)));
|
|
|
|
|
return;
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
returncode = pg_fsync(fd);
|
|
|
|
@ -2747,9 +2868,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
|
|
|
|
|
* those errors. Anything else needs to be logged.
|
|
|
|
|
*/
|
|
|
|
|
if (returncode != 0 && !(isdir && errno == EBADF))
|
|
|
|
|
{
|
|
|
|
|
int save_errno;
|
|
|
|
|
|
|
|
|
|
/* close file upon error, might not be in transaction context */
|
|
|
|
|
save_errno = errno;
|
|
|
|
|
(void) CloseTransientFile(fd);
|
|
|
|
|
errno = save_errno;
|
|
|
|
|
|
|
|
|
|
ereport(elevel,
|
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
|
errmsg("could not fsync file \"%s\": %m", fname)));
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
(void) CloseTransientFile(fd);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* fsync_parent_path -- fsync the parent path of a file or directory
|
|
|
|
|
*
|
|
|
|
|
* This is aimed at making file operations persistent on disk in case of
|
|
|
|
|
* an OS crash or power failure.
|
|
|
|
|
*/
|
|
|
|
|
static int
|
|
|
|
|
fsync_parent_path(const char *fname, int elevel)
|
|
|
|
|
{
|
|
|
|
|
char parentpath[MAXPGPATH];
|
|
|
|
|
|
|
|
|
|
strlcpy(parentpath, fname, MAXPGPATH);
|
|
|
|
|
get_parent_directory(parentpath);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* get_parent_directory() returns an empty string if the input argument is
|
|
|
|
|
* just a file name (see comments in path.c), so handle that as being the
|
|
|
|
|
* current directory.
|
|
|
|
|
*/
|
|
|
|
|
if (strlen(parentpath) == 0)
|
|
|
|
|
strlcpy(parentpath, ".", MAXPGPATH);
|
|
|
|
|
|
|
|
|
|
if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|