mirror of
https://github.com/postgres/postgres.git
synced 2025-09-02 04:21:28 +03:00
Add io_direct setting (developer-only).
Provide a way to ask the kernel to use O_DIRECT (or local equivalent) where available for data and WAL files, to avoid or minimize kernel caching. This hurts performance currently and is not intended for end users yet. Later proposed work would introduce our own I/O clustering, read-ahead, etc to replace the facilities the kernel disables with this option. The only user-visible change, if the developer-only GUC is not used, is that this commit also removes the obscure logic that would activate O_DIRECT for the WAL when wal_sync_method=open_[data]sync and wal_level=minimal (which also requires max_wal_senders=0). Those are non-default and unlikely settings, and this behavior wasn't (correctly) documented. The same effect can be achieved with io_direct=wal. Author: Thomas Munro <thomas.munro@gmail.com> Author: Andres Freund <andres@anarazel.de> Author: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg%40mail.gmail.com
This commit is contained in:
@@ -541,8 +541,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
|
||||
* Try to initiate an asynchronous read. This returns false in
|
||||
* recovery if the relation file doesn't exist.
|
||||
*/
|
||||
if (smgrprefetch(smgr_reln, forkNum, blockNum))
|
||||
if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
|
||||
smgrprefetch(smgr_reln, forkNum, blockNum))
|
||||
{
|
||||
result.initiated_io = true;
|
||||
}
|
||||
#endif /* USE_PREFETCH */
|
||||
}
|
||||
else
|
||||
@@ -588,11 +591,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
|
||||
* the kernel and therefore didn't really initiate I/O, and no way to know when
|
||||
* the I/O completes other than using synchronous ReadBuffer().
|
||||
*
|
||||
* 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
|
||||
* 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
|
||||
* USE_PREFETCH is not defined (this build doesn't support prefetching due to
|
||||
* lack of a kernel facility), or the underlying relation file wasn't found and
|
||||
* we are in recovery. (If the relation file wasn't found and we are not in
|
||||
* recovery, an error is raised).
|
||||
* lack of a kernel facility), direct I/O is enabled, or the underlying
|
||||
* relation file wasn't found and we are in recovery. (If the relation file
|
||||
* wasn't found and we are not in recovery, an error is raised).
|
||||
*/
|
||||
PrefetchBufferResult
|
||||
PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
|
||||
@@ -5440,6 +5443,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
|
||||
{
|
||||
PendingWriteback *pending;
|
||||
|
||||
if (io_direct_flags & IO_DIRECT_DATA)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Add buffer to the pending writeback array, unless writeback control is
|
||||
* disabled.
|
||||
|
@@ -92,8 +92,11 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
|
||||
{
|
||||
#ifdef USE_PREFETCH
|
||||
/* Not in buffers, so initiate prefetch */
|
||||
smgrprefetch(smgr, forkNum, blockNum);
|
||||
result.initiated_io = true;
|
||||
if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
|
||||
smgrprefetch(smgr, forkNum, blockNum))
|
||||
{
|
||||
result.initiated_io = true;
|
||||
}
|
||||
#endif /* USE_PREFETCH */
|
||||
}
|
||||
|
||||
|
@@ -98,7 +98,9 @@
|
||||
#include "storage/fd.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "utils/guc.h"
|
||||
#include "utils/guc_hooks.h"
|
||||
#include "utils/resowner_private.h"
|
||||
#include "utils/varlena.h"
|
||||
|
||||
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
|
||||
#if defined(HAVE_SYNC_FILE_RANGE)
|
||||
@@ -162,6 +164,9 @@ bool data_sync_retry = false;
|
||||
/* How SyncDataDirectory() should do its job. */
|
||||
int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
|
||||
|
||||
/* Which kinds of files should be opened with PG_O_DIRECT. */
|
||||
int io_direct_flags;
|
||||
|
||||
/* Debugging.... */
|
||||
|
||||
#ifdef FDDEBUG
|
||||
@@ -2022,6 +2027,9 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
|
||||
if (nbytes <= 0)
|
||||
return;
|
||||
|
||||
if (VfdCache[file].fileFlags & PG_O_DIRECT)
|
||||
return;
|
||||
|
||||
returnCode = FileAccess(file);
|
||||
if (returnCode < 0)
|
||||
return;
|
||||
@@ -3826,3 +3834,93 @@ data_sync_elevel(int elevel)
|
||||
{
|
||||
return data_sync_retry ? elevel : PANIC;
|
||||
}
|
||||
|
||||
bool
|
||||
check_io_direct(char **newval, void **extra, GucSource source)
|
||||
{
|
||||
bool result = true;
|
||||
int flags;
|
||||
|
||||
#if PG_O_DIRECT == 0
|
||||
if (strcmp(*newval, "") != 0)
|
||||
{
|
||||
GUC_check_errdetail("io_direct is not supported on this platform.");
|
||||
result = false;
|
||||
}
|
||||
flags = 0;
|
||||
#else
|
||||
List *elemlist;
|
||||
ListCell *l;
|
||||
char *rawstring;
|
||||
|
||||
/* Need a modifiable copy of string */
|
||||
rawstring = pstrdup(*newval);
|
||||
|
||||
if (!SplitGUCList(rawstring, ',', &elemlist))
|
||||
{
|
||||
GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
|
||||
"io_direct");
|
||||
pfree(rawstring);
|
||||
list_free(elemlist);
|
||||
return false;
|
||||
}
|
||||
|
||||
flags = 0;
|
||||
foreach(l, elemlist)
|
||||
{
|
||||
char *item = (char *) lfirst(l);
|
||||
|
||||
if (pg_strcasecmp(item, "data") == 0)
|
||||
flags |= IO_DIRECT_DATA;
|
||||
else if (pg_strcasecmp(item, "wal") == 0)
|
||||
flags |= IO_DIRECT_WAL;
|
||||
else if (pg_strcasecmp(item, "wal_init") == 0)
|
||||
flags |= IO_DIRECT_WAL_INIT;
|
||||
else
|
||||
{
|
||||
GUC_check_errdetail("invalid option \"%s\"", item);
|
||||
result = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* It's possible to configure block sizes smaller than our assumed I/O
|
||||
* alignment size, which could result in invalid I/O requests.
|
||||
*/
|
||||
#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
|
||||
if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
|
||||
{
|
||||
GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
|
||||
result = false;
|
||||
}
|
||||
#endif
|
||||
#if BLCKSZ < PG_IO_ALIGN_SIZE
|
||||
if (result && (flags & IO_DIRECT_DATA))
|
||||
{
|
||||
GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small");
|
||||
result = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
pfree(rawstring);
|
||||
list_free(elemlist);
|
||||
#endif
|
||||
|
||||
if (!result)
|
||||
return result;
|
||||
|
||||
/* Save the flags in *extra, for use by assign_io_direct */
|
||||
*extra = guc_malloc(ERROR, sizeof(int));
|
||||
*((int *) *extra) = flags;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
extern void
|
||||
assign_io_direct(const char *newval, void *extra)
|
||||
{
|
||||
int *flags = (int *) extra;
|
||||
|
||||
io_direct_flags = *flags;
|
||||
}
|
||||
|
@@ -142,6 +142,16 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
|
||||
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
|
||||
MdfdVec *seg);
|
||||
|
||||
static inline int
|
||||
_mdfd_open_flags(void)
|
||||
{
|
||||
int flags = O_RDWR | PG_BINARY;
|
||||
|
||||
if (io_direct_flags & IO_DIRECT_DATA)
|
||||
flags |= PG_O_DIRECT;
|
||||
|
||||
return flags;
|
||||
}
|
||||
|
||||
/*
|
||||
* mdinit() -- Initialize private state for magnetic disk storage manager.
|
||||
@@ -205,14 +215,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
||||
|
||||
path = relpath(reln->smgr_rlocator, forknum);
|
||||
|
||||
fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
|
||||
fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
|
||||
|
||||
if (fd < 0)
|
||||
{
|
||||
int save_errno = errno;
|
||||
|
||||
if (isRedo)
|
||||
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
|
||||
fd = PathNameOpenFile(path, _mdfd_open_flags());
|
||||
if (fd < 0)
|
||||
{
|
||||
/* be sure to report the error reported by create, not open */
|
||||
@@ -635,7 +645,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
|
||||
|
||||
path = relpath(reln->smgr_rlocator, forknum);
|
||||
|
||||
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
|
||||
fd = PathNameOpenFile(path, _mdfd_open_flags());
|
||||
|
||||
if (fd < 0)
|
||||
{
|
||||
@@ -706,6 +716,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
off_t seekpos;
|
||||
MdfdVec *v;
|
||||
|
||||
Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
|
||||
|
||||
v = _mdfd_getseg(reln, forknum, blocknum, false,
|
||||
InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
|
||||
if (v == NULL)
|
||||
@@ -731,6 +743,8 @@ void
|
||||
mdwriteback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks)
|
||||
{
|
||||
Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
|
||||
|
||||
/*
|
||||
* Issue flush requests in as few requests as possible; have to split at
|
||||
* segment boundaries though, since those are actually separate files.
|
||||
@@ -1335,7 +1349,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
|
||||
fullpath = _mdfd_segpath(reln, forknum, segno);
|
||||
|
||||
/* open the file */
|
||||
fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
|
||||
fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
|
||||
|
||||
pfree(fullpath);
|
||||
|
||||
@@ -1546,7 +1560,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
|
||||
strlcpy(path, p, MAXPGPATH);
|
||||
pfree(p);
|
||||
|
||||
file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
|
||||
file = PathNameOpenFile(path, _mdfd_open_flags());
|
||||
if (file < 0)
|
||||
return -1;
|
||||
need_to_close = true;
|
||||
|
@@ -20,6 +20,7 @@
|
||||
#include "access/xlogutils.h"
|
||||
#include "lib/ilist.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/fd.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/md.h"
|
||||
#include "storage/smgr.h"
|
||||
|
Reference in New Issue
Block a user