1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-27 07:42:10 +03:00

Use pg_pread() and pg_pwrite() for data files and WAL.

Cut down on system calls by doing random I/O using offset-based OS
routines where available.  Remove the code for tracking the 'virtual'
seek position.  The only reason left to call FileSeek() was to get
the file's size, so provide a new function FileSize() instead.

Author: Oskari Saarenmaa, Thomas Munro
Reviewed-by: Thomas Munro, Jesper Pedersen, Tom Lane, Alvaro Herrera
Discussion: https://postgr.es/m/CAEepm=02rapCpPR3ZGF2vW=SBHSdFYO_bz_f-wwWJonmA3APgw@mail.gmail.com
Discussion: https://postgr.es/m/b8748d39-0b19-0514-a1b9-4e5a28e6a208%40gmail.com
Discussion: https://postgr.es/m/a86bd200-ebbe-d829-e3ca-0c4474b2fcb7%40ohmu.fi
This commit is contained in:
Thomas Munro
2018-11-07 09:51:50 +13:00
parent 3fd2a7932e
commit c24dcd0cfd
6 changed files with 42 additions and 288 deletions

View File

@@ -16,8 +16,8 @@
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
* single process can have. (This is around 1024 on many modern
* operating systems, but may be lower on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a routine is
@@ -167,15 +167,6 @@ int max_safe_fds = 32; /* default if not changed */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
/*
* Note: a VFD's seekPos is normally always valid, but if for some reason
* an lseek() fails, it might become set to FileUnknownPos. We can struggle
* along without knowing the seek position in many cases, but in some places
* we have to fail if we don't have it.
*/
#define FileUnknownPos ((off_t) -1)
#define FilePosIsUnknown(pos) ((pos) < 0)
/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
@@ -189,7 +180,6 @@ typedef struct vfd
File nextFree; /* link to next free VFD, if in freelist */
File lruMoreRecently; /* doubly linked recency-of-use list */
File lruLessRecently;
off_t seekPos; /* current logical file position, or -1 */
off_t fileSize; /* current size of file (0 if not temporary) */
char *fileName; /* name of file, or NULL for unused VFD */
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
@@ -407,9 +397,7 @@ pg_fdatasync(int fd)
/*
* pg_flush_data --- advise OS that the described dirty data should be flushed
*
* offset of 0 with nbytes 0 means that the entire file should be flushed;
* in this case, this function may have side-effects on the file's
* seek position!
* offset of 0 with nbytes 0 means that the entire file should be flushed
*/
void
pg_flush_data(int fd, off_t offset, off_t nbytes)
@@ -1029,22 +1017,6 @@ LruDelete(File file)
vfdP = &VfdCache[file];
/*
* Normally we should know the seek position, but if for some reason we
* have lost track of it, try again to get it. If we still can't get it,
* we have a problem: we will be unable to restore the file seek position
* when and if the file is re-opened. But we can't really throw an error
* and refuse to close the file, or activities such as transaction cleanup
* will be broken.
*/
if (FilePosIsUnknown(vfdP->seekPos))
{
vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
if (FilePosIsUnknown(vfdP->seekPos))
elog(LOG, "could not seek file \"%s\" before closing: %m",
vfdP->fileName);
}
/*
* Close the file. We aren't expecting this to fail; if it does, better
* to leak the FD than to mess up our internal state.
@@ -1113,33 +1085,6 @@ LruInsert(File file)
{
++nfile;
}
/*
* Seek to the right position. We need no special case for seekPos
* equal to FileUnknownPos, as lseek() will certainly reject that
* (thus completing the logic noted in LruDelete() that we will fail
* to re-open a file if we couldn't get its seek position before
* closing).
*/
if (vfdP->seekPos != (off_t) 0)
{
if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
{
/*
* If we fail to restore the seek position, treat it like an
* open() failure.
*/
int save_errno = errno;
elog(LOG, "could not seek file \"%s\" after re-opening: %m",
vfdP->fileName);
(void) close(vfdP->fd);
vfdP->fd = VFD_CLOSED;
--nfile;
errno = save_errno;
return -1;
}
}
}
/*
@@ -1406,7 +1351,6 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
/* Saved flags are adjusted to be OK for re-opening file */
vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
vfdP->fileMode = fileMode;
vfdP->seekPos = 0;
vfdP->fileSize = 0;
vfdP->fdstate = 0x0;
vfdP->resowner = NULL;
@@ -1820,7 +1764,6 @@ FileClose(File file)
/*
* FilePrefetch - initiate asynchronous read of a given range of the file.
* The logical seek position is unaffected.
*
* Currently the only implementation of this function is using posix_fadvise
* which is the simplest standardized interface that accomplishes this.
@@ -1867,10 +1810,6 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
file, VfdCache[file].fileName,
(int64) offset, (int64) nbytes));
/*
* Caution: do not call pg_flush_data with nbytes = 0, it could trash the
* file's seek position. We prefer to define that as a no-op here.
*/
if (nbytes <= 0)
return;
@@ -1884,7 +1823,8 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
}
int
FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
FileRead(File file, char *buffer, int amount, off_t offset,
uint32 wait_event_info)
{
int returnCode;
Vfd *vfdP;
@@ -1893,7 +1833,7 @@ FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
(int64) offset,
amount, buffer));
returnCode = FileAccess(file);
@@ -1904,16 +1844,10 @@ FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
retry:
pgstat_report_wait_start(wait_event_info);
returnCode = read(vfdP->fd, buffer, amount);
returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
pgstat_report_wait_end();
if (returnCode >= 0)
{
/* if seekPos is unknown, leave it that way */
if (!FilePosIsUnknown(vfdP->seekPos))
vfdP->seekPos += returnCode;
}
else
if (returnCode < 0)
{
/*
* Windows may run out of kernel buffers and return "Insufficient
@@ -1939,16 +1873,14 @@ retry:
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
/* Trouble, so assume we don't know the file position anymore */
vfdP->seekPos = FileUnknownPos;
}
return returnCode;
}
int
FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
FileWrite(File file, char *buffer, int amount, off_t offset,
uint32 wait_event_info)
{
int returnCode;
Vfd *vfdP;
@@ -1957,7 +1889,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
(int64) offset,
amount, buffer));
returnCode = FileAccess(file);
@@ -1976,26 +1908,13 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
*/
if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
{
off_t newPos;
off_t past_write = offset + amount;
/*
* Normally we should know the seek position, but if for some reason
* we have lost track of it, try again to get it. Here, it's fine to
* throw an error if we still can't get it.
*/
if (FilePosIsUnknown(vfdP->seekPos))
{
vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
if (FilePosIsUnknown(vfdP->seekPos))
elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
}
newPos = vfdP->seekPos + amount;
if (newPos > vfdP->fileSize)
if (past_write > vfdP->fileSize)
{
uint64 newTotal = temporary_files_size;
newTotal += newPos - vfdP->fileSize;
newTotal += past_write - vfdP->fileSize;
if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
ereport(ERROR,
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
@@ -2007,7 +1926,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
retry:
errno = 0;
pgstat_report_wait_start(wait_event_info);
returnCode = write(vfdP->fd, buffer, amount);
returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
pgstat_report_wait_end();
/* if write didn't set errno, assume problem is no disk space */
@@ -2016,10 +1935,6 @@ retry:
if (returnCode >= 0)
{
/* if seekPos is unknown, leave it that way */
if (!FilePosIsUnknown(vfdP->seekPos))
vfdP->seekPos += returnCode;
/*
* Maintain fileSize and temporary_files_size if it's a temp file.
*
@@ -2029,12 +1944,12 @@ retry:
*/
if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
{
off_t newPos = vfdP->seekPos;
off_t past_write = offset + amount;
if (newPos > vfdP->fileSize)
if (past_write > vfdP->fileSize)
{
temporary_files_size += newPos - vfdP->fileSize;
vfdP->fileSize = newPos;
temporary_files_size += past_write - vfdP->fileSize;
vfdP->fileSize = past_write;
}
}
}
@@ -2060,9 +1975,6 @@ retry:
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
/* Trouble, so assume we don't know the file position anymore */
vfdP->seekPos = FileUnknownPos;
}
return returnCode;
@@ -2090,93 +2002,26 @@ FileSync(File file, uint32 wait_event_info)
}
off_t
FileSeek(File file, off_t offset, int whence)
FileSize(File file)
{
Vfd *vfdP;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
(int64) offset, whence));
DO_DB(elog(LOG, "FileSize %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (FileIsNotOpen(file))
{
switch (whence)
{
case SEEK_SET:
if (offset < 0)
{
errno = EINVAL;
return (off_t) -1;
}
vfdP->seekPos = offset;
break;
case SEEK_CUR:
if (FilePosIsUnknown(vfdP->seekPos) ||
vfdP->seekPos + offset < 0)
{
errno = EINVAL;
return (off_t) -1;
}
vfdP->seekPos += offset;
break;
case SEEK_END:
if (FileAccess(file) < 0)
return (off_t) -1;
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
break;
}
}
else
{
switch (whence)
{
case SEEK_SET:
if (offset < 0)
{
errno = EINVAL;
return (off_t) -1;
}
if (vfdP->seekPos != offset)
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
break;
case SEEK_CUR:
if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
break;
case SEEK_END:
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
break;
}
if (FileAccess(file) < 0)
return (off_t) -1;
}
return vfdP->seekPos;
return lseek(VfdCache[file].fd, 0, SEEK_END);
}
/*
 * FileTell - report the seek position recorded in the file's VFD entry.
 *
 * XXX nothing calls this; it is only compiled when NOT_USED is defined
 * and is kept here for completeness.
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileTell %d (%s)",
			   file, VfdCache[file].fileName));

	/* Answer from the cached VFD state rather than asking the OS. */
	vfdP = &VfdCache[file];
	return vfdP->seekPos;
}
#endif
int
FileTruncate(File file, off_t offset, uint32 wait_event_info)
{