mirror of
https://github.com/postgres/postgres.git
synced 2025-08-27 07:42:10 +03:00
Use pg_pread() and pg_pwrite() for data files and WAL.
Cut down on system calls by doing random I/O using offset-based OS routines where available. Remove the code for tracking the 'virtual' seek position. The only reason left to call FileSeek() was to get the file's size, so provide a new function FileSize() instead. Author: Oskari Saarenmaa, Thomas Munro Reviewed-by: Thomas Munro, Jesper Pedersen, Tom Lane, Alvaro Herrera Discussion: https://postgr.es/m/CAEepm=02rapCpPR3ZGF2vW=SBHSdFYO_bz_f-wwWJonmA3APgw@mail.gmail.com Discussion: https://postgr.es/m/b8748d39-0b19-0514-a1b9-4e5a28e6a208%40gmail.com Discussion: https://postgr.es/m/a86bd200-ebbe-d829-e3ca-0c4474b2fcb7%40ohmu.fi
This commit is contained in:
@@ -16,8 +16,8 @@
|
||||
* including base tables, scratch files (e.g., sort and hash spool
|
||||
* files), and random calls to C library routines like system(3); it
|
||||
* is quite easy to exceed system limits on the number of open files a
|
||||
* single process can have. (This is around 256 on many modern
|
||||
* operating systems, but can be as low as 32 on others.)
|
||||
* single process can have. (This is around 1024 on many modern
|
||||
* operating systems, but may be lower on others.)
|
||||
*
|
||||
* VFDs are managed as an LRU pool, with actual OS file descriptors
|
||||
* being opened and closed as needed. Obviously, if a routine is
|
||||
@@ -167,15 +167,6 @@ int max_safe_fds = 32; /* default if not changed */
|
||||
|
||||
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
|
||||
|
||||
/*
|
||||
* Note: a VFD's seekPos is normally always valid, but if for some reason
|
||||
* an lseek() fails, it might become set to FileUnknownPos. We can struggle
|
||||
* along without knowing the seek position in many cases, but in some places
|
||||
* we have to fail if we don't have it.
|
||||
*/
|
||||
#define FileUnknownPos ((off_t) -1)
|
||||
#define FilePosIsUnknown(pos) ((pos) < 0)
|
||||
|
||||
/* these are the assigned bits in fdstate below: */
|
||||
#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
|
||||
#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
|
||||
@@ -189,7 +180,6 @@ typedef struct vfd
|
||||
File nextFree; /* link to next free VFD, if in freelist */
|
||||
File lruMoreRecently; /* doubly linked recency-of-use list */
|
||||
File lruLessRecently;
|
||||
off_t seekPos; /* current logical file position, or -1 */
|
||||
off_t fileSize; /* current size of file (0 if not temporary) */
|
||||
char *fileName; /* name of file, or NULL for unused VFD */
|
||||
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
|
||||
@@ -407,9 +397,7 @@ pg_fdatasync(int fd)
|
||||
/*
|
||||
* pg_flush_data --- advise OS that the described dirty data should be flushed
|
||||
*
|
||||
* offset of 0 with nbytes 0 means that the entire file should be flushed;
|
||||
* in this case, this function may have side-effects on the file's
|
||||
* seek position!
|
||||
* offset of 0 with nbytes 0 means that the entire file should be flushed
|
||||
*/
|
||||
void
|
||||
pg_flush_data(int fd, off_t offset, off_t nbytes)
|
||||
@@ -1029,22 +1017,6 @@ LruDelete(File file)
|
||||
|
||||
vfdP = &VfdCache[file];
|
||||
|
||||
/*
|
||||
* Normally we should know the seek position, but if for some reason we
|
||||
* have lost track of it, try again to get it. If we still can't get it,
|
||||
* we have a problem: we will be unable to restore the file seek position
|
||||
* when and if the file is re-opened. But we can't really throw an error
|
||||
* and refuse to close the file, or activities such as transaction cleanup
|
||||
* will be broken.
|
||||
*/
|
||||
if (FilePosIsUnknown(vfdP->seekPos))
|
||||
{
|
||||
vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
|
||||
if (FilePosIsUnknown(vfdP->seekPos))
|
||||
elog(LOG, "could not seek file \"%s\" before closing: %m",
|
||||
vfdP->fileName);
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the file. We aren't expecting this to fail; if it does, better
|
||||
* to leak the FD than to mess up our internal state.
|
||||
@@ -1113,33 +1085,6 @@ LruInsert(File file)
|
||||
{
|
||||
++nfile;
|
||||
}
|
||||
|
||||
/*
|
||||
* Seek to the right position. We need no special case for seekPos
|
||||
* equal to FileUnknownPos, as lseek() will certainly reject that
|
||||
* (thus completing the logic noted in LruDelete() that we will fail
|
||||
* to re-open a file if we couldn't get its seek position before
|
||||
* closing).
|
||||
*/
|
||||
if (vfdP->seekPos != (off_t) 0)
|
||||
{
|
||||
if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
|
||||
{
|
||||
/*
|
||||
* If we fail to restore the seek position, treat it like an
|
||||
* open() failure.
|
||||
*/
|
||||
int save_errno = errno;
|
||||
|
||||
elog(LOG, "could not seek file \"%s\" after re-opening: %m",
|
||||
vfdP->fileName);
|
||||
(void) close(vfdP->fd);
|
||||
vfdP->fd = VFD_CLOSED;
|
||||
--nfile;
|
||||
errno = save_errno;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1406,7 +1351,6 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
|
||||
/* Saved flags are adjusted to be OK for re-opening file */
|
||||
vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
|
||||
vfdP->fileMode = fileMode;
|
||||
vfdP->seekPos = 0;
|
||||
vfdP->fileSize = 0;
|
||||
vfdP->fdstate = 0x0;
|
||||
vfdP->resowner = NULL;
|
||||
@@ -1820,7 +1764,6 @@ FileClose(File file)
|
||||
|
||||
/*
|
||||
* FilePrefetch - initiate asynchronous read of a given range of the file.
|
||||
* The logical seek position is unaffected.
|
||||
*
|
||||
* Currently the only implementation of this function is using posix_fadvise
|
||||
* which is the simplest standardized interface that accomplishes this.
|
||||
@@ -1867,10 +1810,6 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
|
||||
file, VfdCache[file].fileName,
|
||||
(int64) offset, (int64) nbytes));
|
||||
|
||||
/*
|
||||
* Caution: do not call pg_flush_data with nbytes = 0, it could trash the
|
||||
* file's seek position. We prefer to define that as a no-op here.
|
||||
*/
|
||||
if (nbytes <= 0)
|
||||
return;
|
||||
|
||||
@@ -1884,7 +1823,8 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
|
||||
}
|
||||
|
||||
int
|
||||
FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
|
||||
FileRead(File file, char *buffer, int amount, off_t offset,
|
||||
uint32 wait_event_info)
|
||||
{
|
||||
int returnCode;
|
||||
Vfd *vfdP;
|
||||
@@ -1893,7 +1833,7 @@ FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
|
||||
|
||||
DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
|
||||
file, VfdCache[file].fileName,
|
||||
(int64) VfdCache[file].seekPos,
|
||||
(int64) offset,
|
||||
amount, buffer));
|
||||
|
||||
returnCode = FileAccess(file);
|
||||
@@ -1904,16 +1844,10 @@ FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
|
||||
|
||||
retry:
|
||||
pgstat_report_wait_start(wait_event_info);
|
||||
returnCode = read(vfdP->fd, buffer, amount);
|
||||
returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
if (returnCode >= 0)
|
||||
{
|
||||
/* if seekPos is unknown, leave it that way */
|
||||
if (!FilePosIsUnknown(vfdP->seekPos))
|
||||
vfdP->seekPos += returnCode;
|
||||
}
|
||||
else
|
||||
if (returnCode < 0)
|
||||
{
|
||||
/*
|
||||
* Windows may run out of kernel buffers and return "Insufficient
|
||||
@@ -1939,16 +1873,14 @@ retry:
|
||||
/* OK to retry if interrupted */
|
||||
if (errno == EINTR)
|
||||
goto retry;
|
||||
|
||||
/* Trouble, so assume we don't know the file position anymore */
|
||||
vfdP->seekPos = FileUnknownPos;
|
||||
}
|
||||
|
||||
return returnCode;
|
||||
}
|
||||
|
||||
int
|
||||
FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
|
||||
FileWrite(File file, char *buffer, int amount, off_t offset,
|
||||
uint32 wait_event_info)
|
||||
{
|
||||
int returnCode;
|
||||
Vfd *vfdP;
|
||||
@@ -1957,7 +1889,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
|
||||
|
||||
DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
|
||||
file, VfdCache[file].fileName,
|
||||
(int64) VfdCache[file].seekPos,
|
||||
(int64) offset,
|
||||
amount, buffer));
|
||||
|
||||
returnCode = FileAccess(file);
|
||||
@@ -1976,26 +1908,13 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
|
||||
*/
|
||||
if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
|
||||
{
|
||||
off_t newPos;
|
||||
off_t past_write = offset + amount;
|
||||
|
||||
/*
|
||||
* Normally we should know the seek position, but if for some reason
|
||||
* we have lost track of it, try again to get it. Here, it's fine to
|
||||
* throw an error if we still can't get it.
|
||||
*/
|
||||
if (FilePosIsUnknown(vfdP->seekPos))
|
||||
{
|
||||
vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
|
||||
if (FilePosIsUnknown(vfdP->seekPos))
|
||||
elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
|
||||
}
|
||||
|
||||
newPos = vfdP->seekPos + amount;
|
||||
if (newPos > vfdP->fileSize)
|
||||
if (past_write > vfdP->fileSize)
|
||||
{
|
||||
uint64 newTotal = temporary_files_size;
|
||||
|
||||
newTotal += newPos - vfdP->fileSize;
|
||||
newTotal += past_write - vfdP->fileSize;
|
||||
if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
|
||||
@@ -2007,7 +1926,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
|
||||
retry:
|
||||
errno = 0;
|
||||
pgstat_report_wait_start(wait_event_info);
|
||||
returnCode = write(vfdP->fd, buffer, amount);
|
||||
returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
/* if write didn't set errno, assume problem is no disk space */
|
||||
@@ -2016,10 +1935,6 @@ retry:
|
||||
|
||||
if (returnCode >= 0)
|
||||
{
|
||||
/* if seekPos is unknown, leave it that way */
|
||||
if (!FilePosIsUnknown(vfdP->seekPos))
|
||||
vfdP->seekPos += returnCode;
|
||||
|
||||
/*
|
||||
* Maintain fileSize and temporary_files_size if it's a temp file.
|
||||
*
|
||||
@@ -2029,12 +1944,12 @@ retry:
|
||||
*/
|
||||
if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
|
||||
{
|
||||
off_t newPos = vfdP->seekPos;
|
||||
off_t past_write = offset + amount;
|
||||
|
||||
if (newPos > vfdP->fileSize)
|
||||
if (past_write > vfdP->fileSize)
|
||||
{
|
||||
temporary_files_size += newPos - vfdP->fileSize;
|
||||
vfdP->fileSize = newPos;
|
||||
temporary_files_size += past_write - vfdP->fileSize;
|
||||
vfdP->fileSize = past_write;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2060,9 +1975,6 @@ retry:
|
||||
/* OK to retry if interrupted */
|
||||
if (errno == EINTR)
|
||||
goto retry;
|
||||
|
||||
/* Trouble, so assume we don't know the file position anymore */
|
||||
vfdP->seekPos = FileUnknownPos;
|
||||
}
|
||||
|
||||
return returnCode;
|
||||
@@ -2090,93 +2002,26 @@ FileSync(File file, uint32 wait_event_info)
|
||||
}
|
||||
|
||||
off_t
|
||||
FileSeek(File file, off_t offset, int whence)
|
||||
FileSize(File file)
|
||||
{
|
||||
Vfd *vfdP;
|
||||
|
||||
Assert(FileIsValid(file));
|
||||
|
||||
DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
|
||||
file, VfdCache[file].fileName,
|
||||
(int64) VfdCache[file].seekPos,
|
||||
(int64) offset, whence));
|
||||
DO_DB(elog(LOG, "FileSize %d (%s)",
|
||||
file, VfdCache[file].fileName));
|
||||
|
||||
vfdP = &VfdCache[file];
|
||||
|
||||
if (FileIsNotOpen(file))
|
||||
{
|
||||
switch (whence)
|
||||
{
|
||||
case SEEK_SET:
|
||||
if (offset < 0)
|
||||
{
|
||||
errno = EINVAL;
|
||||
return (off_t) -1;
|
||||
}
|
||||
vfdP->seekPos = offset;
|
||||
break;
|
||||
case SEEK_CUR:
|
||||
if (FilePosIsUnknown(vfdP->seekPos) ||
|
||||
vfdP->seekPos + offset < 0)
|
||||
{
|
||||
errno = EINVAL;
|
||||
return (off_t) -1;
|
||||
}
|
||||
vfdP->seekPos += offset;
|
||||
break;
|
||||
case SEEK_END:
|
||||
if (FileAccess(file) < 0)
|
||||
return (off_t) -1;
|
||||
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "invalid whence: %d", whence);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (whence)
|
||||
{
|
||||
case SEEK_SET:
|
||||
if (offset < 0)
|
||||
{
|
||||
errno = EINVAL;
|
||||
return (off_t) -1;
|
||||
}
|
||||
if (vfdP->seekPos != offset)
|
||||
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
|
||||
break;
|
||||
case SEEK_CUR:
|
||||
if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
|
||||
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
|
||||
break;
|
||||
case SEEK_END:
|
||||
vfdP->seekPos = lseek(vfdP->fd, offset, whence);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "invalid whence: %d", whence);
|
||||
break;
|
||||
}
|
||||
if (FileAccess(file) < 0)
|
||||
return (off_t) -1;
|
||||
}
|
||||
|
||||
return vfdP->seekPos;
|
||||
return lseek(VfdCache[file].fd, 0, SEEK_END);
|
||||
}
|
||||
|
||||
/*
|
||||
* XXX not actually used but here for completeness
|
||||
*/
|
||||
#ifdef NOT_USED
|
||||
off_t
|
||||
FileTell(File file)
|
||||
{
|
||||
Assert(FileIsValid(file));
|
||||
DO_DB(elog(LOG, "FileTell %d (%s)",
|
||||
file, VfdCache[file].fileName));
|
||||
return VfdCache[file].seekPos;
|
||||
}
|
||||
#endif
|
||||
|
||||
int
|
||||
FileTruncate(File file, off_t offset, uint32 wait_event_info)
|
||||
{
|
||||
|
Reference in New Issue
Block a user