1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-28 23:42:10 +03:00

Add smgrzeroextend(), FileZero(), FileFallocate()

smgrzeroextend() uses FileFallocate() to efficiently extend files by multiple
blocks. When extending by a small number of blocks, use FileZero() instead, as
using posix_fallocate() for small numbers of blocks is inefficient for some
file systems / operating systems. FileZero() is also used as the fallback for
FileFallocate() on platforms / filesystems that don't support fallocate.

A big advantage of using posix_fallocate() is that it typically won't cause
dirty buffers in the kernel pagecache. So far the most common pattern in our
code is that we smgrextend() a page full of zeroes and put the corresponding
page into shared buffers, from where we later write out the actual contents of
the page. If the kernel, e.g. due to memory pressure or elapsed time, already
wrote back the all-zeroes page, this can lead to doubling the amount of writes
reaching storage.

There are no users of smgrzeroextend() as of this commit. That will follow in
future commits.

Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: David Rowley <dgrowleyml@gmail.com>
Reviewed-by: John Naylor <john.naylor@enterprisedb.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
This commit is contained in:
Andres Freund
2023-04-05 10:06:39 -07:00
parent 4766eef317
commit 4d330a61bb
6 changed files with 231 additions and 0 deletions

View File

@ -2206,6 +2206,94 @@ FileSync(File file, uint32 wait_event_info)
return returnCode;
}
/*
 * Zero a region of the file.
 *
 * Writes 'amount' zero bytes starting at 'offset' using pg_pwrite_zeros().
 * 'wait_event_info' is reported as the wait event while the write is in
 * progress.
 *
 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
 * appropriate error.
 */
int
FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
{
	int			returnCode;
	ssize_t		written;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
			   file, VfdCache[file].fileName,
			   (int64) offset, (int64) amount));

	/*
	 * Normalize any FileAccess() failure to the documented -1, matching
	 * FileFallocate().
	 */
	returnCode = FileAccess(file);
	if (returnCode < 0)
		return -1;

	/*
	 * Clear errno before writing so that the "errno is unset" test in the
	 * short-write path below is meaningful. Without this, a stale value left
	 * by an earlier call (for instance EINVAL / EOPNOTSUPP stored by
	 * FileFallocate() before it falls back to this function) would be
	 * reported instead of ENOSPC.
	 */
	errno = 0;

	pgstat_report_wait_start(wait_event_info);
	written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
	pgstat_report_wait_end();

	if (written < 0)
		return -1;
	else if (written != amount)
	{
		/* if errno is unset, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		return -1;
	}

	return 0;
}
/*
 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
 * use FileZero() instead.
 *
 * Note that at least glibc implements posix_fallocate() in userspace if not
 * implemented by the filesystem. That's not the case for all environments
 * though.
 *
 * 'wait_event_info' is reported as the wait event while the allocation is in
 * progress.
 *
 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
 * appropriate error.
 */
int
FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
{
#ifdef HAVE_POSIX_FALLOCATE
	int			returnCode;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
			   file, VfdCache[file].fileName,
			   (int64) offset, (int64) amount));

	returnCode = FileAccess(file);
	if (returnCode < 0)
		return -1;

	/*
	 * posix_fallocate() reports failure through its return value, not errno.
	 * It can be interrupted by a signal, in which case it returns EINTR;
	 * that is not a real failure, so simply try again.
	 */
	do
	{
		pgstat_report_wait_start(wait_event_info);
		returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
		pgstat_report_wait_end();
	} while (returnCode == EINTR);

	if (returnCode == 0)
		return 0;

	/*
	 * Return in cases of a "real" failure, if fallocate is not supported,
	 * fall through to the FileZero() backed implementation.
	 *
	 * errno is only set on the failure path: storing the "not supported"
	 * code before falling through would leave a misleading errno behind for
	 * the fallback.
	 */
	if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
	{
		/* for compatibility with %m printing etc */
		errno = returnCode;
		return -1;
	}
#endif

	return FileZero(file, offset, amount, wait_event_info);
}
off_t
FileSize(File file)
{