mirror of
https://github.com/postgres/postgres.git
synced 2025-07-30 11:03:19 +03:00
Allow using huge TLB pages on Linux (MAP_HUGETLB)
This patch adds an option, huge_tlb_pages, which allows requesting the shared memory segment to be allocated using huge pages, by using the MAP_HUGETLB flag in mmap(). This can improve performance. The default is 'try', which means that we will attempt using huge pages, and fall back to non-huge pages if it doesn't work. Currently, only Linux has MAP_HUGETLB. On other platforms, the default 'try' behaves the same as 'off'. In passing, don't try to round the mmap() size to a multiple of pagesize. mmap() doesn't require that, and there's no particular reason for PostgreSQL to do that either. When using MAP_HUGETLB, however, round the request size up to the nearest 2MB boundary. This is to work around a bug in some Linux kernel versions, but also to avoid wasting memory, because the kernel will round the size up anyway. Many people were involved in writing this patch, including Christian Kruse, Richard Poole, Abhijit Menon-Sen, reviewed by Peter Geoghegan, Andres Freund and me.
This commit is contained in:
@ -32,6 +32,7 @@
|
||||
#include "portability/mem.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/pg_shmem.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
|
||||
typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
|
||||
@ -41,7 +42,7 @@ typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
|
||||
unsigned long UsedShmemSegID = 0;
|
||||
void *UsedShmemSegAddr = NULL;
|
||||
static Size AnonymousShmemSize;
|
||||
static void *AnonymousShmem;
|
||||
static void *AnonymousShmem = NULL;
|
||||
|
||||
static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
|
||||
static void IpcMemoryDetach(int status, Datum shmaddr);
|
||||
@ -317,6 +318,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Creates an anonymous mmap()ed shared memory segment.
|
||||
*
|
||||
* Pass the requested size in *size. This function will modify *size to the
|
||||
* actual size of the allocation, if it ends up allocating a segment that is
|
||||
* larger than requested.
|
||||
*/
|
||||
#ifndef EXEC_BACKEND
|
||||
static void *
|
||||
CreateAnonymousSegment(Size *size)
|
||||
{
|
||||
Size allocsize;
|
||||
void *ptr = MAP_FAILED;
|
||||
|
||||
#ifndef MAP_HUGETLB
|
||||
if (huge_tlb_pages == HUGE_TLB_ON)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("huge TLB pages not supported on this platform")));
|
||||
#else
|
||||
if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
|
||||
{
|
||||
/*
|
||||
* Round up the request size to a suitable large value.
|
||||
*
|
||||
* Some Linux kernel versions are known to have a bug, which causes
|
||||
* mmap() with MAP_HUGETLB to fail if the request size is not a
|
||||
* multiple of any supported huge page size. To work around that, we
|
||||
* round up the request size to nearest 2MB. 2MB is the most common
|
||||
* huge page page size on affected systems.
|
||||
*
|
||||
* Aside from that bug, even with a kernel that does the allocation
|
||||
* correctly, rounding it up ourselves avoids wasting memory. Without
|
||||
* it, if we for example make an allocation of 2MB + 1 bytes, the
|
||||
* kernel might decide to use two 2MB huge pages for that, and waste 2
|
||||
* MB - 1 of memory. When we do the rounding ourselves, we can use
|
||||
* that space for allocations.
|
||||
*/
|
||||
int hugepagesize = 2 * 1024 * 1024;
|
||||
|
||||
allocsize = *size;
|
||||
if (allocsize % hugepagesize != 0)
|
||||
allocsize += hugepagesize - (allocsize % hugepagesize);
|
||||
|
||||
ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE,
|
||||
PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
|
||||
if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
|
||||
elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
|
||||
}
|
||||
#endif
|
||||
|
||||
if (huge_tlb_pages == HUGE_TLB_OFF ||
|
||||
(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
|
||||
{
|
||||
allocsize = *size;
|
||||
ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
|
||||
}
|
||||
|
||||
if (ptr == MAP_FAILED)
|
||||
ereport(FATAL,
|
||||
(errmsg("could not map anonymous shared memory: %m"),
|
||||
(errno == ENOMEM) ?
|
||||
errhint("This error usually means that PostgreSQL's request "
|
||||
"for a shared memory segment exceeded available memory, "
|
||||
"swap space or huge pages. To reduce the request size "
|
||||
"(currently %zu bytes), reduce PostgreSQL's shared "
|
||||
"memory usage, perhaps by reducing shared_buffers or "
|
||||
"max_connections.",
|
||||
*size) : 0));
|
||||
|
||||
*size = allocsize;
|
||||
return ptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* PGSharedMemoryCreate
|
||||
@ -344,7 +419,14 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
|
||||
PGShmemHeader *hdr;
|
||||
IpcMemoryId shmid;
|
||||
struct stat statbuf;
|
||||
Size sysvsize = size;
|
||||
Size sysvsize;
|
||||
|
||||
#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
|
||||
if (huge_tlb_pages == HUGE_TLB_ON)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("huge TLB pages not supported on this platform")));
|
||||
#endif
|
||||
|
||||
/* Room for a header? */
|
||||
Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
|
||||
@ -359,6 +441,12 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
|
||||
* to run many copies of PostgreSQL without needing to adjust system
|
||||
* settings.
|
||||
*
|
||||
* We assume that no one will attempt to run PostgreSQL 9.3 or later on
|
||||
* systems that are ancient enough that anonymous shared memory is not
|
||||
* supported, such as pre-2.4 versions of Linux. If that turns out to be
|
||||
* false, we might need to add a run-time test here and do this only if
|
||||
* the running kernel supports it.
|
||||
*
|
||||
* However, we disable this logic in the EXEC_BACKEND case, and fall back
|
||||
* to the old method of allocating the entire segment using System V
|
||||
* shared memory, because there's no way to attach an mmap'd segment to a
|
||||
@ -366,44 +454,13 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
|
||||
* developer use, this shouldn't be a big problem.
|
||||
*/
|
||||
#ifndef EXEC_BACKEND
|
||||
{
|
||||
long pagesize = sysconf(_SC_PAGE_SIZE);
|
||||
AnonymousShmem = CreateAnonymousSegment(&size);
|
||||
AnonymousShmemSize = size;
|
||||
|
||||
/*
|
||||
* Ensure request size is a multiple of pagesize.
|
||||
*
|
||||
* pagesize will, for practical purposes, always be a power of two.
|
||||
* But just in case it isn't, we do it this way instead of using
|
||||
* TYPEALIGN().
|
||||
*/
|
||||
if (pagesize > 0 && size % pagesize != 0)
|
||||
size += pagesize - (size % pagesize);
|
||||
|
||||
/*
|
||||
* We assume that no one will attempt to run PostgreSQL 9.3 or later
|
||||
* on systems that are ancient enough that anonymous shared memory is
|
||||
* not supported, such as pre-2.4 versions of Linux. If that turns
|
||||
* out to be false, we might need to add a run-time test here and do
|
||||
* this only if the running kernel supports it.
|
||||
*/
|
||||
AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
|
||||
-1, 0);
|
||||
if (AnonymousShmem == MAP_FAILED)
|
||||
ereport(FATAL,
|
||||
(errmsg("could not map anonymous shared memory: %m"),
|
||||
(errno == ENOMEM) ?
|
||||
errhint("This error usually means that PostgreSQL's request "
|
||||
"for a shared memory segment exceeded available memory "
|
||||
"or swap space. To reduce the request size (currently "
|
||||
"%zu bytes), reduce PostgreSQL's shared memory usage, "
|
||||
"perhaps by reducing shared_buffers or "
|
||||
"max_connections.",
|
||||
size) : 0));
|
||||
AnonymousShmemSize = size;
|
||||
|
||||
/* Now we need only allocate a minimal-sized SysV shmem block. */
|
||||
sysvsize = sizeof(PGShmemHeader);
|
||||
}
|
||||
/* Now we need only allocate a minimal-sized SysV shmem block. */
|
||||
sysvsize = sizeof(PGShmemHeader);
|
||||
#else
|
||||
sysvsize = size;
|
||||
#endif
|
||||
|
||||
/* Make sure PGSharedMemoryAttach doesn't fail without need */
|
||||
|
Reference in New Issue
Block a user