mirror of
https://github.com/postgres/postgres.git
synced 2025-07-08 11:42:09 +03:00
aio: Combine io_uring memory mappings, if supported
By default io_uring creates a shared memory mapping for each io_uring instance, leading to a large number of memory mappings. Unfortunately a large number of memory mappings slows things down; backend exit is particularly affected. To address that, newer kernels (6.5) support using user-provided memory for the rings. By putting the relevant memory into shared memory we don't need any additional mappings. On a system with a new enough kernel and liburing, there is no discernible overhead when doing a pgbench -S -C anymore. Reported-by: MARK CALLAGHAN <mdcallag@gmail.com> Reviewed-by: "Burd, Greg" <greg@burd.me> Reviewed-by: Jim Nasby <jnasby@upgrade.com> Discussion: https://postgr.es/m/CAFbpF8OA44_UG+RYJcWH9WjF7E3GA6gka3gvH6nsrSnEe9H0NA@mail.gmail.com Backpatch-through: 18
This commit is contained in:
17
configure
vendored
17
configure
vendored
@ -13309,6 +13309,23 @@ fi
|
|||||||
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if test "$with_liburing" = yes; then
|
||||||
|
_LIBS="$LIBS"
|
||||||
|
LIBS="$LIBURING_LIBS $LIBS"
|
||||||
|
for ac_func in io_uring_queue_init_mem
|
||||||
|
do :
|
||||||
|
ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem"
|
||||||
|
if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then :
|
||||||
|
cat >>confdefs.h <<_ACEOF
|
||||||
|
#define HAVE_IO_URING_QUEUE_INIT_MEM 1
|
||||||
|
_ACEOF
|
||||||
|
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
LIBS="$_LIBS"
|
||||||
|
fi
|
||||||
|
|
||||||
if test "$with_lz4" = yes ; then
|
if test "$with_lz4" = yes ; then
|
||||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5
|
||||||
$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; }
|
$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; }
|
||||||
|
@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then
|
|||||||
AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])])
|
AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if test "$with_liburing" = yes; then
|
||||||
|
_LIBS="$LIBS"
|
||||||
|
LIBS="$LIBURING_LIBS $LIBS"
|
||||||
|
AC_CHECK_FUNCS([io_uring_queue_init_mem])
|
||||||
|
LIBS="$_LIBS"
|
||||||
|
fi
|
||||||
|
|
||||||
if test "$with_lz4" = yes ; then
|
if test "$with_lz4" = yes ; then
|
||||||
AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])])
|
AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])])
|
||||||
fi
|
fi
|
||||||
|
@ -995,6 +995,12 @@ liburingopt = get_option('liburing')
|
|||||||
liburing = dependency('liburing', required: liburingopt)
|
liburing = dependency('liburing', required: liburingopt)
|
||||||
if liburing.found()
|
if liburing.found()
|
||||||
cdata.set('USE_LIBURING', 1)
|
cdata.set('USE_LIBURING', 1)
|
||||||
|
|
||||||
|
# Check whether liburing provides io_uring_queue_init_mem().
#
# The autoconf build (AC_CHECK_FUNCS) names the resulting macro
# HAVE_IO_URING_QUEUE_INIT_MEM, whereas the io_uring AIO code tests
# HAVE_LIBURING_QUEUE_INIT_MEM. Define both names, so that pg_config.h
# exposes the same macro with either build system.
if cc.has_function('io_uring_queue_init_mem',
    dependencies: liburing, args: test_c_args)
  cdata.set('HAVE_IO_URING_QUEUE_INIT_MEM', 1)
  cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1)
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,6 +29,9 @@
|
|||||||
|
|
||||||
#ifdef IOMETHOD_IO_URING_ENABLED
|
#ifdef IOMETHOD_IO_URING_ENABLED
|
||||||
|
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
#include <liburing.h>
|
#include <liburing.h>
|
||||||
|
|
||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
@ -94,12 +97,32 @@ PgAioUringContext
|
|||||||
struct io_uring io_uring_ring;
|
struct io_uring io_uring_ring;
|
||||||
} PgAioUringContext;
|
} PgAioUringContext;
|
||||||
|
|
||||||
|
/*
 * Feature set of the io_uring implementation we are running against.
 *
 * Which features are available depends on both the liburing and the kernel
 * version. For the kernel, comparing version numbers is not sufficient,
 * because various vendors backport features to older kernels :(.
 */
typedef struct PgAioUringCaps
{
	/* set once the capabilities have been probed */
	bool		checked;

	/*
	 * Number of bytes each ring needs when created with
	 * io_uring_queue_init_mem(); -1 if io_uring_queue_init_mem() is
	 * unsupported.
	 */
	int			mem_init_size;
} PgAioUringCaps;
|
||||||
|
|
||||||
|
|
||||||
/* PgAioUringContexts for all backends */
|
/* PgAioUringContexts for all backends */
|
||||||
static PgAioUringContext *pgaio_uring_contexts;
|
static PgAioUringContext *pgaio_uring_contexts;
|
||||||
|
|
||||||
/* the current backend's context */
|
/* the current backend's context */
|
||||||
static PgAioUringContext *pgaio_my_uring_context;
|
static PgAioUringContext *pgaio_my_uring_context;
|
||||||
|
|
||||||
|
static PgAioUringCaps pgaio_uring_caps =
|
||||||
|
{
|
||||||
|
.checked = false,
|
||||||
|
.mem_init_size = -1,
|
||||||
|
};
|
||||||
|
|
||||||
static uint32
|
static uint32
|
||||||
pgaio_uring_procs(void)
|
pgaio_uring_procs(void)
|
||||||
@ -111,16 +134,145 @@ pgaio_uring_procs(void)
|
|||||||
return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
|
return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Size
|
/*
|
||||||
|
* Initializes pgaio_uring_caps, unless that's already done.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
pgaio_uring_check_capabilities(void)
|
||||||
|
{
|
||||||
|
if (pgaio_uring_caps.checked)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* By default io_uring creates a shared memory mapping for each io_uring
|
||||||
|
* instance, leading to a large number of memory mappings. Unfortunately a
|
||||||
|
* large number of memory mappings slows things down, backend exit is
|
||||||
|
* particularly affected. To address that, newer kernels (6.5) support
|
||||||
|
* using user-provided memory for the memory, by putting the relevant
|
||||||
|
* memory into shared memory we don't need any additional mappings.
|
||||||
|
*
|
||||||
|
* To know whether this is supported, we unfortunately need to probe the
|
||||||
|
* kernel by trying to create a ring with userspace-provided memory. This
|
||||||
|
* also has a secondary benefit: We can determine precisely how much
|
||||||
|
* memory we need for each io_uring instance.
|
||||||
|
*/
|
||||||
|
#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
|
||||||
|
{
|
||||||
|
struct io_uring test_ring;
|
||||||
|
size_t ring_size;
|
||||||
|
void *ring_ptr;
|
||||||
|
struct io_uring_params p = {0};
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Liburing does not yet provide an API to query how much memory a
|
||||||
|
* ring will need. So we over-estimate it here. As the memory is freed
|
||||||
|
* just below that's small temporary waste of memory.
|
||||||
|
*
|
||||||
|
* 1MB is more than enough for rings within io_max_concurrency's
|
||||||
|
* range.
|
||||||
|
*/
|
||||||
|
ring_size = 1024 * 1024;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Hard to believe a system exists where 1MB would not be a multiple
|
||||||
|
* of the page size. But it's cheap to ensure...
|
||||||
|
*/
|
||||||
|
ring_size -= ring_size % sysconf(_SC_PAGESIZE);
|
||||||
|
|
||||||
|
ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
|
||||||
|
if (ring_ptr == MAP_FAILED)
|
||||||
|
elog(ERROR,
|
||||||
|
"mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
|
||||||
|
ring_size);
|
||||||
|
|
||||||
|
ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
|
||||||
|
if (ret > 0)
|
||||||
|
{
|
||||||
|
pgaio_uring_caps.mem_init_size = ret;
|
||||||
|
|
||||||
|
elog(DEBUG1,
|
||||||
|
"can use combined memory mapping for io_uring, each ring needs %d bytes",
|
||||||
|
ret);
|
||||||
|
|
||||||
|
/* clean up the created ring, it was just for a test */
|
||||||
|
io_uring_queue_exit(&test_ring);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* There are different reasons for ring creation to fail, but it's
|
||||||
|
* ok to treat that just as io_uring_queue_init_mem() not being
|
||||||
|
* supported. We'll report a more detailed error in
|
||||||
|
* pgaio_uring_shmem_init().
|
||||||
|
*/
|
||||||
|
errno = -ret;
|
||||||
|
elog(DEBUG1,
|
||||||
|
"cannot use combined memory mapping for io_uring, ring creation failed: %m");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (munmap(ring_ptr, ring_size) != 0)
|
||||||
|
elog(ERROR, "munmap() failed: %m");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
elog(DEBUG1,
|
||||||
|
"can't use combined memory mapping for io_uring, kernel or liburing too old");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
pgaio_uring_caps.checked = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Memory for all PgAioUringContext instances
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
pgaio_uring_context_shmem_size(void)
|
pgaio_uring_context_shmem_size(void)
|
||||||
{
|
{
|
||||||
return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
|
return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Memory for the combined memory used by io_uring instances. Returns 0 if
|
||||||
|
* that is not supported by kernel/liburing.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
pgaio_uring_ring_shmem_size(void)
|
||||||
|
{
|
||||||
|
size_t sz = 0;
|
||||||
|
|
||||||
|
if (pgaio_uring_caps.mem_init_size > 0)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Memory for rings needs to be allocated to the page boundary,
|
||||||
|
* reserve space. Luckily it does not need to be aligned to hugepage
|
||||||
|
* boundaries, even if huge pages are used.
|
||||||
|
*/
|
||||||
|
sz = add_size(sz, sysconf(_SC_PAGESIZE));
|
||||||
|
sz = add_size(sz, mul_size(pgaio_uring_procs(),
|
||||||
|
pgaio_uring_caps.mem_init_size));
|
||||||
|
}
|
||||||
|
|
||||||
|
return sz;
|
||||||
|
}
|
||||||
|
|
||||||
/*
 * Total shared memory needed by the io_uring method: the PgAioUringContext
 * array plus, if supported, the memory for the rings themselves.
 */
static size_t
pgaio_uring_shmem_size(void)
{
	size_t		context_bytes;
	size_t		ring_bytes;

	/*
	 * How much shmem we need depends on which features kernel and liburing
	 * support, so the capabilities have to be determined before sizing.
	 */
	pgaio_uring_check_capabilities();

	context_bytes = pgaio_uring_context_shmem_size();
	ring_bytes = pgaio_uring_ring_shmem_size();

	return add_size(context_bytes, ring_bytes);
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time)
|
|||||||
{
|
{
|
||||||
int TotalProcs = pgaio_uring_procs();
|
int TotalProcs = pgaio_uring_procs();
|
||||||
bool found;
|
bool found;
|
||||||
|
char *shmem;
|
||||||
|
size_t ring_mem_remain = 0;
|
||||||
|
char *ring_mem_next = 0;
|
||||||
|
|
||||||
pgaio_uring_contexts = (PgAioUringContext *)
|
/*
|
||||||
ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found);
|
* We allocate memory for all PgAioUringContext instances and, if
|
||||||
|
* supported, the memory required for each of the io_uring instances, in
|
||||||
|
* one ShmemInitStruct().
|
||||||
|
*/
|
||||||
|
shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
|
||||||
if (found)
|
if (found)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
pgaio_uring_contexts = (PgAioUringContext *) shmem;
|
||||||
|
shmem += pgaio_uring_context_shmem_size();
|
||||||
|
|
||||||
|
/* if supported, handle memory alignment / sizing for io_uring memory */
|
||||||
|
if (pgaio_uring_caps.mem_init_size > 0)
|
||||||
|
{
|
||||||
|
ring_mem_remain = pgaio_uring_ring_shmem_size();
|
||||||
|
ring_mem_next = (char *) shmem;
|
||||||
|
|
||||||
|
/* align to page boundary, see also pgaio_uring_ring_shmem_size() */
|
||||||
|
ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);
|
||||||
|
|
||||||
|
/* account for alignment */
|
||||||
|
ring_mem_remain -= ring_mem_next - shmem;
|
||||||
|
shmem += ring_mem_next - shmem;
|
||||||
|
|
||||||
|
shmem += ring_mem_remain;
|
||||||
|
}
|
||||||
|
|
||||||
for (int contextno = 0; contextno < TotalProcs; contextno++)
|
for (int contextno = 0; contextno < TotalProcs; contextno++)
|
||||||
{
|
{
|
||||||
PgAioUringContext *context = &pgaio_uring_contexts[contextno];
|
PgAioUringContext *context = &pgaio_uring_contexts[contextno];
|
||||||
@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
|
|||||||
* be worth using that - also need to evaluate if that causes
|
* be worth using that - also need to evaluate if that causes
|
||||||
* noticeable additional contention?
|
* noticeable additional contention?
|
||||||
*/
|
*/
|
||||||
ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
|
|
||||||
|
/*
|
||||||
|
* If supported (c.f. pgaio_uring_check_capabilities()), create ring
|
||||||
|
* with its data in shared memory. Otherwise fall back io_uring
|
||||||
|
* creating a memory mapping for each ring.
|
||||||
|
*/
|
||||||
|
#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
|
||||||
|
if (pgaio_uring_caps.mem_init_size > 0)
|
||||||
|
{
|
||||||
|
struct io_uring_params p = {0};
|
||||||
|
|
||||||
|
ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);
|
||||||
|
|
||||||
|
ring_mem_remain -= ret;
|
||||||
|
ring_mem_next += ret;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
|
||||||
|
}
|
||||||
|
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
{
|
{
|
||||||
char *hint = NULL;
|
char *hint = NULL;
|
||||||
|
@ -229,6 +229,9 @@
|
|||||||
/* Define to 1 if you have the global variable 'int timezone'. */
|
/* Define to 1 if you have the global variable 'int timezone'. */
|
||||||
#undef HAVE_INT_TIMEZONE
|
#undef HAVE_INT_TIMEZONE
|
||||||
|
|
||||||
|
/* Define to 1 if you have the `io_uring_queue_init_mem' function. */
|
||||||
|
#undef HAVE_IO_URING_QUEUE_INIT_MEM
|
||||||
|
|
||||||
/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
|
/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
|
||||||
#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
|
#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
|
||||||
|
|
||||||
|
@ -2181,6 +2181,7 @@ PgAioReturn
|
|||||||
PgAioTargetData
|
PgAioTargetData
|
||||||
PgAioTargetID
|
PgAioTargetID
|
||||||
PgAioTargetInfo
|
PgAioTargetInfo
|
||||||
|
PgAioUringCaps
|
||||||
PgAioUringContext
|
PgAioUringContext
|
||||||
PgAioWaitRef
|
PgAioWaitRef
|
||||||
PgArchData
|
PgArchData
|
||||||
|
Reference in New Issue
Block a user