
aio: Combine io_uring memory mappings, if supported

By default io_uring creates a shared memory mapping for each io_uring
instance, leading to a large number of memory mappings. Unfortunately a large
number of memory mappings slows things down; backend exit is particularly
affected.  To address that, newer kernels (6.5) support using user-provided
memory for the rings. By putting the relevant memory into shared memory we
don't need any additional mappings.

On a system with a new enough kernel and liburing, there is no discernible
overhead when doing a pgbench -S -C anymore.
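
For illustration (not part of the commit): a minimal standalone C program,
assuming liburing is installed (build with -luring), that reproduces the
motivation by creating a handful of rings with plain io_uring_queue_init()
and counting the lines in /proc/self/maps before and after. Each ring
typically shows up as several additional mappings.

#include <liburing.h>
#include <stdio.h>

static int
count_mappings(void)
{
	FILE	   *f = fopen("/proc/self/maps", "r");
	char		line[1024];
	int			n = 0;

	if (f == NULL)
		return -1;
	while (fgets(line, sizeof(line), f))
		n++;
	fclose(f);
	return n;
}

int
main(void)
{
	struct io_uring rings[16];
	int			before = count_mappings();

	for (int i = 0; i < 16; i++)
	{
		/* plain init: the kernel creates mappings for each ring */
		if (io_uring_queue_init(32, &rings[i], 0) < 0)
			return 1;
	}
	printf("mappings before: %d, after: %d\n", before, count_mappings());
	return 0;
}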

Reported-by: MARK CALLAGHAN <mdcallag@gmail.com>
Reviewed-by: "Burd, Greg" <greg@burd.me>
Reviewed-by: Jim Nasby <jnasby@upgrade.com>
Discussion: https://postgr.es/m/CAFbpF8OA44_UG+RYJcWH9WjF7E3GA6gka3gvH6nsrSnEe9H0NA@mail.gmail.com
Backpatch-through: 18
Andres Freund
2025-07-07 21:03:16 -04:00
parent 55a780e947
commit f54af9f267
6 changed files with 238 additions and 6 deletions

configure

@@ -13309,6 +13309,23 @@ fi
fi
if test "$with_liburing" = yes; then
_LIBS="$LIBS"
LIBS="$LIBURING_LIBS $LIBS"
for ac_func in io_uring_queue_init_mem
do :
ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem"
if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then :
cat >>confdefs.h <<_ACEOF
#define HAVE_IO_URING_QUEUE_INIT_MEM 1
_ACEOF
fi
done
LIBS="$_LIBS"
fi
if test "$with_lz4" = yes ; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5
$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; }

configure.ac

@@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then
AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])])
fi
if test "$with_liburing" = yes; then
_LIBS="$LIBS"
LIBS="$LIBURING_LIBS $LIBS"
AC_CHECK_FUNCS([io_uring_queue_init_mem])
LIBS="$_LIBS"
fi
if test "$with_lz4" = yes ; then
AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])])
fi
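
(Note: AC_CHECK_FUNCS([io_uring_queue_init_mem]) mechanically defines
HAVE_IO_URING_QUEUE_INIT_MEM in pg_config.h, so the meson probe below and the
C code must test that same macro name.)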

meson.build

@@ -995,6 +995,12 @@ liburingopt = get_option('liburing')
liburing = dependency('liburing', required: liburingopt)
if liburing.found()
cdata.set('USE_LIBURING', 1)
if cc.has_function('io_uring_queue_init_mem',
dependencies: liburing, args: test_c_args)
cdata.set('HAVE_IO_URING_QUEUE_INIT_MEM', 1)
endif
endif

src/backend/storage/aio/method_io_uring.c

@@ -29,6 +29,9 @@
#ifdef IOMETHOD_IO_URING_ENABLED
#include <sys/mman.h>
#include <unistd.h>
#include <liburing.h>
#include "miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
struct io_uring io_uring_ring;
} PgAioUringContext;
/*
* Information about the capabilities that io_uring has.
*
* Depending on the liburing and kernel version, different features are
* supported. At least for the kernel, a version check does not suffice, as
* various vendors backport features to older kernels :(.
*/
typedef struct PgAioUringCaps
{
bool checked;
/* -1 if io_uring_queue_init_mem() is unsupported */
int mem_init_size;
} PgAioUringCaps;
/* PgAioUringContexts for all backends */
static PgAioUringContext *pgaio_uring_contexts;
/* the current backend's context */
static PgAioUringContext *pgaio_my_uring_context;
static PgAioUringCaps pgaio_uring_caps =
{
.checked = false,
.mem_init_size = -1,
};
static uint32
pgaio_uring_procs(void)
@@ -111,16 +134,145 @@ pgaio_uring_procs(void)
return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
}
/*
* Initializes pgaio_uring_caps, unless that's already done.
*/
static void
pgaio_uring_check_capabilities(void)
{
if (pgaio_uring_caps.checked)
return;
/*
* By default io_uring creates a shared memory mapping for each io_uring
* instance, leading to a large number of memory mappings. Unfortunately a
* large number of memory mappings slows things down; backend exit is
* particularly affected. To address that, newer kernels (6.5) support
* using user-provided memory for the rings. By putting the relevant
* memory into shared memory we don't need any additional mappings.
*
* To know whether this is supported, we unfortunately need to probe the
* kernel by trying to create a ring with userspace-provided memory. This
* also has a secondary benefit: We can determine precisely how much
* memory we need for each io_uring instance.
*/
#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
{
struct io_uring test_ring;
size_t ring_size;
void *ring_ptr;
struct io_uring_params p = {0};
int ret;
/*
* Liburing does not yet provide an API to query how much memory a
* ring will need, so we over-estimate it here. As the memory is freed
* just below, that's only a small temporary waste of memory.
*
* 1MB is more than enough for rings within io_max_concurrency's
* range.
*/
ring_size = 1024 * 1024;
/*
* Hard to believe a system exists where 1MB would not be a multiple
* of the page size. But it's cheap to ensure...
*/
ring_size -= ring_size % sysconf(_SC_PAGESIZE);
ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (ring_ptr == MAP_FAILED)
elog(ERROR,
"mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
ring_size);
ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
if (ret > 0)
{
pgaio_uring_caps.mem_init_size = ret;
elog(DEBUG1,
"can use combined memory mapping for io_uring, each ring needs %d bytes",
ret);
/* clean up the created ring, it was just for a test */
io_uring_queue_exit(&test_ring);
}
else
{
/*
* There are different reasons for ring creation to fail, but it's
* ok to treat that just as io_uring_queue_init_mem() not being
* supported. We'll report a more detailed error in
* pgaio_uring_shmem_init().
*/
errno = -ret;
elog(DEBUG1,
"cannot use combined memory mapping for io_uring, ring creation failed: %m");
}
if (munmap(ring_ptr, ring_size) != 0)
elog(ERROR, "munmap() failed: %m");
}
#else
{
elog(DEBUG1,
"can't use combined memory mapping for io_uring, kernel or liburing too old");
}
#endif
pgaio_uring_caps.checked = true;
}
/*
* Memory for all PgAioUringContext instances
*/
static size_t
pgaio_uring_context_shmem_size(void)
{
return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
}
/*
* Size of the combined memory used by the io_uring instances. Returns 0 if
* that is not supported by kernel/liburing.
*/
static size_t
pgaio_uring_ring_shmem_size(void)
{
size_t sz = 0;
if (pgaio_uring_caps.mem_init_size > 0)
{
/*
* Memory for rings needs to be aligned to a page boundary; reserve
* space for that. Luckily it does not need to be aligned to hugepage
* boundaries, even if huge pages are used.
*/
sz = add_size(sz, sysconf(_SC_PAGESIZE));
sz = add_size(sz, mul_size(pgaio_uring_procs(),
pgaio_uring_caps.mem_init_size));
}
return sz;
}
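
/*
 * Worked example for pgaio_uring_ring_shmem_size() above (illustrative
 * numbers only, not from the commit): with a 4 kB page size, 128 ring
 * owners and a probed mem_init_size of 9216 bytes, it reserves
 * 4096 + 128 * 9216 = 1183744 bytes of shared memory.
 */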
static size_t
pgaio_uring_shmem_size(void)
{
size_t sz;
/*
* Kernel and liburing support for various features influences how much
* shmem we need, so perform the necessary checks first.
*/
pgaio_uring_check_capabilities();
sz = pgaio_uring_context_shmem_size();
sz = add_size(sz, pgaio_uring_ring_shmem_size());
return sz;
}
static void
@@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time)
{
int TotalProcs = pgaio_uring_procs();
bool found;
char *shmem;
size_t ring_mem_remain = 0;
char *ring_mem_next = NULL;
/*
* We allocate memory for all PgAioUringContext instances and, if
* supported, the memory required for each of the io_uring instances, in
* one ShmemInitStruct().
*/
shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
if (found)
return;
pgaio_uring_contexts = (PgAioUringContext *) shmem;
shmem += pgaio_uring_context_shmem_size();
/* if supported, handle memory alignment / sizing for io_uring memory */
if (pgaio_uring_caps.mem_init_size > 0)
{
ring_mem_remain = pgaio_uring_ring_shmem_size();
ring_mem_next = (char *) shmem;
/* align to page boundary, see also pgaio_uring_ring_shmem_size() */
ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);
/* account for alignment */
ring_mem_remain -= ring_mem_next - shmem;
shmem += ring_mem_next - shmem;
shmem += ring_mem_remain;
}
for (int contextno = 0; contextno < TotalProcs; contextno++)
{
PgAioUringContext *context = &pgaio_uring_contexts[contextno];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
* be worth using that - also need to evaluate if that causes
* noticeable additional contention?
*/
/*
* If supported (cf. pgaio_uring_check_capabilities()), create the ring
* with its data in shared memory. Otherwise fall back to io_uring
* creating a memory mapping for each ring.
*/
#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
if (pgaio_uring_caps.mem_init_size > 0)
{
struct io_uring_params p = {0};
ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);
ring_mem_remain -= ret;
ring_mem_next += ret;
}
else
#endif
{
ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
}
if (ret < 0)
{
char *hint = NULL;

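To condense the pattern the diff above implements, a standalone sketch
(assuming liburing 2.6+ headers; error handling elided; the helper name is
illustrative, not from the commit) of using user-provided ring memory with a
fallback to plain io_uring_queue_init():

#include <liburing.h>
#include <string.h>

static int
ring_init(struct io_uring *ring, unsigned entries, void *mem, size_t mem_size)
{
#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
	if (mem != NULL)
	{
		struct io_uring_params p;

		memset(&p, 0, sizeof(p));
		/* returns bytes consumed from mem on success, -errno on failure */
		return io_uring_queue_init_mem(entries, ring, &p, mem, mem_size);
	}
#endif
	/* fallback: the kernel allocates and maps the ring memory itself */
	return io_uring_queue_init(entries, ring, 0);
}
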
src/include/pg_config.h.in

@@ -229,6 +229,9 @@
/* Define to 1 if you have the global variable 'int timezone'. */
#undef HAVE_INT_TIMEZONE
/* Define to 1 if you have the `io_uring_queue_init_mem' function. */
#undef HAVE_IO_URING_QUEUE_INIT_MEM
/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P

src/tools/pgindent/typedefs.list

@@ -2181,6 +2181,7 @@ PgAioReturn
PgAioTargetData
PgAioTargetID
PgAioTargetInfo
PgAioUringCaps
PgAioUringContext
PgAioWaitRef
PgArchData