From c6bf6d1897c9f2eebf7efd0eef231025764bf146 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Thu, 15 May 2025 13:09:25 +0000 Subject: [PATCH] benchtest: malloc tcache hotpath benchtest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Existing benchtests for malloc infrastructure seem to be rather generic, aimed at testing global malloc implementation performance. This new benchtest focuses on reducing any non-tcache-related side effects, allowing a more realistic prediction of the performance impact of tcache code changes. The test was inspired by the bench-[cm]alloc-thread code, with severe simplifications: - forces single thread execution, reducing concurrency side-effects, like cache incoherence penalties due to simultaneous writes to the same cache pages; - focuses on allocating and deallocating a single size for the whole duration of the benchmark. Since all it does is allocate and deallocate, it will measure the tcache hotpath without any side-effects. - allows the allocation size to be specified as an input argument. 
Reviewed-by: Wilco Dijkstra  --- benchtests/Makefile | 9 ++ benchtests/bench-calloc-tcache.c | 22 ++++ benchtests/bench-malloc-tcache.c | 174 +++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) create mode 100644 benchtests/bench-calloc-tcache.c create mode 100644 benchtests/bench-malloc-tcache.c diff --git a/benchtests/Makefile b/benchtests/Makefile index 5b2fa67f35..5470be7f40 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -332,8 +332,10 @@ CFLAGS-bench-isfinite.c += $(config-cflags-signaling-nans) ifeq (${BENCHSET},) bench-malloc := \ calloc-simple \ + calloc-tcache \ calloc-thread \ malloc-simple \ + malloc-tcache \ malloc-thread \ # bench-malloc else @@ -456,9 +458,11 @@ VALIDBENCHSETNAMES := \ bench-pthread \ bench-string \ calloc-simple \ + calloc-tcache \ calloc-thread \ hash-benchset \ malloc-simple \ + malloc-tcache \ malloc-thread \ math-benchset \ stdio-benchset \ @@ -502,6 +506,11 @@ bench-malloc: $(binaries-bench-malloc) echo "Running $${run} $${thr}"; \ $(run-bench) $${thr} > $${run}-$${thr}.out; \ done;\ + elif basename $${run} | grep -q "bench-[cm]alloc-tcache"; then \ + for thr in 64 512 1024; do \ + echo "Running $${run} $${thr}"; \ + $(run-bench) $${thr} > $${run}-$${thr}.out; \ + done;\ else \ for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \ echo "Running $${run} $${thr}"; \ diff --git a/benchtests/bench-calloc-tcache.c b/benchtests/bench-calloc-tcache.c new file mode 100644 index 0000000000..5303f872b8 --- /dev/null +++ b/benchtests/bench-calloc-tcache.c @@ -0,0 +1,22 @@ +/* Benchmark calloc and free functions. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define TEST_FUNC(size) calloc (1, size) +#define TEST_NAME "calloc" + +#include "bench-malloc-tcache.c" diff --git a/benchtests/bench-malloc-tcache.c b/benchtests/bench-malloc-tcache.c new file mode 100644 index 0000000000..4bd9808e80 --- /dev/null +++ b/benchtests/bench-malloc-tcache.c @@ -0,0 +1,174 @@ +/* Benchmark tcache hotpath allocations. + Copyright (C) 2013-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef TEST_FUNC +# define TEST_FUNC(size) malloc(size) +# define TEST_NAME "malloc" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "bench-util.h" +#include "bench-util.c" +#include "bench-timing.h" +#include "json-lib.h" + +/* Benchmark duration in seconds. 
*/ +#define BENCHMARK_DURATION 3 + +static volatile bool timeout; +size_t alloc_size; + +static void +alarm_handler (int signum) +{ + timeout = true; +} + +struct bench_result { + size_t iters; + timing_t elapsed; +}; + +static __always_inline size_t +malloc_benchmark_loop (void **elems, size_t nr_items) +{ + size_t iters = nr_items; + + while (!timeout) + { + elems[iters % nr_items] = TEST_FUNC (alloc_size); + iters++; + free (elems[iters % nr_items]); + } + + return iters - nr_items; +} + +static void +do_benchmark (struct bench_result *res) +{ + timing_t start, stop; + void *elems[10]; + memset (elems, 0, sizeof(elems)); + + alarm (BENCHMARK_DURATION); + /* Ramp up cpu before measuring. */ + bench_start (); + TIMING_NOW (start); + res[0].iters = malloc_benchmark_loop (elems, 1); + TIMING_NOW (stop); + TIMING_DIFF (res[0].elapsed, start, stop); + + timeout = false; + alarm (BENCHMARK_DURATION); + bench_start (); + TIMING_NOW (start); + res[1].iters = malloc_benchmark_loop (elems, 4); + TIMING_NOW (stop); + TIMING_DIFF (res[1].elapsed, start, stop); +} + +static void usage (const char *name) +{ + fprintf (stderr, "%s: \n", name); + exit (1); +} + +int +main (int argc, char **argv) +{ + json_ctx_t json_ctx; + double d_total_s, d_total_i; + struct sigaction act; + + if (argc == 1) + alloc_size = 1024; + else if (argc == 2) + { + long ret; + + errno = 0; + ret = strtol (argv[1], NULL, 10); + + if (errno || ret == 0) + usage (argv[0]); + + alloc_size = ret; + } + else + usage (argv[0]); + + json_init (&json_ctx, 0, stdout); + + json_document_begin (&json_ctx); + + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE); + + json_attr_object_begin (&json_ctx, "functions"); + + json_attr_object_begin (&json_ctx, TEST_NAME); + + memset (&act, 0, sizeof (act)); + act.sa_handler = &alarm_handler; + + sigaction (SIGALRM, &act, NULL); + + struct bench_result res[2]; + memset (res, 0, sizeof (struct bench_result) * 2); + + do_benchmark (res); + + d_total_s = res[0].elapsed; 
+ d_total_i = res[0].iters; + + json_attr_object_begin (&json_ctx, "simple"); + + json_attr_double (&json_ctx, "alloc_size", alloc_size); + json_attr_double (&json_ctx, "duration", d_total_s); + json_attr_double (&json_ctx, "iterations", d_total_i); + json_attr_double (&json_ctx, "time_per_iteration", d_total_s / d_total_i); + + json_attr_object_end (&json_ctx); + + d_total_s = res[1].elapsed; + d_total_i = res[1].iters; + + json_attr_object_begin (&json_ctx, "optimized"); + + json_attr_double (&json_ctx, "alloc_size", alloc_size); + json_attr_double (&json_ctx, "duration", d_total_s); + json_attr_double (&json_ctx, "iterations", d_total_i); + json_attr_double (&json_ctx, "time_per_iteration", d_total_s / d_total_i); + + json_attr_object_end (&json_ctx); + + json_attr_object_end (&json_ctx); + + json_attr_object_end (&json_ctx); + + json_document_end (&json_ctx); + + return 0; +}