1
0
mirror of https://sourceware.org/git/glibc.git synced 2025-07-28 00:21:52 +03:00

Add per-thread cache to malloc

* config.make.in: Enable experimental malloc option.
* configure.ac: Likewise.
* configure: Regenerate.
* manual/install.texi: Document it.
* INSTALL: Regenerate.
* malloc/Makefile: Likewise.
* malloc/malloc.c: Add per-thread cache (tcache).
(tcache_put): New.
(tcache_get): New.
(tcache_thread_freeres): New.
(tcache_init): New.
(__libc_malloc): Use cached chunks if available.
(__libc_free): Initialize tcache if needed.
(__libc_realloc): Likewise.
(__libc_calloc): Likewise.
(_int_malloc): Prefill tcache when appropriate.
(_int_free): Likewise.
(do_set_tcache_max): New.
(do_set_tcache_count): New.
(do_set_tcache_unsorted_limit): New.
* manual/probes.texi: Document new probes.
* malloc/arena.c: Add new tcache tunables.
* elf/dl-tunables.list: Likewise.
* manual/tunables.texi: Document them.
* NEWS: Mention the per-thread cache.
This commit is contained in:
DJ Delorie
2017-07-06 13:37:30 -04:00
parent 3cefdd7310
commit d5c3fafc43
13 changed files with 490 additions and 9 deletions

View File

@ -238,6 +238,9 @@
/* For ALIGN_UP et. al. */
#include <libc-pointer-arith.h>
/* For DIAG_PUSH/POP_NEEDS_COMMENT et al. */
#include <libc-diag.h>
#include <malloc/malloc-internal.h>
/*
@ -296,6 +299,30 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
}
#endif
#if USE_TCACHE
/* We want 64 entries. This is an arbitrary limit, which tunables can reduce. */
# define TCACHE_MAX_BINS 64
# define MAX_TCACHE_SIZE tidx2usize (TCACHE_MAX_BINS-1)
/* Only used to pre-fill the tunables. */
# define tidx2usize(idx) (((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)
/* When "x" is from chunksize(). */
# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
/* When "x" is a user-provided size. */
# define usize2tidx(x) csize2tidx (request2size (x))
/* With rounding and alignment, the bins are...
idx 0 bytes 0..24 (64-bit) or 0..12 (32-bit)
idx 1 bytes 25..40 or 13..20
idx 2 bytes 41..56 or 21..28
etc. */
/* This is another arbitrary limit, which tunables can change. Each
tcache bin will hold at most this number of chunks. */
# define TCACHE_FILL_COUNT 7
#endif
/*
REALLOC_ZERO_BYTES_FREES should be set if a call to
@ -1712,6 +1739,17 @@ struct malloc_par
/* First address handed out by MORECORE/sbrk. */
char *sbrk_base;
#if USE_TCACHE
/* Maximum number of buckets to use. */
size_t tcache_bins;
size_t tcache_max_bytes;
/* Maximum number of chunks in each bucket. */
size_t tcache_count;
/* Maximum number of chunks to remove from the unsorted list, which
aren't used to prefill the cache. */
size_t tcache_unsorted_limit;
#endif
};
/* There are several instances of this struct ("arenas") in this
@ -1750,6 +1788,13 @@ static struct malloc_par mp_ =
.trim_threshold = DEFAULT_TRIM_THRESHOLD,
#define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
.arena_test = NARENAS_FROM_NCORES (1)
#if USE_TCACHE
,
.tcache_count = TCACHE_FILL_COUNT,
.tcache_bins = TCACHE_MAX_BINS,
.tcache_max_bytes = tidx2usize (TCACHE_MAX_BINS-1),
.tcache_unsorted_limit = 0 /* No limit. */
#endif
};
/* Maximum size of memory handled in fastbins. */
@ -2875,6 +2920,124 @@ mremap_chunk (mchunkptr p, size_t new_size)
/*------------------------ Public wrappers. --------------------------------*/
#if USE_TCACHE
/* We overlay this structure on the user-data portion of a chunk when
the chunk is stored in the per-thread cache. */
typedef struct tcache_entry
{
struct tcache_entry *next;
} tcache_entry;
/* There is one of these for each thread, which contains the
per-thread cache (hence "tcache_perthread_struct"). Keeping
overall size low is mildly important. Note that COUNTS and ENTRIES
are redundant (we could have just counted the linked list each
time), this is for performance reasons. */
typedef struct tcache_perthread_struct
{
char counts[TCACHE_MAX_BINS];
tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;
static __thread char tcache_shutting_down = 0;
static __thread tcache_perthread_struct *tcache = NULL;
/* Caller must ensure that we know tc_idx is valid and there's room
for more chunks. */
static void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
assert (tc_idx < TCACHE_MAX_BINS);
e->next = tcache->entries[tc_idx];
tcache->entries[tc_idx] = e;
++(tcache->counts[tc_idx]);
}
/* Caller must ensure that we know tc_idx is valid and there's
available chunks to remove. */
static void *
tcache_get (size_t tc_idx)
{
tcache_entry *e = tcache->entries[tc_idx];
assert (tc_idx < TCACHE_MAX_BINS);
assert (tcache->entries[tc_idx] > 0);
tcache->entries[tc_idx] = e->next;
--(tcache->counts[tc_idx]);
return (void *) e;
}
static void __attribute__ ((section ("__libc_thread_freeres_fn")))
tcache_thread_freeres (void)
{
int i;
tcache_perthread_struct *tcache_tmp = tcache;
if (!tcache)
return;
tcache = NULL;
for (i = 0; i < TCACHE_MAX_BINS; ++i)
{
while (tcache_tmp->entries[i])
{
tcache_entry *e = tcache_tmp->entries[i];
tcache_tmp->entries[i] = e->next;
__libc_free (e);
}
}
__libc_free (tcache_tmp);
tcache_shutting_down = 1;
}
text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
static void
tcache_init(void)
{
mstate ar_ptr;
void *victim = 0;
const size_t bytes = sizeof (tcache_perthread_struct);
if (tcache_shutting_down)
return;
arena_get (ar_ptr, bytes);
victim = _int_malloc (ar_ptr, bytes);
if (!victim && ar_ptr != NULL)
{
ar_ptr = arena_get_retry (ar_ptr, bytes);
victim = _int_malloc (ar_ptr, bytes);
}
if (ar_ptr != NULL)
__libc_lock_unlock (ar_ptr->mutex);
/* In a low memory situation, we may not be able to allocate memory
- in which case, we just keep trying later. However, we
typically do this very early, so either there is sufficient
memory, or there isn't enough memory to do non-trivial
allocations anyway. */
if (victim)
{
tcache = (tcache_perthread_struct *) victim;
memset (tcache, 0, sizeof (tcache_perthread_struct));
}
}
#define MAYBE_INIT_TCACHE() \
if (__glibc_unlikely (tcache == NULL)) \
tcache_init();
#else
#define MAYBE_INIT_TCACHE()
#endif
void *
__libc_malloc (size_t bytes)
{
@ -2885,6 +3048,23 @@ __libc_malloc (size_t bytes)
= atomic_forced_read (__malloc_hook);
if (__builtin_expect (hook != NULL, 0))
return (*hook)(bytes, RETURN_ADDRESS (0));
#if USE_TCACHE
/* int_free also calls request2size, be careful to not pad twice. */
size_t tbytes = request2size (bytes);
size_t tc_idx = csize2tidx (tbytes);
MAYBE_INIT_TCACHE ();
DIAG_PUSH_NEEDS_COMMENT;
if (tc_idx < mp_.tcache_bins
/*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
&& tcache
&& tcache->entries[tc_idx] != NULL)
{
return tcache_get (tc_idx);
}
DIAG_POP_NEEDS_COMMENT;
#endif
arena_get (ar_ptr, bytes);
@ -2944,6 +3124,8 @@ __libc_free (void *mem)
return;
}
MAYBE_INIT_TCACHE ();
ar_ptr = arena_for_chunk (p);
_int_free (ar_ptr, p, 0);
}
@ -2981,7 +3163,10 @@ __libc_realloc (void *oldmem, size_t bytes)
if (chunk_is_mmapped (oldp))
ar_ptr = NULL;
else
ar_ptr = arena_for_chunk (oldp);
{
MAYBE_INIT_TCACHE ();
ar_ptr = arena_for_chunk (oldp);
}
/* Little security check which won't hurt performance: the allocator
never wrapps around at the end of the address space. Therefore
@ -3206,6 +3391,8 @@ __libc_calloc (size_t n, size_t elem_size)
sz = bytes;
MAYBE_INIT_TCACHE ();
arena_get (av, sz);
if (av)
{
@ -3336,6 +3523,10 @@ _int_malloc (mstate av, size_t bytes)
mchunkptr fwd; /* misc temp for linking */
mchunkptr bck; /* misc temp for linking */
#if USE_TCACHE
size_t tcache_unsorted_count; /* count of unsorted chunks processed */
#endif
const char *errstr = NULL;
/*
@ -3365,19 +3556,22 @@ _int_malloc (mstate av, size_t bytes)
can try it without checking, which saves some time on this fast path.
*/
#define REMOVE_FB(fb, victim, pp) \
do \
{ \
victim = pp; \
if (victim == NULL) \
break; \
} \
while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
!= victim); \
if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
{
idx = fastbin_index (nb);
mfastbinptr *fb = &fastbin (av, idx);
mchunkptr pp = *fb;
do
{
victim = pp;
if (victim == NULL)
break;
}
while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
!= victim);
REMOVE_FB (fb, victim, pp);
if (victim != 0)
{
if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
@ -3388,6 +3582,26 @@ _int_malloc (mstate av, size_t bytes)
return NULL;
}
check_remalloced_chunk (av, victim, nb);
#if USE_TCACHE
/* While we're here, if we see other chunks of the same size,
stash them in the tcache. */
size_t tc_idx = csize2tidx (nb);
if (tcache && tc_idx < mp_.tcache_bins)
{
mchunkptr tc_victim;
/* While bin not empty and tcache not full, copy chunks over. */
while (tcache->counts[tc_idx] < mp_.tcache_count
&& (pp = *fb) != NULL)
{
REMOVE_FB (fb, tc_victim, pp);
if (tc_victim != 0)
{
tcache_put (tc_victim, tc_idx);
}
}
}
#endif
void *p = chunk2mem (victim);
alloc_perturb (p, bytes);
return p;
@ -3426,6 +3640,32 @@ _int_malloc (mstate av, size_t bytes)
if (av != &main_arena)
set_non_main_arena (victim);
check_malloced_chunk (av, victim, nb);
#if USE_TCACHE
/* While we're here, if we see other chunks of the same size,
stash them in the tcache. */
size_t tc_idx = csize2tidx (nb);
if (tcache && tc_idx < mp_.tcache_bins)
{
mchunkptr tc_victim;
/* While bin not empty and tcache not full, copy chunks over. */
while (tcache->counts[tc_idx] < mp_.tcache_count
&& (tc_victim = last (bin)) != bin)
{
if (tc_victim != 0)
{
bck = tc_victim->bk;
set_inuse_bit_at_offset (tc_victim, nb);
if (av != &main_arena)
set_non_main_arena (tc_victim);
bin->bk = bck;
bck->fd = bin;
tcache_put (tc_victim, tc_idx);
}
}
}
#endif
void *p = chunk2mem (victim);
alloc_perturb (p, bytes);
return p;
@ -3464,6 +3704,16 @@ _int_malloc (mstate av, size_t bytes)
otherwise need to expand memory to service a "small" request.
*/
#if USE_TCACHE
INTERNAL_SIZE_T tcache_nb = 0;
size_t tc_idx = csize2tidx (nb);
if (tcache && tc_idx < mp_.tcache_bins)
tcache_nb = nb;
int return_cached = 0;
tcache_unsorted_count = 0;
#endif
for (;; )
{
int iters = 0;
@ -3524,10 +3774,26 @@ _int_malloc (mstate av, size_t bytes)
set_inuse_bit_at_offset (victim, size);
if (av != &main_arena)
set_non_main_arena (victim);
#if USE_TCACHE
/* Fill cache first, return to user only if cache fills.
We may return one of these chunks later. */
if (tcache_nb
&& tcache->counts[tc_idx] < mp_.tcache_count)
{
tcache_put (victim, tc_idx);
return_cached = 1;
continue;
}
else
{
#endif
check_malloced_chunk (av, victim, nb);
void *p = chunk2mem (victim);
alloc_perturb (p, bytes);
return p;
#if USE_TCACHE
}
#endif
}
/* place chunk in bin */
@ -3594,11 +3860,31 @@ _int_malloc (mstate av, size_t bytes)
fwd->bk = victim;
bck->fd = victim;
#if USE_TCACHE
/* If we've processed as many chunks as we're allowed while
filling the cache, return one of the cached ones. */
++tcache_unsorted_count;
if (return_cached
&& mp_.tcache_unsorted_limit > 0
&& tcache_unsorted_count > mp_.tcache_unsorted_limit)
{
return tcache_get (tc_idx);
}
#endif
#define MAX_ITERS 10000
if (++iters >= MAX_ITERS)
break;
}
#if USE_TCACHE
/* If all the small chunks we found ended up cached, return one now. */
if (return_cached)
{
return tcache_get (tc_idx);
}
#endif
/*
If a large request, scan through the chunks of current bin in
sorted order to find smallest that fits. Use the skip list for this.
@ -3884,6 +4170,20 @@ _int_free (mstate av, mchunkptr p, int have_lock)
check_inuse_chunk(av, p);
#if USE_TCACHE
{
size_t tc_idx = csize2tidx (size);
if (tcache
&& tc_idx < mp_.tcache_bins
&& tcache->counts[tc_idx] < mp_.tcache_count)
{
tcache_put (p, tc_idx);
return;
}
}
#endif
/*
If eligible, place chunk on a fastbin so it can be found
and used quickly in malloc.
@ -4845,6 +5145,38 @@ do_set_arena_max (size_t value)
return 1;
}
#if USE_TCACHE
static inline int
__always_inline
do_set_tcache_max (size_t value)
{
if (value >= 0 && value <= MAX_TCACHE_SIZE)
{
LIBC_PROBE (memory_tunable_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
mp_.tcache_max_bytes = value;
mp_.tcache_bins = csize2tidx (request2size(value)) + 1;
}
return 1;
}
static inline int
__always_inline
do_set_tcache_count (size_t value)
{
LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count);
mp_.tcache_count = value;
return 1;
}
static inline int
__always_inline
do_set_tcache_unsorted_limit (size_t value)
{
LIBC_PROBE (memory_tunable_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
mp_.tcache_unsorted_limit = value;
return 1;
}
#endif
int
__libc_mallopt (int param_number, int value)