
nptl: Invert the mmap/mprotect logic on allocated stacks (BZ#18988)

The current allocate_stack logic for creating stacks is to first mmap all
the required memory with the desired protection flags and then mprotect
the guard area with PROT_NONE if required.  Although it works as expected,
it pessimizes the allocation because it requires the kernel to actually
increase the commit charge (which counts against the physical/swap
memory available to the system).
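
In sketch form (this is not the actual allocate_stack code; the sizes are
made up and the guard is placed at the start of the block, as on
downward-growing stacks), the two orderings are:

--
#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#define STACK_SIZE (1 << 20)   /* Hypothetical stack size.  */
#define GUARD_SIZE (1 << 16)   /* Hypothetical guard size.  */

/* Old order: map the whole block read/write, then turn the guard page
   into PROT_NONE.  The initial mapping already counts against the
   kernel's commit charge.  */
static void *
allocate_old_order (void)
{
  void *mem = mmap (NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
  if (mem == MAP_FAILED)
    return NULL;
  if (mprotect (mem, GUARD_SIZE, PROT_NONE) != 0)
    {
      munmap (mem, STACK_SIZE);
      return NULL;
    }
  return mem;
}

/* New order: reserve everything with PROT_NONE, which does not add to
   the commit charge, then enable access only on the usable part
   outside the guard page.  */
static void *
allocate_new_order (void)
{
  void *mem = mmap (NULL, STACK_SIZE, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
  if (mem == MAP_FAILED)
    return NULL;
  if (mprotect ((char *) mem + GUARD_SIZE, STACK_SIZE - GUARD_SIZE,
                PROT_READ | PROT_WRITE) != 0)
    {
      munmap (mem, STACK_SIZE);
      return NULL;
    }
  return mem;
}

int
main (void)
{
  return (allocate_old_order () != NULL
          && allocate_new_order () != NULL) ? 0 : 1;
}
--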

The only issue is how to actually test this change, since the side effects
are really Linux specific and accounting for them would require a
kernel-specific test that parses system-wide information.  On the kernel
I checked, /proc/self/statm does not show any meaningful difference in
vmm and/or rss before and after thread creation.  I could only see
meaningful information by checking the system-wide /proc/meminfo
between thread creations: MemFree, MemAvailable, and Committed_AS show
a large difference without the patch.  I think trying to use this kind
of information in a testcase is fragile.
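
For reference, the kind of system-wide parsing such a test would need
looks roughly like the following (reading Committed_AS from
/proc/meminfo; illustrative only, not part of the patch):

--
#include <stdio.h>

/* Return Committed_AS from /proc/meminfo in kB, or -1 on failure.
   Illustrative only; the patch deliberately does not rely on this in a
   testcase because it is fragile.  */
static long
read_committed_as (void)
{
  FILE *f = fopen ("/proc/meminfo", "r");
  if (f == NULL)
    return -1;
  char line[256];
  long kb = -1;
  while (fgets (line, sizeof line, f) != NULL)
    if (sscanf (line, "Committed_AS: %ld kB", &kb) == 1)
      break;
  fclose (f);
  return kb;
}

int
main (void)
{
  printf ("Committed_AS: %ld kB\n", read_committed_as ());
  return 0;
}
--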

The BZ#18988 report shows that the committed pages are easily seen with
mlockall (MCL_FUTURE) (which locks all pages that become mapped in the
process); a sketch of that scenario follows the timings below.  However,
a more straightforward testcase shows that pthread_create can be faster
with this patch:

--
#include <assert.h>
#include <pthread.h>
#include <stddef.h>

static pthread_attr_t a;

static const int inner_count = 256;
static const int outer_count = 128;

static
void *thread1(void *arg)
{
  return NULL;
}

static
void *sleeper(void *arg)
{
  pthread_t ts[inner_count];
  for (int i = 0; i < inner_count; i++)
    pthread_create (&ts[i], &a, thread1, NULL);
  for (int i = 0; i < inner_count; i++)
    pthread_join (ts[i], NULL);

  return NULL;
}

int main(void)
{
  pthread_attr_init(&a);
  pthread_attr_setguardsize(&a, 1<<20);
  pthread_attr_setstacksize(&a, 1134592);

  pthread_t ts[outer_count];
  for (int i = 0; i < outer_count; i++)
    {
      int r = pthread_create(&ts[i], &a, sleeper, NULL);
      assert(r == 0);
    }
  for (int i = 0; i < outer_count; i++)
    pthread_join(ts[i], NULL);

  return 0;
}

--

On x86_64 (4.4.0-45-generic, gcc 5.4.0), running the small benchmark
above I see:

$ time ./test

real	0m3.647s
user	0m0.080s
sys	0m11.836s

While with the patch I see:

$ time ./test

real	0m0.696s
user	0m0.040s
sys	0m1.152s
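
Going back to the mlockall (MCL_FUTURE) point, a hypothetical sketch of
that scenario (sizes made up, not the reporter's exact testcase) looks
like:

--
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* Hypothetical sketch of the BZ#18988 scenario: with MCL_FUTURE in
   effect, pages of future mappings are locked as they become mapped,
   so thread stacks mapped read/write up front are committed and locked
   right away.  */

static void *
thread_fn (void *arg)
{
  return NULL;
}

int
main (void)
{
  if (mlockall (MCL_CURRENT | MCL_FUTURE) != 0)
    {
      /* Typically needs CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK.  */
      perror ("mlockall");
      return 1;
    }

  pthread_attr_t attr;
  pthread_attr_init (&attr);
  pthread_attr_setguardsize (&attr, 1 << 20);   /* Hypothetical sizes.  */
  pthread_attr_setstacksize (&attr, 1 << 21);

  pthread_t t;
  int r = pthread_create (&t, &attr, thread_fn, NULL);
  if (r != 0)
    {
      fprintf (stderr, "pthread_create: %s\n", strerror (r));
      return 1;
    }
  pthread_join (t, NULL);
  return 0;
}
--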

So I added a pthread_create benchtest (thread_create) which checks
thread creation latency.  As with the simple benchmark above, I saw
improvements in thread creation on all architectures where I tested
the change.
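
As a rough standalone approximation (not the actual thread_create
benchtest sources, which use the benchtests framework), the quantity
being measured is essentially the wall-clock cost of a
pthread_create/pthread_join pair:

--
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static void *
noop (void *arg)
{
  return NULL;
}

int
main (void)
{
  enum { iters = 1000 };
  struct timespec start, end;

  clock_gettime (CLOCK_MONOTONIC, &start);
  for (int i = 0; i < iters; i++)
    {
      pthread_t t;
      pthread_create (&t, NULL, noop, NULL);
      pthread_join (t, NULL);
    }
  clock_gettime (CLOCK_MONOTONIC, &end);

  double ns = (end.tv_sec - start.tv_sec) * 1e9
              + (end.tv_nsec - start.tv_nsec);
  printf ("pthread_create+join: %.0f ns/iteration\n", ns / iters);
  return 0;
}
--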

Checked on x86_64-linux-gnu, i686-linux-gnu, aarch64-linux-gnu,
arm-linux-gnueabihf, powerpc64le-linux-gnu, sparc64-linux-gnu,
and sparcv9-linux-gnu.

	[BZ #18988]
	* benchtests/thread_create-inputs: New file.
	* benchtests/thread_create-source.c: Likewise.
	* support/xpthread_attr_setguardsize.c: Likewise.
	* support/Makefile (libsupport-routines): Add
	xpthread_attr_setguardsize object.
	* support/xthread.h: Add xpthread_attr_setguardsize prototype.
	* benchtests/Makefile (bench-pthread): Add thread_create.
	* nptl/allocatestack.c (allocate_stack): Call mmap with PROT_NONE and
	then mprotect the required area.
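
The new support/xpthread_attr_setguardsize.c wrapper presumably follows
the usual xpthread_* pattern of failing the test on error; a sketch,
not necessarily the exact source:

--
/* Sketch only, assuming the xpthread_* convention of reporting any
   non-zero return through xpthread_check_return.  */
#include <support/xthread.h>

void
xpthread_attr_setguardsize (pthread_attr_t *attr, size_t guardsize)
{
  xpthread_check_return ("pthread_attr_setguardsize",
                         pthread_attr_setguardsize (attr, guardsize));
}
--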

Author: Adhemerval Zanella
Date: 2017-01-31 18:01:59 -02:00
Commit: 0edbf12301 (parent 5c3e322d3b)
8 changed files with 175 additions and 9 deletions

@@ -334,6 +334,43 @@ change_stack_perm (struct pthread *pd
  return 0;
}

/* Return the guard page position on allocated stack.  */
static inline char *
__attribute ((always_inline))
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
                size_t pagesize_m1)
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  return mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
  return mem;
#elif _STACK_GROWS_UP
  return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
}

/* Based on stack allocated with PROT_NONE, setup the required portions with
   'prot' flags based on the guard page position.  */
static inline int
setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
                  const int prot)
{
  char *guardend = guard + guardsize;
#if _STACK_GROWS_DOWN
  /* As defined at guard_position, for architectures with downward stack
     the guard page is always at start of the allocated area.  */
  if (mprotect (guardend, size - guardsize, prot) != 0)
    return errno;
#else
  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
  if (mprotect (mem, mprots1, prot) != 0)
    return errno;
  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
  if (mprotect (guardend, mprots2, prot) != 0)
    return errno;
#endif
  return 0;
}

/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
@@ -490,7 +527,10 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
            size += pagesize_m1 + 1;
#endif

          mem = mmap (NULL, size, prot,
          /* If a guard page is required, avoid committing memory by first
             allocate with PROT_NONE and then reserve with required permission
             excluding the guard page.  */
          mem = mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__glibc_unlikely (mem == MAP_FAILED))
@@ -510,9 +550,24 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                                   - TLS_PRE_TCB_SIZE);
#endif

          /* Now mprotect the required region excluding the guard area.  */
          if (__glibc_likely (guardsize > 0))
            {
              char *guard = guard_position (mem, size, guardsize, pd,
                                            pagesize_m1);
              if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
                {
                  munmap (mem, size);
                  return errno;
                }
            }

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;
          /* Update guardsize for newly allocated guardsize to avoid
             an mprotect in guard resize below.  */
          pd->guardsize = guardsize;

          /* We allocated the first block thread-specific data array.
             This address will not change for the lifetime of this
@@ -593,13 +648,8 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
  /* Create or resize the guard area if necessary.  */
  if (__glibc_unlikely (guardsize > pd->guardsize))
    {
#ifdef NEED_SEPARATE_REGISTER_STACK
      char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
      char *guard = mem;
#elif _STACK_GROWS_UP
      char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
      char *guard = guard_position (mem, size, guardsize, pd,
                                    pagesize_m1);
      if (mprotect (guard, guardsize, PROT_NONE) != 0)
        {
        mprot_error: