1
0
mirror of https://sourceware.org/git/glibc.git synced 2025-08-05 19:35:52 +03:00

aarch64: Optimized memset for falkor

The generic memset reads dczid_el0 on every memset.  This has a
significant impact on falkor for a range of sizes because reading
dczid_el0 is slow.

The DZP bit in the dczid_el0 register does not change dynamically, so
it is safe to read once during program startup.  With this patch
dczid_el0 is read once during startup and zva_size is cached.  This is
used to invoke the falkor-specific memset; the generic memset routine
remains unchanged.

The gains due to this are significant for falkor, with run time
reductions as high as 48%.  Here's a sample from the falkor tests:

Function: memset
Variant: walk
                      simple_memset	__memset_falkor	__memset_generic
=====================================================================
length=256, char=0:   139.96 (-698.28%)	   9.07 ( 48.26%)  17.53
length=257, char=0:   140.50 (-699.03%)	   9.53 ( 45.80%)  17.58
length=258, char=0:   140.96 (-703.95%)	   9.58 ( 45.36%)  17.53
length=259, char=0:   141.56 (-705.16%)	   9.53 ( 45.79%)  17.58
length=260, char=0:   142.15 (-710.76%)	   9.57 ( 45.39%)  17.53
length=261, char=0:   142.50 (-710.39%)	   9.53 ( 45.78%)  17.58
length=262, char=0:   142.97 (-715.09%)	   9.57 ( 45.42%)  17.54
length=263, char=0:   143.51 (-716.18%)	   9.53 ( 45.80%)  17.58
length=264, char=0:   143.93 (-720.55%)	   9.58 ( 45.39%)  17.54
length=265, char=0:   144.56 (-722.07%)	   9.53 ( 45.80%)  17.59
length=266, char=0:   144.98 (-726.42%)	   9.58 ( 45.42%)  17.54
length=267, char=0:   145.53 (-727.53%)	   9.53 ( 45.80%)  17.59
length=268, char=0:   146.25 (-731.81%)	   9.53 ( 45.79%)  17.58
length=269, char=0:   146.52 (-735.39%)	   9.53 ( 45.66%)  17.54
length=270, char=0:   146.97 (-735.81%)	   9.53 ( 45.80%)  17.58
length=271, char=0:   147.54 (-741.08%)	   9.58 ( 45.38%)  17.54
length=512, char=0:   268.26 (-1307.85%)  12.06 ( 36.71%)  19.05
length=513, char=0:   268.73 (-1273.89%)  13.56 ( 30.68%)  19.56
length=514, char=0:   269.31 (-1276.89%)  13.56 ( 30.68%)  19.56
length=515, char=0:   269.73 (-1279.05%)  13.56 ( 30.68%)  19.56
length=516, char=0:   270.34 (-1282.24%)  13.56 ( 30.67%)  19.56
length=517, char=0:   270.83 (-1284.71%)  13.56 ( 30.66%)  19.56
length=518, char=0:   271.20 (-1286.54%)  13.56 ( 30.67%)  19.56
length=519, char=0:   271.67 (-1288.67%)  13.65 ( 30.24%)  19.56
length=520, char=0:   272.14 (-1291.04%)  13.65 ( 30.22%)  19.56
length=521, char=0:   272.66 (-1293.69%)  13.65 ( 30.23%)  19.56
length=522, char=0:   273.14 (-1296.13%)  13.65 ( 30.20%)  19.56
length=523, char=0:   273.64 (-1298.75%)  13.65 ( 30.23%)  19.56
length=524, char=0:   274.34 (-1302.16%)  13.66 ( 30.20%)  19.57
length=525, char=0:   274.64 (-1297.78%)  13.56 ( 30.99%)  19.65
length=526, char=0:   275.20 (-1300.04%)  13.56 ( 31.01%)  19.66
length=527, char=0:   275.66 (-1302.86%)  13.56 ( 30.99%)  19.65
length=1024, char=0:  524.46 (-2169.75%)  20.12 ( 12.92%)  23.11
length=1025, char=0:  525.14 (-2124.63%)  21.62 (  8.40%)  23.61
length=1026, char=0:  525.59 (-2125.36%)  21.88 (  7.37%)  23.62
length=1027, char=0:  525.98 (-2127.14%)  21.62 (  8.46%)  23.62
length=1028, char=0:  526.68 (-2131.10%)  21.62 (  8.42%)  23.61
length=1029, char=0:  527.10 (-2131.70%)  21.79 (  7.73%)  23.62
length=1030, char=0:  527.54 (-2118.51%)  21.62 (  9.10%)  23.78
length=1031, char=0:  527.98 (-2136.37%)  21.62 (  8.43%)  23.61
length=1032, char=0:  528.70 (-2139.38%)  21.62 (  8.43%)  23.61
length=1033, char=0:  529.25 (-2124.37%)  21.62 (  9.11%)  23.79
length=1034, char=0:  529.48 (-2142.95%)  21.62 (  8.43%)  23.61
length=1035, char=0:  530.11 (-2145.13%)  21.62 (  8.44%)  23.61
length=1036, char=0:  530.76 (-2147.10%)  21.79 (  7.73%)  23.62
length=1037, char=0:  531.03 (-2149.45%)  21.62 (  8.42%)  23.61
length=1038, char=0:  531.64 (-2151.87%)  21.62 (  8.42%)  23.61
length=1039, char=0:  531.99 (-2151.63%)  21.80 (  7.75%)  23.63

	* sysdeps/aarch64/memset-reg.h: New file.
	* sysdeps/aarch64/memset.S: Use it.
	(__memset): Rename to MEMSET macro.
	[ZVA_MACRO]: Use zva_macro.
	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memset_generic and memset_falkor.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add memset ifuncs.
	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
	local variable zva_size.
	* sysdeps/aarch64/multiarch/memset.c: New file.
	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
	* sysdeps/aarch64/multiarch/memset_falkor.S: New file.
	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	(DCZID_DZP_MASK): New macro.
	(DCZID_BS_MASK): Likewise.
	(init_cpu_features): Read and set zva_size.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	(struct cpu_features): New member zva_size.
This commit is contained in:
Siddhesh Poyarekar
2017-11-20 18:25:04 +05:30
parent eb332f9feb
commit 5a67c4fa01
12 changed files with 225 additions and 22 deletions

View File

@@ -1,5 +1,26 @@
2017-11-20 Siddhesh Poyarekar <siddhesh@sourceware.org> 2017-11-20 Siddhesh Poyarekar <siddhesh@sourceware.org>
* sysdeps/aarch64/memset-reg.h: New file.
* sysdeps/aarch64/memset.S: Use it.
(__memset): Rename to MEMSET macro.
[ZVA_MACRO]: Use zva_macro.
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
Add memset_generic and memset_falkor.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add memset ifuncs.
* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
local variable zva_size.
* sysdeps/aarch64/multiarch/memset.c: New file.
* sysdeps/aarch64/multiarch/memset_generic.S: New file.
* sysdeps/aarch64/multiarch/memset_falkor.S: New file.
* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
(DCZID_DZP_MASK): New macro.
(DCZID_BS_MASK): Likewise.
(init_cpu_features): Read and set zva_size.
* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
(struct cpu_features): New member zva_size.
* benchtests/bench-memcpy-walk.c (START_SIZE): Set to 128. * benchtests/bench-memcpy-walk.c (START_SIZE): Set to 128.
* benchtests/bench-memmove-walk.c (START_SIZE): Likewise. * benchtests/bench-memmove-walk.c (START_SIZE): Likewise.
* benchtests/bench-memset-walk.c (START_SIZE): Likewise. * benchtests/bench-memset-walk.c (START_SIZE): Likewise.

View File

@@ -0,0 +1,30 @@
/* Register aliases for memset to be used across implementations.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend x4
#define tmp1 x5
#define tmp1w w5
#define tmp2 x6
#define tmp2w w6
#define zva_len x7
#define zva_lenw w7

View File

@@ -17,6 +17,7 @@
<http://www.gnu.org/licenses/>. */ <http://www.gnu.org/licenses/>. */
#include <sysdep.h> #include <sysdep.h>
#include "memset-reg.h"
/* Assumptions: /* Assumptions:
* *
@@ -24,20 +25,7 @@
* *
*/ */
#define dstin x0 ENTRY_ALIGN (MEMSET, 6)
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend x4
#define tmp1 x5
#define tmp1w w5
#define tmp2 x6
#define tmp2w w6
#define zva_len x7
#define zva_lenw w7
ENTRY_ALIGN (__memset, 6)
DELOUSE (0) DELOUSE (0)
DELOUSE (2) DELOUSE (2)
@@ -108,8 +96,11 @@ L(tail64):
stp q0, q0, [dstend, -32] stp q0, q0, [dstend, -32]
ret ret
.p2align 3
L(try_zva): L(try_zva):
#ifdef ZVA_MACRO
zva_macro
#else
.p2align 3
mrs tmp1, dczid_el0 mrs tmp1, dczid_el0
tbnz tmp1w, 4, L(no_zva) tbnz tmp1w, 4, L(no_zva)
and tmp1w, tmp1w, 15 and tmp1w, tmp1w, 15
@@ -189,7 +180,7 @@ L(zva_other):
b.hs 3b b.hs 3b
4: add count, count, zva_len 4: add count, count, zva_len
b L(tail64) b L(tail64)
#endif
END (__memset) END (MEMSET)
weak_alias (__memset, memset) libc_hidden_builtin_def (MEMSET)
libc_hidden_builtin_def (memset)

View File

@@ -1,4 +1,4 @@
ifeq ($(subdir),string) ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
memmove_falkor memmove_falkor memset_generic memset_falkor
endif endif

View File

@@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
/* Enable this on non-falkor processors too so that other cores
can do a comparative analysis with __memset_generic. */
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
return i; return i;
} }

View File

@@ -18,6 +18,8 @@
#include <ldsodefs.h> #include <ldsodefs.h>
#define INIT_ARCH() \ #define INIT_ARCH() \
uint64_t __attribute__((unused)) midr = \ uint64_t __attribute__((unused)) midr = \
GLRO(dl_aarch64_cpu_features).midr_el1; GLRO(dl_aarch64_cpu_features).midr_el1; \
unsigned __attribute__((unused)) zva_size = \
GLRO(dl_aarch64_cpu_features).zva_size;

View File

@@ -0,0 +1,40 @@
/* Multiple versions of memset. AARCH64 version.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Define multiple versions only for the definition in libc. */
#if IS_IN (libc)
/* Redefine memset so that the compiler won't complain about the type
mismatch with the IFUNC selector in strong_alias, below. */
# undef memset
# define memset __redirect_memset
# include <string.h>
# include <init-arch.h>
extern __typeof (__redirect_memset) __libc_memset;
extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
libc_ifunc (__libc_memset, (IS_FALKOR (midr) && zva_size == 64
? __memset_falkor
: __memset_generic));
# undef memset
strong_alias (__libc_memset, memset);
#endif

View File

@@ -0,0 +1,53 @@
/* Memset for falkor.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <memset-reg.h>
/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
use this function only when ZVA is enabled. */
#if IS_IN (libc)
.macro zva_macro
.p2align 4
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA. This is faster on some cores. */
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
sub count, dstend, dst /* Count is now 128 too large. */
sub count, count, 128+64+64 /* Adjust count and bias for loop. */
add dst, dst, 128
1: dc zva, dst
add dst, dst, 64
subs count, count, 64
b.hi 1b
stp q0, q0, [dst, 0]
stp q0, q0, [dst, 32]
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
.endm
# define ZVA_MACRO zva_macro
# define MEMSET __memset_falkor
# include <sysdeps/aarch64/memset.S>
#endif

View File

@@ -0,0 +1,27 @@
/* Memset for aarch64, default version for internal use.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# define MEMSET __memset_generic
/* Add a hidden definition for use within libc.so. */
# ifdef SHARED
.globl __GI_memset; __GI_memset = __memset_generic
# endif
# include <sysdeps/aarch64/memset.S>
#endif

View File

@@ -0,0 +1,23 @@
/* Memset for aarch64, for the dynamic linker.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#if IS_IN (rtld)
# define MEMSET memset
# include <sysdeps/aarch64/memset.S>
#endif

View File

@@ -20,6 +20,9 @@
#include <sys/auxv.h> #include <sys/auxv.h>
#include <elf/dl-hwcaps.h> #include <elf/dl-hwcaps.h>
#define DCZID_DZP_MASK (1 << 4)
#define DCZID_BS_MASK (0xf)
#if HAVE_TUNABLES #if HAVE_TUNABLES
struct cpu_list struct cpu_list
{ {
@@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
} }
cpu_features->midr_el1 = midr; cpu_features->midr_el1 = midr;
/* Check if ZVA is enabled. */
unsigned dczid;
asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
if ((dczid & DCZID_DZP_MASK) == 0)
cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
} }

View File

@@ -47,6 +47,7 @@
struct cpu_features struct cpu_features
{ {
uint64_t midr_el1; uint64_t midr_el1;
unsigned zva_size;
}; };
#endif /* _CPU_FEATURES_AARCH64_H */ #endif /* _CPU_FEATURES_AARCH64_H */