mirror of
https://sourceware.org/git/glibc.git
synced 2025-08-05 19:35:52 +03:00
aarch64: Optimized memset for falkor
The generic memset reads dczid_el0 on every memset. This has a significant impact on falkor for a range of sizes because reading dczid_el0 is slow. The DZP bit in the dczid_el0 register does not change dynamically, so it is safe to read once during program startup. With this patch dczid_el0 is read once during startup and zva_size is cached. This is used to invoke the falkor-specific memset; the generic memset routine remains unchanged. The gains due to this are significant for falkor, with run time reductions as high as 48%. Here's a sample from the falkor tests: Function: memset Variant: walk simple_memset __memset_falkor __memset_generic ===================================================================== length=256, char=0: 139.96 (-698.28%) 9.07 ( 48.26%) 17.53 length=257, char=0: 140.50 (-699.03%) 9.53 ( 45.80%) 17.58 length=258, char=0: 140.96 (-703.95%) 9.58 ( 45.36%) 17.53 length=259, char=0: 141.56 (-705.16%) 9.53 ( 45.79%) 17.58 length=260, char=0: 142.15 (-710.76%) 9.57 ( 45.39%) 17.53 length=261, char=0: 142.50 (-710.39%) 9.53 ( 45.78%) 17.58 length=262, char=0: 142.97 (-715.09%) 9.57 ( 45.42%) 17.54 length=263, char=0: 143.51 (-716.18%) 9.53 ( 45.80%) 17.58 length=264, char=0: 143.93 (-720.55%) 9.58 ( 45.39%) 17.54 length=265, char=0: 144.56 (-722.07%) 9.53 ( 45.80%) 17.59 length=266, char=0: 144.98 (-726.42%) 9.58 ( 45.42%) 17.54 length=267, char=0: 145.53 (-727.53%) 9.53 ( 45.80%) 17.59 length=268, char=0: 146.25 (-731.81%) 9.53 ( 45.79%) 17.58 length=269, char=0: 146.52 (-735.39%) 9.53 ( 45.66%) 17.54 length=270, char=0: 146.97 (-735.81%) 9.53 ( 45.80%) 17.58 length=271, char=0: 147.54 (-741.08%) 9.58 ( 45.38%) 17.54 length=512, char=0: 268.26 (-1307.85%) 12.06 ( 36.71%) 19.05 length=513, char=0: 268.73 (-1273.89%) 13.56 ( 30.68%) 19.56 length=514, char=0: 269.31 (-1276.89%) 13.56 ( 30.68%) 19.56 length=515, char=0: 269.73 (-1279.05%) 13.56 ( 30.68%) 19.56 length=516, char=0: 270.34 (-1282.24%) 13.56 ( 30.67%) 19.56 length=517, char=0: 270.83 (-1284.71%) 13.56 ( 30.66%) 19.56 length=518, char=0: 271.20 (-1286.54%) 13.56 ( 30.67%) 19.56 length=519, char=0: 271.67 (-1288.67%) 13.65 ( 30.24%) 19.56 length=520, char=0: 272.14 (-1291.04%) 13.65 ( 30.22%) 19.56 length=521, char=0: 272.66 (-1293.69%) 13.65 ( 30.23%) 19.56 length=522, char=0: 273.14 (-1296.13%) 13.65 ( 30.20%) 19.56 length=523, char=0: 273.64 (-1298.75%) 13.65 ( 30.23%) 19.56 length=524, char=0: 274.34 (-1302.16%) 13.66 ( 30.20%) 19.57 length=525, char=0: 274.64 (-1297.78%) 13.56 ( 30.99%) 19.65 length=526, char=0: 275.20 (-1300.04%) 13.56 ( 31.01%) 19.66 length=527, char=0: 275.66 (-1302.86%) 13.56 ( 30.99%) 19.65 length=1024, char=0: 524.46 (-2169.75%) 20.12 ( 12.92%) 23.11 length=1025, char=0: 525.14 (-2124.63%) 21.62 ( 8.40%) 23.61 length=1026, char=0: 525.59 (-2125.36%) 21.88 ( 7.37%) 23.62 length=1027, char=0: 525.98 (-2127.14%) 21.62 ( 8.46%) 23.62 length=1028, char=0: 526.68 (-2131.10%) 21.62 ( 8.42%) 23.61 length=1029, char=0: 527.10 (-2131.70%) 21.79 ( 7.73%) 23.62 length=1030, char=0: 527.54 (-2118.51%) 21.62 ( 9.10%) 23.78 length=1031, char=0: 527.98 (-2136.37%) 21.62 ( 8.43%) 23.61 length=1032, char=0: 528.70 (-2139.38%) 21.62 ( 8.43%) 23.61 length=1033, char=0: 529.25 (-2124.37%) 21.62 ( 9.11%) 23.79 length=1034, char=0: 529.48 (-2142.95%) 21.62 ( 8.43%) 23.61 length=1035, char=0: 530.11 (-2145.13%) 21.62 ( 8.44%) 23.61 length=1036, char=0: 530.76 (-2147.10%) 21.79 ( 7.73%) 23.62 length=1037, char=0: 531.03 (-2149.45%) 21.62 ( 8.42%) 23.61 length=1038, char=0: 531.64 (-2151.87%) 21.62 ( 8.42%) 23.61 length=1039, char=0: 531.99 (-2151.63%) 21.80 ( 7.75%) 23.63 * sysdeps/aarch64/memset-reg.h: New file. * sysdeps/aarch64/memset.S: Use it. (__memset): Rename to MEMSET macro. [ZVA_MACRO]: Use zva_macro. * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memset_generic and memset_falkor. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add memset ifuncs. * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New local variable zva_size. * sysdeps/aarch64/multiarch/memset.c: New file. * sysdeps/aarch64/multiarch/memset_generic.S: New file. * sysdeps/aarch64/multiarch/memset_falkor.S: New file. * sysdeps/aarch64/multiarch/rtld-memset.S: New file. * sysdeps/unix/sysv/linux/aarch64/cpu-features.c (DCZID_DZP_MASK): New macro. (DCZID_BS_MASK): Likewise. (init_cpu_features): Read and set zva_size. * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (struct cpu_features): New member zva_size.
This commit is contained in:
21
ChangeLog
21
ChangeLog
@@ -1,5 +1,26 @@
|
||||
2017-11-20 Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
|
||||
* sysdeps/aarch64/memset-reg.h: New file.
|
||||
* sysdeps/aarch64/memset.S: Use it.
|
||||
(__memset): Rename to MEMSET macro.
|
||||
[ZVA_MACRO]: Use zva_macro.
|
||||
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
|
||||
Add memset_generic and memset_falkor.
|
||||
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||
(__libc_ifunc_impl_list): Add memset ifuncs.
|
||||
* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
|
||||
local variable zva_size.
|
||||
* sysdeps/aarch64/multiarch/memset.c: New file.
|
||||
* sysdeps/aarch64/multiarch/memset_generic.S: New file.
|
||||
* sysdeps/aarch64/multiarch/memset_falkor.S: New file.
|
||||
* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
|
||||
* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
|
||||
(DCZID_DZP_MASK): New macro.
|
||||
(DCZID_BS_MASK): Likewise.
|
||||
(init_cpu_features): Read and set zva_size.
|
||||
* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
|
||||
(struct cpu_features): New member zva_size.
|
||||
|
||||
* benchtests/bench-memcpy-walk.c (START_SIZE): Set to 128.
|
||||
* benchtests/bench-memmove-walk.c (START_SIZE): Likewise.
|
||||
* benchtests/bench-memset-walk.c (START_SIZE): Likewise.
|
||||
|
30
sysdeps/aarch64/memset-reg.h
Normal file
30
sysdeps/aarch64/memset-reg.h
Normal file
@@ -0,0 +1,30 @@
|
||||
/* Register aliases for memset to be used across implementations.
|
||||
Copyright (C) 2017 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#define dstin x0
|
||||
#define val x1
|
||||
#define valw w1
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define dstend x4
|
||||
#define tmp1 x5
|
||||
#define tmp1w w5
|
||||
#define tmp2 x6
|
||||
#define tmp2w w6
|
||||
#define zva_len x7
|
||||
#define zva_lenw w7
|
@@ -17,6 +17,7 @@
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include "memset-reg.h"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
@@ -24,20 +25,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define val x1
|
||||
#define valw w1
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define dstend x4
|
||||
#define tmp1 x5
|
||||
#define tmp1w w5
|
||||
#define tmp2 x6
|
||||
#define tmp2w w6
|
||||
#define zva_len x7
|
||||
#define zva_lenw w7
|
||||
|
||||
ENTRY_ALIGN (__memset, 6)
|
||||
ENTRY_ALIGN (MEMSET, 6)
|
||||
|
||||
DELOUSE (0)
|
||||
DELOUSE (2)
|
||||
@@ -108,8 +96,11 @@ L(tail64):
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
L(try_zva):
|
||||
#ifdef ZVA_MACRO
|
||||
zva_macro
|
||||
#else
|
||||
.p2align 3
|
||||
mrs tmp1, dczid_el0
|
||||
tbnz tmp1w, 4, L(no_zva)
|
||||
and tmp1w, tmp1w, 15
|
||||
@@ -189,7 +180,7 @@ L(zva_other):
|
||||
b.hs 3b
|
||||
4: add count, count, zva_len
|
||||
b L(tail64)
|
||||
#endif
|
||||
|
||||
END (__memset)
|
||||
weak_alias (__memset, memset)
|
||||
libc_hidden_builtin_def (memset)
|
||||
END (MEMSET)
|
||||
libc_hidden_builtin_def (MEMSET)
|
||||
|
@@ -1,4 +1,4 @@
|
||||
ifeq ($(subdir),string)
|
||||
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
|
||||
memmove_falkor
|
||||
memmove_falkor memset_generic memset_falkor
|
||||
endif
|
||||
|
@@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
|
||||
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
|
||||
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
|
||||
IFUNC_IMPL (i, name, memset,
|
||||
/* Enable this on non-falkor processors too so that other cores
|
||||
can do a comparative analysis with __memset_generic. */
|
||||
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
|
||||
|
||||
return i;
|
||||
}
|
||||
|
@@ -20,4 +20,6 @@
|
||||
|
||||
#define INIT_ARCH() \
|
||||
uint64_t __attribute__((unused)) midr = \
|
||||
GLRO(dl_aarch64_cpu_features).midr_el1;
|
||||
GLRO(dl_aarch64_cpu_features).midr_el1; \
|
||||
unsigned __attribute__((unused)) zva_size = \
|
||||
GLRO(dl_aarch64_cpu_features).zva_size;
|
||||
|
40
sysdeps/aarch64/multiarch/memset.c
Normal file
40
sysdeps/aarch64/multiarch/memset.c
Normal file
@@ -0,0 +1,40 @@
|
||||
/* Multiple versions of memset. AARCH64 version.
|
||||
Copyright (C) 2017 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Define multiple versions only for the definition in libc. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
/* Redefine memset so that the compiler won't complain about the type
|
||||
mismatch with the IFUNC selector in strong_alias, below. */
|
||||
# undef memset
|
||||
# define memset __redirect_memset
|
||||
# include <string.h>
|
||||
# include <init-arch.h>
|
||||
|
||||
extern __typeof (__redirect_memset) __libc_memset;
|
||||
|
||||
extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
|
||||
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
|
||||
|
||||
libc_ifunc (__libc_memset, (IS_FALKOR (midr) && zva_size == 64
|
||||
? __memset_falkor
|
||||
: __memset_generic));
|
||||
|
||||
# undef memset
|
||||
strong_alias (__libc_memset, memset);
|
||||
#endif
|
53
sysdeps/aarch64/multiarch/memset_falkor.S
Normal file
53
sysdeps/aarch64/multiarch/memset_falkor.S
Normal file
@@ -0,0 +1,53 @@
|
||||
/* Memset for falkor.
|
||||
Copyright (C) 2017 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <memset-reg.h>
|
||||
|
||||
/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
|
||||
resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
|
||||
use this function only when ZVA is enabled. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
.macro zva_macro
|
||||
.p2align 4
|
||||
/* Write the first and last 64 byte aligned block using stp rather
|
||||
than using DC ZVA. This is faster on some cores. */
|
||||
str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
bic dst, dst, 63
|
||||
stp q0, q0, [dst, 64]
|
||||
stp q0, q0, [dst, 96]
|
||||
sub count, dstend, dst /* Count is now 128 too large. */
|
||||
sub count, count, 128+64+64 /* Adjust count and bias for loop. */
|
||||
add dst, dst, 128
|
||||
1: dc zva, dst
|
||||
add dst, dst, 64
|
||||
subs count, count, 64
|
||||
b.hi 1b
|
||||
stp q0, q0, [dst, 0]
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
.endm
|
||||
|
||||
# define ZVA_MACRO zva_macro
|
||||
# define MEMSET __memset_falkor
|
||||
# include <sysdeps/aarch64/memset.S>
|
||||
#endif
|
27
sysdeps/aarch64/multiarch/memset_generic.S
Normal file
27
sysdeps/aarch64/multiarch/memset_generic.S
Normal file
@@ -0,0 +1,27 @@
|
||||
/* Memset for aarch64, default version for internal use.
|
||||
Copyright (C) 2017 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
# define MEMSET __memset_generic
|
||||
/* Add a hidden definition for use within libc.so. */
|
||||
# ifdef SHARED
|
||||
.globl __GI_memset; __GI_memset = __memset_generic
|
||||
# endif
|
||||
# include <sysdeps/aarch64/memset.S>
|
||||
#endif
|
23
sysdeps/aarch64/multiarch/rtld-memset.S
Normal file
23
sysdeps/aarch64/multiarch/rtld-memset.S
Normal file
@@ -0,0 +1,23 @@
|
||||
/* Memset for aarch64, for the dynamic linker.
|
||||
Copyright (C) 2017 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#if IS_IN (rtld)
|
||||
# define MEMSET memset
|
||||
# include <sysdeps/aarch64/memset.S>
|
||||
#endif
|
@@ -20,6 +20,9 @@
|
||||
#include <sys/auxv.h>
|
||||
#include <elf/dl-hwcaps.h>
|
||||
|
||||
#define DCZID_DZP_MASK (1 << 4)
|
||||
#define DCZID_BS_MASK (0xf)
|
||||
|
||||
#if HAVE_TUNABLES
|
||||
struct cpu_list
|
||||
{
|
||||
@@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
}
|
||||
|
||||
cpu_features->midr_el1 = midr;
|
||||
|
||||
/* Check if ZVA is enabled. */
|
||||
unsigned dczid;
|
||||
asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
|
||||
|
||||
if ((dczid & DCZID_DZP_MASK) == 0)
|
||||
cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
|
||||
}
|
||||
|
@@ -47,6 +47,7 @@
|
||||
struct cpu_features
|
||||
{
|
||||
uint64_t midr_el1;
|
||||
unsigned zva_size;
|
||||
};
|
||||
|
||||
#endif /* _CPU_FEATURES_AARCH64_H */
|
||||
|
Reference in New Issue
Block a user