
[x86] Add a feature bit: Fast_Unaligned_Copy

On AMD processors, memcpy optimized with unaligned SSE load is
slower than memcpy optimized with aligned SSSE3, while other string
functions are faster with unaligned SSE load.  A feature bit,
Fast_Unaligned_Copy, is added to select memcpy optimized with
unaligned SSE load.

	[BZ #19583]
	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
	Fast_Unaligned_Copy with Fast_Unaligned_Load for Intel
	processors.  Set Fast_Copy_Backward for AMD Excavator
	processors.
	* sysdeps/x86/cpu-features.h (bit_arch_Fast_Unaligned_Copy):
	New.
	(index_arch_Fast_Unaligned_Copy): Likewise.
	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check
	Fast_Unaligned_Copy instead of Fast_Unaligned_Load.
Author: H.J. Lu
Date:   2016-03-28 04:39:48 -07:00
Commit: e41b395523 (parent b66d837bb5)
4 changed files with 31 additions and 2 deletions
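Before the diff itself, a minimal standalone sketch of the idea the commit message describes: CPU detection sets per-property bits once, and memcpy dispatch keys off a copy-specific bit so it can diverge from the other string functions. All names and the stub variants below are illustrative, not glibc's real plumbing.

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative feature bits (not glibc's real definitions).  */
#define FAST_UNALIGNED_LOAD (1u << 0)
#define FAST_UNALIGNED_COPY (1u << 1)   /* the bit this commit adds */

static uint32_t feature_word;

/* Stubs standing in for the SSE2-unaligned and SSSE3 memcpy variants.  */
static const char *memcpy_sse2_unaligned (void) { return "sse2_unaligned"; }
static const char *memcpy_ssse3 (void)          { return "ssse3"; }

static void detect_cpu (int is_amd)
{
  /* Every vendor keeps Fast_Unaligned_Load for the other string
     functions; the copy-specific bit is withheld where unaligned SSE
     loads make memcpy slower (AMD, per the commit message).  */
  feature_word |= FAST_UNALIGNED_LOAD;
  if (!is_amd)
    feature_word |= FAST_UNALIGNED_COPY;
}

int main (void)
{
  detect_cpu (/* is_amd = */ 1);
  /* memcpy dispatch now tests the copy bit, not the load bit.  */
  puts ((feature_word & FAST_UNALIGNED_COPY)
        ? memcpy_sse2_unaligned () : memcpy_ssse3 ());
  return 0;
}
```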

ChangeLog

@@ -1,3 +1,17 @@
+2016-03-28  H.J. Lu  <hongjiu.lu@intel.com>
+	    Amit Pawar  <Amit.Pawar@amd.com>
+
+	[BZ #19583]
+	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
+	Fast_Unaligned_Copy with Fast_Unaligned_Load for Intel
+	processors.  Set Fast_Copy_Backward for AMD Excavator
+	processors.
+	* sysdeps/x86/cpu-features.h (bit_arch_Fast_Unaligned_Copy):
+	New.
+	(index_arch_Fast_Unaligned_Copy): Likewise.
+	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check
+	Fast_Unaligned_Copy instead of Fast_Unaligned_Load.
+
 2016-03-25  Florian Weimer  <fweimer@redhat.com>
 
 	[BZ #19791]

sysdeps/x86/cpu-features.c

@@ -152,9 +152,13 @@ init_cpu_features (struct cpu_features *cpu_features)
 #endif
 #if index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
 # error index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
+#endif
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
 #endif
 	      cpu_features->feature[index_arch_Fast_Unaligned_Load]
 		|= (bit_arch_Fast_Unaligned_Load
+		    | bit_arch_Fast_Unaligned_Copy
 		    | bit_arch_Prefer_PMINUB_for_stringop
 		    | bit_arch_Slow_SSE4_2);
 	      break;
@@ -182,11 +186,15 @@ init_cpu_features (struct cpu_features *cpu_features)
 #endif
 #if index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
 # error index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
+#endif
+#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
 #endif
 	  cpu_features->feature[index_arch_Fast_Rep_String]
 	    |= (bit_arch_Fast_Rep_String
 		| bit_arch_Fast_Copy_Backward
 		| bit_arch_Fast_Unaligned_Load
+		| bit_arch_Fast_Unaligned_Copy
 		| bit_arch_Prefer_PMINUB_for_stringop);
 	  break;
 	}
@@ -220,10 +228,14 @@ init_cpu_features (struct cpu_features *cpu_features)
 	  if (family == 0x15)
 	    {
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+#endif
 	      /* "Excavator"   */
 	      if (model >= 0x60 && model <= 0x7f)
 		cpu_features->feature[index_arch_Fast_Unaligned_Load]
-		  |= bit_arch_Fast_Unaligned_Load;
+		  |= (bit_arch_Fast_Unaligned_Load
+		      | bit_arch_Fast_Copy_Backward);
 	    }
 	}
       else
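The new `#if index_arch_… != index_arch_… # error` guards exist because the ORed bits must live in the same element of the feature array for the single combined `|=` store to be correct. A reduced standalone rendering of the pattern, with the same shape as the header's definitions but arbitrary bit positions:

```c
/* Reduced sketch of the compile-time guard pattern; standalone, not
   glibc's actual definitions.  index_arch_* names the array word a
   feature lives in, bit_arch_* names its bit within that word.  */
#define FEATURE_INDEX_1 0

#define index_arch_Fast_Unaligned_Load FEATURE_INDEX_1
#define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1

#define bit_arch_Fast_Unaligned_Load (1u << 0)   /* arbitrary for the sketch */
#define bit_arch_Fast_Unaligned_Copy (1u << 1)

/* If one feature ever moves to a different array word, the combined
   store below would silently set the wrong word; fail the build
   instead.  */
#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
#endif

static unsigned int feature[1];

static void set_intel_bits (void)
{
  /* One store flips both bits; valid only because both indexes agree.  */
  feature[index_arch_Fast_Unaligned_Load]
    |= (bit_arch_Fast_Unaligned_Load | bit_arch_Fast_Unaligned_Copy);
}
```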

sysdeps/x86/cpu-features.h

@@ -35,6 +35,7 @@
 #define bit_arch_I686			(1 << 15)
 #define bit_arch_Prefer_MAP_32BIT_EXEC	(1 << 16)
 #define bit_arch_Prefer_No_VZEROUPPER	(1 << 17)
+#define bit_arch_Fast_Unaligned_Copy	(1 << 18)
 
 /* CPUID Feature flags.  */

@@ -101,6 +102,7 @@
 # define index_arch_I686		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1*FEATURE_SIZE
 
 # if defined (_LIBC) && !IS_IN (nonlib)

@@ -265,6 +267,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_I686		FEATURE_INDEX_1
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Fast_Unaligned_Copy	FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
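Note that each index appears twice in this header: once multiplied by FEATURE_SIZE for assembler consumers, which need a byte offset into the feature array, and once as a plain element index for C. A simplified sketch of how the bit/index pair is then consumed on the C side; glibc's real accessor is the HAS_ARCH_FEATURE macro, and the plumbing below is a stand-in:

```c
#include <stdio.h>

/* Stand-in for glibc's cpu_features plumbing.  */
enum { FEATURE_INDEX_1 = 0, FEATURE_INDEX_MAX };

struct cpu_features { unsigned int feature[FEATURE_INDEX_MAX]; };

#define bit_arch_Fast_Unaligned_Copy   (1 << 18)
#define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1

/* The bit_arch_/index_arch_ naming convention lets a single macro
   paste together both halves of the lookup from one feature name.  */
#define HAS_ARCH_FEATURE_P(cf, name) \
  (((cf)->feature[index_arch_##name] & bit_arch_##name) != 0)

int main (void)
{
  struct cpu_features cf = { { 0 } };
  cf.feature[index_arch_Fast_Unaligned_Copy] |= bit_arch_Fast_Unaligned_Copy;
  printf ("Fast_Unaligned_Copy: %d\n",
          HAS_ARCH_FEATURE_P (&cf, Fast_Unaligned_Copy));
  return 0;
}
```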

sysdeps/x86_64/multiarch/memcpy.S

@@ -42,7 +42,7 @@ ENTRY(__new_memcpy)
 	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jnz	2f
 	lea	__memcpy_sse2_unaligned(%rip), %RAX_LP
-	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
 	jnz	2f
 	lea	__memcpy_sse2(%rip), %RAX_LP
 	HAS_CPU_FEATURE (SSSE3)
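For readability, a C paraphrase of the selector above. The AVX `lea` and the SSSE3 branch tail fall outside the hunk, so those two arms are inferred from the visible pattern rather than quoted:

```c
#include <stdio.h>

/* Feature flags corresponding to the checks in __new_memcpy.  */
struct features {
  int avx_fast_unaligned_load;
  int fast_unaligned_copy;   /* this commit: was fast_unaligned_load */
  int ssse3;
};

/* Same order as the assembly: each branch corresponds to one
   `lea __memcpy_*` / feature-test pair.  */
static const char *select_memcpy (const struct features *f)
{
  if (f->avx_fast_unaligned_load)
    return "__memcpy_avx_unaligned";   /* lea above the hunk; inferred */
  if (f->fast_unaligned_copy)
    return "__memcpy_sse2_unaligned";
  if (!f->ssse3)                       /* tail below the hunk; inferred */
    return "__memcpy_sse2";
  return "__memcpy_ssse3";
}

int main (void)
{
  /* An AMD Excavator-class part: unaligned SSE loads stay on for the
     other string functions, but Fast_Unaligned_Copy is left clear, so
     memcpy lands on the aligned SSSE3 variant.  */
  struct features amd = { .avx_fast_unaligned_load = 0,
                          .fast_unaligned_copy = 0,
                          .ssse3 = 1 };
  puts (select_memcpy (&amd));
  return 0;
}
```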