1
0
mirror of https://sourceware.org/git/glibc.git synced 2025-07-29 11:41:21 +03:00

Improve 64 bit strcat functions with SSE2/SSSE3

This commit is contained in:
Liubov Dmitrieva
2011-07-19 17:11:54 -04:00
committed by Ulrich Drepper
parent 7dc6bd90c5
commit 99710781cc
18 changed files with 1523 additions and 321 deletions

View File

@ -1,3 +1,32 @@
2011-07-15 Liubov Dmitrieva <liubov.dmitrieva@intel.com>
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strcat-ssse3 strcat-sse2-unaligned strncat-ssse3
strncat-sse2-unaligned strncat-c strlen-sse2-pminub.
* sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: New file.
* sysdeps/x86_64/multiarch/strcat.S: New file.
* sysdeps/x86_64/multiarch/strncat.S: New file.
* sysdeps/x86_64/multiarch/strncat-c.c: New file.
* sysdeps/x86_64/multiarch/strcat-ssse3.S: New file.
* sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S: New file.
* sysdeps/x86_64/multiarch/strncat-ssse3.S: New file.
* sysdeps/x86_64/multiarch/strcpy-ssse3.S
(USE_AS_STRCAT): Define.
Add strcat and strncat support.
* sysdeps/x86_64/multiarch/strlen-no-bsf.S: Likewise.
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
* sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: New file.
* string/strncat.c: Update.
(STRNCAT): Define if not already defined.
* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
Turn on bit_Prefer_PMINUB_for_stringop for Intel Core i3, i5
and i7.
* sysdeps/x86_64/multiarch/init-arch.h
(bit_Prefer_PMINUB_for_stringop): New.
(index_Prefer_PMINUB_for_stringop): Likewise.
* sysdeps/x86_64/multiarch/strlen.S (strlen): Check
bit_Prefer_PMINUB_for_stringop.
2011-07-19 Ulrich Drepper <drepper@gmail.com> 2011-07-19 Ulrich Drepper <drepper@gmail.com>
* crypt/sha512.h (struct sha512_ctx): Move buffer into union and add * crypt/sha512.h (struct sha512_ctx): Move buffer into union and add

5
NEWS
View File

@ -1,4 +1,4 @@
GNU C Library NEWS -- history of user-visible changes. 2011-7-6 GNU C Library NEWS -- history of user-visible changes. 2011-7-19
Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc. Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
See the end for copying conditions. See the end for copying conditions.
@ -23,6 +23,9 @@ Version 2.15
* Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64. * Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
Contributed by HJ Lu. Contributed by HJ Lu.
* Improved strcat and strncat on x86-64.
Contributed by Liubov Dmitrieva.
Version 2.14 Version 2.14

View File

@ -24,10 +24,12 @@
typedef char reg_char; typedef char reg_char;
#endif #endif
#undef strncat #ifndef STRNCAT
# define STRNCAT strncat
#endif
char * char *
strncat (s1, s2, n) STRNCAT (s1, s2, n)
char *s1; char *s1;
const char *s2; const char *s2;
size_t n; size_t n;

View File

@ -5,14 +5,16 @@ endif
ifeq ($(subdir),string) ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \ strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \ strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub
ifeq (yes,$(config-cflags-sse4)) ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4 CFLAGS-varshift.c += -msse4

View File

@ -97,18 +97,22 @@ __init_cpu_features (void)
case 0x2c: case 0x2c:
case 0x2e: case 0x2e:
case 0x2f: case 0x2f:
/* Rep string instructions, copy backward and unaligned loads /* Rep string instructions, copy backward, unaligned loads
are fast on Intel Core i3, i5 and i7. */ and pminub are fast on Intel Core i3, i5 and i7. */
#if index_Fast_Rep_String != index_Fast_Copy_Backward #if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward # error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif #endif
#if index_Fast_Rep_String != index_Fast_Unaligned_Load #if index_Fast_Rep_String != index_Fast_Unaligned_Load
# error index_Fast_Rep_String != index_Fast_Unaligned_Load # error index_Fast_Rep_String != index_Fast_Unaligned_Load
#endif
#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
#endif #endif
__cpu_features.feature[index_Fast_Rep_String] __cpu_features.feature[index_Fast_Rep_String]
|= (bit_Fast_Rep_String |= (bit_Fast_Rep_String
| bit_Fast_Copy_Backward | bit_Fast_Copy_Backward
| bit_Fast_Unaligned_Load); | bit_Fast_Unaligned_Load
| bit_Prefer_PMINUB_for_stringop);
break; break;
} }
} }

View File

@ -21,6 +21,7 @@
#define bit_Slow_BSF (1 << 2) #define bit_Slow_BSF (1 << 2)
#define bit_Prefer_SSE_for_memop (1 << 3) #define bit_Prefer_SSE_for_memop (1 << 3)
#define bit_Fast_Unaligned_Load (1 << 4) #define bit_Fast_Unaligned_Load (1 << 4)
#define bit_Prefer_PMINUB_for_stringop (1 << 5)
#ifdef __ASSEMBLER__ #ifdef __ASSEMBLER__
@ -41,6 +42,7 @@
# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE # define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE
# define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE # define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE
# define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE # define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */ #else /* __ASSEMBLER__ */

View File

@ -0,0 +1,55 @@
/* strcat with SSE2
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
/* Glue file: builds strcat (or a strncat variant when the includer has
   defined USE_AS_STRNCAT) inside a single ENTRY by textually including a
   strlen body followed by a strcpy body.  */
#ifndef NOT_IN_libc
# include <sysdep.h>
/* The symbol name can be overridden by a file that includes this one.  */
# ifndef STRCAT
# define STRCAT __strcat_sse2_unaligned
# endif
/* Visible to the two files included below.  Presumably it makes them omit
   their own ENTRY/END and prologue -- confirm in strlen-sse2-pminub.S and
   strcpy-sse2-unaligned.S, which are not shown here.  */
# define USE_AS_STRCAT
.text
ENTRY (STRCAT)
/* %r9 = original destination pointer.  It is needed after the length scan
   both to form the append address and as the return value.  */
mov %rdi, %r9
# ifdef USE_AS_STRNCAT
/* %r8 = third argument (%rdx); presumably the strncat byte limit.  */
mov %rdx, %r8
# endif
/* While the strlen body is included, RETURN jumps to the copy part instead
   of returning.  Presumably the included body ends every path with RETURN
   and leaves the length in %rax -- confirm in strlen-sse2-pminub.S.  */
# define RETURN jmp L(StartStrcpyPart)
# include "strlen-sse2-pminub.S"
# undef RETURN
L(StartStrcpyPart):
/* %rdi = original destination + %rax, i.e. where the copy will start.  */
lea (%r9, %rax), %rdi
/* %rcx = source pointer (second argument).  */
mov %rsi, %rcx
mov %r9, %rax /* save result */
# ifdef USE_AS_STRNCAT
/* A zero limit goes straight to L(ExitZero), which is not defined in this
   file (presumably it lives in strcpy-sse2-unaligned.S).  */
test %r8, %r8
jz L(ExitZero)
/* Ask the included copy code for its length-limited variant.  */
# define USE_AS_STRNCPY
# endif
/* NOTE(review): no END (STRCAT) appears in this file; presumably the
   included file closes the function -- confirm.  */
# include "strcpy-sse2-unaligned.S"
#endif

View File

@ -0,0 +1,559 @@
/* strcat with SSSE3
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
/* strcat (or a strncat variant when the includer has defined
   USE_AS_STRNCAT) for SSSE3: an included strlen body finds the end of the
   destination, the first 16 source bytes are probed one at a time, and
   longer strings are handed to the included strcpy-ssse3.S code.  */
#ifndef NOT_IN_libc
# include <sysdep.h>
/* The symbol name can be overridden by a file that includes this one.  */
# ifndef STRCAT
# define STRCAT __strcat_ssse3
# endif
/* Visible to the files included below so they can adapt to being embedded
   in this function.  */
# define USE_AS_STRCAT
.text
ENTRY (STRCAT)
# ifdef USE_AS_STRNCAT
/* %r8 = third argument (%rdx); presumably the strncat byte limit.  */
mov %rdx, %r8
# endif
/* While the strlen body is included, RETURN jumps to the copy part instead
   of returning.  The code below relies on the included body leaving the
   length in %rax and leaving %rdi/%rsi untouched -- confirm against
   strlen-no-bsf.S.  */
# define RETURN jmp L(StartStrcpyPart)
# include "strlen-no-bsf.S"
# undef RETURN
L(StartStrcpyPart):
/* %rcx = source pointer, %rdx = destination + %rax (the append position).
   %rdi keeps the original destination and is what every exit returns.  */
mov %rsi, %rcx
lea (%rdi, %rax), %rdx
# ifdef USE_AS_STRNCAT
/* Limit of 0: nothing to append.  Limit of 1..8: use the path that checks
   the limit after every source byte.  */
test %r8, %r8
jz L(StrncatExit0)
cmp $8, %r8
jbe L(StrncatExit8Bytes)
# endif
/* Probe source bytes 0..8.  A zero at index k branches to L(Exit<k+1>),
   which copies k+1 bytes, terminator included.  In the strncat build the
   limit is above 8 here, so these nine bytes are within the limit.  */
cmpb $0, (%rcx)
jz L(Exit1)
cmpb $0, 1(%rcx)
jz L(Exit2)
cmpb $0, 2(%rcx)
jz L(Exit3)
cmpb $0, 3(%rcx)
jz L(Exit4)
cmpb $0, 4(%rcx)
jz L(Exit5)
cmpb $0, 5(%rcx)
jz L(Exit6)
cmpb $0, 6(%rcx)
jz L(Exit7)
cmpb $0, 7(%rcx)
jz L(Exit8)
cmpb $0, 8(%rcx)
jz L(Exit9)
# ifdef USE_AS_STRNCAT
/* Limit of 9..15: continue on the path that checks the limit per byte.  */
cmp $16, %r8
jb L(StrncatExit15Bytes)
# endif
/* Probe source bytes 9..15 the same way.  */
cmpb $0, 9(%rcx)
jz L(Exit10)
cmpb $0, 10(%rcx)
jz L(Exit11)
cmpb $0, 11(%rcx)
jz L(Exit12)
cmpb $0, 12(%rcx)
jz L(Exit13)
cmpb $0, 13(%rcx)
jz L(Exit14)
cmpb $0, 14(%rcx)
jz L(Exit15)
cmpb $0, 15(%rcx)
jz L(Exit16)
# ifdef USE_AS_STRNCAT
/* No terminator in the first 16 bytes and the limit is exactly 16: copy 16
   bytes and store the terminator after them.  */
cmp $16, %r8
je L(StrncatExit16)
/* Ask the included copy code for its length-limited variant.  */
# define USE_AS_STRNCPY
# endif
/* Source is longer than 16 bytes: continue in the shared copy code.
   Presumably strcpy-ssse3.S skips its own entry sequence when
   USE_AS_STRCAT is defined -- confirm in that file.  */
# include "strcpy-ssse3.S"
/* Tail copy when the terminator lies in the current 16-byte chunk.
   %ax is used as a bit mask in which bit k selects the exit that copies
   k+1 bytes; presumably it is the pmovmskb result of a compare against
   zero done by the included copy loops -- confirm in strcpy-ssse3.S.  */
.p2align 4
L(CopyFrom1To16Bytes):
/* Advance both pointers by %rsi (presumably the running offset kept by the
   included copy loops).  */
add %rsi, %rdx
add %rsi, %rcx
/* No bit set in the low byte: the selecting bit is in %ah.  */
test %al, %al
jz L(ExitHigh)
test $0x01, %al
jnz L(Exit1)
test $0x02, %al
jnz L(Exit2)
test $0x04, %al
jnz L(Exit3)
test $0x08, %al
jnz L(Exit4)
test $0x10, %al
jnz L(Exit5)
test $0x20, %al
jnz L(Exit6)
test $0x40, %al
jnz L(Exit7)
/* %al is non-zero and bits 0..6 are clear, so bit 7 is set: copy 8 bytes
   and return the original destination.  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
mov %rdi, %rax
ret
/* Continuation of the mask dispatch for the upper byte: bit k of %ah
   selects the exit that copies k+9 bytes.  */
.p2align 4
L(ExitHigh):
test $0x01, %ah
jnz L(Exit9)
test $0x02, %ah
jnz L(Exit10)
test $0x04, %ah
jnz L(Exit11)
test $0x08, %ah
jnz L(Exit12)
test $0x10, %ah
jnz L(Exit13)
test $0x20, %ah
jnz L(Exit14)
test $0x40, %ah
jnz L(Exit15)
/* None of bits 0..6 of %ah matched: copy all 16 bytes as two 8-byte moves
   and return the original destination.  */
movlpd (%rcx), %xmm0
movlpd 8(%rcx), %xmm1
movlpd %xmm0, (%rdx)
movlpd %xmm1, 8(%rdx)
mov %rdi, %rax
ret
/* Exit ladder for 1..8 bytes.
   L(ExitN) copies N bytes from (%rcx) to (%rdx) and returns %rdi in %rax.
   L(StrncatExitN) first stores a zero byte at N(%rdx) and then falls
   through into L(ExitN), so the N copied bytes end up followed by a
   terminator.  */
.p2align 4
L(StrncatExit1):
xor %ah, %ah
movb %ah, 1(%rdx)
L(Exit1):
movb (%rcx), %al
movb %al, (%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit2):
xor %ah, %ah
movb %ah, 2(%rdx)
L(Exit2):
movw (%rcx), %ax
movw %ax, (%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit3):
xor %ah, %ah
movb %ah, 3(%rdx)
L(Exit3):
/* 3 bytes = one 2-byte move plus one 1-byte move.  */
movw (%rcx), %ax
movw %ax, (%rdx)
movb 2(%rcx), %al
movb %al, 2(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit4):
xor %ah, %ah
movb %ah, 4(%rdx)
L(Exit4):
mov (%rcx), %eax
mov %eax, (%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit5):
xor %ah, %ah
movb %ah, 5(%rdx)
L(Exit5):
/* 5 bytes = one 4-byte move plus one 1-byte move.  */
mov (%rcx), %eax
mov %eax, (%rdx)
movb 4(%rcx), %al
movb %al, 4(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit6):
xor %ah, %ah
movb %ah, 6(%rdx)
L(Exit6):
/* 6 bytes = one 4-byte move plus one 2-byte move.  */
mov (%rcx), %eax
mov %eax, (%rdx)
movw 4(%rcx), %ax
movw %ax, 4(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit7):
xor %ah, %ah
movb %ah, 7(%rdx)
L(Exit7):
/* 7 bytes = two 4-byte moves at offsets 0 and 3; they overlap at byte 3.  */
mov (%rcx), %eax
mov %eax, (%rdx)
mov 3(%rcx), %eax
mov %eax, 3(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit8):
xor %ah, %ah
movb %ah, 8(%rdx)
L(Exit8):
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
mov %rdi, %rax
ret
/* Exit ladder for 9..16 bytes.
   L(ExitN) copies N bytes from (%rcx) to (%rdx) and returns %rdi in %rax:
   an 8-byte move for bytes 0..7 plus a second move that covers the rest
   (overlapping the first where N is not a convenient size).
   L(StrncatExitN) first stores a zero byte at N(%rdx) and then falls
   through into L(ExitN).  */
.p2align 4
L(StrncatExit9):
xor %ah, %ah
movb %ah, 9(%rdx)
L(Exit9):
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movb 8(%rcx), %al
movb %al, 8(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit10):
xor %ah, %ah
movb %ah, 10(%rdx)
L(Exit10):
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movw 8(%rcx), %ax
movw %ax, 8(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit11):
xor %ah, %ah
movb %ah, 11(%rdx)
L(Exit11):
/* Second move is 4 bytes at offset 7 (bytes 7..10).  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
mov 7(%rcx), %eax
mov %eax, 7(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit12):
xor %ah, %ah
movb %ah, 12(%rdx)
L(Exit12):
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
mov 8(%rcx), %eax
mov %eax, 8(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit13):
xor %ah, %ah
movb %ah, 13(%rdx)
L(Exit13):
/* Second move is 8 bytes at offset 5 (bytes 5..12).  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movlpd 5(%rcx), %xmm1
movlpd %xmm1, 5(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit14):
xor %ah, %ah
movb %ah, 14(%rdx)
L(Exit14):
/* Second move is 8 bytes at offset 6 (bytes 6..13).  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movlpd 6(%rcx), %xmm1
movlpd %xmm1, 6(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit15):
xor %ah, %ah
movb %ah, 15(%rdx)
L(Exit15):
/* Second move is 8 bytes at offset 7 (bytes 7..14).  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movlpd 7(%rcx), %xmm1
movlpd %xmm1, 7(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(StrncatExit16):
xor %ah, %ah
movb %ah, 16(%rdx)
L(Exit16):
movlpd (%rcx), %xmm0
movlpd 8(%rcx), %xmm1
movlpd %xmm0, (%rdx)
movlpd %xmm1, 8(%rdx)
mov %rdi, %rax
ret
/* Everything from here to the matching #endif is only assembled for the
   length-limited (strncat) build.  */
# ifdef USE_AS_STRNCPY
/* Tail copy when the mask in %ax is non-zero and the remaining count in
   %r8 must also be honoured: whichever comes first -- a mask bit or the
   count reaching N -- picks the exit.  */
.p2align 4
L(CopyFrom1To16BytesCase2):
/* NOTE(review): presumably undoes a bias of 16 applied to %r8 by the
   included copy loops -- confirm in strcpy-ssse3.S.  */
add $16, %r8
add %rsi, %rcx
/* Park the advanced destination pointer in %rsi so %rdx can be used as a
   scratch register for the test below.  */
lea (%rsi, %rdx), %rsi
/* %dh = bit 15 of (%r8 - 9), which is set when %r8 - 9 is negative
   (assuming %r8 is a small count here), OR-ed with %al.  The result is
   zero only when %r8 >= 9 and no bit of %al is set.  */
lea -9(%r8), %rdx
and $1<<7, %dh
or %al, %dh
test %dh, %dh
/* lea leaves the flags from the test intact; %rdx = destination again.  */
lea (%rsi), %rdx
jz L(ExitHighCase2)
/* For N = 1..7: mask bit N-1 set -> L(ExitN) (terminator is part of the
   copy); otherwise count == N -> L(StrncatExitN) (terminator is added).  */
test $0x01, %al
jnz L(Exit1)
cmp $1, %r8
je L(StrncatExit1)
test $0x02, %al
jnz L(Exit2)
cmp $2, %r8
je L(StrncatExit2)
test $0x04, %al
jnz L(Exit3)
cmp $3, %r8
je L(StrncatExit3)
test $0x08, %al
jnz L(Exit4)
cmp $4, %r8
je L(StrncatExit4)
test $0x10, %al
jnz L(Exit5)
cmp $5, %r8
je L(StrncatExit5)
test $0x20, %al
jnz L(Exit6)
cmp $6, %r8
je L(StrncatExit6)
test $0x40, %al
jnz L(Exit7)
cmp $7, %r8
je L(StrncatExit7)
/* Copy 8 bytes, then place the terminator without a branch:
   cmpb sets CF exactly when the byte at 7(%rdx) is zero, and
   sbb $-1 computes %rax + 1 - CF.  So %rax stays on byte 7 when it already
   holds the terminator, otherwise it moves to byte 8; a zero is stored
   there.  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
lea 7(%rdx), %rax
cmpb $1, (%rax)
sbb $-1, %rax
xor %cl, %cl
movb %cl, (%rax)
mov %rdi, %rax
ret
/* Upper half of the mask-and-count dispatch.  For N = 9..15: bit N-9 of
   %ah set -> L(ExitN); otherwise count in %r8 == N -> L(StrncatExitN).  */
.p2align 4
L(ExitHighCase2):
test $0x01, %ah
jnz L(Exit9)
cmp $9, %r8
je L(StrncatExit9)
test $0x02, %ah
jnz L(Exit10)
cmp $10, %r8
je L(StrncatExit10)
test $0x04, %ah
jnz L(Exit11)
cmp $11, %r8
je L(StrncatExit11)
test $0x8, %ah
jnz L(Exit12)
cmp $12, %r8
je L(StrncatExit12)
test $0x10, %ah
jnz L(Exit13)
cmp $13, %r8
je L(StrncatExit13)
test $0x20, %ah
jnz L(Exit14)
cmp $14, %r8
je L(StrncatExit14)
test $0x40, %ah
jnz L(Exit15)
cmp $15, %r8
je L(StrncatExit15)
/* Nothing matched: copy 16 bytes.  No separate terminator is stored here.
   NOTE(review): presumably this is only reached with bit 7 of %ah set, so
   the 16th copied byte is the terminator -- confirm.  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movlpd 8(%rcx), %xmm1
movlpd %xmm1, 8(%rdx)
mov %rdi, %rax
ret
/* Chooses between the two limited tail copies: a non-zero mask in %rax
   goes to Case2, a zero mask falls through into Case3 below.  */
L(CopyFrom1To16BytesCase2OrCase3):
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
/* Case3: no mask bit is consulted; the remaining count in %r8 alone
   decides how many bytes are copied, and a terminator is always added.  */
.p2align 4
L(CopyFrom1To16BytesCase3):
/* NOTE(review): presumably undoes a bias of 16 applied to %r8 by the
   included copy loops -- confirm in strcpy-ssse3.S.  */
add $16, %r8
add %rsi, %rdx
add %rsi, %rcx
cmp $8, %r8
ja L(ExitHighCase3)
cmp $1, %r8
je L(StrncatExit1)
cmp $2, %r8
je L(StrncatExit2)
cmp $3, %r8
je L(StrncatExit3)
cmp $4, %r8
je L(StrncatExit4)
cmp $5, %r8
je L(StrncatExit5)
cmp $6, %r8
je L(StrncatExit6)
cmp $7, %r8
je L(StrncatExit7)
/* Count is not 1..7 and not above 8: copy 8 bytes and store the
   terminator at offset 8.
   NOTE(review): a count of 0 would also land here; presumably callers
   never arrive with 0 -- confirm.  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
xor %ah, %ah
movb %ah, 8(%rdx)
mov %rdi, %rax
ret
/* Count-only dispatch for counts above 8: a count of 9..15 in %r8 goes to
   the matching L(StrncatExitN); any other value copies 16 bytes and stores
   the terminator at offset 16.  */
.p2align 4
L(ExitHighCase3):
cmp $9, %r8
je L(StrncatExit9)
cmp $10, %r8
je L(StrncatExit10)
cmp $11, %r8
je L(StrncatExit11)
cmp $12, %r8
je L(StrncatExit12)
cmp $13, %r8
je L(StrncatExit13)
cmp $14, %r8
je L(StrncatExit14)
cmp $15, %r8
je L(StrncatExit15)
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movlpd 8(%rcx), %xmm1
movlpd %xmm1, 8(%rdx)
xor %ah, %ah
movb %ah, 16(%rdx)
mov %rdi, %rax
ret
/* Limit of zero: nothing is written; return the original destination.  */
.p2align 4
L(StrncatExit0):
mov %rdi, %rax
ret
/* Entered from the function's opening probe when the limit in %r8 is below
   16, after source bytes 0..8 were all found non-zero.  Alternates between
   "limit == N -> L(StrncatExitN)" and "source byte N is zero ->
   L(Exit<N+1>)" for N = 9..14, so no byte beyond the limit is read.  */
.p2align 4
L(StrncatExit15Bytes):
cmp $9, %r8
je L(StrncatExit9)
cmpb $0, 9(%rcx)
jz L(Exit10)
cmp $10, %r8
je L(StrncatExit10)
cmpb $0, 10(%rcx)
jz L(Exit11)
cmp $11, %r8
je L(StrncatExit11)
cmpb $0, 11(%rcx)
jz L(Exit12)
cmp $12, %r8
je L(StrncatExit12)
cmpb $0, 12(%rcx)
jz L(Exit13)
cmp $13, %r8
je L(StrncatExit13)
cmpb $0, 13(%rcx)
jz L(Exit14)
cmp $14, %r8
je L(StrncatExit14)
/* Copy 15 bytes as two 8-byte moves (offsets 0 and 7), then place the
   terminator without a branch: cmpb sets CF exactly when the byte at
   14(%rdx) is zero, and sbb $-1 computes %rax + 1 - CF.  So %rax stays on
   byte 14 when it already holds the terminator, otherwise it moves to
   byte 15; a zero is stored there.  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
movlpd 7(%rcx), %xmm1
movlpd %xmm1, 7(%rdx)
lea 14(%rdx), %rax
cmpb $1, (%rax)
sbb $-1, %rax
xor %cl, %cl
movb %cl, (%rax)
mov %rdi, %rax
ret
/* Entered from the function's opening probe when the limit in %r8 is
   1..8.  For N = 1..7: source byte N-1 is zero -> L(ExitN) (terminator is
   part of the copy); otherwise limit == N -> L(StrncatExitN) (terminator
   is added).  Each byte is only read while it is still within the
   limit.  */
.p2align 4
L(StrncatExit8Bytes):
cmpb $0, (%rcx)
jz L(Exit1)
cmp $1, %r8
je L(StrncatExit1)
cmpb $0, 1(%rcx)
jz L(Exit2)
cmp $2, %r8
je L(StrncatExit2)
cmpb $0, 2(%rcx)
jz L(Exit3)
cmp $3, %r8
je L(StrncatExit3)
cmpb $0, 3(%rcx)
jz L(Exit4)
cmp $4, %r8
je L(StrncatExit4)
cmpb $0, 4(%rcx)
jz L(Exit5)
cmp $5, %r8
je L(StrncatExit5)
cmpb $0, 5(%rcx)
jz L(Exit6)
cmp $6, %r8
je L(StrncatExit6)
cmpb $0, 6(%rcx)
jz L(Exit7)
cmp $7, %r8
je L(StrncatExit7)
/* Limit is 8: copy 8 bytes, then place the terminator without a branch.
   cmpb sets CF exactly when the byte at 7(%rdx) is zero, and sbb $-1
   computes %rax + 1 - CF.  So %rax stays on byte 7 when it already holds
   the terminator, otherwise it moves to byte 8; a zero is stored there.  */
movlpd (%rcx), %xmm0
movlpd %xmm0, (%rdx)
lea 7(%rdx), %rax
cmpb $1, (%rax)
sbb $-1, %rax
xor %cl, %cl
movb %cl, (%rax)
mov %rdi, %rax
ret
/* End of the code that is only assembled for the length-limited build.  */
# endif
END (STRCAT)
#endif

View File

@ -0,0 +1,85 @@
/* Multiple versions of strcat
Copyright (C) 2009, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Run-time selector for strcat.  The same text also serves a strncat
   build when the includer defines USE_AS_STRNCAT; in that case STRCAT is
   not given a default here, so presumably the includer defines it --
   confirm in strncat.S.  */
#ifndef USE_AS_STRNCAT
# ifndef STRCAT
# define STRCAT strcat
# endif
#endif
/* Names of the three candidate implementations and of the libc-internal
   (__GI_) aliases, for the strncat and strcat flavours respectively.  */
#ifdef USE_AS_STRNCAT
# define STRCAT_SSSE3 __strncat_ssse3
# define STRCAT_SSE2 __strncat_sse2
# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned
# define __GI_STRCAT __GI_strncat
# define __GI___STRCAT __GI___strncat
#else
# define STRCAT_SSSE3 __strcat_ssse3
# define STRCAT_SSE2 __strcat_sse2
# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned
# define __GI_STRCAT __GI_strcat
# define __GI___STRCAT __GI___strcat
#endif
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.text
/* STRCAT is emitted as a GNU indirect function: the code below does not
   concatenate anything, it returns in %rax the address of the
   implementation to use.  */
ENTRY(STRCAT)
.type STRCAT, @gnu_indirect_function
/* Initialise __cpu_features first if its kind field is still zero.  */
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
/* Selection order: the unaligned-SSE2 variant when the
   Fast_Unaligned_Load bit is set; otherwise the SSSE3 variant when the
   SSSE3 bit is set; otherwise the plain SSE2 variant.  */
1: leaq STRCAT_SSE2_UNALIGNED(%rip), %rax
testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jnz 2f
leaq STRCAT_SSE2(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq STRCAT_SSSE3(%rip), %rax
2: ret
END(STRCAT)
/* From here on ENTRY and END ignore their argument and always open/close
   a function named STRCAT_SSE2, so the generic source included at the
   bottom is assembled under that name.  */
# undef ENTRY
# define ENTRY(name) \
.type STRCAT_SSE2, @function; \
.align 16; \
STRCAT_SSE2: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strcat calls through a PLT.
   The speedup we get from using SSSE3 instructions is likely eaten away
   by the indirect call in the PLT.  So the libc-internal aliases are bound
   directly to the STRCAT_SSE2 version instead.  */
# define libc_hidden_builtin_def(name) \
.globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
# undef libc_hidden_def
# define libc_hidden_def(name) \
.globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
#endif
/* The generic strcat source is only pulled in for the strcat flavour.
   NOTE(review): for the strncat flavour the STRCAT_SSE2 symbol must come
   from elsewhere, presumably strncat-c.c -- confirm.  */
#ifndef USE_AS_STRNCAT
# include "../strcat.S"
#endif

File diff suppressed because it is too large Load Diff

View File

@ -20,25 +20,26 @@
#ifndef NOT_IN_libc #ifndef NOT_IN_libc
# include <sysdep.h> # ifndef USE_AS_STRCAT
# include <sysdep.h>
# ifndef STRCPY # ifndef STRCPY
# define STRCPY __strcpy_ssse3 # define STRCPY __strcpy_ssse3
# endif # endif
.section .text.ssse3,"ax",@progbits .section .text.ssse3,"ax",@progbits
ENTRY (STRCPY) ENTRY (STRCPY)
mov %rsi, %rcx mov %rsi, %rcx
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
mov %rdx, %r8 mov %rdx, %r8
# endif # endif
mov %rdi, %rdx mov %rdi, %rdx
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
test %r8, %r8 test %r8, %r8
jz L(Exit0) jz L(Exit0)
cmp $8, %r8 cmp $8, %r8
jbe L(StrncpyExit8Bytes) jbe L(StrncpyExit8Bytes)
# endif # endif
cmpb $0, (%rcx) cmpb $0, (%rcx)
jz L(Exit1) jz L(Exit1)
cmpb $0, 1(%rcx) cmpb $0, 1(%rcx)
@ -55,10 +56,10 @@ ENTRY (STRCPY)
jz L(Exit7) jz L(Exit7)
cmpb $0, 7(%rcx) cmpb $0, 7(%rcx)
jz L(Exit8) jz L(Exit8)
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
cmp $16, %r8 cmp $16, %r8
jb L(StrncpyExit15Bytes) jb L(StrncpyExit15Bytes)
# endif # endif
cmpb $0, 8(%rcx) cmpb $0, 8(%rcx)
jz L(Exit9) jz L(Exit9)
cmpb $0, 9(%rcx) cmpb $0, 9(%rcx)
@ -73,12 +74,13 @@ ENTRY (STRCPY)
jz L(Exit14) jz L(Exit14)
cmpb $0, 14(%rcx) cmpb $0, 14(%rcx)
jz L(Exit15) jz L(Exit15)
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
cmp $16, %r8 cmp $16, %r8
je L(Exit16) je L(Exit16)
# endif # endif
cmpb $0, 15(%rcx) cmpb $0, 15(%rcx)
jz L(Exit16) jz L(Exit16)
# endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
mov %rcx, %rsi mov %rcx, %rsi
@ -2180,12 +2182,12 @@ L(Shl15LoopExit):
jmp L(CopyFrom1To16Bytes) jmp L(CopyFrom1To16Bytes)
# endif # endif
# ifndef USE_AS_STRCAT
.p2align 4 .p2align 4
L(CopyFrom1To16Bytes): L(CopyFrom1To16Bytes):
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
add $16, %r8 add $16, %r8
# endif # endif
add %rsi, %rdx add %rsi, %rdx
add %rsi, %rcx add %rsi, %rcx
@ -2210,20 +2212,20 @@ L(CopyFrom1To16Bytes):
L(Exit8): L(Exit8):
mov (%rcx), %rax mov (%rcx), %rax
mov %rax, (%rdx) mov %rax, (%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 7(%rdx), %rax lea 7(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $8, %r8 sub $8, %r8
lea 8(%rdx), %rcx lea 8(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2249,23 +2251,23 @@ L(Exit16):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 8(%rcx), %rax mov 8(%rcx), %rax
mov %rax, 8(%rdx) mov %rax, 8(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 15(%rdx), %rax lea 15(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $16, %r8 sub $16, %r8
lea 16(%rdx), %rcx lea 16(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
.p2align 4 .p2align 4
L(CopyFrom1To16BytesCase2): L(CopyFrom1To16BytesCase2):
@ -2381,46 +2383,46 @@ L(Less12Case3): /* but more than 8 */
jl L(Exit9) jl L(Exit9)
je L(Exit10) je L(Exit10)
jg L(Exit11) jg L(Exit11)
# endif # endif
.p2align 4 .p2align 4
L(Exit1): L(Exit1):
movb (%rcx), %al movb (%rcx), %al
movb %al, (%rdx) movb %al, (%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea (%rdx), %rax lea (%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $1, %r8 sub $1, %r8
lea 1(%rdx), %rcx lea 1(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
L(Exit2): L(Exit2):
movw (%rcx), %ax movw (%rcx), %ax
movw %ax, (%rdx) movw %ax, (%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 1(%rdx), %rax lea 1(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $2, %r8 sub $2, %r8
lea 2(%rdx), %rcx lea 2(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2429,40 +2431,40 @@ L(Exit3):
movw %ax, (%rdx) movw %ax, (%rdx)
movb 2(%rcx), %al movb 2(%rcx), %al
movb %al, 2(%rdx) movb %al, 2(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 2(%rdx), %rax lea 2(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $3, %r8 sub $3, %r8
lea 3(%rdx), %rcx lea 3(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
L(Exit4): L(Exit4):
movl (%rcx), %eax movl (%rcx), %eax
movl %eax, (%rdx) movl %eax, (%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 3(%rdx), %rax lea 3(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $4, %r8 sub $4, %r8
lea 4(%rdx), %rcx lea 4(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2471,20 +2473,20 @@ L(Exit5):
movl %eax, (%rdx) movl %eax, (%rdx)
movb 4(%rcx), %al movb 4(%rcx), %al
movb %al, 4(%rdx) movb %al, 4(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 4(%rdx), %rax lea 4(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $5, %r8 sub $5, %r8
lea 5(%rdx), %rcx lea 5(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2493,20 +2495,20 @@ L(Exit6):
movl %eax, (%rdx) movl %eax, (%rdx)
movw 4(%rcx), %ax movw 4(%rcx), %ax
movw %ax, 4(%rdx) movw %ax, 4(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 5(%rdx), %rax lea 5(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $6, %r8 sub $6, %r8
lea 6(%rdx), %rcx lea 6(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2515,20 +2517,20 @@ L(Exit7):
movl %eax, (%rdx) movl %eax, (%rdx)
movl 3(%rcx), %eax movl 3(%rcx), %eax
movl %eax, 3(%rdx) movl %eax, 3(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 6(%rdx), %rax lea 6(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $7, %r8 sub $7, %r8
lea 7(%rdx), %rcx lea 7(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2537,20 +2539,20 @@ L(Exit9):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 5(%rcx), %eax mov 5(%rcx), %eax
mov %eax, 5(%rdx) mov %eax, 5(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 8(%rdx), %rax lea 8(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $9, %r8 sub $9, %r8
lea 9(%rdx), %rcx lea 9(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2559,20 +2561,20 @@ L(Exit10):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 6(%rcx), %eax mov 6(%rcx), %eax
mov %eax, 6(%rdx) mov %eax, 6(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 9(%rdx), %rax lea 9(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $10, %r8 sub $10, %r8
lea 10(%rdx), %rcx lea 10(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2581,20 +2583,20 @@ L(Exit11):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 7(%rcx), %eax mov 7(%rcx), %eax
mov %eax, 7(%rdx) mov %eax, 7(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 10(%rdx), %rax lea 10(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $11, %r8 sub $11, %r8
lea 11(%rdx), %rcx lea 11(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2603,20 +2605,20 @@ L(Exit12):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 8(%rcx), %eax mov 8(%rcx), %eax
mov %eax, 8(%rdx) mov %eax, 8(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 11(%rdx), %rax lea 11(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $12, %r8 sub $12, %r8
lea 12(%rdx), %rcx lea 12(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2625,20 +2627,20 @@ L(Exit13):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 5(%rcx), %rax mov 5(%rcx), %rax
mov %rax, 5(%rdx) mov %rax, 5(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 12(%rdx), %rax lea 12(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $13, %r8 sub $13, %r8
lea 13(%rdx), %rcx lea 13(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2647,20 +2649,20 @@ L(Exit14):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 6(%rcx), %rax mov 6(%rcx), %rax
mov %rax, 6(%rdx) mov %rax, 6(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 13(%rdx), %rax lea 13(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $14, %r8 sub $14, %r8
lea 14(%rdx), %rcx lea 14(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2669,23 +2671,23 @@ L(Exit15):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 7(%rcx), %rax mov 7(%rcx), %rax
mov %rax, 7(%rdx) mov %rax, 7(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 14(%rdx), %rax lea 14(%rdx), %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
sub $15, %r8 sub $15, %r8
lea 15(%rdx), %rcx lea 15(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1) jnz L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# endif # endif
# endif # endif
ret ret
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
.p2align 4 .p2align 4
L(Fill0): L(Fill0):
ret ret
@ -2902,13 +2904,13 @@ L(StrncpyExit15Bytes):
mov %rax, (%rdx) mov %rax, (%rdx)
mov 7(%rcx), %rax mov 7(%rcx), %rax
mov %rax, 7(%rdx) mov %rax, 7(%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 14(%rdx), %rax lea 14(%rdx), %rax
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
ret ret
.p2align 4 .p2align 4
@ -2943,15 +2945,17 @@ L(StrncpyExit8Bytes):
jz L(Exit7) jz L(Exit7)
mov (%rcx), %rax mov (%rcx), %rax
mov %rax, (%rdx) mov %rax, (%rdx)
# ifdef USE_AS_STPCPY # ifdef USE_AS_STPCPY
lea 7(%rdx), %rax lea 7(%rdx), %rax
cmpb $1, (%rax) cmpb $1, (%rax)
sbb $-1, %rax sbb $-1, %rax
# else # else
mov %rdi, %rax mov %rdi, %rax
# endif # endif
ret ret
# endif
# endif # endif
# ifdef USE_AS_STRNCPY # ifdef USE_AS_STRNCPY
@ -3715,7 +3719,7 @@ L(StrncpyExit15):
lea 1(%rsi), %rsi lea 1(%rsi), %rsi
jmp L(CopyFrom1To16BytesCase3) jmp L(CopyFrom1To16BytesCase3)
# endif # endif
# ifndef USE_AS_STRCAT
END (STRCPY) END (STRCPY)
# endif
#endif #endif

View File

@ -1,5 +1,5 @@
/* strlen without BSF /* strlen SSE2 without bsf
Copyright (C) 2010 Free Software Foundation, Inc. Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation. Contributed by Intel Corporation.
This file is part of the GNU C Library. This file is part of the GNU C Library.
@ -18,12 +18,17 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */ 02111-1307 USA. */
#if defined SHARED && !defined NOT_IN_libc #if (defined SHARED || defined USE_AS_STRCAT) && !defined NOT_IN_libc
#include <sysdep.h> # ifndef USE_AS_STRCAT
.section .text.slow,"ax",@progbits # include <sysdep.h>
# define RETURN ret
.section .text.sse2,"ax",@progbits
ENTRY (__strlen_no_bsf) ENTRY (__strlen_no_bsf)
# endif
xor %eax, %eax xor %eax, %eax
cmpb $0, (%rdi) cmpb $0, (%rdi)
jz L(exit_tail0) jz L(exit_tail0)
@ -165,39 +170,37 @@ ENTRY (__strlen_no_bsf)
jnz L(exit) jnz L(exit)
and $-0x40, %rax and $-0x40, %rax
xor %r8d, %r8d
L(aligned_64): L(aligned_64):
pcmpeqb (%rax), %xmm0 pcmpeqb (%rax), %xmm0
pcmpeqb 16(%rax), %xmm1 pcmpeqb 16(%rax), %xmm1
pcmpeqb 32(%rax), %xmm2 pcmpeqb 32(%rax), %xmm2
pcmpeqb 48(%rax), %xmm3 pcmpeqb 48(%rax), %xmm3
pmovmskb %xmm0, %edx pmovmskb %xmm0, %edx
pmovmskb %xmm1, %esi pmovmskb %xmm1, %r11d
pmovmskb %xmm2, %edi pmovmskb %xmm2, %r10d
pmovmskb %xmm3, %r9d pmovmskb %xmm3, %r9d
or %edx, %r8d or %edx, %r9d
or %esi, %r8d or %r11d, %r9d
or %edi, %r8d or %r10d, %r9d
or %r9d, %r8d
lea 64(%rax), %rax lea 64(%rax), %rax
jz L(aligned_64) jz L(aligned_64)
test %edx, %edx test %edx, %edx
jnz L(aligned_64_exit_16) jnz L(aligned_64_exit_16)
test %esi, %esi test %r11d, %r11d
jnz L(aligned_64_exit_32) jnz L(aligned_64_exit_32)
test %edi, %edi test %r10d, %r10d
jnz L(aligned_64_exit_48) jnz L(aligned_64_exit_48)
L(aligned_64_exit_64): L(aligned_64_exit_64):
mov %r9d, %edx pmovmskb %xmm3, %edx
jmp L(aligned_64_exit) jmp L(aligned_64_exit)
L(aligned_64_exit_48): L(aligned_64_exit_48):
lea -16(%rax), %rax lea -16(%rax), %rax
mov %edi, %edx mov %r10d, %edx
jmp L(aligned_64_exit) jmp L(aligned_64_exit)
L(aligned_64_exit_32): L(aligned_64_exit_32):
lea -32(%rax), %rax lea -32(%rax), %rax
mov %esi, %edx mov %r11d, %edx
jmp L(aligned_64_exit) jmp L(aligned_64_exit)
L(aligned_64_exit_16): L(aligned_64_exit_16):
lea -48(%rax), %rax lea -48(%rax), %rax
@ -228,7 +231,7 @@ L(exit):
jnz L(exit_tail6) jnz L(exit_tail6)
add $7, %eax add $7, %eax
L(exit_tail0): L(exit_tail0):
ret RETURN
L(exit_high): L(exit_high):
add $8, %eax add $8, %eax
@ -253,57 +256,58 @@ L(exit_high):
test $0x40, %dh test $0x40, %dh
jnz L(exit_tail6) jnz L(exit_tail6)
add $7, %eax add $7, %eax
ret RETURN
.p2align 4 .p2align 4
L(exit_tail1): L(exit_tail1):
add $1, %eax add $1, %eax
ret RETURN
L(exit_tail2): L(exit_tail2):
add $2, %eax add $2, %eax
ret RETURN
L(exit_tail3): L(exit_tail3):
add $3, %eax add $3, %eax
ret RETURN
L(exit_tail4): L(exit_tail4):
add $4, %eax add $4, %eax
ret RETURN
L(exit_tail5): L(exit_tail5):
add $5, %eax add $5, %eax
ret RETURN
L(exit_tail6): L(exit_tail6):
add $6, %eax add $6, %eax
ret RETURN
L(exit_tail7): L(exit_tail7):
add $7, %eax add $7, %eax
ret RETURN
L(exit_tail8): L(exit_tail8):
add $8, %eax add $8, %eax
ret RETURN
L(exit_tail9): L(exit_tail9):
add $9, %eax add $9, %eax
ret RETURN
L(exit_tail10): L(exit_tail10):
add $10, %eax add $10, %eax
ret RETURN
L(exit_tail11): L(exit_tail11):
add $11, %eax add $11, %eax
ret RETURN
L(exit_tail12): L(exit_tail12):
add $12, %eax add $12, %eax
ret RETURN
L(exit_tail13): L(exit_tail13):
add $13, %eax add $13, %eax
ret RETURN
L(exit_tail14): L(exit_tail14):
add $14, %eax add $14, %eax
ret RETURN
L(exit_tail15): L(exit_tail15):
add $15, %eax add $15, %eax
ret # ifndef USE_AS_STRCAT
RETURN
END (__strlen_no_bsf) END (__strlen_no_bsf)
# endif
#endif #endif

View File

@ -0,0 +1,260 @@
/* strlen SSE2
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
/* strlen using SSE2 pcmpeqb/pminub (AT&T syntax).

   In:    %rdi = address of the string.
   Out:   %rax = number of bytes before the first zero byte.
   Writes %rcx, %rdx, %r10, %xmm0-%xmm5 and the flags.

   When USE_AS_STRCAT is defined this file emits no ENTRY/END and does not
   define RETURN; the including file has to supply RETURN, and the last exit
   path (exit64) has no RETURN at all, so it runs on into whatever code
   follows the include.  */
#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT)
# ifndef USE_AS_STRCAT
# include <sysdep.h>
# define RETURN ret
.section .text.sse2,"ax",@progbits
ENTRY (__strlen_sse2_pminub)
# endif
/* rax = 0, so exit_less16 can add the byte index straight into it.  */
xor %rax, %rax
/* ecx = offset of the string inside its 64-byte aligned region.  With an
   offset above 0x30 a 16-byte load from rdi would run past the end of that
   region, so that case takes the aligned, masked path at L(next).  */
mov %edi, %ecx
and $0x3f, %ecx
pxor %xmm0, %xmm0
cmp $0x30, %ecx
ja L(next)
/* Unaligned check of the first 16 bytes; edx gets one bit per zero byte.  */
movdqu (%rdi), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit_less16)
/* Round down to 16 bytes.  The scan resumes at rax + 16, which is at most
   rdi + 16, so it overlaps or abuts the bytes just checked.  */
mov %rdi, %rax
and $-16, %rax
jmp L(align16_start)
L(next):
/* Aligned load of the 16-byte block containing rdi.  Bits for the bytes that
   lie before rdi are cleared from the mask: r10d = -1 << cl.  rcx holds
   (rdi & 0x3f) - (rdi & -16); a 32-bit shl only uses the low 5 bits of cl,
   and modulo 32 that difference equals rdi & 15.  */
mov %rdi, %rax
and $-16, %rax
pcmpeqb (%rax), %xmm0
mov $-1, %r10d
sub %rax, %rcx
shl %cl, %r10d
pmovmskb %xmm0, %edx
and %r10d, %edx
jnz L(exit)
L(align16_start):
/* Four unrolled rounds, each testing the 16-byte blocks at rax + 16, + 32,
   + 48 and + 64.  A compare that finds no zero byte leaves its xmm register
   all-zero, so xmm0-xmm3 are reused without being cleared again.  */
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pcmpeqb 16(%rax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
/* Round 2: the block is read at 80(%rax) before rax advances by 64, so after
   the add it sits at 16(%rax) as L(exit16) expects.  */
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
/* Round 3.  */
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
/* Round 4.  */
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
/* Move to a 64-byte boundary for the main loop.  If rax is already 64-byte
   aligned the loop starts at rax itself and re-reads blocks that were already
   found free of zero bytes, which cannot change the result.  */
test $0x3f, %rax
jz L(align64_loop)
/* Otherwise step one 16-byte block at a time.  From here on rax points at the
   block just compared, so hits leave through L(exit) with no extra offset.  */
pcmpeqb 80(%rax), %xmm0
add $80, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit)
test $0x3f, %rax
jz L(align64_loop)
pcmpeqb 16(%rax), %xmm1
add $16, %rax
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit)
test $0x3f, %rax
jz L(align64_loop)
pcmpeqb 16(%rax), %xmm2
add $16, %rax
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit)
test $0x3f, %rax
jz L(align64_loop)
/* NOTE(review): rax is always a multiple of 16 here, and the four tests above
   see four consecutive residues modulo 64, so one of them appears to always
   branch; this last step looks unreachable -- confirm before relying on it.  */
pcmpeqb 16(%rax), %xmm3
add $16, %rax
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit)
add $16, %rax
.p2align 4
L(align64_loop):
/* Main loop, 64 bytes per iteration.  The byte-wise unsigned minimum of the
   four 16-byte blocks has a zero byte exactly when one of the blocks has a
   zero in that lane, so one compare and one mask cover all 64 bytes.  xmm0 is
   all-zero on every path into the loop and is only read here.  */
movaps (%rax), %xmm4
pminub 16(%rax), %xmm4
movaps 32(%rax), %xmm5
pminub 48(%rax), %xmm5
add $64, %rax
pminub %xmm4, %xmm5
pcmpeqb %xmm0, %xmm5
pmovmskb %xmm5, %edx
test %edx, %edx
jz L(align64_loop)
/* A zero byte lies in the 64 bytes just read, which start at rax - 64.
   Re-test the blocks one by one.  rax is moved back by 80 so that the first
   block sits at 16(%rax) and the shared exit16/32/48 paths can be used.  */
pcmpeqb -64(%rax), %xmm0
sub $80, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
/* The first three blocks are clean, so the zero byte is in the fourth; its
   mask is used without a test.  */
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $64, %rax
RETURN
/* Exit paths.  edx holds a non-zero mask whose lowest set bit is the index of
   the first zero byte inside the 16-byte block that was compared; bsf turns
   it into that index.  L(exit) and L(exitNN) first convert rax from an
   address into a distance from the start of the string, and L(exitNN) then
   adds NN because the block was read at NN(%rax).  */
.p2align 4
L(exit):
sub %rdi, %rax
L(exit_less16):
/* Entered directly from the unaligned first check with rax == 0.  */
bsf %rdx, %rdx
add %rdx, %rax
RETURN
.p2align 4
L(exit16):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $16, %rax
RETURN
.p2align 4
L(exit32):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $32, %rax
RETURN
.p2align 4
L(exit48):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $48, %rax
RETURN
.p2align 4
L(exit64):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $64, %rax
/* Only the standalone build returns here.  With USE_AS_STRCAT defined this
   path has no RETURN and continues into the code after the include.  */
# ifndef USE_AS_STRCAT
RETURN
END (__strlen_sse2_pminub)
# endif
#endif

View File

@ -32,7 +32,10 @@ ENTRY(strlen)
cmpl $0, __cpu_features+KIND_OFFSET(%rip) cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f jne 1f
call __init_cpu_features call __init_cpu_features
1: leaq __strlen_sse2(%rip), %rax 1: leaq __strlen_sse2_pminub(%rip), %rax
testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
jnz 2f
leaq __strlen_sse2(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jz 2f jz 2f
leaq __strlen_sse42(%rip), %rax leaq __strlen_sse42(%rip), %rax

View File

@ -0,0 +1,8 @@
/* Compile string/strncat.c with STRNCAT defined as __strncat_sse2.
   NOTE(review): assumes string/strncat.c names its function through the
   STRNCAT macro -- confirm against that file.  */
#define STRNCAT __strncat_sse2
#ifdef SHARED
/* In shared builds, replace libc_hidden_def: whatever NAME it is given, it now
   expands to a __hidden_ver1 line naming __strncat_sse2 and __GI___strncat.
   NOTE(review): presumably this binds the internal __GI___strncat alias to
   this variant -- confirm against the __hidden_ver1 definition.  */
#undef libc_hidden_def
#define libc_hidden_def(name) \
__hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
#endif
#include "string/strncat.c"

View File

@ -0,0 +1,3 @@
/* Assemble strcat-sse2-unaligned.S with USE_AS_STRNCAT defined and STRCAT set
   to __strncat_sse2_unaligned.
   NOTE(review): presumably STRCAT is the entry-point name used by the
   included file and USE_AS_STRNCAT selects its length-limited code paths --
   confirm against strcat-sse2-unaligned.S.  */
#define USE_AS_STRNCAT
#define STRCAT __strncat_sse2_unaligned
#include "strcat-sse2-unaligned.S"

View File

@ -0,0 +1,3 @@
/* Assemble strcat-ssse3.S with USE_AS_STRNCAT defined and STRCAT set to
   __strncat_ssse3.
   NOTE(review): presumably STRCAT is the entry-point name used by the
   included file and USE_AS_STRNCAT selects its length-limited code paths --
   confirm against strcat-ssse3.S.  */
#define USE_AS_STRNCAT
#define STRCAT __strncat_ssse3
#include "strcat-ssse3.S"

View File

@ -0,0 +1,3 @@
/* Assemble strcat.S with STRCAT set to strncat and USE_AS_STRNCAT defined.
   NOTE(review): presumably strcat.S is the multiarch selector and these two
   macros make it emit the strncat symbol and pick the strncat variants --
   confirm against strcat.S.  */
#define STRCAT strncat
#define USE_AS_STRNCAT
#include "strcat.S"