
Optimized memcmp and wmemcmp for x86-64 and x86-32

Liubov Dmitrieva
2011-10-15 11:10:08 -04:00
committed by Ulrich Drepper
parent 556a200797
commit be13f7bff6
19 changed files with 3069 additions and 335 deletions

ChangeLog

@ -1,3 +1,32 @@
2011-09-27 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c.
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: New file.
* sysdeps/x86_64/multiarch/memcmp.S: Update. Add __memcmp_ssse3.
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Update.
(USE_AS_WMEMCMP): New macro.
Fixing indents.
* sysdeps/x86_64/multiarch/wmemcmp.S: New file.
* sysdeps/x86_64/multiarch/wmemcmp-ssse3.S: New file.
* sysdeps/x86_64/multiarch/wmemcmp-sse4.S: New file.
* sysdeps/x86_64/multiarch/wmemcmp-c.c: New file.
* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
wmemcmp-ssse3 wmemcmp-sse4 wmemcmp-c.
* sysdeps/i386/i686/multiarch/wmemcmp.S: New file.
* sysdeps/i386/i686/multiarch/wmemcmp-c.c: New file.
* sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S: New file.
* sysdeps/i386/i686/multiarch/wmemcmp-sse4.S: New file.
* sysdeps/i386/i686/multiarch/memcmp-sse4.S: Update.
(USE_AS_WMEMCMP): New macro.
* sysdeps/i386/i686/multiarch/memcmp-ssse3.S: Likewise.
* string/test-memcmp.c: Update.
Fix simple_wmemcmp.
Add new tests.
* wcsmbs/wmemcmp.c: Update.
(WMEMCMP): New macro.
Fix overflow bug.
2011-10-12 Andreas Jaeger <aj@suse.de>
[BZ #13268]

NEWS

@ -33,7 +33,7 @@ Version 2.15
* Optimized strchr and strrchr for SSE on x86-32.
Contributed by Liubov Dmitrieva.
* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32.
* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
Contributed by Liubov Dmitrieva.
* New interfaces: scandirat, scandirat64

string/test-memcmp.c

@ -29,9 +29,21 @@
# define MEMCPY wmemcpy
# define SIMPLE_MEMCMP simple_wmemcmp
# define CHAR wchar_t
# define MAX_CHAR 256000
# define UCHAR uint32_t
# define UCHAR wchar_t
# define CHARBYTES 4
# define CHAR__MIN WCHAR_MIN
# define CHAR__MAX WCHAR_MAX
int
simple_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n)
{
int ret = 0;
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
*/
while (n-- && (ret = *s1 < *s2 ? -1 : *s1 == *s2 ? 0 : 1) == 0) {s1++; s2++;}
return ret;
}
#else
# define MEMCMP memcmp
# define MEMCPY memcpy
@ -40,18 +52,20 @@
# define MAX_CHAR 255
# define UCHAR unsigned char
# define CHARBYTES 1
#endif
typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
# define CHAR__MIN CHAR_MIN
# define CHAR__MAX CHAR_MAX
int
SIMPLE_MEMCMP (const CHAR *s1, const CHAR *s2, size_t n)
simple_memcmp (const char *s1, const char *s2, size_t n)
{
int ret = 0;
while (n-- && (ret = *(UCHAR *) s1++ - *(UCHAR *) s2++) == 0);
while (n-- && (ret = *(unsigned char *) s1++ - *(unsigned char *) s2++) == 0);
return ret;
}
#endif
typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
IMPL (SIMPLE_MEMCMP, 0)
IMPL (MEMCMP, 1)
@ -121,7 +135,7 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
s2 = (CHAR *) (buf2 + align2);
for (i = 0; i < len; i++)
s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
s1[len] = align1;
s2[len] = align2;
@ -412,8 +426,8 @@ check1 (void)
s2[99] = 1;
s1[100] = 116;
s2[100] = 116;
s1[101] = -13;
s2[101] = -13;
s1[101] = CHAR__MIN;
s2[101] = CHAR__MAX;
s1[102] = -109;
s2[102] = -109;
s1[103] = 1;
@ -434,8 +448,8 @@ check1 (void)
s2[110] = -109;
s1[111] = 1;
s2[111] = 1;
s1[112] = 20;
s2[112] = 20;
s1[112] = CHAR__MAX;
s2[112] = CHAR__MIN;
s1[113] = -13;
s2[113] = -13;
s1[114] = -109;
@ -444,9 +458,12 @@ check1 (void)
s2[115] = 1;
n = 116;
exp_result = SIMPLE_MEMCMP (s1, s2, n);
for (size_t i = 0; i < n; i++)
{
exp_result = SIMPLE_MEMCMP (s1 + i, s2 + i, n - i);
FOR_EACH_IMPL (impl, 0)
check_result (impl, s1, s2, n, exp_result);
check_result (impl, s1 + i, s2 + i, n - i, exp_result);
}
}
int

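The Warning comment in simple_wmemcmp above states the contract that the rest of this commit implements: memcmp orders elements as unsigned bytes, while wmemcmp orders them as signed wchar_t values. A self-contained C illustration of the difference (an editor's sketch, not part of the test; it only calls the public memcmp/wmemcmp, with element values borrowed from check1):

/* Editor's sketch: memcmp compares unsigned bytes, wmemcmp compares
   signed wchar_t elements (32-bit and signed on glibc/x86).  */
#include <stdio.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  unsigned char b1[1] = { 0x80 }, b2[1] = { 0x7f };
  wchar_t w1[1] = { -13 }, w2[1] = { 20 };

  /* 0x80 > 0x7f as unsigned, so memcmp must report b1 > b2, even
     though (signed char) 0x80 would be negative.  */
  printf ("memcmp  positive: %d\n", memcmp (b1, b2, 1) > 0);

  /* -13 < 20 as signed wchar_t, so wmemcmp must report w1 < w2, even
     though -13 cast to an unsigned 32-bit value would be huge.  */
  printf ("wmemcmp negative: %d\n", wmemcmp (w1, w2, 1) < 0);
  return 0;
}

Both lines print 1; an implementation that picked the wrong signedness for either function would fail the corresponding checks above.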
sysdeps/i386/i686/multiarch/Makefile

@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
memrchr-sse2 memrchr-sse2-bsf memrchr-c \
rawmemchr-sse2 rawmemchr-sse2-bsf
rawmemchr-sse2 rawmemchr-sse2-bsf \
wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-varshift.c += -msse4

sysdeps/i386/i686/multiarch/memcmp-sse4.S

@ -1,5 +1,5 @@
/* memcmp with SSE4.2
Copyright (C) 2010 Free Software Foundation, Inc.
/* memcmp with SSE4.2, wmemcmp with SSE4.2
Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@ -21,7 +21,6 @@
#ifndef NOT_IN_libc
# include <sysdep.h>
#include "asm-syntax.h"
# ifndef MEMCMP
# define MEMCMP __memcmp_sse4_2
@ -51,6 +50,7 @@
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
jump table with relative offsets. INDEX is a register contains the
index into the jump table. SCALE is the scale of INDEX. */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
/* We first load PC into EBX. */ \
call __i686.get_pc_thunk.bx; \
@ -61,15 +61,6 @@
addl (%ebx,INDEX,SCALE), %ebx; \
/* We loaded the jump table and adjusted EDX/ESI. Go. */ \
jmp *%ebx
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
ALIGN (4)
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx
ret
# else
# define JMPTBL(I, B) I
@ -80,24 +71,46 @@ __i686.get_pc_thunk.bx:
jmp *TABLE(,INDEX,SCALE)
# endif
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
*/
.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
movl BLK1(%esp), %eax
movl BLK2(%esp), %edx
movl LEN(%esp), %ecx
# ifdef USE_AS_WMEMCMP
shl $2, %ecx
test %ecx, %ecx
jz L(return0)
# else
cmp $1, %ecx
jbe L(less1bytes)
# endif
pxor %xmm0, %xmm0
cmp $64, %ecx
ja L(64bytesormore)
cmp $8, %ecx
# ifndef USE_AS_WMEMCMP
PUSH (%ebx)
jb L(less8bytes)
# else
jb L(less8bytes)
PUSH (%ebx)
# endif
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(less8bytes):
mov (%eax), %bl
cmpb (%edx), %bl
@ -141,6 +154,7 @@ L(less8bytes):
mov 6(%eax), %bl
cmpb 6(%edx), %bl
je L(0bytes)
L(nonzero):
POP (%ebx)
mov $1, %eax
@ -149,14 +163,40 @@ L(nonzero):
L(above):
ret
CFI_PUSH (%ebx)
# endif
ALIGN (4)
.p2align 4
L(0bytes):
POP (%ebx)
xor %eax, %eax
ret
ALIGN (4)
# ifdef USE_AS_WMEMCMP
/* for wmemcmp, case N == 1 */
.p2align 4
L(less8bytes):
mov (%eax), %ecx
cmp (%edx), %ecx
je L(return0)
mov $1, %eax
jg L(find_diff_bigger)
neg %eax
ret
.p2align 4
L(find_diff_bigger):
ret
.p2align 4
L(return0):
xor %eax, %eax
ret
# endif
# ifndef USE_AS_WMEMCMP
.p2align 4
L(less1bytes):
jb L(0bytesend)
movzbl (%eax), %eax
@ -164,12 +204,12 @@ L(less1bytes):
sub %edx, %eax
ret
ALIGN (4)
.p2align 4
L(0bytesend):
xor %eax, %eax
ret
ALIGN (4)
# endif
.p2align 4
L(64bytesormore):
PUSH (%ebx)
mov %ecx, %ebx
@ -208,7 +248,14 @@ L(64bytesormore_loop):
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
ALIGN (4)
# ifdef USE_AS_WMEMCMP
/* Label needed only for filling table_64bytes */
L(unreal_case):
/* no code here */
# endif
.p2align 4
L(find_16diff):
sub $16, %ecx
L(find_32diff):
@ -218,9 +265,9 @@ L(find_48diff):
L(find_64diff):
add %ecx, %edx
add %ecx, %eax
jmp L(16bytes)
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(16bytes):
mov -16(%eax), %ecx
mov -16(%edx), %ebx
@ -243,8 +290,30 @@ L(4bytes):
mov $0, %eax
jne L(find_diff)
RETURN
# else
.p2align 4
L(16bytes):
mov -16(%eax), %ecx
cmp -16(%edx), %ecx
jne L(find_diff)
L(12bytes):
mov -12(%eax), %ecx
cmp -12(%edx), %ecx
jne L(find_diff)
L(8bytes):
mov -8(%eax), %ecx
cmp -8(%edx), %ecx
jne L(find_diff)
L(4bytes):
mov -4(%eax), %ecx
cmp -4(%edx), %ecx
mov $0, %eax
jne L(find_diff)
RETURN
# endif
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(49bytes):
movdqu -49(%eax), %xmm1
movdqu -49(%edx), %xmm2
@ -285,7 +354,7 @@ L(5bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(50bytes):
mov $-50, %ebx
movdqu -50(%eax), %xmm1
@ -330,7 +399,7 @@ L(2bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(51bytes):
mov $-51, %ebx
movdqu -51(%eax), %xmm1
@ -378,8 +447,8 @@ L(1bytes):
mov $0, %eax
jne L(end)
RETURN
ALIGN (4)
# endif
.p2align 4
L(52bytes):
movdqu -52(%eax), %xmm1
movdqu -52(%edx), %xmm2
@ -402,13 +471,18 @@ L(20bytes):
ptest %xmm2, %xmm0
jnc L(less16bytes)
mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax
jne L(find_diff)
RETURN
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(53bytes):
movdqu -53(%eax), %xmm1
movdqu -53(%edx), %xmm2
@ -440,7 +514,7 @@ L(21bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(54bytes):
movdqu -54(%eax), %xmm1
movdqu -54(%edx), %xmm2
@ -476,7 +550,7 @@ L(22bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(55bytes):
movdqu -55(%eax), %xmm1
movdqu -55(%edx), %xmm2
@ -513,8 +587,8 @@ L(23bytes):
mov $0, %eax
jne L(end)
RETURN
ALIGN (4)
# endif
.p2align 4
L(56bytes):
movdqu -56(%eax), %xmm1
movdqu -56(%edx), %xmm2
@ -538,18 +612,27 @@ L(24bytes):
jnc L(less16bytes)
mov -8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -8(%edx), %ecx
# endif
jne L(find_diff)
mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax
jne L(find_diff)
RETURN
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(57bytes):
movdqu -57(%eax), %xmm1
movdqu -57(%edx), %xmm2
@ -585,7 +668,7 @@ L(25bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(58bytes):
movdqu -58(%eax), %xmm1
movdqu -58(%edx), %xmm2
@ -627,7 +710,7 @@ L(26bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(59bytes):
movdqu -59(%eax), %xmm1
movdqu -59(%edx), %xmm2
@ -668,8 +751,8 @@ L(27bytes):
mov $0, %eax
jne L(end)
RETURN
ALIGN (4)
# endif
.p2align 4
L(60bytes):
movdqu -60(%eax), %xmm1
movdqu -60(%edx), %xmm2
@ -691,22 +774,38 @@ L(28bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
mov -12(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -12(%edx), %ecx
# endif
jne L(find_diff)
mov -8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -8(%edx), %ecx
# endif
jne L(find_diff)
mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax
jne L(find_diff)
RETURN
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(61bytes):
movdqu -61(%eax), %xmm1
movdqu -61(%edx), %xmm2
@ -749,7 +848,7 @@ L(29bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(62bytes):
movdqu -62(%eax), %xmm1
movdqu -62(%edx), %xmm2
@ -792,7 +891,7 @@ L(30bytes):
jne L(end)
RETURN
ALIGN (4)
.p2align 4
L(63bytes):
movdqu -63(%eax), %xmm1
movdqu -63(%edx), %xmm2
@ -838,8 +937,9 @@ L(31bytes):
mov $0, %eax
jne L(end)
RETURN
# endif
ALIGN (4)
.p2align 4
L(64bytes):
movdqu -64(%eax), %xmm1
movdqu -64(%edx), %xmm2
@ -863,28 +963,45 @@ L(32bytes):
jnc L(less16bytes)
mov -16(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -16(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -16(%edx), %ecx
# endif
jne L(find_diff)
mov -12(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -12(%edx), %ecx
# endif
jne L(find_diff)
mov -8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -8(%edx), %ecx
# endif
jne L(find_diff)
mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax
jne L(find_diff)
RETURN
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(less16bytes):
add %ebx, %eax
add %ebx, %edx
@ -910,9 +1027,35 @@ L(less16bytes):
mov $0, %eax
jne L(find_diff)
RETURN
# else
.p2align 4
L(less16bytes):
add %ebx, %eax
add %ebx, %edx
ALIGN (4)
mov (%eax), %ecx
cmp (%edx), %ecx
jne L(find_diff)
mov 4(%eax), %ecx
cmp 4(%edx), %ecx
jne L(find_diff)
mov 8(%eax), %ecx
cmp 8(%edx), %ecx
jne L(find_diff)
mov 12(%eax), %ecx
cmp 12(%edx), %ecx
mov $0, %eax
jne L(find_diff)
RETURN
# endif
.p2align 4
L(find_diff):
# ifndef USE_AS_WMEMCMP
cmpb %bl, %cl
jne L(end)
cmp %bx, %cx
@ -929,11 +1072,23 @@ L(end):
neg %eax
L(bigger):
ret
# else
POP (%ebx)
mov $1, %eax
jg L(bigger)
neg %eax
ret
.p2align 4
L(bigger):
ret
# endif
END (MEMCMP)
.section .rodata.sse4.2,"a",@progbits
ALIGN (2)
.p2align 2
.type L(table_64bytes), @object
# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@ -1000,5 +1155,72 @@ L(table_64bytes):
.int JMPTBL (L(62bytes), L(table_64bytes))
.int JMPTBL (L(63bytes), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
.size L(table_64bytes), .-L(table_64bytes)
# else
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(4bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(8bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(12bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(16bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(20bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(24bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(28bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(32bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(36bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(40bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(44bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(48bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(52bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(56bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(60bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
# endif
#endif
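A note on the dispatch used throughout this file: BRANCH_TO_JMPTBL_ENTRY advances both pointers past the block (the add %ecx, %eax / add %ecx, %edx above) and then indexes a table of code addresses by the remaining length, each entry falling through into the shorter cases via negative offsets. A rough C analogue, as an editor's sketch: it relies on GCC's computed-goto extension, and tail_compare with its 0..4 range is invented for illustration (the real L(table_64bytes) covers lengths 0..64):

/* Editor's sketch of the jump-table tail dispatch; hypothetical
   tail_compare, lengths 0..4 only.  */
#include <stddef.h>

static int
tail_compare (const unsigned char *a, const unsigned char *b, size_t n)
{
  /* One entry per possible remaining length, like L(table_64bytes).  */
  static const void *const table[] = { &&L0, &&L1, &&L2, &&L3, &&L4 };

  a += n;   /* like add %ecx, %eax */
  b += n;   /* like add %ecx, %edx */
  goto *table[n];

 L4: if (a[-4] != b[-4]) return a[-4] < b[-4] ? -1 : 1;
 L3: if (a[-3] != b[-3]) return a[-3] < b[-3] ? -1 : 1;
 L2: if (a[-2] != b[-2]) return a[-2] < b[-2] ? -1 : 1;
 L1: if (a[-1] != b[-1]) return a[-1] < b[-1] ? -1 : 1;
 L0: return 0;
}

Under PIC on i386 the table cannot hold absolute code addresses, so the assembly stores each entry as an offset relative to the table itself and adds the table's runtime address, obtained through __i686.get_pc_thunk.bx, before jumping.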

sysdeps/i386/i686/multiarch/memcmp-ssse3.S

@ -1,5 +1,5 @@
/* memcmp with SSSE3
Copyright (C) 2010 Free Software Foundation, Inc.
/* memcmp with SSSE3, wmemcmp with SSSE3
Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@ -21,7 +21,6 @@
#ifndef NOT_IN_libc
# include <sysdep.h>
#include "asm-syntax.h"
# ifndef MEMCMP
# define MEMCMP __memcmp_ssse3
@ -45,22 +44,40 @@
# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
.section .text.ssse3,"ax",@progbits
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
*/
atom_text_section
ENTRY (MEMCMP)
movl LEN(%esp), %ecx
# ifdef USE_AS_WMEMCMP
shl $2, %ecx
test %ecx, %ecx
jz L(zero)
# endif
movl BLK1(%esp), %eax
cmp $48, %ecx
movl BLK2(%esp), %edx
jae L(48bytesormore)
# ifndef USE_AS_WMEMCMP
cmp $1, %ecx
jbe L(less1bytes)
# endif
PUSH (%ebx)
add %ecx, %edx
add %ecx, %eax
jmp L(less48bytes)
ALIGN (4)
CFI_POP (%ebx)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(less1bytes):
jb L(zero)
movb (%eax), %cl
@ -71,13 +88,14 @@ L(less1bytes):
neg %eax
L(1bytesend):
ret
# endif
ALIGN (4)
.p2align 4
L(zero):
mov $0, %eax
xor %eax, %eax
ret
ALIGN (4)
.p2align 4
L(48bytesormore):
PUSH (%ebx)
PUSH (%esi)
@ -104,6 +122,7 @@ L(48bytesormore):
jz L(shr_0)
xor %edx, %esi
# ifndef USE_AS_WMEMCMP
cmp $8, %edx
jae L(next_unaligned_table)
cmp $0, %edx
@ -122,7 +141,7 @@ L(48bytesormore):
je L(shr_6)
jmp L(shr_7)
ALIGN (4)
.p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
@ -139,8 +158,17 @@ L(next_unaligned_table):
cmp $14, %edx
je L(shr_14)
jmp L(shr_15)
# else
cmp $0, %edx
je L(shr_0)
cmp $4, %edx
je L(shr_4)
cmp $8, %edx
je L(shr_8)
jmp L(shr_12)
# endif
ALIGN (4)
.p2align 4
L(shr_0):
cmp $80, %ecx
jae L(shr_0_gobble)
@ -165,7 +193,7 @@ L(shr_0):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_0_gobble):
lea -48(%ecx), %ecx
movdqa (%esi), %xmm0
@ -209,9 +237,10 @@ L(shr_0_gobble_loop_next):
POP (%esi)
jmp L(less48bytes)
# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_1):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -241,7 +270,7 @@ L(shr_1):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_1_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -295,7 +324,7 @@ L(shr_1_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_2):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -325,7 +354,7 @@ L(shr_2):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_2_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -378,7 +407,7 @@ L(shr_2_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_3):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -408,7 +437,7 @@ L(shr_3):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_3_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -458,10 +487,11 @@ L(shr_3_gobble_next):
POP (%edi)
POP (%esi)
jmp L(less48bytes)
# endif
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_4):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -491,7 +521,7 @@ L(shr_4):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_4_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -542,9 +572,10 @@ L(shr_4_gobble_next):
POP (%esi)
jmp L(less48bytes)
# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_5):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -574,7 +605,7 @@ L(shr_5):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_5_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -627,7 +658,7 @@ L(shr_5_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_6):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -657,7 +688,7 @@ L(shr_6):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_6_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -710,7 +741,7 @@ L(shr_6_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_7):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -740,7 +771,7 @@ L(shr_7):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_7_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -790,10 +821,11 @@ L(shr_7_gobble_next):
POP (%edi)
POP (%esi)
jmp L(less48bytes)
# endif
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_8):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -823,7 +855,7 @@ L(shr_8):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_8_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -874,9 +906,10 @@ L(shr_8_gobble_next):
POP (%esi)
jmp L(less48bytes)
# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_9):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -906,7 +939,7 @@ L(shr_9):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_9_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -959,7 +992,7 @@ L(shr_9_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_10):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -989,7 +1022,7 @@ L(shr_10):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_10_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -1042,7 +1075,7 @@ L(shr_10_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_11):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -1072,7 +1105,7 @@ L(shr_11):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_11_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -1122,10 +1155,11 @@ L(shr_11_gobble_next):
POP (%edi)
POP (%esi)
jmp L(less48bytes)
# endif
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_12):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -1155,7 +1189,7 @@ L(shr_12):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_12_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -1206,9 +1240,10 @@ L(shr_12_gobble_next):
POP (%esi)
jmp L(less48bytes)
# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_13):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -1238,7 +1273,7 @@ L(shr_13):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_13_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -1291,7 +1326,7 @@ L(shr_13_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_14):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -1321,7 +1356,7 @@ L(shr_14):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_14_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -1374,7 +1409,7 @@ L(shr_14_gobble_next):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_15):
cmp $80, %ecx
lea -48(%ecx), %ecx
@ -1404,7 +1439,7 @@ L(shr_15):
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(shr_15_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@ -1454,10 +1489,11 @@ L(shr_15_gobble_next):
POP (%edi)
POP (%esi)
jmp L(less48bytes)
# endif
cfi_restore_state
cfi_remember_state
ALIGN (4)
.p2align 4
L(exit):
pmovmskb %xmm1, %ebx
sub $0xffff, %ebx
@ -1465,9 +1501,12 @@ L(exit):
lea -16(%esi), %esi
lea -16(%edi), %edi
mov %ebx, %edx
L(first16bytes):
add %eax, %esi
L(less16bytes):
# ifndef USE_AS_WMEMCMP
test %dl, %dl
jz L(next_24_bytes)
@ -1497,56 +1536,56 @@ L(Byte23):
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(Byte16):
movzbl -16(%edi), %eax
movzbl -16(%esi), %edx
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(Byte17):
movzbl -15(%edi), %eax
movzbl -15(%esi), %edx
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(Byte18):
movzbl -14(%edi), %eax
movzbl -14(%esi), %edx
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(Byte19):
movzbl -13(%edi), %eax
movzbl -13(%esi), %edx
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(Byte20):
movzbl -12(%edi), %eax
movzbl -12(%esi), %edx
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(Byte21):
movzbl -11(%edi), %eax
movzbl -11(%esi), %edx
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(Byte22):
movzbl -10(%edi), %eax
movzbl -10(%esi), %edx
sub %edx, %eax
RETURN
ALIGN (4)
.p2align 4
L(next_24_bytes):
lea 8(%edi), %edi
lea 8(%esi), %esi
@ -1571,20 +1610,69 @@ L(next_24_bytes):
test $0x40, %dh
jnz L(Byte22)
ALIGN (4)
.p2align 4
L(Byte31):
movzbl -9(%edi), %eax
movzbl -9(%esi), %edx
sub %edx, %eax
RETURN_END
# else
/* special for wmemcmp */
xor %eax, %eax
test %dl, %dl
jz L(next_two_double_words)
and $15, %dl
jz L(second_double_word)
mov -16(%edi), %eax
cmp -16(%esi), %eax
jne L(nequal)
RETURN
.p2align 4
L(second_double_word):
mov -12(%edi), %eax
cmp -12(%esi), %eax
jne L(nequal)
RETURN
.p2align 4
L(next_two_double_words):
and $15, %dh
jz L(fourth_double_word)
mov -8(%edi), %eax
cmp -8(%esi), %eax
jne L(nequal)
RETURN
.p2align 4
L(fourth_double_word):
mov -4(%edi), %eax
cmp -4(%esi), %eax
jne L(nequal)
RETURN
.p2align 4
L(nequal):
mov $1, %eax
jg L(nequal_bigger)
neg %eax
RETURN
.p2align 4
L(nequal_bigger):
RETURN_END
# endif
CFI_PUSH (%ebx)
ALIGN (4)
.p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
cmp $8, %ecx
je L(8bytes)
# ifndef USE_AS_WMEMCMP
cmp $9, %ecx
je L(9bytes)
cmp $10, %ecx
@ -1598,13 +1686,17 @@ L(more8bytes):
cmp $14, %ecx
je L(14bytes)
jmp L(15bytes)
# else
jmp L(12bytes)
# endif
ALIGN (4)
.p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
cmp $16, %ecx
je L(16bytes)
# ifndef USE_AS_WMEMCMP
cmp $17, %ecx
je L(17bytes)
cmp $18, %ecx
@ -1618,13 +1710,17 @@ L(more16bytes):
cmp $22, %ecx
je L(22bytes)
jmp L(23bytes)
# else
jmp L(20bytes)
# endif
ALIGN (4)
.p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
cmp $24, %ecx
je L(24bytes)
# ifndef USE_AS_WMEMCMP
cmp $25, %ecx
je L(25bytes)
cmp $26, %ecx
@ -1638,13 +1734,17 @@ L(more24bytes):
cmp $30, %ecx
je L(30bytes)
jmp L(31bytes)
# else
jmp L(28bytes)
# endif
ALIGN (4)
.p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
cmp $32, %ecx
je L(32bytes)
# ifndef USE_AS_WMEMCMP
cmp $33, %ecx
je L(33bytes)
cmp $34, %ecx
@ -1658,11 +1758,35 @@ L(more32bytes):
cmp $38, %ecx
je L(38bytes)
jmp L(39bytes)
# else
jmp L(36bytes)
# endif
ALIGN (4)
.p2align 4
L(less48bytes):
cmp $8, %ecx
jae L(more8bytes)
# ifndef USE_AS_WMEMCMP
cmp $2, %ecx
je L(2bytes)
cmp $3, %ecx
je L(3bytes)
cmp $4, %ecx
je L(4bytes)
cmp $5, %ecx
je L(5bytes)
cmp $6, %ecx
je L(6bytes)
jmp L(7bytes)
# else
jmp L(4bytes)
# endif
.p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
# ifndef USE_AS_WMEMCMP
cmp $41, %ecx
je L(41bytes)
cmp $42, %ecx
@ -1677,23 +1801,7 @@ L(more40bytes):
je L(46bytes)
jmp L(47bytes)
ALIGN (4)
L(less48bytes):
cmp $8, %ecx
jae L(more8bytes)
cmp $2, %ecx
je L(2bytes)
cmp $3, %ecx
je L(3bytes)
cmp $4, %ecx
je L(4bytes)
cmp $5, %ecx
je L(5bytes)
cmp $6, %ecx
je L(6bytes)
jmp L(7bytes)
ALIGN (4)
.p2align 4
L(44bytes):
mov -44(%eax), %ecx
mov -44(%edx), %ebx
@ -1753,8 +1861,61 @@ L(4bytes):
POP (%ebx)
ret
CFI_PUSH (%ebx)
# else
.p2align 4
L(44bytes):
mov -44(%eax), %ecx
cmp -44(%edx), %ecx
jne L(find_diff)
L(40bytes):
mov -40(%eax), %ecx
cmp -40(%edx), %ecx
jne L(find_diff)
L(36bytes):
mov -36(%eax), %ecx
cmp -36(%edx), %ecx
jne L(find_diff)
L(32bytes):
mov -32(%eax), %ecx
cmp -32(%edx), %ecx
jne L(find_diff)
L(28bytes):
mov -28(%eax), %ecx
cmp -28(%edx), %ecx
jne L(find_diff)
L(24bytes):
mov -24(%eax), %ecx
cmp -24(%edx), %ecx
jne L(find_diff)
L(20bytes):
mov -20(%eax), %ecx
cmp -20(%edx), %ecx
jne L(find_diff)
L(16bytes):
mov -16(%eax), %ecx
cmp -16(%edx), %ecx
jne L(find_diff)
L(12bytes):
mov -12(%eax), %ecx
cmp -12(%edx), %ecx
jne L(find_diff)
L(8bytes):
mov -8(%eax), %ecx
cmp -8(%edx), %ecx
jne L(find_diff)
L(4bytes):
mov -4(%eax), %ecx
xor %eax, %eax
cmp -4(%edx), %ecx
jne L(find_diff)
POP (%ebx)
ret
CFI_PUSH (%ebx)
# endif
ALIGN (4)
# ifndef USE_AS_WMEMCMP
.p2align 4
L(45bytes):
mov -45(%eax), %ecx
mov -45(%edx), %ebx
@ -1818,7 +1979,7 @@ L(5bytes):
ret
CFI_PUSH (%ebx)
ALIGN (4)
.p2align 4
L(46bytes):
mov -46(%eax), %ecx
mov -46(%edx), %ebx
@ -1886,7 +2047,7 @@ L(2bytes):
ret
CFI_PUSH (%ebx)
ALIGN (4)
.p2align 4
L(47bytes):
movl -47(%eax), %ecx
movl -47(%edx), %ebx
@ -1957,7 +2118,7 @@ L(3bytes):
ret
CFI_PUSH (%ebx)
ALIGN (4)
.p2align 4
L(find_diff):
cmpb %bl, %cl
jne L(end)
@ -1968,6 +2129,8 @@ L(find_diff):
cmp %bl, %cl
jne L(end)
cmp %bx, %cx
.p2align 4
L(end):
POP (%ebx)
mov $1, %eax
@ -1975,7 +2138,21 @@ L(end):
neg %eax
L(bigger):
ret
# else
END (MEMCMP)
/* for wmemcmp */
.p2align 4
L(find_diff):
POP (%ebx)
mov $1, %eax
jg L(find_diff_bigger)
neg %eax
ret
.p2align 4
L(find_diff_bigger):
ret
# endif
END (MEMCMP)
#endif
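A structural note on the SSSE3 variant above: the shr_0 through shr_15 entry points cover the 16 possible relative misalignments of the two sources, realigning the 16-byte loads with SSSE3 shifts. For wmemcmp both arguments point to wchar_t arrays and are therefore at least 4-byte aligned, so their relative misalignment is always a multiple of 4; that is why the USE_AS_WMEMCMP build dispatches only to shr_0, shr_4, shr_8 and shr_12 and compiles the other twelve paths out.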

sysdeps/i386/i686/multiarch/wmemcmp-c.c

@ -0,0 +1,5 @@
#ifndef NOT_IN_libc
# define WMEMCMP __wmemcmp_ia32
#endif
#include "wcsmbs/wmemcmp.c"

sysdeps/i386/i686/multiarch/wmemcmp-sse4.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_sse4_2
#include "memcmp-sse4.S"

sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_ssse3
#include "memcmp-ssse3.S"

sysdeps/i386/i686/multiarch/wmemcmp.S

@ -0,0 +1,59 @@
/* Multiple versions of wmemcmp
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
.p2align 4
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx
ret
.text
ENTRY(wmemcmp)
.type wmemcmp, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax
testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4)
cfi_restore (ebx)
ret
END(wmemcmp)
#endif
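Note that the __i686.get_pc_thunk.bx helper deleted from memcmp-sse4.S earlier in this commit now lives here instead: it copies the return address into %ebx so that position-independent i386 code can compute the GOT base (and, in BRANCH_TO_JMPTBL_ENTRY, the jump-table address) relative to the current instruction pointer.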

sysdeps/x86_64/multiarch/Makefile

@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
strrchr-sse2-no-bsf strchr-sse2-no-bsf
strrchr-sse2-no-bsf strchr-sse2-no-bsf \
memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4

sysdeps/x86_64/multiarch/memcmp-sse4.S

@ -1,5 +1,5 @@
/* memcmp with SSE4.1
Copyright (C) 2010 Free Software Foundation, Inc.
/* memcmp with SSE4.1, wmemcmp with SSE4.1
Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@ -21,7 +21,6 @@
#ifndef NOT_IN_libc
# include <sysdep.h>
#include "asm-syntax.h"
# ifndef MEMCMP
# define MEMCMP __memcmp_sse4_1
@ -40,23 +39,35 @@
jmp *%rcx; \
ud2
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
*/
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
shl $2, %rdx
# endif
pxor %xmm0, %xmm0
cmp $79, %rdx
ja L(79bytesormore)
# ifndef USE_AS_WMEMCMP
cmp $1, %rdx
je L(firstbyte)
# endif
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
# ifndef USE_AS_WMEMCMP
ALIGN (4)
L(firstbyte):
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
sub %ecx, %eax
ret
# endif
ALIGN (4)
L(79bytesormore):
@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(L2_L3_cache_aglined):
sub $64, %rdx
ALIGN (4)
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
@ -803,13 +815,19 @@ L(12bytes):
jne L(diffin8bytes)
L(4bytes):
mov -4(%rsi), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx
# else
cmp -4(%rdi), %ecx
# endif
jne L(diffin4bytes)
L(0bytes):
xor %eax, %eax
ret
# ifndef USE_AS_WMEMCMP
/* unreal case for wmemcmp */
ALIGN (4)
L(65bytes):
movdqu -65(%rdi), %xmm1
@ -1017,6 +1035,7 @@ L(1bytes):
movzbl -1(%rsi), %ecx
sub %ecx, %eax
ret
# endif
ALIGN (4)
L(68bytes):
@ -1047,13 +1066,20 @@ L(20bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx
# else
cmp -4(%rdi), %ecx
# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
ALIGN (4)
L(69bytes):
movdqu -69(%rsi), %xmm1
@ -1161,6 +1187,7 @@ L(23bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
# endif
ALIGN (4)
L(72bytes):
@ -1191,13 +1218,16 @@ L(24bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
mov -8(%rdi), %rax
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
ret
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
ALIGN (4)
L(73bytes):
movdqu -73(%rsi), %xmm1
@ -1312,7 +1342,7 @@ L(27bytes):
jne L(diffin4bytes)
xor %eax, %eax
ret
# endif
ALIGN (4)
L(76bytes):
movdqu -76(%rsi), %xmm1
@ -1346,13 +1376,19 @@ L(28bytes):
mov -12(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx
# else
cmp -4(%rdi), %ecx
# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
ALIGN (4)
L(77bytes):
movdqu -77(%rsi), %xmm1
@ -1474,7 +1510,7 @@ L(31bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
# endif
ALIGN (4)
L(64bytes):
movdqu -64(%rdi), %xmm2
@ -1527,7 +1563,17 @@ L(diffin8bytes):
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
# ifdef USE_AS_WMEMCMP
/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
ret
# endif
L(diffin4bytes):
# ifndef USE_AS_WMEMCMP
cmp %cx, %ax
jne L(diffin2bytes)
shr $16, %ecx
@ -1546,11 +1592,28 @@ L(end):
and $0xff, %ecx
sub %ecx, %eax
ret
# else
/* for wmemcmp */
mov $1, %eax
jl L(nequal_bigger)
neg %eax
ret
ALIGN (4)
L(nequal_bigger):
ret
L(unreal_case):
xor %eax, %eax
ret
# endif
END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits
ALIGN (3)
# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@ -1632,4 +1695,87 @@ L(table_64bytes):
.int JMPTBL (L(77bytes), L(table_64bytes))
.int JMPTBL (L(78bytes), L(table_64bytes))
.int JMPTBL (L(79bytes), L(table_64bytes))
# else
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(4bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(8bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(12bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(16bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(20bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(24bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(28bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(32bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(36bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(40bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(44bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(48bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(52bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(56bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(60bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(68bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(72bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(76bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
# endif
#endif

sysdeps/x86_64/multiarch/memcmp-ssse3.S (file diff suppressed because it is too large)

sysdeps/x86_64/multiarch/memcmp.S

@ -1,5 +1,5 @@
/* Multiple versions of memcmp
Copyright (C) 2010 Free Software Foundation, Inc.
Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@ -29,11 +29,20 @@ ENTRY(memcmp)
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
1: leaq __memcmp_sse2(%rip), %rax
testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
jz 2f
1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 2f
leaq __memcmp_sse2(%rip), %rax
ret
2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
jz 3f
leaq __memcmp_sse4_1(%rip), %rax
2: ret
ret
3: leaq __memcmp_ssse3(%rip), %rax
ret
END(memcmp)
# undef ENTRY
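The rewritten resolver now tests SSSE3 first: without SSSE3 it returns __memcmp_sse2, with both SSSE3 and SSE4.1 it returns __memcmp_sse4_1, and with SSSE3 alone it returns the new __memcmp_ssse3. The wmemcmp resolver below follows exactly the same order; a C sketch of that dispatch appears after it.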

sysdeps/x86_64/multiarch/wmemcmp-c.c

@ -0,0 +1,5 @@
#ifndef NOT_IN_libc
# define WMEMCMP __wmemcmp_sse2
#endif
#include "wcsmbs/wmemcmp.c"

sysdeps/x86_64/multiarch/wmemcmp-sse4.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_sse4_1
#include "memcmp-sse4.S"

sysdeps/x86_64/multiarch/wmemcmp-ssse3.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_ssse3
#include "memcmp-ssse3.S"

sysdeps/x86_64/multiarch/wmemcmp.S

@ -0,0 +1,47 @@
/* Multiple versions of wmemcmp
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.text
ENTRY(wmemcmp)
.type wmemcmp, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 2f
leaq __wmemcmp_sse2(%rip), %rax
ret
2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
jz 3f
leaq __wmemcmp_sse4_1(%rip), %rax
ret
3: leaq __wmemcmp_ssse3(%rip), %rax
ret
END(wmemcmp)
#endif
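Both wmemcmp.S files in this commit are IFUNC dispatchers: the resolver runs once, at symbol-resolution time, and returns a pointer to the best variant for the running CPU. The same three-way selection written in C, as an editor's sketch using GCC's ifunc attribute and __builtin_cpu_supports (glibc itself uses the hand-written assembly and its __cpu_features data above; the extern declarations stand in for the assembly implementations, so the sketch does not link on its own):

/* Editor's sketch of the dispatch order; not the glibc source.  */
#include <stddef.h>
#include <wchar.h>

extern int __wmemcmp_sse2 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_ssse3 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_sse4_1 (const wchar_t *, const wchar_t *, size_t);

typedef int wmemcmp_fn (const wchar_t *, const wchar_t *, size_t);

static wmemcmp_fn *
resolve_wmemcmp (void)
{
  __builtin_cpu_init ();
  if (!__builtin_cpu_supports ("ssse3"))
    return __wmemcmp_sse2;          /* label 1 above: no SSSE3 */
  if (__builtin_cpu_supports ("sse4.1"))
    return __wmemcmp_sse4_1;        /* label 2: SSSE3 and SSE4.1 */
  return __wmemcmp_ssse3;           /* label 3: SSSE3 only */
}

int wmemcmp (const wchar_t *, const wchar_t *, size_t)
  __attribute__ ((ifunc ("resolve_wmemcmp")));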

wcsmbs/wmemcmp.c

@ -1,4 +1,4 @@
/* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
/* Copyright (C) 1996, 1997, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
@ -19,9 +19,12 @@
#include <wchar.h>
#ifndef WMEMCMP
# define WMEMCMP wmemcmp
#endif
int
wmemcmp (s1, s2, n)
WMEMCMP (s1, s2, n)
const wchar_t *s1;
const wchar_t *s2;
size_t n;
@ -34,19 +37,19 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
return c1 - c2;
return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[1];
c2 = (wint_t) s2[1];
if (c1 - c2 != 0)
return c1 - c2;
return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[2];
c2 = (wint_t) s2[2];
if (c1 - c2 != 0)
return c1 - c2;
return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[3];
c2 = (wint_t) s2[3];
if (c1 - c2 != 0)
return c1 - c2;
return c1 > c2 ? 1 : -1;
s1 += 4;
s2 += 4;
n -= 4;
@ -57,7 +60,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
return c1 - c2;
return c1 > c2 ? 1 : -1;
++s1;
++s2;
--n;
@ -67,7 +70,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
return c1 - c2;
return c1 > c2 ? 1 : -1;
++s1;
++s2;
--n;
@ -77,7 +80,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
return c1 - c2;
return c1 > c2 ? 1 : -1;
}
return 0;
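The repeated change in this file, from "return c1 - c2;" to "return c1 > c2 ? 1 : -1;", is the "Fix overflow bug" item from the ChangeLog: the difference of two arbitrary 32-bit elements does not fit in 32 bits, so the returned value can wrap and carry the wrong sign. An editor's demo, assuming glibc's 32-bit wchar_t (old_cmp reproduces the old arithmetic in well-defined unsigned form, new_cmp the fixed sign-only result):

/* Editor's demo of the wmemcmp overflow fix; not glibc source.  */
#include <stdio.h>
#include <wchar.h>

static int
old_cmp (wchar_t a, wchar_t b)
{
  unsigned int c1 = (unsigned int) a, c2 = (unsigned int) b;
  return (int) (c1 - c2);   /* old code: difference wraps modulo 2^32 */
}

static int
new_cmp (wchar_t a, wchar_t b)
{
  return a == b ? 0 : (a < b ? -1 : 1);   /* new code: sign only */
}

int
main (void)
{
  /* The same extreme pair the updated check1 now stores at index 101
     (CHAR__MIN against CHAR__MAX in the wide-character build).  */
  wchar_t a = WCHAR_MIN, b = WCHAR_MAX;
  printf ("old: %d  new: %d\n", old_cmp (a, b), new_cmp (a, b));
  /* Prints "old: 1  new: -1": the wrapped difference has the wrong
     sign; only the comparison-based result is correct.  */
  return 0;
}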