
Optimize x86-64 SSE4 memcmp for unaligned data.

H.J. Lu
2010-04-14 17:53:44 -07:00
committed by Ulrich Drepper
parent 404a6e3201
commit dd37cd1a12
3 changed files with 380 additions and 6 deletions
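
The core of the change: for buffers that are not mutually 16-byte aligned, each 16-byte block is now loaded with movdqu, XORed against the other buffer, and tested with the SSE4.1 ptest instruction against an all-zero register, so a clear carry flag means "some byte differs". Below is a minimal C sketch of that per-block idiom using SSE4.1 intrinsics; it is illustrative only (the helper name is invented, the commit itself is pure assembly).

    #include <smmintrin.h>   /* SSE4.1 intrinsics */

    /* Illustrative only: compare one 16-byte block from two possibly
       unaligned buffers the way the new path does (movdqu + pxor + ptest).
       Returns nonzero if the blocks differ.  */
    static int
    blocks_differ_16 (const void *p1, const void *p2)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) p1);   /* movdqu */
      __m128i b = _mm_loadu_si128 ((const __m128i *) p2);   /* movdqu */
      __m128i x = _mm_xor_si128 (a, b);                     /* pxor   */
      /* ptest sets ZF iff the XOR result is all zero.  */
      return !_mm_testz_si128 (x, x);
    }

The pre-existing aligned path does the same with movdqa loads; the new "test $0xf, %rdi / jz L(2aligned)" near the top of L(79bytesormore) sends buffers that are still 16-byte aligned after %rsi has been aligned to that movdqa-based copy of the logic.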

ChangeLog

@@ -1,3 +1,8 @@
2010-04-14 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Optimized for unaligned
data.
2010-04-12 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add

malloc/malloc.c

@@ -3168,6 +3168,10 @@ static Void_t* sYSMALLOc(nb, av) INTERNAL_SIZE_T nb; mstate av;
size = nb + mp_.top_pad + MINSIZE;
#define TWOM (2*1024*1024)
char *cur = (char*)MORECORE(0);
size = (char*)((size_t)(cur + size + TWOM - 1)&~(TWOM-1))-cur;
/*
If contiguous, we can subtract out existing space that we hope to
combine with new space. We add it back later only if
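
The malloc.c hunk reads the current break with MORECORE(0) and enlarges the request so that the new break ends on a 2 MiB (TWOM) boundary. A small C sketch of that rounding arithmetic (the function name is invented; TWOM is taken from the hunk):

    #include <stddef.h>

    #define TWOM (2 * 1024 * 1024)

    /* Grow a MORECORE request of `size' bytes so that cur + size
       lands on the next 2 MiB boundary, as in the hunk above.  */
    static size_t
    round_request_to_2m (char *cur, size_t size)
    {
      char *end = (char *) (((size_t) (cur + size) + TWOM - 1)
                            & ~(size_t) (TWOM - 1));
      return (size_t) (end - cur);
    }

For example, with cur = 0x601000 and size = 0x21000 the rounded request becomes 0x1ff000, so the new break ends exactly at 0x800000.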

sysdeps/x86_64/multiarch/memcmp-sse4.S

@@ -72,6 +72,8 @@ L(79bytesormore):
sub %rcx, %rdi
add %rcx, %rdx
test $0xf, %rdi
jz L(2aligned)
cmp $128, %rdx
ja L(128bytesormore)
@@ -120,8 +122,10 @@ L(less32bytesin64):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(128bytesormore):
cmp $512, %rdx
ja L(512bytesormore)
cmp $256, %rdx
ja L(less512bytes)
L(less256bytes):
sub $128, %rdx
@@ -191,9 +195,6 @@ L(less32bytesin128):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(256bytesormore):
cmp $512, %rdx
ja L(512bytesormore)
L(less512bytes):
sub $256, %rdx
movdqu (%rdi), %xmm2
@@ -307,7 +308,18 @@ L(less32bytesin256):
ALIGN (4)
L(512bytesormore):
#ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
#else
mov __x86_64_data_cache_size_half(%rip), %r8
#endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
cmp %r8, %rdx
ja L(L2_L3_cache_unaglined)
sub $64, %rdx
ALIGN (4)
L(64bytesormore_loop):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
@@ -337,6 +349,357 @@ L(64bytesormore_loop):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(L2_L3_cache_unaglined):
sub $64, %rdx
ALIGN (4)
L(L2_L3_unaligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
movdqa %xmm2, %xmm1
movdqu 16(%rdi), %xmm3
pxor 16(%rsi), %xmm3
por %xmm3, %xmm1
movdqu 32(%rdi), %xmm4
pxor 32(%rsi), %xmm4
por %xmm4, %xmm1
movdqu 48(%rdi), %xmm5
pxor 48(%rsi), %xmm5
por %xmm5, %xmm1
ptest %xmm1, %xmm0
jnc L(64bytesormore_loop_end)
add $64, %rsi
add $64, %rdi
sub $64, %rdx
jae L(L2_L3_unaligned_128bytes_loop)
add $64, %rdx
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
/*
* This case is for machines which are sensitive to unaligned instructions.
*/
ALIGN (4)
L(2aligned):
cmp $128, %rdx
ja L(128bytesormorein2aligned)
L(less128bytesin2aligned):
sub $64, %rdx
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(64bytesin256)
cmp $32, %rdx
jb L(less32bytesin64in2alinged)
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(96bytesin256)
sub $32, %rdx
add $32, %rdi
add $32, %rsi
L(less32bytesin64in2alinged):
add $64, %rdi
add $64, %rsi
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4)
L(128bytesormorein2aligned):
cmp $512, %rdx
ja L(512bytesormorein2aligned)
cmp $256, %rdx
ja L(256bytesormorein2aligned)
L(less256bytesin2alinged):
sub $128, %rdx
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(64bytesin256)
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(96bytesin256)
movdqa 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(112bytesin256)
movdqa 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(128bytesin256)
add $128, %rsi
add $128, %rdi
cmp $64, %rdx
jae L(less128bytesin2aligned)
cmp $32, %rdx
jb L(less32bytesin128in2aligned)
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
sub $32, %rdx
add $32, %rdi
add $32, %rsi
L(less32bytesin128in2aligned):
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4)
L(256bytesormorein2aligned):
sub $256, %rdx
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(64bytesin256)
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(96bytesin256)
movdqa 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(112bytesin256)
movdqa 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(128bytesin256)
movdqa 128(%rdi), %xmm2
pxor 128(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(144bytesin256)
movdqa 144(%rdi), %xmm2
pxor 144(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(160bytesin256)
movdqa 160(%rdi), %xmm2
pxor 160(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(176bytesin256)
movdqa 176(%rdi), %xmm2
pxor 176(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(192bytesin256)
movdqa 192(%rdi), %xmm2
pxor 192(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(208bytesin256)
movdqa 208(%rdi), %xmm2
pxor 208(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(224bytesin256)
movdqa 224(%rdi), %xmm2
pxor 224(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(240bytesin256)
movdqa 240(%rdi), %xmm2
pxor 240(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(256bytesin256)
add $256, %rsi
add $256, %rdi
cmp $128, %rdx
jae L(less256bytesin2alinged)
cmp $64, %rdx
jae L(less128bytesin2aligned)
cmp $32, %rdx
jb L(less32bytesin256in2alinged)
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
sub $32, %rdx
add $32, %rdi
add $32, %rsi
L(less32bytesin256in2alinged):
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4)
L(512bytesormorein2aligned):
#ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
#else
mov __x86_64_data_cache_size_half(%rip), %r8
#endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
cmp %r8, %rdx
ja L(L2_L3_cache_aglined)
sub $64, %rdx
ALIGN (4)
L(64bytesormore_loopin2aligned):
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
movdqa %xmm2, %xmm1
movdqa 16(%rdi), %xmm3
pxor 16(%rsi), %xmm3
por %xmm3, %xmm1
movdqa 32(%rdi), %xmm4
pxor 32(%rsi), %xmm4
por %xmm4, %xmm1
movdqa 48(%rdi), %xmm5
pxor 48(%rsi), %xmm5
por %xmm5, %xmm1
ptest %xmm1, %xmm0
jnc L(64bytesormore_loop_end)
add $64, %rsi
add $64, %rdi
sub $64, %rdx
jae L(64bytesormore_loopin2aligned)
add $64, %rdx
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(L2_L3_cache_aglined):
sub $64, %rdx
ALIGN (4)
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
movdqa %xmm2, %xmm1
movdqa 16(%rdi), %xmm3
pxor 16(%rsi), %xmm3
por %xmm3, %xmm1
movdqa 32(%rdi), %xmm4
pxor 32(%rsi), %xmm4
por %xmm4, %xmm1
movdqa 48(%rdi), %xmm5
pxor 48(%rsi), %xmm5
por %xmm5, %xmm1
ptest %xmm1, %xmm0
jnc L(64bytesormore_loop_end)
add $64, %rsi
add $64, %rdi
sub $64, %rdx
jae L(L2_L3_aligned_128bytes_loop)
add $64, %rdx
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
ALIGN (4)
L(64bytesormore_loop_end):
add $16, %rdi
@@ -1147,7 +1510,10 @@ L(32bytes):
xor %eax, %eax
ret
/*
* Aligned to 8 bytes to avoid 2 taken branches in one 16-byte aligned code block.
*/
ALIGN (3)
L(less16bytes):
movsbq %dl, %rdx
mov (%rsi, %rdx), %rcx
@@ -1166,7 +1532,6 @@ L(diffin4bytes):
jne L(diffin2bytes)
shr $16, %ecx
shr $16, %eax
xor %rdx, %rdx
L(diffin2bytes):
cmp %cl, %al
jne L(end)
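
Taken together, each of the new >512-byte paths computes three quarters of the data cache size into %r8 (half the cache plus half of that) and, when the length exceeds it, switches to a loop that issues prefetchnta roughly 0x1c0 bytes ahead of both pointers. Every iteration of either loop compares 64 bytes with a single ptest by OR-ing together the XOR results of four 16-byte blocks. A rough, illustrative C equivalent of the unaligned variant of that loop (names are invented; the real code is assembly and hands the tail to a jump table):

    #include <stddef.h>
    #include <smmintrin.h>   /* SSE4.1 */

    /* Sketch of the 64-bytes-per-iteration loop.  `use_nt_prefetch'
       stands in for the cache-size check done once before the loop.  */
    static int
    blocks_differ_64 (const char *p1, const char *p2, size_t n,
                      int use_nt_prefetch)
    {
      while (n >= 64)
        {
          if (use_nt_prefetch)
            {
              /* prefetchnta 0x1c0(%rdi) / 0x1c0(%rsi) in the assembly.  */
              _mm_prefetch (p1 + 0x1c0, _MM_HINT_NTA);
              _mm_prefetch (p2 + 0x1c0, _MM_HINT_NTA);
            }
          __m128i acc = _mm_setzero_si128 ();
          for (int i = 0; i < 64; i += 16)
            {
              __m128i a = _mm_loadu_si128 ((const __m128i *) (p1 + i));
              __m128i b = _mm_loadu_si128 ((const __m128i *) (p2 + i));
              acc = _mm_or_si128 (acc, _mm_xor_si128 (a, b));   /* pxor + por */
            }
          if (!_mm_testz_si128 (acc, acc))   /* ptest: some byte differed */
            return 1;
          p1 += 64;
          p2 += 64;
          n -= 64;
        }
      return 0;   /* tail of fewer than 64 bytes handled elsewhere */
    }

The aligned variant is identical except for movdqa loads, and the non-prefetching loops simply omit the two prefetchnta instructions.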