mirror of
				https://sourceware.org/git/glibc.git
				synced 2025-11-03 20:53:13 +03:00 
			
		
		
		
	* All files with FSF copyright notices: Update copyright dates using scripts/update-copyrights. * locale/programs/charmap-kw.h: Regenerated. * locale/programs/locfile-kw.h: Likewise.
		
			
				
	
	
		
			248 lines
		
	
	
		
			5.7 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			248 lines
		
	
	
		
			5.7 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/* Optimized strcmp implementation for PowerPC64/POWER8.
 | 
						|
   Copyright (C) 2015-2018 Free Software Foundation, Inc.
 | 
						|
   This file is part of the GNU C Library.
 | 
						|
 | 
						|
   The GNU C Library is free software; you can redistribute it and/or
 | 
						|
   modify it under the terms of the GNU Lesser General Public
 | 
						|
   License as published by the Free Software Foundation; either
 | 
						|
   version 2.1 of the License, or (at your option) any later version.
 | 
						|
 | 
						|
   The GNU C Library is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
						|
   Lesser General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU Lesser General Public
 | 
						|
   License along with the GNU C Library; if not, see
 | 
						|
   <http://www.gnu.org/licenses/>.  */
 | 
						|
 | 
						|
#include <sysdep.h>

/* Use strcmp as the entry-point name unless STRCMP has already been
   defined before this point.  */
#if !defined STRCMP
# define STRCMP strcmp
#endif

/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses to avoid
   specialized code paths depending on data alignment.  Although recent
   powerpc64 systems use 64K pages by default, the page cross handling
   assumes a minimum page size of 4K.  */

ENTRY_TOCLESS (STRCMP, 4)
	/* r0 stays zero from here on; cmpb against it flags the NUL bytes
	   of a doubleword.  */
	li	r0,0

	/* A 16-byte unaligned read starting at s1 or s2 crosses a 4K page
	   boundary when

	     ((size_t) s) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  If either
	   pointer fails the test, take the byte-wise path instead.  */
	clrldi	r7,r3,52		/* r7 = s1 % 4096  */
	clrldi	r9,r4,52		/* r9 = s2 % 4096  */
	cmpldi	cr7,r7,4096-16
	bgt	cr7,L(pagecross_check)
	cmpldi	cr5,r9,4096-16
	bgt	cr5,L(pagecross_check)

	/* Fast path for strings that end or differ within the first 16
	   bytes: compare two unaligned doublewords from each string.  */
	ld	r8,0(r3)
	ld	r10,0(r4)
	cmpb	r12,r8,r0		/* 0xff where the s1 byte is NUL  */
	cmpb	r11,r8,r10		/* 0xff where the s1 and s2 bytes match  */
	orc.	r9,r12,r11		/* 0xff where NUL or mismatch  */
	bne	cr0,L(different_nocmpb)

	ld	r8,8(r3)
	ld	r10,8(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	/* The first 16 bytes are equal and contain no NUL.  Continue with
	   r7 = s1 + 16 and r4 = s2 + 16.  */
	addi	r7,r3,16
	addi	r4,r4,16

L(align_8b):
	/* Round source1 (r7) down to a doubleword boundary and move source2
	   (r4) back by the same number of bytes, so both pointers still
	   address corresponding bytes of the two strings.  */
	clrldi	r9,r7,61	/* r9 = source1 % 8  */
	sub	r4,r4,r9	/* source2 -= source1 % 8  */
	clrrdi	r7,r7,3		/* source1 &= ~7  */

	/* source1 is now doubleword aligned; source2 may have any alignment
	   from 0 to 7.  A nonzero source2 alignment needs the loop that
	   checks for page crossings.  */
	andi.	r9,r4,0x7
	bne	cr0,L(loop_diff_align)

	/* Both sources are doubleword aligned, so no load can straddle a
	   page boundary and no page cross checks are needed.  */
	ld	r8,0(r7)
	ld	r10,0(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	/* Main aligned loop, unrolled three times.  The final pair of loads
	   uses ldu so that both pointers advance by 24 bytes per
	   iteration.  */
	.p2align 4
L(loop_equal_align):
	ld	r8,8(r7)
	ld	r10,8(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ld	r8,16(r7)
	ld	r10,16(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	ldu	r8,24(r7)
	ldu	r10,24(r4)
	cmpb	r12,r8,r0
	cmpb	r11,r8,r10
	orc.	r9,r12,r11
	bne	cr0,L(different_nocmpb)

	b	L(loop_equal_align)

	/* Common exit for the doubleword paths.  On entry r8 and r10 hold
	   the doublewords just loaded from s1 and s2, and r9 (the orc.
	   result) is nonzero with 0xff in every byte lane where s1 has a
	   NUL or the two strings differ.  Locate the first such lane in
	   string order, shift that byte of each doubleword down to the low
	   8 bits and return the difference:

	   #if __LITTLE_ENDIAN__
	     shift = 63 - __builtin_clzl (r9 & -r9);	(lowest set bit)
	   #else
	     shift = 56 - __builtin_clzl (r9);
	   #endif
	     return (int) ((r8 >> shift) & 0xff)
		    - (int) ((r10 >> shift) & 0xff);  */
L(different_nocmpb):
#ifdef __LITTLE_ENDIAN__
	neg	r3,r9
	and	r9,r9,r3	/* Keep only the lowest set bit of r9.  */
	cntlzd	r9,r9
	subfic	r9,r9,63	/* r9 = index of that bit = 8 * lane.  */
#else
	cntlzd	r9,r9		/* 8 * lane, counted from the MSB.  */
	subfic	r9,r9,56	/* r9 = shift that brings the lane down.  */
#endif
	srd	r3,r8,r9
	srd	r10,r10,r9
	rldicl	r10,r10,0,56	/* r10 = selected byte of s2.  */
	rldicl	r3,r3,0,56	/* r3 = selected byte of s1.  */
	subf	r3,r10,r3	/* r3 = s1 byte - s2 byte.  */
	extsw	r3,r3
	blr

	.align	4
L(pagecross_check):
	/* On entry r7 = s1 % 4096 and r9 = s2 % 4096.  Turn each into the
	   number of bytes left in its page and keep the larger of the two
	   in r7.  */
	subfic	r9,r9,4096
	subfic	r7,r7,4096
	cmpld	cr7,r7,r9
	bge	cr7,L(pagecross)
	mr	r7,r9

	/* A 16-byte unaligned read would cross a 4K page boundary, so
	   compare byte by byte for r7 bytes, i.e. until both s1 and s2 have
	   moved past the page they started in.  */
L(pagecross):
	add	r7,r3,r7	/* r7 = s1 + count, the value r3 has on exit.  */
	subf	r9,r3,r7	/* r9 = count.  */
	mtctr	r9

	.align	4
L(pagecross_loop):
	/* Load one byte from each string; stop if they differ or if the
	   byte from s1 is '\0'.  */
	lbz	r9,0(r3)
	lbz	r10,0(r4)
	addi	r3,r3,1
	addi	r4,r4,1
	cmplw	cr7,r9,r10
	cmpdi	cr5,r9,0	/* Second operand is an immediate, not r0.  */
	bne	cr7,L(pagecross_ne)
	beq	cr5,L(pagecross_nullfound)
	bdnz	L(pagecross_loop)
	/* Count exhausted: r7 and r4 now address the next bytes to compare,
	   as L(align_8b) expects.  */
	b	L(align_8b)

	.align	4
	/* The next unaligned doubleword read of source2 would cross a 4K
	   page boundary, and the differing byte or the NUL may still be in
	   the bytes left in the current page.  Since the unaligned load
	   cannot be used, compare exactly 8 bytes one at a time; advancing
	   by 8 keeps source1 (r7) doubleword aligned.  */
L(check_source2_byte):
	li	r9,8
	mtctr	r9

	.align	4
L(check_source2_byte_loop):
	lbz	r9,0(r7)
	lbz	r10,0(r4)
	addi	r7,r7,1
	addi	r4,r4,1
	cmplw	cr7,r9,r10	/* Byte of source1 against byte of source2.  */
	cmpdi	cr5,r9,0	/* Is the source1 byte '\0'?  */
	bne	cr7,L(pagecross_ne)
	beq	cr5,L(pagecross_nullfound)
	bdnz	L(check_source2_byte_loop)
	/* All 8 bytes matched without a NUL: fall through to
	   L(loop_unaligned).  */

	/* source2 is not doubleword aligned, so each iteration must check
	   whether the next unaligned doubleword load from it would cross a
	   4K page boundary.  */
	.p2align 5
L(loop_unaligned):
	ld	r8,0(r7)
	ld	r10,0(r4)
	cmpb	r12,r8,r0		/* 0xff where the source1 byte is NUL  */
	cmpb	r11,r8,r10		/* 0xff where the bytes match  */
	orc.	r9,r12,r11		/* 0xff where NUL or mismatch  */
	bne	cr0,L(different_nocmpb)
	addi	r7,r7,8
	addi	r4,r4,8

L(loop_diff_align):
	/* An 8-byte read at source2 crosses a 4K page boundary when

	     source2 % PAGE_SIZE > (PAGE_SIZE - 8)

	   with PAGE_SIZE being 4096.  Use the doubleword compare when it
	   does not cross, and the byte-wise compare when it does.  */
	clrldi	r9,r4,52		/* r9 = source2 % 4096  */
	cmpldi	cr7,r9,4096-8
	ble	cr7,L(loop_unaligned)
	b	L(check_source2_byte)

	.p2align 4
L(pagecross_ne):
	/* Reached from the byte loops with r9 = byte from s1 and r10 = byte
	   from s2.  Set up r3 and r9 so that the shared tail returns
	   r9 - r10.  */
	extsw	r3,r9
	mr	r9,r10
L(pagecross_retdiff):
	sub	r9,r3,r9		/* r9 = r3 - r9  */
	extsw	r3,r9
	blr

	.p2align 4
L(pagecross_nullfound):
	/* The bytes were equal and the byte from s1 (r9) is '\0', so the
	   shared tail computes 0 - 0 and returns 0.  */
	li	r3,0
	b	L(pagecross_retdiff)
END (STRCMP)
libc_hidden_builtin_def (strcmp)