mirror of
				https://sourceware.org/git/glibc.git
				synced 2025-11-03 20:53:13 +03:00 
			
		
		
		
	* All files with FSF copyright notices: Update copyright dates using scripts/update-copyrights. * locale/programs/charmap-kw.h: Regenerated. * locale/programs/locfile-kw.h: Likewise.
		
			
				
	
	
		
			432 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			432 lines
		
	
	
		
			6.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/* Optimized memset implementation for PowerPC32/POWER7.
   Copyright (C) 2010-2018 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
 | 
						|
 | 
						|
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   Register roles, as established by the code below:
     r3      's'.  Never written, so it still holds the return value
	     at every exit.
     r4      'c' with its low byte replicated into all four bytes.
     r5      Bytes still to store; each alignment prologue subtracts
	     what it wrote.
     r7      Copy of the entry r1; moved back into r1 before the
	     large-size paths return.
     r10     Current store pointer.
     r12     Large-size paths only: first a copy of the original 'n'
	     (read back by L(huge)), later the second store pointer of
	     the 32-byte loops and the tail pointer of L(tail_bytes).
     f4/vs4  The replicated word pair reloaded from the stack scratch
	     slot (f4 in L(huge), vs4 in L(big_loop_fast)).

   Size dispatch:
     n <= 8	 L(small)
     9..31	 L(medium)
     n >= 32	 L(big_aligned), which diverts to L(huge) when the
		 replicated value is 0 and more than 255 bytes remain
		 after DST has been aligned to 8 bytes.

   Throughout, "mtocrf 0x01,rN" copies the low four bits of rN into
   cr7, so CR bits 28/29/30/31 mirror the 8/4/2/1 bits of rN and the
   "bf 28..31" tests select which power-of-two chunk to store.  */

	.machine  power7
EALIGN (memset, 5, 0)
	CALL_MCOUNT

	.align	4
L(_memset):
	cmplwi	cr7,5,31	/* Consumed by the L(medium) branch.  */
	cmplwi	cr6,5,8		/* Consumed by the L(small) branch.  */
	mr	10,3		/* Save original argument for later.  */
	mr	7,1		/* Save original r1 for later.  */
	/* NOTE(review): r31 is neither saved nor used in this function, so
	   this CFI record looks vestigial -- confirm before touching it.  */
	cfi_offset(31,-8)

	/* Replicate byte to word.  */
	insrwi	4,4,8,16
	insrwi	4,4,16,0

	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */

	/* r0 = -DST; its low bits are the distance to the next aligned
	   address and are masked as needed by the paths below.  */
	neg	0,3
	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */

	/* Save our word twice to create a doubleword that we will later
	   copy to a FPR.  The 32-byte frame allocated here is released by
	   moving r7 back into r1.  */
	stwu	1,-32(1)
	andi.	11,10,7		/* Check alignment of DST.  */
	mr	12,5		/* Keep the original length for L(huge).  */
	stw	4,24(1)
	stw	4,28(1)
	beq	L(big_aligned)

	clrlwi	0,0,29		/* r0 = bytes needed to reach 8-byte alignment.  */
	mtocrf	0x01,0
	subf	5,0,5

	/* Get DST aligned to 8 bytes.  */
1:	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	bf	30,4f

	sth	4,0(10)
	addi	10,10,2
4:	bf	29,L(big_aligned)

	stw	4,0(10)
	addi	10,10,4

	.align	4
L(big_aligned):
	/* NOTE(review): the r0 and cr1 values set in this block do not
	   appear to be read before they are overwritten on either path;
	   they are kept to leave the instruction stream unchanged.  */
	cmplwi	cr5,5,255
	li	0,32
	cmplwi	cr1,5,160
	dcbtst	0,10
	cmplwi	cr6,4,0
	srwi	9,5,3		/* Number of full doublewords remaining.  */
	/* CR bit 27 = (value == 0) [bit 26, cr6.eq]
		       AND (length > 255) [bit 21, cr5.gt].  */
	crand	27,26,21
	mtocrf	0x01,9
	bt	27,L(huge)

	/* From this point on the dcbz path was not taken: either the value
	   isn't 0 or there are 255 or fewer bytes left.
	   ctr = number of 32-byte chunks, cr6 = "no tail bytes",
	   cr1 = "fewer than 4 doublewords in total".  */

	srwi	8,5,5
	clrlwi	11,5,29
	cmplwi	cr6,11,0
	cmplwi	cr1,9,4
	mtctr	8

	/* Copy 1~3 doublewords so that what is left for the main loop
	   is a multiple of 32 bytes.  */

	bf	30,1f

	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
	mr	12,10
	/* Fewer than 4 doublewords means no full 32-byte chunk exists
	   (ctr is 0), so the loops must be skipped.  */
	blt	cr1,L(tail_bytes)

	b	L(big_loop)

	.align	4
1:	/* Copy 1 doubleword.  */
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8

	/* First use a 32-bytes loop with stw's to try and avoid the LHS due
	   to the lfd we will do next.  Also, ping-pong through r10 and r12
	   to avoid AGEN delays.  */
	.align	4
L(big_loop):
	addi	12,10,32
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	stw	4,16(10)
	stw	4,20(10)
	stw	4,24(10)
	stw	4,28(10)
	bdz	L(tail_bytes)	/* r12 already points past the stored chunk.  */

	addi	10,10,64
	stw	4,0(12)
	stw	4,4(12)
	stw	4,8(12)
	stw	4,12(12)
	stw	4,16(12)
	stw	4,20(12)
	stw	4,24(12)
	stw	4,28(12)
	bdnz	L(big_loop_fast_setup)

	mr	12,10
	b	L(tail_bytes)

	/* Now that we're probably past the LHS window, use the VSX to
	   speed up the loop.  vs4 is loaded from the doubleword saved at
	   24(r1); r6 = 16 is the index for the second store of each pair.  */
L(big_loop_fast_setup):
	li	11,24
	li	6,16
	lxvdsx	4,1,11

	.align	4
L(big_loop_fast):
	addi	12,10,32
	stxvd2x	4,0,10
	stxvd2x	4,10,6
	bdz	L(tail_bytes)

	addi	10,10,64
	stxvd2x	4,0,12
	stxvd2x	4,12,6
	bdnz	L(big_loop_fast)

	mr	12,10

	.align	4
L(tail_bytes):

	/* Check for tail bytes.  r12 points at the first unwritten byte and
	   cr6 still says whether the low three bits of r5 are all zero.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6

	clrlwi	0,5,29
	mtocrf	0x01,0

	/* At this point we have a tail of 1-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(12)
	addi	12,12,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	sth	4,0(12)
	addi	12,12,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(12)
	blr


	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
	   dcbz though, we need to get the destination 128-bytes aligned.  */
	.align	4
L(huge):
	lfd	4,24(1)		/* f4 = the doubleword saved on the stack.  */
	andi.	11,10,127
	neg	0,10
	beq	L(huge_aligned)

	clrlwi	0,0,25		/* r0 = bytes needed to reach 128-byte alignment.  */
	subf	5,0,5
	srwi	0,0,3		/* ... expressed in doublewords (0..15).  */
	mtocrf	0x01,0

	/* Get DST aligned to 128 bytes.  */
8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(huge_aligned)

	stfd	4,0(10)
	addi	10,10,8

L(huge_aligned):
	/* ctr = number of 128-byte blocks, cr6 = "no bytes left after the
	   dcbz loop".  */
	srwi	8,5,7
	clrlwi	11,5,25
	cmplwi	cr6,11,0
	mtctr	8

	/* Copies 128-bytes at a time.  */
	.align	4
L(huge_loop):
	dcbz	0,10
	addi	10,10,128
	bdnz	L(huge_loop)

	/* We have a tail of 0~127 bytes to handle.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6

	/* r9 = bytes written so far (r10 - s); r5 = original length (r12)
	   minus that, i.e. the bytes still to store.  */
	subf	9,3,10
	subf	5,9,12
	srwi	8,5,3
	cmplwi	cr6,8,0
	mtocrf	0x01,8

	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
	   speed.  We'll handle the resulting tail bytes later.  */
	beq	cr6,L(tail)

8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(tail)

	stfd	4,0(10)
	addi	10,10,8

	/* Handle the rest of the tail bytes here.  Only the 4/2/1 bits of
	   r5 are tested; r4 is the replicated word (0 on this path).  */
L(tail):
	mtocrf	0x01,5

	.align	4
4:	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
	.align	4
2:	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
	.align	4
1:	bflr	31

	stb	4,0(10)
	blr


	/* Expanded tree to copy tail bytes without increments.  Reached
	   from L(small) with the low bits of the length in cr7.
	   NOTE(review): the label letters appear to record the outcome of
	   the 4-byte and 2-byte tests (T = stored, F = skipped) with X for
	   the test still to come -- naming inferred, not documented.  */
	.align	4
L(copy_tail):
	bf	29,L(FXX)

	stw	4,0(10)
	bf	30,L(TFX)

	sth	4,4(10)
	bflr	31

	stb	4,6(10)
	blr

	.align	4
L(FXX):	bf	30,L(FFX)

	sth	4,0(10)
	bflr	31

	stb	4,2(10)
	blr

	.align	4
L(TFX):	bflr	31

	stb	4,4(10)
	blr

	.align	4
L(FFX):	bflr	31

	stb	4,0(10)
	blr

	/* Handle copies of 9~31 bytes.  No stack frame was allocated on
	   this path, so r1 needs no restoring.  */
	.align	4
L(medium):
	/* At least 9 bytes to go.  */
	andi.	11,10,3
	clrlwi	0,0,30		/* r0 = bytes needed to reach 4-byte alignment.  */
	beq	L(medium_aligned)

	/* Force 4-bytes alignment for DST.  */
	mtocrf	0x01,0
	subf	5,0,5
1:	/* Copy 1 byte.  */
	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	/* Copy 2 bytes.  */
	bf	30,L(medium_aligned)

	sth	4,0(10)
	addi	10,10,2

	.align	4
L(medium_aligned):
	/* At least 6 bytes to go, and DST is word-aligned.  The 16s bit of
	   the length is not part of cr7, hence the explicit compare.  */
	cmplwi	cr1,5,16
	mtocrf	0x01,5
	blt	cr1,8f

	/* Copy 16 bytes.  */
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(10)
	blr

	/* Handles copies of 0~8 bytes.  cr6 still holds the entry compare
	   of the length against 8: exactly 8 bytes are stored as two words,
	   anything shorter goes through L(copy_tail).  */
	.align	4
L(small):
	mtocrf	0x01,5
	bne	cr6,L(copy_tail)

	stw	4,0(10)
	stw	4,4(10)
	blr

END (memset)
libc_hidden_builtin_def (memset)