/* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC    xcc
#endif
	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

#define	FPRS_FEF	0x04

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, in this algorithm we use ASI_STBIMRU_P which marks the
 * cache line as "most recently used" for all but the last cache
 * line.
 */

#define	ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define	ASI_ST_BLK_INIT_MRU_P	0xf2

#define	ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define	ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define	BLOCK_SIZE	64	/* L2 data cache line size  */
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case  */
				/* must be at least 64  */
#define	SMALL_MAX	255	/* max small copy for word/long aligned  */
#define	SMALL_UMAX	128	/* max small copy for unaligned case  */
#define	MED_WMAX	1023	/* max copy for medium word-aligned case  */
#define	MED_MAX		511	/* max copy for medium longword-aligned case  */
#define	ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store  */
/* on T4, prefetch 20 is a strong read prefetch to L1 and L2 data cache
 * prefetch 20 can cause inst pipeline to delay if data is in memory
 * prefetch 21 is a strong read prefetch to L2 data cache, not L1 data cache  */
#define	ALIGN_PRE	20	/* distance for aligned prefetch loop  */

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text

ENTRY(__memmove_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	cmp	%o1, %o0	/* if from address is >= to use forward copy  */
	bgeu,pn	%XCC, .Lforcpy	/* else use backward if ...  */
	 sub	%o0, %o1, %o4	/* get difference of two addresses  */
	cmp	%o2, %o4	/* compare size and difference of addresses  */
	bleu,pn	%XCC, .Lforcpy	/* if size is bigger, do overlapped copy  */
	 add	%o1, %o2, %o5	/* get to end of source space  */
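
/* In C-like terms the dispatch above is roughly the following
   (an illustrative sketch only; the code keeps dst - src in %o4 and
   src + len in %o5 for the backward path):

     if (src >= dst || len <= (size_t) (dst - src))
       goto forward_copy;                 (.Lforcpy, shared with memcpy)
     else
       copy backwards, from src + len down to src.  */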

/* an overlapped copy that must be done "backwards"  */
.Lchksize:
	cmp	%o2, 8			/* fewer than 8 bytes: do byte copy  */
	blu,pn %XCC, 2f			/* else continue  */

/* Now size is at least 8  */
.Ldbalign:
	 add	%o0, %o2, %g1		/* get to end of dest space  */
	andcc	%g1, 7, %o3		/* %o3 has cnt til dst 8 byte align  */
	bz,a,pn	%XCC, .Ldbbck		/* skip if dst is 8 byte aligned  */
	 andn	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
	sub	%o2, %o3, %o2		/* update o2 with new count  */

1:	dec	%o5			/* decrement source  */
	ldub	[%o5], %g1		/* load one byte  */
	deccc	%o3			/* decrement count  */
	bgu,pt	%XCC, 1b		/* if not done keep copying  */
	 stb	%g1, [%o5+%o4]		/* store one byte into dest  */
	andncc	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
	bz,pn	%XCC, 2f		/* if size < 8, move to byte copy  */

/* Now Destination is 8 byte aligned  */
.Ldbbck:
	 andcc	%o5, 7, %o0		/* %o0 has src offset  */
	bz,a,pn	%XCC, .Ldbcopybc	/* if src is aligned do fast memmove  */
	 sub	%o2, %o3, %o2		/* Residue bytes in %o2  */

.Lcpy_dbwdbc:				/* alignment of src is needed  */
	sub	%o2, 8, %o2		/* set size one loop ahead  */
	sll	%o0, 3, %g1		/* %g1 is left shift  */
	mov	64, %g5			/* init %g5 to be 64  */
	sub	%g5, %g1, %g5		/* %g5 rightshift = (64 - leftshift)  */
	sub	%o5, %o0, %o5		/* align the src at 8 bytes.  */
	add	%o4, %o0, %o4		/* increase diff between src & dst  */
	ldx	[%o5], %o1		/* load first 8 bytes  */
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5		/* subtract 8 from src  */
	ldx	[%o5], %o0		/* load 8 byte  */
	sllx	%o0, %g1, %o3		/* shift loaded val left to tmp reg  */
	or	%o1, %o3, %o3		/* align data  */
	stx	%o3, [%o5+%o4]		/* store 8 byte  */
	subcc	%o2, 8, %o2		/* subtract 8 byte from size  */
	bg,pt	%XCC, 1b		/* if size > 0 continue  */
	 srlx	%o0, %g5, %o1		/* move extra byte for the next use  */

	srl	%g1, 3, %o0		/* restore %o0 value for alignment  */
	add	%o5, %o0, %o5		/* restore src alignment  */
	sub	%o4, %o0, %o4		/* restore diff between src & dest  */

	ba	2f			/* branch to the trailing byte copy  */
	 add	%o2, 8, %o2		/* restore size value  */

.Ldbcopybc:				/* alignment of src is not needed  */
1:	sub	%o5, 8, %o5		/* subtract from src  */
	ldx	[%o5], %g1		/* load 8 bytes  */
	subcc	%o3, 8, %o3		/* subtract from size  */
	bgu,pt	%XCC, 1b		/* if size is bigger than 0, continue  */
	 stx	%g1, [%o5+%o4]		/* store 8 bytes to destination  */

	ba	2f
	 nop

.Lbcbyte:
1:	ldub	[%o5], %g1		/* load one byte  */
	stb	%g1, [%o5+%o4]		/* store one byte  */
2:	deccc	%o2			/* decrement size  */
	bgeu,a,pt %XCC, 1b		/* if size is >= 0 continue  */
	 dec	%o5			/* decrement from address  */

.Lexitbc:				/* exit from backward copy  */
	retl
	 add	%o5, %o4, %o0		/* restore dest addr  */


/* Check to see if memmove is large aligned copy
 * If so, use special version of copy that avoids
 * use of block store init.  */
.Lforcpy:
	cmp	%o2, SMALL_MAX		/* check for not small case  */
	blt,pn	%XCC, .Lmv_short	/* merge with memcpy  */
	 mov	%o0, %g1		/* save %o0  */
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
	brz,pt	%o5, .Lmv_dst_aligned_on_8

/* %o5 has the bytes to be written in partial store.  */
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
7:					/* dst aligning loop  */
	ldub	[%o1+%o0], %o4		/* load one byte  */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst  */
	add	%o1, %o0, %o1		/* restore %o1  */
.Lmv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
	 prefetch [%o1 + (1 * BLOCK_SIZE)], 20

.Lmv_src_dst_aligned_on_8:
/* check if we are copying MED_MAX or more bytes  */
	cmp	%o2, MED_MAX		/* limit to store buffer size  */
	bleu,pt	%XCC, .Lmedlong
	 prefetch [%o1 + (2 * BLOCK_SIZE)], 20

/* The mv_align loop below mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization.  This is used when memcpy is incorrectly invoked with
 * overlapping buffers.  */

.Lmv_large_align8_copy:			/* Src and dst share 8 byte align  */
					/* align dst to 64 byte boundary  */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
	brz,pn	%o3, .Lmv_aligned_on_64
	 sub	%o3, 64, %o3		/* %o3 has negative bytes to move  */
	add	%o2, %o3, %o2		/* adjust remaining count  */
.Lmv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		/* increment src ptr  */
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .Lmv_align_to_64
	 add	%o0, 8, %o0		/* increment dst ptr  */

.Lmv_aligned_on_64:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
.Lmv_align_loop:
	ldx	[%o1],%o4
	stx	%o4,[%o0]
	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
	prefetch [%o1 + (10 * BLOCK_SIZE)], 21
	subcc	%o5, 64, %o5
	ldx	[%o1+8],%o4
	stx	%o4,[%o0+8]
	ldx	[%o1+16],%o4
	stx	%o4,[%o0+16]
	ldx	[%o1+24],%o4
	stx	%o4,[%o0+24]
	ldx	[%o1+32],%o4
	stx	%o4,[%o0+32]
	ldx	[%o1+40],%o4
	stx	%o4,[%o0+40]
	ldx	[%o1+48],%o4
	add	%o1, 64, %o1
	stx	%o4,[%o0+48]
	add	%o0, 64, %o0
	ldx	[%o1-8],%o4
	bgt,pt	%XCC, .Lmv_align_loop
	 stx	%o4,[%o0-8]

	ba	.Lmedlong
	 nop
END(__memmove_niagara7)

ENTRY(__mempcpy_niagara7)
	/* %o0=dst, %o1=src, %o2=len */
	ba,pt	%icc, 101f
	 add	%o0, %o2, %g1		/* save dst + len  */
END(__mempcpy_niagara7)

	.align	32
ENTRY(__memcpy_niagara7)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov	%o0, %g1		/* save %o0  */
101:
#ifndef __arch64__
	srl	%o2, 0, %o2
#endif
	cmp	%o2, SMALL_MAX		/* check for not small case  */
	bgeu,pn	%XCC, .Lmedium		/* go to larger cases  */
.Lmv_short:
	 cmp	%o2, SHORTCOPY		/* check for really short case  */
	ble,pn	%XCC, .Lsmallfin
	 or	%o0, %o1, %o4		/* prepare alignment check  */
	andcc	%o4, 0x3, %o5		/* test for word alignment  */
	bnz,pn	%XCC, .Lsmallunalign	/* branch to non-word aligned case  */
	 nop
	subcc	%o2, 7, %o2		/* adjust count  */
	ble,pn	%XCC, .Lsmallwordx
	 andcc	%o4, 0x7, %o5		/* test for long alignment  */
/* 8 or more bytes, src and dest start on word boundary
 * %o4 contains or %o0, %o1  */
.Lsmalllong:
	bnz,pn	%XCC, .Lsmallwords	/* branch to word aligned case  */
	 cmp	%o2, SHORT_LONG-7
	bge,a	%XCC, .Lmedl64		/* if we branch  */
	 sub	%o2,56,%o2		/* adjust %o2 to -63 off count  */

/* slightly unroll the small_long_loop to improve very short copies  */
	cmp	%o2, 32-7
	blt,a,pn %XCC, .Lsmall_long_l
	 sub	%o1, %o0, %o1		/* %o1 gets the difference  */

	ldx	[%o1], %o5
	ldx	[%o1+8], %o4
	ldx	[%o1+16], %o3

	subcc	%o2, 24, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

	stx	%o5, [%o0]		/* write word  */
	stx	%o4, [%o0+8]		/* write word  */
	stx	%o3, [%o0+16]		/* write word  */

	add	%o0, 24, %o0

/* end loop unroll  */

.Lsmall_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pn	%XCC, .Lsmall_long_l	/* loop until done  */
	 stx	%o3, [%o0-8]		/* write word  */
	addcc	%o2, 7, %o2		/* restore %o2 to correct count  */
	bnz,pn	%XCC, .Lsmall_long_x	/* check for completion  */
	 add	%o1, %o0, %o1		/* restore %o1  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
.Lsmall_long_x:
	cmp	%o2, 4			/* check for 4 or more bytes left  */
	blt,pn	%XCC, .Lsmallleft3	/* if not, go to finish up  */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	bnz,pn	%XCC, .Lsmallleft3
	 add	%o0, 4, %o0
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 32
/* src and dest start on word boundary; 7 or fewer bytes  */
.Lsmallwordx:
	lduw	[%o1], %o3		/* read word  */
	addcc	%o2, 3, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit
	 stw	%o3, [%o0]		/* write word  */
	deccc	%o2			/* reduce count for cc test  */
	ldub	[%o1+4], %o3		/* load one byte  */
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+4]		/* store one byte  */
	ldub	[%o1+5], %o3		/* load second byte  */
	deccc	%o2
	bz,pt	%XCC, .Lsmallexit
	 stb	%o3, [%o0+5]		/* store second byte  */
	ldub	[%o1+6], %o3		/* load third byte  */
	stb	%o3, [%o0+6]		/* store third byte  */
.Lsmallexit:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 32
.Lsmallunalign:
	cmp	%o2, SHORTCHECK
	ble,pn	%XCC, .Lsmallrest
	 cmp	%o2, SMALL_UMAX
	bge,pt	%XCC, .Lmedium_join
	 andcc	%o1, 0x3, %o5		/* is src word aligned  */
	bz,pn	%XCC, .Laldst
	 cmp	%o5, 2			/* is src half-word aligned  */
	be,pt	%XCC, .Ls2algn
	 cmp	%o5, 3			/* src is byte aligned  */
.Ls1algn:
	ldub	[%o1], %o3		/* move 1 or 3 bytes to align it  */
	inc	1, %o1
	stb	%o3, [%o0]		/* move a byte to align src  */
	inc	1, %o0
	bne,pt	%XCC, .Ls2algn
	 dec	%o2
	b	.Lald			/* now go align dest  */
	 andcc	%o0, 0x3, %o5

.Ls2algn:
	lduh	[%o1], %o3		/* know src is 2 byte aligned  */
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		/* have to do bytes,  */
	stb	%o3, [%o0 + 1]		/* do not know dst alignment  */
	inc	2, %o0
	dec	2, %o2

.Laldst:
	andcc	%o0, 0x3, %o5		/* align the destination address  */
.Lald:
	bz,pn	%XCC, .Lw4cp
	 cmp	%o5, 2
	be,pn	%XCC, .Lw2cp
	 cmp	%o5, 3
.Lw3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%XCC, .Lw1cp
	 inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/*  %o1 gets the difference  */

1:	sll	%o4, 8, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 1b
	 inc	4, %o0
	sub	%o1, 3, %o1		/* used one byte of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

2:	sll	%o4, 24, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 2b
	 inc	4, %o0
	sub	%o1, 1, %o1		/* used 3 bytes of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	dec	4, %o3			/* avoid reading beyond tail of src  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

3:	sll	%o4, 16, %g5		/* save residual bytes  */
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		/* merge with residual  */
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%XCC, 3b
	 inc	4, %o0
	sub	%o1, 2, %o1		/* used two bytes of last word read  */
	and	%o2, 3, %o2
	b	7f
	 inc	4, %o2

.Lw4cp:	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

1:	lduw	[%o1+%o0], %o4		/* read from address  */
	deccc	4, %o3			/* decrement count  */
	st	%o4, [%o0]		/* write at destination address  */
	bgu,pt	%XCC, 1b
	 inc	4, %o0			/* increment to address  */
	and	%o2, 3, %o2		/* number of leftover bytes, if any  */

	/* simple finish up byte copy, works with any alignment  */
7:
	add	%o1, %o0, %o1		/* restore %o1  */
.Lsmallrest:
	tst	%o2
	bz,pt	%XCC, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%XCC, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	ldub	[%o1], %o3		/* read byte  */
	subcc	%o2, 4, %o2		/* reduce count by 4  */
	stb	%o3, [%o0]		/* write byte  */
	ldub	[%o1+1], %o3		/* repeat for total of 4 bytes  */
	add	%o1, 4, %o1		/* advance SRC by 4  */
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		/* advance DST by 4  */
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%XCC, .Lsmallnotalign4	/* loop til 3 or fewer bytes remain  */
	 stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallx
.Lsmallleft3:				/* 1, 2, or 3 bytes remain  */
	 subcc	%o2, 1, %o2
	ldub	[%o1], %o3		/* load one byte  */
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0]		/* store one byte  */
	ldub	[%o1+1], %o3		/* load second byte  */
	subcc	%o2, 1, %o2
	bz,pt	%XCC, .Lsmallx
	 stb	%o3, [%o0+1]		/* store second byte  */
	ldub	[%o1+2], %o3		/* load third byte  */
	stb	%o3, [%o0+2]		/* store third byte  */
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

.Lsmallfin:
	tst	%o2
	bnz,pn	%XCC, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lsmallwords:
	lduw	[%o1], %o3		/* read word  */
	subcc	%o2, 8, %o2		/* update count  */
	stw	%o3, [%o0]		/* write word  */
	add	%o1, 8, %o1		/* update SRC  */
	lduw	[%o1-4], %o3		/* read word  */
	add	%o0, 8, %o0		/* update DST  */
	bgu,pt	%XCC, .Lsmallwords	/* loop until done  */
	 stw	%o3, [%o0-4]		/* write word  */
	addcc	%o2, 7, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* check for completion  */
	 cmp	%o2, 4			/* check for 4 or more bytes left  */
	blt,pt	%XCC, .Lsmallleft3	/* if not, go to finish up  */
	 nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pn	%XCC, .Lsmallleft3
	 stw	%o3, [%o0-4]
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lmedium:
.Lmedium_join:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
	brz,pt	%o5, .Ldst_aligned_on_8

	/* %o5 has the bytes to be written in partial store.  */
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
7:					/* dst aligning loop  */
	ldub	[%o1+%o0], %o4		/* load one byte  */
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%XCC, 7b
	 add	%o0, 1, %o0		/* advance dst  */
	add	%o1, %o0, %o1		/* restore %o1  */
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	/* check if we are copying MED_MAX or more bytes  */
	cmp	%o2, MED_MAX		/* limit to store buffer size  */
	bgu,pn	%XCC, .Llarge_align8_copy
	 nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
	ble,pn	%XCC, .Lmedl63		/* skip big loop if < 64 bytes  */
	 nop
.Lmedl64:
	ldx	[%o1], %o4		/* load  */
	subcc	%o2, 64, %o2		/* decrement length count  */
	stx	%o4, [%o0]		/* and store  */
	ldx	[%o1+8], %o3		/* a block of 64 bytes  */
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		/* load  */
	stx	%o4, [%o0+32]		/* and store  */
	ldx	[%o1+40], %o3		/* a block of 64 bytes  */
	add	%o1, 64, %o1		/* increase src ptr by 64  */
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		/* increase dst ptr by 64  */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%XCC, .Lmedl64		/* repeat if at least 64 bytes left  */
	 stx	%o3, [%o0-8]
.Lmedl63:
	addcc	%o2, 32, %o2		/* adjust remaining count  */
	ble,pt	%XCC, .Lmedl31		/* to skip if 31 or fewer bytes left  */
	 nop
	ldx	[%o1], %o4		/* load  */
	sub	%o2, 32, %o2		/* decrement length count  */
	stx	%o4, [%o0]		/* and store  */
	ldx	[%o1+8], %o3		/* a block of 32 bytes  */
	add	%o1, 32, %o1		/* increase src ptr by 32  */
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		/* increase dst ptr by 32  */
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.Lmedl31:
	addcc	%o2, 16, %o2		/* adjust remaining count  */
	ble,pt	%XCC, .Lmedl15		/* skip if 15 or fewer bytes left  */
	 nop
	ldx	[%o1], %o4		/* load and store 16 bytes  */
	add	%o1, 16, %o1		/* increase src ptr by 16  */
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		/* decrease count by 16  */
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		/* increase dst ptr by 16  */
	stx	%o3, [%o0-8]
.Lmedl15:
	addcc	%o2, 15, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 8
	blt,pt	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
	 tst	%o2
	ldx	[%o1], %o4		/* load 8 bytes  */
	add	%o1, 8, %o1		/* increase src ptr by 8  */
	add	%o0, 8, %o0		/* increase dst ptr by 8  */
	subcc	%o2, 8, %o2		/* decrease count by 8  */
	bnz,pn	%XCC, .Lmedw7
	 stx	%o4, [%o0-8]		/* and store 8 bytes  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Lsrc_dst_unaligned_on_8:
	/* DST is 8-byte aligned, src is not  */
	andcc	%o1, 0x3, %o5		/* test word alignment  */
	bnz,pt	%XCC, .Lunalignsetup	/* branch if not word aligned  */
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache for medium
 * to short data moves.
 */
	cmp %o2, MED_WMAX		/* limit to store buffer size  */
	bge,pt	%XCC, .Lunalignrejoin	/* otherwise rejoin main loop  */
	 nop

	subcc	%o2, 31, %o2		/* adjust length to allow cc test  */
					/* for end of loop  */
	ble,pt	%XCC, .Lmedw31		/* skip big loop if 31 or fewer left  */
.Lmedw32:
	 ld	[%o1], %o4		/* move a block of 32 bytes  */
	sllx	%o4, 32, %o5
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	subcc	%o2, 32, %o2		/* decrement length count  */
	ld	[%o1+8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1+12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+8]
	add	%o1, 32, %o1		/* increase src ptr by 32  */
	ld	[%o1-16], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-12], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+16]
	add	%o0, 32, %o0		/* increase dst ptr by 32  */
	ld	[%o1-8], %o4
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	bgu,pt	%XCC, .Lmedw32		/* repeat if at least 32 bytes left  */
	 stx	%o5, [%o0-8]
.Lmedw31:
	addcc	%o2, 31, %o2		/* restore count  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 16
	blt,pt	%XCC, .Lmedw15
	 nop
	ld	[%o1], %o4		/* move a block of 16 bytes  */
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		/* decrement length count  */
	ld	[%o1+4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 16, %o1		/* increase src ptr by 16  */
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		/* increase dst ptr by 16  */
	sllx	%o4, 32, %o5
	ld	[%o1-4], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0-8]
.Lmedw15:
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
	 cmp	%o2, 8
	blt,pn	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
	 tst	%o2
	ld	[%o1], %o4		/* load 4 bytes  */
	subcc	%o2, 8, %o2		/* decrease count by 8  */
	stw	%o4, [%o0]		/* and store 4 bytes  */
	add	%o1, 8, %o1		/* increase src ptr by 8  */
	ld	[%o1-4], %o3		/* load 4 bytes  */
	add	%o0, 8, %o0		/* increase dst ptr by 8  */
	stw	%o3, [%o0-4]		/* and store 4 bytes  */
	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
.Lmedw7:				/* count is ge 1, less than 8  */
	 cmp	%o2, 4			/* check for 4 bytes left  */
	blt,pn	%XCC, .Lsmallleft3	/* skip if 3 or fewer bytes left  */
	 nop
	ld	[%o1], %o4		/* load 4 bytes  */
	add	%o1, 4, %o1		/* increase src ptr by 4  */
	add	%o0, 4, %o0		/* increase dst ptr by 4  */
	subcc	%o2, 4, %o2		/* decrease count by 4  */
	bnz,pt	%XCC, .Lsmallleft3
	 stw	%o4, [%o0-4]		/* and store 4 bytes  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
.Llarge_align8_copy:			/* Src and dst 8 byte aligned  */
	/* align dst to 64 byte boundary  */
	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		/* odd long words to move?  */
	brz,pt	%o3, .Laligned_to_16
	 nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		/* increment src ptr  */
	add	%o0, 8, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_16:
	andcc	%o0, 16, %o3		/* pair of long words to move?  */
	brz,pt	%o3, .Laligned_to_32
	 nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		/* increment src ptr  */
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_32:
	andcc	%o0, 32, %o3		/* four long words to move?  */
	brz,pt	%o3, .Laligned_to_64
	 nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		/* increment src ptr  */
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		/* increment dst ptr  */
	stx	%o4, [%o0-8]
.Laligned_to_64:
/*	Following test is included to avoid issues where existing executables
 *	incorrectly call memcpy with overlapping src and dest instead of memmove
 *
 *	if ( (src ge dst) and (dst+len > src)) go to overlap case
 *	if ( (src lt dst) and (src+len > dst)) go to overlap case
 */
	cmp	%o1,%o0
	bge,pt	%XCC, 1f
	 nop
/*				src+len > dst?  */
	add	%o1, %o2, %o4
	cmp	%o4, %o0
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
	ba	2f
	 nop
1:
/*				dst+len > src?  */
	add	%o0, %o2, %o4
	cmp	%o4, %o1
	bgt,pt	%XCC, .Lmv_aligned_on_64
	 nop
2:
/*	handle non-overlapped copies
 *
 *	Using block init store (BIS) instructions to avoid fetching cache
 *	lines from memory. Use ST_CHUNK stores to first element of each cache
 *	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
 *	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
 */
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */

/*	We use ASI_STBIMRU_P for the first store to each cache line
 *	followed by ASI_STBI_P (mark as LRU) for the last store. That
 *	mixed approach reduces the chances the cache line is removed
 *	before we finish setting it, while minimizing the effects on
 *	other cached values during a large memcpy.
 *
 *	Intermediate stores can be normal since first BIS activates the
 *	cache line in the L2 cache.
 *
 *	ST_CHUNK batches up initial BIS operations for several cache lines
 *	to allow multiple requests to not be blocked by overflowing the
 *	store miss buffer. Then the matching stores for all those
 *	BIS operations are executed.
 */
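
/*	Schematically, the block-copy phase below behaves like this
 *	C-like sketch (illustrative only; the real code processes two
 *	lines per iteration in the first pass and interleaves the
 *	loads and stores for the pipeline):
 *
 *		while (blocks >= ST_CHUNK) {
 *			for (i = 0; i < ST_CHUNK; i++)	    (.Lalign_loop_start)
 *				BIS-store first 8 bytes of line i;  (STORE_ASI, MRU)
 *			for (i = 0; i < ST_CHUNK; i++) {    (.Lalign_loop_rest)
 *				normal stores for bytes 8..55 of line i;
 *				BIS-store last 8 bytes of line i;   (STORE_INIT, LRU)
 *			}
 *			blocks -= ST_CHUNK;
 *		}
 *		remaining lines go through .Lalign_short / .Lalign_short_rest.
 */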

.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%XCC, .Lalign_short
	 mov	ST_CHUNK, %o3
	sllx	%o3, 6, %g5		/* ST_CHUNK*64  */

.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 2, %o3
	ldx	[%o1], %o4
	add	%o1, 128, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	ldx	[%o1-64], %o4
	EX_ST(STORE_ASI(%o4, %o0))
	add	%o0, 64, %o0
	bgu,pt	%XCC, .Lalign_loop_start
	 prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21

	mov	ST_CHUNK, %o3
	sub	%o1, %g5, %o1		/* reset %o1  */
	sub	%o0, %g5, %o0		/* reset %o0  */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
.Lalign_loop_rest:
	ldx	[%o1+8],%o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	subcc	%o3, 1, %o3
	ldx	[%o1+16],%o4
	stx	%o4, [%o0-40]
	sub	%o5, 64, %o5
	ldx	[%o1+24],%o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32],%o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40],%o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48],%o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8],%o4
	bgu,pt	%XCC, .Lalign_loop_rest
	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

	mov	ST_CHUNK, %o3
	cmp	%o5, ST_CHUNK*64
	bgu,pt	%XCC, .Lalign_loop_start
	 add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

	cmp	%o5, 0
	beq,pt	%XCC, .Lalign_done

/* no prefetches needed in these loops
 * since we are within ALIGN_PRE of the end */
.Lalign_short:
	 srl	%o5, 6, %o3
.Lalign_loop_short:
	subcc	%o3, 1, %o3
	ldx	[%o1], %o4
	add	%o1, 64, %o1
	EX_ST(STORE_ASI(%o4, %o0))
	bgu,pt	%XCC, .Lalign_loop_short
	 add	%o0, 64, %o0

	sub	%o1, %o5, %o1		/* reset %o1  */
	sub	%o0, %o5, %o0		/* reset %o0  */

	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
.Lalign_short_rest:
	ldx	[%o1+8],%o4
	add	%o0, 64, %o0
	stx	%o4, [%o0-48]
	ldx	[%o1+16],%o4
	subcc	%o5, 64, %o5
	stx	%o4, [%o0-40]
	ldx	[%o1+24],%o4
	stx	%o4, [%o0-32]
	ldx	[%o1+32],%o4
	stx	%o4, [%o0-24]
	ldx	[%o1+40],%o4
	stx	%o4, [%o0-16]
	ldx	[%o1+48],%o4
	stx	%o4, [%o0-8]
	add	%o1, 64, %o1
	ldx	[%o1-8],%o4
	bgu,pt	%XCC, .Lalign_short_rest
	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

	add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

.Lalign_done:
	cmp	%o2, 0
	membar	#StoreStore
	bne,pt	%XCC, .Lmedl63
	 subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
	retl
	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

	.align 16
	/* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX  */
	/* Since block load/store and BIS are not in use for unaligned data,
	 * no need to align dst on 64 byte cache line boundary  */
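
	/* Conceptually, each 8-byte step of the FPU loops below does the
	 * following (an illustrative sketch; alignaddr sets %gsr once and
	 * faligndata extracts the misaligned bytes from two aligned loads):
	 *
	 *	prev = *aligned_src++;		     (ldd)
	 *	while (count >= 8) {
	 *		next = *aligned_src++;	     (ldd)
	 *		*dst++ = align (prev, next); (faligndata + std)
	 *		prev = next;
	 *		count -= 8;
	 *	}
	 */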
.Lunalignsetup:
.Lunalignrejoin:
	rd	%fprs, %g5		/* check for unused fp  */
	/* if fprs.fef == 0, set it.
	 * Setting it when already set costs more than checking */
	andcc	%g5, FPRS_FEF, %g5	/* test FEF, fprs.du = fprs.dl = 0  */
	bz,a	%XCC, 1f
	 wr	%g0, FPRS_FEF, %fprs	/* fprs.fef = 1  */
1:
	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
	cmp	%o2, 8			/* Ensure we do not load beyond  */
	bgt,pt	%XCC, .Lunalign_adjust	/* end of source buffer  */
	 andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
	add	%o2, 64, %o2		/* adjust to leave loop  */
	sub	%o5, 64, %o5		/* early if necessary  */
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		/* generate %gsr  */
	add	%o1, %o5, %o1		/* advance %o1 to after blocks  */
	ldd	[%o4], %f0
.Lunalign_loop:
	prefetch [%o0 + (9 * BLOCK_SIZE)], 20
	ldd	[%o4+8], %f2
	faligndata %f0, %f2, %f16
	ldd	[%o4+16], %f4
	subcc	%o5, BLOCK_SIZE, %o5
	std	%f16, [%o0]
	faligndata %f2, %f4, %f18
	ldd	[%o4+24], %f6
	std	%f18, [%o0+8]
	faligndata %f4, %f6, %f20
	ldd	[%o4+32], %f8
	std	%f20, [%o0+16]
	faligndata %f6, %f8, %f22
	ldd	[%o4+40], %f10
	std	%f22, [%o0+24]
	faligndata %f8, %f10, %f24
	ldd	[%o4+48], %f12
	std	%f24, [%o0+32]
	faligndata %f10, %f12, %f26
	ldd	[%o4+56], %f14
	add	%o4, BLOCK_SIZE, %o4
	std	%f26, [%o0+40]
	faligndata %f12, %f14, %f28
	ldd	[%o4], %f0
	std	%f28, [%o0+48]
	faligndata %f14, %f0, %f30
	std	%f30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%XCC, .Lunalign_loop
	 prefetch [%o4 + (11 * BLOCK_SIZE)], 20

	/* Handle trailing bytes, 64 to 127
	 * Dest long word aligned, Src not long word aligned  */
	cmp	%o2, 15
	bleu,pt	%XCC, .Lunalign_short

	 andn	%o2, 0x7, %o5		/* %o5 is multiple of 8  */
	and	%o2, 0x7, %o2		/* residue bytes in %o2  */
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		/* do not load past end of src  */
	andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
	add	%o1, %o5, %o1		/* move %o1 to after multiple of 8  */
	ldd	[%o4], %f0		/* fetch partial word  */
.Lunalign_by8:
	ldd	[%o4+8], %f2
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	std	%f16, [%o0]
	fsrc2	%f2, %f0
	bgu,pt	%XCC, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:			/* restore fprs state */
	brnz,pt	%g5, .Lsmallrest
	 nop
	ba	.Lsmallrest
	 wr	%g5, %g0, %fprs
END(__memcpy_niagara7)

#endif