/* Optimized memmove implementation for POWER10.
   Copyright (C) 2021-2025 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether 'src' and 'dest' overlap.  If they do
   not, or if 'src' is ahead of 'dest', it copies forward.  Otherwise, an
   optimized backward copy is used.  */
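
/* Roughly, the dispatch described above is equivalent to the following C
   sketch (illustrative only; copy_forward/copy_backward are placeholder
   names for the two code paths below):

     if ((uintptr_t) dest - (uintptr_t) src >= len)
       copy_forward (dest, src, len);   // no overlap, or src ahead of dest
     else
       copy_backward (dest, src, len);  // dest falls inside [src, src + len)

   The unsigned subtraction folds the "no overlap" and "src ahead of dest"
   cases into a single compare, which is what the subf/cmpld pair at the
   entry point does.  */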
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
	.machine power10
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	.p2align 5
	/* Check for overlap; if found, branch to the backward copy.  */
	subf	r9,r4,r3
	cmpld	cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	/* Fast path for lengths of at most 16 bytes.  */
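	/* lxvl/stxvl take their byte count from bits 0:7 (the most significant
	   byte) of the length register, capped at 16, hence the sldi by 56.
	   The copy below runs unconditionally; subic./blelr then returns if
	   len <= 16.  */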
	sldi	r7,r5,56
	lxvl	32+v2,r4,r7
	stxvl	32+v2,r3,r7
	subic.	r8,r5,16
	blelr

	/* For shorter lengths, aligning the dest address to 16 bytes either
	   decreases performance or is irrelevant, so the comparison against
	   256 below is also used to skip the alignment for them.  */
	cmpldi	cr6,r5,256
	bge	cr6,L(ge_256)
	/* Account for the first 16-byte copy.  */
	addi	r4,r4,16
	addi	r11,r3,16	/* use r11 so the original dest stays in r3.  */
	subi	r5,r5,16
	b	L(loop_head)

	.p2align 5
L(ge_256):
	/* Account for the first copy <= 16 bytes.  This is necessary for
	   memmove because at this point the src address can be in front of the
	   dest address.  */
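	/* The entry lxvl/stxvl pair copied (len mod 256) bytes, capped at 16.
	   Recompute that amount here (clrldi + iselgt) so the pointers and the
	   remaining length can be advanced past exactly what was copied.  */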
	clrldi	r9,r5,56
	li	r8,16
	cmpldi	r9,16
	iselgt	r9,r8,r9
	add	r4,r4,r9
	add	r11,r3,r9	/* use r11 so the original dest stays in r3.  */
	sub	r5,r5,r9

	/* Align dest to 16 bytes.  */
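	/* r9 below is (-dest) & 0xF, the distance from the original dest to
	   the next 16-byte boundary; if it is zero, the partial copy is
	   skipped.  */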
	neg	r7,r3
	clrldi.	r9,r7,60
	beq	L(loop_head)

	.p2align 5
	sldi	r6,r9,56
	lxvl	32+v0,r4,r6
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9
	add	r4,r4,r9
	add	r11,r11,r9

L(loop_head):
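	/* Dispatch on the remaining length: up to 63 bytes go straight to
	   L(final_64); 128 bytes or more run the unrolled loop; 64-127 bytes
	   fall through L(loop_tail), which copies 64 bytes and finishes in
	   L(final_64).  */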
	cmpldi	r5,63
	ble	L(final_64)

	srdi.	r7,r5,7
	beq	L(loop_tail)

	mtctr	r7

/* Main loop that copies 128 bytes each iteration.  */
	.p2align 5
L(loop):
	addi	r9,r4,64
	addi	r10,r11,64
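	/* r9/r10 point 64 bytes ahead of r4/r11; the second group of
	   loads/stores below uses them, so r4/r11 can be bumped by 128 in the
	   middle of the iteration without changing the addresses still to be
	   used.  */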
	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,128
	addi	r11,r11,128

	lxv	32+v4,0(r9)
	lxv	32+v5,16(r9)
	lxv	32+v6,32(r9)
	lxv	32+v7,48(r9)

	stxv	32+v4,0(r10)
	stxv	32+v5,16(r10)
	stxv	32+v6,32(r10)
	stxv	32+v7,48(r10)

	bdnz	L(loop)
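	/* Keep only the residual length (len mod 128) left over from the
	   unrolled loop; return if there is none.  */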
	clrldi.	r5,r5,57
	beqlr

	/* Copy 64 bytes.  */
	.p2align 5
L(loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(final_64)

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,64
	addi	r11,r11,64
	subi	r5,r5,64

	/* Copies the last 1-63 bytes.  */
	.p2align 5
L(final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
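	/* clrrdi masks off the low four bits, so r8 = len & ~15.  The compares
	   below choose how many full 16-byte blocks (one to three) to copy via
	   the fall-through chain at L(tail3)/L(tail2); L(tail1) then handles
	   the final len & 15 bytes.  */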
	clrrdi.	r8,r5,4
	beq	L(tail1)

	cmpldi	cr5,r5,32
	lxv	32+v0,0(r4)
	blt	cr5,L(tail2)

	cmpldi	cr6,r5,48
	lxv	32+v1,16(r4)
	blt	cr6,L(tail3)

	.p2align 5
	lxv	32+v2,32(r4)
	stxv	32+v2,32(r11)
L(tail3):
	stxv	32+v1,16(r11)
L(tail2):
	stxv	32+v0,0(r11)
	sub	r5,r5,r8
	add	r4,r4,r8
	add	r11,r11,r8
	.p2align 5
L(tail1):
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr

/* If dest and src overlap, we should copy backwards.  */
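/* r4 and r11 are set one byte past the end of src and dest; the backward
   copy then works from high to low addresses.  */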
L(memmove_bwd):
	add	r11,r3,r5
	add	r4,r4,r5

	/* Optimization for length smaller than 16 bytes.  */
	cmpldi	cr5,r5,15
	ble	cr5,L(tail1_bwd)

	/* For shorter lengths the alignment either slows down or is
	   irrelevant.  The forward copy reuses a comparison against 256 that
	   it already needs; here a comparison against 128 is used instead,
	   since it reduces code size and improves readability.  */
	cmpldi	cr7,r5,128
	blt	cr7,L(bwd_loop_tail)

	/* Align dest address to 16 bytes.  */
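	/* r9 = low four bits of r11 (the end of dest); copying those trailing
	   bytes first leaves the remaining destination end 16-byte aligned.  */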
	.p2align 5
	clrldi.	r9,r11,60
	beq	L(bwd_loop_head)
	sub	r4,r4,r9
	sub	r11,r11,r9
	lxv	32+v0,0(r4)
	sldi	r6,r9,56
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9

L(bwd_loop_head):
	srdi.	r7,r5,7
	beq	L(bwd_loop_tail)

	mtctr	r7

/* Main loop that copies 128 bytes every iteration.  */
	.p2align 5
L(bwd_loop):
	addi	r9,r4,-64
	addi	r10,r11,-64

	lxv	32+v0,-16(r4)
	lxv	32+v1,-32(r4)
	lxv	32+v2,-48(r4)
	lxv	32+v3,-64(r4)

	stxv	32+v0,-16(r11)
	stxv	32+v1,-32(r11)
	stxv	32+v2,-48(r11)
	stxv	32+v3,-64(r11)

	addi	r4,r4,-128
	addi	r11,r11,-128

	lxv	32+v0,-16(r9)
	lxv	32+v1,-32(r9)
	lxv	32+v2,-48(r9)
	lxv	32+v3,-64(r9)

	stxv	32+v0,-16(r10)
	stxv	32+v1,-32(r10)
	stxv	32+v2,-48(r10)
	stxv	32+v3,-64(r10)

	bdnz	L(bwd_loop)
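	/* Likewise, keep only len mod 128 after the backward loop; return if
	   nothing remains.  */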
	clrldi.	r5,r5,57
	beqlr

	/* Copy 64 bytes.  */
	.p2align 5
L(bwd_loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(bwd_final_64)

	addi	r4,r4,-64
	addi	r11,r11,-64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	subi	r5,r5,64

	/* Copies the last 1-63 bytes.  */
	.p2align 5
L(bwd_final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
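	/* As in L(final_64): r8 = len & ~15 is copied with full 16-byte
	   vectors, and the final len & 15 bytes are handled by the
	   length-controlled copy below (or at L(tail1_bwd) when r8 is 0).  */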
	clrrdi.	r8,r5,4
	beq	L(tail1_bwd)

	cmpldi	cr5,r5,32
	lxv	32+v2,-16(r4)
	blt	cr5,L(tail2_bwd)

	cmpldi	cr6,r5,48
	lxv	32+v1,-32(r4)
	blt	cr6,L(tail3_bwd)

	.p2align 5
	lxv	32+v0,-48(r4)
	stxv	32+v0,-48(r11)
L(tail3_bwd):
	stxv	32+v1,-32(r11)
L(tail2_bwd):
	stxv	32+v2,-16(r11)
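	/* Step r4/r11 back to the start of the remaining block and copy the
	   final r5 & 15 bytes with a length-controlled load/store.  */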
	sub	r4,r4,r5
	sub	r11,r11,r5
	sub	r5,r5,r8
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr

	/* Copy the remaining 1-15 bytes.  */
	.p2align 5
L(tail1_bwd):
	sub	r4,r4,r5
	sub	r11,r11,r5
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr

END_GEN_TB (MEMMOVE,TB_TOCLESS)
libc_hidden_builtin_def (memmove)