/* Optimized strcpy implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2025 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_STPCPY
# ifndef STPCPY
#  define FUNC_NAME __stpcpy
# else
#  define FUNC_NAME STPCPY
# endif
#else
# ifndef STRCPY
#  define FUNC_NAME strcpy
# else
#  define FUNC_NAME STRCPY
# endif
#endif /* !USE_AS_STPCPY  */

/* Implements the function

   char * [r3] strcpy (char *dest [r3], const char *src [r4])

   or

   char * [r3] stpcpy (char *dest [r3], const char *src [r4])

   if USE_AS_STPCPY is defined.

   This implementation never reads across a page boundary, but may
   read beyond the NUL terminator.  */
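
/* Register usage throughout: r3 holds dest and carries the return
   value, r4 holds src, r5 is the aligned source cursor and r11 the
   matching destination cursor; v18 is a vector of zeroes used for the
   NUL compares and v19 a vector of 0xff filler bytes.  */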

/* Load 4 quadwords, merge them into one VR for speed, check for NUL
   bytes, and branch to label if any is found.  */
#define CHECK_64B(offset,addr,label) \
	lxv	32+v4,(offset+0)(addr); \
	lxv	32+v5,(offset+16)(addr); \
	lxv	32+v6,(offset+32)(addr); \
	lxv	32+v7,(offset+48)(addr); \
	vminub	v14,v4,v5; \
	vminub	v15,v6,v7; \
	vminub	v16,v14,v15; \
	vcmpequb.	v0,v16,v18; \
	beq	cr6,$+12; \
	li	r7,offset; \
	b	L(label); \
	stxv	32+v4,(offset+0)(r11); \
	stxv	32+v5,(offset+16)(r11); \
	stxv	32+v6,(offset+32)(r11); \
	stxv	32+v7,(offset+48)(r11)
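
/* A NUL in any of the four quadwords forces a zero byte in the vminub
   results, so a single vcmpequb. against v18 tests all 64 bytes at
   once.  When no NUL is found, beq cr6,$+12 skips the li/b pair and
   falls through to store the whole block to the destination.  */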

/* Load quadword at addr+offset to vreg, check for NUL bytes,
   and branch to label if any are found.  */
#define CHECK_16B(vreg,offset,addr,label) \
	lxv	vreg+32,offset(addr); \
	vcmpequb.	v15,vreg,v18; \
	bne	cr6,L(label);
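
/* On a hit, the loaded data is still in vreg and the compare mask in
   v15; the L(tail*) blocks below use both to finish the copy with
   STORE_WITH_LEN.  */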

/* vreg1 holds the NUL-compare mask for the data in vreg2; store the
   bytes of vreg2 through reg up to and including the first NUL.  */
#define STORE_WITH_LEN(vreg1,vreg2,reg) \
	vctzlsbb	r8,vreg1; \
	addi	r9,r8,1; \
	sldi	r9,r9,56; \
	stxvl	32+vreg2,reg,r9;
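
/* vctzlsbb sets r8 to the byte index of the first NUL in the mask, so
   r9 = (r8+1) << 56 makes stxvl (which takes its length from the top
   byte of r9) store everything up to and including the terminator.
   The stpcpy paths reuse r8 to compute the return value.  */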

	.machine power9
ENTRY_TOCLESS (FUNC_NAME, 4)
	CALL_MCOUNT 2

	vspltisb	v18,0		/* Zeroes in v18.  */
	vspltisb	v19,-1		/* 0xFF bytes in v19.  */

	/* Next 16B-aligned address.  Prepare address for L(aligned).  */
	addi	r5,r4,16
	clrrdi	r5,r5,4
	subf	r8,r4,r5
	add	r11,r3,r8

	/* Align the data and fill the bytes not loaded with a non-matching
	   char: vperm shifts the quadword loaded by lvx into place and pulls
	   0xFF bytes from v19 into the positions before src, so those bytes
	   never compare equal to NUL.  */
	lvx	v0,0,r4
	lvsr	v1,0,r4
	vperm	v0,v19,v0,v1

	vcmpequb.	v6,v0,v18	/* 0xff if byte is NUL, 0x00 otherwise.  */
	beq	cr6,L(no_null)

	/* There's a NUL byte.  */
	STORE_WITH_LEN(v6,v0,r3)

#ifdef USE_AS_STPCPY
	/* stpcpy returns the dest address plus the size not counting the
	   final '\0'.  */
	add	r3,r3,r8
#endif
	blr

L(no_null):
	sldi	r10,r8,56	/* stxvl wants size in top 8 bits.  */
	stxvl	32+v0,r3,r10	/* Partial store.  */
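
	/* The r8 bytes from src up to the 16B boundary are now copied;
	   r5 and r11 point at the first aligned source byte and its
	   destination counterpart.  */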

	/* The main loop is optimized for longer strings (> 512 bytes),
	   so checking the first bytes in 16B chunks benefits shorter
	   strings a lot.  */
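
	/* Each of the four blocks below checks and, when NUL-free, copies
	   128 bytes in 16B chunks, covering the first 512 bytes past the
	   alignment point before the switch to the 64B loop.  */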
	.p2align 4
L(aligned):
	CHECK_16B(v0,0,r5,tail1)
	CHECK_16B(v1,16,r5,tail2)
	CHECK_16B(v2,32,r5,tail3)
	CHECK_16B(v3,48,r5,tail4)
	CHECK_16B(v4,64,r5,tail5)
	CHECK_16B(v5,80,r5,tail6)
	CHECK_16B(v6,96,r5,tail7)
	CHECK_16B(v7,112,r5,tail8)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	addi	r11,r11,128

	CHECK_16B(v0,128,r5,tail1)
	CHECK_16B(v1,128+16,r5,tail2)
	CHECK_16B(v2,128+32,r5,tail3)
	CHECK_16B(v3,128+48,r5,tail4)
	CHECK_16B(v4,128+64,r5,tail5)
	CHECK_16B(v5,128+80,r5,tail6)
	CHECK_16B(v6,128+96,r5,tail7)
	CHECK_16B(v7,128+112,r5,tail8)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	addi	r11,r11,128

	CHECK_16B(v0,256,r5,tail1)
	CHECK_16B(v1,256+16,r5,tail2)
	CHECK_16B(v2,256+32,r5,tail3)
	CHECK_16B(v3,256+48,r5,tail4)
	CHECK_16B(v4,256+64,r5,tail5)
	CHECK_16B(v5,256+80,r5,tail6)
	CHECK_16B(v6,256+96,r5,tail7)
	CHECK_16B(v7,256+112,r5,tail8)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	addi	r11,r11,128

	CHECK_16B(v0,384,r5,tail1)
	CHECK_16B(v1,384+16,r5,tail2)
	CHECK_16B(v2,384+32,r5,tail3)
	CHECK_16B(v3,384+48,r5,tail4)
	CHECK_16B(v4,384+64,r5,tail5)
	CHECK_16B(v5,384+80,r5,tail6)
	CHECK_16B(v6,384+96,r5,tail7)
	CHECK_16B(v7,384+112,r5,tail8)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	/* Round src+512 down to a 64B boundary and rebase the dest
	   cursor to match.  */
	addi	r5,r4,512
	clrrdi	r5,r5,6
	subf	r7,r4,r5
	add	r11,r3,r7
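
	/* r7 is the distance from src to the new 64B-aligned cursor, so
	   dest is rebased by the same amount.  A few bytes already copied
	   may be stored again, which is harmless since the source data
	   has not changed.  */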

	/* Switch to a more aggressive approach, checking 64B at a time.  */
	.p2align 5
L(strcpy_loop):
	CHECK_64B(0,r5,tail_64b)
	CHECK_64B(64,r5,tail_64b)
	CHECK_64B(128,r5,tail_64b)
	CHECK_64B(192,r5,tail_64b)

	CHECK_64B(256,r5,tail_64b)
	CHECK_64B(256+64,r5,tail_64b)
	CHECK_64B(256+128,r5,tail_64b)
	CHECK_64B(256+192,r5,tail_64b)
	addi	r5,r5,512
	addi	r11,r11,512

	b	L(strcpy_loop)

	.p2align 5
L(tail_64b):
	/* OK, we found a NUL byte.  r7 (set by CHECK_64B) is the offset of
	   the 64B block that contains it; advance the dest cursor to that
	   block and look for the NUL in each of its quadwords, which are
	   still live in v4-v7.  */
	add	r11,r11,r7
	vcmpequb.	v8,v4,v18
	beq	cr6,L(no_null_16B)
	/* There's a NUL byte.  */
	STORE_WITH_LEN(v8,v4,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

L(no_null_16B):
	stxv	32+v4,0(r11)
	vcmpequb.	v8,v5,v18
	beq	cr6,L(no_null_32B)
	/* There's a NUL byte.  */
	addi	r11,r11,16
	STORE_WITH_LEN(v8,v5,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

L(no_null_32B):
	stxv	32+v5,16(r11)
	vcmpequb.	v8,v6,v18
	beq	cr6,L(no_null_48B)
	/* There's a NUL byte.  */
	addi	r11,r11,32
	STORE_WITH_LEN(v8,v6,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

L(no_null_48B):
	stxv	32+v6,32(r11)
	vcmpequb.	v8,v7,v18
	/* The NUL byte must be in v7.  */
	addi	r11,r11,48
	STORE_WITH_LEN(v8,v7,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail1):
	/* There's a NUL byte.  */
	STORE_WITH_LEN(v15,v0,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail2):
	stxv	32+v0,0(r11)
	/* There's a NUL byte.  */
	addi	r11,r11,16
	STORE_WITH_LEN(v15,v1,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail3):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	addi	r11,r11,32
	STORE_WITH_LEN(v15,v2,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail4):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	addi	r11,r11,48
	STORE_WITH_LEN(v15,v3,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail5):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	addi	r11,r11,64
	STORE_WITH_LEN(v15,v4,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail6):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	addi	r11,r11,80
	STORE_WITH_LEN(v15,v5,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail7):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	addi	r11,r11,96
	STORE_WITH_LEN(v15,v6,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail8):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	addi	r11,r11,112
	STORE_WITH_LEN(v15,v7,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

END (FUNC_NAME)
#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)
#endif