AArch64: Improve codegen for SVE powf

Improve memory access with indexed/unpredicated instructions.
Eliminate register spills.  Speedup on Neoverse V1: 3%.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
Author:     Yat Long Poon
Date:       2025-02-13 18:03:04 +00:00
Committer:  Wilco Dijkstra
Parent:     0b195651db
Commit:     95e807209b
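
Sketched below (not part of the patch) are the two codegen ideas the change leans on: replacing the add/subtract-Shift rounding trick with a round-to-nearest plus convert, and passing an all-true predicate so unpredicated instruction forms can be emitted instead of keeping a governing predicate and the Shift constant live. A minimal sketch only; the helper names round_old/round_new are hypothetical, and it assumes an AArch64 toolchain with SVE and <arm_sve.h>.

#include <arm_sve.h>

#define Shift 0x1.8p52 /* 1.5 * 2^52, the constant dropped by this patch.  */

/* Before: bias by Shift so the rounded integer appears in the low mantissa
   bits, reinterpret those bits, then subtract Shift again.  This keeps the
   Shift constant live and lengthens the dependency chain.  */
static inline svuint64_t
round_old (svbool_t pg, svfloat64_t x, svfloat64_t *rem)
{
  svfloat64_t kd = svadd_x (pg, x, Shift);
  svuint64_t ki = svreinterpret_u64 (kd); /* Rounded value in low bits.  */
  kd = svsub_x (pg, kd, Shift);
  *rem = svsub_x (pg, x, kd);
  return ki;
}

/* After: round to nearest and convert directly; an all-true predicate lets
   unpredicated forms be selected and frees the Shift register.  */
static inline svuint64_t
round_new (svfloat64_t x, svfloat64_t *rem)
{
  svbool_t ptrue = svptrue_b64 ();
  svfloat64_t kd = svrinta_x (ptrue, x);
  svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (ptrue, kd)); /* Integer.  */
  *rem = svsub_x (ptrue, x, kd);
  return ki;
}

The two helpers return the rounded value in different encodings (biased bit pattern vs. converted integer); the powf kernel only consumes the low table-index and exponent bits of ki, which is why the patch can swap one for the other.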

@@ -26,7 +26,6 @@
 #define Tlogc __v_powf_data.logc
 #define Texp __v_powf_data.scale
 #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
-#define Shift 0x1.8p52
 #define Norm 0x1p23f /* 0x4b000000.  */
 
 /* Overall ULP error bound for pow is 2.6 ulp
@@ -36,7 +35,7 @@ static const struct data
   double log_poly[4];
   double exp_poly[3];
   float uflow_bound, oflow_bound, small_bound;
-  uint32_t sign_bias, sign_mask, subnormal_bias, off;
+  uint32_t sign_bias, subnormal_bias, off;
 } data = {
   /* rel err: 1.5 * 2^-30.  Each coefficients is multiplied the value of
      V_POWF_EXP2_N.  */
@@ -53,7 +52,6 @@ static const struct data
   .small_bound = 0x1p-126f,
   .off = 0x3f35d000,
   .sign_bias = SignBias,
-  .sign_mask = 0x80000000,
   .subnormal_bias = 0x0b800000, /* 23 << 23.  */
 };
 
@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
 
 static inline svbool_t
 sv_zeroinfnan (svbool_t pg, svuint32_t i)
 {
-  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
                   2u * 0x7f800000 - 1);
 }
@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
 }
 
 /* Scalar fallback for special case routines with custom signature.  */
-static inline svfloat32_t
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
 {
+  /* Special cases of x or y: zero, inf and nan.  */
+  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
+  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
+  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
+
   svbool_t p = svpfirst (cmp, svpfalse ());
   while (svptest_any (cmp, p))
     {
@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
 
   /* Polynomial to approximate log1p(r)/ln2.  */
   svfloat64_t logx = A (0);
-  logx = svmla_x (pg, A (1), r, logx);
-  logx = svmla_x (pg, A (2), r, logx);
-  logx = svmla_x (pg, A (3), r, logx);
-  logx = svmla_x (pg, y0, r, logx);
+  logx = svmad_x (pg, r, logx, A (1));
+  logx = svmad_x (pg, r, logx, A (2));
+  logx = svmad_x (pg, r, logx, A (3));
+  logx = svmad_x (pg, r, logx, y0);
   *pylogx = svmul_x (pg, y, logx);
 
   /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
-  svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
-  svuint64_t ki = svreinterpret_u64 (kd);
-  kd = svsub_x (pg, kd, Shift);
+  svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
+  svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
 
   r = svsub_x (pg, *pylogx, kd);
 
   /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
-  svuint64_t t
-      = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
-  svuint64_t ski = svadd_x (pg, ki, sign_bias);
-  t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+  svuint64_t t = svld1_gather_index (
+      svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
+  svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
+  t = svadd_x (svptrue_b64 (), t,
+               svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
   svfloat64_t s = svreinterpret_f64 (t);
 
   svfloat64_t p = C (0);
   p = svmla_x (pg, C (1), p, r);
   p = svmla_x (pg, C (2), p, r);
-  p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+  p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
   return p;
 }
 
@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
 {
   const svbool_t ptrue = svptrue_b64 ();
 
-  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
-     order to perform core computation in double precision.  */
+  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
+   * in order to perform core computation in double precision.  */
   const svbool_t pg_lo = svunpklo (pg);
   const svbool_t pg_hi = svunpkhi (pg);
-  svfloat64_t y_lo = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
-  svfloat64_t y_hi = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
-  svfloat32_t z = svreinterpret_f32 (iz);
-  svfloat64_t z_lo = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
-  svfloat64_t z_hi = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+  svfloat64_t y_lo
+      = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+  svfloat64_t y_hi
+      = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+  svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
+  svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
   svuint64_t i_lo = svunpklo (i);
   svuint64_t i_hi = svunpkhi (i);
   svint64_t k_lo = svunpklo (k);
@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
 /* Implementation of SVE powf.
    Provides the same accuracy as AdvSIMD powf, since it relies on the same
    algorithm. The theoretical maximum error is under 2.60 ULPs.
-   Maximum measured error is 2.56 ULPs:
-   SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
-                                                    want 0x1.fd4b06p+127.  */
+   Maximum measured error is 2.57 ULPs:
+   SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
+                                                    want 0x1.fff862p+127.  */
 svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   svuint32_t viy0 = svreinterpret_u32 (y);
 
   /* Negative x cases.  */
-  svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
-  svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+  svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
 
   /* Set sign_bias and ix depending on sign of x and nature of y.  */
-  svbool_t yisnotint_xisneg = svpfalse_b ();
+  svbool_t yint_or_xpos = pg;
   svuint32_t sign_bias = sv_u32 (0);
   svuint32_t vix = vix0;
   if (__glibc_unlikely (svptest_any (pg, xisneg)))
     {
       /* Determine nature of y.  */
-      yisnotint_xisneg = svisnotint (xisneg, y);
-      svbool_t yisint_xisneg = svisint (xisneg, y);
+      yint_or_xpos = svisint (xisneg, y);
       svbool_t yisodd_xisneg = svisodd (xisneg, y);
       /* ix set to abs(ix) if y is integer.  */
-      vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+      vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
       /* Set to SignBias if x is negative and y is odd.  */
       sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
     }
@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   svbool_t cmp = svorr_z (pg, xspecial, yspecial);
 
   /* Small cases of x: |x| < 0x1p-126.  */
-  svbool_t xsmall = svaclt (pg, x, d->small_bound);
-  if (__glibc_unlikely (svptest_any (pg, xsmall)))
+  svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
+  if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
     {
       /* Normalize subnormal x so exponent becomes negative.  */
       svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
       vix = svsel (xsmall, vix_norm, vix);
     }
 
   /* Part of core computation carried in working precision.  */
-  svuint32_t tmp = svsub_x (pg, vix, d->off);
-  svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
-                          V_POWF_LOG2_N - 1);
-  svuint32_t top = svand_x (pg, tmp, 0xff800000);
-  svuint32_t iz = svsub_x (pg, vix, top);
-  svint32_t k
-      = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
+  svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+  svuint32_t i = svand_x (
+      yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+      V_POWF_LOG2_N - 1);
+  svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+  svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+  svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+                         (23 - V_POWF_EXP2_TABLE_BITS));
 
-  /* Compute core in extended precision and return intermediate ylogx results to
-     handle cases of underflow and underflow in exp.  */
+  /* Compute core in extended precision and return intermediate ylogx results
+   * to handle cases of underflow and underflow in exp.  */
   svfloat32_t ylogx;
-  svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+  svfloat32_t ret
+      = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
 
   /* Handle exp special cases of underflow and overflow.  */
-  svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+  svuint32_t sign
+      = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
   svfloat32_t ret_oflow
-      = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+      = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
   svfloat32_t ret_uflow = svreinterpret_f32 (sign);
-  ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
-  ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+  ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+  ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
 
   /* Cases of finite y and finite negative x.  */
-  ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+  ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
 
-  if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return sv_call_powf_sc (x, y, ret, cmp);
+  if (__glibc_unlikely (svptest_any (cmp, cmp)))
+    return sv_call_powf_sc (x, y, ret);
   return ret;
 }